drivers/iommu/intel/iommu.c
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * Copyright © 2006-2014 Intel Corporation.
4  *
5  * Authors: David Woodhouse <dwmw2@infradead.org>,
6  *          Ashok Raj <ashok.raj@intel.com>,
7  *          Shaohua Li <shaohua.li@intel.com>,
8  *          Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>,
9  *          Fenghua Yu <fenghua.yu@intel.com>
10  *          Joerg Roedel <jroedel@suse.de>
11  */
12
13 #define pr_fmt(fmt)     "DMAR: " fmt
14 #define dev_fmt(fmt)    pr_fmt(fmt)
15
16 #include <linux/init.h>
17 #include <linux/bitmap.h>
18 #include <linux/debugfs.h>
19 #include <linux/export.h>
20 #include <linux/slab.h>
21 #include <linux/irq.h>
22 #include <linux/interrupt.h>
23 #include <linux/spinlock.h>
24 #include <linux/pci.h>
25 #include <linux/dmar.h>
26 #include <linux/dma-map-ops.h>
27 #include <linux/mempool.h>
28 #include <linux/memory.h>
29 #include <linux/cpu.h>
30 #include <linux/timer.h>
31 #include <linux/io.h>
32 #include <linux/iova.h>
33 #include <linux/iommu.h>
34 #include <linux/dma-iommu.h>
35 #include <linux/intel-iommu.h>
36 #include <linux/syscore_ops.h>
37 #include <linux/tboot.h>
38 #include <linux/dmi.h>
39 #include <linux/pci-ats.h>
40 #include <linux/memblock.h>
41 #include <linux/dma-direct.h>
42 #include <linux/crash_dump.h>
43 #include <linux/numa.h>
44 #include <asm/irq_remapping.h>
45 #include <asm/cacheflush.h>
46 #include <asm/iommu.h>
47 #include <trace/events/intel_iommu.h>
48
49 #include "../irq_remapping.h"
50 #include "pasid.h"
51
52 #define ROOT_SIZE               VTD_PAGE_SIZE
53 #define CONTEXT_SIZE            VTD_PAGE_SIZE
54
55 #define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
56 #define IS_USB_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_SERIAL_USB)
57 #define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
58 #define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e)
59
60 #define IOAPIC_RANGE_START      (0xfee00000)
61 #define IOAPIC_RANGE_END        (0xfeefffff)
62 #define IOVA_START_ADDR         (0x1000)
63
64 #define DEFAULT_DOMAIN_ADDRESS_WIDTH 57
65
66 #define MAX_AGAW_WIDTH 64
67 #define MAX_AGAW_PFN_WIDTH      (MAX_AGAW_WIDTH - VTD_PAGE_SHIFT)
68
69 #define __DOMAIN_MAX_PFN(gaw)  ((((uint64_t)1) << ((gaw) - VTD_PAGE_SHIFT)) - 1)
70 #define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << (gaw)) - 1)
71
72 /* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR
73    to match. That way, we can use 'unsigned long' for PFNs with impunity. */
74 #define DOMAIN_MAX_PFN(gaw)     ((unsigned long) min_t(uint64_t, \
75                                 __DOMAIN_MAX_PFN(gaw), (unsigned long)-1))
76 #define DOMAIN_MAX_ADDR(gaw)    (((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT)
77
78 /* IO virtual address start page frame number */
79 #define IOVA_START_PFN          (1)
80
81 #define IOVA_PFN(addr)          ((addr) >> PAGE_SHIFT)
82
83 /* page table handling */
84 #define LEVEL_STRIDE            (9)
85 #define LEVEL_MASK              (((u64)1 << LEVEL_STRIDE) - 1)
86
87 /*
88  * This bitmap is used to advertise the page sizes our hardware supports
89  * to the IOMMU core, which will then use this information to split
90  * physically contiguous memory regions it is mapping into page sizes
91  * that we support.
92  *
93  * Traditionally the IOMMU core just handed us the mappings directly,
94  * after making sure the size is a power-of-two multiple of 4KiB and that the
95  * mapping has natural alignment.
96  *
97  * To retain this behavior, we currently advertise that we support
98  * all page sizes that are an order of 4KiB.
99  *
100  * If at some point we'd like to utilize the IOMMU core's new behavior,
101  * we could change this to advertise the real page sizes we support.
102  */
103 #define INTEL_IOMMU_PGSIZES     (~0xFFFUL)
104
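/*
 * AGAW (Adjusted Guest Address Width) helpers.  An AGAW value of n
 * selects an (n + 2)-level page table covering 30 + n * 9 bits of
 * address space: agaw 1 = 3-level/39-bit, agaw 2 = 4-level/48-bit,
 * agaw 3 = 5-level/57-bit.
 */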
105 static inline int agaw_to_level(int agaw)
106 {
107         return agaw + 2;
108 }
109
110 static inline int agaw_to_width(int agaw)
111 {
112         return min_t(int, 30 + agaw * LEVEL_STRIDE, MAX_AGAW_WIDTH);
113 }
114
115 static inline int width_to_agaw(int width)
116 {
117         return DIV_ROUND_UP(width - 30, LEVEL_STRIDE);
118 }
119
120 static inline unsigned int level_to_offset_bits(int level)
121 {
122         return (level - 1) * LEVEL_STRIDE;
123 }
124
125 static inline int pfn_level_offset(u64 pfn, int level)
126 {
127         return (pfn >> level_to_offset_bits(level)) & LEVEL_MASK;
128 }
129
130 static inline u64 level_mask(int level)
131 {
132         return -1ULL << level_to_offset_bits(level);
133 }
134
135 static inline u64 level_size(int level)
136 {
137         return 1ULL << level_to_offset_bits(level);
138 }
139
140 static inline u64 align_to_level(u64 pfn, int level)
141 {
142         return (pfn + level_size(level) - 1) & level_mask(level);
143 }
144
145 static inline unsigned long lvl_to_nr_pages(unsigned int lvl)
146 {
147         return 1UL << min_t(int, (lvl - 1) * LEVEL_STRIDE, MAX_AGAW_PFN_WIDTH);
148 }
149
150 /* VT-d pages must never be _larger_ than MM pages. Otherwise things
151    are never going to work. */
152 static inline unsigned long dma_to_mm_pfn(unsigned long dma_pfn)
153 {
154         return dma_pfn >> (PAGE_SHIFT - VTD_PAGE_SHIFT);
155 }
156
157 static inline unsigned long mm_to_dma_pfn(unsigned long mm_pfn)
158 {
159         return mm_pfn << (PAGE_SHIFT - VTD_PAGE_SHIFT);
160 }
161 static inline unsigned long page_to_dma_pfn(struct page *pg)
162 {
163         return mm_to_dma_pfn(page_to_pfn(pg));
164 }
165 static inline unsigned long virt_to_dma_pfn(void *p)
166 {
167         return page_to_dma_pfn(virt_to_page(p));
168 }
169
170 /* global iommu list, set NULL for ignored DMAR units */
171 static struct intel_iommu **g_iommus;
172
173 static void __init check_tylersburg_isoch(void);
174 static int rwbf_quirk;
175
176 /*
177  * set to 1 to panic the kernel if VT-d cannot be successfully enabled
178  * (used when the kernel is launched with TXT)
179  */
180 static int force_on = 0;
181 static int intel_iommu_tboot_noforce;
182 static int no_platform_optin;
183
184 #define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
185
186 /*
187  * Take a root_entry and return the Lower Context Table Pointer (LCTP)
188  * if marked present.
189  */
190 static phys_addr_t root_entry_lctp(struct root_entry *re)
191 {
192         if (!(re->lo & 1))
193                 return 0;
194
195         return re->lo & VTD_PAGE_MASK;
196 }
197
198 /*
199  * Take a root_entry and return the Upper Context Table Pointer (UCTP)
200  * if marked present.
201  */
202 static phys_addr_t root_entry_uctp(struct root_entry *re)
203 {
204         if (!(re->hi & 1))
205                 return 0;
206
207         return re->hi & VTD_PAGE_MASK;
208 }
209
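/*
 * Context-entry field helpers.  The bit positions match the accessors
 * below: lo bit 0 is the present bit, lo bits 3:2 the translation type,
 * lo bit 11 the PASID-enable bit, hi bits 2:0 the address width,
 * hi bit 3 the "copied" marker and hi bits 23:8 the domain id.
 */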
210 static inline void context_clear_pasid_enable(struct context_entry *context)
211 {
212         context->lo &= ~(1ULL << 11);
213 }
214
215 static inline bool context_pasid_enabled(struct context_entry *context)
216 {
217         return !!(context->lo & (1ULL << 11));
218 }
219
220 static inline void context_set_copied(struct context_entry *context)
221 {
222         context->hi |= (1ull << 3);
223 }
224
225 static inline bool context_copied(struct context_entry *context)
226 {
227         return !!(context->hi & (1ULL << 3));
228 }
229
230 static inline bool __context_present(struct context_entry *context)
231 {
232         return (context->lo & 1);
233 }
234
235 bool context_present(struct context_entry *context)
236 {
237         return context_pasid_enabled(context) ?
238              __context_present(context) :
239              __context_present(context) && !context_copied(context);
240 }
241
242 static inline void context_set_present(struct context_entry *context)
243 {
244         context->lo |= 1;
245 }
246
247 static inline void context_set_fault_enable(struct context_entry *context)
248 {
249         context->lo &= (((u64)-1) << 2) | 1;
250 }
251
252 static inline void context_set_translation_type(struct context_entry *context,
253                                                 unsigned long value)
254 {
255         context->lo &= (((u64)-1) << 4) | 3;
256         context->lo |= (value & 3) << 2;
257 }
258
259 static inline void context_set_address_root(struct context_entry *context,
260                                             unsigned long value)
261 {
262         context->lo &= ~VTD_PAGE_MASK;
263         context->lo |= value & VTD_PAGE_MASK;
264 }
265
266 static inline void context_set_address_width(struct context_entry *context,
267                                              unsigned long value)
268 {
269         context->hi |= value & 7;
270 }
271
272 static inline void context_set_domain_id(struct context_entry *context,
273                                          unsigned long value)
274 {
275         context->hi |= (value & ((1 << 16) - 1)) << 8;
276 }
277
278 static inline int context_domain_id(struct context_entry *c)
279 {
280         return((c->hi >> 8) & 0xffff);
281 }
282
283 static inline void context_clear_entry(struct context_entry *context)
284 {
285         context->lo = 0;
286         context->hi = 0;
287 }
288
289 /*
290  * This domain is a statically identity mapping domain.
291  *      1. This domain creates a static 1:1 mapping to all usable memory.
292  *      2. It maps to each iommu if successful.
293  *      3. Each iommu maps to this domain if successful.
294  */
295 static struct dmar_domain *si_domain;
296 static int hw_pass_through = 1;
297
298 #define for_each_domain_iommu(idx, domain)                      \
299         for (idx = 0; idx < g_num_of_iommus; idx++)             \
300                 if (domain->iommu_refcnt[idx])
301
302 struct dmar_rmrr_unit {
303         struct list_head list;          /* list of rmrr units   */
304         struct acpi_dmar_header *hdr;   /* ACPI header          */
305         u64     base_address;           /* reserved base address*/
306         u64     end_address;            /* reserved end address */
307         struct dmar_dev_scope *devices; /* target devices */
308         int     devices_cnt;            /* target device count */
309 };
310
311 struct dmar_atsr_unit {
312         struct list_head list;          /* list of ATSR units */
313         struct acpi_dmar_header *hdr;   /* ACPI header */
314         struct dmar_dev_scope *devices; /* target devices */
315         int devices_cnt;                /* target device count */
316         u8 include_all:1;               /* include all ports */
317 };
318
319 static LIST_HEAD(dmar_atsr_units);
320 static LIST_HEAD(dmar_rmrr_units);
321
322 #define for_each_rmrr_units(rmrr) \
323         list_for_each_entry(rmrr, &dmar_rmrr_units, list)
324
325 /* number of IOMMUs in the system, used to size and index g_iommus */
326 static int g_num_of_iommus;
327
328 static void domain_exit(struct dmar_domain *domain);
329 static void domain_remove_dev_info(struct dmar_domain *domain);
330 static void dmar_remove_one_dev_info(struct device *dev);
331 static void __dmar_remove_one_dev_info(struct device_domain_info *info);
332 static int intel_iommu_attach_device(struct iommu_domain *domain,
333                                      struct device *dev);
334 static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
335                                             dma_addr_t iova);
336
337 #ifdef CONFIG_INTEL_IOMMU_DEFAULT_ON
338 int dmar_disabled = 0;
339 #else
340 int dmar_disabled = 1;
341 #endif /* CONFIG_INTEL_IOMMU_DEFAULT_ON */
342
343 #ifdef CONFIG_INTEL_IOMMU_SCALABLE_MODE_DEFAULT_ON
344 int intel_iommu_sm = 1;
345 #else
346 int intel_iommu_sm;
347 #endif /* CONFIG_INTEL_IOMMU_SCALABLE_MODE_DEFAULT_ON */
348
349 int intel_iommu_enabled = 0;
350 EXPORT_SYMBOL_GPL(intel_iommu_enabled);
351
352 static int dmar_map_gfx = 1;
353 static int dmar_forcedac;
354 static int intel_iommu_strict;
355 static int intel_iommu_superpage = 1;
356 static int iommu_identity_mapping;
357 static int iommu_skip_te_disable;
358
359 #define IDENTMAP_GFX            2
360 #define IDENTMAP_AZALIA         4
361
362 int intel_iommu_gfx_mapped;
363 EXPORT_SYMBOL_GPL(intel_iommu_gfx_mapped);
364
365 #define DEFER_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-2))
366 struct device_domain_info *get_domain_info(struct device *dev)
367 {
368         struct device_domain_info *info;
369
370         if (!dev)
371                 return NULL;
372
373         info = dev_iommu_priv_get(dev);
374         if (unlikely(info == DEFER_DEVICE_DOMAIN_INFO))
375                 return NULL;
376
377         return info;
378 }
379
380 DEFINE_SPINLOCK(device_domain_lock);
381 static LIST_HEAD(device_domain_list);
382
383 /*
384  * Iterate over elements in device_domain_list and call the specified
385  * callback @fn against each element.
386  */
387 int for_each_device_domain(int (*fn)(struct device_domain_info *info,
388                                      void *data), void *data)
389 {
390         int ret = 0;
391         unsigned long flags;
392         struct device_domain_info *info;
393
394         spin_lock_irqsave(&device_domain_lock, flags);
395         list_for_each_entry(info, &device_domain_list, global) {
396                 ret = fn(info, data);
397                 if (ret) {
398                         spin_unlock_irqrestore(&device_domain_lock, flags);
399                         return ret;
400                 }
401         }
402         spin_unlock_irqrestore(&device_domain_lock, flags);
403
404         return 0;
405 }
406
407 const struct iommu_ops intel_iommu_ops;
408
409 static bool translation_pre_enabled(struct intel_iommu *iommu)
410 {
411         return (iommu->flags & VTD_FLAG_TRANS_PRE_ENABLED);
412 }
413
414 static void clear_translation_pre_enabled(struct intel_iommu *iommu)
415 {
416         iommu->flags &= ~VTD_FLAG_TRANS_PRE_ENABLED;
417 }
418
419 static void init_translation_status(struct intel_iommu *iommu)
420 {
421         u32 gsts;
422
423         gsts = readl(iommu->reg + DMAR_GSTS_REG);
424         if (gsts & DMA_GSTS_TES)
425                 iommu->flags |= VTD_FLAG_TRANS_PRE_ENABLED;
426 }
427
428 static int __init intel_iommu_setup(char *str)
429 {
430         if (!str)
431                 return -EINVAL;
432         while (*str) {
433                 if (!strncmp(str, "on", 2)) {
434                         dmar_disabled = 0;
435                         pr_info("IOMMU enabled\n");
436                 } else if (!strncmp(str, "off", 3)) {
437                         dmar_disabled = 1;
438                         no_platform_optin = 1;
439                         pr_info("IOMMU disabled\n");
440                 } else if (!strncmp(str, "igfx_off", 8)) {
441                         dmar_map_gfx = 0;
442                         pr_info("Disable GFX device mapping\n");
443                 } else if (!strncmp(str, "forcedac", 8)) {
444                         pr_info("Forcing DAC for PCI devices\n");
445                         dmar_forcedac = 1;
446                 } else if (!strncmp(str, "strict", 6)) {
447                         pr_info("Disable batched IOTLB flush\n");
448                         intel_iommu_strict = 1;
449                 } else if (!strncmp(str, "sp_off", 6)) {
450                         pr_info("Disable supported super page\n");
451                         intel_iommu_superpage = 0;
452                 } else if (!strncmp(str, "sm_on", 5)) {
453                         pr_info("Intel-IOMMU: scalable mode supported\n");
454                         intel_iommu_sm = 1;
455                 } else if (!strncmp(str, "tboot_noforce", 13)) {
456                         pr_info("Intel-IOMMU: not forcing on after tboot. This could expose security risk for tboot\n");
457                         intel_iommu_tboot_noforce = 1;
458                 }
459
460                 str += strcspn(str, ",");
461                 while (*str == ',')
462                         str++;
463         }
464         return 0;
465 }
466 __setup("intel_iommu=", intel_iommu_setup);
467
468 static struct kmem_cache *iommu_domain_cache;
469 static struct kmem_cache *iommu_devinfo_cache;
470
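/*
 * Per-IOMMU domain-id to dmar_domain lookup.  iommu->domains is a
 * two-level array: the upper 8 bits of the domain id select a lazily
 * allocated page of 256 pointers, the lower 8 bits index into it.
 */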
471 static struct dmar_domain* get_iommu_domain(struct intel_iommu *iommu, u16 did)
472 {
473         struct dmar_domain **domains;
474         int idx = did >> 8;
475
476         domains = iommu->domains[idx];
477         if (!domains)
478                 return NULL;
479
480         return domains[did & 0xff];
481 }
482
483 static void set_iommu_domain(struct intel_iommu *iommu, u16 did,
484                              struct dmar_domain *domain)
485 {
486         struct dmar_domain **domains;
487         int idx = did >> 8;
488
489         if (!iommu->domains[idx]) {
490                 size_t size = 256 * sizeof(struct dmar_domain *);
491                 iommu->domains[idx] = kzalloc(size, GFP_ATOMIC);
492         }
493
494         domains = iommu->domains[idx];
495         if (WARN_ON(!domains))
496                 return;
497         else
498                 domains[did & 0xff] = domain;
499 }
500
501 void *alloc_pgtable_page(int node)
502 {
503         struct page *page;
504         void *vaddr = NULL;
505
506         page = alloc_pages_node(node, GFP_ATOMIC | __GFP_ZERO, 0);
507         if (page)
508                 vaddr = page_address(page);
509         return vaddr;
510 }
511
512 void free_pgtable_page(void *vaddr)
513 {
514         free_page((unsigned long)vaddr);
515 }
516
517 static inline void *alloc_domain_mem(void)
518 {
519         return kmem_cache_alloc(iommu_domain_cache, GFP_ATOMIC);
520 }
521
522 static void free_domain_mem(void *vaddr)
523 {
524         kmem_cache_free(iommu_domain_cache, vaddr);
525 }
526
527 static inline void * alloc_devinfo_mem(void)
528 {
529         return kmem_cache_alloc(iommu_devinfo_cache, GFP_ATOMIC);
530 }
531
532 static inline void free_devinfo_mem(void *vaddr)
533 {
534         kmem_cache_free(iommu_devinfo_cache, vaddr);
535 }
536
537 static inline int domain_type_is_si(struct dmar_domain *domain)
538 {
539         return domain->flags & DOMAIN_FLAG_STATIC_IDENTITY;
540 }
541
542 static inline bool domain_use_first_level(struct dmar_domain *domain)
543 {
544         return domain->flags & DOMAIN_FLAG_USE_FIRST_LEVEL;
545 }
546
547 static inline int domain_pfn_supported(struct dmar_domain *domain,
548                                        unsigned long pfn)
549 {
550         int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
551
552         return !(addr_width < BITS_PER_LONG && pfn >> addr_width);
553 }
554
555 static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw)
556 {
557         unsigned long sagaw;
558         int agaw = -1;
559
560         sagaw = cap_sagaw(iommu->cap);
561         for (agaw = width_to_agaw(max_gaw);
562              agaw >= 0; agaw--) {
563                 if (test_bit(agaw, &sagaw))
564                         break;
565         }
566
567         return agaw;
568 }
569
570 /*
571  * Calculate max SAGAW for each iommu.
572  */
573 int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
574 {
575         return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH);
576 }
577
578 /*
579  * calculate agaw for each iommu.
580  * Calculate agaw for each iommu.
581  * "SAGAW" may be different across iommus; use a default agaw, and fall
582  * back to a smaller supported agaw for iommus that don't support it.
583 int iommu_calculate_agaw(struct intel_iommu *iommu)
584 {
585         return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH);
586 }
587
588 /* This function only returns a single iommu in a domain */
589 struct intel_iommu *domain_get_iommu(struct dmar_domain *domain)
590 {
591         int iommu_id;
592
593         /* si_domain and vm domain should not get here. */
594         if (WARN_ON(domain->domain.type != IOMMU_DOMAIN_DMA))
595                 return NULL;
596
597         for_each_domain_iommu(iommu_id, domain)
598                 break;
599
600         if (iommu_id < 0 || iommu_id >= g_num_of_iommus)
601                 return NULL;
602
603         return g_iommus[iommu_id];
604 }
605
606 static inline bool iommu_paging_structure_coherency(struct intel_iommu *iommu)
607 {
608         return sm_supported(iommu) ?
609                         ecap_smpwc(iommu->ecap) : ecap_coherent(iommu->ecap);
610 }
611
612 static void domain_update_iommu_coherency(struct dmar_domain *domain)
613 {
614         struct dmar_drhd_unit *drhd;
615         struct intel_iommu *iommu;
616         bool found = false;
617         int i;
618
619         domain->iommu_coherency = 1;
620
621         for_each_domain_iommu(i, domain) {
622                 found = true;
623                 if (!iommu_paging_structure_coherency(g_iommus[i])) {
624                         domain->iommu_coherency = 0;
625                         break;
626                 }
627         }
628         if (found)
629                 return;
630
631         /* No hardware attached; use lowest common denominator */
632         rcu_read_lock();
633         for_each_active_iommu(iommu, drhd) {
634                 if (!iommu_paging_structure_coherency(iommu)) {
635                         domain->iommu_coherency = 0;
636                         break;
637                 }
638         }
639         rcu_read_unlock();
640 }
641
642 static int domain_update_iommu_snooping(struct intel_iommu *skip)
643 {
644         struct dmar_drhd_unit *drhd;
645         struct intel_iommu *iommu;
646         int ret = 1;
647
648         rcu_read_lock();
649         for_each_active_iommu(iommu, drhd) {
650                 if (iommu != skip) {
651                         if (!ecap_sc_support(iommu->ecap)) {
652                                 ret = 0;
653                                 break;
654                         }
655                 }
656         }
657         rcu_read_unlock();
658
659         return ret;
660 }
661
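/*
 * Return the number of superpage levels supported by every active IOMMU
 * (other than @skip): 1 means 2MiB pages, 2 means 1GiB pages, and 0
 * means superpages are disabled or not supported.
 */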
662 static int domain_update_iommu_superpage(struct dmar_domain *domain,
663                                          struct intel_iommu *skip)
664 {
665         struct dmar_drhd_unit *drhd;
666         struct intel_iommu *iommu;
667         int mask = 0x3;
668
669         if (!intel_iommu_superpage) {
670                 return 0;
671         }
672
673         /* set iommu_superpage to the smallest common denominator */
674         rcu_read_lock();
675         for_each_active_iommu(iommu, drhd) {
676                 if (iommu != skip) {
677                         if (domain && domain_use_first_level(domain)) {
678                                 if (!cap_fl1gp_support(iommu->cap))
679                                         mask = 0x1;
680                         } else {
681                                 mask &= cap_super_page_val(iommu->cap);
682                         }
683
684                         if (!mask)
685                                 break;
686                 }
687         }
688         rcu_read_unlock();
689
690         return fls(mask);
691 }
692
693 static int domain_update_device_node(struct dmar_domain *domain)
694 {
695         struct device_domain_info *info;
696         int nid = NUMA_NO_NODE;
697
698         assert_spin_locked(&device_domain_lock);
699
700         if (list_empty(&domain->devices))
701                 return NUMA_NO_NODE;
702
703         list_for_each_entry(info, &domain->devices, link) {
704                 if (!info->dev)
705                         continue;
706
707                 /*
708                  * There could be multiple device NUMA nodes, as devices within
709                  * the same domain may sit behind different IOMMUs. There is no
710                  * perfect answer in such a situation, so we use a first come,
711                  * first served policy.
712                  */
713                 nid = dev_to_node(info->dev);
714                 if (nid != NUMA_NO_NODE)
715                         break;
716         }
717
718         return nid;
719 }
720
721 static void domain_update_iotlb(struct dmar_domain *domain);
722
723 /* Some capabilities may be different across iommus */
724 static void domain_update_iommu_cap(struct dmar_domain *domain)
725 {
726         domain_update_iommu_coherency(domain);
727         domain->iommu_snooping = domain_update_iommu_snooping(NULL);
728         domain->iommu_superpage = domain_update_iommu_superpage(domain, NULL);
729
730         /*
731          * If RHSA is missing, we should default to the device NUMA node
732          * as a fallback.
733          */
734         if (domain->nid == NUMA_NO_NODE)
735                 domain->nid = domain_update_device_node(domain);
736
737         /*
738          * First-level translation restricts the input-address to a
739          * canonical address (i.e., address bits 63:N have the same
740          * value as address bit [N-1], where N is 48 bits with 4-level
741          * paging and 57 bits with 5-level paging). Hence, skip bit
742          * [N-1].
743          */
744         if (domain_use_first_level(domain))
745                 domain->domain.geometry.aperture_end = __DOMAIN_MAX_ADDR(domain->gaw - 1);
746         else
747                 domain->domain.geometry.aperture_end = __DOMAIN_MAX_ADDR(domain->gaw);
748
749         domain_update_iotlb(domain);
750 }
751
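/*
 * Return the context entry for (@bus, @devfn), allocating the context
 * table on demand when @alloc is set.  In scalable mode each root entry
 * covers only 128 devices (lower half via root->lo, upper half via
 * root->hi) and each device occupies two consecutive context-entry
 * slots.
 */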
752 struct context_entry *iommu_context_addr(struct intel_iommu *iommu, u8 bus,
753                                          u8 devfn, int alloc)
754 {
755         struct root_entry *root = &iommu->root_entry[bus];
756         struct context_entry *context;
757         u64 *entry;
758
759         entry = &root->lo;
760         if (sm_supported(iommu)) {
761                 if (devfn >= 0x80) {
762                         devfn -= 0x80;
763                         entry = &root->hi;
764                 }
765                 devfn *= 2;
766         }
767         if (*entry & 1)
768                 context = phys_to_virt(*entry & VTD_PAGE_MASK);
769         else {
770                 unsigned long phy_addr;
771                 if (!alloc)
772                         return NULL;
773
774                 context = alloc_pgtable_page(iommu->node);
775                 if (!context)
776                         return NULL;
777
778                 __iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
779                 phy_addr = virt_to_phys((void *)context);
780                 *entry = phy_addr | 1;
781                 __iommu_flush_cache(iommu, entry, sizeof(*entry));
782         }
783         return &context[devfn];
784 }
785
786 static bool attach_deferred(struct device *dev)
787 {
788         return dev_iommu_priv_get(dev) == DEFER_DEVICE_DOMAIN_INFO;
789 }
790
791 /**
792  * is_downstream_to_pci_bridge - test if a device belongs to the PCI
793  *                               sub-hierarchy of a candidate PCI-PCI bridge
794  * @dev: candidate PCI device belonging to @bridge PCI sub-hierarchy
795  * @bridge: the candidate PCI-PCI bridge
796  *
797  * Return: true if @dev belongs to @bridge PCI sub-hierarchy, else false.
798  */
799 static bool
800 is_downstream_to_pci_bridge(struct device *dev, struct device *bridge)
801 {
802         struct pci_dev *pdev, *pbridge;
803
804         if (!dev_is_pci(dev) || !dev_is_pci(bridge))
805                 return false;
806
807         pdev = to_pci_dev(dev);
808         pbridge = to_pci_dev(bridge);
809
810         if (pbridge->subordinate &&
811             pbridge->subordinate->number <= pdev->bus->number &&
812             pbridge->subordinate->busn_res.end >= pdev->bus->number)
813                 return true;
814
815         return false;
816 }
817
818 static bool quirk_ioat_snb_local_iommu(struct pci_dev *pdev)
819 {
820         struct dmar_drhd_unit *drhd;
821         u32 vtbar;
822         int rc;
823
824         /* We know that this device on this chipset has its own IOMMU.
825          * If we find it under a different IOMMU, then the BIOS is lying
826          * to us. Hope that the IOMMU for this device is actually
827          * disabled, and it needs no translation...
828          */
829         rc = pci_bus_read_config_dword(pdev->bus, PCI_DEVFN(0, 0), 0xb0, &vtbar);
830         if (rc) {
831                 /* "can't" happen */
832                 dev_info(&pdev->dev, "failed to run vt-d quirk\n");
833                 return false;
834         }
835         vtbar &= 0xffff0000;
836
837         /* we know that this iommu should be at offset 0xa000 from vtbar */
838         drhd = dmar_find_matched_drhd_unit(pdev);
839         if (!drhd || drhd->reg_base_addr - vtbar != 0xa000) {
840                 pr_warn_once(FW_BUG "BIOS assigned incorrect VT-d unit for Intel(R) QuickData Technology device\n");
841                 add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
842                 return true;
843         }
844
845         return false;
846 }
847
848 static bool iommu_is_dummy(struct intel_iommu *iommu, struct device *dev)
849 {
850         if (!iommu || iommu->drhd->ignored)
851                 return true;
852
853         if (dev_is_pci(dev)) {
854                 struct pci_dev *pdev = to_pci_dev(dev);
855
856                 if (pdev->vendor == PCI_VENDOR_ID_INTEL &&
857                     pdev->device == PCI_DEVICE_ID_INTEL_IOAT_SNB &&
858                     quirk_ioat_snb_local_iommu(pdev))
859                         return true;
860         }
861
862         return false;
863 }
864
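/*
 * Look up the IOMMU that covers @dev by walking the DMAR device scopes.
 * VFs are looked up via their PF and ACPI devices via their companion.
 * On success, *bus and *devfn are set to the source-id that should be
 * used when programming context entries for the device.
 */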
865 struct intel_iommu *device_to_iommu(struct device *dev, u8 *bus, u8 *devfn)
866 {
867         struct dmar_drhd_unit *drhd = NULL;
868         struct pci_dev *pdev = NULL;
869         struct intel_iommu *iommu;
870         struct device *tmp;
871         u16 segment = 0;
872         int i;
873
874         if (!dev)
875                 return NULL;
876
877         if (dev_is_pci(dev)) {
878                 struct pci_dev *pf_pdev;
879
880                 pdev = pci_real_dma_dev(to_pci_dev(dev));
881
882                 /* VFs aren't listed in scope tables; we need to look up
883                  * the PF instead to find the IOMMU. */
884                 pf_pdev = pci_physfn(pdev);
885                 dev = &pf_pdev->dev;
886                 segment = pci_domain_nr(pdev->bus);
887         } else if (has_acpi_companion(dev))
888                 dev = &ACPI_COMPANION(dev)->dev;
889
890         rcu_read_lock();
891         for_each_iommu(iommu, drhd) {
892                 if (pdev && segment != drhd->segment)
893                         continue;
894
895                 for_each_active_dev_scope(drhd->devices,
896                                           drhd->devices_cnt, i, tmp) {
897                         if (tmp == dev) {
898                                 /* For a VF use its original BDF# not that of the PF
899                                  * which we used for the IOMMU lookup. Strictly speaking
900                                  * we could do this for all PCI devices; we only need to
901                                  * get the BDF# from the scope table for ACPI matches. */
902                                 if (pdev && pdev->is_virtfn)
903                                         goto got_pdev;
904
905                                 if (bus && devfn) {
906                                         *bus = drhd->devices[i].bus;
907                                         *devfn = drhd->devices[i].devfn;
908                                 }
909                                 goto out;
910                         }
911
912                         if (is_downstream_to_pci_bridge(dev, tmp))
913                                 goto got_pdev;
914                 }
915
916                 if (pdev && drhd->include_all) {
917                 got_pdev:
918                         if (bus && devfn) {
919                                 *bus = pdev->bus->number;
920                                 *devfn = pdev->devfn;
921                         }
922                         goto out;
923                 }
924         }
925         iommu = NULL;
926  out:
927         if (iommu_is_dummy(iommu, dev))
928                 iommu = NULL;
929
930         rcu_read_unlock();
931
932         return iommu;
933 }
934
935 static void domain_flush_cache(struct dmar_domain *domain,
936                                void *addr, int size)
937 {
938         if (!domain->iommu_coherency)
939                 clflush_cache_range(addr, size);
940 }
941
942 static int device_context_mapped(struct intel_iommu *iommu, u8 bus, u8 devfn)
943 {
944         struct context_entry *context;
945         int ret = 0;
946         unsigned long flags;
947
948         spin_lock_irqsave(&iommu->lock, flags);
949         context = iommu_context_addr(iommu, bus, devfn, 0);
950         if (context)
951                 ret = context_present(context);
952         spin_unlock_irqrestore(&iommu->lock, flags);
953         return ret;
954 }
955
956 static void free_context_table(struct intel_iommu *iommu)
957 {
958         int i;
959         unsigned long flags;
960         struct context_entry *context;
961
962         spin_lock_irqsave(&iommu->lock, flags);
963         if (!iommu->root_entry) {
964                 goto out;
965         }
966         for (i = 0; i < ROOT_ENTRY_NR; i++) {
967                 context = iommu_context_addr(iommu, i, 0, 0);
968                 if (context)
969                         free_pgtable_page(context);
970
971                 if (!sm_supported(iommu))
972                         continue;
973
974                 context = iommu_context_addr(iommu, i, 0x80, 0);
975                 if (context)
976                         free_pgtable_page(context);
977
978         }
979         free_pgtable_page(iommu->root_entry);
980         iommu->root_entry = NULL;
981 out:
982         spin_unlock_irqrestore(&iommu->lock, flags);
983 }
984
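/*
 * Walk (and, if needed, build) the page table down to *target_level and
 * return the PTE mapping @pfn there.  A *target_level of 0 means "stop
 * at the first superpage or non-present entry" and the level actually
 * reached is reported back through *target_level.  Returns NULL if @pfn
 * is beyond the domain's address width or if allocation fails.
 */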
985 static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
986                                       unsigned long pfn, int *target_level)
987 {
988         struct dma_pte *parent, *pte;
989         int level = agaw_to_level(domain->agaw);
990         int offset;
991
992         BUG_ON(!domain->pgd);
993
994         if (!domain_pfn_supported(domain, pfn))
995                 /* Address beyond IOMMU's addressing capabilities. */
996                 return NULL;
997
998         parent = domain->pgd;
999
1000         while (1) {
1001                 void *tmp_page;
1002
1003                 offset = pfn_level_offset(pfn, level);
1004                 pte = &parent[offset];
1005                 if (!*target_level && (dma_pte_superpage(pte) || !dma_pte_present(pte)))
1006                         break;
1007                 if (level == *target_level)
1008                         break;
1009
1010                 if (!dma_pte_present(pte)) {
1011                         uint64_t pteval;
1012
1013                         tmp_page = alloc_pgtable_page(domain->nid);
1014
1015                         if (!tmp_page)
1016                                 return NULL;
1017
1018                         domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE);
1019                         pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE;
1020                         if (domain_use_first_level(domain))
1021                                 pteval |= DMA_FL_PTE_XD | DMA_FL_PTE_US;
1022                         if (cmpxchg64(&pte->val, 0ULL, pteval))
1023                                 /* Someone else set it while we were thinking; use theirs. */
1024                                 free_pgtable_page(tmp_page);
1025                         else
1026                                 domain_flush_cache(domain, pte, sizeof(*pte));
1027                 }
1028                 if (level == 1)
1029                         break;
1030
1031                 parent = phys_to_virt(dma_pte_addr(pte));
1032                 level--;
1033         }
1034
1035         if (!*target_level)
1036                 *target_level = level;
1037
1038         return pte;
1039 }
1040
1041 /* return the address's pte at a specific level */
1042 static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain,
1043                                          unsigned long pfn,
1044                                          int level, int *large_page)
1045 {
1046         struct dma_pte *parent, *pte;
1047         int total = agaw_to_level(domain->agaw);
1048         int offset;
1049
1050         parent = domain->pgd;
1051         while (level <= total) {
1052                 offset = pfn_level_offset(pfn, total);
1053                 pte = &parent[offset];
1054                 if (level == total)
1055                         return pte;
1056
1057                 if (!dma_pte_present(pte)) {
1058                         *large_page = total;
1059                         break;
1060                 }
1061
1062                 if (dma_pte_superpage(pte)) {
1063                         *large_page = total;
1064                         return pte;
1065                 }
1066
1067                 parent = phys_to_virt(dma_pte_addr(pte));
1068                 total--;
1069         }
1070         return NULL;
1071 }
1072
1073 /* clear last level (leaf) ptes; an IOTLB flush must follow */
1074 static void dma_pte_clear_range(struct dmar_domain *domain,
1075                                 unsigned long start_pfn,
1076                                 unsigned long last_pfn)
1077 {
1078         unsigned int large_page;
1079         struct dma_pte *first_pte, *pte;
1080
1081         BUG_ON(!domain_pfn_supported(domain, start_pfn));
1082         BUG_ON(!domain_pfn_supported(domain, last_pfn));
1083         BUG_ON(start_pfn > last_pfn);
1084
1085         /* we don't need lock here; nobody else touches the iova range */
1086         do {
1087                 large_page = 1;
1088                 first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1, &large_page);
1089                 if (!pte) {
1090                         start_pfn = align_to_level(start_pfn + 1, large_page + 1);
1091                         continue;
1092                 }
1093                 do {
1094                         dma_clear_pte(pte);
1095                         start_pfn += lvl_to_nr_pages(large_page);
1096                         pte++;
1097                 } while (start_pfn <= last_pfn && !first_pte_in_page(pte));
1098
1099                 domain_flush_cache(domain, first_pte,
1100                                    (void *)pte - (void *)first_pte);
1101
1102         } while (start_pfn && start_pfn <= last_pfn);
1103 }
1104
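/*
 * Recursively free page-table pages covering [start_pfn, last_pfn]
 * below @retain_level.  A table page is only freed when the range
 * covers it entirely; partially covered tables are descended into
 * instead.
 */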
1105 static void dma_pte_free_level(struct dmar_domain *domain, int level,
1106                                int retain_level, struct dma_pte *pte,
1107                                unsigned long pfn, unsigned long start_pfn,
1108                                unsigned long last_pfn)
1109 {
1110         pfn = max(start_pfn, pfn);
1111         pte = &pte[pfn_level_offset(pfn, level)];
1112
1113         do {
1114                 unsigned long level_pfn;
1115                 struct dma_pte *level_pte;
1116
1117                 if (!dma_pte_present(pte) || dma_pte_superpage(pte))
1118                         goto next;
1119
1120                 level_pfn = pfn & level_mask(level);
1121                 level_pte = phys_to_virt(dma_pte_addr(pte));
1122
1123                 if (level > 2) {
1124                         dma_pte_free_level(domain, level - 1, retain_level,
1125                                            level_pte, level_pfn, start_pfn,
1126                                            last_pfn);
1127                 }
1128
1129                 /*
1130                  * Free the page table if we're below the level we want to
1131                  * retain and the range covers the entire table.
1132                  */
1133                 if (level < retain_level && !(start_pfn > level_pfn ||
1134                       last_pfn < level_pfn + level_size(level) - 1)) {
1135                         dma_clear_pte(pte);
1136                         domain_flush_cache(domain, pte, sizeof(*pte));
1137                         free_pgtable_page(level_pte);
1138                 }
1139 next:
1140                 pfn += level_size(level);
1141         } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1142 }
1143
1144 /*
1145  * clear last level (leaf) ptes and free page table pages below the
1146  * level we wish to keep intact.
1147  */
1148 static void dma_pte_free_pagetable(struct dmar_domain *domain,
1149                                    unsigned long start_pfn,
1150                                    unsigned long last_pfn,
1151                                    int retain_level)
1152 {
1153         BUG_ON(!domain_pfn_supported(domain, start_pfn));
1154         BUG_ON(!domain_pfn_supported(domain, last_pfn));
1155         BUG_ON(start_pfn > last_pfn);
1156
1157         dma_pte_clear_range(domain, start_pfn, last_pfn);
1158
1159         /* We don't need lock here; nobody else touches the iova range */
1160         dma_pte_free_level(domain, agaw_to_level(domain->agaw), retain_level,
1161                            domain->pgd, 0, start_pfn, last_pfn);
1162
1163         /* free pgd */
1164         if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1165                 free_pgtable_page(domain->pgd);
1166                 domain->pgd = NULL;
1167         }
1168 }
1169
1170 /* When a page at a given level is being unlinked from its parent, we don't
1171    need to *modify* it at all. All we need to do is make a list of all the
1172    pages which can be freed just as soon as we've flushed the IOTLB and we
1173    know the hardware page-walk will no longer touch them.
1174    The 'pte' argument is the *parent* PTE, pointing to the page that is to
1175    be freed. */
1176 static struct page *dma_pte_list_pagetables(struct dmar_domain *domain,
1177                                             int level, struct dma_pte *pte,
1178                                             struct page *freelist)
1179 {
1180         struct page *pg;
1181
1182         pg = pfn_to_page(dma_pte_addr(pte) >> PAGE_SHIFT);
1183         pg->freelist = freelist;
1184         freelist = pg;
1185
1186         if (level == 1)
1187                 return freelist;
1188
1189         pte = page_address(pg);
1190         do {
1191                 if (dma_pte_present(pte) && !dma_pte_superpage(pte))
1192                         freelist = dma_pte_list_pagetables(domain, level - 1,
1193                                                            pte, freelist);
1194                 pte++;
1195         } while (!first_pte_in_page(pte));
1196
1197         return freelist;
1198 }
1199
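/*
 * Clear PTEs in [start_pfn, last_pfn] at this level.  Page-table pages
 * that become entirely unused are not freed here; they are chained onto
 * @freelist so they can be released after the IOTLB flush.
 */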
1200 static struct page *dma_pte_clear_level(struct dmar_domain *domain, int level,
1201                                         struct dma_pte *pte, unsigned long pfn,
1202                                         unsigned long start_pfn,
1203                                         unsigned long last_pfn,
1204                                         struct page *freelist)
1205 {
1206         struct dma_pte *first_pte = NULL, *last_pte = NULL;
1207
1208         pfn = max(start_pfn, pfn);
1209         pte = &pte[pfn_level_offset(pfn, level)];
1210
1211         do {
1212                 unsigned long level_pfn;
1213
1214                 if (!dma_pte_present(pte))
1215                         goto next;
1216
1217                 level_pfn = pfn & level_mask(level);
1218
1219                 /* If range covers entire pagetable, free it */
1220                 if (start_pfn <= level_pfn &&
1221                     last_pfn >= level_pfn + level_size(level) - 1) {
1222                         /* These subordinate page tables are going away entirely. Don't
1223                            bother to clear them; we're just going to *free* them. */
1224                         if (level > 1 && !dma_pte_superpage(pte))
1225                                 freelist = dma_pte_list_pagetables(domain, level - 1, pte, freelist);
1226
1227                         dma_clear_pte(pte);
1228                         if (!first_pte)
1229                                 first_pte = pte;
1230                         last_pte = pte;
1231                 } else if (level > 1) {
1232                         /* Recurse down into a level that isn't *entirely* obsolete */
1233                         freelist = dma_pte_clear_level(domain, level - 1,
1234                                                        phys_to_virt(dma_pte_addr(pte)),
1235                                                        level_pfn, start_pfn, last_pfn,
1236                                                        freelist);
1237                 }
1238 next:
1239                 pfn += level_size(level);
1240         } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1241
1242         if (first_pte)
1243                 domain_flush_cache(domain, first_pte,
1244                                    (void *)++last_pte - (void *)first_pte);
1245
1246         return freelist;
1247 }
1248
1249 /* We can't just free the pages because the IOMMU may still be walking
1250    the page tables, and may have cached the intermediate levels. The
1251    pages can only be freed after the IOTLB flush has been done. */
1252 static struct page *domain_unmap(struct dmar_domain *domain,
1253                                  unsigned long start_pfn,
1254                                  unsigned long last_pfn,
1255                                  struct page *freelist)
1256 {
1257         BUG_ON(!domain_pfn_supported(domain, start_pfn));
1258         BUG_ON(!domain_pfn_supported(domain, last_pfn));
1259         BUG_ON(start_pfn > last_pfn);
1260
1261         /* we don't need lock here; nobody else touches the iova range */
1262         freelist = dma_pte_clear_level(domain, agaw_to_level(domain->agaw),
1263                                        domain->pgd, 0, start_pfn, last_pfn,
1264                                        freelist);
1265
1266         /* free pgd */
1267         if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1268                 struct page *pgd_page = virt_to_page(domain->pgd);
1269                 pgd_page->freelist = freelist;
1270                 freelist = pgd_page;
1271
1272                 domain->pgd = NULL;
1273         }
1274
1275         return freelist;
1276 }
1277
1278 static void dma_free_pagelist(struct page *freelist)
1279 {
1280         struct page *pg;
1281
1282         while ((pg = freelist)) {
1283                 freelist = pg->freelist;
1284                 free_pgtable_page(page_address(pg));
1285         }
1286 }
1287
1288 /* iommu handling */
1289 static int iommu_alloc_root_entry(struct intel_iommu *iommu)
1290 {
1291         struct root_entry *root;
1292         unsigned long flags;
1293
1294         root = (struct root_entry *)alloc_pgtable_page(iommu->node);
1295         if (!root) {
1296                 pr_err("Allocating root entry for %s failed\n",
1297                         iommu->name);
1298                 return -ENOMEM;
1299         }
1300
1301         __iommu_flush_cache(iommu, root, ROOT_SIZE);
1302
1303         spin_lock_irqsave(&iommu->lock, flags);
1304         iommu->root_entry = root;
1305         spin_unlock_irqrestore(&iommu->lock, flags);
1306
1307         return 0;
1308 }
1309
1310 static void iommu_set_root_entry(struct intel_iommu *iommu)
1311 {
1312         u64 addr;
1313         u32 sts;
1314         unsigned long flag;
1315
1316         addr = virt_to_phys(iommu->root_entry);
1317         if (sm_supported(iommu))
1318                 addr |= DMA_RTADDR_SMT;
1319
1320         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1321         dmar_writeq(iommu->reg + DMAR_RTADDR_REG, addr);
1322
1323         writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG);
1324
1325         /* Make sure hardware completes it */
1326         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1327                       readl, (sts & DMA_GSTS_RTPS), sts);
1328
1329         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1330 }
1331
1332 void iommu_flush_write_buffer(struct intel_iommu *iommu)
1333 {
1334         u32 val;
1335         unsigned long flag;
1336
1337         if (!rwbf_quirk && !cap_rwbf(iommu->cap))
1338                 return;
1339
1340         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1341         writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG);
1342
1343         /* Make sure hardware completes it */
1344         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1345                       readl, (!(val & DMA_GSTS_WBFS)), val);
1346
1347         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1348 }
1349
1350 /* return value determines if we need a write buffer flush */
1351 static void __iommu_flush_context(struct intel_iommu *iommu,
1352                                   u16 did, u16 source_id, u8 function_mask,
1353                                   u64 type)
1354 {
1355         u64 val = 0;
1356         unsigned long flag;
1357
1358         switch (type) {
1359         case DMA_CCMD_GLOBAL_INVL:
1360                 val = DMA_CCMD_GLOBAL_INVL;
1361                 break;
1362         case DMA_CCMD_DOMAIN_INVL:
1363                 val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
1364                 break;
1365         case DMA_CCMD_DEVICE_INVL:
1366                 val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
1367                         | DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
1368                 break;
1369         default:
1370                 BUG();
1371         }
1372         val |= DMA_CCMD_ICC;
1373
1374         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1375         dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
1376
1377         /* Make sure hardware completes it */
1378         IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
1379                 dmar_readq, (!(val & DMA_CCMD_ICC)), val);
1380
1381         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1382 }
1383
1384 /* return value determines if we need a write buffer flush */
1385 static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
1386                                 u64 addr, unsigned int size_order, u64 type)
1387 {
1388         int tlb_offset = ecap_iotlb_offset(iommu->ecap);
1389         u64 val = 0, val_iva = 0;
1390         unsigned long flag;
1391
1392         switch (type) {
1393         case DMA_TLB_GLOBAL_FLUSH:
1394                 /* a global flush doesn't need to set IVA_REG */
1395                 val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
1396                 break;
1397         case DMA_TLB_DSI_FLUSH:
1398                 val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1399                 break;
1400         case DMA_TLB_PSI_FLUSH:
1401                 val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1402                 /* IH bit is passed in as part of address */
1403                 val_iva = size_order | addr;
1404                 break;
1405         default:
1406                 BUG();
1407         }
1408         /* Note: set drain read/write */
1409 #if 0
1410         /*
1411          * This is probably only here to be extra safe. It looks like we
1412          * can ignore it without any impact.
1413          */
1414         if (cap_read_drain(iommu->cap))
1415                 val |= DMA_TLB_READ_DRAIN;
1416 #endif
1417         if (cap_write_drain(iommu->cap))
1418                 val |= DMA_TLB_WRITE_DRAIN;
1419
1420         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1421         /* Note: Only uses first TLB reg currently */
1422         if (val_iva)
1423                 dmar_writeq(iommu->reg + tlb_offset, val_iva);
1424         dmar_writeq(iommu->reg + tlb_offset + 8, val);
1425
1426         /* Make sure hardware completes it */
1427         IOMMU_WAIT_OP(iommu, tlb_offset + 8,
1428                 dmar_readq, (!(val & DMA_TLB_IVT)), val);
1429
1430         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1431
1432         /* check IOTLB invalidation granularity */
1433         if (DMA_TLB_IAIG(val) == 0)
1434                 pr_err("Flush IOTLB failed\n");
1435         if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
1436                 pr_debug("TLB flush request %Lx, actual %Lx\n",
1437                         (unsigned long long)DMA_TLB_IIRG(type),
1438                         (unsigned long long)DMA_TLB_IAIG(val));
1439 }
1440
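/*
 * Return the device_domain_info for (@bus, @devfn) if the device is
 * ATS-capable and queued invalidation is available, otherwise NULL.
 */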
1441 static struct device_domain_info *
1442 iommu_support_dev_iotlb(struct dmar_domain *domain, struct intel_iommu *iommu,
1443                          u8 bus, u8 devfn)
1444 {
1445         struct device_domain_info *info;
1446
1447         assert_spin_locked(&device_domain_lock);
1448
1449         if (!iommu->qi)
1450                 return NULL;
1451
1452         list_for_each_entry(info, &domain->devices, link)
1453                 if (info->iommu == iommu && info->bus == bus &&
1454                     info->devfn == devfn) {
1455                         if (info->ats_supported && info->dev)
1456                                 return info;
1457                         break;
1458                 }
1459
1460         return NULL;
1461 }
1462
1463 static void domain_update_iotlb(struct dmar_domain *domain)
1464 {
1465         struct device_domain_info *info;
1466         bool has_iotlb_device = false;
1467
1468         assert_spin_locked(&device_domain_lock);
1469
1470         list_for_each_entry(info, &domain->devices, link)
1471                 if (info->ats_enabled) {
1472                         has_iotlb_device = true;
1473                         break;
1474                 }
1475
1476         if (!has_iotlb_device) {
1477                 struct subdev_domain_info *sinfo;
1478
1479                 list_for_each_entry(sinfo, &domain->subdevices, link_domain) {
1480                         info = get_domain_info(sinfo->pdev);
1481                         if (info && info->ats_enabled) {
1482                                 has_iotlb_device = true;
1483                                 break;
1484                         }
1485                 }
1486         }
1487
1488         domain->has_iotlb_device = has_iotlb_device;
1489 }
1490
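/*
 * Enable ATS (and, when CONFIG_INTEL_IOMMU_SVM is set, PASID and PRI)
 * on a PCI device so that it may cache translations in its device
 * IOTLB.  The PFSID and ATS queue depth are recorded for later
 * device-IOTLB invalidations.
 */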
1491 static void iommu_enable_dev_iotlb(struct device_domain_info *info)
1492 {
1493         struct pci_dev *pdev;
1494
1495         assert_spin_locked(&device_domain_lock);
1496
1497         if (!info || !dev_is_pci(info->dev))
1498                 return;
1499
1500         pdev = to_pci_dev(info->dev);
1501         /* For IOMMUs that support device IOTLB throttling (DIT), we assign
1502          * PFSID to the invalidation desc of a VF such that IOMMU HW can gauge
1503          * queue depth at PF level. If DIT is not set, PFSID will be treated as
1504          * reserved, which should be set to 0.
1505          */
1506         if (!ecap_dit(info->iommu->ecap))
1507                 info->pfsid = 0;
1508         else {
1509                 struct pci_dev *pf_pdev;
1510
1511                 /* pdev will be returned if device is not a vf */
1512                 pf_pdev = pci_physfn(pdev);
1513                 info->pfsid = pci_dev_id(pf_pdev);
1514         }
1515
1516 #ifdef CONFIG_INTEL_IOMMU_SVM
1517         /* The PCIe spec, in its wisdom, declares that the behaviour of
1518            the device if you enable PASID support after ATS support is
1519            undefined. So always enable PASID support on devices which
1520            have it, even if we can't yet know if we're ever going to
1521            use it. */
1522         if (info->pasid_supported && !pci_enable_pasid(pdev, info->pasid_supported & ~1))
1523                 info->pasid_enabled = 1;
1524
1525         if (info->pri_supported &&
1526             (info->pasid_enabled ? pci_prg_resp_pasid_required(pdev) : 1)  &&
1527             !pci_reset_pri(pdev) && !pci_enable_pri(pdev, 32))
1528                 info->pri_enabled = 1;
1529 #endif
1530         if (info->ats_supported && pci_ats_page_aligned(pdev) &&
1531             !pci_enable_ats(pdev, VTD_PAGE_SHIFT)) {
1532                 info->ats_enabled = 1;
1533                 domain_update_iotlb(info->domain);
1534                 info->ats_qdep = pci_ats_queue_depth(pdev);
1535         }
1536 }
1537
1538 static void iommu_disable_dev_iotlb(struct device_domain_info *info)
1539 {
1540         struct pci_dev *pdev;
1541
1542         assert_spin_locked(&device_domain_lock);
1543
1544         if (!dev_is_pci(info->dev))
1545                 return;
1546
1547         pdev = to_pci_dev(info->dev);
1548
1549         if (info->ats_enabled) {
1550                 pci_disable_ats(pdev);
1551                 info->ats_enabled = 0;
1552                 domain_update_iotlb(info->domain);
1553         }
1554 #ifdef CONFIG_INTEL_IOMMU_SVM
1555         if (info->pri_enabled) {
1556                 pci_disable_pri(pdev);
1557                 info->pri_enabled = 0;
1558         }
1559         if (info->pasid_enabled) {
1560                 pci_disable_pasid(pdev);
1561                 info->pasid_enabled = 0;
1562         }
1563 #endif
1564 }
1565
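/*
 * Illustrative note (not part of the original comments): the source-id used
 * by the helper below is simply bus << 8 | devfn. For a device at 3a:02.0,
 * for example, bus = 0x3a and devfn = PCI_DEVFN(2, 0) = 0x10, giving
 * sid = 0x3a10. The queue depth (qdep) is the value cached from
 * pci_ats_queue_depth() when ATS was enabled.
 */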
1566 static void __iommu_flush_dev_iotlb(struct device_domain_info *info,
1567                                     u64 addr, unsigned int mask)
1568 {
1569         u16 sid, qdep;
1570
1571         if (!info || !info->ats_enabled)
1572                 return;
1573
1574         sid = info->bus << 8 | info->devfn;
1575         qdep = info->ats_qdep;
1576         qi_flush_dev_iotlb(info->iommu, sid, info->pfsid,
1577                            qdep, addr, mask);
1578 }
1579
1580 static void iommu_flush_dev_iotlb(struct dmar_domain *domain,
1581                                   u64 addr, unsigned mask)
1582 {
1583         unsigned long flags;
1584         struct device_domain_info *info;
1585         struct subdev_domain_info *sinfo;
1586
1587         if (!domain->has_iotlb_device)
1588                 return;
1589
1590         spin_lock_irqsave(&device_domain_lock, flags);
1591         list_for_each_entry(info, &domain->devices, link)
1592                 __iommu_flush_dev_iotlb(info, addr, mask);
1593
1594         list_for_each_entry(sinfo, &domain->subdevices, link_domain) {
1595                 info = get_domain_info(sinfo->pdev);
1596                 __iommu_flush_dev_iotlb(info, addr, mask);
1597         }
1598         spin_unlock_irqrestore(&device_domain_lock, flags);
1599 }
1600
1601 static void domain_flush_piotlb(struct intel_iommu *iommu,
1602                                 struct dmar_domain *domain,
1603                                 u64 addr, unsigned long npages, bool ih)
1604 {
1605         u16 did = domain->iommu_did[iommu->seq_id];
1606
1607         if (domain->default_pasid)
1608                 qi_flush_piotlb(iommu, did, domain->default_pasid,
1609                                 addr, npages, ih);
1610
1611         if (!list_empty(&domain->devices))
1612                 qi_flush_piotlb(iommu, did, PASID_RID2PASID, addr, npages, ih);
1613 }
1614
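/*
 * Worked example for the mask computation below (illustrative): for a range
 * of pages = 3 pages, __roundup_pow_of_two(3) = 4 and ilog2(4) = 2, so
 * mask = 2 and a naturally aligned 2^2 = 4 page region containing the range
 * is invalidated. When "ih" is set it is turned into bit 6 of the address
 * passed to the second-level IOTLB flush, which carries the invalidation
 * hint to the hardware.
 */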
1615 static void iommu_flush_iotlb_psi(struct intel_iommu *iommu,
1616                                   struct dmar_domain *domain,
1617                                   unsigned long pfn, unsigned int pages,
1618                                   int ih, int map)
1619 {
1620         unsigned int mask = ilog2(__roundup_pow_of_two(pages));
1621         uint64_t addr = (uint64_t)pfn << VTD_PAGE_SHIFT;
1622         u16 did = domain->iommu_did[iommu->seq_id];
1623
1624         BUG_ON(pages == 0);
1625
1626         if (ih)
1627                 ih = 1 << 6;
1628
1629         if (domain_use_first_level(domain)) {
1630                 domain_flush_piotlb(iommu, domain, addr, pages, ih);
1631         } else {
1632                 /*
1633                  * Fall back to domain-selective flush if there is no PSI support
1634                  * or the size is too big. PSI requires the page size to be a power
1635                  * of two and the base address to be naturally aligned to that size.
1636                  */
1637                 if (!cap_pgsel_inv(iommu->cap) ||
1638                     mask > cap_max_amask_val(iommu->cap))
1639                         iommu->flush.flush_iotlb(iommu, did, 0, 0,
1640                                                         DMA_TLB_DSI_FLUSH);
1641                 else
1642                         iommu->flush.flush_iotlb(iommu, did, addr | ih, mask,
1643                                                         DMA_TLB_PSI_FLUSH);
1644         }
1645
1646         /*
1647          * In caching mode, changes of pages from non-present to present require
1648          * a flush. However, the device IOTLB doesn't need to be flushed here.
1649          */
1650         if (!cap_caching_mode(iommu->cap) || !map)
1651                 iommu_flush_dev_iotlb(domain, addr, mask);
1652 }
1653
1654 /* Notification for newly created mappings */
1655 static inline void __mapping_notify_one(struct intel_iommu *iommu,
1656                                         struct dmar_domain *domain,
1657                                         unsigned long pfn, unsigned int pages)
1658 {
1659         /*
1660          * It's a non-present to present mapping. Only flush if caching mode
1661          * is enabled and the domain uses second-level translation.
1662          */
1663         if (cap_caching_mode(iommu->cap) && !domain_use_first_level(domain))
1664                 iommu_flush_iotlb_psi(iommu, domain, pfn, pages, 0, 1);
1665         else
1666                 iommu_flush_write_buffer(iommu);
1667 }
1668
1669 static void intel_flush_iotlb_all(struct iommu_domain *domain)
1670 {
1671         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
1672         int idx;
1673
1674         for_each_domain_iommu(idx, dmar_domain) {
1675                 struct intel_iommu *iommu = g_iommus[idx];
1676                 u16 did = dmar_domain->iommu_did[iommu->seq_id];
1677
1678                 if (domain_use_first_level(dmar_domain))
1679                         domain_flush_piotlb(iommu, dmar_domain, 0, -1, 0);
1680                 else
1681                         iommu->flush.flush_iotlb(iommu, did, 0, 0,
1682                                                  DMA_TLB_DSI_FLUSH);
1683
1684                 if (!cap_caching_mode(iommu->cap))
1685                         iommu_flush_dev_iotlb(get_iommu_domain(iommu, did),
1686                                               0, MAX_AGAW_PFN_WIDTH);
1687         }
1688 }
1689
1690 static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
1691 {
1692         u32 pmen;
1693         unsigned long flags;
1694
1695         if (!cap_plmr(iommu->cap) && !cap_phmr(iommu->cap))
1696                 return;
1697
1698         raw_spin_lock_irqsave(&iommu->register_lock, flags);
1699         pmen = readl(iommu->reg + DMAR_PMEN_REG);
1700         pmen &= ~DMA_PMEN_EPM;
1701         writel(pmen, iommu->reg + DMAR_PMEN_REG);
1702
1703         /* wait for the protected region status bit to clear */
1704         IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
1705                 readl, !(pmen & DMA_PMEN_PRS), pmen);
1706
1707         raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1708 }
1709
1710 static void iommu_enable_translation(struct intel_iommu *iommu)
1711 {
1712         u32 sts;
1713         unsigned long flags;
1714
1715         raw_spin_lock_irqsave(&iommu->register_lock, flags);
1716         iommu->gcmd |= DMA_GCMD_TE;
1717         writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1718
1719         /* Make sure the hardware completes it */
1720         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1721                       readl, (sts & DMA_GSTS_TES), sts);
1722
1723         raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1724 }
1725
1726 static void iommu_disable_translation(struct intel_iommu *iommu)
1727 {
1728         u32 sts;
1729         unsigned long flag;
1730
1731         if (iommu_skip_te_disable && iommu->drhd->gfx_dedicated &&
1732             (cap_read_drain(iommu->cap) || cap_write_drain(iommu->cap)))
1733                 return;
1734
1735         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1736         iommu->gcmd &= ~DMA_GCMD_TE;
1737         writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1738
1739         /* Make sure the hardware completes it */
1740         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1741                       readl, (!(sts & DMA_GSTS_TES)), sts);
1742
1743         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1744 }
1745
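/*
 * A sketch of the layout set up below (hedged, derived from the allocation
 * sizes in this function): domain pointers are kept in 256-entry chunks, so
 * iommu->domains holds ALIGN(ndomains, 256) >> 8 chunk pointers. With
 * ndomains = 65536, for example, that is 256 chunk pointers; only chunk 0
 * (domain-ids 0-255) is allocated eagerly here, and the remaining chunks are
 * presumably allocated on demand when a domain-id in their range is first
 * used.
 */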
1746 static int iommu_init_domains(struct intel_iommu *iommu)
1747 {
1748         u32 ndomains, nlongs;
1749         size_t size;
1750
1751         ndomains = cap_ndoms(iommu->cap);
1752         pr_debug("%s: Number of Domains supported <%d>\n",
1753                  iommu->name, ndomains);
1754         nlongs = BITS_TO_LONGS(ndomains);
1755
1756         spin_lock_init(&iommu->lock);
1757
1758         iommu->domain_ids = kcalloc(nlongs, sizeof(unsigned long), GFP_KERNEL);
1759         if (!iommu->domain_ids) {
1760                 pr_err("%s: Allocating domain id array failed\n",
1761                        iommu->name);
1762                 return -ENOMEM;
1763         }
1764
1765         size = (ALIGN(ndomains, 256) >> 8) * sizeof(struct dmar_domain **);
1766         iommu->domains = kzalloc(size, GFP_KERNEL);
1767
1768         if (iommu->domains) {
1769                 size = 256 * sizeof(struct dmar_domain *);
1770                 iommu->domains[0] = kzalloc(size, GFP_KERNEL);
1771         }
1772
1773         if (!iommu->domains || !iommu->domains[0]) {
1774                 pr_err("%s: Allocating domain array failed\n",
1775                        iommu->name);
1776                 kfree(iommu->domain_ids);
1777                 kfree(iommu->domains);
1778                 iommu->domain_ids = NULL;
1779                 iommu->domains    = NULL;
1780                 return -ENOMEM;
1781         }
1782
1783         /*
1784          * If Caching mode is set, then invalid translations are tagged
1785          * with domain-id 0, hence we need to pre-allocate it. We also
1786          * use domain-id 0 as a marker for non-allocated domain-id, so
1787          * make sure it is not used for a real domain.
1788          */
1789         set_bit(0, iommu->domain_ids);
1790
1791         /*
1792          * VT-d spec rev 3.0 (section 6.2.3.1) requires that each PASID
1793          * entry for first-level or pass-through translation modes should
1794          * be programmed with a domain id different from those used for
1795          * second-level or nested translation. We reserve a domain id for
1796          * this purpose.
1797          */
1798         if (sm_supported(iommu))
1799                 set_bit(FLPT_DEFAULT_DID, iommu->domain_ids);
1800
1801         return 0;
1802 }
1803
1804 static void disable_dmar_iommu(struct intel_iommu *iommu)
1805 {
1806         struct device_domain_info *info, *tmp;
1807         unsigned long flags;
1808
1809         if (!iommu->domains || !iommu->domain_ids)
1810                 return;
1811
1812         spin_lock_irqsave(&device_domain_lock, flags);
1813         list_for_each_entry_safe(info, tmp, &device_domain_list, global) {
1814                 if (info->iommu != iommu)
1815                         continue;
1816
1817                 if (!info->dev || !info->domain)
1818                         continue;
1819
1820                 __dmar_remove_one_dev_info(info);
1821         }
1822         spin_unlock_irqrestore(&device_domain_lock, flags);
1823
1824         if (iommu->gcmd & DMA_GCMD_TE)
1825                 iommu_disable_translation(iommu);
1826 }
1827
1828 static void free_dmar_iommu(struct intel_iommu *iommu)
1829 {
1830         if ((iommu->domains) && (iommu->domain_ids)) {
1831                 int elems = ALIGN(cap_ndoms(iommu->cap), 256) >> 8;
1832                 int i;
1833
1834                 for (i = 0; i < elems; i++)
1835                         kfree(iommu->domains[i]);
1836                 kfree(iommu->domains);
1837                 kfree(iommu->domain_ids);
1838                 iommu->domains = NULL;
1839                 iommu->domain_ids = NULL;
1840         }
1841
1842         g_iommus[iommu->seq_id] = NULL;
1843
1844         /* free context mapping */
1845         free_context_table(iommu);
1846
1847 #ifdef CONFIG_INTEL_IOMMU_SVM
1848         if (pasid_supported(iommu)) {
1849                 if (ecap_prs(iommu->ecap))
1850                         intel_svm_finish_prq(iommu);
1851         }
1852         if (vccap_pasid(iommu->vccap))
1853                 ioasid_unregister_allocator(&iommu->pasid_allocator);
1854
1855 #endif
1856 }
1857
1858 /*
1859  * Check and return whether first level is used by default for
1860  * DMA translation.
1861  */
1862 static bool first_level_by_default(void)
1863 {
1864         struct dmar_drhd_unit *drhd;
1865         struct intel_iommu *iommu;
1866         static int first_level_support = -1;
1867
1868         if (likely(first_level_support != -1))
1869                 return first_level_support;
1870
1871         first_level_support = 1;
1872
1873         rcu_read_lock();
1874         for_each_active_iommu(iommu, drhd) {
1875                 if (!sm_supported(iommu) || !ecap_flts(iommu->ecap)) {
1876                         first_level_support = 0;
1877                         break;
1878                 }
1879         }
1880         rcu_read_unlock();
1881
1882         return first_level_support;
1883 }
1884
1885 static struct dmar_domain *alloc_domain(int flags)
1886 {
1887         struct dmar_domain *domain;
1888
1889         domain = alloc_domain_mem();
1890         if (!domain)
1891                 return NULL;
1892
1893         memset(domain, 0, sizeof(*domain));
1894         domain->nid = NUMA_NO_NODE;
1895         domain->flags = flags;
1896         if (first_level_by_default())
1897                 domain->flags |= DOMAIN_FLAG_USE_FIRST_LEVEL;
1898         domain->has_iotlb_device = false;
1899         INIT_LIST_HEAD(&domain->devices);
1900         INIT_LIST_HEAD(&domain->subdevices);
1901
1902         return domain;
1903 }
1904
1905 /* Must be called with device_domain_lock and iommu->lock held */
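/*
 * Reference-counting sketch (descriptive, derived from the code below): the
 * first device of a domain that sits behind a given IOMMU allocates a
 * domain-id from that IOMMU's bitmap and records it in iommu_did[]; further
 * devices behind the same IOMMU only bump iommu_refcnt[]. The matching
 * domain_detach_iommu() releases the domain-id again once the last such
 * device goes away.
 */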
1906 static int domain_attach_iommu(struct dmar_domain *domain,
1907                                struct intel_iommu *iommu)
1908 {
1909         unsigned long ndomains;
1910         int num;
1911
1912         assert_spin_locked(&device_domain_lock);
1913         assert_spin_locked(&iommu->lock);
1914
1915         domain->iommu_refcnt[iommu->seq_id] += 1;
1916         domain->iommu_count += 1;
1917         if (domain->iommu_refcnt[iommu->seq_id] == 1) {
1918                 ndomains = cap_ndoms(iommu->cap);
1919                 num      = find_first_zero_bit(iommu->domain_ids, ndomains);
1920
1921                 if (num >= ndomains) {
1922                         pr_err("%s: No free domain ids\n", iommu->name);
1923                         domain->iommu_refcnt[iommu->seq_id] -= 1;
1924                         domain->iommu_count -= 1;
1925                         return -ENOSPC;
1926                 }
1927
1928                 set_bit(num, iommu->domain_ids);
1929                 set_iommu_domain(iommu, num, domain);
1930
1931                 domain->iommu_did[iommu->seq_id] = num;
1932                 domain->nid                      = iommu->node;
1933
1934                 domain_update_iommu_cap(domain);
1935         }
1936
1937         return 0;
1938 }
1939
1940 static int domain_detach_iommu(struct dmar_domain *domain,
1941                                struct intel_iommu *iommu)
1942 {
1943         int num, count;
1944
1945         assert_spin_locked(&device_domain_lock);
1946         assert_spin_locked(&iommu->lock);
1947
1948         domain->iommu_refcnt[iommu->seq_id] -= 1;
1949         count = --domain->iommu_count;
1950         if (domain->iommu_refcnt[iommu->seq_id] == 0) {
1951                 num = domain->iommu_did[iommu->seq_id];
1952                 clear_bit(num, iommu->domain_ids);
1953                 set_iommu_domain(iommu, num, NULL);
1954
1955                 domain_update_iommu_cap(domain);
1956                 domain->iommu_did[iommu->seq_id] = 0;
1957         }
1958
1959         return count;
1960 }
1961
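/*
 * Worked example (illustrative): each page-table level covers 9 address bits
 * on top of the 12-bit page offset, so the adjusted width is the guest width
 * rounded up to 12 + 9 * n. guestwidth_to_adjustwidth(48) returns 48
 * ((48 - 12) % 9 == 0), while guestwidth_to_adjustwidth(40) returns 48
 * (r = 1, so 40 + 9 - 1). Anything above 64 is clamped to 64.
 */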
1962 static inline int guestwidth_to_adjustwidth(int gaw)
1963 {
1964         int agaw;
1965         int r = (gaw - 12) % 9;
1966
1967         if (r == 0)
1968                 agaw = gaw;
1969         else
1970                 agaw = gaw + 9 - r;
1971         if (agaw > 64)
1972                 agaw = 64;
1973         return agaw;
1974 }
1975
1976 static void domain_exit(struct dmar_domain *domain)
1977 {
1978
1979         /* Remove associated devices and clear attached or cached domains */
1980         domain_remove_dev_info(domain);
1981
1982         /* destroy iovas */
1983         if (domain->domain.type == IOMMU_DOMAIN_DMA)
1984                 iommu_put_dma_cookie(&domain->domain);
1985
1986         if (domain->pgd) {
1987                 struct page *freelist;
1988
1989                 freelist = domain_unmap(domain, 0,
1990                                         DOMAIN_MAX_PFN(domain->gaw), NULL);
1991                 dma_free_pagelist(freelist);
1992         }
1993
1994         free_domain_mem(domain);
1995 }
1996
1997 /*
1998  * Get the PASID directory size for a scalable-mode context entry.
1999  * A value of X in the PDTS field of a scalable-mode context entry
2000  * indicates a PASID directory with 2^(X + 7) entries.
2001  */
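/*
 * Worked example (illustrative, assuming PASID_PDE_SHIFT is 6): with
 * table->max_pasid = 0x10000, max_pde = 0x10000 >> 6 = 1024 and
 * find_first_bit() returns 10, so the function below yields 10 - 7 = 3,
 * i.e. a PASID directory with 2^(3 + 7) = 1024 entries.
 */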
2002 static inline unsigned long context_get_sm_pds(struct pasid_table *table)
2003 {
2004         int pds, max_pde;
2005
2006         max_pde = table->max_pasid >> PASID_PDE_SHIFT;
2007         pds = find_first_bit((unsigned long *)&max_pde, MAX_NR_PASID_BITS);
2008         if (pds < 7)
2009                 return 0;
2010
2011         return pds - 7;
2012 }
2013
2014 /*
2015  * Set the RID_PASID field of a scalable-mode context entry. The
2016  * IOMMU hardware will use the PASID value programmed in this field
2017  * to translate DMA requests that carry no PASID.
2018  */
2019 static inline void
2020 context_set_sm_rid2pasid(struct context_entry *context, unsigned long pasid)
2021 {
2022         context->hi |= pasid & ((1 << 20) - 1);
2023 }
2024
2025 /*
2026  * Set the DTE (Device-TLB Enable) field of a scalable-mode context
2027  * entry.
2028  */
2029 static inline void context_set_sm_dte(struct context_entry *context)
2030 {
2031         context->lo |= (1 << 2);
2032 }
2033
2034 /*
2035  * Set the PRE (Page Request Enable) field of a scalable-mode context
2036  * entry.
2037  */
2038 static inline void context_set_sm_pre(struct context_entry *context)
2039 {
2040         context->lo |= (1 << 4);
2041 }
2042
2043 /* Convert value to context PASID directory size field coding. */
2044 #define context_pdts(pds)       (((pds) & 0x7) << 9)
2045
2046 static int domain_context_mapping_one(struct dmar_domain *domain,
2047                                       struct intel_iommu *iommu,
2048                                       struct pasid_table *table,
2049                                       u8 bus, u8 devfn)
2050 {
2051         u16 did = domain->iommu_did[iommu->seq_id];
2052         int translation = CONTEXT_TT_MULTI_LEVEL;
2053         struct device_domain_info *info = NULL;
2054         struct context_entry *context;
2055         unsigned long flags;
2056         int ret;
2057
2058         WARN_ON(did == 0);
2059
2060         if (hw_pass_through && domain_type_is_si(domain))
2061                 translation = CONTEXT_TT_PASS_THROUGH;
2062
2063         pr_debug("Set context mapping for %02x:%02x.%d\n",
2064                 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
2065
2066         BUG_ON(!domain->pgd);
2067
2068         spin_lock_irqsave(&device_domain_lock, flags);
2069         spin_lock(&iommu->lock);
2070
2071         ret = -ENOMEM;
2072         context = iommu_context_addr(iommu, bus, devfn, 1);
2073         if (!context)
2074                 goto out_unlock;
2075
2076         ret = 0;
2077         if (context_present(context))
2078                 goto out_unlock;
2079
2080         /*
2081          * For kdump cases, old valid entries may be cached due to the
2082          * in-flight DMA and the copied pgtable, but there is no unmapping
2083          * behaviour for them, so we need an explicit cache flush for the
2084          * newly-mapped device. For kdump, at this point the device is
2085          * supposed to have finished its reset at driver probe time, so no
2086          * in-flight DMA will exist and we don't need to worry about it
2087          * hereafter.
2088          */
2089         if (context_copied(context)) {
2090                 u16 did_old = context_domain_id(context);
2091
2092                 if (did_old < cap_ndoms(iommu->cap)) {
2093                         iommu->flush.flush_context(iommu, did_old,
2094                                                    (((u16)bus) << 8) | devfn,
2095                                                    DMA_CCMD_MASK_NOBIT,
2096                                                    DMA_CCMD_DEVICE_INVL);
2097                         iommu->flush.flush_iotlb(iommu, did_old, 0, 0,
2098                                                  DMA_TLB_DSI_FLUSH);
2099                 }
2100         }
2101
2102         context_clear_entry(context);
2103
2104         if (sm_supported(iommu)) {
2105                 unsigned long pds;
2106
2107                 WARN_ON(!table);
2108
2109                 /* Set up the PASID directory pointer: */
2110                 pds = context_get_sm_pds(table);
2111                 context->lo = (u64)virt_to_phys(table->table) |
2112                                 context_pdts(pds);
2113
2114                 /* Set up the RID_PASID field: */
2115                 context_set_sm_rid2pasid(context, PASID_RID2PASID);
2116
2117                 /*
2118                  * Set up the Device-TLB Enable bit and the Page
2119                  * Request Enable bit:
2120                  */
2121                 info = iommu_support_dev_iotlb(domain, iommu, bus, devfn);
2122                 if (info && info->ats_supported)
2123                         context_set_sm_dte(context);
2124                 if (info && info->pri_supported)
2125                         context_set_sm_pre(context);
2126         } else {
2127                 struct dma_pte *pgd = domain->pgd;
2128                 int agaw;
2129
2130                 context_set_domain_id(context, did);
2131
2132                 if (translation != CONTEXT_TT_PASS_THROUGH) {
2133                         /*
2134                          * Skip the top levels of the page tables for an iommu
2135                          * with a smaller agaw than the default. Unnecessary for PT mode.
2136                          */
2137                         for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) {
2138                                 ret = -ENOMEM;
2139                                 pgd = phys_to_virt(dma_pte_addr(pgd));
2140                                 if (!dma_pte_present(pgd))
2141                                         goto out_unlock;
2142                         }
2143
2144                         info = iommu_support_dev_iotlb(domain, iommu, bus, devfn);
2145                         if (info && info->ats_supported)
2146                                 translation = CONTEXT_TT_DEV_IOTLB;
2147                         else
2148                                 translation = CONTEXT_TT_MULTI_LEVEL;
2149
2150                         context_set_address_root(context, virt_to_phys(pgd));
2151                         context_set_address_width(context, agaw);
2152                 } else {
2153                         /*
2154                          * In pass-through mode, AW must be programmed to
2155                          * indicate the largest AGAW value supported by the
2156                          * hardware, and ASR is ignored by the hardware.
2157                          */
2158                         context_set_address_width(context, iommu->msagaw);
2159                 }
2160
2161                 context_set_translation_type(context, translation);
2162         }
2163
2164         context_set_fault_enable(context);
2165         context_set_present(context);
2166         if (!ecap_coherent(iommu->ecap))
2167                 clflush_cache_range(context, sizeof(*context));
2168
2169         /*
2170          * It's a non-present to present mapping. If the hardware doesn't cache
2171          * non-present entries we only need to flush the write-buffer. If it
2172          * _does_ cache non-present entries, then it does so in the special
2173          * domain #0, which we have to flush:
2174          */
2175         if (cap_caching_mode(iommu->cap)) {
2176                 iommu->flush.flush_context(iommu, 0,
2177                                            (((u16)bus) << 8) | devfn,
2178                                            DMA_CCMD_MASK_NOBIT,
2179                                            DMA_CCMD_DEVICE_INVL);
2180                 iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH);
2181         } else {
2182                 iommu_flush_write_buffer(iommu);
2183         }
2184         iommu_enable_dev_iotlb(info);
2185
2186         ret = 0;
2187
2188 out_unlock:
2189         spin_unlock(&iommu->lock);
2190         spin_unlock_irqrestore(&device_domain_lock, flags);
2191
2192         return ret;
2193 }
2194
2195 struct domain_context_mapping_data {
2196         struct dmar_domain *domain;
2197         struct intel_iommu *iommu;
2198         struct pasid_table *table;
2199 };
2200
2201 static int domain_context_mapping_cb(struct pci_dev *pdev,
2202                                      u16 alias, void *opaque)
2203 {
2204         struct domain_context_mapping_data *data = opaque;
2205
2206         return domain_context_mapping_one(data->domain, data->iommu,
2207                                           data->table, PCI_BUS_NUM(alias),
2208                                           alias & 0xff);
2209 }
2210
2211 static int
2212 domain_context_mapping(struct dmar_domain *domain, struct device *dev)
2213 {
2214         struct domain_context_mapping_data data;
2215         struct pasid_table *table;
2216         struct intel_iommu *iommu;
2217         u8 bus, devfn;
2218
2219         iommu = device_to_iommu(dev, &bus, &devfn);
2220         if (!iommu)
2221                 return -ENODEV;
2222
2223         table = intel_pasid_get_table(dev);
2224
2225         if (!dev_is_pci(dev))
2226                 return domain_context_mapping_one(domain, iommu, table,
2227                                                   bus, devfn);
2228
2229         data.domain = domain;
2230         data.iommu = iommu;
2231         data.table = table;
2232
2233         return pci_for_each_dma_alias(to_pci_dev(dev),
2234                                       &domain_context_mapping_cb, &data);
2235 }
2236
2237 static int domain_context_mapped_cb(struct pci_dev *pdev,
2238                                     u16 alias, void *opaque)
2239 {
2240         struct intel_iommu *iommu = opaque;
2241
2242         return !device_context_mapped(iommu, PCI_BUS_NUM(alias), alias & 0xff);
2243 }
2244
2245 static int domain_context_mapped(struct device *dev)
2246 {
2247         struct intel_iommu *iommu;
2248         u8 bus, devfn;
2249
2250         iommu = device_to_iommu(dev, &bus, &devfn);
2251         if (!iommu)
2252                 return -ENODEV;
2253
2254         if (!dev_is_pci(dev))
2255                 return device_context_mapped(iommu, bus, devfn);
2256
2257         return !pci_for_each_dma_alias(to_pci_dev(dev),
2258                                        domain_context_mapped_cb, iommu);
2259 }
2260
2261 /* Returns the number of VT-d pages, but aligned to the MM page size */
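/*
 * Worked example (illustrative, assuming 4KiB pages for both the MM and
 * VT-d sides): host_addr = 0x1234 and size = 0x2000 leave an in-page offset
 * of 0x234, PAGE_ALIGN(0x2234) = 0x3000, so three VT-d pages are needed.
 */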
2262 static inline unsigned long aligned_nrpages(unsigned long host_addr,
2263                                             size_t size)
2264 {
2265         host_addr &= ~PAGE_MASK;
2266         return PAGE_ALIGN(host_addr + size) >> VTD_PAGE_SHIFT;
2267 }
2268
2269 /* Return the largest possible superpage level for a given mapping */
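/*
 * Worked example (illustrative, assuming VTD_STRIDE_SHIFT is 9): with
 * iommu_superpage = 2 (2MiB and 1GiB supported), iov_pfn and phy_pfn both
 * 512-page (2MiB) aligned but not 1GiB aligned, and pages = 1024, the loop
 * below runs once and returns level 2, so 2MiB superpages can be used.
 * With pages = 256 the first shift empties the page count and level 1
 * (4KiB) is returned instead.
 */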
2270 static inline int hardware_largepage_caps(struct dmar_domain *domain,
2271                                           unsigned long iov_pfn,
2272                                           unsigned long phy_pfn,
2273                                           unsigned long pages)
2274 {
2275         int support, level = 1;
2276         unsigned long pfnmerge;
2277
2278         support = domain->iommu_superpage;
2279
2280         /* To use a large page, the virtual *and* physical addresses
2281            must be aligned to 2MiB/1GiB/etc. Lower bits set in either
2282            of them will mean we have to use smaller pages. So just
2283            merge them and check both at once. */
2284         pfnmerge = iov_pfn | phy_pfn;
2285
2286         while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) {
2287                 pages >>= VTD_STRIDE_SHIFT;
2288                 if (!pages)
2289                         break;
2290                 pfnmerge >>= VTD_STRIDE_SHIFT;
2291                 level++;
2292                 support--;
2293         }
2294         return level;
2295 }
2296
2297 static int
2298 __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2299                  unsigned long phys_pfn, unsigned long nr_pages, int prot)
2300 {
2301         struct dma_pte *first_pte = NULL, *pte = NULL;
2302         unsigned int largepage_lvl = 0;
2303         unsigned long lvl_pages = 0;
2304         phys_addr_t pteval;
2305         u64 attr;
2306
2307         BUG_ON(!domain_pfn_supported(domain, iov_pfn + nr_pages - 1));
2308
2309         if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
2310                 return -EINVAL;
2311
2312         attr = prot & (DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP);
2313         if (domain_use_first_level(domain))
2314                 attr |= DMA_FL_PTE_PRESENT | DMA_FL_PTE_XD | DMA_FL_PTE_US;
2315
2316         pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | attr;
2317
2318         while (nr_pages > 0) {
2319                 uint64_t tmp;
2320
2321                 if (!pte) {
2322                         largepage_lvl = hardware_largepage_caps(domain, iov_pfn,
2323                                         phys_pfn, nr_pages);
2324
2325                         first_pte = pte = pfn_to_dma_pte(domain, iov_pfn, &largepage_lvl);
2326                         if (!pte)
2327                                 return -ENOMEM;
2328                         /* It is a large page */
2329                         if (largepage_lvl > 1) {
2330                                 unsigned long nr_superpages, end_pfn;
2331
2332                                 pteval |= DMA_PTE_LARGE_PAGE;
2333                                 lvl_pages = lvl_to_nr_pages(largepage_lvl);
2334
2335                                 nr_superpages = nr_pages / lvl_pages;
2336                                 end_pfn = iov_pfn + nr_superpages * lvl_pages - 1;
2337
2338                                 /*
2339                                  * Ensure that old small page tables are
2340                                  * removed to make room for superpage(s).
2341                                  * We're adding new large pages, so make sure
2342                                  * we don't remove their parent tables.
2343                                  */
2344                                 dma_pte_free_pagetable(domain, iov_pfn, end_pfn,
2345                                                        largepage_lvl + 1);
2346                         } else {
2347                                 pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE;
2348                         }
2349
2350                 }
2351                 /* We don't need a lock here; nobody else
2352                  * touches this IOVA range.
2353                  */
2354                 tmp = cmpxchg64_local(&pte->val, 0ULL, pteval);
2355                 if (tmp) {
2356                         static int dumps = 5;
2357                         pr_crit("ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n",
2358                                 iov_pfn, tmp, (unsigned long long)pteval);
2359                         if (dumps) {
2360                                 dumps--;
2361                                 debug_dma_dump_mappings(NULL);
2362                         }
2363                         WARN_ON(1);
2364                 }
2365
2366                 lvl_pages = lvl_to_nr_pages(largepage_lvl);
2367
2368                 BUG_ON(nr_pages < lvl_pages);
2369
2370                 nr_pages -= lvl_pages;
2371                 iov_pfn += lvl_pages;
2372                 phys_pfn += lvl_pages;
2373                 pteval += lvl_pages * VTD_PAGE_SIZE;
2374
2375                 /* If the next PTE would be the first in a new page, then we
2376                  * need to flush the cache on the entries we've just written.
2377                  * And then we'll need to recalculate 'pte', so clear it and
2378                  * let it get set again in the if (!pte) block above.
2379                  *
2380                  * If we're done (!nr_pages) we need to flush the cache too.
2381                  *
2382                  * Also if we've been setting superpages, we may need to
2383                  * recalculate 'pte' and switch back to smaller pages for the
2384                  * end of the mapping, if the trailing size is not enough to
2385                  * use another superpage (i.e. nr_pages < lvl_pages).
2386                  */
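                /*
                 * Worked example of the trailing-pages case (illustrative,
                 * assuming 2MiB-aligned iov_pfn/phys_pfn): mapping
                 * nr_pages = 515 first writes one 512-page superpage PTE;
                 * the remaining 3 pages are fewer than lvl_pages, so pte is
                 * reset here and the next iteration falls back to level 1,
                 * mapping three 4KiB PTEs.
                 */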
2387                 pte++;
2388                 if (!nr_pages || first_pte_in_page(pte) ||
2389                     (largepage_lvl > 1 && nr_pages < lvl_pages)) {
2390                         domain_flush_cache(domain, first_pte,
2391                                            (void *)pte - (void *)first_pte);
2392                         pte = NULL;
2393                 }
2394         }
2395
2396         return 0;
2397 }
2398
2399 static int
2400 domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2401                unsigned long phys_pfn, unsigned long nr_pages, int prot)
2402 {
2403         int iommu_id, ret;
2404         struct intel_iommu *iommu;
2405
2406         /* Do the real mapping first */
2407         ret = __domain_mapping(domain, iov_pfn, phys_pfn, nr_pages, prot);
2408         if (ret)
2409                 return ret;
2410
2411         for_each_domain_iommu(iommu_id, domain) {
2412                 iommu = g_iommus[iommu_id];
2413                 __mapping_notify_one(iommu, domain, iov_pfn, nr_pages);
2414         }
2415
2416         return 0;
2417 }
2418
2419 static void domain_context_clear_one(struct intel_iommu *iommu, u8 bus, u8 devfn)
2420 {
2421         unsigned long flags;
2422         struct context_entry *context;
2423         u16 did_old;
2424
2425         if (!iommu)
2426                 return;
2427
2428         spin_lock_irqsave(&iommu->lock, flags);
2429         context = iommu_context_addr(iommu, bus, devfn, 0);
2430         if (!context) {
2431                 spin_unlock_irqrestore(&iommu->lock, flags);
2432                 return;
2433         }
2434         did_old = context_domain_id(context);
2435         context_clear_entry(context);
2436         __iommu_flush_cache(iommu, context, sizeof(*context));
2437         spin_unlock_irqrestore(&iommu->lock, flags);
2438         iommu->flush.flush_context(iommu,
2439                                    did_old,
2440                                    (((u16)bus) << 8) | devfn,
2441                                    DMA_CCMD_MASK_NOBIT,
2442                                    DMA_CCMD_DEVICE_INVL);
2443         iommu->flush.flush_iotlb(iommu,
2444                                  did_old,
2445                                  0,
2446                                  0,
2447                                  DMA_TLB_DSI_FLUSH);
2448 }
2449
2450 static inline void unlink_domain_info(struct device_domain_info *info)
2451 {
2452         assert_spin_locked(&device_domain_lock);
2453         list_del(&info->link);
2454         list_del(&info->global);
2455         if (info->dev)
2456                 dev_iommu_priv_set(info->dev, NULL);
2457 }
2458
2459 static void domain_remove_dev_info(struct dmar_domain *domain)
2460 {
2461         struct device_domain_info *info, *tmp;
2462         unsigned long flags;
2463
2464         spin_lock_irqsave(&device_domain_lock, flags);
2465         list_for_each_entry_safe(info, tmp, &domain->devices, link)
2466                 __dmar_remove_one_dev_info(info);
2467         spin_unlock_irqrestore(&device_domain_lock, flags);
2468 }
2469
2470 struct dmar_domain *find_domain(struct device *dev)
2471 {
2472         struct device_domain_info *info;
2473
2474         if (unlikely(!dev || !dev->iommu))
2475                 return NULL;
2476
2477         if (unlikely(attach_deferred(dev)))
2478                 return NULL;
2479
2480         /* No lock here, assumes no domain exit in normal case */
2481         info = get_domain_info(dev);
2482         if (likely(info))
2483                 return info->domain;
2484
2485         return NULL;
2486 }
2487
2488 static inline struct device_domain_info *
2489 dmar_search_domain_by_dev_info(int segment, int bus, int devfn)
2490 {
2491         struct device_domain_info *info;
2492
2493         list_for_each_entry(info, &device_domain_list, global)
2494                 if (info->segment == segment && info->bus == bus &&
2495                     info->devfn == devfn)
2496                         return info;
2497
2498         return NULL;
2499 }
2500
2501 static int domain_setup_first_level(struct intel_iommu *iommu,
2502                                     struct dmar_domain *domain,
2503                                     struct device *dev,
2504                                     u32 pasid)
2505 {
2506         int flags = PASID_FLAG_SUPERVISOR_MODE;
2507         struct dma_pte *pgd = domain->pgd;
2508         int agaw, level;
2509
2510         /*
2511          * Skip the top levels of the page tables for an iommu
2512          * with a smaller agaw than the default. Unnecessary for PT mode.
2513          */
2514         for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) {
2515                 pgd = phys_to_virt(dma_pte_addr(pgd));
2516                 if (!dma_pte_present(pgd))
2517                         return -ENOMEM;
2518         }
2519
2520         level = agaw_to_level(agaw);
2521         if (level != 4 && level != 5)
2522                 return -EINVAL;
2523
2524         flags |= (level == 5) ? PASID_FLAG_FL5LP : 0;
2525
2526         return intel_pasid_setup_first_level(iommu, dev, (pgd_t *)pgd, pasid,
2527                                              domain->iommu_did[iommu->seq_id],
2528                                              flags);
2529 }
2530
2531 static bool dev_is_real_dma_subdevice(struct device *dev)
2532 {
2533         return dev && dev_is_pci(dev) &&
2534                pci_real_dma_dev(to_pci_dev(dev)) != to_pci_dev(dev);
2535 }
2536
2537 static struct dmar_domain *dmar_insert_one_dev_info(struct intel_iommu *iommu,
2538                                                     int bus, int devfn,
2539                                                     struct device *dev,
2540                                                     struct dmar_domain *domain)
2541 {
2542         struct dmar_domain *found = NULL;
2543         struct device_domain_info *info;
2544         unsigned long flags;
2545         int ret;
2546
2547         info = alloc_devinfo_mem();
2548         if (!info)
2549                 return NULL;
2550
2551         if (!dev_is_real_dma_subdevice(dev)) {
2552                 info->bus = bus;
2553                 info->devfn = devfn;
2554                 info->segment = iommu->segment;
2555         } else {
2556                 struct pci_dev *pdev = to_pci_dev(dev);
2557
2558                 info->bus = pdev->bus->number;
2559                 info->devfn = pdev->devfn;
2560                 info->segment = pci_domain_nr(pdev->bus);
2561         }
2562
2563         info->ats_supported = info->pasid_supported = info->pri_supported = 0;
2564         info->ats_enabled = info->pasid_enabled = info->pri_enabled = 0;
2565         info->ats_qdep = 0;
2566         info->dev = dev;
2567         info->domain = domain;
2568         info->iommu = iommu;
2569         info->pasid_table = NULL;
2570         info->auxd_enabled = 0;
2571         INIT_LIST_HEAD(&info->subdevices);
2572
2573         if (dev && dev_is_pci(dev)) {
2574                 struct pci_dev *pdev = to_pci_dev(info->dev);
2575
2576                 if (ecap_dev_iotlb_support(iommu->ecap) &&
2577                     pci_ats_supported(pdev) &&
2578                     dmar_find_matched_atsr_unit(pdev))
2579                         info->ats_supported = 1;
2580
2581                 if (sm_supported(iommu)) {
2582                         if (pasid_supported(iommu)) {
2583                                 int features = pci_pasid_features(pdev);
2584                                 if (features >= 0)
2585                                         info->pasid_supported = features | 1;
2586                         }
2587
2588                         if (info->ats_supported && ecap_prs(iommu->ecap) &&
2589                             pci_pri_supported(pdev))
2590                                 info->pri_supported = 1;
2591                 }
2592         }
2593
2594         spin_lock_irqsave(&device_domain_lock, flags);
2595         if (dev)
2596                 found = find_domain(dev);
2597
2598         if (!found) {
2599                 struct device_domain_info *info2;
2600                 info2 = dmar_search_domain_by_dev_info(info->segment, info->bus,
2601                                                        info->devfn);
2602                 if (info2) {
2603                         found      = info2->domain;
2604                         info2->dev = dev;
2605                 }
2606         }
2607
2608         if (found) {
2609                 spin_unlock_irqrestore(&device_domain_lock, flags);
2610                 free_devinfo_mem(info);
2611                 /* Caller must free the original domain */
2612                 return found;
2613         }
2614
2615         spin_lock(&iommu->lock);
2616         ret = domain_attach_iommu(domain, iommu);
2617         spin_unlock(&iommu->lock);
2618
2619         if (ret) {
2620                 spin_unlock_irqrestore(&device_domain_lock, flags);
2621                 free_devinfo_mem(info);
2622                 return NULL;
2623         }
2624
2625         list_add(&info->link, &domain->devices);
2626         list_add(&info->global, &device_domain_list);
2627         if (dev)
2628                 dev_iommu_priv_set(dev, info);
2629         spin_unlock_irqrestore(&device_domain_lock, flags);
2630
2631         /* PASID table is mandatory for a PCI device in scalable mode. */
2632         if (dev && dev_is_pci(dev) && sm_supported(iommu)) {
2633                 ret = intel_pasid_alloc_table(dev);
2634                 if (ret) {
2635                         dev_err(dev, "PASID table allocation failed\n");
2636                         dmar_remove_one_dev_info(dev);
2637                         return NULL;
2638                 }
2639
2640                 /* Set up the PASID entry for requests without PASID: */
2641                 spin_lock_irqsave(&iommu->lock, flags);
2642                 if (hw_pass_through && domain_type_is_si(domain))
2643                         ret = intel_pasid_setup_pass_through(iommu, domain,
2644                                         dev, PASID_RID2PASID);
2645                 else if (domain_use_first_level(domain))
2646                         ret = domain_setup_first_level(iommu, domain, dev,
2647                                         PASID_RID2PASID);
2648                 else
2649                         ret = intel_pasid_setup_second_level(iommu, domain,
2650                                         dev, PASID_RID2PASID);
2651                 spin_unlock_irqrestore(&iommu->lock, flags);
2652                 if (ret) {
2653                         dev_err(dev, "Setup RID2PASID failed\n");
2654                         dmar_remove_one_dev_info(dev);
2655                         return NULL;
2656                 }
2657         }
2658
2659         if (dev && domain_context_mapping(domain, dev)) {
2660                 dev_err(dev, "Domain context map failed\n");
2661                 dmar_remove_one_dev_info(dev);
2662                 return NULL;
2663         }
2664
2665         return domain;
2666 }
2667
2668 static int iommu_domain_identity_map(struct dmar_domain *domain,
2669                                      unsigned long first_vpfn,
2670                                      unsigned long last_vpfn)
2671 {
2672         /*
2673          * The RMRR range might overlap with a physical memory range,
2674          * so clear it first.
2675          */
2676         dma_pte_clear_range(domain, first_vpfn, last_vpfn);
2677
2678         return __domain_mapping(domain, first_vpfn,
2679                                 first_vpfn, last_vpfn - first_vpfn + 1,
2680                                 DMA_PTE_READ|DMA_PTE_WRITE);
2681 }
2682
2683 static int md_domain_init(struct dmar_domain *domain, int guest_width);
2684
2685 static int __init si_domain_init(int hw)
2686 {
2687         struct dmar_rmrr_unit *rmrr;
2688         struct device *dev;
2689         int i, nid, ret;
2690
2691         si_domain = alloc_domain(DOMAIN_FLAG_STATIC_IDENTITY);
2692         if (!si_domain)
2693                 return -EFAULT;
2694
2695         if (md_domain_init(si_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
2696                 domain_exit(si_domain);
2697                 return -EFAULT;
2698         }
2699
2700         if (hw)
2701                 return 0;
2702
2703         for_each_online_node(nid) {
2704                 unsigned long start_pfn, end_pfn;
2705                 int i;
2706
2707                 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
2708                         ret = iommu_domain_identity_map(si_domain,
2709                                         mm_to_dma_pfn(start_pfn),
2710                                         mm_to_dma_pfn(end_pfn));
2711                         if (ret)
2712                                 return ret;
2713                 }
2714         }
2715
2716         /*
2717          * Identity map the RMRRs so that devices with RMRRs can also use
2718          * the si_domain.
2719          */
2720         for_each_rmrr_units(rmrr) {
2721                 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
2722                                           i, dev) {
2723                         unsigned long long start = rmrr->base_address;
2724                         unsigned long long end = rmrr->end_address;
2725
2726                         if (WARN_ON(end < start ||
2727                                     end >> agaw_to_width(si_domain->agaw)))
2728                                 continue;
2729
2730                         ret = iommu_domain_identity_map(si_domain,
2731                                         mm_to_dma_pfn(start >> PAGE_SHIFT),
2732                                         mm_to_dma_pfn(end >> PAGE_SHIFT));
2733                         if (ret)
2734                                 return ret;
2735                 }
2736         }
2737
2738         return 0;
2739 }
2740
2741 static int domain_add_dev_info(struct dmar_domain *domain, struct device *dev)
2742 {
2743         struct dmar_domain *ndomain;
2744         struct intel_iommu *iommu;
2745         u8 bus, devfn;
2746
2747         iommu = device_to_iommu(dev, &bus, &devfn);
2748         if (!iommu)
2749                 return -ENODEV;
2750
2751         ndomain = dmar_insert_one_dev_info(iommu, bus, devfn, dev, domain);
2752         if (ndomain != domain)
2753                 return -EBUSY;
2754
2755         return 0;
2756 }
2757
2758 static bool device_has_rmrr(struct device *dev)
2759 {
2760         struct dmar_rmrr_unit *rmrr;
2761         struct device *tmp;
2762         int i;
2763
2764         rcu_read_lock();
2765         for_each_rmrr_units(rmrr) {
2766                 /*
2767                  * Return TRUE if this RMRR contains the device that
2768                  * is passed in.
2769                  */
2770                 for_each_active_dev_scope(rmrr->devices,
2771                                           rmrr->devices_cnt, i, tmp)
2772                         if (tmp == dev ||
2773                             is_downstream_to_pci_bridge(dev, tmp)) {
2774                                 rcu_read_unlock();
2775                                 return true;
2776                         }
2777         }
2778         rcu_read_unlock();
2779         return false;
2780 }
2781
2782 /**
2783  * device_rmrr_is_relaxable - Test whether the RMRR of this device
2784  * is relaxable (i.e. it is allowed to be left unenforced under some conditions)
2785  * @dev: device handle
2786  *
2787  * We assume that PCI USB devices with RMRRs have them largely
2788  * for historical reasons and that the RMRR space is not actively used post
2789  * boot.  This exclusion may change if vendors begin to abuse it.
2790  *
2791  * The same exception is made for graphics devices, with the requirement that
2792  * any use of the RMRR regions will be torn down before assigning the device
2793  * to a guest.
2794  *
2795  * Return: true if the RMRR is relaxable, false otherwise
2796  */
2797 static bool device_rmrr_is_relaxable(struct device *dev)
2798 {
2799         struct pci_dev *pdev;
2800
2801         if (!dev_is_pci(dev))
2802                 return false;
2803
2804         pdev = to_pci_dev(dev);
2805         if (IS_USB_DEVICE(pdev) || IS_GFX_DEVICE(pdev))
2806                 return true;
2807         else
2808                 return false;
2809 }
2810
2811 /*
2812  * There are a couple of cases where we need to restrict the functionality of
2813  * devices associated with RMRRs.  The first is when evaluating a device for
2814  * identity mapping because problems exist when devices are moved in and out
2815  * of domains and their respective RMRR information is lost.  This means that
2816  * a device with associated RMRRs will never be in a "passthrough" domain.
2817  * The second is use of the device through the IOMMU API.  This interface
2818  * expects to have full control of the IOVA space for the device.  We cannot
2819  * satisfy both the requirement that RMRR access is maintained and have an
2820  * unencumbered IOVA space.  We also have no ability to quiesce the device's
2821  * use of the RMRR space or even inform the IOMMU API user of the restriction.
2822  * We therefore prevent devices associated with an RMRR from participating in
2823  * the IOMMU API, which eliminates them from device assignment.
2824  *
2825  * In both cases, devices which have relaxable RMRRs are not concerned by this
2826  * restriction. See device_rmrr_is_relaxable comment.
2827  */
2828 static bool device_is_rmrr_locked(struct device *dev)
2829 {
2830         if (!device_has_rmrr(dev))
2831                 return false;
2832
2833         if (device_rmrr_is_relaxable(dev))
2834                 return false;
2835
2836         return true;
2837 }
2838
2839 /*
2840  * Return the required default domain type for a specific device.
2841  *
2842  * @dev: the device in question
2844  *
2845  * Returns:
2846  *  - IOMMU_DOMAIN_DMA: device requires a dynamic mapping domain
2847  *  - IOMMU_DOMAIN_IDENTITY: device requires an identity mapping domain
2848  *  - 0: both identity and dynamic domains work for this device
2849  */
2850 static int device_def_domain_type(struct device *dev)
2851 {
2852         if (dev_is_pci(dev)) {
2853                 struct pci_dev *pdev = to_pci_dev(dev);
2854
2855                 if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev))
2856                         return IOMMU_DOMAIN_IDENTITY;
2857
2858                 if ((iommu_identity_mapping & IDENTMAP_GFX) && IS_GFX_DEVICE(pdev))
2859                         return IOMMU_DOMAIN_IDENTITY;
2860         }
2861
2862         return 0;
2863 }
2864
2865 static void intel_iommu_init_qi(struct intel_iommu *iommu)
2866 {
2867         /*
2868          * Start from a sane iommu hardware state.
2869          * If queued invalidation has already been initialized by us
2870          * (for example, while enabling interrupt remapping), then
2871          * things are already rolling from a sane state.
2872          */
2873         if (!iommu->qi) {
2874                 /*
2875                  * Clear any previous faults.
2876                  */
2877                 dmar_fault(-1, iommu);
2878                 /*
2879                  * Disable queued invalidation if supported and already enabled
2880                  * before OS handover.
2881                  */
2882                 dmar_disable_qi(iommu);
2883         }
2884
2885         if (dmar_enable_qi(iommu)) {
2886                 /*
2887                  * Queued invalidation could not be enabled, use register-based invalidation
2888                  */
2889                 iommu->flush.flush_context = __iommu_flush_context;
2890                 iommu->flush.flush_iotlb = __iommu_flush_iotlb;
2891                 pr_info("%s: Using Register based invalidation\n",
2892                         iommu->name);
2893         } else {
2894                 iommu->flush.flush_context = qi_flush_context;
2895                 iommu->flush.flush_iotlb = qi_flush_iotlb;
2896                 pr_info("%s: Using Queued invalidation\n", iommu->name);
2897         }
2898 }
2899
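/*
 * Layout sketch for the copy below (descriptive; the doubled entry size is
 * taken from the extended-context format rather than from this function):
 * with ext == true each bus has two context tables, one reached via the
 * lower context-table pointer for devfn 0x00-0x7f and one via the upper
 * pointer for devfn 0x80-0xff, and extended entries are twice the size,
 * which is why tbl_idx doubles the bus number and idx doubles the devfn
 * modulo 256.
 */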
2900 static int copy_context_table(struct intel_iommu *iommu,
2901                               struct root_entry *old_re,
2902                               struct context_entry **tbl,
2903                               int bus, bool ext)
2904 {
2905         int tbl_idx, pos = 0, idx, devfn, ret = 0, did;
2906         struct context_entry *new_ce = NULL, ce;
2907         struct context_entry *old_ce = NULL;
2908         struct root_entry re;
2909         phys_addr_t old_ce_phys;
2910
2911         tbl_idx = ext ? bus * 2 : bus;
2912         memcpy(&re, old_re, sizeof(re));
2913
2914         for (devfn = 0; devfn < 256; devfn++) {
2915                 /* First calculate the correct index */
2916                 idx = (ext ? devfn * 2 : devfn) % 256;
2917
2918                 if (idx == 0) {
2919                         /* First save what we may have and clean up */
2920                         if (new_ce) {
2921                                 tbl[tbl_idx] = new_ce;
2922                                 __iommu_flush_cache(iommu, new_ce,
2923                                                     VTD_PAGE_SIZE);
2924                                 pos = 1;
2925                         }
2926
2927                         if (old_ce)
2928                                 memunmap(old_ce);
2929
2930                         ret = 0;
2931                         if (devfn < 0x80)
2932                                 old_ce_phys = root_entry_lctp(&re);
2933                         else
2934                                 old_ce_phys = root_entry_uctp(&re);
2935
2936                         if (!old_ce_phys) {
2937                                 if (ext && devfn == 0) {
2938                                         /* No LCTP, try UCTP */
2939                                         devfn = 0x7f;
2940                                         continue;
2941                                 } else {
2942                                         goto out;
2943                                 }
2944                         }
2945
2946                         ret = -ENOMEM;
2947                         old_ce = memremap(old_ce_phys, PAGE_SIZE,
2948                                         MEMREMAP_WB);
2949                         if (!old_ce)
2950                                 goto out;
2951
2952                         new_ce = alloc_pgtable_page(iommu->node);
2953                         if (!new_ce)
2954                                 goto out_unmap;
2955
2956                         ret = 0;
2957                 }
2958
2959                 /* Now copy the context entry */
2960                 memcpy(&ce, old_ce + idx, sizeof(ce));
2961
2962                 if (!__context_present(&ce))
2963                         continue;
2964
2965                 did = context_domain_id(&ce);
2966                 if (did >= 0 && did < cap_ndoms(iommu->cap))
2967                         set_bit(did, iommu->domain_ids);
2968
2969                 /*
2970                  * We need a marker for copied context entries. This
2971                  * marker needs to work for the old format as well as
2972                  * for extended context entries.
2973                  *
2974                  * Bit 67 of the context entry is used. In the old
2975                  * format this bit is available to software, in the
2976                  * extended format it is the PGE bit, but PGE is ignored
2977                  * by HW if PASIDs are disabled (and thus still
2978                  * available).
2979                  *
2980                  * So disable PASIDs first and then mark the entry
2981                  * copied. This means that we don't copy PASID
2982                  * translations from the old kernel, but this is fine as
2983                  * faults there are not fatal.
2984                  */
2985                 context_clear_pasid_enable(&ce);
2986                 context_set_copied(&ce);
2987
2988                 new_ce[idx] = ce;
2989         }
2990
2991         tbl[tbl_idx + pos] = new_ce;
2992
2993         __iommu_flush_cache(iommu, new_ce, VTD_PAGE_SIZE);
2994
2995 out_unmap:
2996         memunmap(old_ce);
2997
2998 out:
2999         return ret;
3000 }
3001
3002 static int copy_translation_tables(struct intel_iommu *iommu)
3003 {
3004         struct context_entry **ctxt_tbls;
3005         struct root_entry *old_rt;
3006         phys_addr_t old_rt_phys;
3007         int ctxt_table_entries;
3008         unsigned long flags;
3009         u64 rtaddr_reg;
3010         int bus, ret;
3011         bool new_ext, ext;
3012
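        /*
         * Read the root table address programmed by the previous kernel.
         * The RTT bit tells us whether it was using the extended
         * root/context-entry format.
         */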
3013         rtaddr_reg = dmar_readq(iommu->reg + DMAR_RTADDR_REG);
3014         ext        = !!(rtaddr_reg & DMA_RTADDR_RTT);
3015         new_ext    = !!ecap_ecs(iommu->ecap);
3016
3017         /*
3018          * The RTT bit can only be changed when translation is disabled,
3019          * but disabling translation would open a window for data
3020          * corruption. So bail out and don't copy anything if we would
3021          * have to change the bit.
3022          */
3023         if (new_ext != ext)
3024                 return -EINVAL;
3025
3026         old_rt_phys = rtaddr_reg & VTD_PAGE_MASK;
3027         if (!old_rt_phys)
3028                 return -EINVAL;
3029
3030         old_rt = memremap(old_rt_phys, PAGE_SIZE, MEMREMAP_WB);
3031         if (!old_rt)
3032                 return -ENOMEM;
3033
3034         /* This is too big for the stack - allocate it from slab */
3035         ctxt_table_entries = ext ? 512 : 256;
3036         ret = -ENOMEM;
3037         ctxt_tbls = kcalloc(ctxt_table_entries, sizeof(void *), GFP_KERNEL);
3038         if (!ctxt_tbls)
3039                 goto out_unmap;
3040
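        /*
         * Copy the context tables bus by bus. A failure for one bus is
         * only logged so that the tables for the remaining buses can
         * still be preserved.
         */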
3041         for (bus = 0; bus < 256; bus++) {
3042                 ret = copy_context_table(iommu, &old_rt[bus],
3043                                          ctxt_tbls, bus, ext);
3044                 if (ret) {
3045                         pr_err("%s: Failed to copy context table for bus %d\n",
3046                                 iommu->name, bus);
3047                         continue;
3048                 }
3049         }
3050
3051         spin_lock_irqsave(&iommu->lock, flags);
3052
3053         /* Context tables are copied, now write them to the root_entry table */
3054         for (bus = 0; bus < 256; bus++) {
3055                 int idx = ext ? bus * 2 : bus;
3056                 u64 val;
3057
3058                 if (ctxt_tbls[idx]) {
3059                         val = virt_to_phys(ctxt_tbls[idx]) | 1;
3060                         iommu->root_entry[bus].lo = val;
3061                 }
3062
3063                 if (!ext || !ctxt_tbls[idx + 1])
3064                         continue;
3065
3066                 val = virt_to_phys(ctxt_tbls[idx + 1]) | 1;
3067                 iommu->root_entry[bus].hi = val;
3068         }
3069
3070         spin_unlock_irqrestore(&iommu->lock, flags);
3071
3072         kfree(ctxt_tbls);
3073
3074         __iommu_flush_cache(iommu, iommu->root_entry, PAGE_SIZE);
3075
3076         ret = 0;
3077
3078 out_unmap:
3079         memunmap(old_rt);
3080
3081         return ret;
3082 }
3083
3084 #ifdef CONFIG_INTEL_IOMMU_SVM
3085 static ioasid_t intel_vcmd_ioasid_alloc(ioasid_t min, ioasid_t max, void *data)
3086 {
3087         struct intel_iommu *iommu = data;
3088         ioasid_t ioasid;
3089
3090         if (!iommu)
3091                 return INVALID_IOASID;
3092         /*
3093          * The VT-d virtual command interface always uses the full 20-bit
3094          * PASID range. The host can partition the guest PASID range based
3095          * on policies, but that is out of the guest's control.
3096          */
3097         if (min < PASID_MIN || max > intel_pasid_max_id)
3098                 return INVALID_IOASID;
3099
3100         if (vcmd_alloc_pasid(iommu, &ioasid))
3101                 return INVALID_IOASID;
3102
3103         return ioasid;
3104 }
3105
3106 static void intel_vcmd_ioasid_free(ioasid_t ioasid, void *data)
3107 {
3108         struct intel_iommu *iommu = data;
3109
3110         if (!iommu)
3111                 return;
3112         /*
3113          * The sanity check of the ioasid owner is done at an upper layer,
3114          * e.g. VFIO. We can only free the PASID when all devices are unbound.
3115          */
3116         if (ioasid_find(NULL, ioasid, NULL)) {
3117                 pr_alert("Cannot free active IOASID %d\n", ioasid);
3118                 return;
3119         }
3120         vcmd_free_pasid(iommu, ioasid);
3121 }
3122
3123 static void register_pasid_allocator(struct intel_iommu *iommu)
3124 {
3125         /*
3126          * If we are running in the host, there is no need for a custom
3127          * allocator: PASIDs are allocated from the host system-wide.
3128          */
3129         if (!cap_caching_mode(iommu->cap))
3130                 return;
3131
3132         if (!sm_supported(iommu)) {
3133                 pr_warn("VT-d Scalable Mode not enabled, no PASID allocation\n");
3134                 return;
3135         }
3136
3137         /*
3138          * Register a custom PASID allocator if we are running in a guest;
3139          * guest PASIDs must be obtained via the virtual command interface.
3140          * There can be multiple vIOMMUs in each guest but only one allocator
3141          * is active. All vIOMMU allocators eventually call the same host
3142          * allocator.
3143          */
3144         if (!vccap_pasid(iommu->vccap))
3145                 return;
3146
3147         pr_info("Register custom PASID allocator\n");
3148         iommu->pasid_allocator.alloc = intel_vcmd_ioasid_alloc;
3149         iommu->pasid_allocator.free = intel_vcmd_ioasid_free;
3150         iommu->pasid_allocator.pdata = (void *)iommu;
3151         if (ioasid_register_allocator(&iommu->pasid_allocator)) {
3152                 pr_warn("Custom PASID allocator failed, scalable mode disabled\n");
3153                 /*
3154                  * Disable scalable mode on this IOMMU if there
3155                  * is no custom allocator. Mixing SM-capable and
3156                  * non-SM vIOMMUs is not supported.
3157                  */
3158                 intel_iommu_sm = 0;
3159         }
3160 }
3161 #endif
3162
3163 static int __init init_dmars(void)
3164 {
3165         struct dmar_drhd_unit *drhd;
3166         struct intel_iommu *iommu;
3167         int ret;
3168
3169         /*
3170          * for each drhd
3171          *    allocate root
3172          *    initialize and program root entry to not present
3173          * endfor
3174          */
3175         for_each_drhd_unit(drhd) {
3176                 /*
3177                  * Lock not needed as this is only incremented in the
3178                  * single-threaded kernel __init code path; all other
3179                  * accesses are read-only.
3180                  */
3181                 if (g_num_of_iommus < DMAR_UNITS_SUPPORTED) {
3182                         g_num_of_iommus++;
3183                         continue;
3184                 }
3185                 pr_err_once("Exceeded %d IOMMUs\n", DMAR_UNITS_SUPPORTED);
3186         }
3187
3188         /* Preallocate enough resources for IOMMU hot-addition */
3189         if (g_num_of_iommus < DMAR_UNITS_SUPPORTED)
3190                 g_num_of_iommus = DMAR_UNITS_SUPPORTED;
3191
3192         g_iommus = kcalloc(g_num_of_iommus, sizeof(struct intel_iommu *),
3193                         GFP_KERNEL);
3194         if (!g_iommus) {
3195                 pr_err("Allocating global iommu array failed\n");
3196                 ret = -ENOMEM;
3197                 goto error;
3198         }
3199
3200         for_each_iommu(iommu, drhd) {
3201                 if (drhd->ignored) {
3202                         iommu_disable_translation(iommu);
3203                         continue;
3204                 }
3205
3206                 /*
3207                  * Find the max pasid size of all IOMMUs in the system.
3208                  * We need to ensure the system pasid table is no bigger
3209                  * than the smallest supported size.
3210                  */
3211                 if (pasid_supported(iommu)) {
3212                         u32 temp = 2 << ecap_pss(iommu->ecap);
3213
3214                         intel_pasid_max_id = min_t(u32, temp,
3215                                                    intel_pasid_max_id);
3216                 }
3217
3218                 g_iommus[iommu->seq_id] = iommu;
3219
3220                 intel_iommu_init_qi(iommu);
3221
3222                 ret = iommu_init_domains(iommu);
3223                 if (ret)
3224                         goto free_iommu;
3225
3226                 init_translation_status(iommu);
3227
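                /*
                 * If the previous environment left DMA remapping enabled but
                 * this is not a kdump kernel, there is nothing to preserve:
                 * disable translation and forget the pre-enabled state.
                 */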
3228                 if (translation_pre_enabled(iommu) && !is_kdump_kernel()) {
3229                         iommu_disable_translation(iommu);
3230                         clear_translation_pre_enabled(iommu);
3231                         pr_warn("Translation was enabled for %s but we are not in kdump mode\n",
3232                                 iommu->name);
3233                 }
3234
3235                 /*
3236                  * TBD:
3237                  * we could share the same root & context tables
3238                  * among all IOMMUs. Need to split this later.
3239                  */
3240                 ret = iommu_alloc_root_entry(iommu);
3241                 if (ret)
3242                         goto free_iommu;
3243
3244                 if (translation_pre_enabled(iommu)) {
3245                         pr_info("Translation already enabled - trying to copy translation structures\n");
3246
3247                         ret = copy_translation_tables(iommu);
3248                         if (ret) {
3249                                 /*
3250                                  * We found the IOMMU with translation
3251                                  * enabled - but failed to copy over the
3252                                  * old root-entry table. Try to proceed
3253                                  * by disabling translation now and
3254                                  * allocating a clean root-entry table.
3255                                  * This might cause DMAR faults, but
3256                                  * probably the dump will still succeed.
3257                                  */
3258                                 pr_err("Failed to copy translation tables from previous kernel for %s\n",
3259                                        iommu->name);
3260                                 iommu_disable_translation(iommu);
3261                                 clear_translation_pre_enabled(iommu);
3262                         } else {
3263                                 pr_info("Copied translation tables from previous kernel for %s\n",
3264                                         iommu->name);
3265                         }
3266                 }
3267
3268                 if (!ecap_pass_through(iommu->ecap))
3269                         hw_pass_through = 0;
3270                 intel_svm_check(iommu);
3271         }
3272
3273         /*
3274          * Now that qi is enabled on all iommus, set the root entry and flush
3275          * caches. This is required on some Intel X58 chipsets, otherwise the
3276          * flush_context function will loop forever and the boot hangs.
3277          */
3278         for_each_active_iommu(iommu, drhd) {
3279                 iommu_flush_write_buffer(iommu);
3280 #ifdef CONFIG_INTEL_IOMMU_SVM
3281                 register_pasid_allocator(iommu);
3282 #endif
3283                 iommu_set_root_entry(iommu);
3284                 iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
3285                 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
3286         }
3287
3288 #ifdef CONFIG_INTEL_IOMMU_BROKEN_GFX_WA
3289         dmar_map_gfx = 0;
3290 #endif
3291
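        /*
         * If graphics DMA remapping has been disabled (for instance by the
         * broken-gfx workaround above), hand graphics devices the identity
         * mapping.
         */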
3292         if (!dmar_map_gfx)
3293                 iommu_identity_mapping |= IDENTMAP_GFX;
3294
3295         check_tylersburg_isoch();
3296
3297         ret = si_domain_init(hw_pass_through);
3298         if (ret)
3299                 goto free_iommu;
3300
3301         /*
3302          * for each drhd
3303          *   enable fault log
3304          *   global invalidate context cache
3305          *   global invalidate iotlb
3306          *   enable translation
3307          */
3308         for_each_iommu(iommu, drhd) {
3309                 if (drhd->ignored) {
3310                         /*
3311                          * we always have to disable PMRs or DMA may fail on
3312                          * this device
3313                          */
3314                         if (force_on)
3315                                 iommu_disable_protect_mem_regions(iommu);
3316                         continue;
3317                 }
3318
3319                 iommu_flush_write_buffer(iommu);
3320
3321 #ifdef CONFIG_INTEL_IOMMU_SVM
3322                 if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
3323                         /*
3324                          * Calling dmar_alloc_hwirq() with dmar_global_lock held
3325                          * could cause a lock race, so release the lock around it.
3326                          */
3327                         up_write(&dmar_global_lock);
3328                         ret = intel_svm_enable_prq(iommu);
3329                         down_write(&dmar_global_lock);
3330                         if (ret)
3331                                 goto free_iommu;
3332                 }
3333 #endif
3334                 ret = dmar_set_interrupt(iommu);
3335                 if (ret)
3336                         goto free_iommu;
3337         }
3338
3339         return 0;
3340
3341 free_iommu:
3342         for_each_active_iommu(iommu, drhd) {
3343                 disable_dmar_iommu(iommu);
3344                 free_dmar_iommu(iommu);
3345         }
3346
3347         kfree(g_iommus);
3348
3349 error:
3350         return ret;
3351 }
3352
3353 static inline int iommu_domain_cache_init(void)
3354 {
3355         int ret = 0;
3356
3357         iommu_domain_cache = kmem_cache_create("iommu_domain",
3358                                          sizeof(struct dmar_domain),
3359                                          0,
3360                                          SLAB_HWCACHE_ALIGN,
3362                                          NULL);
3363         if (!iommu_domain_cache) {
3364                 pr_err("Couldn't create iommu_domain cache\n");
3365                 ret = -ENOMEM;
3366         }
3367
3368         return ret;
3369 }
3370
3371 static inline int iommu_devinfo_cache_init(void)
3372 {
3373         int ret = 0;
3374
3375         iommu_devinfo_cache = kmem_cache_create("iommu_devinfo",
3376                                          sizeof(struct device_domain_info),
3377                                          0,
3378                                          SLAB_HWCACHE_ALIGN,
3379                                          NULL);
3380         if (!iommu_devinfo_cache) {
3381                 pr_err("Couldn't create devinfo cache\n");
3382                 ret = -ENOMEM;
3383         }
3384
3385         return ret;
3386 }
3387
3388 static int __init iommu_init_mempool(void)
3389 {
3390         int ret;
3391         ret = iova_cache_get();
3392         if (ret)
3393                 return ret;
3394
3395         ret = iommu_domain_cache_init();
3396         if (ret)
3397                 goto domain_error;
3398
3399         ret = iommu_devinfo_cache_init();
3400         if (!ret)
3401                 return ret;
3402
3403         kmem_cache_destroy(iommu_domain_cache);
3404 domain_error:
3405         iova_cache_put();
3406
3407         return -ENOMEM;
3408 }
3409
3410 static void __init iommu_exit_mempool(void)
3411 {
3412         kmem_cache_destroy(iommu_devinfo_cache);
3413         kmem_cache_destroy(iommu_domain_cache);
3414         iova_cache_put();
3415 }
3416
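/*
 * Mark DRHD units whose device scope contains no active devices as ignored,
 * and flag units that serve only graphics devices so they can be bypassed
 * when graphics mapping is disabled.
 */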
3417 static void __init init_no_remapping_devices(void)
3418 {
3419         struct dmar_drhd_unit *drhd;
3420         struct device *dev;
3421         int i;
3422
3423         for_each_drhd_unit(drhd) {
3424                 if (!drhd->include_all) {
3425                         for_each_active_dev_scope(drhd->devices,
3426                                                   drhd->devices_cnt, i, dev)
3427                                 break;
3428                         /* ignore DMAR unit if no devices exist */
3429                         if (i == drhd->devices_cnt)
3430                                 drhd->ignored = 1;
3431                 }
3432         }
3433
3434         for_each_active_drhd_unit(drhd) {
3435                 if (drhd->include_all)
3436                         continue;
3437
3438                 for_each_active_dev_scope(drhd->devices,
3439                                           drhd->devices_cnt, i, dev)
3440                         if (!dev_is_pci(dev) || !IS_GFX_DEVICE(to_pci_dev(dev)))
3441                                 break;
3442                 if (i < drhd->devices_cnt)
3443                         continue;
3444
3445                 /* This IOMMU has *only* gfx devices. Either bypass it or
3446                    mark it as gfx-dedicated, as appropriate. */
3447                 drhd->gfx_dedicated = 1;
3448                 if (!dmar_map_gfx)
3449                         drhd->ignored = 1;
3450         }
3451 }
3452
3453 #ifdef CONFIG_SUSPEND
3454 static int init_iommu_hw(void)
3455 {
3456         struct dmar_drhd_unit *drhd;
3457         struct intel_iommu *iommu = NULL;
3458
3459         for_each_active_iommu(iommu, drhd)
3460                 if (iommu->qi)
3461                         dmar_reenable_qi(iommu);
3462
3463         for_each_iommu(iommu, drhd) {
3464                 if (drhd->ignored) {
3465                         /*
3466                          * we always have to disable PMRs or DMA may fail on
3467                          * this device
3468                          */
3469                         if (force_on)
3470                                 iommu_disable_protect_mem_regions(iommu);
3471                         continue;
3472                 }
3473
3474                 iommu_flush_write_buffer(iommu);
3475
3476                 iommu_set_root_entry(iommu);
3477
3478                 iommu->flush.flush_context(iommu, 0, 0, 0,
3479                                            DMA_CCMD_GLOBAL_INVL);
3480                 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
3481                 iommu_enable_translation(iommu);
3482                 iommu_disable_protect_mem_regions(iommu);
3483         }
3484
3485         return 0;
3486 }
3487
3488 static void iommu_flush_all(void)
3489 {
3490         struct dmar_drhd_unit *drhd;
3491         struct intel_iommu *iommu;
3492
3493         for_each_active_iommu(iommu, drhd) {
3494                 iommu->flush.flush_context(iommu, 0, 0, 0,
3495                                            DMA_CCMD_GLOBAL_INVL);
3496                 iommu->flush.flush_iotlb(iommu, 0, 0, 0,
3497                                          DMA_TLB_GLOBAL_FLUSH);
3498         }
3499 }
3500
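/*
 * Save each IOMMU's fault-event registers and disable translation so the
 * state can be restored by iommu_resume() after a system suspend.
 */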
3501 static int iommu_suspend(void)
3502 {
3503         struct dmar_drhd_unit *drhd;
3504         struct intel_iommu *iommu = NULL;
3505         unsigned long flag;
3506
3507         for_each_active_iommu(iommu, drhd) {
3508                 iommu->iommu_state = kcalloc(MAX_SR_DMAR_REGS, sizeof(u32),
3509                                              GFP_KERNEL);
3510                 if (!iommu->iommu_state)
3511                         goto nomem;
3512         }
3513
3514         iommu_flush_all();
3515
3516         for_each_active_iommu(iommu, drhd) {
3517                 iommu_disable_translation(iommu);
3518
3519                 raw_spin_lock_irqsave(&iommu->register_lock, flag);
3520
3521                 iommu->iommu_state[SR_DMAR_FECTL_REG] =
3522                         readl(iommu->reg + DMAR_FECTL_REG);
3523                 iommu->iommu_state[SR_DMAR_FEDATA_REG] =
3524                         readl(iommu->reg + DMAR_FEDATA_REG);
3525                 iommu->iommu_state[SR_DMAR_FEADDR_REG] =
3526                         readl(iommu->reg + DMAR_FEADDR_REG);
3527                 iommu->iommu_state[SR_DMAR_FEUADDR_REG] =
3528                         readl(iommu->reg + DMAR_FEUADDR_REG);
3529
3530                 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
3531         }
3532         return 0;
3533
3534 nomem:
3535         for_each_active_iommu(iommu, drhd)
3536                 kfree(iommu->iommu_state);
3537
3538         return -ENOMEM;
3539 }
3540
3541 static void iommu_resume(void)
3542 {
3543         struct dmar_drhd_unit *drhd;
3544         struct intel_iommu *iommu = NULL;
3545         unsigned long flag;
3546
3547         if (init_iommu_hw()) {
3548                 if (force_on)
3549                         panic("tboot: IOMMU setup failed, DMAR can not resume!\n");
3550                 else
3551                         WARN(1, "IOMMU setup failed, DMAR can not resume!\n");
3552                 return;
3553         }
3554
3555         for_each_active_iommu(iommu, drhd) {
3556
3557                 raw_spin_lock_irqsave(&iommu->register_lock, flag);
3558
3559                 writel(iommu->iommu_state[SR_DMAR_FECTL_REG],
3560                         iommu->reg + DMAR_FECTL_REG);
3561                 writel(iommu->iommu_state[SR_DMAR_FEDATA_REG],
3562                         iommu->reg + DMAR_FEDATA_REG);
3563                 writel(iommu->iommu_state[SR_DMAR_FEADDR_REG],
3564                         iommu->reg + DMAR_FEADDR_REG);
3565                 writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG],
3566                         iommu->reg + DMAR_FEUADDR_REG);
3567
3568                 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
3569         }
3570
3571         for_each_active_iommu(iommu, drhd)
3572                 kfree(iommu->iommu_state);
3573 }
3574
3575 static struct syscore_ops iommu_syscore_ops = {
3576         .resume         = iommu_resume,
3577         .suspend        = iommu_suspend,
3578 };
3579
3580 static void __init init_iommu_pm_ops(void)
3581 {
3582         register_syscore_ops(&iommu_syscore_ops);
3583 }
3584
3585 #else
3586 static inline void init_iommu_pm_ops(void) {}
3587 #endif  /* CONFIG_SUSPEND */
3588
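/*
 * An RMRR must describe a page-aligned region (base page aligned, end + 1
 * page aligned, end above base) and pass the architecture-specific check;
 * anything else is treated as firmware breakage.
 */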
3589 static int rmrr_sanity_check(struct acpi_dmar_reserved_memory *rmrr)
3590 {
3591         if (!IS_ALIGNED(rmrr->base_address, PAGE_SIZE) ||
3592             !IS_ALIGNED(rmrr->end_address + 1, PAGE_SIZE) ||
3593             rmrr->end_address <= rmrr->base_address ||
3594             arch_rmrr_sanity_check(rmrr))
3595                 return -EINVAL;
3596
3597         return 0;
3598 }
3599
3600 int __init dmar_parse_one_rmrr(struct acpi_dmar_header *header, void *arg)
3601 {
3602         struct acpi_dmar_reserved_memory *rmrr;
3603         struct dmar_rmrr_unit *rmrru;
3604
3605         rmrr = (struct acpi_dmar_reserved_memory *)header;
3606         if (rmrr_sanity_check(rmrr)) {
3607                 pr_warn(FW_BUG
3608                            "Your BIOS is broken; bad RMRR [%#018Lx-%#018Lx]\n"
3609                            "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
3610                            rmrr->base_address, rmrr->end_address,
3611                            dmi_get_system_info(DMI_BIOS_VENDOR),
3612                            dmi_get_system_info(DMI_BIOS_VERSION),
3613                            dmi_get_system_info(DMI_PRODUCT_VERSION));
3614                 add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
3615         }
3616
3617         rmrru = kzalloc(sizeof(*rmrru), GFP_KERNEL);
3618         if (!rmrru)
3619                 goto out;
3620
3621         rmrru->hdr = header;
3622
3623         rmrru->base_address = rmrr->base_address;
3624         rmrru->end_address = rmrr->end_address;
3625
3626         rmrru->devices = dmar_alloc_dev_scope((void *)(rmrr + 1),
3627                                 ((void *)rmrr) + rmrr->header.length,
3628                                 &rmrru->devices_cnt);
3629         if (rmrru->devices_cnt && rmrru->devices == NULL)
3630                 goto free_rmrru;
3631
3632         list_add(&rmrru->list, &dmar_rmrr_units);
3633
3634         return 0;
3635 free_rmrru:
3636         kfree(rmrru);
3637 out:
3638         return -ENOMEM;
3639 }
3640
3641 static struct dmar_atsr_unit *dmar_find_atsr(struct acpi_dmar_atsr *atsr)
3642 {
3643         struct dmar_atsr_unit *atsru;
3644         struct acpi_dmar_atsr *tmp;
3645
3646         list_for_each_entry_rcu(atsru, &dmar_atsr_units, list,
3647                                 dmar_rcu_check()) {
3648                 tmp = (struct acpi_dmar_atsr *)atsru->hdr;
3649                 if (atsr->segment != tmp->segment)
3650                         continue;
3651                 if (atsr->header.length != tmp->header.length)
3652                         continue;
3653                 if (memcmp(atsr, tmp, atsr->header.length) == 0)
3654                         return atsru;
3655         }
3656
3657         return NULL;
3658 }
3659
3660 int dmar_parse_one_atsr(struct acpi_dmar_header *hdr, void *arg)
3661 {
3662         struct acpi_dmar_atsr *atsr;
3663         struct dmar_atsr_unit *atsru;
3664
3665         if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled)
3666                 return 0;
3667
3668         atsr = container_of(hdr, struct acpi_dmar_atsr, header);
3669         atsru = dmar_find_atsr(atsr);
3670         if (atsru)
3671                 return 0;
3672
3673         atsru = kzalloc(sizeof(*atsru) + hdr->length, GFP_KERNEL);
3674         if (!atsru)
3675                 return -ENOMEM;
3676
3677         /*
3678          * If memory is allocated from slab by ACPI _DSM method, we need to
3679          * copy the memory content because the memory buffer will be freed
3680          * on return.
3681          */
3682         atsru->hdr = (void *)(atsru + 1);
3683         memcpy(atsru->hdr, hdr, hdr->length);
3684         atsru->include_all = atsr->flags & 0x1;
3685         if (!atsru->include_all) {
3686                 atsru->devices = dmar_alloc_dev_scope((void *)(atsr + 1),
3687                                 (void *)atsr + atsr->header.length,
3688                                 &atsru->devices_cnt);
3689                 if (atsru->devices_cnt && atsru->devices == NULL) {
3690                         kfree(atsru);
3691                         return -ENOMEM;
3692                 }
3693         }
3694
3695         list_add_rcu(&atsru->list, &dmar_atsr_units);
3696
3697         return 0;
3698 }
3699
3700 static void intel_iommu_free_atsr(struct dmar_atsr_unit *atsru)
3701 {
3702         dmar_free_dev_scope(&atsru->devices, &atsru->devices_cnt);
3703         kfree(atsru);
3704 }
3705
3706 int dmar_release_one_atsr(struct acpi_dmar_header *hdr, void *arg)
3707 {
3708         struct acpi_dmar_atsr *atsr;
3709         struct dmar_atsr_unit *atsru;
3710
3711         atsr = container_of(hdr, struct acpi_dmar_atsr, header);
3712         atsru = dmar_find_atsr(atsr);
3713         if (atsru) {
3714                 list_del_rcu(&atsru->list);
3715                 synchronize_rcu();
3716                 intel_iommu_free_atsr(atsru);
3717         }
3718
3719         return 0;
3720 }
3721
3722 int dmar_check_one_atsr(struct acpi_dmar_header *hdr, void *arg)
3723 {
3724         int i;
3725         struct device *dev;
3726         struct acpi_dmar_atsr *atsr;
3727         struct dmar_atsr_unit *atsru;
3728
3729         atsr = container_of(hdr, struct acpi_dmar_atsr, header);
3730         atsru = dmar_find_atsr(atsr);
3731         if (!atsru)
3732                 return 0;
3733
3734         if (!atsru->include_all && atsru->devices && atsru->devices_cnt) {
3735                 for_each_active_dev_scope(atsru->devices, atsru->devices_cnt,
3736                                           i, dev)
3737                         return -EBUSY;
3738         }
3739
3740         return 0;
3741 }
3742
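/*
 * Bring up a hot-added DMAR unit. It must offer the capabilities already
 * relied upon system-wide (pass-through, snooping, superpages), otherwise
 * the hot-add is refused with -ENXIO.
 */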
3743 static int intel_iommu_add(struct dmar_drhd_unit *dmaru)
3744 {
3745         int sp, ret;
3746         struct intel_iommu *iommu = dmaru->iommu;
3747
3748         if (g_iommus[iommu->seq_id])
3749                 return 0;
3750
3751         if (hw_pass_through && !ecap_pass_through(iommu->ecap)) {
3752                 pr_warn("%s: Doesn't support hardware pass through.\n",
3753                         iommu->name);
3754                 return -ENXIO;
3755         }
3756         if (!ecap_sc_support(iommu->ecap) &&
3757             domain_update_iommu_snooping(iommu)) {
3758                 pr_warn("%s: Doesn't support snooping.\n",
3759                         iommu->name);
3760                 return -ENXIO;
3761         }
3762         sp = domain_update_iommu_superpage(NULL, iommu) - 1;
3763         if (sp >= 0 && !(cap_super_page_val(iommu->cap) & (1 << sp))) {
3764                 pr_warn("%s: Doesn't support large page.\n",
3765                         iommu->name);
3766                 return -ENXIO;
3767         }
3768
3769         /*
3770          * Disable translation if already enabled prior to OS handover.
3771          */
3772         if (iommu->gcmd & DMA_GCMD_TE)
3773                 iommu_disable_translation(iommu);
3774
3775         g_iommus[iommu->seq_id] = iommu;
3776         ret = iommu_init_domains(iommu);
3777         if (ret == 0)
3778                 ret = iommu_alloc_root_entry(iommu);
3779         if (ret)
3780                 goto out;
3781
3782         intel_svm_check(iommu);
3783
3784         if (dmaru->ignored) {
3785                 /*
3786                  * we always have to disable PMRs or DMA may fail on this device
3787                  */
3788                 if (force_on)
3789                         iommu_disable_protect_mem_regions(iommu);
3790                 return 0;
3791         }
3792
3793         intel_iommu_init_qi(iommu);
3794         iommu_flush_write_buffer(iommu);
3795
3796 #ifdef CONFIG_INTEL_IOMMU_SVM
3797         if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
3798                 ret = intel_svm_enable_prq(iommu);
3799                 if (ret)
3800                         goto disable_iommu;
3801         }
3802 #endif
3803         ret = dmar_set_interrupt(iommu);
3804         if (ret)
3805                 goto disable_iommu;
3806
3807         iommu_set_root_entry(iommu);
3808         iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
3809         iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
3810         iommu_enable_translation(iommu);
3811
3812         iommu_disable_protect_mem_regions(iommu);
3813         return 0;
3814
3815 disable_iommu:
3816         disable_dmar_iommu(iommu);
3817 out:
3818         free_dmar_iommu(iommu);
3819         return ret;
3820 }
3821
3822 int dmar_iommu_hotplug(struct dmar_drhd_unit *dmaru, bool insert)
3823 {
3824         int ret = 0;
3825         struct intel_iommu *iommu = dmaru->iommu;
3826
3827         if (!intel_iommu_enabled)
3828                 return 0;
3829         if (iommu == NULL)
3830                 return -EINVAL;
3831
3832         if (insert) {
3833                 ret = intel_iommu_add(dmaru);
3834         } else {
3835                 disable_dmar_iommu(iommu);
3836                 free_dmar_iommu(iommu);
3837         }
3838
3839         return ret;
3840 }
3841
3842 static void intel_iommu_free_dmars(void)
3843 {
3844         struct dmar_rmrr_unit *rmrru, *rmrr_n;
3845         struct dmar_atsr_unit *atsru, *atsr_n;
3846
3847         list_for_each_entry_safe(rmrru, rmrr_n, &dmar_rmrr_units, list) {
3848                 list_del(&rmrru->list);
3849                 dmar_free_dev_scope(&rmrru->devices, &rmrru->devices_cnt);
3850                 kfree(rmrru);
3851         }
3852
3853         list_for_each_entry_safe(atsru, atsr_n, &dmar_atsr_units, list) {
3854                 list_del(&atsru->list);
3855                 intel_iommu_free_atsr(atsru);
3856         }
3857 }
3858
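/*
 * Walk from the device up to its root port and check whether any ATSR unit
 * on the same segment covers that root port (explicitly or via INCLUDE_ALL).
 * Integrated devices with no parent bridge are always allowed ATS.
 */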
3859 int dmar_find_matched_atsr_unit(struct pci_dev *dev)
3860 {
3861         int i, ret = 1;
3862         struct pci_bus *bus;
3863         struct pci_dev *bridge = NULL;
3864         struct device *tmp;
3865         struct acpi_dmar_atsr *atsr;
3866         struct dmar_atsr_unit *atsru;
3867
3868         dev = pci_physfn(dev);
3869         for (bus = dev->bus; bus; bus = bus->parent) {
3870                 bridge = bus->self;
3871                 /* If it's an integrated device, allow ATS */
3872                 if (!bridge)
3873                         return 1;
3874                 /* Connected via non-PCIe: no ATS */
3875                 if (!pci_is_pcie(bridge) ||
3876                     pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE)
3877                         return 0;
3878                 /* If we found the root port, look it up in the ATSR */
3879                 if (pci_pcie_type(bridge) == PCI_EXP_TYPE_ROOT_PORT)
3880                         break;
3881         }
3882
3883         rcu_read_lock();
3884         list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) {
3885                 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
3886                 if (atsr->segment != pci_domain_nr(dev->bus))
3887                         continue;
3888
3889                 for_each_dev_scope(atsru->devices, atsru->devices_cnt, i, tmp)
3890                         if (tmp == &bridge->dev)
3891                                 goto out;
3892
3893                 if (atsru->include_all)
3894                         goto out;
3895         }
3896         ret = 0;
3897 out:
3898         rcu_read_unlock();
3899
3900         return ret;
3901 }
3902
3903 int dmar_iommu_notify_scope_dev(struct dmar_pci_notify_info *info)
3904 {
3905         int ret;
3906         struct dmar_rmrr_unit *rmrru;
3907         struct dmar_atsr_unit *atsru;
3908         struct acpi_dmar_atsr *atsr;
3909         struct acpi_dmar_reserved_memory *rmrr;
3910
3911         if (!intel_iommu_enabled && system_state >= SYSTEM_RUNNING)
3912                 return 0;
3913
3914         list_for_each_entry(rmrru, &dmar_rmrr_units, list) {
3915                 rmrr = container_of(rmrru->hdr,
3916                                     struct acpi_dmar_reserved_memory, header);
3917                 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
3918                         ret = dmar_insert_dev_scope(info, (void *)(rmrr + 1),
3919                                 ((void *)rmrr) + rmrr->header.length,
3920                                 rmrr->segment, rmrru->devices,
3921                                 rmrru->devices_cnt);
3922                         if (ret < 0)
3923                                 return ret;
3924                 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
3925                         dmar_remove_dev_scope(info, rmrr->segment,
3926                                 rmrru->devices, rmrru->devices_cnt);
3927                 }
3928         }
3929
3930         list_for_each_entry(atsru, &dmar_atsr_units, list) {
3931                 if (atsru->include_all)
3932                         continue;
3933
3934                 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
3935                 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
3936                         ret = dmar_insert_dev_scope(info, (void *)(atsr + 1),
3937                                         (void *)atsr + atsr->header.length,
3938                                         atsr->segment, atsru->devices,
3939                                         atsru->devices_cnt);
3940                         if (ret > 0)
3941                                 break;
3942                         else if (ret < 0)
3943                                 return ret;
3944                 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
3945                         if (dmar_remove_dev_scope(info, atsr->segment,
3946                                         atsru->devices, atsru->devices_cnt))
3947                                 break;
3948                 }
3949         }
3950
3951         return 0;
3952 }
3953
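/*
 * Keep the static identity (si) domain in sync with memory hotplug: map
 * ranges that are coming online and unmap/flush ranges that go offline.
 */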
3954 static int intel_iommu_memory_notifier(struct notifier_block *nb,
3955                                        unsigned long val, void *v)
3956 {
3957         struct memory_notify *mhp = v;
3958         unsigned long start_vpfn = mm_to_dma_pfn(mhp->start_pfn);
3959         unsigned long last_vpfn = mm_to_dma_pfn(mhp->start_pfn +
3960                         mhp->nr_pages - 1);
3961
3962         switch (val) {
3963         case MEM_GOING_ONLINE:
3964                 if (iommu_domain_identity_map(si_domain,
3965                                               start_vpfn, last_vpfn)) {
3966                         pr_warn("Failed to build identity map for [%lx-%lx]\n",
3967                                 start_vpfn, last_vpfn);
3968                         return NOTIFY_BAD;
3969                 }
3970                 break;
3971
3972         case MEM_OFFLINE:
3973         case MEM_CANCEL_ONLINE:
3974                 {
3975                         struct dmar_drhd_unit *drhd;
3976                         struct intel_iommu *iommu;
3977                         struct page *freelist;
3978
3979                         freelist = domain_unmap(si_domain,
3980                                                 start_vpfn, last_vpfn,
3981                                                 NULL);
3982
3983                         rcu_read_lock();
3984                         for_each_active_iommu(iommu, drhd)
3985                                 iommu_flush_iotlb_psi(iommu, si_domain,
3986                                         start_vpfn, mhp->nr_pages,
3987                                         !freelist, 0);
3988                         rcu_read_unlock();
3989                         dma_free_pagelist(freelist);
3990                 }
3991                 break;
3992         }
3993
3994         return NOTIFY_OK;
3995 }
3996
3997 static struct notifier_block intel_iommu_memory_nb = {
3998         .notifier_call = intel_iommu_memory_notifier,
3999         .priority = 0
4000 };
4001
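/*
 * Release the per-CPU cached IOVAs of every DMA domain when a CPU dies so
 * that the addresses are returned to the global pool.
 */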
4002 static void free_all_cpu_cached_iovas(unsigned int cpu)
4003 {
4004         int i;
4005
4006         for (i = 0; i < g_num_of_iommus; i++) {
4007                 struct intel_iommu *iommu = g_iommus[i];
4008                 struct dmar_domain *domain;
4009                 int did;
4010
4011                 if (!iommu)
4012                         continue;
4013
4014                 for (did = 0; did < cap_ndoms(iommu->cap); did++) {
4015                         domain = get_iommu_domain(iommu, (u16)did);
4016
4017                         if (!domain || domain->domain.type != IOMMU_DOMAIN_DMA)
4018                                 continue;
4019
4020                         iommu_dma_free_cpu_cached_iovas(cpu, &domain->domain);
4021                 }
4022         }
4023 }
4024
4025 static int intel_iommu_cpu_dead(unsigned int cpu)
4026 {
4027         free_all_cpu_cached_iovas(cpu);
4028         return 0;
4029 }
4030
4031 static void intel_disable_iommus(void)
4032 {
4033         struct intel_iommu *iommu = NULL;
4034         struct dmar_drhd_unit *drhd;
4035
4036         for_each_iommu(iommu, drhd)
4037                 iommu_disable_translation(iommu);
4038 }
4039
4040 void intel_iommu_shutdown(void)
4041 {
4042         struct dmar_drhd_unit *drhd;
4043         struct intel_iommu *iommu = NULL;
4044
4045         if (no_iommu || dmar_disabled)
4046                 return;
4047
4048         down_write(&dmar_global_lock);
4049
4050         /* Disable PMRs explicitly here. */
4051         for_each_iommu(iommu, drhd)
4052                 iommu_disable_protect_mem_regions(iommu);
4053
4054         /* Make sure the IOMMUs are switched off */
4055         intel_disable_iommus();
4056
4057         up_write(&dmar_global_lock);
4058 }
4059
4060 static inline struct intel_iommu *dev_to_intel_iommu(struct device *dev)
4061 {
4062         struct iommu_device *iommu_dev = dev_to_iommu_device(dev);
4063
4064         return container_of(iommu_dev, struct intel_iommu, iommu);
4065 }
4066
4067 static ssize_t intel_iommu_show_version(struct device *dev,
4068                                         struct device_attribute *attr,
4069                                         char *buf)
4070 {
4071         struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4072         u32 ver = readl(iommu->reg + DMAR_VER_REG);
4073         return sprintf(buf, "%d:%d\n",
4074                        DMAR_VER_MAJOR(ver), DMAR_VER_MINOR(ver));
4075 }
4076 static DEVICE_ATTR(version, S_IRUGO, intel_iommu_show_version, NULL);
4077
4078 static ssize_t intel_iommu_show_address(struct device *dev,
4079                                         struct device_attribute *attr,
4080                                         char *buf)
4081 {
4082         struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4083         return sprintf(buf, "%llx\n", iommu->reg_phys);
4084 }
4085 static DEVICE_ATTR(address, S_IRUGO, intel_iommu_show_address, NULL);
4086
4087 static ssize_t intel_iommu_show_cap(struct device *dev,
4088                                     struct device_attribute *attr,
4089                                     char *buf)
4090 {
4091         struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4092         return sprintf(buf, "%llx\n", iommu->cap);
4093 }
4094 static DEVICE_ATTR(cap, S_IRUGO, intel_iommu_show_cap, NULL);
4095
4096 static ssize_t intel_iommu_show_ecap(struct device *dev,
4097                                     struct device_attribute *attr,
4098                                     char *buf)
4099 {
4100         struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4101         return sprintf(buf, "%llx\n", iommu->ecap);
4102 }
4103 static DEVICE_ATTR(ecap, S_IRUGO, intel_iommu_show_ecap, NULL);
4104
4105 static ssize_t intel_iommu_show_ndoms(struct device *dev,
4106                                       struct device_attribute *attr,
4107                                       char *buf)
4108 {
4109         struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4110         return sprintf(buf, "%ld\n", cap_ndoms(iommu->cap));
4111 }
4112 static DEVICE_ATTR(domains_supported, S_IRUGO, intel_iommu_show_ndoms, NULL);
4113
4114 static ssize_t intel_iommu_show_ndoms_used(struct device *dev,
4115                                            struct device_attribute *attr,
4116                                            char *buf)
4117 {
4118         struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4119         return sprintf(buf, "%d\n", bitmap_weight(iommu->domain_ids,
4120                                                   cap_ndoms(iommu->cap)));
4121 }
4122 static DEVICE_ATTR(domains_used, S_IRUGO, intel_iommu_show_ndoms_used, NULL);
4123
4124 static struct attribute *intel_iommu_attrs[] = {
4125         &dev_attr_version.attr,
4126         &dev_attr_address.attr,
4127         &dev_attr_cap.attr,
4128         &dev_attr_ecap.attr,
4129         &dev_attr_domains_supported.attr,
4130         &dev_attr_domains_used.attr,
4131         NULL,
4132 };
4133
4134 static struct attribute_group intel_iommu_group = {
4135         .name = "intel-iommu",
4136         .attrs = intel_iommu_attrs,
4137 };
4138
4139 const struct attribute_group *intel_iommu_groups[] = {
4140         &intel_iommu_group,
4141         NULL,
4142 };
4143
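/* Return true if any PCI device in the system is marked as external facing. */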
4144 static inline bool has_external_pci(void)
4145 {
4146         struct pci_dev *pdev = NULL;
4147
4148         for_each_pci_dev(pdev)
4149                 if (pdev->external_facing)
4150                         return true;
4151
4152         return false;
4153 }
4154
4155 static int __init platform_optin_force_iommu(void)
4156 {
4157         if (!dmar_platform_optin() || no_platform_optin || !has_external_pci())
4158                 return 0;
4159
4160         if (no_iommu || dmar_disabled)
4161                 pr_info("Intel-IOMMU force enabled due to platform opt in\n");
4162
4163         /*
4164          * If Intel-IOMMU is disabled by default, we will apply identity
4165          * map for all devices except those marked as being untrusted.
4166          */
4167         if (dmar_disabled)
4168                 iommu_set_default_passthrough(false);
4169
4170         dmar_disabled = 0;
4171         no_iommu = 0;
4172
4173         return 1;
4174 }
4175
4176 static int __init probe_acpi_namespace_devices(void)
4177 {
4178         struct dmar_drhd_unit *drhd;
4179         /* To avoid a -Wunused-but-set-variable warning. */
4180         struct intel_iommu *iommu __maybe_unused;
4181         struct device *dev;
4182         int i, ret = 0;
4183
4184         for_each_active_iommu(iommu, drhd) {
4185                 for_each_active_dev_scope(drhd->devices,
4186                                           drhd->devices_cnt, i, dev) {
4187                         struct acpi_device_physical_node *pn;
4188                         struct iommu_group *group;
4189                         struct acpi_device *adev;
4190
4191                         if (dev->bus != &acpi_bus_type)
4192                                 continue;
4193
4194                         adev = to_acpi_device(dev);
4195                         mutex_lock(&adev->physical_node_lock);
4196                         list_for_each_entry(pn,
4197                                             &adev->physical_node_list, node) {
4198                                 group = iommu_group_get(pn->dev);
4199                                 if (group) {
4200                                         iommu_group_put(group);
4201                                         continue;
4202                                 }
4203
4204                                 pn->dev->bus->iommu_ops = &intel_iommu_ops;
4205                                 ret = iommu_probe_device(pn->dev);
4206                                 if (ret)
4207                                         break;
4208                         }
4209                         mutex_unlock(&adev->physical_node_lock);
4210
4211                         if (ret)
4212                                 return ret;
4213                 }
4214         }
4215
4216         return 0;
4217 }
4218
4219 int __init intel_iommu_init(void)
4220 {
4221         int ret = -ENODEV;
4222         struct dmar_drhd_unit *drhd;
4223         struct intel_iommu *iommu;
4224
4225         /*
4226          * Intel IOMMU is required for a TXT/tboot launch or platform
4227          * opt in, so enforce that.
4228          */
4229         force_on = (!intel_iommu_tboot_noforce && tboot_force_iommu()) ||
4230                     platform_optin_force_iommu();
4231
4232         if (iommu_init_mempool()) {
4233                 if (force_on)
4234                         panic("tboot: Failed to initialize iommu memory\n");
4235                 return -ENOMEM;
4236         }
4237
4238         down_write(&dmar_global_lock);
4239         if (dmar_table_init()) {
4240                 if (force_on)
4241                         panic("tboot: Failed to initialize DMAR table\n");
4242                 goto out_free_dmar;
4243         }
4244
4245         if (dmar_dev_scope_init() < 0) {
4246                 if (force_on)
4247                         panic("tboot: Failed to initialize DMAR device scope\n");
4248                 goto out_free_dmar;
4249         }
4250
4251         up_write(&dmar_global_lock);
4252
4253         /*
4254          * The bus notifier takes the dmar_global_lock, so lockdep will
4255          * complain if we registered it while still holding the lock.
4256          */
4257         dmar_register_bus_notifier();
4258
4259         down_write(&dmar_global_lock);
4260
4261         if (!no_iommu)
4262                 intel_iommu_debugfs_init();
4263
4264         if (no_iommu || dmar_disabled) {
4265                 /*
4266                  * We exit the function here to ensure IOMMU's remapping and
4267                  * mempool aren't setup, which means that the IOMMU's PMRs
4268                  * won't be disabled via the call to init_dmars(). So disable
4269                  * it explicitly here. The PMRs were setup by tboot prior to
4270                  * calling SENTER, but the kernel is expected to reset/tear
4271                  * down the PMRs.
4272                  */
4273                 if (intel_iommu_tboot_noforce) {
4274                         for_each_iommu(iommu, drhd)
4275                                 iommu_disable_protect_mem_regions(iommu);
4276                 }
4277
4278                 /*
4279                  * Make sure the IOMMUs are switched off, even when we
4280                  * boot into a kexec kernel and the previous kernel left
4281                  * them enabled
4282                  */
4283                 intel_disable_iommus();
4284                 goto out_free_dmar;
4285         }
4286
4287         if (list_empty(&dmar_rmrr_units))
4288                 pr_info("No RMRR found\n");
4289
4290         if (list_empty(&dmar_atsr_units))
4291                 pr_info("No ATSR found\n");
4292
4293         if (dmar_map_gfx)
4294                 intel_iommu_gfx_mapped = 1;
4295
4296         init_no_remapping_devices();
4297
4298         ret = init_dmars();
4299         if (ret) {
4300                 if (force_on)
4301                         panic("tboot: Failed to initialize DMARs\n");
4302                 pr_err("Initialization failed\n");
4303                 goto out_free_dmar;
4304         }
4305         up_write(&dmar_global_lock);
4306
4307         init_iommu_pm_ops();
4308
4309         down_read(&dmar_global_lock);
4310         for_each_active_iommu(iommu, drhd) {
4311                 iommu_device_sysfs_add(&iommu->iommu, NULL,
4312                                        intel_iommu_groups,
4313                                        "%s", iommu->name);
4314                 iommu_device_set_ops(&iommu->iommu, &intel_iommu_ops);
4315                 iommu_device_register(&iommu->iommu);
4316         }
4317         up_read(&dmar_global_lock);
4318
4319         bus_set_iommu(&pci_bus_type, &intel_iommu_ops);
4320         if (si_domain && !hw_pass_through)
4321                 register_memory_notifier(&intel_iommu_memory_nb);
4322         cpuhp_setup_state(CPUHP_IOMMU_INTEL_DEAD, "iommu/intel:dead", NULL,
4323                           intel_iommu_cpu_dead);
4324
4325         down_read(&dmar_global_lock);
4326         if (probe_acpi_namespace_devices())
4327                 pr_warn("ACPI name space devices didn't probe correctly\n");
4328
4329         /* Finally, we enable the DMA remapping hardware. */
4330         for_each_iommu(iommu, drhd) {
4331                 if (!drhd->ignored && !translation_pre_enabled(iommu))
4332                         iommu_enable_translation(iommu);
4333
4334                 iommu_disable_protect_mem_regions(iommu);
4335         }
4336         up_read(&dmar_global_lock);
4337
4338         pr_info("Intel(R) Virtualization Technology for Directed I/O\n");
4339
4340         intel_iommu_enabled = 1;
4341
4342         return 0;
4343
4344 out_free_dmar:
4345         intel_iommu_free_dmars();
4346         up_write(&dmar_global_lock);
4347         iommu_exit_mempool();
4348         return ret;
4349 }
4350
4351 static int domain_context_clear_one_cb(struct pci_dev *pdev, u16 alias, void *opaque)
4352 {
4353         struct intel_iommu *iommu = opaque;
4354
4355         domain_context_clear_one(iommu, PCI_BUS_NUM(alias), alias & 0xff);
4356         return 0;
4357 }
4358
4359 /*
4360  * NB - intel-iommu lacks any sort of reference counting for the users of
4361  * dependent devices.  If multiple endpoints have intersecting dependent
4362  * devices, unbinding the driver from any one of them will possibly leave
4363  * the others unable to operate.
4364  */
4365 static void domain_context_clear(struct intel_iommu *iommu, struct device *dev)
4366 {
4367         if (!iommu || !dev || !dev_is_pci(dev))
4368                 return;
4369
4370         pci_for_each_dma_alias(to_pci_dev(dev), &domain_context_clear_one_cb, iommu);
4371 }
4372
4373 static void __dmar_remove_one_dev_info(struct device_domain_info *info)
4374 {
4375         struct dmar_domain *domain;
4376         struct intel_iommu *iommu;
4377         unsigned long flags;
4378
4379         assert_spin_locked(&device_domain_lock);
4380
4381         if (WARN_ON(!info))
4382                 return;
4383
4384         iommu = info->iommu;
4385         domain = info->domain;
4386
4387         if (info->dev) {
4388                 if (dev_is_pci(info->dev) && sm_supported(iommu))
4389                         intel_pasid_tear_down_entry(iommu, info->dev,
4390                                         PASID_RID2PASID, false);
4391
4392                 iommu_disable_dev_iotlb(info);
4393                 if (!dev_is_real_dma_subdevice(info->dev))
4394                         domain_context_clear(iommu, info->dev);
4395                 intel_pasid_free_table(info->dev);
4396         }
4397
4398         unlink_domain_info(info);
4399
4400         spin_lock_irqsave(&iommu->lock, flags);
4401         domain_detach_iommu(domain, iommu);
4402         spin_unlock_irqrestore(&iommu->lock, flags);
4403
4404         free_devinfo_mem(info);
4405 }
4406
4407 static void dmar_remove_one_dev_info(struct device *dev)
4408 {
4409         struct device_domain_info *info;
4410         unsigned long flags;
4411
4412         spin_lock_irqsave(&device_domain_lock, flags);
4413         info = get_domain_info(dev);
4414         if (info)
4415                 __dmar_remove_one_dev_info(info);
4416         spin_unlock_irqrestore(&device_domain_lock, flags);
4417 }
4418
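/*
 * Initialize a software-managed domain: compute the adjusted guest address
 * width and allocate an empty top-level page table.
 */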
4419 static int md_domain_init(struct dmar_domain *domain, int guest_width)
4420 {
4421         int adjust_width;
4422
4423         /* calculate AGAW */
4424         domain->gaw = guest_width;
4425         adjust_width = guestwidth_to_adjustwidth(guest_width);
4426         domain->agaw = width_to_agaw(adjust_width);
4427
4428         domain->iommu_coherency = 0;
4429         domain->iommu_snooping = 0;
4430         domain->iommu_superpage = 0;
4431         domain->max_addr = 0;
4432
4433         /* always allocate the top pgd */
4434         domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
4435         if (!domain->pgd)
4436                 return -ENOMEM;
4437         domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
4438         return 0;
4439 }
4440
4441 static struct iommu_domain *intel_iommu_domain_alloc(unsigned type)
4442 {
4443         struct dmar_domain *dmar_domain;
4444         struct iommu_domain *domain;
4445
4446         switch (type) {
4447         case IOMMU_DOMAIN_DMA:
4448         case IOMMU_DOMAIN_UNMANAGED:
4449                 dmar_domain = alloc_domain(0);
4450                 if (!dmar_domain) {
4451                         pr_err("Can't allocate dmar_domain\n");
4452                         return NULL;
4453                 }
4454                 if (md_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
4455                         pr_err("Domain initialization failed\n");
4456                         domain_exit(dmar_domain);
4457                         return NULL;
4458                 }
4459
4460                 if (type == IOMMU_DOMAIN_DMA &&
4461                     iommu_get_dma_cookie(&dmar_domain->domain))
4462                         return NULL;
4463
4464                 domain = &dmar_domain->domain;
4465                 domain->geometry.aperture_start = 0;
4466                 domain->geometry.aperture_end   =
4467                                 __DOMAIN_MAX_ADDR(dmar_domain->gaw);
4468                 domain->geometry.force_aperture = true;
4469
4470                 return domain;
4471         case IOMMU_DOMAIN_IDENTITY:
4472                 return &si_domain->domain;
4473         default:
4474                 return NULL;
4475         }
4476
4477         return NULL;
4478 }
4479
4480 static void intel_iommu_domain_free(struct iommu_domain *domain)
4481 {
4482         if (domain != &si_domain->domain)
4483                 domain_exit(to_dmar_domain(domain));
4484 }
4485
4486 /*
4487  * Check whether a @domain could be attached to the @dev through the
4488  * aux-domain attach/detach APIs.
4489  */
4490 static inline bool
4491 is_aux_domain(struct device *dev, struct iommu_domain *domain)
4492 {
4493         struct device_domain_info *info = get_domain_info(dev);
4494
4495         return info && info->auxd_enabled &&
4496                         domain->type == IOMMU_DOMAIN_UNMANAGED;
4497 }
4498
4499 static inline struct subdev_domain_info *
4500 lookup_subdev_info(struct dmar_domain *domain, struct device *dev)
4501 {
4502         struct subdev_domain_info *sinfo;
4503
4504         if (!list_empty(&domain->subdevices)) {
4505                 list_for_each_entry(sinfo, &domain->subdevices, link_domain) {
4506                         if (sinfo->pdev == dev)
4507                                 return sinfo;
4508                 }
4509         }
4510
4511         return NULL;
4512 }
4513
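/*
 * Link @dev as a subdevice of the aux @domain, allocating the
 * subdev_domain_info on first use. Returns the new reference count for
 * this (domain, device) link. Caller must hold device_domain_lock.
 */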
4514 static int auxiliary_link_device(struct dmar_domain *domain,
4515                                  struct device *dev)
4516 {
4517         struct device_domain_info *info = get_domain_info(dev);
4518         struct subdev_domain_info *sinfo = lookup_subdev_info(domain, dev);
4519
4520         assert_spin_locked(&device_domain_lock);
4521         if (WARN_ON(!info))
4522                 return -EINVAL;
4523
4524         if (!sinfo) {
4525                 sinfo = kzalloc(sizeof(*sinfo), GFP_ATOMIC);
                     if (!sinfo)
                             return -ENOMEM;
4526                 sinfo->domain = domain;
4527                 sinfo->pdev = dev;
4528                 list_add(&sinfo->link_phys, &info->subdevices);
4529                 list_add(&sinfo->link_domain, &domain->subdevices);
4530         }
4531
4532         return ++sinfo->users;
4533 }
4534
4535 static int auxiliary_unlink_device(struct dmar_domain *domain,
4536                                    struct device *dev)
4537 {
4538         struct device_domain_info *info = get_domain_info(dev);
4539         struct subdev_domain_info *sinfo = lookup_subdev_info(domain, dev);
4540         int ret;
4541
4542         assert_spin_locked(&device_domain_lock);
4543         if (WARN_ON(!info || !sinfo || sinfo->users <= 0))
4544                 return -EINVAL;
4545
4546         ret = --sinfo->users;
4547         if (!ret) {
4548                 list_del(&sinfo->link_phys);
4549                 list_del(&sinfo->link_domain);
4550                 kfree(sinfo);
4551         }
4552
4553         return ret;
4554 }
4555
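/*
 * Attach @dev to the aux @domain: allocate a default PASID for the
 * domain if needed, link the subdevice, and, for the first subdevice of
 * a physical device, attach the domain to the IOMMU and set up the
 * PASID entry for first or second level translation.
 */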
4556 static int aux_domain_add_dev(struct dmar_domain *domain,
4557                               struct device *dev)
4558 {
4559         int ret;
4560         unsigned long flags;
4561         struct intel_iommu *iommu;
4562
4563         iommu = device_to_iommu(dev, NULL, NULL);
4564         if (!iommu)
4565                 return -ENODEV;
4566
4567         if (domain->default_pasid <= 0) {
4568                 u32 pasid;
4569
4570                 /* No private data needed for the default pasid */
4571                 pasid = ioasid_alloc(NULL, PASID_MIN,
4572                                      pci_max_pasids(to_pci_dev(dev)) - 1,
4573                                      NULL);
4574                 if (pasid == INVALID_IOASID) {
4575                         pr_err("Can't allocate default pasid\n");
4576                         return -ENODEV;
4577                 }
4578                 domain->default_pasid = pasid;
4579         }
4580
4581         spin_lock_irqsave(&device_domain_lock, flags);
4582         ret = auxiliary_link_device(domain, dev);
4583         if (ret <= 0)
4584                 goto link_failed;
4585
4586         /*
4587          * Subdevices from the same physical device can be attached to the
4588          * same domain. For such cases, only the first subdevice attachment
4589          * needs to go through the full steps in this function. So if ret >
4590          * 1, just goto out.
4591          */
4592         if (ret > 1)
4593                 goto out;
4594
4595         /*
4596          * iommu->lock must be held to attach domain to iommu and setup the
4597          * pasid entry for second level translation.
4598          */
4599         spin_lock(&iommu->lock);
4600         ret = domain_attach_iommu(domain, iommu);
4601         if (ret)
4602                 goto attach_failed;
4603
4604         /* Set up the PASID entry for mediated devices. */
4605         if (domain_use_first_level(domain))
4606                 ret = domain_setup_first_level(iommu, domain, dev,
4607                                                domain->default_pasid);
4608         else
4609                 ret = intel_pasid_setup_second_level(iommu, domain, dev,
4610                                                      domain->default_pasid);
4611         if (ret)
4612                 goto table_failed;
4613
4614         spin_unlock(&iommu->lock);
4615 out:
4616         spin_unlock_irqrestore(&device_domain_lock, flags);
4617
4618         return 0;
4619
4620 table_failed:
4621         domain_detach_iommu(domain, iommu);
4622 attach_failed:
4623         spin_unlock(&iommu->lock);
4624         auxiliary_unlink_device(domain, dev);
4625 link_failed:
4626         spin_unlock_irqrestore(&device_domain_lock, flags);
4627         if (list_empty(&domain->subdevices) && domain->default_pasid > 0)
4628                 ioasid_put(domain->default_pasid);
4629
4630         return ret;
4631 }
4632
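/*
 * Detach @dev from the aux @domain. When the last subdevice of the
 * physical device goes away, tear down the PASID entry and detach the
 * domain from the IOMMU; the default PASID is released once the domain
 * has no subdevices left.
 */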
4633 static void aux_domain_remove_dev(struct dmar_domain *domain,
4634                                   struct device *dev)
4635 {
4636         struct device_domain_info *info;
4637         struct intel_iommu *iommu;
4638         unsigned long flags;
4639
4640         if (!is_aux_domain(dev, &domain->domain))
4641                 return;
4642
4643         spin_lock_irqsave(&device_domain_lock, flags);
4644         info = get_domain_info(dev);
4645         iommu = info->iommu;
4646
4647         if (!auxiliary_unlink_device(domain, dev)) {
4648                 spin_lock(&iommu->lock);
4649                 intel_pasid_tear_down_entry(iommu, dev,
4650                                             domain->default_pasid, false);
4651                 domain_detach_iommu(domain, iommu);
4652                 spin_unlock(&iommu->lock);
4653         }
4654
4655         spin_unlock_irqrestore(&device_domain_lock, flags);
4656
4657         if (list_empty(&domain->subdevices) && domain->default_pasid > 0)
4658                 ioasid_put(domain->default_pasid);
4659 }
4660
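/*
 * Make sure @domain can sit behind the IOMMU that serves @dev: verify
 * the IOMMU's address width covers the domain's mapped addresses, lower
 * the domain's gaw to the supported width, and drop page table levels
 * that the hardware AGAW cannot walk.
 */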
4661 static int prepare_domain_attach_device(struct iommu_domain *domain,
4662                                         struct device *dev)
4663 {
4664         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4665         struct intel_iommu *iommu;
4666         int addr_width;
4667
4668         iommu = device_to_iommu(dev, NULL, NULL);
4669         if (!iommu)
4670                 return -ENODEV;
4671
4672         /* check if this iommu agaw is sufficient for max mapped address */
4673         addr_width = agaw_to_width(iommu->agaw);
4674         if (addr_width > cap_mgaw(iommu->cap))
4675                 addr_width = cap_mgaw(iommu->cap);
4676
4677         if (dmar_domain->max_addr > (1LL << addr_width)) {
4678                 dev_err(dev,
4679                         "%s: iommu width (%d) is not sufficient for the mapped address (%llx)\n",
4680                         __func__, addr_width, dmar_domain->max_addr);
4681                 return -EFAULT;
4682         }
4683         dmar_domain->gaw = addr_width;
4684
4685         /*
4686          * Knock out extra levels of page tables if necessary
4687          */
4688         while (iommu->agaw < dmar_domain->agaw) {
4689                 struct dma_pte *pte;
4690
4691                 pte = dmar_domain->pgd;
4692                 if (dma_pte_present(pte)) {
4693                         dmar_domain->pgd = (struct dma_pte *)
4694                                 phys_to_virt(dma_pte_addr(pte));
4695                         free_pgtable_page(pte);
4696                 }
4697                 dmar_domain->agaw--;
4698         }
4699
4700         return 0;
4701 }
4702
4703 static int intel_iommu_attach_device(struct iommu_domain *domain,
4704                                      struct device *dev)
4705 {
4706         int ret;
4707
4708         if (domain->type == IOMMU_DOMAIN_UNMANAGED &&
4709             device_is_rmrr_locked(dev)) {
4710                 dev_warn(dev, "Device is ineligible for IOMMU domain attach due to platform RMRR requirement.  Contact your platform vendor.\n");
4711                 return -EPERM;
4712         }
4713
4714         if (is_aux_domain(dev, domain))
4715                 return -EPERM;
4716
4717         /* normally dev is not mapped */
4718         if (unlikely(domain_context_mapped(dev))) {
4719                 struct dmar_domain *old_domain;
4720
4721                 old_domain = find_domain(dev);
4722                 if (old_domain)
4723                         dmar_remove_one_dev_info(dev);
4724         }
4725
4726         ret = prepare_domain_attach_device(domain, dev);
4727         if (ret)
4728                 return ret;
4729
4730         return domain_add_dev_info(to_dmar_domain(domain), dev);
4731 }
4732
4733 static int intel_iommu_aux_attach_device(struct iommu_domain *domain,
4734                                          struct device *dev)
4735 {
4736         int ret;
4737
4738         if (!is_aux_domain(dev, domain))
4739                 return -EPERM;
4740
4741         ret = prepare_domain_attach_device(domain, dev);
4742         if (ret)
4743                 return ret;
4744
4745         return aux_domain_add_dev(to_dmar_domain(domain), dev);
4746 }
4747
4748 static void intel_iommu_detach_device(struct iommu_domain *domain,
4749                                       struct device *dev)
4750 {
4751         dmar_remove_one_dev_info(dev);
4752 }
4753
4754 static void intel_iommu_aux_detach_device(struct iommu_domain *domain,
4755                                           struct device *dev)
4756 {
4757         aux_domain_remove_dev(to_dmar_domain(domain), dev);
4758 }
4759
4760 #ifdef CONFIG_INTEL_IOMMU_SVM
4761 /*
4762  * 2D array for converting and sanitizing IOMMU generic TLB granularity to
4763  * VT-d granularity. Invalidation is typically included in the unmap operation
4764  * as a result of DMA or VFIO unmap. However, for assigned devices the guest
4765  * owns the first-level page tables. Invalidations of translation caches in the
4766  * guest are trapped and passed down to the host.
4767  *
4768  * The vIOMMU in the guest will only expose first-level page tables, therefore
4769  * we do not support IOTLB granularity for requests without PASID (second level).
4770  *
4771  * For example, to find the VT-d granularity encoding for IOTLB
4772  * type and page selective granularity within PASID:
4773  * X: indexed by iommu cache type
4774  * Y: indexed by enum iommu_inv_granularity
4775  * [IOMMU_CACHE_INV_TYPE_IOTLB][IOMMU_INV_GRANU_ADDR]
4776  */
4777
4778 static const int
4779 inv_type_granu_table[IOMMU_CACHE_INV_TYPE_NR][IOMMU_INV_GRANU_NR] = {
4780         /*
4781          * PASID based IOTLB invalidation: PASID selective (per PASID),
4782          * page selective (address granularity)
4783          */
4784         {-EINVAL, QI_GRAN_NONG_PASID, QI_GRAN_PSI_PASID},
4785         /* PASID based dev TLBs */
4786         {-EINVAL, -EINVAL, QI_DEV_IOTLB_GRAN_PASID_SEL},
4787         /* PASID cache */
4788         {-EINVAL, -EINVAL, -EINVAL}
4789 };
4790
4791 static inline int to_vtd_granularity(int type, int granu)
4792 {
4793         return inv_type_granu_table[type][granu];
4794 }
4795
4796 static inline u64 to_vtd_size(u64 granu_size, u64 nr_granules)
4797 {
4798         u64 nr_pages = (granu_size * nr_granules) >> VTD_PAGE_SHIFT;
4799
4800         /* VT-d encodes the size as 2^size 4K pages: 0 for 4K, 9 for 2MB, etc.
4801          * The IOMMU cache invalidate API passes granu_size in bytes and the
4802          * number of granules of that size in contiguous memory.
4803          */
4804         return order_base_2(nr_pages);
4805 }
4806
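/*
 * Handle a cache invalidation request passed down from a guest (nested
 * translation only): convert the generic granularity to the VT-d
 * encoding and issue PASID-based IOTLB and, when ATS is enabled,
 * device-TLB invalidations.
 */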
4807 static int
4808 intel_iommu_sva_invalidate(struct iommu_domain *domain, struct device *dev,
4809                            struct iommu_cache_invalidate_info *inv_info)
4810 {
4811         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4812         struct device_domain_info *info;
4813         struct intel_iommu *iommu;
4814         unsigned long flags;
4815         int cache_type;
4816         u8 bus, devfn;
4817         u16 did, sid;
4818         int ret = 0;
4819         u64 size = 0;
4820
4821         if (!inv_info || !dmar_domain)
4822                 return -EINVAL;
4823
4824         if (!dev || !dev_is_pci(dev))
4825                 return -ENODEV;
4826
4827         iommu = device_to_iommu(dev, &bus, &devfn);
4828         if (!iommu)
4829                 return -ENODEV;
4830
4831         if (!(dmar_domain->flags & DOMAIN_FLAG_NESTING_MODE))
4832                 return -EINVAL;
4833
4834         spin_lock_irqsave(&device_domain_lock, flags);
4835         spin_lock(&iommu->lock);
4836         info = get_domain_info(dev);
4837         if (!info) {
4838                 ret = -EINVAL;
4839                 goto out_unlock;
4840         }
4841         did = dmar_domain->iommu_did[iommu->seq_id];
4842         sid = PCI_DEVID(bus, devfn);
4843
4844         /* Size is only valid in address selective invalidation */
4845         if (inv_info->granularity == IOMMU_INV_GRANU_ADDR)
4846                 size = to_vtd_size(inv_info->granu.addr_info.granule_size,
4847                                    inv_info->granu.addr_info.nb_granules);
4848
4849         for_each_set_bit(cache_type,
4850                          (unsigned long *)&inv_info->cache,
4851                          IOMMU_CACHE_INV_TYPE_NR) {
4852                 int granu = 0;
4853                 u64 pasid = 0;
4854                 u64 addr = 0;
4855
4856                 granu = to_vtd_granularity(cache_type, inv_info->granularity);
4857                 if (granu == -EINVAL) {
4858                         pr_err_ratelimited("Invalid cache type and granu combination %d/%d\n",
4859                                            cache_type, inv_info->granularity);
4860                         break;
4861                 }
4862
4863                 /*
4864                  * PASID is stored in different locations based on the
4865                  * granularity.
4866                  */
4867                 if (inv_info->granularity == IOMMU_INV_GRANU_PASID &&
4868                     (inv_info->granu.pasid_info.flags & IOMMU_INV_PASID_FLAGS_PASID))
4869                         pasid = inv_info->granu.pasid_info.pasid;
4870                 else if (inv_info->granularity == IOMMU_INV_GRANU_ADDR &&
4871                          (inv_info->granu.addr_info.flags & IOMMU_INV_ADDR_FLAGS_PASID))
4872                         pasid = inv_info->granu.addr_info.pasid;
4873
4874                 switch (BIT(cache_type)) {
4875                 case IOMMU_CACHE_INV_TYPE_IOTLB:
4876                         /* HW will ignore the low-order bits covered by the address mask */
4877                         if (inv_info->granularity == IOMMU_INV_GRANU_ADDR &&
4878                             size &&
4879                             (inv_info->granu.addr_info.addr & ((BIT(VTD_PAGE_SHIFT + size)) - 1))) {
4880                                 pr_err_ratelimited("User address not aligned, 0x%llx, size order %llu\n",
4881                                                    inv_info->granu.addr_info.addr, size);
4882                         }
4883
4884                         /*
4885                          * If granu is PASID-selective, address is ignored.
4886                          * We use npages = -1 to indicate that.
4887                          */
4888                         qi_flush_piotlb(iommu, did, pasid,
4889                                         mm_to_dma_pfn(inv_info->granu.addr_info.addr),
4890                                         (granu == QI_GRAN_NONG_PASID) ? -1 : 1 << size,
4891                                         inv_info->granu.addr_info.flags & IOMMU_INV_ADDR_FLAGS_LEAF);
4892
4893                         if (!info->ats_enabled)
4894                                 break;
4895                         /*
4896                          * Always flush device IOTLB if ATS is enabled. vIOMMU
4897                          * in the guest may assume IOTLB flush is inclusive,
4898                          * which is more efficient.
4899                          */
4900                         fallthrough;
4901                 case IOMMU_CACHE_INV_TYPE_DEV_IOTLB:
4902                         /*
4903                          * PASID based device TLB invalidation does not support
4904                          * IOMMU_INV_GRANU_PASID granularity but only supports
4905                          * IOMMU_INV_GRANU_ADDR.
4906                          * We emulate that by setting the size to cover the
4907                          * entire 64-bit address range. The user only provides
4908                          * PASID info without address info, so we set addr to 0.
4909                          */
4910                         if (inv_info->granularity == IOMMU_INV_GRANU_PASID) {
4911                                 size = 64 - VTD_PAGE_SHIFT;
4912                                 addr = 0;
4913                         } else if (inv_info->granularity == IOMMU_INV_GRANU_ADDR) {
4914                                 addr = inv_info->granu.addr_info.addr;
4915                         }
4916
4917                         if (info->ats_enabled)
4918                                 qi_flush_dev_iotlb_pasid(iommu, sid,
4919                                                 info->pfsid, pasid,
4920                                                 info->ats_qdep, addr,
4921                                                 size);
4922                         else
4923                                 pr_warn_ratelimited("Passdown device IOTLB flush w/o ATS!\n");
4924                         break;
4925                 default:
4926                         dev_err_ratelimited(dev, "Unsupported IOMMU invalidation type %d\n",
4927                                             cache_type);
4928                         ret = -EINVAL;
4929                 }
4930         }
4931 out_unlock:
4932         spin_unlock(&iommu->lock);
4933         spin_unlock_irqrestore(&device_domain_lock, flags);
4934
4935         return ret;
4936 }
4937 #endif
4938
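/*
 * Map @size bytes at @iova to @hpa with the requested protection,
 * growing the domain's max_addr as needed and rejecting mappings that
 * exceed the domain's address width.
 */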
4939 static int intel_iommu_map(struct iommu_domain *domain,
4940                            unsigned long iova, phys_addr_t hpa,
4941                            size_t size, int iommu_prot, gfp_t gfp)
4942 {
4943         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4944         u64 max_addr;
4945         int prot = 0;
4946         int ret;
4947
4948         if (iommu_prot & IOMMU_READ)
4949                 prot |= DMA_PTE_READ;
4950         if (iommu_prot & IOMMU_WRITE)
4951                 prot |= DMA_PTE_WRITE;
4952         if ((iommu_prot & IOMMU_CACHE) && dmar_domain->iommu_snooping)
4953                 prot |= DMA_PTE_SNP;
4954
4955         max_addr = iova + size;
4956         if (dmar_domain->max_addr < max_addr) {
4957                 u64 end;
4958
4959                 /* check if minimum agaw is sufficient for mapped address */
4960                 end = __DOMAIN_MAX_ADDR(dmar_domain->gaw) + 1;
4961                 if (end < max_addr) {
4962                         pr_err("%s: iommu width (%d) is not sufficient for the mapped address (%llx)\n",
4963                                __func__, dmar_domain->gaw, max_addr);
4965                         return -EFAULT;
4966                 }
4967                 dmar_domain->max_addr = max_addr;
4968         }
4969         /* Round up size to next multiple of PAGE_SIZE, if it and
4970            the low bits of hpa would take us onto the next page */
4971         size = aligned_nrpages(hpa, size);
4972         ret = domain_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT,
4973                              hpa >> VTD_PAGE_SHIFT, size, prot);
4974         return ret;
4975 }
4976
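/*
 * Unmap the range starting at @iova. If the IOVA is covered by a
 * large-page mapping, the whole large page is unmapped; freed page
 * table pages are queued on the gather freelist for the IOTLB sync.
 */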
4977 static size_t intel_iommu_unmap(struct iommu_domain *domain,
4978                                 unsigned long iova, size_t size,
4979                                 struct iommu_iotlb_gather *gather)
4980 {
4981         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4982         unsigned long start_pfn, last_pfn;
4983         int level = 0;
4984
4985         /* Cope with horrid API which requires us to unmap more than the
4986            size argument if it happens to be a large-page mapping. */
4987         BUG_ON(!pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level));
4988
4989         if (size < VTD_PAGE_SIZE << level_to_offset_bits(level))
4990                 size = VTD_PAGE_SIZE << level_to_offset_bits(level);
4991
4992         start_pfn = iova >> VTD_PAGE_SHIFT;
4993         last_pfn = (iova + size - 1) >> VTD_PAGE_SHIFT;
4994
4995         gather->freelist = domain_unmap(dmar_domain, start_pfn,
4996                                         last_pfn, gather->freelist);
4997
4998         if (dmar_domain->max_addr == iova + size)
4999                 dmar_domain->max_addr = iova;
5000
5001         iommu_iotlb_gather_add_page(domain, gather, iova, size);
5002
5003         return size;
5004 }
5005
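/*
 * Flush the IOTLB for the range collected in @gather on every IOMMU
 * this domain is attached to, then free the page table pages gathered
 * during unmap.
 */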
5006 static void intel_iommu_tlb_sync(struct iommu_domain *domain,
5007                                  struct iommu_iotlb_gather *gather)
5008 {
5009         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5010         unsigned long iova_pfn = IOVA_PFN(gather->start);
5011         size_t size = gather->end - gather->start;
5012         unsigned long start_pfn;
5013         unsigned long nrpages;
5014         int iommu_id;
5015
5016         nrpages = aligned_nrpages(gather->start, size);
5017         start_pfn = mm_to_dma_pfn(iova_pfn);
5018
5019         for_each_domain_iommu(iommu_id, dmar_domain)
5020                 iommu_flush_iotlb_psi(g_iommus[iommu_id], dmar_domain,
5021                                       start_pfn, nrpages, !gather->freelist, 0);
5022
5023         dma_free_pagelist(gather->freelist);
5024 }
5025
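/*
 * Walk the domain's page table and return the physical address backing
 * @iova, or 0 if no translation is present.
 */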
5026 static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
5027                                             dma_addr_t iova)
5028 {
5029         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5030         struct dma_pte *pte;
5031         int level = 0;
5032         u64 phys = 0;
5033
5034         pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level);
5035         if (pte && dma_pte_present(pte))
5036                 phys = dma_pte_addr(pte) +
5037                         (iova & (BIT_MASK(level_to_offset_bits(level) +
5038                                                 VTD_PAGE_SHIFT) - 1));
5039
5040         return phys;
5041 }
5042
5043 static inline bool scalable_mode_support(void)
5044 {
5045         struct dmar_drhd_unit *drhd;
5046         struct intel_iommu *iommu;
5047         bool ret = true;
5048
5049         rcu_read_lock();
5050         for_each_active_iommu(iommu, drhd) {
5051                 if (!sm_supported(iommu)) {
5052                         ret = false;
5053                         break;
5054                 }
5055         }
5056         rcu_read_unlock();
5057
5058         return ret;
5059 }
5060
5061 static inline bool iommu_pasid_support(void)
5062 {
5063         struct dmar_drhd_unit *drhd;
5064         struct intel_iommu *iommu;
5065         bool ret = true;
5066
5067         rcu_read_lock();
5068         for_each_active_iommu(iommu, drhd) {
5069                 if (!pasid_supported(iommu)) {
5070                         ret = false;
5071                         break;
5072                 }
5073         }
5074         rcu_read_unlock();
5075
5076         return ret;
5077 }
5078
5079 static inline bool nested_mode_support(void)
5080 {
5081         struct dmar_drhd_unit *drhd;
5082         struct intel_iommu *iommu;
5083         bool ret = true;
5084
5085         rcu_read_lock();
5086         for_each_active_iommu(iommu, drhd) {
5087                 if (!sm_supported(iommu) || !ecap_nest(iommu->ecap)) {
5088                         ret = false;
5089                         break;
5090                 }
5091         }
5092         rcu_read_unlock();
5093
5094         return ret;
5095 }
5096
5097 static bool intel_iommu_capable(enum iommu_cap cap)
5098 {
5099         if (cap == IOMMU_CAP_CACHE_COHERENCY)
5100                 return domain_update_iommu_snooping(NULL) == 1;
5101         if (cap == IOMMU_CAP_INTR_REMAP)
5102                 return irq_remapping_enabled == 1;
5103
5104         return false;
5105 }
5106
5107 static struct iommu_device *intel_iommu_probe_device(struct device *dev)
5108 {
5109         struct intel_iommu *iommu;
5110
5111         iommu = device_to_iommu(dev, NULL, NULL);
5112         if (!iommu)
5113                 return ERR_PTR(-ENODEV);
5114
5115         if (translation_pre_enabled(iommu))
5116                 dev_iommu_priv_set(dev, DEFER_DEVICE_DOMAIN_INFO);
5117
5118         return &iommu->iommu;
5119 }
5120
5121 static void intel_iommu_release_device(struct device *dev)
5122 {
5123         struct intel_iommu *iommu;
5124
5125         iommu = device_to_iommu(dev, NULL, NULL);
5126         if (!iommu)
5127                 return;
5128
5129         dmar_remove_one_dev_info(dev);
5130
5131         set_dma_ops(dev, NULL);
5132 }
5133
5134 static void intel_iommu_probe_finalize(struct device *dev)
5135 {
5136         dma_addr_t base = IOVA_START_PFN << VTD_PAGE_SHIFT;
5137         struct iommu_domain *domain = iommu_get_domain_for_dev(dev);
5138         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5139
5140         if (domain && domain->type == IOMMU_DOMAIN_DMA)
5141                 iommu_setup_dma_ops(dev, base,
5142                                     __DOMAIN_MAX_ADDR(dmar_domain->gaw) - base);
5143         else
5144                 set_dma_ops(dev, NULL);
5145 }
5146
5147 static void intel_iommu_get_resv_regions(struct device *device,
5148                                          struct list_head *head)
5149 {
5150         int prot = DMA_PTE_READ | DMA_PTE_WRITE;
5151         struct iommu_resv_region *reg;
5152         struct dmar_rmrr_unit *rmrr;
5153         struct device *i_dev;
5154         int i;
5155
5156         down_read(&dmar_global_lock);
5157         for_each_rmrr_units(rmrr) {
5158                 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
5159                                           i, i_dev) {
5160                         struct iommu_resv_region *resv;
5161                         enum iommu_resv_type type;
5162                         size_t length;
5163
5164                         if (i_dev != device &&
5165                             !is_downstream_to_pci_bridge(device, i_dev))
5166                                 continue;
5167
5168                         length = rmrr->end_address - rmrr->base_address + 1;
5169
5170                         type = device_rmrr_is_relaxable(device) ?
5171                                 IOMMU_RESV_DIRECT_RELAXABLE : IOMMU_RESV_DIRECT;
5172
5173                         resv = iommu_alloc_resv_region(rmrr->base_address,
5174                                                        length, prot, type);
5175                         if (!resv)
5176                                 break;
5177
5178                         list_add_tail(&resv->list, head);
5179                 }
5180         }
5181         up_read(&dmar_global_lock);
5182
5183 #ifdef CONFIG_INTEL_IOMMU_FLOPPY_WA
5184         if (dev_is_pci(device)) {
5185                 struct pci_dev *pdev = to_pci_dev(device);
5186
5187                 if ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA) {
5188                         reg = iommu_alloc_resv_region(0, 1UL << 24, prot,
5189                                                    IOMMU_RESV_DIRECT_RELAXABLE);
5190                         if (reg)
5191                                 list_add_tail(&reg->list, head);
5192                 }
5193         }
5194 #endif /* CONFIG_INTEL_IOMMU_FLOPPY_WA */
5195
5196         reg = iommu_alloc_resv_region(IOAPIC_RANGE_START,
5197                                       IOAPIC_RANGE_END - IOAPIC_RANGE_START + 1,
5198                                       0, IOMMU_RESV_MSI);
5199         if (!reg)
5200                 return;
5201         list_add_tail(&reg->list, head);
5202 }
5203
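/*
 * Enable PASID support for @dev: set the PASID-enable bit in the
 * device's context entry (flushing the context cache if it changed) and
 * enable PASID and device-IOTLB support in the device itself.
 */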
5204 int intel_iommu_enable_pasid(struct intel_iommu *iommu, struct device *dev)
5205 {
5206         struct device_domain_info *info;
5207         struct context_entry *context;
5208         struct dmar_domain *domain;
5209         unsigned long flags;
5210         u64 ctx_lo;
5211         int ret;
5212
5213         domain = find_domain(dev);
5214         if (!domain)
5215                 return -EINVAL;
5216
5217         spin_lock_irqsave(&device_domain_lock, flags);
5218         spin_lock(&iommu->lock);
5219
5220         ret = -EINVAL;
5221         info = get_domain_info(dev);
5222         if (!info || !info->pasid_supported)
5223                 goto out;
5224
5225         context = iommu_context_addr(iommu, info->bus, info->devfn, 0);
5226         if (WARN_ON(!context))
5227                 goto out;
5228
5229         ctx_lo = context[0].lo;
5230
5231         if (!(ctx_lo & CONTEXT_PASIDE)) {
5232                 ctx_lo |= CONTEXT_PASIDE;
5233                 context[0].lo = ctx_lo;
5234                 wmb();
5235                 iommu->flush.flush_context(iommu,
5236                                            domain->iommu_did[iommu->seq_id],
5237                                            PCI_DEVID(info->bus, info->devfn),
5238                                            DMA_CCMD_MASK_NOBIT,
5239                                            DMA_CCMD_DEVICE_INVL);
5240         }
5241
5242         /* Enable PASID support in the device, if it wasn't already */
5243         if (!info->pasid_enabled)
5244                 iommu_enable_dev_iotlb(info);
5245
5246         ret = 0;
5247
5248  out:
5249         spin_unlock(&iommu->lock);
5250         spin_unlock_irqrestore(&device_domain_lock, flags);
5251
5252         return ret;
5253 }
5254
5255 static struct iommu_group *intel_iommu_device_group(struct device *dev)
5256 {
5257         if (dev_is_pci(dev))
5258                 return pci_device_group(dev);
5259         return generic_device_group(dev);
5260 }
5261
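/*
 * Enable aux-domain support for @dev: requires scalable mode and PASID
 * support on the IOMMU; enables PASID for the device and marks it as
 * auxd capable in its device_domain_info.
 */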
5262 static int intel_iommu_enable_auxd(struct device *dev)
5263 {
5264         struct device_domain_info *info;
5265         struct intel_iommu *iommu;
5266         unsigned long flags;
5267         int ret;
5268
5269         iommu = device_to_iommu(dev, NULL, NULL);
5270         if (!iommu || dmar_disabled)
5271                 return -EINVAL;
5272
5273         if (!sm_supported(iommu) || !pasid_supported(iommu))
5274                 return -EINVAL;
5275
5276         ret = intel_iommu_enable_pasid(iommu, dev);
5277         if (ret)
5278                 return -ENODEV;
5279
5280         spin_lock_irqsave(&device_domain_lock, flags);
5281         info = get_domain_info(dev);
5282         info->auxd_enabled = 1;
5283         spin_unlock_irqrestore(&device_domain_lock, flags);
5284
5285         return 0;
5286 }
5287
5288 static int intel_iommu_disable_auxd(struct device *dev)
5289 {
5290         struct device_domain_info *info;
5291         unsigned long flags;
5292
5293         spin_lock_irqsave(&device_domain_lock, flags);
5294         info = get_domain_info(dev);
5295         if (!WARN_ON(!info))
5296                 info->auxd_enabled = 0;
5297         spin_unlock_irqrestore(&device_domain_lock, flags);
5298
5299         return 0;
5300 }
5301
5302 /*
5303  * A PCI Express Designated Vendor-Specific Extended Capability (DVSEC) is
5304  * defined in section 3.7 of the Intel Scalable I/O Virtualization technical
5305  * spec so that system software and tools can detect endpoint devices that
5306  * support Intel Scalable I/O Virtualization without a host driver dependency.
5307  *
5308  * Returns the position of the matching extended capability structure within
5309  * the device's PCI configuration space, or 0 if the device does not support
5310  * it.
5311  */
5312 static int siov_find_pci_dvsec(struct pci_dev *pdev)
5313 {
5314         int pos;
5315         u16 vendor, id;
5316
5317         pos = pci_find_next_ext_capability(pdev, 0, 0x23);
5318         while (pos) {
5319                 pci_read_config_word(pdev, pos + 4, &vendor);
5320                 pci_read_config_word(pdev, pos + 8, &id);
5321                 if (vendor == PCI_VENDOR_ID_INTEL && id == 5)
5322                         return pos;
5323
5324                 pos = pci_find_next_ext_capability(pdev, pos, 0x23);
5325         }
5326
5327         return 0;
5328 }
5329
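/*
 * Report whether @dev can support the requested IOMMU feature: AUX
 * needs scalable mode, PASID support and the Scalable IOV DVSEC on the
 * device; SVA needs an SVM-capable IOMMU plus PASID, PRI and ATS.
 */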
5330 static bool
5331 intel_iommu_dev_has_feat(struct device *dev, enum iommu_dev_features feat)
5332 {
5333         if (feat == IOMMU_DEV_FEAT_AUX) {
5334                 int ret;
5335
5336                 if (!dev_is_pci(dev) || dmar_disabled ||
5337                     !scalable_mode_support() || !iommu_pasid_support())
5338                         return false;
5339
5340                 ret = pci_pasid_features(to_pci_dev(dev));
5341                 if (ret < 0)
5342                         return false;
5343
5344                 return !!siov_find_pci_dvsec(to_pci_dev(dev));
5345         }
5346
5347         if (feat == IOMMU_DEV_FEAT_SVA) {
5348                 struct device_domain_info *info = get_domain_info(dev);
5349
5350                 return info && (info->iommu->flags & VTD_FLAG_SVM_CAPABLE) &&
5351                         info->pasid_supported && info->pri_supported &&
5352                         info->ats_supported;
5353         }
5354
5355         return false;
5356 }
5357
5358 static int
5359 intel_iommu_dev_enable_feat(struct device *dev, enum iommu_dev_features feat)
5360 {
5361         if (feat == IOMMU_DEV_FEAT_AUX)
5362                 return intel_iommu_enable_auxd(dev);
5363
5364         if (feat == IOMMU_DEV_FEAT_SVA) {
5365                 struct device_domain_info *info = get_domain_info(dev);
5366
5367                 if (!info)
5368                         return -EINVAL;
5369
5370                 if (info->iommu->flags & VTD_FLAG_SVM_CAPABLE)
5371                         return 0;
5372         }
5373
5374         return -ENODEV;
5375 }
5376
5377 static int
5378 intel_iommu_dev_disable_feat(struct device *dev, enum iommu_dev_features feat)
5379 {
5380         if (feat == IOMMU_DEV_FEAT_AUX)
5381                 return intel_iommu_disable_auxd(dev);
5382
5383         return -ENODEV;
5384 }
5385
5386 static bool
5387 intel_iommu_dev_feat_enabled(struct device *dev, enum iommu_dev_features feat)
5388 {
5389         struct device_domain_info *info = get_domain_info(dev);
5390
5391         if (feat == IOMMU_DEV_FEAT_AUX)
5392                 return scalable_mode_support() && info && info->auxd_enabled;
5393
5394         return false;
5395 }
5396
5397 static int
5398 intel_iommu_aux_get_pasid(struct iommu_domain *domain, struct device *dev)
5399 {
5400         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5401
5402         return dmar_domain->default_pasid > 0 ?
5403                         dmar_domain->default_pasid : -EINVAL;
5404 }
5405
5406 static bool intel_iommu_is_attach_deferred(struct iommu_domain *domain,
5407                                            struct device *dev)
5408 {
5409         return attach_deferred(dev);
5410 }
5411
5412 static int
5413 intel_iommu_domain_set_attr(struct iommu_domain *domain,
5414                             enum iommu_attr attr, void *data)
5415 {
5416         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5417         unsigned long flags;
5418         int ret = 0;
5419
5420         if (domain->type != IOMMU_DOMAIN_UNMANAGED)
5421                 return -EINVAL;
5422
5423         switch (attr) {
5424         case DOMAIN_ATTR_NESTING:
5425                 spin_lock_irqsave(&device_domain_lock, flags);
5426                 if (nested_mode_support() &&
5427                     list_empty(&dmar_domain->devices)) {
5428                         dmar_domain->flags |= DOMAIN_FLAG_NESTING_MODE;
5429                         dmar_domain->flags &= ~DOMAIN_FLAG_USE_FIRST_LEVEL;
5430                 } else {
5431                         ret = -ENODEV;
5432                 }
5433                 spin_unlock_irqrestore(&device_domain_lock, flags);
5434                 break;
5435         default:
5436                 ret = -EINVAL;
5437                 break;
5438         }
5439
5440         return ret;
5441 }
5442
5443 static int
5444 intel_iommu_domain_get_attr(struct iommu_domain *domain,
5445                             enum iommu_attr attr, void *data)
5446 {
5447         switch (domain->type) {
5448         case IOMMU_DOMAIN_UNMANAGED:
5449                 return -ENODEV;
5450         case IOMMU_DOMAIN_DMA:
5451                 switch (attr) {
5452                 case DOMAIN_ATTR_DMA_USE_FLUSH_QUEUE:
5453                         *(int *)data = !intel_iommu_strict;
5454                         return 0;
5455                 default:
5456                         return -ENODEV;
5457                 }
5458                 break;
5459         default:
5460                 return -EINVAL;
5461         }
5462 }
5463
5464 /*
5465  * Check that the device does not live on an external facing PCI port that is
5466  * marked as untrusted. Such devices should not be able to apply quirks and
5467  * thus not be able to bypass the IOMMU restrictions.
5468  */
5469 static bool risky_device(struct pci_dev *pdev)
5470 {
5471         if (pdev->untrusted) {
5472                 pci_info(pdev,
5473                          "Skipping IOMMU quirk for dev [%04X:%04X] on untrusted PCI link\n",
5474                          pdev->vendor, pdev->device);
5475                 pci_info(pdev, "Please check with your BIOS/Platform vendor about this\n");
5476                 return true;
5477         }
5478         return false;
5479 }
5480
5481 const struct iommu_ops intel_iommu_ops = {
5482         .capable                = intel_iommu_capable,
5483         .domain_alloc           = intel_iommu_domain_alloc,
5484         .domain_free            = intel_iommu_domain_free,
5485         .domain_get_attr        = intel_iommu_domain_get_attr,
5486         .domain_set_attr        = intel_iommu_domain_set_attr,
5487         .attach_dev             = intel_iommu_attach_device,
5488         .detach_dev             = intel_iommu_detach_device,
5489         .aux_attach_dev         = intel_iommu_aux_attach_device,
5490         .aux_detach_dev         = intel_iommu_aux_detach_device,
5491         .aux_get_pasid          = intel_iommu_aux_get_pasid,
5492         .map                    = intel_iommu_map,
5493         .unmap                  = intel_iommu_unmap,
5494         .flush_iotlb_all        = intel_flush_iotlb_all,
5495         .iotlb_sync             = intel_iommu_tlb_sync,
5496         .iova_to_phys           = intel_iommu_iova_to_phys,
5497         .probe_device           = intel_iommu_probe_device,
5498         .probe_finalize         = intel_iommu_probe_finalize,
5499         .release_device         = intel_iommu_release_device,
5500         .get_resv_regions       = intel_iommu_get_resv_regions,
5501         .put_resv_regions       = generic_iommu_put_resv_regions,
5502         .device_group           = intel_iommu_device_group,
5503         .dev_has_feat           = intel_iommu_dev_has_feat,
5504         .dev_feat_enabled       = intel_iommu_dev_feat_enabled,
5505         .dev_enable_feat        = intel_iommu_dev_enable_feat,
5506         .dev_disable_feat       = intel_iommu_dev_disable_feat,
5507         .is_attach_deferred     = intel_iommu_is_attach_deferred,
5508         .def_domain_type        = device_def_domain_type,
5509         .pgsize_bitmap          = INTEL_IOMMU_PGSIZES,
5510 #ifdef CONFIG_INTEL_IOMMU_SVM
5511         .cache_invalidate       = intel_iommu_sva_invalidate,
5512         .sva_bind_gpasid        = intel_svm_bind_gpasid,
5513         .sva_unbind_gpasid      = intel_svm_unbind_gpasid,
5514         .sva_bind               = intel_svm_bind,
5515         .sva_unbind             = intel_svm_unbind,
5516         .sva_get_pasid          = intel_svm_get_pasid,
5517         .page_response          = intel_svm_page_response,
5518 #endif
5519 };
5520
5521 static void quirk_iommu_igfx(struct pci_dev *dev)
5522 {
5523         if (risky_device(dev))
5524                 return;
5525
5526         pci_info(dev, "Disabling IOMMU for graphics on this chipset\n");
5527         dmar_map_gfx = 0;
5528 }
5529
5530 /* G4x/GM45 integrated gfx dmar support is totally busted. */
5531 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_igfx);
5532 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_igfx);
5533 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_igfx);
5534 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_igfx);
5535 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_igfx);
5536 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_igfx);
5537 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_igfx);
5538
5539 /* Broadwell igfx malfunctions with dmar */
5540 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1606, quirk_iommu_igfx);
5541 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160B, quirk_iommu_igfx);
5542 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160E, quirk_iommu_igfx);
5543 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1602, quirk_iommu_igfx);
5544 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160A, quirk_iommu_igfx);
5545 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160D, quirk_iommu_igfx);
5546 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1616, quirk_iommu_igfx);
5547 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161B, quirk_iommu_igfx);
5548 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161E, quirk_iommu_igfx);
5549 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1612, quirk_iommu_igfx);
5550 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161A, quirk_iommu_igfx);
5551 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161D, quirk_iommu_igfx);
5552 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1626, quirk_iommu_igfx);
5553 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162B, quirk_iommu_igfx);
5554 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162E, quirk_iommu_igfx);
5555 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1622, quirk_iommu_igfx);
5556 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162A, quirk_iommu_igfx);
5557 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162D, quirk_iommu_igfx);
5558 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1636, quirk_iommu_igfx);
5559 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163B, quirk_iommu_igfx);
5560 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163E, quirk_iommu_igfx);
5561 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1632, quirk_iommu_igfx);
5562 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163A, quirk_iommu_igfx);
5563 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163D, quirk_iommu_igfx);
5564
5565 static void quirk_iommu_rwbf(struct pci_dev *dev)
5566 {
5567         if (risky_device(dev))
5568                 return;
5569
5570         /*
5571          * Mobile 4 Series Chipset neglects to set RWBF capability,
5572          * but needs it. Same seems to hold for the desktop versions.
5573          */
5574         pci_info(dev, "Forcing write-buffer flush capability\n");
5575         rwbf_quirk = 1;
5576 }
5577
5578 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf);
5579 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_rwbf);
5580 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_rwbf);
5581 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_rwbf);
5582 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_rwbf);
5583 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_rwbf);
5584 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_rwbf);
5585
5586 #define GGC 0x52
5587 #define GGC_MEMORY_SIZE_MASK    (0xf << 8)
5588 #define GGC_MEMORY_SIZE_NONE    (0x0 << 8)
5589 #define GGC_MEMORY_SIZE_1M      (0x1 << 8)
5590 #define GGC_MEMORY_SIZE_2M      (0x3 << 8)
5591 #define GGC_MEMORY_VT_ENABLED   (0x8 << 8)
5592 #define GGC_MEMORY_SIZE_2M_VT   (0x9 << 8)
5593 #define GGC_MEMORY_SIZE_3M_VT   (0xa << 8)
5594 #define GGC_MEMORY_SIZE_4M_VT   (0xb << 8)
5595
5596 static void quirk_calpella_no_shadow_gtt(struct pci_dev *dev)
5597 {
5598         unsigned short ggc;
5599
5600         if (risky_device(dev))
5601                 return;
5602
5603         if (pci_read_config_word(dev, GGC, &ggc))
5604                 return;
5605
5606         if (!(ggc & GGC_MEMORY_VT_ENABLED)) {
5607                 pci_info(dev, "BIOS has allocated no shadow GTT; disabling IOMMU for graphics\n");
5608                 dmar_map_gfx = 0;
5609         } else if (dmar_map_gfx) {
5610                 /* we have to ensure the gfx device is idle before we flush */
5611                 pci_info(dev, "Disabling batched IOTLB flush on Ironlake\n");
5612                 intel_iommu_strict = 1;
5613         }
5614 }
5615 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0040, quirk_calpella_no_shadow_gtt);
5616 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0044, quirk_calpella_no_shadow_gtt);
5617 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0062, quirk_calpella_no_shadow_gtt);
5618 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x006a, quirk_calpella_no_shadow_gtt);
5619
5620 static void quirk_igfx_skip_te_disable(struct pci_dev *dev)
5621 {
5622         unsigned short ver;
5623
5624         if (!IS_GFX_DEVICE(dev))
5625                 return;
5626
5627         ver = (dev->device >> 8) & 0xff;
5628         if (ver != 0x45 && ver != 0x46 && ver != 0x4c &&
5629             ver != 0x4e && ver != 0x8a && ver != 0x98 &&
5630             ver != 0x9a)
5631                 return;
5632
5633         if (risky_device(dev))
5634                 return;
5635
5636         pci_info(dev, "Skip IOMMU disabling for graphics\n");
5637         iommu_skip_te_disable = 1;
5638 }
5639 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_ANY_ID, quirk_igfx_skip_te_disable);
5640
5641 /* On Tylersburg chipsets, some BIOSes have been known to enable the
5642    ISOCH DMAR unit for the Azalia sound device, but not give it any
5643    TLB entries, which causes it to deadlock. Check for that.  We do
5644    this in a function called from init_dmars(), instead of in a PCI
5645    quirk, because we don't want to print the obnoxious "BIOS broken"
5646    message if VT-d is actually disabled.
5647 */
5648 static void __init check_tylersburg_isoch(void)
5649 {
5650         struct pci_dev *pdev;
5651         uint32_t vtisochctrl;
5652
5653         /* If there's no Azalia in the system anyway, forget it. */
5654         pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3a3e, NULL);
5655         if (!pdev)
5656                 return;
5657
5658         if (risky_device(pdev)) {
5659                 pci_dev_put(pdev);
5660                 return;
5661         }
5662
5663         pci_dev_put(pdev);
5664
5665         /* System Management Registers. Might be hidden, in which case
5666            we can't do the sanity check. But that's OK, because the
5667            known-broken BIOSes _don't_ actually hide it, so far. */
5668         pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x342e, NULL);
5669         if (!pdev)
5670                 return;
5671
5672         if (risky_device(pdev)) {
5673                 pci_dev_put(pdev);
5674                 return;
5675         }
5676
5677         if (pci_read_config_dword(pdev, 0x188, &vtisochctrl)) {
5678                 pci_dev_put(pdev);
5679                 return;
5680         }
5681
5682         pci_dev_put(pdev);
5683
5684         /* If Azalia DMA is routed to the non-isoch DMAR unit, fine. */
5685         if (vtisochctrl & 1)
5686                 return;
5687
5688         /* Drop all bits other than the number of TLB entries */
5689         vtisochctrl &= 0x1c;
5690
5691         /* If we have the recommended number of TLB entries (16), fine. */
5692         if (vtisochctrl == 0x10)
5693                 return;
5694
5695         /* Zero TLB entries? You get to ride the short bus to school. */
5696         if (!vtisochctrl) {
5697                 WARN(1, "Your BIOS is broken; DMA routed to ISOCH DMAR unit but no TLB space.\n"
5698                      "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
5699                      dmi_get_system_info(DMI_BIOS_VENDOR),
5700                      dmi_get_system_info(DMI_BIOS_VERSION),
5701                      dmi_get_system_info(DMI_PRODUCT_VERSION));
5702                 iommu_identity_mapping |= IDENTMAP_AZALIA;
5703                 return;
5704         }
5705
5706         pr_warn("Recommended TLB entries for ISOCH unit is 16; your BIOS set %d\n",
5707                vtisochctrl);
5708 }