drivers/iommu/intel/iommu.c [linux-2.6-block.git]
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * Copyright © 2006-2014 Intel Corporation.
4  *
5  * Authors: David Woodhouse <dwmw2@infradead.org>,
6  *          Ashok Raj <ashok.raj@intel.com>,
7  *          Shaohua Li <shaohua.li@intel.com>,
8  *          Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>,
9  *          Fenghua Yu <fenghua.yu@intel.com>
10  *          Joerg Roedel <jroedel@suse.de>
11  */
12
13 #define pr_fmt(fmt)     "DMAR: " fmt
14 #define dev_fmt(fmt)    pr_fmt(fmt)
15
16 #include <linux/crash_dump.h>
17 #include <linux/dma-direct.h>
18 #include <linux/dma-iommu.h>
19 #include <linux/dmi.h>
20 #include <linux/intel-iommu.h>
21 #include <linux/intel-svm.h>
22 #include <linux/memory.h>
23 #include <linux/pci.h>
24 #include <linux/pci-ats.h>
25 #include <linux/spinlock.h>
26 #include <linux/syscore_ops.h>
27 #include <linux/tboot.h>
28
29 #include "../irq_remapping.h"
30 #include "../iommu-sva-lib.h"
31 #include "pasid.h"
32 #include "cap_audit.h"
33
34 #define ROOT_SIZE               VTD_PAGE_SIZE
35 #define CONTEXT_SIZE            VTD_PAGE_SIZE
36
37 #define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
38 #define IS_USB_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_SERIAL_USB)
39 #define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
40 #define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e)
41
42 #define IOAPIC_RANGE_START      (0xfee00000)
43 #define IOAPIC_RANGE_END        (0xfeefffff)
44 #define IOVA_START_ADDR         (0x1000)
45
46 #define DEFAULT_DOMAIN_ADDRESS_WIDTH 57
47
48 #define MAX_AGAW_WIDTH 64
49 #define MAX_AGAW_PFN_WIDTH      (MAX_AGAW_WIDTH - VTD_PAGE_SHIFT)
50
51 #define __DOMAIN_MAX_PFN(gaw)  ((((uint64_t)1) << ((gaw) - VTD_PAGE_SHIFT)) - 1)
52 #define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << (gaw)) - 1)
53
54 /* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR
55    to match. That way, we can use 'unsigned long' for PFNs with impunity. */
56 #define DOMAIN_MAX_PFN(gaw)     ((unsigned long) min_t(uint64_t, \
57                                 __DOMAIN_MAX_PFN(gaw), (unsigned long)-1))
58 #define DOMAIN_MAX_ADDR(gaw)    (((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT)
59
60 /* IO virtual address start page frame number */
61 #define IOVA_START_PFN          (1)
62
63 #define IOVA_PFN(addr)          ((addr) >> PAGE_SHIFT)
64
65 /* page table handling */
66 #define LEVEL_STRIDE            (9)
67 #define LEVEL_MASK              (((u64)1 << LEVEL_STRIDE) - 1)
68
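/*
 * An AGAW value of n selects an (n + 2)-level page table covering
 * 30 + 9 * n address bits; e.g. agaw 1 = 3-level/39-bit,
 * agaw 2 = 4-level/48-bit, agaw 3 = 5-level/57-bit. Each level
 * translates LEVEL_STRIDE (9) bits of the DMA pfn, level 1 being
 * the leaf.
 */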
69 static inline int agaw_to_level(int agaw)
70 {
71         return agaw + 2;
72 }
73
74 static inline int agaw_to_width(int agaw)
75 {
76         return min_t(int, 30 + agaw * LEVEL_STRIDE, MAX_AGAW_WIDTH);
77 }
78
79 static inline int width_to_agaw(int width)
80 {
81         return DIV_ROUND_UP(width - 30, LEVEL_STRIDE);
82 }
83
84 static inline unsigned int level_to_offset_bits(int level)
85 {
86         return (level - 1) * LEVEL_STRIDE;
87 }
88
89 static inline int pfn_level_offset(u64 pfn, int level)
90 {
91         return (pfn >> level_to_offset_bits(level)) & LEVEL_MASK;
92 }
93
94 static inline u64 level_mask(int level)
95 {
96         return -1ULL << level_to_offset_bits(level);
97 }
98
99 static inline u64 level_size(int level)
100 {
101         return 1ULL << level_to_offset_bits(level);
102 }
103
104 static inline u64 align_to_level(u64 pfn, int level)
105 {
106         return (pfn + level_size(level) - 1) & level_mask(level);
107 }
108
109 static inline unsigned long lvl_to_nr_pages(unsigned int lvl)
110 {
111         return 1UL << min_t(int, (lvl - 1) * LEVEL_STRIDE, MAX_AGAW_PFN_WIDTH);
112 }
113
114 /* VT-d pages must always be _smaller_ than MM pages. Otherwise things
115    are never going to work. */
116 static inline unsigned long mm_to_dma_pfn(unsigned long mm_pfn)
117 {
118         return mm_pfn << (PAGE_SHIFT - VTD_PAGE_SHIFT);
119 }
120 static inline unsigned long page_to_dma_pfn(struct page *pg)
121 {
122         return mm_to_dma_pfn(page_to_pfn(pg));
123 }
124 static inline unsigned long virt_to_dma_pfn(void *p)
125 {
126         return page_to_dma_pfn(virt_to_page(p));
127 }
128
129 /* global iommu list, set NULL for ignored DMAR units */
130 static struct intel_iommu **g_iommus;
131
132 static void __init check_tylersburg_isoch(void);
133 static int rwbf_quirk;
134 static inline struct device_domain_info *
135 dmar_search_domain_by_dev_info(int segment, int bus, int devfn);
136
137 /*
138  * Set to 1 to panic the kernel if VT-d can't be successfully enabled
139  * (used when the kernel is launched with TXT).
140  */
141 static int force_on = 0;
142 static int intel_iommu_tboot_noforce;
143 static int no_platform_optin;
144
145 #define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
146
147 /*
148  * Take a root_entry and return the Lower Context Table Pointer (LCTP)
149  * if marked present.
150  */
151 static phys_addr_t root_entry_lctp(struct root_entry *re)
152 {
153         if (!(re->lo & 1))
154                 return 0;
155
156         return re->lo & VTD_PAGE_MASK;
157 }
158
159 /*
160  * Take a root_entry and return the Upper Context Table Pointer (UCTP)
161  * if marked present.
162  */
163 static phys_addr_t root_entry_uctp(struct root_entry *re)
164 {
165         if (!(re->hi & 1))
166                 return 0;
167
168         return re->hi & VTD_PAGE_MASK;
169 }
170
171 static inline void context_clear_pasid_enable(struct context_entry *context)
172 {
173         context->lo &= ~(1ULL << 11);
174 }
175
176 static inline bool context_pasid_enabled(struct context_entry *context)
177 {
178         return !!(context->lo & (1ULL << 11));
179 }
180
181 static inline void context_set_copied(struct context_entry *context)
182 {
183         context->hi |= (1ull << 3);
184 }
185
186 static inline bool context_copied(struct context_entry *context)
187 {
188         return !!(context->hi & (1ULL << 3));
189 }
190
191 static inline bool __context_present(struct context_entry *context)
192 {
193         return (context->lo & 1);
194 }
195
196 bool context_present(struct context_entry *context)
197 {
198         return context_pasid_enabled(context) ?
199              __context_present(context) :
200              __context_present(context) && !context_copied(context);
201 }
202
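/*
 * The helpers below manipulate legacy-mode context entries: the low
 * qword carries the present bit, the fault-processing-disable bit,
 * the translation type and the second-level page-table pointer; the
 * high qword carries the address width (AW) and the domain id.
 */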
203 static inline void context_set_present(struct context_entry *context)
204 {
205         context->lo |= 1;
206 }
207
208 static inline void context_set_fault_enable(struct context_entry *context)
209 {
210         context->lo &= (((u64)-1) << 2) | 1;
211 }
212
213 static inline void context_set_translation_type(struct context_entry *context,
214                                                 unsigned long value)
215 {
216         context->lo &= (((u64)-1) << 4) | 3;
217         context->lo |= (value & 3) << 2;
218 }
219
220 static inline void context_set_address_root(struct context_entry *context,
221                                             unsigned long value)
222 {
223         context->lo &= ~VTD_PAGE_MASK;
224         context->lo |= value & VTD_PAGE_MASK;
225 }
226
227 static inline void context_set_address_width(struct context_entry *context,
228                                              unsigned long value)
229 {
230         context->hi |= value & 7;
231 }
232
233 static inline void context_set_domain_id(struct context_entry *context,
234                                          unsigned long value)
235 {
236         context->hi |= (value & ((1 << 16) - 1)) << 8;
237 }
238
239 static inline int context_domain_id(struct context_entry *c)
240 {
241         return((c->hi >> 8) & 0xffff);
242 }
243
244 static inline void context_clear_entry(struct context_entry *context)
245 {
246         context->lo = 0;
247         context->hi = 0;
248 }
249
250 /*
251  * This domain is a static identity-mapping domain.
252  *      1. This domain creates a static 1:1 mapping to all usable memory.
253  *      2. It maps to each iommu if successful.
254  *      3. Each iommu maps to this domain if successful.
255  */
256 static struct dmar_domain *si_domain;
257 static int hw_pass_through = 1;
258
259 #define for_each_domain_iommu(idx, domain)                      \
260         for (idx = 0; idx < g_num_of_iommus; idx++)             \
261                 if (domain->iommu_refcnt[idx])
262
263 struct dmar_rmrr_unit {
264         struct list_head list;          /* list of rmrr units   */
265         struct acpi_dmar_header *hdr;   /* ACPI header          */
266         u64     base_address;           /* reserved base address*/
267         u64     end_address;            /* reserved end address */
268         struct dmar_dev_scope *devices; /* target devices */
269         int     devices_cnt;            /* target device count */
270 };
271
272 struct dmar_atsr_unit {
273         struct list_head list;          /* list of ATSR units */
274         struct acpi_dmar_header *hdr;   /* ACPI header */
275         struct dmar_dev_scope *devices; /* target devices */
276         int devices_cnt;                /* target device count */
277         u8 include_all:1;               /* include all ports */
278 };
279
280 struct dmar_satc_unit {
281         struct list_head list;          /* list of SATC units */
282         struct acpi_dmar_header *hdr;   /* ACPI header */
283         struct dmar_dev_scope *devices; /* target devices */
284         struct intel_iommu *iommu;      /* the corresponding iommu */
285         int devices_cnt;                /* target device count */
286         u8 atc_required:1;              /* ATS is required */
287 };
288
289 static LIST_HEAD(dmar_atsr_units);
290 static LIST_HEAD(dmar_rmrr_units);
291 static LIST_HEAD(dmar_satc_units);
292
293 #define for_each_rmrr_units(rmrr) \
294         list_for_each_entry(rmrr, &dmar_rmrr_units, list)
295
296 /* number of IOMMUs, used to size and index g_iommus */
297 static int g_num_of_iommus;
298
299 static void domain_remove_dev_info(struct dmar_domain *domain);
300 static void dmar_remove_one_dev_info(struct device *dev);
301 static void __dmar_remove_one_dev_info(struct device_domain_info *info);
302
303 int dmar_disabled = !IS_ENABLED(CONFIG_INTEL_IOMMU_DEFAULT_ON);
304 int intel_iommu_sm = IS_ENABLED(CONFIG_INTEL_IOMMU_SCALABLE_MODE_DEFAULT_ON);
305
306 int intel_iommu_enabled = 0;
307 EXPORT_SYMBOL_GPL(intel_iommu_enabled);
308
309 static int dmar_map_gfx = 1;
310 static int intel_iommu_superpage = 1;
311 static int iommu_identity_mapping;
312 static int iommu_skip_te_disable;
313
314 #define IDENTMAP_GFX            2
315 #define IDENTMAP_AZALIA         4
316
317 int intel_iommu_gfx_mapped;
318 EXPORT_SYMBOL_GPL(intel_iommu_gfx_mapped);
319
320 DEFINE_SPINLOCK(device_domain_lock);
321 static LIST_HEAD(device_domain_list);
322
323 /*
324  * Iterate over elements in device_domain_list and call the specified
325  * callback @fn against each element.
326  */
327 int for_each_device_domain(int (*fn)(struct device_domain_info *info,
328                                      void *data), void *data)
329 {
330         int ret = 0;
331         unsigned long flags;
332         struct device_domain_info *info;
333
334         spin_lock_irqsave(&device_domain_lock, flags);
335         list_for_each_entry(info, &device_domain_list, global) {
336                 ret = fn(info, data);
337                 if (ret) {
338                         spin_unlock_irqrestore(&device_domain_lock, flags);
339                         return ret;
340                 }
341         }
342         spin_unlock_irqrestore(&device_domain_lock, flags);
343
344         return 0;
345 }
346
347 const struct iommu_ops intel_iommu_ops;
348
349 static bool translation_pre_enabled(struct intel_iommu *iommu)
350 {
351         return (iommu->flags & VTD_FLAG_TRANS_PRE_ENABLED);
352 }
353
354 static void clear_translation_pre_enabled(struct intel_iommu *iommu)
355 {
356         iommu->flags &= ~VTD_FLAG_TRANS_PRE_ENABLED;
357 }
358
359 static void init_translation_status(struct intel_iommu *iommu)
360 {
361         u32 gsts;
362
363         gsts = readl(iommu->reg + DMAR_GSTS_REG);
364         if (gsts & DMA_GSTS_TES)
365                 iommu->flags |= VTD_FLAG_TRANS_PRE_ENABLED;
366 }
367
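/* Parse the comma-separated options of the intel_iommu= kernel parameter. */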
368 static int __init intel_iommu_setup(char *str)
369 {
370         if (!str)
371                 return -EINVAL;
372
373         while (*str) {
374                 if (!strncmp(str, "on", 2)) {
375                         dmar_disabled = 0;
376                         pr_info("IOMMU enabled\n");
377                 } else if (!strncmp(str, "off", 3)) {
378                         dmar_disabled = 1;
379                         no_platform_optin = 1;
380                         pr_info("IOMMU disabled\n");
381                 } else if (!strncmp(str, "igfx_off", 8)) {
382                         dmar_map_gfx = 0;
383                         pr_info("Disable GFX device mapping\n");
384                 } else if (!strncmp(str, "forcedac", 8)) {
385                         pr_warn("intel_iommu=forcedac deprecated; use iommu.forcedac instead\n");
386                         iommu_dma_forcedac = true;
387                 } else if (!strncmp(str, "strict", 6)) {
388                         pr_warn("intel_iommu=strict deprecated; use iommu.strict=1 instead\n");
389                         iommu_set_dma_strict();
390                 } else if (!strncmp(str, "sp_off", 6)) {
391                         pr_info("Disable supported super page\n");
392                         intel_iommu_superpage = 0;
393                 } else if (!strncmp(str, "sm_on", 5)) {
394                         pr_info("Enable scalable mode if hardware supports\n");
395                         intel_iommu_sm = 1;
396                 } else if (!strncmp(str, "sm_off", 6)) {
397                         pr_info("Scalable mode is disallowed\n");
398                         intel_iommu_sm = 0;
399                 } else if (!strncmp(str, "tboot_noforce", 13)) {
400                         pr_info("Intel-IOMMU: not forcing on after tboot. This could expose security risk for tboot\n");
401                         intel_iommu_tboot_noforce = 1;
402                 } else {
403                         pr_notice("Unknown option - '%s'\n", str);
404                 }
405
406                 str += strcspn(str, ",");
407                 while (*str == ',')
408                         str++;
409         }
410
411         return 1;
412 }
413 __setup("intel_iommu=", intel_iommu_setup);
414
415 void *alloc_pgtable_page(int node)
416 {
417         struct page *page;
418         void *vaddr = NULL;
419
420         page = alloc_pages_node(node, GFP_ATOMIC | __GFP_ZERO, 0);
421         if (page)
422                 vaddr = page_address(page);
423         return vaddr;
424 }
425
426 void free_pgtable_page(void *vaddr)
427 {
428         free_page((unsigned long)vaddr);
429 }
430
431 static inline int domain_type_is_si(struct dmar_domain *domain)
432 {
433         return domain->domain.type == IOMMU_DOMAIN_IDENTITY;
434 }
435
436 static inline bool domain_use_first_level(struct dmar_domain *domain)
437 {
438         return domain->flags & DOMAIN_FLAG_USE_FIRST_LEVEL;
439 }
440
441 static inline int domain_pfn_supported(struct dmar_domain *domain,
442                                        unsigned long pfn)
443 {
444         int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
445
446         return !(addr_width < BITS_PER_LONG && pfn >> addr_width);
447 }
448
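/*
 * Return the largest agaw supported by @iommu (per its SAGAW capability
 * bits) that does not exceed @max_gaw, or -1 if none is supported.
 */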
449 static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw)
450 {
451         unsigned long sagaw;
452         int agaw;
453
454         sagaw = cap_sagaw(iommu->cap);
455         for (agaw = width_to_agaw(max_gaw);
456              agaw >= 0; agaw--) {
457                 if (test_bit(agaw, &sagaw))
458                         break;
459         }
460
461         return agaw;
462 }
463
464 /*
465  * Calculate max SAGAW for each iommu.
466  */
467 int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
468 {
469         return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH);
470 }
471
472 /*
473  * Calculate the agaw for each iommu.
474  * "SAGAW" may differ across iommus, so use a default agaw and fall
475  * back to a smaller supported agaw for iommus that don't support the default.
476  */
477 int iommu_calculate_agaw(struct intel_iommu *iommu)
478 {
479         return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH);
480 }
481
482 /* This function only returns a single iommu in a domain */
483 struct intel_iommu *domain_get_iommu(struct dmar_domain *domain)
484 {
485         int iommu_id;
486
487         /* si_domain and vm domain should not get here. */
488         if (WARN_ON(!iommu_is_dma_domain(&domain->domain)))
489                 return NULL;
490
491         for_each_domain_iommu(iommu_id, domain)
492                 break;
493
494         if (iommu_id < 0 || iommu_id >= g_num_of_iommus)
495                 return NULL;
496
497         return g_iommus[iommu_id];
498 }
499
500 static inline bool iommu_paging_structure_coherency(struct intel_iommu *iommu)
501 {
502         return sm_supported(iommu) ?
503                         ecap_smpwc(iommu->ecap) : ecap_coherent(iommu->ecap);
504 }
505
506 static void domain_update_iommu_coherency(struct dmar_domain *domain)
507 {
508         struct dmar_drhd_unit *drhd;
509         struct intel_iommu *iommu;
510         bool found = false;
511         int i;
512
513         domain->iommu_coherency = true;
514
515         for_each_domain_iommu(i, domain) {
516                 found = true;
517                 if (!iommu_paging_structure_coherency(g_iommus[i])) {
518                         domain->iommu_coherency = false;
519                         break;
520                 }
521         }
522         if (found)
523                 return;
524
525         /* No hardware attached; use lowest common denominator */
526         rcu_read_lock();
527         for_each_active_iommu(iommu, drhd) {
528                 if (!iommu_paging_structure_coherency(iommu)) {
529                         domain->iommu_coherency = false;
530                         break;
531                 }
532         }
533         rcu_read_unlock();
534 }
535
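/*
 * Return the super-page level supported by all relevant IOMMUs
 * (optionally skipping @skip): 0 = no super pages, 1 = 2MiB, 2 = 1GiB.
 */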
536 static int domain_update_iommu_superpage(struct dmar_domain *domain,
537                                          struct intel_iommu *skip)
538 {
539         struct dmar_drhd_unit *drhd;
540         struct intel_iommu *iommu;
541         int mask = 0x3;
542
543         if (!intel_iommu_superpage)
544                 return 0;
545
546         /* set iommu_superpage to the smallest common denominator */
547         rcu_read_lock();
548         for_each_active_iommu(iommu, drhd) {
549                 if (iommu != skip) {
550                         if (domain && domain_use_first_level(domain)) {
551                                 if (!cap_fl1gp_support(iommu->cap))
552                                         mask = 0x1;
553                         } else {
554                                 mask &= cap_super_page_val(iommu->cap);
555                         }
556
557                         if (!mask)
558                                 break;
559                 }
560         }
561         rcu_read_unlock();
562
563         return fls(mask);
564 }
565
566 static int domain_update_device_node(struct dmar_domain *domain)
567 {
568         struct device_domain_info *info;
569         int nid = NUMA_NO_NODE;
570
571         assert_spin_locked(&device_domain_lock);
572
573         if (list_empty(&domain->devices))
574                 return NUMA_NO_NODE;
575
576         list_for_each_entry(info, &domain->devices, link) {
577                 if (!info->dev)
578                         continue;
579
580                 /*
581                  * There could be multiple device NUMA nodes, as devices within
582                  * the same domain may sit behind different IOMMUs. There is no
583                  * perfect answer in such a situation, so pick the first node
584                  * reported (first come, first served).
585                  */
586                 nid = dev_to_node(info->dev);
587                 if (nid != NUMA_NO_NODE)
588                         break;
589         }
590
591         return nid;
592 }
593
594 static void domain_update_iotlb(struct dmar_domain *domain);
595
596 /* Return the super pagesize bitmap if supported. */
597 static unsigned long domain_super_pgsize_bitmap(struct dmar_domain *domain)
598 {
599         unsigned long bitmap = 0;
600
601         /*
602          * 1-level super page supports page size of 2MiB, 2-level super page
603          * supports page size of both 2MiB and 1GiB.
604          */
605         if (domain->iommu_superpage == 1)
606                 bitmap |= SZ_2M;
607         else if (domain->iommu_superpage == 2)
608                 bitmap |= SZ_2M | SZ_1G;
609
610         return bitmap;
611 }
612
613 /* Some capabilities may be different across iommus */
614 static void domain_update_iommu_cap(struct dmar_domain *domain)
615 {
616         domain_update_iommu_coherency(domain);
617         domain->iommu_superpage = domain_update_iommu_superpage(domain, NULL);
618
619         /*
620          * If RHSA is missing, we should default to the device numa domain
621          * as fall back.
622          */
623         if (domain->nid == NUMA_NO_NODE)
624                 domain->nid = domain_update_device_node(domain);
625
626         /*
627          * First-level translation restricts the input-address to a
628          * canonical address (i.e., address bits 63:N have the same
629          * value as address bit [N-1], where N is 48-bits with 4-level
630          * paging and 57-bits with 5-level paging). Hence, skip bit
631          * [N-1].
632          */
633         if (domain_use_first_level(domain))
634                 domain->domain.geometry.aperture_end = __DOMAIN_MAX_ADDR(domain->gaw - 1);
635         else
636                 domain->domain.geometry.aperture_end = __DOMAIN_MAX_ADDR(domain->gaw);
637
638         domain->domain.pgsize_bitmap |= domain_super_pgsize_bitmap(domain);
639         domain_update_iotlb(domain);
640 }
641
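/*
 * Return the context entry for (@bus, @devfn), allocating the context
 * table if @alloc is set. In scalable mode each root entry covers 128
 * devfns (low half via root->lo, high half via root->hi) and each
 * context entry is 256 bits wide, hence the devfn adjustment below.
 */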
642 struct context_entry *iommu_context_addr(struct intel_iommu *iommu, u8 bus,
643                                          u8 devfn, int alloc)
644 {
645         struct root_entry *root = &iommu->root_entry[bus];
646         struct context_entry *context;
647         u64 *entry;
648
649         entry = &root->lo;
650         if (sm_supported(iommu)) {
651                 if (devfn >= 0x80) {
652                         devfn -= 0x80;
653                         entry = &root->hi;
654                 }
655                 devfn *= 2;
656         }
657         if (*entry & 1)
658                 context = phys_to_virt(*entry & VTD_PAGE_MASK);
659         else {
660                 unsigned long phy_addr;
661                 if (!alloc)
662                         return NULL;
663
664                 context = alloc_pgtable_page(iommu->node);
665                 if (!context)
666                         return NULL;
667
668                 __iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
669                 phy_addr = virt_to_phys((void *)context);
670                 *entry = phy_addr | 1;
671                 __iommu_flush_cache(iommu, entry, sizeof(*entry));
672         }
673         return &context[devfn];
674 }
675
676 /**
677  * is_downstream_to_pci_bridge - test if a device belongs to the PCI
678  *                               sub-hierarchy of a candidate PCI-PCI bridge
679  * @dev: candidate PCI device belonging to @bridge PCI sub-hierarchy
680  * @bridge: the candidate PCI-PCI bridge
681  *
682  * Return: true if @dev belongs to @bridge PCI sub-hierarchy, else false.
683  */
684 static bool
685 is_downstream_to_pci_bridge(struct device *dev, struct device *bridge)
686 {
687         struct pci_dev *pdev, *pbridge;
688
689         if (!dev_is_pci(dev) || !dev_is_pci(bridge))
690                 return false;
691
692         pdev = to_pci_dev(dev);
693         pbridge = to_pci_dev(bridge);
694
695         if (pbridge->subordinate &&
696             pbridge->subordinate->number <= pdev->bus->number &&
697             pbridge->subordinate->busn_res.end >= pdev->bus->number)
698                 return true;
699
700         return false;
701 }
702
703 static bool quirk_ioat_snb_local_iommu(struct pci_dev *pdev)
704 {
705         struct dmar_drhd_unit *drhd;
706         u32 vtbar;
707         int rc;
708
709         /* We know that this device on this chipset has its own IOMMU.
710          * If we find it under a different IOMMU, then the BIOS is lying
711          * to us. Hope that the IOMMU for this device is actually
712          * disabled, and it needs no translation...
713          */
714         rc = pci_bus_read_config_dword(pdev->bus, PCI_DEVFN(0, 0), 0xb0, &vtbar);
715         if (rc) {
716                 /* "can't" happen */
717                 dev_info(&pdev->dev, "failed to run vt-d quirk\n");
718                 return false;
719         }
720         vtbar &= 0xffff0000;
721
722         /* we know that this iommu should be at offset 0xa000 from vtbar */
723         drhd = dmar_find_matched_drhd_unit(pdev);
724         if (!drhd || drhd->reg_base_addr - vtbar != 0xa000) {
725                 pr_warn_once(FW_BUG "BIOS assigned incorrect VT-d unit for Intel(R) QuickData Technology device\n");
726                 add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
727                 return true;
728         }
729
730         return false;
731 }
732
733 static bool iommu_is_dummy(struct intel_iommu *iommu, struct device *dev)
734 {
735         if (!iommu || iommu->drhd->ignored)
736                 return true;
737
738         if (dev_is_pci(dev)) {
739                 struct pci_dev *pdev = to_pci_dev(dev);
740
741                 if (pdev->vendor == PCI_VENDOR_ID_INTEL &&
742                     pdev->device == PCI_DEVICE_ID_INTEL_IOAT_SNB &&
743                     quirk_ioat_snb_local_iommu(pdev))
744                         return true;
745         }
746
747         return false;
748 }
749
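/*
 * Look up the IOMMU covering @dev and report the bus/devfn to use for
 * it in @bus/@devfn. PCI VFs are looked up via their PF (but keep their
 * own BDF), and ACPI companion devices are resolved to the companion
 * before matching against the DMAR device scopes.
 */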
750 struct intel_iommu *device_to_iommu(struct device *dev, u8 *bus, u8 *devfn)
751 {
752         struct dmar_drhd_unit *drhd = NULL;
753         struct pci_dev *pdev = NULL;
754         struct intel_iommu *iommu;
755         struct device *tmp;
756         u16 segment = 0;
757         int i;
758
759         if (!dev)
760                 return NULL;
761
762         if (dev_is_pci(dev)) {
763                 struct pci_dev *pf_pdev;
764
765                 pdev = pci_real_dma_dev(to_pci_dev(dev));
766
767                 /* VFs aren't listed in scope tables; we need to look up
768                  * the PF instead to find the IOMMU. */
769                 pf_pdev = pci_physfn(pdev);
770                 dev = &pf_pdev->dev;
771                 segment = pci_domain_nr(pdev->bus);
772         } else if (has_acpi_companion(dev))
773                 dev = &ACPI_COMPANION(dev)->dev;
774
775         rcu_read_lock();
776         for_each_iommu(iommu, drhd) {
777                 if (pdev && segment != drhd->segment)
778                         continue;
779
780                 for_each_active_dev_scope(drhd->devices,
781                                           drhd->devices_cnt, i, tmp) {
782                         if (tmp == dev) {
783                                 /* For a VF use its original BDF# not that of the PF
784                                  * which we used for the IOMMU lookup. Strictly speaking
785                                  * we could do this for all PCI devices; we only need to
786                                  * get the BDF# from the scope table for ACPI matches. */
787                                 if (pdev && pdev->is_virtfn)
788                                         goto got_pdev;
789
790                                 if (bus && devfn) {
791                                         *bus = drhd->devices[i].bus;
792                                         *devfn = drhd->devices[i].devfn;
793                                 }
794                                 goto out;
795                         }
796
797                         if (is_downstream_to_pci_bridge(dev, tmp))
798                                 goto got_pdev;
799                 }
800
801                 if (pdev && drhd->include_all) {
802 got_pdev:
803                         if (bus && devfn) {
804                                 *bus = pdev->bus->number;
805                                 *devfn = pdev->devfn;
806                         }
807                         goto out;
808                 }
809         }
810         iommu = NULL;
811 out:
812         if (iommu_is_dummy(iommu, dev))
813                 iommu = NULL;
814
815         rcu_read_unlock();
816
817         return iommu;
818 }
819
820 static void domain_flush_cache(struct dmar_domain *domain,
821                                void *addr, int size)
822 {
823         if (!domain->iommu_coherency)
824                 clflush_cache_range(addr, size);
825 }
826
827 static int device_context_mapped(struct intel_iommu *iommu, u8 bus, u8 devfn)
828 {
829         struct context_entry *context;
830         int ret = 0;
831         unsigned long flags;
832
833         spin_lock_irqsave(&iommu->lock, flags);
834         context = iommu_context_addr(iommu, bus, devfn, 0);
835         if (context)
836                 ret = context_present(context);
837         spin_unlock_irqrestore(&iommu->lock, flags);
838         return ret;
839 }
840
841 static void free_context_table(struct intel_iommu *iommu)
842 {
843         int i;
844         unsigned long flags;
845         struct context_entry *context;
846
847         spin_lock_irqsave(&iommu->lock, flags);
848         if (!iommu->root_entry) {
849                 goto out;
850         }
851         for (i = 0; i < ROOT_ENTRY_NR; i++) {
852                 context = iommu_context_addr(iommu, i, 0, 0);
853                 if (context)
854                         free_pgtable_page(context);
855
856                 if (!sm_supported(iommu))
857                         continue;
858
859                 context = iommu_context_addr(iommu, i, 0x80, 0);
860                 if (context)
861                         free_pgtable_page(context);
862
863         }
864         free_pgtable_page(iommu->root_entry);
865         iommu->root_entry = NULL;
866 out:
867         spin_unlock_irqrestore(&iommu->lock, flags);
868 }
869
870 #ifdef CONFIG_DMAR_DEBUG
871 static void pgtable_walk(struct intel_iommu *iommu, unsigned long pfn, u8 bus, u8 devfn)
872 {
873         struct device_domain_info *info;
874         struct dma_pte *parent, *pte;
875         struct dmar_domain *domain;
876         int offset, level;
877
878         info = dmar_search_domain_by_dev_info(iommu->segment, bus, devfn);
879         if (!info || !info->domain) {
880                 pr_info("device [%02x:%02x.%d] not probed\n",
881                         bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
882                 return;
883         }
884
885         domain = info->domain;
886         level = agaw_to_level(domain->agaw);
887         parent = domain->pgd;
888         if (!parent) {
889                 pr_info("no page table setup\n");
890                 return;
891         }
892
893         while (1) {
894                 offset = pfn_level_offset(pfn, level);
895                 pte = &parent[offset];
896                 if (!pte || (dma_pte_superpage(pte) || !dma_pte_present(pte))) {
897                         pr_info("PTE not present at level %d\n", level);
898                         break;
899                 }
900
901                 pr_info("pte level: %d, pte value: 0x%016llx\n", level, pte->val);
902
903                 if (level == 1)
904                         break;
905
906                 parent = phys_to_virt(dma_pte_addr(pte));
907                 level--;
908         }
909 }
910
911 void dmar_fault_dump_ptes(struct intel_iommu *iommu, u16 source_id,
912                           unsigned long long addr, u32 pasid)
913 {
914         struct pasid_dir_entry *dir, *pde;
915         struct pasid_entry *entries, *pte;
916         struct context_entry *ctx_entry;
917         struct root_entry *rt_entry;
918         u8 devfn = source_id & 0xff;
919         u8 bus = source_id >> 8;
920         int i, dir_index, index;
921
922         pr_info("Dump %s table entries for IOVA 0x%llx\n", iommu->name, addr);
923
924         /* root entry dump */
925         rt_entry = &iommu->root_entry[bus];
926         if (!rt_entry) {
927                 pr_info("root table entry is not present\n");
928                 return;
929         }
930
931         if (sm_supported(iommu))
932                 pr_info("scalable mode root entry: hi 0x%016llx, low 0x%016llx\n",
933                         rt_entry->hi, rt_entry->lo);
934         else
935                 pr_info("root entry: 0x%016llx\n", rt_entry->lo);
936
937         /* context entry dump */
938         ctx_entry = iommu_context_addr(iommu, bus, devfn, 0);
939         if (!ctx_entry) {
940                 pr_info("context table entry is not present\n");
941                 return;
942         }
943
944         pr_info("context entry: hi 0x%016llx, low 0x%016llx\n",
945                 ctx_entry->hi, ctx_entry->lo);
946
947         /* legacy mode does not require PASID entries */
948         if (!sm_supported(iommu))
949                 goto pgtable_walk;
950
951         /* get the pointer to pasid directory entry */
952         dir = phys_to_virt(ctx_entry->lo & VTD_PAGE_MASK);
953         if (!dir) {
954                 pr_info("pasid directory entry is not present\n");
955                 return;
956         }
957         /* For request-without-pasid, get the pasid from context entry */
958         if (intel_iommu_sm && pasid == INVALID_IOASID)
959                 pasid = PASID_RID2PASID;
960
961         dir_index = pasid >> PASID_PDE_SHIFT;
962         pde = &dir[dir_index];
963         pr_info("pasid dir entry: 0x%016llx\n", pde->val);
964
965         /* get the pointer to the pasid table entry */
966         entries = get_pasid_table_from_pde(pde);
967         if (!entries) {
968                 pr_info("pasid table entry is not present\n");
969                 return;
970         }
971         index = pasid & PASID_PTE_MASK;
972         pte = &entries[index];
973         for (i = 0; i < ARRAY_SIZE(pte->val); i++)
974                 pr_info("pasid table entry[%d]: 0x%016llx\n", i, pte->val[i]);
975
976 pgtable_walk:
977         pgtable_walk(iommu, addr >> VTD_PAGE_SHIFT, bus, devfn);
978 }
979 #endif
980
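/*
 * Return the PTE for @pfn, allocating missing intermediate page-table
 * pages on the way down. On entry *target_level is the level at which
 * the caller wants the PTE (0 means "whatever level is currently
 * mapped"); on return it holds the level of the returned PTE.
 */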
981 static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
982                                       unsigned long pfn, int *target_level)
983 {
984         struct dma_pte *parent, *pte;
985         int level = agaw_to_level(domain->agaw);
986         int offset;
987
988         BUG_ON(!domain->pgd);
989
990         if (!domain_pfn_supported(domain, pfn))
991                 /* Address beyond IOMMU's addressing capabilities. */
992                 return NULL;
993
994         parent = domain->pgd;
995
996         while (1) {
997                 void *tmp_page;
998
999                 offset = pfn_level_offset(pfn, level);
1000                 pte = &parent[offset];
1001                 if (!*target_level && (dma_pte_superpage(pte) || !dma_pte_present(pte)))
1002                         break;
1003                 if (level == *target_level)
1004                         break;
1005
1006                 if (!dma_pte_present(pte)) {
1007                         uint64_t pteval;
1008
1009                         tmp_page = alloc_pgtable_page(domain->nid);
1010
1011                         if (!tmp_page)
1012                                 return NULL;
1013
1014                         domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE);
1015                         pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE;
1016                         if (domain_use_first_level(domain)) {
1017                                 pteval |= DMA_FL_PTE_XD | DMA_FL_PTE_US;
1018                                 if (iommu_is_dma_domain(&domain->domain))
1019                                         pteval |= DMA_FL_PTE_ACCESS;
1020                         }
1021                         if (cmpxchg64(&pte->val, 0ULL, pteval))
1022                                 /* Someone else set it while we were thinking; use theirs. */
1023                                 free_pgtable_page(tmp_page);
1024                         else
1025                                 domain_flush_cache(domain, pte, sizeof(*pte));
1026                 }
1027                 if (level == 1)
1028                         break;
1029
1030                 parent = phys_to_virt(dma_pte_addr(pte));
1031                 level--;
1032         }
1033
1034         if (!*target_level)
1035                 *target_level = level;
1036
1037         return pte;
1038 }
1039
1040 /* return address's pte at specific level */
1041 static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain,
1042                                          unsigned long pfn,
1043                                          int level, int *large_page)
1044 {
1045         struct dma_pte *parent, *pte;
1046         int total = agaw_to_level(domain->agaw);
1047         int offset;
1048
1049         parent = domain->pgd;
1050         while (level <= total) {
1051                 offset = pfn_level_offset(pfn, total);
1052                 pte = &parent[offset];
1053                 if (level == total)
1054                         return pte;
1055
1056                 if (!dma_pte_present(pte)) {
1057                         *large_page = total;
1058                         break;
1059                 }
1060
1061                 if (dma_pte_superpage(pte)) {
1062                         *large_page = total;
1063                         return pte;
1064                 }
1065
1066                 parent = phys_to_virt(dma_pte_addr(pte));
1067                 total--;
1068         }
1069         return NULL;
1070 }
1071
1072 /* Clear last-level (leaf) ptes; a TLB flush should follow. */
1073 static void dma_pte_clear_range(struct dmar_domain *domain,
1074                                 unsigned long start_pfn,
1075                                 unsigned long last_pfn)
1076 {
1077         unsigned int large_page;
1078         struct dma_pte *first_pte, *pte;
1079
1080         BUG_ON(!domain_pfn_supported(domain, start_pfn));
1081         BUG_ON(!domain_pfn_supported(domain, last_pfn));
1082         BUG_ON(start_pfn > last_pfn);
1083
1084         /* we don't need lock here; nobody else touches the iova range */
1085         do {
1086                 large_page = 1;
1087                 first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1, &large_page);
1088                 if (!pte) {
1089                         start_pfn = align_to_level(start_pfn + 1, large_page + 1);
1090                         continue;
1091                 }
1092                 do {
1093                         dma_clear_pte(pte);
1094                         start_pfn += lvl_to_nr_pages(large_page);
1095                         pte++;
1096                 } while (start_pfn <= last_pfn && !first_pte_in_page(pte));
1097
1098                 domain_flush_cache(domain, first_pte,
1099                                    (void *)pte - (void *)first_pte);
1100
1101         } while (start_pfn && start_pfn <= last_pfn);
1102 }
1103
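/*
 * Recursively free page-table pages fully covered by [start_pfn, last_pfn]
 * at levels below @retain_level. Leaf PTEs are expected to have been
 * cleared already by dma_pte_clear_range().
 */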
1104 static void dma_pte_free_level(struct dmar_domain *domain, int level,
1105                                int retain_level, struct dma_pte *pte,
1106                                unsigned long pfn, unsigned long start_pfn,
1107                                unsigned long last_pfn)
1108 {
1109         pfn = max(start_pfn, pfn);
1110         pte = &pte[pfn_level_offset(pfn, level)];
1111
1112         do {
1113                 unsigned long level_pfn;
1114                 struct dma_pte *level_pte;
1115
1116                 if (!dma_pte_present(pte) || dma_pte_superpage(pte))
1117                         goto next;
1118
1119                 level_pfn = pfn & level_mask(level);
1120                 level_pte = phys_to_virt(dma_pte_addr(pte));
1121
1122                 if (level > 2) {
1123                         dma_pte_free_level(domain, level - 1, retain_level,
1124                                            level_pte, level_pfn, start_pfn,
1125                                            last_pfn);
1126                 }
1127
1128                 /*
1129                  * Free the page table if we're below the level we want to
1130                  * retain and the range covers the entire table.
1131                  */
1132                 if (level < retain_level && !(start_pfn > level_pfn ||
1133                       last_pfn < level_pfn + level_size(level) - 1)) {
1134                         dma_clear_pte(pte);
1135                         domain_flush_cache(domain, pte, sizeof(*pte));
1136                         free_pgtable_page(level_pte);
1137                 }
1138 next:
1139                 pfn += level_size(level);
1140         } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1141 }
1142
1143 /*
1144  * clear last level (leaf) ptes and free page table pages below the
1145  * level we wish to keep intact.
1146  */
1147 static void dma_pte_free_pagetable(struct dmar_domain *domain,
1148                                    unsigned long start_pfn,
1149                                    unsigned long last_pfn,
1150                                    int retain_level)
1151 {
1152         dma_pte_clear_range(domain, start_pfn, last_pfn);
1153
1154         /* We don't need lock here; nobody else touches the iova range */
1155         dma_pte_free_level(domain, agaw_to_level(domain->agaw), retain_level,
1156                            domain->pgd, 0, start_pfn, last_pfn);
1157
1158         /* free pgd */
1159         if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1160                 free_pgtable_page(domain->pgd);
1161                 domain->pgd = NULL;
1162         }
1163 }
1164
1165 /* When a page at a given level is being unlinked from its parent, we don't
1166    need to *modify* it at all. All we need to do is make a list of all the
1167    pages which can be freed just as soon as we've flushed the IOTLB and we
1168    know the hardware page-walk will no longer touch them.
1169    The 'pte' argument is the *parent* PTE, pointing to the page that is to
1170    be freed. */
1171 static void dma_pte_list_pagetables(struct dmar_domain *domain,
1172                                     int level, struct dma_pte *pte,
1173                                     struct list_head *freelist)
1174 {
1175         struct page *pg;
1176
1177         pg = pfn_to_page(dma_pte_addr(pte) >> PAGE_SHIFT);
1178         list_add_tail(&pg->lru, freelist);
1179
1180         if (level == 1)
1181                 return;
1182
1183         pte = page_address(pg);
1184         do {
1185                 if (dma_pte_present(pte) && !dma_pte_superpage(pte))
1186                         dma_pte_list_pagetables(domain, level - 1, pte, freelist);
1187                 pte++;
1188         } while (!first_pte_in_page(pte));
1189 }
1190
1191 static void dma_pte_clear_level(struct dmar_domain *domain, int level,
1192                                 struct dma_pte *pte, unsigned long pfn,
1193                                 unsigned long start_pfn, unsigned long last_pfn,
1194                                 struct list_head *freelist)
1195 {
1196         struct dma_pte *first_pte = NULL, *last_pte = NULL;
1197
1198         pfn = max(start_pfn, pfn);
1199         pte = &pte[pfn_level_offset(pfn, level)];
1200
1201         do {
1202                 unsigned long level_pfn = pfn & level_mask(level);
1203
1204                 if (!dma_pte_present(pte))
1205                         goto next;
1206
1207                 /* If range covers entire pagetable, free it */
1208                 if (start_pfn <= level_pfn &&
1209                     last_pfn >= level_pfn + level_size(level) - 1) {
1210                         /* These subordinate page tables are going away entirely. Don't
1211                            bother to clear them; we're just going to *free* them. */
1212                         if (level > 1 && !dma_pte_superpage(pte))
1213                                 dma_pte_list_pagetables(domain, level - 1, pte, freelist);
1214
1215                         dma_clear_pte(pte);
1216                         if (!first_pte)
1217                                 first_pte = pte;
1218                         last_pte = pte;
1219                 } else if (level > 1) {
1220                         /* Recurse down into a level that isn't *entirely* obsolete */
1221                         dma_pte_clear_level(domain, level - 1,
1222                                             phys_to_virt(dma_pte_addr(pte)),
1223                                             level_pfn, start_pfn, last_pfn,
1224                                             freelist);
1225                 }
1226 next:
1227                 pfn = level_pfn + level_size(level);
1228         } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1229
1230         if (first_pte)
1231                 domain_flush_cache(domain, first_pte,
1232                                    (void *)++last_pte - (void *)first_pte);
1233 }
1234
1235 /* We can't just free the pages because the IOMMU may still be walking
1236    the page tables, and may have cached the intermediate levels. The
1237    pages can only be freed after the IOTLB flush has been done. */
1238 static void domain_unmap(struct dmar_domain *domain, unsigned long start_pfn,
1239                          unsigned long last_pfn, struct list_head *freelist)
1240 {
1241         BUG_ON(!domain_pfn_supported(domain, start_pfn));
1242         BUG_ON(!domain_pfn_supported(domain, last_pfn));
1243         BUG_ON(start_pfn > last_pfn);
1244
1245         /* we don't need lock here; nobody else touches the iova range */
1246         dma_pte_clear_level(domain, agaw_to_level(domain->agaw),
1247                             domain->pgd, 0, start_pfn, last_pfn, freelist);
1248
1249         /* free pgd */
1250         if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1251                 struct page *pgd_page = virt_to_page(domain->pgd);
1252                 list_add_tail(&pgd_page->lru, freelist);
1253                 domain->pgd = NULL;
1254         }
1255 }
1256
1257 /* iommu handling */
1258 static int iommu_alloc_root_entry(struct intel_iommu *iommu)
1259 {
1260         struct root_entry *root;
1261         unsigned long flags;
1262
1263         root = (struct root_entry *)alloc_pgtable_page(iommu->node);
1264         if (!root) {
1265                 pr_err("Allocating root entry for %s failed\n",
1266                         iommu->name);
1267                 return -ENOMEM;
1268         }
1269
1270         __iommu_flush_cache(iommu, root, ROOT_SIZE);
1271
1272         spin_lock_irqsave(&iommu->lock, flags);
1273         iommu->root_entry = root;
1274         spin_unlock_irqrestore(&iommu->lock, flags);
1275
1276         return 0;
1277 }
1278
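/*
 * Program the root table address into the hardware (with the scalable
 * mode bit if supported), wait for the Set Root Table Pointer operation
 * to complete, then globally invalidate the context cache, the PASID
 * cache (in scalable mode) and the IOTLB.
 */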
1279 static void iommu_set_root_entry(struct intel_iommu *iommu)
1280 {
1281         u64 addr;
1282         u32 sts;
1283         unsigned long flag;
1284
1285         addr = virt_to_phys(iommu->root_entry);
1286         if (sm_supported(iommu))
1287                 addr |= DMA_RTADDR_SMT;
1288
1289         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1290         dmar_writeq(iommu->reg + DMAR_RTADDR_REG, addr);
1291
1292         writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG);
1293
1294         /* Make sure hardware complete it */
1295         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1296                       readl, (sts & DMA_GSTS_RTPS), sts);
1297
1298         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1299
1300         iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
1301         if (sm_supported(iommu))
1302                 qi_flush_pasid_cache(iommu, 0, QI_PC_GLOBAL, 0);
1303         iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
1304 }
1305
1306 void iommu_flush_write_buffer(struct intel_iommu *iommu)
1307 {
1308         u32 val;
1309         unsigned long flag;
1310
1311         if (!rwbf_quirk && !cap_rwbf(iommu->cap))
1312                 return;
1313
1314         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1315         writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG);
1316
1317         /* Make sure hardware complete it */
1318         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1319                       readl, (!(val & DMA_GSTS_WBFS)), val);
1320
1321         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1322 }
1323
1324 /* return value determines if we need a write buffer flush */
1325 static void __iommu_flush_context(struct intel_iommu *iommu,
1326                                   u16 did, u16 source_id, u8 function_mask,
1327                                   u64 type)
1328 {
1329         u64 val = 0;
1330         unsigned long flag;
1331
1332         switch (type) {
1333         case DMA_CCMD_GLOBAL_INVL:
1334                 val = DMA_CCMD_GLOBAL_INVL;
1335                 break;
1336         case DMA_CCMD_DOMAIN_INVL:
1337                 val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
1338                 break;
1339         case DMA_CCMD_DEVICE_INVL:
1340                 val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
1341                         | DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
1342                 break;
1343         default:
1344                 BUG();
1345         }
1346         val |= DMA_CCMD_ICC;
1347
1348         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1349         dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
1350
1351         /* Make sure hardware complete it */
1352         IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
1353                 dmar_readq, (!(val & DMA_CCMD_ICC)), val);
1354
1355         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1356 }
1357
1358 /* return value determines if we need a write buffer flush */
1359 static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
1360                                 u64 addr, unsigned int size_order, u64 type)
1361 {
1362         int tlb_offset = ecap_iotlb_offset(iommu->ecap);
1363         u64 val = 0, val_iva = 0;
1364         unsigned long flag;
1365
1366         switch (type) {
1367         case DMA_TLB_GLOBAL_FLUSH:
1368                 /* global flush doesn't need to set IVA_REG */
1369                 val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
1370                 break;
1371         case DMA_TLB_DSI_FLUSH:
1372                 val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1373                 break;
1374         case DMA_TLB_PSI_FLUSH:
1375                 val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1376                 /* IH bit is passed in as part of address */
1377                 val_iva = size_order | addr;
1378                 break;
1379         default:
1380                 BUG();
1381         }
1382         /* Note: set drain read/write */
1383 #if 0
1384         /*
1385          * This is probably just to be extra safe. Looks like we can
1386          * ignore it without any impact.
1387          */
1388         if (cap_read_drain(iommu->cap))
1389                 val |= DMA_TLB_READ_DRAIN;
1390 #endif
1391         if (cap_write_drain(iommu->cap))
1392                 val |= DMA_TLB_WRITE_DRAIN;
1393
1394         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1395         /* Note: Only uses first TLB reg currently */
1396         if (val_iva)
1397                 dmar_writeq(iommu->reg + tlb_offset, val_iva);
1398         dmar_writeq(iommu->reg + tlb_offset + 8, val);
1399
1400         /* Make sure hardware complete it */
1401         IOMMU_WAIT_OP(iommu, tlb_offset + 8,
1402                 dmar_readq, (!(val & DMA_TLB_IVT)), val);
1403
1404         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1405
1406         /* check IOTLB invalidation granularity */
1407         if (DMA_TLB_IAIG(val) == 0)
1408                 pr_err("Flush IOTLB failed\n");
1409         if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
1410                 pr_debug("TLB flush request %Lx, actual %Lx\n",
1411                         (unsigned long long)DMA_TLB_IIRG(type),
1412                         (unsigned long long)DMA_TLB_IAIG(val));
1413 }
1414
1415 static struct device_domain_info *
1416 iommu_support_dev_iotlb(struct dmar_domain *domain, struct intel_iommu *iommu,
1417                          u8 bus, u8 devfn)
1418 {
1419         struct device_domain_info *info;
1420
1421         assert_spin_locked(&device_domain_lock);
1422
1423         if (!iommu->qi)
1424                 return NULL;
1425
1426         list_for_each_entry(info, &domain->devices, link)
1427                 if (info->iommu == iommu && info->bus == bus &&
1428                     info->devfn == devfn) {
1429                         if (info->ats_supported && info->dev)
1430                                 return info;
1431                         break;
1432                 }
1433
1434         return NULL;
1435 }
1436
1437 static void domain_update_iotlb(struct dmar_domain *domain)
1438 {
1439         struct device_domain_info *info;
1440         bool has_iotlb_device = false;
1441
1442         assert_spin_locked(&device_domain_lock);
1443
1444         list_for_each_entry(info, &domain->devices, link)
1445                 if (info->ats_enabled) {
1446                         has_iotlb_device = true;
1447                         break;
1448                 }
1449
1450         domain->has_iotlb_device = has_iotlb_device;
1451 }
1452
1453 static void iommu_enable_dev_iotlb(struct device_domain_info *info)
1454 {
1455         struct pci_dev *pdev;
1456
1457         assert_spin_locked(&device_domain_lock);
1458
1459         if (!info || !dev_is_pci(info->dev))
1460                 return;
1461
1462         pdev = to_pci_dev(info->dev);
1463         /* For IOMMU that supports device IOTLB throttling (DIT), we assign
1464          * PFSID to the invalidation desc of a VF such that IOMMU HW can gauge
1465          * queue depth at PF level. If DIT is not set, PFSID will be treated as
1466          * reserved, which should be set to 0.
1467          */
1468         if (!ecap_dit(info->iommu->ecap))
1469                 info->pfsid = 0;
1470         else {
1471                 struct pci_dev *pf_pdev;
1472
1473                 /* pdev will be returned if device is not a vf */
1474                 pf_pdev = pci_physfn(pdev);
1475                 info->pfsid = pci_dev_id(pf_pdev);
1476         }
1477
1478 #ifdef CONFIG_INTEL_IOMMU_SVM
1479         /* The PCIe spec, in its wisdom, declares that the behaviour of
1480            the device if you enable PASID support after ATS support is
1481            undefined. So always enable PASID support on devices which
1482            have it, even if we can't yet know if we're ever going to
1483            use it. */
1484         if (info->pasid_supported && !pci_enable_pasid(pdev, info->pasid_supported & ~1))
1485                 info->pasid_enabled = 1;
1486
1487         if (info->pri_supported &&
1488             (info->pasid_enabled ? pci_prg_resp_pasid_required(pdev) : 1)  &&
1489             !pci_reset_pri(pdev) && !pci_enable_pri(pdev, PRQ_DEPTH))
1490                 info->pri_enabled = 1;
1491 #endif
1492         if (info->ats_supported && pci_ats_page_aligned(pdev) &&
1493             !pci_enable_ats(pdev, VTD_PAGE_SHIFT)) {
1494                 info->ats_enabled = 1;
1495                 domain_update_iotlb(info->domain);
1496                 info->ats_qdep = pci_ats_queue_depth(pdev);
1497         }
1498 }
1499
1500 static void iommu_disable_dev_iotlb(struct device_domain_info *info)
1501 {
1502         struct pci_dev *pdev;
1503
1504         assert_spin_locked(&device_domain_lock);
1505
1506         if (!dev_is_pci(info->dev))
1507                 return;
1508
1509         pdev = to_pci_dev(info->dev);
1510
1511         if (info->ats_enabled) {
1512                 pci_disable_ats(pdev);
1513                 info->ats_enabled = 0;
1514                 domain_update_iotlb(info->domain);
1515         }
1516 #ifdef CONFIG_INTEL_IOMMU_SVM
1517         if (info->pri_enabled) {
1518                 pci_disable_pri(pdev);
1519                 info->pri_enabled = 0;
1520         }
1521         if (info->pasid_enabled) {
1522                 pci_disable_pasid(pdev);
1523                 info->pasid_enabled = 0;
1524         }
1525 #endif
1526 }
1527
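     /*
      * Issue a device IOTLB (ATS) invalidation for a single device. This is
      * a no-op when ATS is not enabled for the device.
      */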
1528 static void __iommu_flush_dev_iotlb(struct device_domain_info *info,
1529                                     u64 addr, unsigned int mask)
1530 {
1531         u16 sid, qdep;
1532
1533         if (!info || !info->ats_enabled)
1534                 return;
1535
1536         sid = info->bus << 8 | info->devfn;
1537         qdep = info->ats_qdep;
1538         qi_flush_dev_iotlb(info->iommu, sid, info->pfsid,
1539                            qdep, addr, mask);
1540 }
1541
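     /*
      * Flush the device IOTLBs of all devices attached to the domain.
      * Returns early when no attached device has ATS enabled.
      */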
1542 static void iommu_flush_dev_iotlb(struct dmar_domain *domain,
1543                                   u64 addr, unsigned mask)
1544 {
1545         unsigned long flags;
1546         struct device_domain_info *info;
1547
1548         if (!domain->has_iotlb_device)
1549                 return;
1550
1551         spin_lock_irqsave(&device_domain_lock, flags);
1552         list_for_each_entry(info, &domain->devices, link)
1553                 __iommu_flush_dev_iotlb(info, addr, mask);
1554
1555         spin_unlock_irqrestore(&device_domain_lock, flags);
1556 }
1557
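     /*
      * Page-selective-within-domain IOTLB invalidation. For second-level
      * translation this falls back to a domain-selective flush when PSI is
      * not usable or the range is too large; device IOTLBs are flushed as
      * well unless this is a caching-mode map of previously non-present
      * entries.
      */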
1558 static void iommu_flush_iotlb_psi(struct intel_iommu *iommu,
1559                                   struct dmar_domain *domain,
1560                                   unsigned long pfn, unsigned int pages,
1561                                   int ih, int map)
1562 {
1563         unsigned int aligned_pages = __roundup_pow_of_two(pages);
1564         unsigned int mask = ilog2(aligned_pages);
1565         uint64_t addr = (uint64_t)pfn << VTD_PAGE_SHIFT;
1566         u16 did = domain->iommu_did[iommu->seq_id];
1567
1568         BUG_ON(pages == 0);
1569
1570         if (ih)
1571                 ih = 1 << 6;
1572
1573         if (domain_use_first_level(domain)) {
1574                 qi_flush_piotlb(iommu, did, PASID_RID2PASID, addr, pages, ih);
1575         } else {
1576                 unsigned long bitmask = aligned_pages - 1;
1577
1578                 /*
1579                  * PSI masks the low order bits of the base address. If the
1580                  * address isn't aligned to the mask, then compute a mask value
1581                  * needed to ensure the target range is flushed.
1582                  */
1583                 if (unlikely(bitmask & pfn)) {
1584                         unsigned long end_pfn = pfn + pages - 1, shared_bits;
1585
1586                         /*
1587                          * Since end_pfn <= pfn + bitmask, the only way bits
1588                          * higher than bitmask can differ in pfn and end_pfn is
1589                          * by carrying. This means after masking out bitmask,
1590                          * high bits starting with the first set bit in
1591                          * shared_bits are all equal in both pfn and end_pfn.
1592                          */
1593                         shared_bits = ~(pfn ^ end_pfn) & ~bitmask;
1594                         mask = shared_bits ? __ffs(shared_bits) : BITS_PER_LONG;
1595                 }
1596
1597                 /*
1598                  * Fall back to a domain-selective flush if there is no PSI
1599                  * support or the size is too big.
1600                  */
1601                 if (!cap_pgsel_inv(iommu->cap) ||
1602                     mask > cap_max_amask_val(iommu->cap))
1603                         iommu->flush.flush_iotlb(iommu, did, 0, 0,
1604                                                         DMA_TLB_DSI_FLUSH);
1605                 else
1606                         iommu->flush.flush_iotlb(iommu, did, addr | ih, mask,
1607                                                         DMA_TLB_PSI_FLUSH);
1608         }
1609
1610         /*
1611          * In caching mode, changes of pages from non-present to present require
1612          * a flush. However, the device IOTLB doesn't need to be flushed in this case.
1613          */
1614         if (!cap_caching_mode(iommu->cap) || !map)
1615                 iommu_flush_dev_iotlb(domain, addr, mask);
1616 }
1617
1618 /* Notification for newly created mappings */
1619 static inline void __mapping_notify_one(struct intel_iommu *iommu,
1620                                         struct dmar_domain *domain,
1621                                         unsigned long pfn, unsigned int pages)
1622 {
1623         /*
1624          * It's a non-present to present mapping. Only flush if caching mode is
1625          * set and the domain uses second-level translation.
1626          */
1627         if (cap_caching_mode(iommu->cap) && !domain_use_first_level(domain))
1628                 iommu_flush_iotlb_psi(iommu, domain, pfn, pages, 0, 1);
1629         else
1630                 iommu_flush_write_buffer(iommu);
1631 }
1632
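     /*
      * Flush the IOTLB on every IOMMU the domain is attached to: a
      * PASID-based flush for first-level domains, a domain-selective flush
      * otherwise, plus a device IOTLB flush when not in caching mode.
      */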
1633 static void intel_flush_iotlb_all(struct iommu_domain *domain)
1634 {
1635         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
1636         int idx;
1637
1638         for_each_domain_iommu(idx, dmar_domain) {
1639                 struct intel_iommu *iommu = g_iommus[idx];
1640                 u16 did = dmar_domain->iommu_did[iommu->seq_id];
1641
1642                 if (domain_use_first_level(dmar_domain))
1643                         qi_flush_piotlb(iommu, did, PASID_RID2PASID, 0, -1, 0);
1644                 else
1645                         iommu->flush.flush_iotlb(iommu, did, 0, 0,
1646                                                  DMA_TLB_DSI_FLUSH);
1647
1648                 if (!cap_caching_mode(iommu->cap))
1649                         iommu_flush_dev_iotlb(dmar_domain, 0, MAX_AGAW_PFN_WIDTH);
1650         }
1651 }
1652
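     /*
      * Clear the Enable Protected Memory bit so that the protected low/high
      * memory regions no longer block DMA, and wait for the hardware to
      * acknowledge it.
      */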
1653 static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
1654 {
1655         u32 pmen;
1656         unsigned long flags;
1657
1658         if (!cap_plmr(iommu->cap) && !cap_phmr(iommu->cap))
1659                 return;
1660
1661         raw_spin_lock_irqsave(&iommu->register_lock, flags);
1662         pmen = readl(iommu->reg + DMAR_PMEN_REG);
1663         pmen &= ~DMA_PMEN_EPM;
1664         writel(pmen, iommu->reg + DMAR_PMEN_REG);
1665
1666         /* wait for the protected region status bit to clear */
1667         IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
1668                 readl, !(pmen & DMA_PMEN_PRS), pmen);
1669
1670         raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1671 }
1672
1673 static void iommu_enable_translation(struct intel_iommu *iommu)
1674 {
1675         u32 sts;
1676         unsigned long flags;
1677
1678         raw_spin_lock_irqsave(&iommu->register_lock, flags);
1679         iommu->gcmd |= DMA_GCMD_TE;
1680         writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1681
1682         /* Make sure the hardware completes it */
1683         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1684                       readl, (sts & DMA_GSTS_TES), sts);
1685
1686         raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1687 }
1688
1689 static void iommu_disable_translation(struct intel_iommu *iommu)
1690 {
1691         u32 sts;
1692         unsigned long flag;
1693
1694         if (iommu_skip_te_disable && iommu->drhd->gfx_dedicated &&
1695             (cap_read_drain(iommu->cap) || cap_write_drain(iommu->cap)))
1696                 return;
1697
1698         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1699         iommu->gcmd &= ~DMA_GCMD_TE;
1700         writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1701
1702         /* Make sure the hardware completes it */
1703         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1704                       readl, (!(sts & DMA_GSTS_TES)), sts);
1705
1706         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1707 }
1708
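     /*
      * Allocate the domain-id bitmap for this IOMMU and reserve the ids
      * with special meaning: domain-id 0 (used by caching mode hardware for
      * invalid translations and as the "not allocated" marker) and, in
      * scalable mode, FLPT_DEFAULT_DID for first-level and pass-through
      * PASID entries.
      */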
1709 static int iommu_init_domains(struct intel_iommu *iommu)
1710 {
1711         u32 ndomains;
1712
1713         ndomains = cap_ndoms(iommu->cap);
1714         pr_debug("%s: Number of Domains supported <%d>\n",
1715                  iommu->name, ndomains);
1716
1717         spin_lock_init(&iommu->lock);
1718
1719         iommu->domain_ids = bitmap_zalloc(ndomains, GFP_KERNEL);
1720         if (!iommu->domain_ids)
1721                 return -ENOMEM;
1722
1723         /*
1724          * If Caching mode is set, then invalid translations are tagged
1725          * with domain-id 0, hence we need to pre-allocate it. We also
1726          * use domain-id 0 as a marker for non-allocated domain-id, so
1727          * make sure it is not used for a real domain.
1728          */
1729         set_bit(0, iommu->domain_ids);
1730
1731         /*
1732          * The VT-d spec rev3.0 (section 6.2.3.1) requires that each pasid
1733          * entry for first-level or pass-through translation modes be
1734          * programmed with a domain id different from those used for
1735          * second-level or nested translation. We reserve a domain id for
1736          * this purpose.
1737          */
1738         if (sm_supported(iommu))
1739                 set_bit(FLPT_DEFAULT_DID, iommu->domain_ids);
1740
1741         return 0;
1742 }
1743
1744 static void disable_dmar_iommu(struct intel_iommu *iommu)
1745 {
1746         struct device_domain_info *info, *tmp;
1747         unsigned long flags;
1748
1749         if (!iommu->domain_ids)
1750                 return;
1751
1752         spin_lock_irqsave(&device_domain_lock, flags);
1753         list_for_each_entry_safe(info, tmp, &device_domain_list, global) {
1754                 if (info->iommu != iommu)
1755                         continue;
1756
1757                 if (!info->dev || !info->domain)
1758                         continue;
1759
1760                 __dmar_remove_one_dev_info(info);
1761         }
1762         spin_unlock_irqrestore(&device_domain_lock, flags);
1763
1764         if (iommu->gcmd & DMA_GCMD_TE)
1765                 iommu_disable_translation(iommu);
1766 }
1767
1768 static void free_dmar_iommu(struct intel_iommu *iommu)
1769 {
1770         if (iommu->domain_ids) {
1771                 bitmap_free(iommu->domain_ids);
1772                 iommu->domain_ids = NULL;
1773         }
1774
1775         g_iommus[iommu->seq_id] = NULL;
1776
1777         /* free context mapping */
1778         free_context_table(iommu);
1779
1780 #ifdef CONFIG_INTEL_IOMMU_SVM
1781         if (pasid_supported(iommu)) {
1782                 if (ecap_prs(iommu->ecap))
1783                         intel_svm_finish_prq(iommu);
1784         }
1785         if (vccap_pasid(iommu->vccap))
1786                 ioasid_unregister_allocator(&iommu->pasid_allocator);
1787
1788 #endif
1789 }
1790
1791 /*
1792  * Check and return whether first-level translation is used by default
1793  * for DMA translation.
1794  */
1795 static bool first_level_by_default(unsigned int type)
1796 {
1797         /* Only SL is available in legacy mode */
1798         if (!scalable_mode_support())
1799                 return false;
1800
1801         /* Only one level (either FL or SL) is available, so just use it */
1802         if (intel_cap_flts_sanity() ^ intel_cap_slts_sanity())
1803                 return intel_cap_flts_sanity();
1804
1805         /* Both levels are available, so decide based on the domain type */
1806         return type != IOMMU_DOMAIN_UNMANAGED;
1807 }
1808
1809 static struct dmar_domain *alloc_domain(unsigned int type)
1810 {
1811         struct dmar_domain *domain;
1812
1813         domain = kzalloc(sizeof(*domain), GFP_KERNEL);
1814         if (!domain)
1815                 return NULL;
1816
1817         domain->nid = NUMA_NO_NODE;
1818         if (first_level_by_default(type))
1819                 domain->flags |= DOMAIN_FLAG_USE_FIRST_LEVEL;
1820         domain->has_iotlb_device = false;
1821         INIT_LIST_HEAD(&domain->devices);
1822
1823         return domain;
1824 }
1825
1826 /* Must be called with device_domain_lock and iommu->lock held */
1827 static int domain_attach_iommu(struct dmar_domain *domain,
1828                                struct intel_iommu *iommu)
1829 {
1830         unsigned long ndomains;
1831         int num;
1832
1833         assert_spin_locked(&device_domain_lock);
1834         assert_spin_locked(&iommu->lock);
1835
1836         domain->iommu_refcnt[iommu->seq_id] += 1;
1837         if (domain->iommu_refcnt[iommu->seq_id] == 1) {
1838                 ndomains = cap_ndoms(iommu->cap);
1839                 num      = find_first_zero_bit(iommu->domain_ids, ndomains);
1840
1841                 if (num >= ndomains) {
1842                         pr_err("%s: No free domain ids\n", iommu->name);
1843                         domain->iommu_refcnt[iommu->seq_id] -= 1;
1844                         return -ENOSPC;
1845                 }
1846
1847                 set_bit(num, iommu->domain_ids);
1848                 domain->iommu_did[iommu->seq_id] = num;
1849                 domain->nid                      = iommu->node;
1850                 domain_update_iommu_cap(domain);
1851         }
1852
1853         return 0;
1854 }
1855
1856 static void domain_detach_iommu(struct dmar_domain *domain,
1857                                 struct intel_iommu *iommu)
1858 {
1859         int num;
1860
1861         assert_spin_locked(&device_domain_lock);
1862         assert_spin_locked(&iommu->lock);
1863
1864         domain->iommu_refcnt[iommu->seq_id] -= 1;
1865         if (domain->iommu_refcnt[iommu->seq_id] == 0) {
1866                 num = domain->iommu_did[iommu->seq_id];
1867                 clear_bit(num, iommu->domain_ids);
1868                 domain_update_iommu_cap(domain);
1869                 domain->iommu_did[iommu->seq_id] = 0;
1870         }
1871 }
1872
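     /*
      * Round a guest address width up to the nearest AGAW-compatible width,
      * i.e. 12 + 9 * n bits, capped at 64.
      */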
1873 static inline int guestwidth_to_adjustwidth(int gaw)
1874 {
1875         int agaw;
1876         int r = (gaw - 12) % 9;
1877
1878         if (r == 0)
1879                 agaw = gaw;
1880         else
1881                 agaw = gaw + 9 - r;
1882         if (agaw > 64)
1883                 agaw = 64;
1884         return agaw;
1885 }
1886
1887 static void domain_exit(struct dmar_domain *domain)
1888 {
1889
1890         /* Remove associated devices and clear attached or cached domains */
1891         domain_remove_dev_info(domain);
1892
1893         if (domain->pgd) {
1894                 LIST_HEAD(freelist);
1895
1896                 domain_unmap(domain, 0, DOMAIN_MAX_PFN(domain->gaw), &freelist);
1897                 put_pages_list(&freelist);
1898         }
1899
1900         kfree(domain);
1901 }
1902
1903 /*
1904  * Get the PASID directory size for a scalable mode context entry.
1905  * A value of X in the PDTS field of a scalable mode context entry
1906  * indicates a PASID directory with 2^(X + 7) entries.
1907  */
1908 static inline unsigned long context_get_sm_pds(struct pasid_table *table)
1909 {
1910         unsigned long pds, max_pde;
1911
1912         max_pde = table->max_pasid >> PASID_PDE_SHIFT;
1913         pds = find_first_bit(&max_pde, MAX_NR_PASID_BITS);
1914         if (pds < 7)
1915                 return 0;
1916
1917         return pds - 7;
1918 }
1919
1920 /*
1921  * Set the RID_PASID field of a scalable mode context entry. The
1922  * IOMMU hardware will use the PASID value set in this field for
1923  * DMA translations of DMA requests without PASID.
1924  */
1925 static inline void
1926 context_set_sm_rid2pasid(struct context_entry *context, unsigned long pasid)
1927 {
1928         context->hi |= pasid & ((1 << 20) - 1);
1929 }
1930
1931 /*
1932  * Set the DTE(Device-TLB Enable) field of a scalable mode context
1933  * entry.
1934  */
1935 static inline void context_set_sm_dte(struct context_entry *context)
1936 {
1937         context->lo |= (1 << 2);
1938 }
1939
1940 /*
1941  * Set the PRE(Page Request Enable) field of a scalable mode context
1942  * entry.
1943  */
1944 static inline void context_set_sm_pre(struct context_entry *context)
1945 {
1946         context->lo |= (1 << 4);
1947 }
1948
1949 /* Convert value to context PASID directory size field coding. */
1950 #define context_pdts(pds)       (((pds) & 0x7) << 9)
1951
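     /*
      * Install the context entry for (bus, devfn) on @iommu: in scalable
      * mode it points at the domain's PASID directory, otherwise at the
      * second-level page table (or pass-through), and the caches covering
      * it are flushed afterwards.
      */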
1952 static int domain_context_mapping_one(struct dmar_domain *domain,
1953                                       struct intel_iommu *iommu,
1954                                       struct pasid_table *table,
1955                                       u8 bus, u8 devfn)
1956 {
1957         u16 did = domain->iommu_did[iommu->seq_id];
1958         int translation = CONTEXT_TT_MULTI_LEVEL;
1959         struct device_domain_info *info = NULL;
1960         struct context_entry *context;
1961         unsigned long flags;
1962         int ret;
1963
1964         WARN_ON(did == 0);
1965
1966         if (hw_pass_through && domain_type_is_si(domain))
1967                 translation = CONTEXT_TT_PASS_THROUGH;
1968
1969         pr_debug("Set context mapping for %02x:%02x.%d\n",
1970                 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1971
1972         BUG_ON(!domain->pgd);
1973
1974         spin_lock_irqsave(&device_domain_lock, flags);
1975         spin_lock(&iommu->lock);
1976
1977         ret = -ENOMEM;
1978         context = iommu_context_addr(iommu, bus, devfn, 1);
1979         if (!context)
1980                 goto out_unlock;
1981
1982         ret = 0;
1983         if (context_present(context))
1984                 goto out_unlock;
1985
1986         /*
1987          * For kdump cases, old valid entries may be cached due to the
1988          * in-flight DMA and copied pgtable, but there is no unmapping
1989          * behaviour for them, thus we need an explicit cache flush for
1990          * the newly-mapped device. For kdump, at this point, the device
1991          * is supposed to have finished its reset at driver probe stage, so
1992          * no in-flight DMA will exist, and we don't need to worry about it
1993          * hereafter.
1994          */
1995         if (context_copied(context)) {
1996                 u16 did_old = context_domain_id(context);
1997
1998                 if (did_old < cap_ndoms(iommu->cap)) {
1999                         iommu->flush.flush_context(iommu, did_old,
2000                                                    (((u16)bus) << 8) | devfn,
2001                                                    DMA_CCMD_MASK_NOBIT,
2002                                                    DMA_CCMD_DEVICE_INVL);
2003                         iommu->flush.flush_iotlb(iommu, did_old, 0, 0,
2004                                                  DMA_TLB_DSI_FLUSH);
2005                 }
2006         }
2007
2008         context_clear_entry(context);
2009
2010         if (sm_supported(iommu)) {
2011                 unsigned long pds;
2012
2013                 WARN_ON(!table);
2014
2015                 /* Set up the PASID DIR pointer: */
2016                 pds = context_get_sm_pds(table);
2017                 context->lo = (u64)virt_to_phys(table->table) |
2018                                 context_pdts(pds);
2019
2020                 /* Set up the RID_PASID field: */
2021                 context_set_sm_rid2pasid(context, PASID_RID2PASID);
2022
2023                 /*
2024                  * Set up the Device-TLB Enable bit and the Page Request
2025                  * Enable bit:
2026                  */
2027                 info = iommu_support_dev_iotlb(domain, iommu, bus, devfn);
2028                 if (info && info->ats_supported)
2029                         context_set_sm_dte(context);
2030                 if (info && info->pri_supported)
2031                         context_set_sm_pre(context);
2032         } else {
2033                 struct dma_pte *pgd = domain->pgd;
2034                 int agaw;
2035
2036                 context_set_domain_id(context, did);
2037
2038                 if (translation != CONTEXT_TT_PASS_THROUGH) {
2039                         /*
2040                          * Skip top levels of page tables for an iommu which has
2041                          * a smaller agaw than the default. Unnecessary for PT mode.
2042                          */
2043                         for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) {
2044                                 ret = -ENOMEM;
2045                                 pgd = phys_to_virt(dma_pte_addr(pgd));
2046                                 if (!dma_pte_present(pgd))
2047                                         goto out_unlock;
2048                         }
2049
2050                         info = iommu_support_dev_iotlb(domain, iommu, bus, devfn);
2051                         if (info && info->ats_supported)
2052                                 translation = CONTEXT_TT_DEV_IOTLB;
2053                         else
2054                                 translation = CONTEXT_TT_MULTI_LEVEL;
2055
2056                         context_set_address_root(context, virt_to_phys(pgd));
2057                         context_set_address_width(context, agaw);
2058                 } else {
2059                         /*
2060                          * In pass through mode, AW must be programmed to
2061                          * indicate the largest AGAW value supported by
2062                          * hardware. And ASR is ignored by hardware.
2063                          */
2064                         context_set_address_width(context, iommu->msagaw);
2065                 }
2066
2067                 context_set_translation_type(context, translation);
2068         }
2069
2070         context_set_fault_enable(context);
2071         context_set_present(context);
2072         if (!ecap_coherent(iommu->ecap))
2073                 clflush_cache_range(context, sizeof(*context));
2074
2075         /*
2076          * It's a non-present to present mapping. If the hardware doesn't cache
2077          * non-present entries we only need to flush the write-buffer. If it
2078          * _does_ cache non-present entries, then it does so in the special
2079          * domain #0, which we have to flush:
2080          */
2081         if (cap_caching_mode(iommu->cap)) {
2082                 iommu->flush.flush_context(iommu, 0,
2083                                            (((u16)bus) << 8) | devfn,
2084                                            DMA_CCMD_MASK_NOBIT,
2085                                            DMA_CCMD_DEVICE_INVL);
2086                 iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH);
2087         } else {
2088                 iommu_flush_write_buffer(iommu);
2089         }
2090         iommu_enable_dev_iotlb(info);
2091
2092         ret = 0;
2093
2094 out_unlock:
2095         spin_unlock(&iommu->lock);
2096         spin_unlock_irqrestore(&device_domain_lock, flags);
2097
2098         return ret;
2099 }
2100
2101 struct domain_context_mapping_data {
2102         struct dmar_domain *domain;
2103         struct intel_iommu *iommu;
2104         struct pasid_table *table;
2105 };
2106
2107 static int domain_context_mapping_cb(struct pci_dev *pdev,
2108                                      u16 alias, void *opaque)
2109 {
2110         struct domain_context_mapping_data *data = opaque;
2111
2112         return domain_context_mapping_one(data->domain, data->iommu,
2113                                           data->table, PCI_BUS_NUM(alias),
2114                                           alias & 0xff);
2115 }
2116
2117 static int
2118 domain_context_mapping(struct dmar_domain *domain, struct device *dev)
2119 {
2120         struct domain_context_mapping_data data;
2121         struct pasid_table *table;
2122         struct intel_iommu *iommu;
2123         u8 bus, devfn;
2124
2125         iommu = device_to_iommu(dev, &bus, &devfn);
2126         if (!iommu)
2127                 return -ENODEV;
2128
2129         table = intel_pasid_get_table(dev);
2130
2131         if (!dev_is_pci(dev))
2132                 return domain_context_mapping_one(domain, iommu, table,
2133                                                   bus, devfn);
2134
2135         data.domain = domain;
2136         data.iommu = iommu;
2137         data.table = table;
2138
2139         return pci_for_each_dma_alias(to_pci_dev(dev),
2140                                       &domain_context_mapping_cb, &data);
2141 }
2142
2143 static int domain_context_mapped_cb(struct pci_dev *pdev,
2144                                     u16 alias, void *opaque)
2145 {
2146         struct intel_iommu *iommu = opaque;
2147
2148         return !device_context_mapped(iommu, PCI_BUS_NUM(alias), alias & 0xff);
2149 }
2150
2151 static int domain_context_mapped(struct device *dev)
2152 {
2153         struct intel_iommu *iommu;
2154         u8 bus, devfn;
2155
2156         iommu = device_to_iommu(dev, &bus, &devfn);
2157         if (!iommu)
2158                 return -ENODEV;
2159
2160         if (!dev_is_pci(dev))
2161                 return device_context_mapped(iommu, bus, devfn);
2162
2163         return !pci_for_each_dma_alias(to_pci_dev(dev),
2164                                        domain_context_mapped_cb, iommu);
2165 }
2166
2167 /* Returns the number of VT-d pages, but aligned to the MM page size */
2168 static inline unsigned long aligned_nrpages(unsigned long host_addr,
2169                                             size_t size)
2170 {
2171         host_addr &= ~PAGE_MASK;
2172         return PAGE_ALIGN(host_addr + size) >> VTD_PAGE_SHIFT;
2173 }
2174
2175 /* Return largest possible superpage level for a given mapping */
2176 static inline int hardware_largepage_caps(struct dmar_domain *domain,
2177                                           unsigned long iov_pfn,
2178                                           unsigned long phy_pfn,
2179                                           unsigned long pages)
2180 {
2181         int support, level = 1;
2182         unsigned long pfnmerge;
2183
2184         support = domain->iommu_superpage;
2185
2186         /* To use a large page, the virtual *and* physical addresses
2187            must be aligned to 2MiB/1GiB/etc. Lower bits set in either
2188            of them will mean we have to use smaller pages. So just
2189            merge them and check both at once. */
2190         pfnmerge = iov_pfn | phy_pfn;
2191
2192         while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) {
2193                 pages >>= VTD_STRIDE_SHIFT;
2194                 if (!pages)
2195                         break;
2196                 pfnmerge >>= VTD_STRIDE_SHIFT;
2197                 level++;
2198                 support--;
2199         }
2200         return level;
2201 }
2202
2203 /*
2204  * Ensure that old small page tables are removed to make room for superpage(s).
2205  * We're going to add new large pages, so make sure we don't remove their parent
2206  * tables. The IOTLB/devTLBs should be flushed if any PDE/PTEs are cleared.
2207  */
2208 static void switch_to_super_page(struct dmar_domain *domain,
2209                                  unsigned long start_pfn,
2210                                  unsigned long end_pfn, int level)
2211 {
2212         unsigned long lvl_pages = lvl_to_nr_pages(level);
2213         struct dma_pte *pte = NULL;
2214         int i;
2215
2216         while (start_pfn <= end_pfn) {
2217                 if (!pte)
2218                         pte = pfn_to_dma_pte(domain, start_pfn, &level);
2219
2220                 if (dma_pte_present(pte)) {
2221                         dma_pte_free_pagetable(domain, start_pfn,
2222                                                start_pfn + lvl_pages - 1,
2223                                                level + 1);
2224
2225                         for_each_domain_iommu(i, domain)
2226                                 iommu_flush_iotlb_psi(g_iommus[i], domain,
2227                                                       start_pfn, lvl_pages,
2228                                                       0, 0);
2229                 }
2230
2231                 pte++;
2232                 start_pfn += lvl_pages;
2233                 if (first_pte_in_page(pte))
2234                         pte = NULL;
2235         }
2236 }
2237
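     /*
      * Install page table entries mapping nr_pages starting at iov_pfn to
      * the physical range starting at phys_pfn. Superpages are used when
      * the alignment and remaining size allow it, and the CPU cache is
      * flushed for each page table page that gets filled.
      */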
2238 static int
2239 __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2240                  unsigned long phys_pfn, unsigned long nr_pages, int prot)
2241 {
2242         struct dma_pte *first_pte = NULL, *pte = NULL;
2243         unsigned int largepage_lvl = 0;
2244         unsigned long lvl_pages = 0;
2245         phys_addr_t pteval;
2246         u64 attr;
2247
2248         BUG_ON(!domain_pfn_supported(domain, iov_pfn + nr_pages - 1));
2249
2250         if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
2251                 return -EINVAL;
2252
2253         attr = prot & (DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP);
2254         attr |= DMA_FL_PTE_PRESENT;
2255         if (domain_use_first_level(domain)) {
2256                 attr |= DMA_FL_PTE_XD | DMA_FL_PTE_US | DMA_FL_PTE_ACCESS;
2257                 if (prot & DMA_PTE_WRITE)
2258                         attr |= DMA_FL_PTE_DIRTY;
2259         }
2260
2261         pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | attr;
2262
2263         while (nr_pages > 0) {
2264                 uint64_t tmp;
2265
2266                 if (!pte) {
2267                         largepage_lvl = hardware_largepage_caps(domain, iov_pfn,
2268                                         phys_pfn, nr_pages);
2269
2270                         pte = pfn_to_dma_pte(domain, iov_pfn, &largepage_lvl);
2271                         if (!pte)
2272                                 return -ENOMEM;
2273                         first_pte = pte;
2274
2275                         lvl_pages = lvl_to_nr_pages(largepage_lvl);
2276
2277                         /* It is a large page */
2278                         if (largepage_lvl > 1) {
2279                                 unsigned long end_pfn;
2280                                 unsigned long pages_to_remove;
2281
2282                                 pteval |= DMA_PTE_LARGE_PAGE;
2283                                 pages_to_remove = min_t(unsigned long, nr_pages,
2284                                                         nr_pte_to_next_page(pte) * lvl_pages);
2285                                 end_pfn = iov_pfn + pages_to_remove - 1;
2286                                 switch_to_super_page(domain, iov_pfn, end_pfn, largepage_lvl);
2287                         } else {
2288                                 pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE;
2289                         }
2290
2291                 }
2292                 /* We don't need a lock here; nobody else
2293                  * touches this iova range.
2294                  */
2295                 tmp = cmpxchg64_local(&pte->val, 0ULL, pteval);
2296                 if (tmp) {
2297                         static int dumps = 5;
2298                         pr_crit("ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n",
2299                                 iov_pfn, tmp, (unsigned long long)pteval);
2300                         if (dumps) {
2301                                 dumps--;
2302                                 debug_dma_dump_mappings(NULL);
2303                         }
2304                         WARN_ON(1);
2305                 }
2306
2307                 nr_pages -= lvl_pages;
2308                 iov_pfn += lvl_pages;
2309                 phys_pfn += lvl_pages;
2310                 pteval += lvl_pages * VTD_PAGE_SIZE;
2311
2312                 /* If the next PTE would be the first in a new page, then we
2313                  * need to flush the cache on the entries we've just written.
2314                  * And then we'll need to recalculate 'pte', so clear it and
2315                  * let it get set again in the if (!pte) block above.
2316                  *
2317                  * If we're done (!nr_pages) we need to flush the cache too.
2318                  *
2319                  * Also if we've been setting superpages, we may need to
2320                  * recalculate 'pte' and switch back to smaller pages for the
2321                  * end of the mapping, if the trailing size is not enough to
2322                  * use another superpage (i.e. nr_pages < lvl_pages).
2323                  */
2324                 pte++;
2325                 if (!nr_pages || first_pte_in_page(pte) ||
2326                     (largepage_lvl > 1 && nr_pages < lvl_pages)) {
2327                         domain_flush_cache(domain, first_pte,
2328                                            (void *)pte - (void *)first_pte);
2329                         pte = NULL;
2330                 }
2331         }
2332
2333         return 0;
2334 }
2335
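     /*
      * Tear down the context entry for one bus/devfn and invalidate the
      * caches that may hold it: the context cache, the PASID cache in
      * scalable mode, the IOTLB and the device IOTLB.
      */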
2336 static void domain_context_clear_one(struct device_domain_info *info, u8 bus, u8 devfn)
2337 {
2338         struct intel_iommu *iommu = info->iommu;
2339         struct context_entry *context;
2340         unsigned long flags;
2341         u16 did_old;
2342
2343         if (!iommu)
2344                 return;
2345
2346         spin_lock_irqsave(&iommu->lock, flags);
2347         context = iommu_context_addr(iommu, bus, devfn, 0);
2348         if (!context) {
2349                 spin_unlock_irqrestore(&iommu->lock, flags);
2350                 return;
2351         }
2352
2353         if (sm_supported(iommu)) {
2354                 if (hw_pass_through && domain_type_is_si(info->domain))
2355                         did_old = FLPT_DEFAULT_DID;
2356                 else
2357                         did_old = info->domain->iommu_did[iommu->seq_id];
2358         } else {
2359                 did_old = context_domain_id(context);
2360         }
2361
2362         context_clear_entry(context);
2363         __iommu_flush_cache(iommu, context, sizeof(*context));
2364         spin_unlock_irqrestore(&iommu->lock, flags);
2365         iommu->flush.flush_context(iommu,
2366                                    did_old,
2367                                    (((u16)bus) << 8) | devfn,
2368                                    DMA_CCMD_MASK_NOBIT,
2369                                    DMA_CCMD_DEVICE_INVL);
2370
2371         if (sm_supported(iommu))
2372                 qi_flush_pasid_cache(iommu, did_old, QI_PC_ALL_PASIDS, 0);
2373
2374         iommu->flush.flush_iotlb(iommu,
2375                                  did_old,
2376                                  0,
2377                                  0,
2378                                  DMA_TLB_DSI_FLUSH);
2379
2380         __iommu_flush_dev_iotlb(info, 0, MAX_AGAW_PFN_WIDTH);
2381 }
2382
2383 static void domain_remove_dev_info(struct dmar_domain *domain)
2384 {
2385         struct device_domain_info *info, *tmp;
2386         unsigned long flags;
2387
2388         spin_lock_irqsave(&device_domain_lock, flags);
2389         list_for_each_entry_safe(info, tmp, &domain->devices, link)
2390                 __dmar_remove_one_dev_info(info);
2391         spin_unlock_irqrestore(&device_domain_lock, flags);
2392 }
2393
2394 static inline struct device_domain_info *
2395 dmar_search_domain_by_dev_info(int segment, int bus, int devfn)
2396 {
2397         struct device_domain_info *info;
2398
2399         list_for_each_entry(info, &device_domain_list, global)
2400                 if (info->segment == segment && info->bus == bus &&
2401                     info->devfn == devfn)
2402                         return info;
2403
2404         return NULL;
2405 }
2406
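     /*
      * Set up a first-level translation PASID entry for the device,
      * skipping the top page table levels when the IOMMU supports a smaller
      * AGAW than the domain uses.
      */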
2407 static int domain_setup_first_level(struct intel_iommu *iommu,
2408                                     struct dmar_domain *domain,
2409                                     struct device *dev,
2410                                     u32 pasid)
2411 {
2412         struct dma_pte *pgd = domain->pgd;
2413         int agaw, level;
2414         int flags = 0;
2415
2416         /*
2417          * Skip top levels of page tables for an iommu which has
2418          * a smaller agaw than the default. Unnecessary for PT mode.
2419          */
2420         for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) {
2421                 pgd = phys_to_virt(dma_pte_addr(pgd));
2422                 if (!dma_pte_present(pgd))
2423                         return -ENOMEM;
2424         }
2425
2426         level = agaw_to_level(agaw);
2427         if (level != 4 && level != 5)
2428                 return -EINVAL;
2429
2430         if (pasid != PASID_RID2PASID)
2431                 flags |= PASID_FLAG_SUPERVISOR_MODE;
2432         if (level == 5)
2433                 flags |= PASID_FLAG_FL5LP;
2434
2435         if (domain->force_snooping)
2436                 flags |= PASID_FLAG_PAGE_SNOOP;
2437
2438         return intel_pasid_setup_first_level(iommu, dev, (pgd_t *)pgd, pasid,
2439                                              domain->iommu_did[iommu->seq_id],
2440                                              flags);
2441 }
2442
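     /*
      * True if @dev is a PCI device whose DMA is actually issued by a
      * different "real DMA" device (pci_real_dma_dev() differs from the
      * device itself).
      */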
2443 static bool dev_is_real_dma_subdevice(struct device *dev)
2444 {
2445         return dev && dev_is_pci(dev) &&
2446                pci_real_dma_dev(to_pci_dev(dev)) != to_pci_dev(dev);
2447 }
2448
2449 static int iommu_domain_identity_map(struct dmar_domain *domain,
2450                                      unsigned long first_vpfn,
2451                                      unsigned long last_vpfn)
2452 {
2453         /*
2454          * The RMRR range might overlap with the physical memory range,
2455          * so clear it first.
2456          */
2457         dma_pte_clear_range(domain, first_vpfn, last_vpfn);
2458
2459         return __domain_mapping(domain, first_vpfn,
2460                                 first_vpfn, last_vpfn - first_vpfn + 1,
2461                                 DMA_PTE_READ|DMA_PTE_WRITE);
2462 }
2463
2464 static int md_domain_init(struct dmar_domain *domain, int guest_width);
2465
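     /*
      * Build the static identity (si) domain. Unless hardware pass-through
      * is in use, identity map every usable memory range plus the RMRR
      * regions so that devices attached to it keep working.
      */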
2466 static int __init si_domain_init(int hw)
2467 {
2468         struct dmar_rmrr_unit *rmrr;
2469         struct device *dev;
2470         int i, nid, ret;
2471
2472         si_domain = alloc_domain(IOMMU_DOMAIN_IDENTITY);
2473         if (!si_domain)
2474                 return -EFAULT;
2475
2476         if (md_domain_init(si_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
2477                 domain_exit(si_domain);
2478                 return -EFAULT;
2479         }
2480
2481         if (hw)
2482                 return 0;
2483
2484         for_each_online_node(nid) {
2485                 unsigned long start_pfn, end_pfn;
2486                 int i;
2487
2488                 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
2489                         ret = iommu_domain_identity_map(si_domain,
2490                                         mm_to_dma_pfn(start_pfn),
2491                                         mm_to_dma_pfn(end_pfn));
2492                         if (ret)
2493                                 return ret;
2494                 }
2495         }
2496
2497         /*
2498          * Identity map the RMRRs so that devices with RMRRs can also use
2499          * the si_domain.
2500          */
2501         for_each_rmrr_units(rmrr) {
2502                 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
2503                                           i, dev) {
2504                         unsigned long long start = rmrr->base_address;
2505                         unsigned long long end = rmrr->end_address;
2506
2507                         if (WARN_ON(end < start ||
2508                                     end >> agaw_to_width(si_domain->agaw)))
2509                                 continue;
2510
2511                         ret = iommu_domain_identity_map(si_domain,
2512                                         mm_to_dma_pfn(start >> PAGE_SHIFT),
2513                                         mm_to_dma_pfn(end >> PAGE_SHIFT));
2514                         if (ret)
2515                                 return ret;
2516                 }
2517         }
2518
2519         return 0;
2520 }
2521
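     /*
      * Attach @dev to @domain: take a domain id on the device's IOMMU, set
      * up the RID2PASID entry when scalable mode is enabled, and install
      * the context entry.
      */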
2522 static int domain_add_dev_info(struct dmar_domain *domain, struct device *dev)
2523 {
2524         struct device_domain_info *info = dev_iommu_priv_get(dev);
2525         struct intel_iommu *iommu;
2526         unsigned long flags;
2527         u8 bus, devfn;
2528         int ret;
2529
2530         iommu = device_to_iommu(dev, &bus, &devfn);
2531         if (!iommu)
2532                 return -ENODEV;
2533
2534         spin_lock_irqsave(&device_domain_lock, flags);
2535         info->domain = domain;
2536         spin_lock(&iommu->lock);
2537         ret = domain_attach_iommu(domain, iommu);
2538         spin_unlock(&iommu->lock);
2539         if (ret) {
2540                 spin_unlock_irqrestore(&device_domain_lock, flags);
2541                 return ret;
2542         }
2543         list_add(&info->link, &domain->devices);
2544         spin_unlock_irqrestore(&device_domain_lock, flags);
2545
2546         /* PASID table is mandatory for a PCI device in scalable mode. */
2547         if (sm_supported(iommu) && !dev_is_real_dma_subdevice(dev)) {
2548                 ret = intel_pasid_alloc_table(dev);
2549                 if (ret) {
2550                         dev_err(dev, "PASID table allocation failed\n");
2551                         dmar_remove_one_dev_info(dev);
2552                         return ret;
2553                 }
2554
2555                 /* Set up the PASID entry for requests without PASID: */
2556                 spin_lock_irqsave(&iommu->lock, flags);
2557                 if (hw_pass_through && domain_type_is_si(domain))
2558                         ret = intel_pasid_setup_pass_through(iommu, domain,
2559                                         dev, PASID_RID2PASID);
2560                 else if (domain_use_first_level(domain))
2561                         ret = domain_setup_first_level(iommu, domain, dev,
2562                                         PASID_RID2PASID);
2563                 else
2564                         ret = intel_pasid_setup_second_level(iommu, domain,
2565                                         dev, PASID_RID2PASID);
2566                 spin_unlock_irqrestore(&iommu->lock, flags);
2567                 if (ret) {
2568                         dev_err(dev, "Setup RID2PASID failed\n");
2569                         dmar_remove_one_dev_info(dev);
2570                         return ret;
2571                 }
2572         }
2573
2574         ret = domain_context_mapping(domain, dev);
2575         if (ret) {
2576                 dev_err(dev, "Domain context map failed\n");
2577                 dmar_remove_one_dev_info(dev);
2578                 return ret;
2579         }
2580
2581         return 0;
2582 }
2583
2584 static bool device_has_rmrr(struct device *dev)
2585 {
2586         struct dmar_rmrr_unit *rmrr;
2587         struct device *tmp;
2588         int i;
2589
2590         rcu_read_lock();
2591         for_each_rmrr_units(rmrr) {
2592                 /*
2593                  * Return TRUE if this RMRR contains the device that
2594                  * is passed in.
2595                  */
2596                 for_each_active_dev_scope(rmrr->devices,
2597                                           rmrr->devices_cnt, i, tmp)
2598                         if (tmp == dev ||
2599                             is_downstream_to_pci_bridge(dev, tmp)) {
2600                                 rcu_read_unlock();
2601                                 return true;
2602                         }
2603         }
2604         rcu_read_unlock();
2605         return false;
2606 }
2607
2608 /**
2609  * device_rmrr_is_relaxable - Test whether the RMRR of this device
2610  * is relaxable (i.e. is allowed to not be enforced under some conditions)
2611  * @dev: device handle
2612  *
2613  * We assume that PCI USB devices with RMRRs have them largely
2614  * for historical reasons and that the RMRR space is not actively used post
2615  * boot.  This exclusion may change if vendors begin to abuse it.
2616  *
2617  * The same exception is made for graphics devices, with the requirement that
2618  * any use of the RMRR regions will be torn down before assigning the device
2619  * to a guest.
2620  *
2621  * Return: true if the RMRR is relaxable, false otherwise
2622  */
2623 static bool device_rmrr_is_relaxable(struct device *dev)
2624 {
2625         struct pci_dev *pdev;
2626
2627         if (!dev_is_pci(dev))
2628                 return false;
2629
2630         pdev = to_pci_dev(dev);
2631         if (IS_USB_DEVICE(pdev) || IS_GFX_DEVICE(pdev))
2632                 return true;
2633         else
2634                 return false;
2635 }
2636
2637 /*
2638  * There are a couple of cases where we need to restrict the functionality of
2639  * devices associated with RMRRs.  The first is when evaluating a device for
2640  * identity mapping because problems exist when devices are moved in and out
2641  * of domains and their respective RMRR information is lost.  This means that
2642  * a device with associated RMRRs will never be in a "passthrough" domain.
2643  * The second is use of the device through the IOMMU API.  This interface
2644  * expects to have full control of the IOVA space for the device.  We cannot
2645  * satisfy both the requirement that RMRR access is maintained and have an
2646  * unencumbered IOVA space.  We also have no ability to quiesce the device's
2647  * use of the RMRR space or even inform the IOMMU API user of the restriction.
2648  * We therefore prevent devices associated with an RMRR from participating in
2649  * the IOMMU API, which eliminates them from device assignment.
2650  *
2651  * In both cases, devices which have relaxable RMRRs are not concerned by this
2652  * restriction. See device_rmrr_is_relaxable comment.
2653  */
2654 static bool device_is_rmrr_locked(struct device *dev)
2655 {
2656         if (!device_has_rmrr(dev))
2657                 return false;
2658
2659         if (device_rmrr_is_relaxable(dev))
2660                 return false;
2661
2662         return true;
2663 }
2664
2665 /*
2666  * Return the required default domain type for a specific device.
2667  *
2668  * @dev: the device in question
2670  *
2671  * Returns:
2672  *  - IOMMU_DOMAIN_DMA: device requires a dynamic mapping domain
2673  *  - IOMMU_DOMAIN_IDENTITY: device requires an identity mapping domain
2674  *  - 0: both identity and dynamic domains work for this device
2675  */
2676 static int device_def_domain_type(struct device *dev)
2677 {
2678         if (dev_is_pci(dev)) {
2679                 struct pci_dev *pdev = to_pci_dev(dev);
2680
2681                 if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev))
2682                         return IOMMU_DOMAIN_IDENTITY;
2683
2684                 if ((iommu_identity_mapping & IDENTMAP_GFX) && IS_GFX_DEVICE(pdev))
2685                         return IOMMU_DOMAIN_IDENTITY;
2686         }
2687
2688         return 0;
2689 }
2690
2691 static void intel_iommu_init_qi(struct intel_iommu *iommu)
2692 {
2693         /*
2694          * Start from a sane iommu hardware state.
2695          * If queued invalidation has already been initialized by us
2696          * (for example, while enabling interrupt-remapping), then
2697          * things are already rolling from a sane state.
2698          */
2699         if (!iommu->qi) {
2700                 /*
2701                  * Clear any previous faults.
2702                  */
2703                 dmar_fault(-1, iommu);
2704                 /*
2705                  * Disable queued invalidation if supported and already enabled
2706                  * before OS handover.
2707                  */
2708                 dmar_disable_qi(iommu);
2709         }
2710
2711         if (dmar_enable_qi(iommu)) {
2712                 /*
2713                  * Queued invalidation is not enabled, so use register-based invalidation
2714                  */
2715                 iommu->flush.flush_context = __iommu_flush_context;
2716                 iommu->flush.flush_iotlb = __iommu_flush_iotlb;
2717                 pr_info("%s: Using Register based invalidation\n",
2718                         iommu->name);
2719         } else {
2720                 iommu->flush.flush_context = qi_flush_context;
2721                 iommu->flush.flush_iotlb = qi_flush_iotlb;
2722                 pr_info("%s: Using Queued invalidation\n", iommu->name);
2723         }
2724 }
2725
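     /*
      * kdump support: copy the context entries for one bus from the old
      * kernel's root entry into a newly allocated context table, marking
      * each copied entry and reserving the domain ids it uses.
      */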
2726 static int copy_context_table(struct intel_iommu *iommu,
2727                               struct root_entry *old_re,
2728                               struct context_entry **tbl,
2729                               int bus, bool ext)
2730 {
2731         int tbl_idx, pos = 0, idx, devfn, ret = 0, did;
2732         struct context_entry *new_ce = NULL, ce;
2733         struct context_entry *old_ce = NULL;
2734         struct root_entry re;
2735         phys_addr_t old_ce_phys;
2736
2737         tbl_idx = ext ? bus * 2 : bus;
2738         memcpy(&re, old_re, sizeof(re));
2739
2740         for (devfn = 0; devfn < 256; devfn++) {
2741                 /* First calculate the correct index */
2742                 idx = (ext ? devfn * 2 : devfn) % 256;
2743
2744                 if (idx == 0) {
2745                         /* First save what we may have and clean up */
2746                         if (new_ce) {
2747                                 tbl[tbl_idx] = new_ce;
2748                                 __iommu_flush_cache(iommu, new_ce,
2749                                                     VTD_PAGE_SIZE);
2750                                 pos = 1;
2751                         }
2752
2753                         if (old_ce)
2754                                 memunmap(old_ce);
2755
2756                         ret = 0;
2757                         if (devfn < 0x80)
2758                                 old_ce_phys = root_entry_lctp(&re);
2759                         else
2760                                 old_ce_phys = root_entry_uctp(&re);
2761
2762                         if (!old_ce_phys) {
2763                                 if (ext && devfn == 0) {
2764                                         /* No LCTP, try UCTP */
2765                                         devfn = 0x7f;
2766                                         continue;
2767                                 } else {
2768                                         goto out;
2769                                 }
2770                         }
2771
2772                         ret = -ENOMEM;
2773                         old_ce = memremap(old_ce_phys, PAGE_SIZE,
2774                                         MEMREMAP_WB);
2775                         if (!old_ce)
2776                                 goto out;
2777
2778                         new_ce = alloc_pgtable_page(iommu->node);
2779                         if (!new_ce)
2780                                 goto out_unmap;
2781
2782                         ret = 0;
2783                 }
2784
2785                 /* Now copy the context entry */
2786                 memcpy(&ce, old_ce + idx, sizeof(ce));
2787
2788                 if (!__context_present(&ce))
2789                         continue;
2790
2791                 did = context_domain_id(&ce);
2792                 if (did >= 0 && did < cap_ndoms(iommu->cap))
2793                         set_bit(did, iommu->domain_ids);
2794
2795                 /*
2796                  * We need a marker for copied context entries. This
2797                  * marker needs to work for the old format as well as
2798                  * for extended context entries.
2799                  *
2800                  * Bit 67 of the context entry is used. In the old
2801                  * format this bit is available to software; in the
2802                  * extended format it is the PGE bit, but PGE is ignored
2803                  * by HW if PASIDs are disabled (and thus still
2804                  * available).
2805                  *
2806                  * So disable PASIDs first and then mark the entry
2807                  * copied. This means that we don't copy PASID
2808                  * translations from the old kernel, but this is fine as
2809                  * faults there are not fatal.
2810                  */
2811                 context_clear_pasid_enable(&ce);
2812                 context_set_copied(&ce);
2813
2814                 new_ce[idx] = ce;
2815         }
2816
2817         tbl[tbl_idx + pos] = new_ce;
2818
2819         __iommu_flush_cache(iommu, new_ce, VTD_PAGE_SIZE);
2820
2821 out_unmap:
2822         memunmap(old_ce);
2823
2824 out:
2825         return ret;
2826 }
2827
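     /*
      * kdump support: copy the context tables set up by the previous kernel
      * and hook them into this IOMMU's root entry table, so that
      * translations for any in-flight DMA remain valid.
      */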
2828 static int copy_translation_tables(struct intel_iommu *iommu)
2829 {
2830         struct context_entry **ctxt_tbls;
2831         struct root_entry *old_rt;
2832         phys_addr_t old_rt_phys;
2833         int ctxt_table_entries;
2834         unsigned long flags;
2835         u64 rtaddr_reg;
2836         int bus, ret;
2837         bool new_ext, ext;
2838
2839         rtaddr_reg = dmar_readq(iommu->reg + DMAR_RTADDR_REG);
2840         ext        = !!(rtaddr_reg & DMA_RTADDR_RTT);
2841         new_ext    = !!ecap_ecs(iommu->ecap);
2842
2843         /*
2844          * The RTT bit can only be changed when translation is disabled,
2845          * but disabling translation would open a window for data
2846          * corruption. So bail out and don't copy anything if we would
2847          * have to change the bit.
2848          */
2849         if (new_ext != ext)
2850                 return -EINVAL;
2851
2852         old_rt_phys = rtaddr_reg & VTD_PAGE_MASK;
2853         if (!old_rt_phys)
2854                 return -EINVAL;
2855
2856         old_rt = memremap(old_rt_phys, PAGE_SIZE, MEMREMAP_WB);
2857         if (!old_rt)
2858                 return -ENOMEM;
2859
2860         /* This is too big for the stack - allocate it from slab */
2861         ctxt_table_entries = ext ? 512 : 256;
2862         ret = -ENOMEM;
2863         ctxt_tbls = kcalloc(ctxt_table_entries, sizeof(void *), GFP_KERNEL);
2864         if (!ctxt_tbls)
2865                 goto out_unmap;
2866
2867         for (bus = 0; bus < 256; bus++) {
2868                 ret = copy_context_table(iommu, &old_rt[bus],
2869                                          ctxt_tbls, bus, ext);
2870                 if (ret) {
2871                         pr_err("%s: Failed to copy context table for bus %d\n",
2872                                 iommu->name, bus);
2873                         continue;
2874                 }
2875         }
2876
2877         spin_lock_irqsave(&iommu->lock, flags);
2878
2879         /* Context tables are copied, now write them to the root_entry table */
2880         for (bus = 0; bus < 256; bus++) {
2881                 int idx = ext ? bus * 2 : bus;
2882                 u64 val;
2883
2884                 if (ctxt_tbls[idx]) {
2885                         val = virt_to_phys(ctxt_tbls[idx]) | 1;
2886                         iommu->root_entry[bus].lo = val;
2887                 }
2888
2889                 if (!ext || !ctxt_tbls[idx + 1])
2890                         continue;
2891
2892                 val = virt_to_phys(ctxt_tbls[idx + 1]) | 1;
2893                 iommu->root_entry[bus].hi = val;
2894         }
2895
2896         spin_unlock_irqrestore(&iommu->lock, flags);
2897
2898         kfree(ctxt_tbls);
2899
2900         __iommu_flush_cache(iommu, iommu->root_entry, PAGE_SIZE);
2901
2902         ret = 0;
2903
2904 out_unmap:
2905         memunmap(old_rt);
2906
2907         return ret;
2908 }
2909
2910 #ifdef CONFIG_INTEL_IOMMU_SVM
2911 static ioasid_t intel_vcmd_ioasid_alloc(ioasid_t min, ioasid_t max, void *data)
2912 {
2913         struct intel_iommu *iommu = data;
2914         ioasid_t ioasid;
2915
2916         if (!iommu)
2917                 return INVALID_IOASID;
2918         /*
2919          * The VT-d virtual command interface always uses the full 20 bit
2920          * PASID range. The host can partition the guest PASID range based
2921          * on policies, but this is out of the guest's control.
2922          */
2923         if (min < PASID_MIN || max > intel_pasid_max_id)
2924                 return INVALID_IOASID;
2925
2926         if (vcmd_alloc_pasid(iommu, &ioasid))
2927                 return INVALID_IOASID;
2928
2929         return ioasid;
2930 }
2931
2932 static void intel_vcmd_ioasid_free(ioasid_t ioasid, void *data)
2933 {
2934         struct intel_iommu *iommu = data;
2935
2936         if (!iommu)
2937                 return;
2938         /*
2939          * Sanity checking of the ioasid owner is done at the upper layer, e.g. VFIO.
2940          * We can only free the PASID when all the devices are unbound.
2941          */
2942         if (ioasid_find(NULL, ioasid, NULL)) {
2943                 pr_alert("Cannot free active IOASID %d\n", ioasid);
2944                 return;
2945         }
2946         vcmd_free_pasid(iommu, ioasid);
2947 }
2948
2949 static void register_pasid_allocator(struct intel_iommu *iommu)
2950 {
2951         /*
2952          * If we are running in the host, there is no need for a custom
2953          * allocator: PASIDs are allocated from the host system-wide.
2954          */
2955         if (!cap_caching_mode(iommu->cap))
2956                 return;
2957
2958         if (!sm_supported(iommu)) {
2959                 pr_warn("VT-d Scalable Mode not enabled, no PASID allocation\n");
2960                 return;
2961         }
2962
2963         /*
2964          * Register a custom PASID allocator if we are running in a guest;
2965          * guest PASIDs must be obtained via the virtual command interface.
2966          * There can be multiple vIOMMUs in each guest but only one allocator
2967          * is active. All vIOMMU allocators will eventually call the same
2968          * host allocator.
2969          */
2970         if (!vccap_pasid(iommu->vccap))
2971                 return;
2972
2973         pr_info("Register custom PASID allocator\n");
2974         iommu->pasid_allocator.alloc = intel_vcmd_ioasid_alloc;
2975         iommu->pasid_allocator.free = intel_vcmd_ioasid_free;
2976         iommu->pasid_allocator.pdata = (void *)iommu;
2977         if (ioasid_register_allocator(&iommu->pasid_allocator)) {
2978                 pr_warn("Custom PASID allocator failed, scalable mode disabled\n");
2979                 /*
2980                  * Disable scalable mode on this IOMMU if there
2981                  * is no custom allocator. Mixing SM capable vIOMMUs
2982                  * and non-SM vIOMMUs is not supported.
2983                  */
2984                 intel_iommu_sm = 0;
2985         }
2986 }
2987 #endif
2988
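/*
 * Boot-time DMAR initialisation, roughly (this mirrors the code below):
 *
 *	count DRHD units and size g_iommus[]
 *	for each IOMMU:
 *		init QI, domain ids and the root entry
 *		if translation was left enabled (kdump), try to copy the old
 *		translation tables, otherwise disable it and start clean
 *	set the root entry on every active IOMMU, then enable the fault
 *	interrupt and (where supported) the page request queue
 *
 * Error paths unwind through the free_iommu label.
 */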
2989 static int __init init_dmars(void)
2990 {
2991         struct dmar_drhd_unit *drhd;
2992         struct intel_iommu *iommu;
2993         int ret;
2994
2995         /*
2996          * for each drhd
2997          *    allocate root
2998          *    initialize and program root entry to not present
2999          * endfor
3000          */
3001         for_each_drhd_unit(drhd) {
3002                 /*
3003                  * No lock needed: this is only incremented in the single-
3004                  * threaded kernel __init code path; all other accesses are
3005                  * read only.
3006                  */
3007                 if (g_num_of_iommus < DMAR_UNITS_SUPPORTED) {
3008                         g_num_of_iommus++;
3009                         continue;
3010                 }
3011                 pr_err_once("Exceeded %d IOMMUs\n", DMAR_UNITS_SUPPORTED);
3012         }
3013
3014         /* Preallocate enough resources for IOMMU hot-addition */
3015         if (g_num_of_iommus < DMAR_UNITS_SUPPORTED)
3016                 g_num_of_iommus = DMAR_UNITS_SUPPORTED;
3017
3018         g_iommus = kcalloc(g_num_of_iommus, sizeof(struct intel_iommu *),
3019                         GFP_KERNEL);
3020         if (!g_iommus) {
3021                 ret = -ENOMEM;
3022                 goto error;
3023         }
3024
3025         ret = intel_cap_audit(CAP_AUDIT_STATIC_DMAR, NULL);
3026         if (ret)
3027                 goto free_iommu;
3028
3029         for_each_iommu(iommu, drhd) {
3030                 if (drhd->ignored) {
3031                         iommu_disable_translation(iommu);
3032                         continue;
3033                 }
3034
3035                 /*
3036                  * Find the max PASID size of all IOMMUs in the system.
3037                  * We need to ensure the system PASID table is no bigger
3038                  * than the smallest supported size.
3039                  */
3040                 if (pasid_supported(iommu)) {
3041                         u32 temp = 2 << ecap_pss(iommu->ecap);
3042
3043                         intel_pasid_max_id = min_t(u32, temp,
3044                                                    intel_pasid_max_id);
3045                 }
3046
3047                 g_iommus[iommu->seq_id] = iommu;
3048
3049                 intel_iommu_init_qi(iommu);
3050
3051                 ret = iommu_init_domains(iommu);
3052                 if (ret)
3053                         goto free_iommu;
3054
3055                 init_translation_status(iommu);
3056
3057                 if (translation_pre_enabled(iommu) && !is_kdump_kernel()) {
3058                         iommu_disable_translation(iommu);
3059                         clear_translation_pre_enabled(iommu);
3060                         pr_warn("Translation was enabled for %s but we are not in kdump mode\n",
3061                                 iommu->name);
3062                 }
3063
3064                 /*
3065                  * TBD:
3066                  * we could share the same root & context tables
3067                  * among all IOMMUs. Needs to be split out later.
3068                  */
3069                 ret = iommu_alloc_root_entry(iommu);
3070                 if (ret)
3071                         goto free_iommu;
3072
3073                 if (translation_pre_enabled(iommu)) {
3074                         pr_info("Translation already enabled - trying to copy translation structures\n");
3075
3076                         ret = copy_translation_tables(iommu);
3077                         if (ret) {
3078                                 /*
3079                                  * We found the IOMMU with translation
3080                                  * enabled - but failed to copy over the
3081                                  * old root-entry table. Try to proceed
3082                                  * by disabling translation now and
3083                                  * allocating a clean root-entry table.
3084                                  * This might cause DMAR faults, but
3085                                  * probably the dump will still succeed.
3086                                  */
3087                                 pr_err("Failed to copy translation tables from previous kernel for %s\n",
3088                                        iommu->name);
3089                                 iommu_disable_translation(iommu);
3090                                 clear_translation_pre_enabled(iommu);
3091                         } else {
3092                                 pr_info("Copied translation tables from previous kernel for %s\n",
3093                                         iommu->name);
3094                         }
3095                 }
3096
3097                 if (!ecap_pass_through(iommu->ecap))
3098                         hw_pass_through = 0;
3099                 intel_svm_check(iommu);
3100         }
3101
3102         /*
3103          * Now that QI is enabled on all IOMMUs, set the root entry and flush
3104          * caches. This is required on some Intel X58 chipsets; otherwise the
3105          * flush_context function will loop forever and the boot hangs.
3106          */
3107         for_each_active_iommu(iommu, drhd) {
3108                 iommu_flush_write_buffer(iommu);
3109 #ifdef CONFIG_INTEL_IOMMU_SVM
3110                 register_pasid_allocator(iommu);
3111 #endif
3112                 iommu_set_root_entry(iommu);
3113         }
3114
3115 #ifdef CONFIG_INTEL_IOMMU_BROKEN_GFX_WA
3116         dmar_map_gfx = 0;
3117 #endif
3118
3119         if (!dmar_map_gfx)
3120                 iommu_identity_mapping |= IDENTMAP_GFX;
3121
3122         check_tylersburg_isoch();
3123
3124         ret = si_domain_init(hw_pass_through);
3125         if (ret)
3126                 goto free_iommu;
3127
3128         /*
3129          * for each drhd
3130          *   enable fault log
3131          *   global invalidate context cache
3132          *   global invalidate iotlb
3133          *   enable translation
3134          */
3135         for_each_iommu(iommu, drhd) {
3136                 if (drhd->ignored) {
3137                         /*
3138                          * we always have to disable PMRs or DMA may fail on
3139                          * this device
3140                          */
3141                         if (force_on)
3142                                 iommu_disable_protect_mem_regions(iommu);
3143                         continue;
3144                 }
3145
3146                 iommu_flush_write_buffer(iommu);
3147
3148 #ifdef CONFIG_INTEL_IOMMU_SVM
3149                 if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
3150                         /*
3151                          * Calling dmar_alloc_hwirq() with dmar_global_lock held
3152                          * could cause a lock race, so drop the lock around it.
3153                          */
3154                         up_write(&dmar_global_lock);
3155                         ret = intel_svm_enable_prq(iommu);
3156                         down_write(&dmar_global_lock);
3157                         if (ret)
3158                                 goto free_iommu;
3159                 }
3160 #endif
3161                 ret = dmar_set_interrupt(iommu);
3162                 if (ret)
3163                         goto free_iommu;
3164         }
3165
3166         return 0;
3167
3168 free_iommu:
3169         for_each_active_iommu(iommu, drhd) {
3170                 disable_dmar_iommu(iommu);
3171                 free_dmar_iommu(iommu);
3172         }
3173
3174         kfree(g_iommus);
3175
3176 error:
3177         return ret;
3178 }
3179
3180 static void __init init_no_remapping_devices(void)
3181 {
3182         struct dmar_drhd_unit *drhd;
3183         struct device *dev;
3184         int i;
3185
3186         for_each_drhd_unit(drhd) {
3187                 if (!drhd->include_all) {
3188                         for_each_active_dev_scope(drhd->devices,
3189                                                   drhd->devices_cnt, i, dev)
3190                                 break;
3191                         /* ignore DMAR unit if no devices exist */
3192                         if (i == drhd->devices_cnt)
3193                                 drhd->ignored = 1;
3194                 }
3195         }
3196
3197         for_each_active_drhd_unit(drhd) {
3198                 if (drhd->include_all)
3199                         continue;
3200
3201                 for_each_active_dev_scope(drhd->devices,
3202                                           drhd->devices_cnt, i, dev)
3203                         if (!dev_is_pci(dev) || !IS_GFX_DEVICE(to_pci_dev(dev)))
3204                                 break;
3205                 if (i < drhd->devices_cnt)
3206                         continue;
3207
3208                 /* This IOMMU has *only* gfx devices. Either bypass it or
3209                    mark it gfx-dedicated, as appropriate. */
3210                 drhd->gfx_dedicated = 1;
3211                 if (!dmar_map_gfx)
3212                         drhd->ignored = 1;
3213         }
3214 }
3215
3216 #ifdef CONFIG_SUSPEND
3217 static int init_iommu_hw(void)
3218 {
3219         struct dmar_drhd_unit *drhd;
3220         struct intel_iommu *iommu = NULL;
3221
3222         for_each_active_iommu(iommu, drhd)
3223                 if (iommu->qi)
3224                         dmar_reenable_qi(iommu);
3225
3226         for_each_iommu(iommu, drhd) {
3227                 if (drhd->ignored) {
3228                         /*
3229                          * we always have to disable PMRs or DMA may fail on
3230                          * this device
3231                          */
3232                         if (force_on)
3233                                 iommu_disable_protect_mem_regions(iommu);
3234                         continue;
3235                 }
3236
3237                 iommu_flush_write_buffer(iommu);
3238                 iommu_set_root_entry(iommu);
3239                 iommu_enable_translation(iommu);
3240                 iommu_disable_protect_mem_regions(iommu);
3241         }
3242
3243         return 0;
3244 }
3245
3246 static void iommu_flush_all(void)
3247 {
3248         struct dmar_drhd_unit *drhd;
3249         struct intel_iommu *iommu;
3250
3251         for_each_active_iommu(iommu, drhd) {
3252                 iommu->flush.flush_context(iommu, 0, 0, 0,
3253                                            DMA_CCMD_GLOBAL_INVL);
3254                 iommu->flush.flush_iotlb(iommu, 0, 0, 0,
3255                                          DMA_TLB_GLOBAL_FLUSH);
3256         }
3257 }
3258
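/*
 * Suspend/resume only needs to preserve the fault-event registers; the root
 * entry and translation state are re-established by init_iommu_hw() on
 * resume.  The save/restore pairs below are symmetric (sketch of one pair):
 *
 *	iommu->iommu_state[SR_DMAR_FECTL_REG] =
 *		readl(iommu->reg + DMAR_FECTL_REG);		suspend
 *	writel(iommu->iommu_state[SR_DMAR_FECTL_REG],
 *		iommu->reg + DMAR_FECTL_REG);			resume
 */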
3259 static int iommu_suspend(void)
3260 {
3261         struct dmar_drhd_unit *drhd;
3262         struct intel_iommu *iommu = NULL;
3263         unsigned long flag;
3264
3265         for_each_active_iommu(iommu, drhd) {
3266                 iommu->iommu_state = kcalloc(MAX_SR_DMAR_REGS, sizeof(u32),
3267                                              GFP_KERNEL);
3268                 if (!iommu->iommu_state)
3269                         goto nomem;
3270         }
3271
3272         iommu_flush_all();
3273
3274         for_each_active_iommu(iommu, drhd) {
3275                 iommu_disable_translation(iommu);
3276
3277                 raw_spin_lock_irqsave(&iommu->register_lock, flag);
3278
3279                 iommu->iommu_state[SR_DMAR_FECTL_REG] =
3280                         readl(iommu->reg + DMAR_FECTL_REG);
3281                 iommu->iommu_state[SR_DMAR_FEDATA_REG] =
3282                         readl(iommu->reg + DMAR_FEDATA_REG);
3283                 iommu->iommu_state[SR_DMAR_FEADDR_REG] =
3284                         readl(iommu->reg + DMAR_FEADDR_REG);
3285                 iommu->iommu_state[SR_DMAR_FEUADDR_REG] =
3286                         readl(iommu->reg + DMAR_FEUADDR_REG);
3287
3288                 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
3289         }
3290         return 0;
3291
3292 nomem:
3293         for_each_active_iommu(iommu, drhd)
3294                 kfree(iommu->iommu_state);
3295
3296         return -ENOMEM;
3297 }
3298
3299 static void iommu_resume(void)
3300 {
3301         struct dmar_drhd_unit *drhd;
3302         struct intel_iommu *iommu = NULL;
3303         unsigned long flag;
3304
3305         if (init_iommu_hw()) {
3306                 if (force_on)
3307                         panic("tboot: IOMMU setup failed, DMAR can not resume!\n");
3308                 else
3309                         WARN(1, "IOMMU setup failed, DMAR can not resume!\n");
3310                 return;
3311         }
3312
3313         for_each_active_iommu(iommu, drhd) {
3314
3315                 raw_spin_lock_irqsave(&iommu->register_lock, flag);
3316
3317                 writel(iommu->iommu_state[SR_DMAR_FECTL_REG],
3318                         iommu->reg + DMAR_FECTL_REG);
3319                 writel(iommu->iommu_state[SR_DMAR_FEDATA_REG],
3320                         iommu->reg + DMAR_FEDATA_REG);
3321                 writel(iommu->iommu_state[SR_DMAR_FEADDR_REG],
3322                         iommu->reg + DMAR_FEADDR_REG);
3323                 writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG],
3324                         iommu->reg + DMAR_FEUADDR_REG);
3325
3326                 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
3327         }
3328
3329         for_each_active_iommu(iommu, drhd)
3330                 kfree(iommu->iommu_state);
3331 }
3332
3333 static struct syscore_ops iommu_syscore_ops = {
3334         .resume         = iommu_resume,
3335         .suspend        = iommu_suspend,
3336 };
3337
3338 static void __init init_iommu_pm_ops(void)
3339 {
3340         register_syscore_ops(&iommu_syscore_ops);
3341 }
3342
3343 #else
3344 static inline void init_iommu_pm_ops(void) {}
3345 #endif  /* CONFIG_SUSPEND */
3346
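/*
 * An RMRR is rejected when it is not page aligned or is empty (and is
 * additionally subject to arch_rmrr_sanity_check()).  For example
 * (illustrative values only), base 0x000a1000 with end 0x000a1fff passes
 * (both 4KiB aligned, end above base), while end 0x000a17ff fails the
 * IS_ALIGNED(end + 1, PAGE_SIZE) check.
 */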
3347 static int __init rmrr_sanity_check(struct acpi_dmar_reserved_memory *rmrr)
3348 {
3349         if (!IS_ALIGNED(rmrr->base_address, PAGE_SIZE) ||
3350             !IS_ALIGNED(rmrr->end_address + 1, PAGE_SIZE) ||
3351             rmrr->end_address <= rmrr->base_address ||
3352             arch_rmrr_sanity_check(rmrr))
3353                 return -EINVAL;
3354
3355         return 0;
3356 }
3357
3358 int __init dmar_parse_one_rmrr(struct acpi_dmar_header *header, void *arg)
3359 {
3360         struct acpi_dmar_reserved_memory *rmrr;
3361         struct dmar_rmrr_unit *rmrru;
3362
3363         rmrr = (struct acpi_dmar_reserved_memory *)header;
3364         if (rmrr_sanity_check(rmrr)) {
3365                 pr_warn(FW_BUG
3366                            "Your BIOS is broken; bad RMRR [%#018Lx-%#018Lx]\n"
3367                            "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
3368                            rmrr->base_address, rmrr->end_address,
3369                            dmi_get_system_info(DMI_BIOS_VENDOR),
3370                            dmi_get_system_info(DMI_BIOS_VERSION),
3371                            dmi_get_system_info(DMI_PRODUCT_VERSION));
3372                 add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
3373         }
3374
3375         rmrru = kzalloc(sizeof(*rmrru), GFP_KERNEL);
3376         if (!rmrru)
3377                 goto out;
3378
3379         rmrru->hdr = header;
3380
3381         rmrru->base_address = rmrr->base_address;
3382         rmrru->end_address = rmrr->end_address;
3383
3384         rmrru->devices = dmar_alloc_dev_scope((void *)(rmrr + 1),
3385                                 ((void *)rmrr) + rmrr->header.length,
3386                                 &rmrru->devices_cnt);
3387         if (rmrru->devices_cnt && rmrru->devices == NULL)
3388                 goto free_rmrru;
3389
3390         list_add(&rmrru->list, &dmar_rmrr_units);
3391
3392         return 0;
3393 free_rmrru:
3394         kfree(rmrru);
3395 out:
3396         return -ENOMEM;
3397 }
3398
3399 static struct dmar_atsr_unit *dmar_find_atsr(struct acpi_dmar_atsr *atsr)
3400 {
3401         struct dmar_atsr_unit *atsru;
3402         struct acpi_dmar_atsr *tmp;
3403
3404         list_for_each_entry_rcu(atsru, &dmar_atsr_units, list,
3405                                 dmar_rcu_check()) {
3406                 tmp = (struct acpi_dmar_atsr *)atsru->hdr;
3407                 if (atsr->segment != tmp->segment)
3408                         continue;
3409                 if (atsr->header.length != tmp->header.length)
3410                         continue;
3411                 if (memcmp(atsr, tmp, atsr->header.length) == 0)
3412                         return atsru;
3413         }
3414
3415         return NULL;
3416 }
3417
3418 int dmar_parse_one_atsr(struct acpi_dmar_header *hdr, void *arg)
3419 {
3420         struct acpi_dmar_atsr *atsr;
3421         struct dmar_atsr_unit *atsru;
3422
3423         if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled)
3424                 return 0;
3425
3426         atsr = container_of(hdr, struct acpi_dmar_atsr, header);
3427         atsru = dmar_find_atsr(atsr);
3428         if (atsru)
3429                 return 0;
3430
3431         atsru = kzalloc(sizeof(*atsru) + hdr->length, GFP_KERNEL);
3432         if (!atsru)
3433                 return -ENOMEM;
3434
3435         /*
3436          * If memory is allocated from slab by ACPI _DSM method, we need to
3437          * copy the memory content because the memory buffer will be freed
3438          * on return.
3439          */
3440         atsru->hdr = (void *)(atsru + 1);
3441         memcpy(atsru->hdr, hdr, hdr->length);
3442         atsru->include_all = atsr->flags & 0x1;
3443         if (!atsru->include_all) {
3444                 atsru->devices = dmar_alloc_dev_scope((void *)(atsr + 1),
3445                                 (void *)atsr + atsr->header.length,
3446                                 &atsru->devices_cnt);
3447                 if (atsru->devices_cnt && atsru->devices == NULL) {
3448                         kfree(atsru);
3449                         return -ENOMEM;
3450                 }
3451         }
3452
3453         list_add_rcu(&atsru->list, &dmar_atsr_units);
3454
3455         return 0;
3456 }
3457
3458 static void intel_iommu_free_atsr(struct dmar_atsr_unit *atsru)
3459 {
3460         dmar_free_dev_scope(&atsru->devices, &atsru->devices_cnt);
3461         kfree(atsru);
3462 }
3463
3464 int dmar_release_one_atsr(struct acpi_dmar_header *hdr, void *arg)
3465 {
3466         struct acpi_dmar_atsr *atsr;
3467         struct dmar_atsr_unit *atsru;
3468
3469         atsr = container_of(hdr, struct acpi_dmar_atsr, header);
3470         atsru = dmar_find_atsr(atsr);
3471         if (atsru) {
3472                 list_del_rcu(&atsru->list);
3473                 synchronize_rcu();
3474                 intel_iommu_free_atsr(atsru);
3475         }
3476
3477         return 0;
3478 }
3479
3480 int dmar_check_one_atsr(struct acpi_dmar_header *hdr, void *arg)
3481 {
3482         int i;
3483         struct device *dev;
3484         struct acpi_dmar_atsr *atsr;
3485         struct dmar_atsr_unit *atsru;
3486
3487         atsr = container_of(hdr, struct acpi_dmar_atsr, header);
3488         atsru = dmar_find_atsr(atsr);
3489         if (!atsru)
3490                 return 0;
3491
3492         if (!atsru->include_all && atsru->devices && atsru->devices_cnt) {
3493                 for_each_active_dev_scope(atsru->devices, atsru->devices_cnt,
3494                                           i, dev)
3495                         return -EBUSY;
3496         }
3497
3498         return 0;
3499 }
3500
3501 static struct dmar_satc_unit *dmar_find_satc(struct acpi_dmar_satc *satc)
3502 {
3503         struct dmar_satc_unit *satcu;
3504         struct acpi_dmar_satc *tmp;
3505
3506         list_for_each_entry_rcu(satcu, &dmar_satc_units, list,
3507                                 dmar_rcu_check()) {
3508                 tmp = (struct acpi_dmar_satc *)satcu->hdr;
3509                 if (satc->segment != tmp->segment)
3510                         continue;
3511                 if (satc->header.length != tmp->header.length)
3512                         continue;
3513                 if (memcmp(satc, tmp, satc->header.length) == 0)
3514                         return satcu;
3515         }
3516
3517         return NULL;
3518 }
3519
3520 int dmar_parse_one_satc(struct acpi_dmar_header *hdr, void *arg)
3521 {
3522         struct acpi_dmar_satc *satc;
3523         struct dmar_satc_unit *satcu;
3524
3525         if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled)
3526                 return 0;
3527
3528         satc = container_of(hdr, struct acpi_dmar_satc, header);
3529         satcu = dmar_find_satc(satc);
3530         if (satcu)
3531                 return 0;
3532
3533         satcu = kzalloc(sizeof(*satcu) + hdr->length, GFP_KERNEL);
3534         if (!satcu)
3535                 return -ENOMEM;
3536
3537         satcu->hdr = (void *)(satcu + 1);
3538         memcpy(satcu->hdr, hdr, hdr->length);
3539         satcu->atc_required = satc->flags & 0x1;
3540         satcu->devices = dmar_alloc_dev_scope((void *)(satc + 1),
3541                                               (void *)satc + satc->header.length,
3542                                               &satcu->devices_cnt);
3543         if (satcu->devices_cnt && !satcu->devices) {
3544                 kfree(satcu);
3545                 return -ENOMEM;
3546         }
3547         list_add_rcu(&satcu->list, &dmar_satc_units);
3548
3549         return 0;
3550 }
3551
3552 static int intel_iommu_add(struct dmar_drhd_unit *dmaru)
3553 {
3554         int sp, ret;
3555         struct intel_iommu *iommu = dmaru->iommu;
3556
3557         if (g_iommus[iommu->seq_id])
3558                 return 0;
3559
3560         ret = intel_cap_audit(CAP_AUDIT_HOTPLUG_DMAR, iommu);
3561         if (ret)
3562                 goto out;
3563
3564         if (hw_pass_through && !ecap_pass_through(iommu->ecap)) {
3565                 pr_warn("%s: Doesn't support hardware pass through.\n",
3566                         iommu->name);
3567                 return -ENXIO;
3568         }
3569
3570         sp = domain_update_iommu_superpage(NULL, iommu) - 1;
3571         if (sp >= 0 && !(cap_super_page_val(iommu->cap) & (1 << sp))) {
3572                 pr_warn("%s: Doesn't support large page.\n",
3573                         iommu->name);
3574                 return -ENXIO;
3575         }
3576
3577         /*
3578          * Disable translation if already enabled prior to OS handover.
3579          */
3580         if (iommu->gcmd & DMA_GCMD_TE)
3581                 iommu_disable_translation(iommu);
3582
3583         g_iommus[iommu->seq_id] = iommu;
3584         ret = iommu_init_domains(iommu);
3585         if (ret == 0)
3586                 ret = iommu_alloc_root_entry(iommu);
3587         if (ret)
3588                 goto out;
3589
3590         intel_svm_check(iommu);
3591
3592         if (dmaru->ignored) {
3593                 /*
3594                  * we always have to disable PMRs or DMA may fail on this device
3595                  */
3596                 if (force_on)
3597                         iommu_disable_protect_mem_regions(iommu);
3598                 return 0;
3599         }
3600
3601         intel_iommu_init_qi(iommu);
3602         iommu_flush_write_buffer(iommu);
3603
3604 #ifdef CONFIG_INTEL_IOMMU_SVM
3605         if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
3606                 ret = intel_svm_enable_prq(iommu);
3607                 if (ret)
3608                         goto disable_iommu;
3609         }
3610 #endif
3611         ret = dmar_set_interrupt(iommu);
3612         if (ret)
3613                 goto disable_iommu;
3614
3615         iommu_set_root_entry(iommu);
3616         iommu_enable_translation(iommu);
3617
3618         iommu_disable_protect_mem_regions(iommu);
3619         return 0;
3620
3621 disable_iommu:
3622         disable_dmar_iommu(iommu);
3623 out:
3624         free_dmar_iommu(iommu);
3625         return ret;
3626 }
3627
3628 int dmar_iommu_hotplug(struct dmar_drhd_unit *dmaru, bool insert)
3629 {
3630         int ret = 0;
3631         struct intel_iommu *iommu = dmaru->iommu;
3632
3633         if (!intel_iommu_enabled)
3634                 return 0;
3635         if (iommu == NULL)
3636                 return -EINVAL;
3637
3638         if (insert) {
3639                 ret = intel_iommu_add(dmaru);
3640         } else {
3641                 disable_dmar_iommu(iommu);
3642                 free_dmar_iommu(iommu);
3643         }
3644
3645         return ret;
3646 }
3647
3648 static void intel_iommu_free_dmars(void)
3649 {
3650         struct dmar_rmrr_unit *rmrru, *rmrr_n;
3651         struct dmar_atsr_unit *atsru, *atsr_n;
3652         struct dmar_satc_unit *satcu, *satc_n;
3653
3654         list_for_each_entry_safe(rmrru, rmrr_n, &dmar_rmrr_units, list) {
3655                 list_del(&rmrru->list);
3656                 dmar_free_dev_scope(&rmrru->devices, &rmrru->devices_cnt);
3657                 kfree(rmrru);
3658         }
3659
3660         list_for_each_entry_safe(atsru, atsr_n, &dmar_atsr_units, list) {
3661                 list_del(&atsru->list);
3662                 intel_iommu_free_atsr(atsru);
3663         }
3664         list_for_each_entry_safe(satcu, satc_n, &dmar_satc_units, list) {
3665                 list_del(&satcu->list);
3666                 dmar_free_dev_scope(&satcu->devices, &satcu->devices_cnt);
3667                 kfree(satcu);
3668         }
3669 }
3670
3671 static struct dmar_satc_unit *dmar_find_matched_satc_unit(struct pci_dev *dev)
3672 {
3673         struct dmar_satc_unit *satcu;
3674         struct acpi_dmar_satc *satc;
3675         struct device *tmp;
3676         int i;
3677
3678         dev = pci_physfn(dev);
3679         rcu_read_lock();
3680
3681         list_for_each_entry_rcu(satcu, &dmar_satc_units, list) {
3682                 satc = container_of(satcu->hdr, struct acpi_dmar_satc, header);
3683                 if (satc->segment != pci_domain_nr(dev->bus))
3684                         continue;
3685                 for_each_dev_scope(satcu->devices, satcu->devices_cnt, i, tmp)
3686                         if (to_pci_dev(tmp) == dev)
3687                                 goto out;
3688         }
3689         satcu = NULL;
3690 out:
3691         rcu_read_unlock();
3692         return satcu;
3693 }
3694
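/*
 * ATS eligibility, summarising the logic below (not a new policy):
 *
 *	device listed in a SATC unit -> ATS is supported, but if the SATC
 *	    entry requires ATC and the IOMMU is in legacy mode, the OS must
 *	    not enable ATS itself (hardware already does)
 *	integrated device (no upstream bridge) -> allow ATS
 *	any non-PCIe or conventional PCI bridge on the path -> no ATS
 *	otherwise -> allow ATS only if the root port is covered by an ATSR
 *	    device scope, or the ATSR matches all ports (include_all)
 */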
3695 static int dmar_ats_supported(struct pci_dev *dev, struct intel_iommu *iommu)
3696 {
3697         int i, ret = 1;
3698         struct pci_bus *bus;
3699         struct pci_dev *bridge = NULL;
3700         struct device *tmp;
3701         struct acpi_dmar_atsr *atsr;
3702         struct dmar_atsr_unit *atsru;
3703         struct dmar_satc_unit *satcu;
3704
3705         dev = pci_physfn(dev);
3706         satcu = dmar_find_matched_satc_unit(dev);
3707         if (satcu)
3708                 /*
3709                  * This device supports ATS as it is in the SATC table.
3710                  * When the IOMMU is in legacy mode, enabling ATS is done
3711                  * automatically by HW for a device that requires ATS,
3712                  * hence the OS should not enable ATS on this device, to
3713                  * avoid duplicated TLB invalidation.
3714                  */
3715                 return !(satcu->atc_required && !sm_supported(iommu));
3716
3717         for (bus = dev->bus; bus; bus = bus->parent) {
3718                 bridge = bus->self;
3719                 /* If it's an integrated device, allow ATS */
3720                 if (!bridge)
3721                         return 1;
3722                 /* Connected via non-PCIe: no ATS */
3723                 if (!pci_is_pcie(bridge) ||
3724                     pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE)
3725                         return 0;
3726                 /* If we found the root port, look it up in the ATSR */
3727                 if (pci_pcie_type(bridge) == PCI_EXP_TYPE_ROOT_PORT)
3728                         break;
3729         }
3730
3731         rcu_read_lock();
3732         list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) {
3733                 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
3734                 if (atsr->segment != pci_domain_nr(dev->bus))
3735                         continue;
3736
3737                 for_each_dev_scope(atsru->devices, atsru->devices_cnt, i, tmp)
3738                         if (tmp == &bridge->dev)
3739                                 goto out;
3740
3741                 if (atsru->include_all)
3742                         goto out;
3743         }
3744         ret = 0;
3745 out:
3746         rcu_read_unlock();
3747
3748         return ret;
3749 }
3750
3751 int dmar_iommu_notify_scope_dev(struct dmar_pci_notify_info *info)
3752 {
3753         int ret;
3754         struct dmar_rmrr_unit *rmrru;
3755         struct dmar_atsr_unit *atsru;
3756         struct dmar_satc_unit *satcu;
3757         struct acpi_dmar_atsr *atsr;
3758         struct acpi_dmar_reserved_memory *rmrr;
3759         struct acpi_dmar_satc *satc;
3760
3761         if (!intel_iommu_enabled && system_state >= SYSTEM_RUNNING)
3762                 return 0;
3763
3764         list_for_each_entry(rmrru, &dmar_rmrr_units, list) {
3765                 rmrr = container_of(rmrru->hdr,
3766                                     struct acpi_dmar_reserved_memory, header);
3767                 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
3768                         ret = dmar_insert_dev_scope(info, (void *)(rmrr + 1),
3769                                 ((void *)rmrr) + rmrr->header.length,
3770                                 rmrr->segment, rmrru->devices,
3771                                 rmrru->devices_cnt);
3772                         if (ret < 0)
3773                                 return ret;
3774                 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
3775                         dmar_remove_dev_scope(info, rmrr->segment,
3776                                 rmrru->devices, rmrru->devices_cnt);
3777                 }
3778         }
3779
3780         list_for_each_entry(atsru, &dmar_atsr_units, list) {
3781                 if (atsru->include_all)
3782                         continue;
3783
3784                 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
3785                 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
3786                         ret = dmar_insert_dev_scope(info, (void *)(atsr + 1),
3787                                         (void *)atsr + atsr->header.length,
3788                                         atsr->segment, atsru->devices,
3789                                         atsru->devices_cnt);
3790                         if (ret > 0)
3791                                 break;
3792                         else if (ret < 0)
3793                                 return ret;
3794                 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
3795                         if (dmar_remove_dev_scope(info, atsr->segment,
3796                                         atsru->devices, atsru->devices_cnt))
3797                                 break;
3798                 }
3799         }
3800         list_for_each_entry(satcu, &dmar_satc_units, list) {
3801                 satc = container_of(satcu->hdr, struct acpi_dmar_satc, header);
3802                 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
3803                         ret = dmar_insert_dev_scope(info, (void *)(satc + 1),
3804                                         (void *)satc + satc->header.length,
3805                                         satc->segment, satcu->devices,
3806                                         satcu->devices_cnt);
3807                         if (ret > 0)
3808                                 break;
3809                         else if (ret < 0)
3810                                 return ret;
3811                 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
3812                         if (dmar_remove_dev_scope(info, satc->segment,
3813                                         satcu->devices, satcu->devices_cnt))
3814                                 break;
3815                 }
3816         }
3817
3818         return 0;
3819 }
3820
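/*
 * Memory hotplug handling for the identity (si_domain) map: newly onlined
 * ranges are added with iommu_domain_identity_map(), and ranges going away
 * are unmapped and flushed with a PSI on every active IOMMU.  The pfn range
 * is derived from the notifier data, roughly:
 *
 *	start_vpfn = mm_to_dma_pfn(mhp->start_pfn);
 *	last_vpfn  = mm_to_dma_pfn(mhp->start_pfn + mhp->nr_pages - 1);
 */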
3821 static int intel_iommu_memory_notifier(struct notifier_block *nb,
3822                                        unsigned long val, void *v)
3823 {
3824         struct memory_notify *mhp = v;
3825         unsigned long start_vpfn = mm_to_dma_pfn(mhp->start_pfn);
3826         unsigned long last_vpfn = mm_to_dma_pfn(mhp->start_pfn +
3827                         mhp->nr_pages - 1);
3828
3829         switch (val) {
3830         case MEM_GOING_ONLINE:
3831                 if (iommu_domain_identity_map(si_domain,
3832                                               start_vpfn, last_vpfn)) {
3833                         pr_warn("Failed to build identity map for [%lx-%lx]\n",
3834                                 start_vpfn, last_vpfn);
3835                         return NOTIFY_BAD;
3836                 }
3837                 break;
3838
3839         case MEM_OFFLINE:
3840         case MEM_CANCEL_ONLINE:
3841                 {
3842                         struct dmar_drhd_unit *drhd;
3843                         struct intel_iommu *iommu;
3844                         LIST_HEAD(freelist);
3845
3846                         domain_unmap(si_domain, start_vpfn, last_vpfn, &freelist);
3847
3848                         rcu_read_lock();
3849                         for_each_active_iommu(iommu, drhd)
3850                                 iommu_flush_iotlb_psi(iommu, si_domain,
3851                                         start_vpfn, mhp->nr_pages,
3852                                         list_empty(&freelist), 0);
3853                         rcu_read_unlock();
3854                         put_pages_list(&freelist);
3855                 }
3856                 break;
3857         }
3858
3859         return NOTIFY_OK;
3860 }
3861
3862 static struct notifier_block intel_iommu_memory_nb = {
3863         .notifier_call = intel_iommu_memory_notifier,
3864         .priority = 0
3865 };
3866
3867 static void intel_disable_iommus(void)
3868 {
3869         struct intel_iommu *iommu = NULL;
3870         struct dmar_drhd_unit *drhd;
3871
3872         for_each_iommu(iommu, drhd)
3873                 iommu_disable_translation(iommu);
3874 }
3875
3876 void intel_iommu_shutdown(void)
3877 {
3878         struct dmar_drhd_unit *drhd;
3879         struct intel_iommu *iommu = NULL;
3880
3881         if (no_iommu || dmar_disabled)
3882                 return;
3883
3884         down_write(&dmar_global_lock);
3885
3886         /* Disable PMRs explicitly here. */
3887         for_each_iommu(iommu, drhd)
3888                 iommu_disable_protect_mem_regions(iommu);
3889
3890         /* Make sure the IOMMUs are switched off */
3891         intel_disable_iommus();
3892
3893         up_write(&dmar_global_lock);
3894 }
3895
3896 static inline struct intel_iommu *dev_to_intel_iommu(struct device *dev)
3897 {
3898         struct iommu_device *iommu_dev = dev_to_iommu_device(dev);
3899
3900         return container_of(iommu_dev, struct intel_iommu, iommu);
3901 }
3902
3903 static ssize_t version_show(struct device *dev,
3904                             struct device_attribute *attr, char *buf)
3905 {
3906         struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3907         u32 ver = readl(iommu->reg + DMAR_VER_REG);
3908         return sprintf(buf, "%d:%d\n",
3909                        DMAR_VER_MAJOR(ver), DMAR_VER_MINOR(ver));
3910 }
3911 static DEVICE_ATTR_RO(version);
3912
3913 static ssize_t address_show(struct device *dev,
3914                             struct device_attribute *attr, char *buf)
3915 {
3916         struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3917         return sprintf(buf, "%llx\n", iommu->reg_phys);
3918 }
3919 static DEVICE_ATTR_RO(address);
3920
3921 static ssize_t cap_show(struct device *dev,
3922                         struct device_attribute *attr, char *buf)
3923 {
3924         struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3925         return sprintf(buf, "%llx\n", iommu->cap);
3926 }
3927 static DEVICE_ATTR_RO(cap);
3928
3929 static ssize_t ecap_show(struct device *dev,
3930                          struct device_attribute *attr, char *buf)
3931 {
3932         struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3933         return sprintf(buf, "%llx\n", iommu->ecap);
3934 }
3935 static DEVICE_ATTR_RO(ecap);
3936
3937 static ssize_t domains_supported_show(struct device *dev,
3938                                       struct device_attribute *attr, char *buf)
3939 {
3940         struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3941         return sprintf(buf, "%ld\n", cap_ndoms(iommu->cap));
3942 }
3943 static DEVICE_ATTR_RO(domains_supported);
3944
3945 static ssize_t domains_used_show(struct device *dev,
3946                                  struct device_attribute *attr, char *buf)
3947 {
3948         struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3949         return sprintf(buf, "%d\n", bitmap_weight(iommu->domain_ids,
3950                                                   cap_ndoms(iommu->cap)));
3951 }
3952 static DEVICE_ATTR_RO(domains_used);
3953
3954 static struct attribute *intel_iommu_attrs[] = {
3955         &dev_attr_version.attr,
3956         &dev_attr_address.attr,
3957         &dev_attr_cap.attr,
3958         &dev_attr_ecap.attr,
3959         &dev_attr_domains_supported.attr,
3960         &dev_attr_domains_used.attr,
3961         NULL,
3962 };
3963
3964 static struct attribute_group intel_iommu_group = {
3965         .name = "intel-iommu",
3966         .attrs = intel_iommu_attrs,
3967 };
3968
3969 const struct attribute_group *intel_iommu_groups[] = {
3970         &intel_iommu_group,
3971         NULL,
3972 };
3973
3974 static inline bool has_external_pci(void)
3975 {
3976         struct pci_dev *pdev = NULL;
3977
3978         for_each_pci_dev(pdev)
3979                 if (pdev->external_facing)
3980                         return true;
3981
3982         return false;
3983 }
3984
3985 static int __init platform_optin_force_iommu(void)
3986 {
3987         if (!dmar_platform_optin() || no_platform_optin || !has_external_pci())
3988                 return 0;
3989
3990         if (no_iommu || dmar_disabled)
3991                 pr_info("Intel-IOMMU force enabled due to platform opt in\n");
3992
3993         /*
3994          * If Intel-IOMMU is disabled by default, we will apply identity
3995          * map for all devices except those marked as being untrusted.
3996          */
3997         if (dmar_disabled)
3998                 iommu_set_default_passthrough(false);
3999
4000         dmar_disabled = 0;
4001         no_iommu = 0;
4002
4003         return 1;
4004 }
4005
4006 static int __init probe_acpi_namespace_devices(void)
4007 {
4008         struct dmar_drhd_unit *drhd;
4009         /* To avoid a -Wunused-but-set-variable warning. */
4010         struct intel_iommu *iommu __maybe_unused;
4011         struct device *dev;
4012         int i, ret = 0;
4013
4014         for_each_active_iommu(iommu, drhd) {
4015                 for_each_active_dev_scope(drhd->devices,
4016                                           drhd->devices_cnt, i, dev) {
4017                         struct acpi_device_physical_node *pn;
4018                         struct iommu_group *group;
4019                         struct acpi_device *adev;
4020
4021                         if (dev->bus != &acpi_bus_type)
4022                                 continue;
4023
4024                         adev = to_acpi_device(dev);
4025                         mutex_lock(&adev->physical_node_lock);
4026                         list_for_each_entry(pn,
4027                                             &adev->physical_node_list, node) {
4028                                 group = iommu_group_get(pn->dev);
4029                                 if (group) {
4030                                         iommu_group_put(group);
4031                                         continue;
4032                                 }
4033
4034                                 pn->dev->bus->iommu_ops = &intel_iommu_ops;
4035                                 ret = iommu_probe_device(pn->dev);
4036                                 if (ret)
4037                                         break;
4038                         }
4039                         mutex_unlock(&adev->physical_node_lock);
4040
4041                         if (ret)
4042                                 return ret;
4043                 }
4044         }
4045
4046         return 0;
4047 }
4048
4049 int __init intel_iommu_init(void)
4050 {
4051         int ret = -ENODEV;
4052         struct dmar_drhd_unit *drhd;
4053         struct intel_iommu *iommu;
4054
4055         /*
4056          * Intel IOMMU is required for a TXT/tboot launch or platform
4057          * opt in, so enforce that.
4058          */
4059         force_on = (!intel_iommu_tboot_noforce && tboot_force_iommu()) ||
4060                     platform_optin_force_iommu();
4061
4062         down_write(&dmar_global_lock);
4063         if (dmar_table_init()) {
4064                 if (force_on)
4065                         panic("tboot: Failed to initialize DMAR table\n");
4066                 goto out_free_dmar;
4067         }
4068
4069         if (dmar_dev_scope_init() < 0) {
4070                 if (force_on)
4071                         panic("tboot: Failed to initialize DMAR device scope\n");
4072                 goto out_free_dmar;
4073         }
4074
4075         up_write(&dmar_global_lock);
4076
4077         /*
4078          * The bus notifier takes the dmar_global_lock, so lockdep will
4079          * complain later when we register it under the lock.
4080          */
4081         dmar_register_bus_notifier();
4082
4083         down_write(&dmar_global_lock);
4084
4085         if (!no_iommu)
4086                 intel_iommu_debugfs_init();
4087
4088         if (no_iommu || dmar_disabled) {
4089                 /*
4090                  * We exit the function here to ensure IOMMU's remapping and
4091                  * mempool aren't setup, which means that the IOMMU's PMRs
4092                  * won't be disabled via the call to init_dmars(). So disable
4093                  * it explicitly here. The PMRs were setup by tboot prior to
4094                  * calling SENTER, but the kernel is expected to reset/tear
4095                  * down the PMRs.
4096                  */
4097                 if (intel_iommu_tboot_noforce) {
4098                         for_each_iommu(iommu, drhd)
4099                                 iommu_disable_protect_mem_regions(iommu);
4100                 }
4101
4102                 /*
4103                  * Make sure the IOMMUs are switched off, even when we
4104                  * boot into a kexec kernel and the previous kernel left
4105                  * them enabled
4106                  */
4107                 intel_disable_iommus();
4108                 goto out_free_dmar;
4109         }
4110
4111         if (list_empty(&dmar_rmrr_units))
4112                 pr_info("No RMRR found\n");
4113
4114         if (list_empty(&dmar_atsr_units))
4115                 pr_info("No ATSR found\n");
4116
4117         if (list_empty(&dmar_satc_units))
4118                 pr_info("No SATC found\n");
4119
4120         if (dmar_map_gfx)
4121                 intel_iommu_gfx_mapped = 1;
4122
4123         init_no_remapping_devices();
4124
4125         ret = init_dmars();
4126         if (ret) {
4127                 if (force_on)
4128                         panic("tboot: Failed to initialize DMARs\n");
4129                 pr_err("Initialization failed\n");
4130                 goto out_free_dmar;
4131         }
4132         up_write(&dmar_global_lock);
4133
4134         init_iommu_pm_ops();
4135
4136         down_read(&dmar_global_lock);
4137         for_each_active_iommu(iommu, drhd) {
4138                 /*
4139                  * The flush queue implementation does not perform
4140                  * page-selective invalidations that are required for efficient
4141                  * TLB flushes in virtual environments.  The benefit of batching
4142                  * is likely to be much lower than the overhead of synchronizing
4143                  * the virtual and physical IOMMU page-tables.
4144                  */
4145                 if (cap_caching_mode(iommu->cap)) {
4146                         pr_info_once("IOMMU batching disallowed due to virtualization\n");
4147                         iommu_set_dma_strict();
4148                 }
4149                 iommu_device_sysfs_add(&iommu->iommu, NULL,
4150                                        intel_iommu_groups,
4151                                        "%s", iommu->name);
4152                 iommu_device_register(&iommu->iommu, &intel_iommu_ops, NULL);
4153         }
4154         up_read(&dmar_global_lock);
4155
4156         bus_set_iommu(&pci_bus_type, &intel_iommu_ops);
4157         if (si_domain && !hw_pass_through)
4158                 register_memory_notifier(&intel_iommu_memory_nb);
4159
4160         down_read(&dmar_global_lock);
4161         if (probe_acpi_namespace_devices())
4162                 pr_warn("ACPI name space devices didn't probe correctly\n");
4163
4164         /* Finally, we enable the DMA remapping hardware. */
4165         for_each_iommu(iommu, drhd) {
4166                 if (!drhd->ignored && !translation_pre_enabled(iommu))
4167                         iommu_enable_translation(iommu);
4168
4169                 iommu_disable_protect_mem_regions(iommu);
4170         }
4171         up_read(&dmar_global_lock);
4172
4173         pr_info("Intel(R) Virtualization Technology for Directed I/O\n");
4174
4175         intel_iommu_enabled = 1;
4176
4177         return 0;
4178
4179 out_free_dmar:
4180         intel_iommu_free_dmars();
4181         up_write(&dmar_global_lock);
4182         return ret;
4183 }
4184
4185 static int domain_context_clear_one_cb(struct pci_dev *pdev, u16 alias, void *opaque)
4186 {
4187         struct device_domain_info *info = opaque;
4188
4189         domain_context_clear_one(info, PCI_BUS_NUM(alias), alias & 0xff);
4190         return 0;
4191 }
4192
4193 /*
4194  * NB - intel-iommu lacks any sort of reference counting for the users of
4195  * dependent devices.  If multiple endpoints have intersecting dependent
4196  * devices, unbinding the driver from any one of them will possibly leave
4197  * the others unable to operate.
4198  */
4199 static void domain_context_clear(struct device_domain_info *info)
4200 {
4201         if (!info->iommu || !info->dev || !dev_is_pci(info->dev))
4202                 return;
4203
4204         pci_for_each_dma_alias(to_pci_dev(info->dev),
4205                                &domain_context_clear_one_cb, info);
4206 }
4207
4208 static void __dmar_remove_one_dev_info(struct device_domain_info *info)
4209 {
4210         struct dmar_domain *domain;
4211         struct intel_iommu *iommu;
4212         unsigned long flags;
4213
4214         assert_spin_locked(&device_domain_lock);
4215
4216         if (WARN_ON(!info))
4217                 return;
4218
4219         iommu = info->iommu;
4220         domain = info->domain;
4221
4222         if (info->dev && !dev_is_real_dma_subdevice(info->dev)) {
4223                 if (dev_is_pci(info->dev) && sm_supported(iommu))
4224                         intel_pasid_tear_down_entry(iommu, info->dev,
4225                                         PASID_RID2PASID, false);
4226
4227                 iommu_disable_dev_iotlb(info);
4228                 domain_context_clear(info);
4229                 intel_pasid_free_table(info->dev);
4230         }
4231
4232         list_del(&info->link);
4233
4234         spin_lock_irqsave(&iommu->lock, flags);
4235         domain_detach_iommu(domain, iommu);
4236         spin_unlock_irqrestore(&iommu->lock, flags);
4237 }
4238
4239 static void dmar_remove_one_dev_info(struct device *dev)
4240 {
4241         struct device_domain_info *info;
4242         unsigned long flags;
4243
4244         spin_lock_irqsave(&device_domain_lock, flags);
4245         info = dev_iommu_priv_get(dev);
4246         if (info)
4247                 __dmar_remove_one_dev_info(info);
4248         spin_unlock_irqrestore(&device_domain_lock, flags);
4249 }
4250
4251 static int md_domain_init(struct dmar_domain *domain, int guest_width)
4252 {
4253         int adjust_width;
4254
4255         /* calculate AGAW */
4256         domain->gaw = guest_width;
4257         adjust_width = guestwidth_to_adjustwidth(guest_width);
4258         domain->agaw = width_to_agaw(adjust_width);
4259
4260         domain->iommu_coherency = false;
4261         domain->iommu_superpage = 0;
4262         domain->max_addr = 0;
4263
4264         /* always allocate the top pgd */
4265         domain->pgd = alloc_pgtable_page(domain->nid);
4266         if (!domain->pgd)
4267                 return -ENOMEM;
4268         domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
4269         return 0;
4270 }
4271
4272 static struct iommu_domain *intel_iommu_domain_alloc(unsigned type)
4273 {
4274         struct dmar_domain *dmar_domain;
4275         struct iommu_domain *domain;
4276
4277         switch (type) {
4278         case IOMMU_DOMAIN_DMA:
4279         case IOMMU_DOMAIN_DMA_FQ:
4280         case IOMMU_DOMAIN_UNMANAGED:
4281                 dmar_domain = alloc_domain(type);
4282                 if (!dmar_domain) {
4283                         pr_err("Can't allocate dmar_domain\n");
4284                         return NULL;
4285                 }
4286                 if (md_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
4287                         pr_err("Domain initialization failed\n");
4288                         domain_exit(dmar_domain);
4289                         return NULL;
4290                 }
4291
4292                 domain = &dmar_domain->domain;
4293                 domain->geometry.aperture_start = 0;
4294                 domain->geometry.aperture_end   =
4295                                 __DOMAIN_MAX_ADDR(dmar_domain->gaw);
4296                 domain->geometry.force_aperture = true;
4297
4298                 return domain;
4299         case IOMMU_DOMAIN_IDENTITY:
4300                 return &si_domain->domain;
4301         default:
4302                 return NULL;
4303         }
4304
4305         return NULL;
4306 }
4307
4308 static void intel_iommu_domain_free(struct iommu_domain *domain)
4309 {
4310         if (domain != &si_domain->domain)
4311                 domain_exit(to_dmar_domain(domain));
4312 }
4313
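/*
 * Before attaching, clamp the domain to what this IOMMU can address: the
 * usable width is the smaller of the IOMMU's AGAW width and MGAW, the
 * domain must not already map anything beyond that, and if the domain's
 * page table has more levels than the IOMMU supports, the unused top
 * levels are freed.
 */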
4314 static int prepare_domain_attach_device(struct iommu_domain *domain,
4315                                         struct device *dev)
4316 {
4317         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4318         struct intel_iommu *iommu;
4319         int addr_width;
4320
4321         iommu = device_to_iommu(dev, NULL, NULL);
4322         if (!iommu)
4323                 return -ENODEV;
4324
4325         if (dmar_domain->force_snooping && !ecap_sc_support(iommu->ecap))
4326                 return -EOPNOTSUPP;
4327
4328         /* check if this iommu agaw is sufficient for max mapped address */
4329         addr_width = agaw_to_width(iommu->agaw);
4330         if (addr_width > cap_mgaw(iommu->cap))
4331                 addr_width = cap_mgaw(iommu->cap);
4332
4333         if (dmar_domain->max_addr > (1LL << addr_width)) {
4334                 dev_err(dev, "%s: iommu width (%d) is not "
4335                         "sufficient for the mapped address (%llx)\n",
4336                         __func__, addr_width, dmar_domain->max_addr);
4337                 return -EFAULT;
4338         }
4339         dmar_domain->gaw = addr_width;
4340
4341         /*
4342          * Knock out extra levels of page tables if necessary
4343          */
4344         while (iommu->agaw < dmar_domain->agaw) {
4345                 struct dma_pte *pte;
4346
4347                 pte = dmar_domain->pgd;
4348                 if (dma_pte_present(pte)) {
4349                         dmar_domain->pgd = phys_to_virt(dma_pte_addr(pte));
4350                         free_pgtable_page(pte);
4351                 }
4352                 dmar_domain->agaw--;
4353         }
4354
4355         return 0;
4356 }
4357
4358 static int intel_iommu_attach_device(struct iommu_domain *domain,
4359                                      struct device *dev)
4360 {
4361         int ret;
4362
4363         if (domain->type == IOMMU_DOMAIN_UNMANAGED &&
4364             device_is_rmrr_locked(dev)) {
4365                 dev_warn(dev, "Device is ineligible for IOMMU domain attach due to platform RMRR requirement.  Contact your platform vendor.\n");
4366                 return -EPERM;
4367         }
4368
4369         /* normally dev is not mapped */
4370         if (unlikely(domain_context_mapped(dev))) {
4371                 struct device_domain_info *info = dev_iommu_priv_get(dev);
4372
4373                 if (info->domain)
4374                         dmar_remove_one_dev_info(dev);
4375         }
4376
4377         ret = prepare_domain_attach_device(domain, dev);
4378         if (ret)
4379                 return ret;
4380
4381         return domain_add_dev_info(to_dmar_domain(domain), dev);
4382 }
4383
4384 static void intel_iommu_detach_device(struct iommu_domain *domain,
4385                                       struct device *dev)
4386 {
4387         dmar_remove_one_dev_info(dev);
4388 }
4389
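/*
 * Mapping path for the IOMMU API.  IOMMU_READ/IOMMU_WRITE are translated to
 * DMA_PTE_READ/DMA_PTE_WRITE (plus DMA_PTE_SNP when set_pte_snp is set),
 * the requested range is checked against the domain aperture, and the
 * actual PTE writing is delegated to __domain_mapping().  A rough sketch of
 * the prot translation:
 *
 *	prot  = (iommu_prot & IOMMU_READ)  ? DMA_PTE_READ  : 0;
 *	prot |= (iommu_prot & IOMMU_WRITE) ? DMA_PTE_WRITE : 0;
 */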
4390 static int intel_iommu_map(struct iommu_domain *domain,
4391                            unsigned long iova, phys_addr_t hpa,
4392                            size_t size, int iommu_prot, gfp_t gfp)
4393 {
4394         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4395         u64 max_addr;
4396         int prot = 0;
4397
4398         if (iommu_prot & IOMMU_READ)
4399                 prot |= DMA_PTE_READ;
4400         if (iommu_prot & IOMMU_WRITE)
4401                 prot |= DMA_PTE_WRITE;
4402         if (dmar_domain->set_pte_snp)
4403                 prot |= DMA_PTE_SNP;
4404
4405         max_addr = iova + size;
4406         if (dmar_domain->max_addr < max_addr) {
4407                 u64 end;
4408
4409                 /* check if minimum agaw is sufficient for mapped address */
4410                 end = __DOMAIN_MAX_ADDR(dmar_domain->gaw) + 1;
4411                 if (end < max_addr) {
4412                         pr_err("%s: iommu width (%d) is not "
4413                                "sufficient for the mapped address (%llx)\n",
4414                                __func__, dmar_domain->gaw, max_addr);
4415                         return -EFAULT;
4416                 }
4417                 dmar_domain->max_addr = max_addr;
4418         }
4419         /* Round up size to the next multiple of PAGE_SIZE if it and
4420            the low bits of hpa would take us onto the next page. */
4421         size = aligned_nrpages(hpa, size);
4422         return __domain_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT,
4423                                 hpa >> VTD_PAGE_SHIFT, size, prot);
4424 }
4425
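/*
 * map_pages callback: check that @pgsize is one of the supported page
 * sizes (4K/2M/1G) and that @iova and @paddr are aligned to it, then map
 * the whole @pgcount * @pgsize range in one go via intel_iommu_map().
 */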
4426 static int intel_iommu_map_pages(struct iommu_domain *domain,
4427                                  unsigned long iova, phys_addr_t paddr,
4428                                  size_t pgsize, size_t pgcount,
4429                                  int prot, gfp_t gfp, size_t *mapped)
4430 {
4431         unsigned long pgshift = __ffs(pgsize);
4432         size_t size = pgcount << pgshift;
4433         int ret;
4434
4435         if (pgsize != SZ_4K && pgsize != SZ_2M && pgsize != SZ_1G)
4436                 return -EINVAL;
4437
4438         if (!IS_ALIGNED(iova | paddr, pgsize))
4439                 return -EINVAL;
4440
4441         ret = intel_iommu_map(domain, iova, paddr, size, prot, gfp);
4442         if (!ret && mapped)
4443                 *mapped = size;
4444
4445         return ret;
4446 }
4447
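/*
 * Unmap the range starting at @iova.  If the IOVA is covered by a large
 * page, the unmapped size is rounded up to that page size (the core API
 * permits unmapping more than requested).  Freed page-table pages are
 * queued on the gather freelist and the IOTLB flush is deferred to
 * intel_iommu_tlb_sync().
 */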
4448 static size_t intel_iommu_unmap(struct iommu_domain *domain,
4449                                 unsigned long iova, size_t size,
4450                                 struct iommu_iotlb_gather *gather)
4451 {
4452         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4453         unsigned long start_pfn, last_pfn;
4454         int level = 0;
4455
4456         /* Cope with horrid API which requires us to unmap more than the
4457            size argument if it happens to be a large-page mapping. */
4458         BUG_ON(!pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level));
4459
4460         if (size < VTD_PAGE_SIZE << level_to_offset_bits(level))
4461                 size = VTD_PAGE_SIZE << level_to_offset_bits(level);
4462
4463         start_pfn = iova >> VTD_PAGE_SHIFT;
4464         last_pfn = (iova + size - 1) >> VTD_PAGE_SHIFT;
4465
4466         domain_unmap(dmar_domain, start_pfn, last_pfn, &gather->freelist);
4467
4468         if (dmar_domain->max_addr == iova + size)
4469                 dmar_domain->max_addr = iova;
4470
4471         iommu_iotlb_gather_add_page(domain, gather, iova, size);
4472
4473         return size;
4474 }
4475
4476 static size_t intel_iommu_unmap_pages(struct iommu_domain *domain,
4477                                       unsigned long iova,
4478                                       size_t pgsize, size_t pgcount,
4479                                       struct iommu_iotlb_gather *gather)
4480 {
4481         unsigned long pgshift = __ffs(pgsize);
4482         size_t size = pgcount << pgshift;
4483
4484         return intel_iommu_unmap(domain, iova, size, gather);
4485 }
4486
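/*
 * Flush the IOTLB for the gathered IOVA range on every IOMMU this domain
 * is attached to, then release the page-table pages collected on the
 * gather freelist.
 */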
4487 static void intel_iommu_tlb_sync(struct iommu_domain *domain,
4488                                  struct iommu_iotlb_gather *gather)
4489 {
4490         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4491         unsigned long iova_pfn = IOVA_PFN(gather->start);
4492         size_t size = gather->end - gather->start;
4493         unsigned long start_pfn;
4494         unsigned long nrpages;
4495         int iommu_id;
4496
4497         nrpages = aligned_nrpages(gather->start, size);
4498         start_pfn = mm_to_dma_pfn(iova_pfn);
4499
4500         for_each_domain_iommu(iommu_id, dmar_domain)
4501                 iommu_flush_iotlb_psi(g_iommus[iommu_id], dmar_domain,
4502                                       start_pfn, nrpages,
4503                                       list_empty(&gather->freelist), 0);
4504
4505         put_pages_list(&gather->freelist);
4506 }
4507
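/*
 * Translate @iova to a physical address by walking the domain's page
 * table.  The offset within a large (2M/1G) page is preserved.  Returns 0
 * if no translation is present.
 */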
4508 static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
4509                                             dma_addr_t iova)
4510 {
4511         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4512         struct dma_pte *pte;
4513         int level = 0;
4514         u64 phys = 0;
4515
4516         pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level);
4517         if (pte && dma_pte_present(pte))
4518                 phys = dma_pte_addr(pte) +
4519                         (iova & (BIT_MASK(level_to_offset_bits(level) +
4520                                                 VTD_PAGE_SHIFT) - 1));
4521
4522         return phys;
4523 }
4524
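/*
 * Force-snooping can only be enforced if every IOMMU that has devices
 * attached to this domain supports snoop control (the SC bit in ECAP).
 */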
4525 static bool domain_support_force_snooping(struct dmar_domain *domain)
4526 {
4527         struct device_domain_info *info;
4528         bool support = true;
4529
4530         assert_spin_locked(&device_domain_lock);
4531         list_for_each_entry(info, &domain->devices, link) {
4532                 if (!ecap_sc_support(info->iommu->ecap)) {
4533                         support = false;
4534                         break;
4535                 }
4536         }
4537
4538         return support;
4539 }
4540
4541 static void domain_set_force_snooping(struct dmar_domain *domain)
4542 {
4543         struct device_domain_info *info;
4544
4545         assert_spin_locked(&device_domain_lock);
4546
4547         /*
4548          * Second-level page tables support per-PTE snoop control, so the
4549          * iommu_map() path handles this by setting the SNP bit in each PTE.
4550          */
4551         if (!domain_use_first_level(domain)) {
4552                 domain->set_pte_snp = true;
4553                 return;
4554         }
4555
4556         list_for_each_entry(info, &domain->devices, link)
4557                 intel_pasid_setup_page_snoop_control(info->iommu, info->dev,
4558                                                      PASID_RID2PASID);
4559 }
4560
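/*
 * Make DMA through this domain snoop the CPU caches, either by setting
 * the SNP bit in second-level PTEs for future mappings or by configuring
 * per-PASID snoop control for first-level tables.  Once set,
 * force_snooping stays set for the lifetime of the domain.
 */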
4561 static bool intel_iommu_enforce_cache_coherency(struct iommu_domain *domain)
4562 {
4563         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4564         unsigned long flags;
4565
4566         if (dmar_domain->force_snooping)
4567                 return true;
4568
4569         spin_lock_irqsave(&device_domain_lock, flags);
4570         if (!domain_support_force_snooping(dmar_domain)) {
4571                 spin_unlock_irqrestore(&device_domain_lock, flags);
4572                 return false;
4573         }
4574
4575         domain_set_force_snooping(dmar_domain);
4576         dmar_domain->force_snooping = true;
4577         spin_unlock_irqrestore(&device_domain_lock, flags);
4578
4579         return true;
4580 }
4581
4582 static bool intel_iommu_capable(enum iommu_cap cap)
4583 {
4584         if (cap == IOMMU_CAP_CACHE_COHERENCY)
4585                 return true;
4586         if (cap == IOMMU_CAP_INTR_REMAP)
4587                 return irq_remapping_enabled == 1;
4588         if (cap == IOMMU_CAP_PRE_BOOT_PROTECTION)
4589                 return dmar_platform_optin();
4590
4591         return false;
4592 }
4593
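/*
 * probe_device callback: allocate the per-device device_domain_info,
 * record the bus/devfn/segment used for context-table lookups, probe the
 * device's ATS/PASID/PRI capabilities and add it to the global
 * device_domain_list.
 */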
4594 static struct iommu_device *intel_iommu_probe_device(struct device *dev)
4595 {
4596         struct pci_dev *pdev = dev_is_pci(dev) ? to_pci_dev(dev) : NULL;
4597         struct device_domain_info *info;
4598         struct intel_iommu *iommu;
4599         unsigned long flags;
4600         u8 bus, devfn;
4601
4602         iommu = device_to_iommu(dev, &bus, &devfn);
4603         if (!iommu)
4604                 return ERR_PTR(-ENODEV);
4605
4606         info = kzalloc(sizeof(*info), GFP_KERNEL);
4607         if (!info)
4608                 return ERR_PTR(-ENOMEM);
4609
4610         if (dev_is_real_dma_subdevice(dev)) {
4611                 info->bus = pdev->bus->number;
4612                 info->devfn = pdev->devfn;
4613                 info->segment = pci_domain_nr(pdev->bus);
4614         } else {
4615                 info->bus = bus;
4616                 info->devfn = devfn;
4617                 info->segment = iommu->segment;
4618         }
4619
4620         info->dev = dev;
4621         info->iommu = iommu;
4622         if (dev_is_pci(dev)) {
4623                 if (ecap_dev_iotlb_support(iommu->ecap) &&
4624                     pci_ats_supported(pdev) &&
4625                     dmar_ats_supported(pdev, iommu))
4626                         info->ats_supported = 1;
4627
4628                 if (sm_supported(iommu)) {
4629                         if (pasid_supported(iommu)) {
4630                                 int features = pci_pasid_features(pdev);
4631
4632                                 if (features >= 0)
4633                                         info->pasid_supported = features | 1;
4634                         }
4635
4636                         if (info->ats_supported && ecap_prs(iommu->ecap) &&
4637                             pci_pri_supported(pdev))
4638                                 info->pri_supported = 1;
4639                 }
4640         }
4641
4642         spin_lock_irqsave(&device_domain_lock, flags);
4643         list_add(&info->global, &device_domain_list);
4644         dev_iommu_priv_set(dev, info);
4645         spin_unlock_irqrestore(&device_domain_lock, flags);
4646
4647         return &iommu->iommu;
4648 }
4649
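/*
 * Undo intel_iommu_probe_device(): detach the device from its domain,
 * remove it from the global device list and free its device_domain_info.
 */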
4650 static void intel_iommu_release_device(struct device *dev)
4651 {
4652         struct device_domain_info *info = dev_iommu_priv_get(dev);
4653         unsigned long flags;
4654
4655         dmar_remove_one_dev_info(dev);
4656
4657         spin_lock_irqsave(&device_domain_lock, flags);
4658         dev_iommu_priv_set(dev, NULL);
4659         list_del(&info->global);
4660         spin_unlock_irqrestore(&device_domain_lock, flags);
4661
4662         kfree(info);
4663         set_dma_ops(dev, NULL);
4664 }
4665
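/*
 * Called once the device has joined its IOMMU group: clear any leftover
 * dma_ops and let the core install the dma-iommu DMA API ops for it.
 */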
4666 static void intel_iommu_probe_finalize(struct device *dev)
4667 {
4668         set_dma_ops(dev, NULL);
4669         iommu_setup_dma_ops(dev, 0, U64_MAX);
4670 }
4671
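/*
 * Report the reserved regions for @device: any RMRRs that target it (or a
 * bridge above it), the legacy low-16MB window for ISA bridges when
 * CONFIG_INTEL_IOMMU_FLOPPY_WA is enabled, and the IOAPIC MSI range.
 */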
4672 static void intel_iommu_get_resv_regions(struct device *device,
4673                                          struct list_head *head)
4674 {
4675         int prot = DMA_PTE_READ | DMA_PTE_WRITE;
4676         struct iommu_resv_region *reg;
4677         struct dmar_rmrr_unit *rmrr;
4678         struct device *i_dev;
4679         int i;
4680
4681         down_read(&dmar_global_lock);
4682         for_each_rmrr_units(rmrr) {
4683                 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
4684                                           i, i_dev) {
4685                         struct iommu_resv_region *resv;
4686                         enum iommu_resv_type type;
4687                         size_t length;
4688
4689                         if (i_dev != device &&
4690                             !is_downstream_to_pci_bridge(device, i_dev))
4691                                 continue;
4692
4693                         length = rmrr->end_address - rmrr->base_address + 1;
4694
4695                         type = device_rmrr_is_relaxable(device) ?
4696                                 IOMMU_RESV_DIRECT_RELAXABLE : IOMMU_RESV_DIRECT;
4697
4698                         resv = iommu_alloc_resv_region(rmrr->base_address,
4699                                                        length, prot, type);
4700                         if (!resv)
4701                                 break;
4702
4703                         list_add_tail(&resv->list, head);
4704                 }
4705         }
4706         up_read(&dmar_global_lock);
4707
4708 #ifdef CONFIG_INTEL_IOMMU_FLOPPY_WA
4709         if (dev_is_pci(device)) {
4710                 struct pci_dev *pdev = to_pci_dev(device);
4711
4712                 if ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA) {
4713                         reg = iommu_alloc_resv_region(0, 1UL << 24, prot,
4714                                                    IOMMU_RESV_DIRECT_RELAXABLE);
4715                         if (reg)
4716                                 list_add_tail(&reg->list, head);
4717                 }
4718         }
4719 #endif /* CONFIG_INTEL_IOMMU_FLOPPY_WA */
4720
4721         reg = iommu_alloc_resv_region(IOAPIC_RANGE_START,
4722                                       IOAPIC_RANGE_END - IOAPIC_RANGE_START + 1,
4723                                       0, IOMMU_RESV_MSI);
4724         if (!reg)
4725                 return;
4726         list_add_tail(&reg->list, head);
4727 }
4728
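/*
 * Enable PASID support for @dev: set the PASID-enable bit in its context
 * entry (flushing the context cache if it was previously clear) and turn
 * on the device-side PASID/PRI/ATS capabilities via
 * iommu_enable_dev_iotlb() if they are not enabled yet.
 */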
4729 int intel_iommu_enable_pasid(struct intel_iommu *iommu, struct device *dev)
4730 {
4731         struct device_domain_info *info = dev_iommu_priv_get(dev);
4732         struct context_entry *context;
4733         struct dmar_domain *domain;
4734         unsigned long flags;
4735         u64 ctx_lo;
4736         int ret;
4737
4738         domain = info->domain;
4739         if (!domain)
4740                 return -EINVAL;
4741
4742         spin_lock_irqsave(&device_domain_lock, flags);
4743         spin_lock(&iommu->lock);
4744
4745         ret = -EINVAL;
4746         if (!info->pasid_supported)
4747                 goto out;
4748
4749         context = iommu_context_addr(iommu, info->bus, info->devfn, 0);
4750         if (WARN_ON(!context))
4751                 goto out;
4752
4753         ctx_lo = context[0].lo;
4754
4755         if (!(ctx_lo & CONTEXT_PASIDE)) {
4756                 ctx_lo |= CONTEXT_PASIDE;
4757                 context[0].lo = ctx_lo;
4758                 wmb();
4759                 iommu->flush.flush_context(iommu,
4760                                            domain->iommu_did[iommu->seq_id],
4761                                            PCI_DEVID(info->bus, info->devfn),
4762                                            DMA_CCMD_MASK_NOBIT,
4763                                            DMA_CCMD_DEVICE_INVL);
4764         }
4765
4766         /* Enable PASID support in the device, if it wasn't already */
4767         if (!info->pasid_enabled)
4768                 iommu_enable_dev_iotlb(info);
4769
4770         ret = 0;
4771
4772  out:
4773         spin_unlock(&iommu->lock);
4774         spin_unlock_irqrestore(&device_domain_lock, flags);
4775
4776         return ret;
4777 }
4778
4779 static struct iommu_group *intel_iommu_device_group(struct device *dev)
4780 {
4781         if (dev_is_pci(dev))
4782                 return pci_device_group(dev);
4783         return generic_device_group(dev);
4784 }
4785
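/*
 * Enable Shared Virtual Addressing for @dev: requires an SVM-capable
 * IOMMU and PASID, PRI and ATS already enabled on the device; the device
 * is then hooked up to the IOMMU's I/O page fault queue.
 */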
4786 static int intel_iommu_enable_sva(struct device *dev)
4787 {
4788         struct device_domain_info *info = dev_iommu_priv_get(dev);
4789         struct intel_iommu *iommu;
4790         int ret;
4791
4792         if (!info || dmar_disabled)
4793                 return -EINVAL;
4794
4795         iommu = info->iommu;
4796         if (!iommu)
4797                 return -EINVAL;
4798
4799         if (!(iommu->flags & VTD_FLAG_SVM_CAPABLE))
4800                 return -ENODEV;
4801
4802         if (intel_iommu_enable_pasid(iommu, dev))
4803                 return -ENODEV;
4804
4805         if (!info->pasid_enabled || !info->pri_enabled || !info->ats_enabled)
4806                 return -EINVAL;
4807
4808         ret = iopf_queue_add_device(iommu->iopf_queue, dev);
4809         if (!ret)
4810                 ret = iommu_register_device_fault_handler(dev, iommu_queue_iopf, dev);
4811
4812         return ret;
4813 }
4814
4815 static int intel_iommu_disable_sva(struct device *dev)
4816 {
4817         struct device_domain_info *info = dev_iommu_priv_get(dev);
4818         struct intel_iommu *iommu = info->iommu;
4819         int ret;
4820
4821         ret = iommu_unregister_device_fault_handler(dev);
4822         if (!ret)
4823                 ret = iopf_queue_remove_device(iommu->iopf_queue, dev);
4824
4825         return ret;
4826 }
4827
4828 static int intel_iommu_enable_iopf(struct device *dev)
4829 {
4830         struct device_domain_info *info = dev_iommu_priv_get(dev);
4831
4832         if (info && info->pri_supported)
4833                 return 0;
4834
4835         return -ENODEV;
4836 }
4837
4838 static int
4839 intel_iommu_dev_enable_feat(struct device *dev, enum iommu_dev_features feat)
4840 {
4841         switch (feat) {
4842         case IOMMU_DEV_FEAT_IOPF:
4843                 return intel_iommu_enable_iopf(dev);
4844
4845         case IOMMU_DEV_FEAT_SVA:
4846                 return intel_iommu_enable_sva(dev);
4847
4848         default:
4849                 return -ENODEV;
4850         }
4851 }
4852
4853 static int
4854 intel_iommu_dev_disable_feat(struct device *dev, enum iommu_dev_features feat)
4855 {
4856         switch (feat) {
4857         case IOMMU_DEV_FEAT_IOPF:
4858                 return 0;
4859
4860         case IOMMU_DEV_FEAT_SVA:
4861                 return intel_iommu_disable_sva(dev);
4862
4863         default:
4864                 return -ENODEV;
4865         }
4866 }
4867
4868 static bool intel_iommu_is_attach_deferred(struct device *dev)
4869 {
4870         struct device_domain_info *info = dev_iommu_priv_get(dev);
4871
4872         return translation_pre_enabled(info->iommu) && !info->domain;
4873 }
4874
4875 /*
4876  * Check that the device does not live on an external-facing PCI port that
4877  * is marked as untrusted.  Such devices must not have quirks applied to
4878  * them, as that could allow them to bypass the IOMMU restrictions.
4879  */
4880 static bool risky_device(struct pci_dev *pdev)
4881 {
4882         if (pdev->untrusted) {
4883                 pci_info(pdev,
4884                          "Skipping IOMMU quirk for dev [%04X:%04X] on untrusted PCI link\n",
4885                          pdev->vendor, pdev->device);
4886                 pci_info(pdev, "Please check with your BIOS/Platform vendor about this\n");
4887                 return true;
4888         }
4889         return false;
4890 }
4891
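/*
 * iotlb_sync_map callback: after new mappings are installed, issue the
 * invalidations (or write-buffer flush) required in caching mode so the
 * hardware observes the updated page-table entries.
 */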
4892 static void intel_iommu_iotlb_sync_map(struct iommu_domain *domain,
4893                                        unsigned long iova, size_t size)
4894 {
4895         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4896         unsigned long pages = aligned_nrpages(iova, size);
4897         unsigned long pfn = iova >> VTD_PAGE_SHIFT;
4898         struct intel_iommu *iommu;
4899         int iommu_id;
4900
4901         for_each_domain_iommu(iommu_id, dmar_domain) {
4902                 iommu = g_iommus[iommu_id];
4903                 __mapping_notify_one(iommu, dmar_domain, pfn, pages);
4904         }
4905 }
4906
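/*
 * These callbacks are not called directly; they are reached through the
 * generic IOMMU API.  As a rough, illustrative sketch (not a complete or
 * verified sequence), a caller such as VFIO ends up here via:
 *
 *	dom = iommu_domain_alloc(dev->bus);
 *	iommu_attach_device(dom, dev);
 *	iommu_map(dom, iova, paddr, size, IOMMU_READ | IOMMU_WRITE);
 *
 * which lands in intel_iommu_domain_alloc(), intel_iommu_attach_device()
 * and intel_iommu_map_pages() respectively.
 */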
4907 const struct iommu_ops intel_iommu_ops = {
4908         .capable                = intel_iommu_capable,
4909         .domain_alloc           = intel_iommu_domain_alloc,
4910         .probe_device           = intel_iommu_probe_device,
4911         .probe_finalize         = intel_iommu_probe_finalize,
4912         .release_device         = intel_iommu_release_device,
4913         .get_resv_regions       = intel_iommu_get_resv_regions,
4914         .put_resv_regions       = generic_iommu_put_resv_regions,
4915         .device_group           = intel_iommu_device_group,
4916         .dev_enable_feat        = intel_iommu_dev_enable_feat,
4917         .dev_disable_feat       = intel_iommu_dev_disable_feat,
4918         .is_attach_deferred     = intel_iommu_is_attach_deferred,
4919         .def_domain_type        = device_def_domain_type,
4920         .pgsize_bitmap          = SZ_4K,
4921 #ifdef CONFIG_INTEL_IOMMU_SVM
4922         .sva_bind               = intel_svm_bind,
4923         .sva_unbind             = intel_svm_unbind,
4924         .sva_get_pasid          = intel_svm_get_pasid,
4925         .page_response          = intel_svm_page_response,
4926 #endif
4927         .default_domain_ops = &(const struct iommu_domain_ops) {
4928                 .attach_dev             = intel_iommu_attach_device,
4929                 .detach_dev             = intel_iommu_detach_device,
4930                 .map_pages              = intel_iommu_map_pages,
4931                 .unmap_pages            = intel_iommu_unmap_pages,
4932                 .iotlb_sync_map         = intel_iommu_iotlb_sync_map,
4933                 .flush_iotlb_all        = intel_flush_iotlb_all,
4934                 .iotlb_sync             = intel_iommu_tlb_sync,
4935                 .iova_to_phys           = intel_iommu_iova_to_phys,
4936                 .free                   = intel_iommu_domain_free,
4937                 .enforce_cache_coherency = intel_iommu_enforce_cache_coherency,
4938         }
4939 };
4940
4941 static void quirk_iommu_igfx(struct pci_dev *dev)
4942 {
4943         if (risky_device(dev))
4944                 return;
4945
4946         pci_info(dev, "Disabling IOMMU for graphics on this chipset\n");
4947         dmar_map_gfx = 0;
4948 }
4949
4950 /* G4x/GM45 integrated gfx dmar support is totally busted. */
4951 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_igfx);
4952 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_igfx);
4953 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_igfx);
4954 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_igfx);
4955 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_igfx);
4956 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_igfx);
4957 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_igfx);
4958
4959 /* Broadwell igfx malfunctions with dmar */
4960 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1606, quirk_iommu_igfx);
4961 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160B, quirk_iommu_igfx);
4962 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160E, quirk_iommu_igfx);
4963 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1602, quirk_iommu_igfx);
4964 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160A, quirk_iommu_igfx);
4965 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160D, quirk_iommu_igfx);
4966 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1616, quirk_iommu_igfx);
4967 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161B, quirk_iommu_igfx);
4968 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161E, quirk_iommu_igfx);
4969 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1612, quirk_iommu_igfx);
4970 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161A, quirk_iommu_igfx);
4971 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161D, quirk_iommu_igfx);
4972 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1626, quirk_iommu_igfx);
4973 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162B, quirk_iommu_igfx);
4974 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162E, quirk_iommu_igfx);
4975 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1622, quirk_iommu_igfx);
4976 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162A, quirk_iommu_igfx);
4977 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162D, quirk_iommu_igfx);
4978 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1636, quirk_iommu_igfx);
4979 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163B, quirk_iommu_igfx);
4980 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163E, quirk_iommu_igfx);
4981 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1632, quirk_iommu_igfx);
4982 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163A, quirk_iommu_igfx);
4983 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163D, quirk_iommu_igfx);
4984
4985 static void quirk_iommu_rwbf(struct pci_dev *dev)
4986 {
4987         if (risky_device(dev))
4988                 return;
4989
4990         /*
4991          * Mobile 4 Series Chipset neglects to set RWBF capability,
4992          * but needs it. Same seems to hold for the desktop versions.
4993          */
4994         pci_info(dev, "Forcing write-buffer flush capability\n");
4995         rwbf_quirk = 1;
4996 }
4997
4998 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf);
4999 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_rwbf);
5000 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_rwbf);
5001 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_rwbf);
5002 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_rwbf);
5003 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_rwbf);
5004 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_rwbf);
5005
5006 #define GGC 0x52
5007 #define GGC_MEMORY_SIZE_MASK    (0xf << 8)
5008 #define GGC_MEMORY_SIZE_NONE    (0x0 << 8)
5009 #define GGC_MEMORY_SIZE_1M      (0x1 << 8)
5010 #define GGC_MEMORY_SIZE_2M      (0x3 << 8)
5011 #define GGC_MEMORY_VT_ENABLED   (0x8 << 8)
5012 #define GGC_MEMORY_SIZE_2M_VT   (0x9 << 8)
5013 #define GGC_MEMORY_SIZE_3M_VT   (0xa << 8)
5014 #define GGC_MEMORY_SIZE_4M_VT   (0xb << 8)
5015
5016 static void quirk_calpella_no_shadow_gtt(struct pci_dev *dev)
5017 {
5018         unsigned short ggc;
5019
5020         if (risky_device(dev))
5021                 return;
5022
5023         if (pci_read_config_word(dev, GGC, &ggc))
5024                 return;
5025
5026         if (!(ggc & GGC_MEMORY_VT_ENABLED)) {
5027                 pci_info(dev, "BIOS has allocated no shadow GTT; disabling IOMMU for graphics\n");
5028                 dmar_map_gfx = 0;
5029         } else if (dmar_map_gfx) {
5030                 /* we have to ensure the gfx device is idle before we flush */
5031                 pci_info(dev, "Disabling batched IOTLB flush on Ironlake\n");
5032                 iommu_set_dma_strict();
5033         }
5034 }
5035 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0040, quirk_calpella_no_shadow_gtt);
5036 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0044, quirk_calpella_no_shadow_gtt);
5037 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0062, quirk_calpella_no_shadow_gtt);
5038 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x006a, quirk_calpella_no_shadow_gtt);
5039
5040 static void quirk_igfx_skip_te_disable(struct pci_dev *dev)
5041 {
5042         unsigned short ver;
5043
5044         if (!IS_GFX_DEVICE(dev))
5045                 return;
5046
5047         ver = (dev->device >> 8) & 0xff;
5048         if (ver != 0x45 && ver != 0x46 && ver != 0x4c &&
5049             ver != 0x4e && ver != 0x8a && ver != 0x98 &&
5050             ver != 0x9a && ver != 0xa7)
5051                 return;
5052
5053         if (risky_device(dev))
5054                 return;
5055
5056         pci_info(dev, "Skip IOMMU disabling for graphics\n");
5057         iommu_skip_te_disable = 1;
5058 }
5059 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_ANY_ID, quirk_igfx_skip_te_disable);
5060
5061 /*
5062  * On Tylersburg chipsets, some BIOSes have been known to enable the ISOCH
5063  * DMAR unit for the Azalia sound device, but not give it any TLB entries,
5064  * which causes it to deadlock.  Check for that.  We do this in a function
5065  * called from init_dmars(), instead of in a PCI quirk, because we don't want
5066  * to print the obnoxious "BIOS broken" message if VT-d is actually disabled.
5067  */
5068 static void __init check_tylersburg_isoch(void)
5069 {
5070         struct pci_dev *pdev;
5071         uint32_t vtisochctrl;
5072
5073         /* If there's no Azalia in the system anyway, forget it. */
5074         pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3a3e, NULL);
5075         if (!pdev)
5076                 return;
5077
5078         if (risky_device(pdev)) {
5079                 pci_dev_put(pdev);
5080                 return;
5081         }
5082
5083         pci_dev_put(pdev);
5084
5085         /* System Management Registers. Might be hidden, in which case
5086            we can't do the sanity check. But that's OK, because the
5087            known-broken BIOSes _don't_ actually hide it, so far. */
5088         pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x342e, NULL);
5089         if (!pdev)
5090                 return;
5091
5092         if (risky_device(pdev)) {
5093                 pci_dev_put(pdev);
5094                 return;
5095         }
5096
5097         if (pci_read_config_dword(pdev, 0x188, &vtisochctrl)) {
5098                 pci_dev_put(pdev);
5099                 return;
5100         }
5101
5102         pci_dev_put(pdev);
5103
5104         /* If Azalia DMA is routed to the non-isoch DMAR unit, fine. */
5105         if (vtisochctrl & 1)
5106                 return;
5107
5108         /* Drop all bits other than the number of TLB entries */
5109         vtisochctrl &= 0x1c;
5110
5111         /* If we have the recommended number of TLB entries (16), fine. */
5112         if (vtisochctrl == 0x10)
5113                 return;
5114
5115         /* Zero TLB entries? The BIOS is hopelessly broken; work around it. */
5116         if (!vtisochctrl) {
5117                 WARN(1, "Your BIOS is broken; DMA routed to ISOCH DMAR unit but no TLB space.\n"
5118                      "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
5119                      dmi_get_system_info(DMI_BIOS_VENDOR),
5120                      dmi_get_system_info(DMI_BIOS_VERSION),
5121                      dmi_get_system_info(DMI_PRODUCT_VERSION));
5122                 iommu_identity_mapping |= IDENTMAP_AZALIA;
5123                 return;
5124         }
5125
5126         pr_warn("Recommended TLB entry count for ISOCH unit is 16; your BIOS set %d\n",
5127                 vtisochctrl);
5128 }