Merge tag 'block-6.1-2022-10-20' of git://git.kernel.dk/linux
[linux-block.git] / drivers / iommu / intel / iommu.c
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * Copyright © 2006-2014 Intel Corporation.
4  *
5  * Authors: David Woodhouse <dwmw2@infradead.org>,
6  *          Ashok Raj <ashok.raj@intel.com>,
7  *          Shaohua Li <shaohua.li@intel.com>,
8  *          Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>,
9  *          Fenghua Yu <fenghua.yu@intel.com>
10  *          Joerg Roedel <jroedel@suse.de>
11  */
12
13 #define pr_fmt(fmt)     "DMAR: " fmt
14 #define dev_fmt(fmt)    pr_fmt(fmt)
15
16 #include <linux/crash_dump.h>
17 #include <linux/dma-direct.h>
18 #include <linux/dmi.h>
19 #include <linux/intel-svm.h>
20 #include <linux/memory.h>
21 #include <linux/pci.h>
22 #include <linux/pci-ats.h>
23 #include <linux/spinlock.h>
24 #include <linux/syscore_ops.h>
25 #include <linux/tboot.h>
26
27 #include "iommu.h"
28 #include "../dma-iommu.h"
29 #include "../irq_remapping.h"
30 #include "../iommu-sva-lib.h"
31 #include "pasid.h"
32 #include "cap_audit.h"
33
34 #define ROOT_SIZE               VTD_PAGE_SIZE
35 #define CONTEXT_SIZE            VTD_PAGE_SIZE
36
37 #define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
38 #define IS_USB_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_SERIAL_USB)
39 #define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
40 #define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e)
41
42 #define IOAPIC_RANGE_START      (0xfee00000)
43 #define IOAPIC_RANGE_END        (0xfeefffff)
44 #define IOVA_START_ADDR         (0x1000)
45
46 #define DEFAULT_DOMAIN_ADDRESS_WIDTH 57
47
48 #define MAX_AGAW_WIDTH 64
49 #define MAX_AGAW_PFN_WIDTH      (MAX_AGAW_WIDTH - VTD_PAGE_SHIFT)
50
51 #define __DOMAIN_MAX_PFN(gaw)  ((((uint64_t)1) << ((gaw) - VTD_PAGE_SHIFT)) - 1)
52 #define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << (gaw)) - 1)
53
54 /* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR
55    to match. That way, we can use 'unsigned long' for PFNs with impunity. */
56 #define DOMAIN_MAX_PFN(gaw)     ((unsigned long) min_t(uint64_t, \
57                                 __DOMAIN_MAX_PFN(gaw), (unsigned long)-1))
58 #define DOMAIN_MAX_ADDR(gaw)    (((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT)
59
60 /* IO virtual address start page frame number */
61 #define IOVA_START_PFN          (1)
62
63 #define IOVA_PFN(addr)          ((addr) >> PAGE_SHIFT)
64
65 /* page table handling */
66 #define LEVEL_STRIDE            (9)
67 #define LEVEL_MASK              (((u64)1 << LEVEL_STRIDE) - 1)
68
69 static inline int agaw_to_level(int agaw)
70 {
71         return agaw + 2;
72 }
73
74 static inline int agaw_to_width(int agaw)
75 {
76         return min_t(int, 30 + agaw * LEVEL_STRIDE, MAX_AGAW_WIDTH);
77 }
78
79 static inline int width_to_agaw(int width)
80 {
81         return DIV_ROUND_UP(width - 30, LEVEL_STRIDE);
82 }
83
84 static inline unsigned int level_to_offset_bits(int level)
85 {
86         return (level - 1) * LEVEL_STRIDE;
87 }
88
89 static inline int pfn_level_offset(u64 pfn, int level)
90 {
91         return (pfn >> level_to_offset_bits(level)) & LEVEL_MASK;
92 }
93
94 static inline u64 level_mask(int level)
95 {
96         return -1ULL << level_to_offset_bits(level);
97 }
98
99 static inline u64 level_size(int level)
100 {
101         return 1ULL << level_to_offset_bits(level);
102 }
103
104 static inline u64 align_to_level(u64 pfn, int level)
105 {
106         return (pfn + level_size(level) - 1) & level_mask(level);
107 }
108
109 static inline unsigned long lvl_to_nr_pages(unsigned int lvl)
110 {
111         return 1UL << min_t(int, (lvl - 1) * LEVEL_STRIDE, MAX_AGAW_PFN_WIDTH);
112 }
113
114 /* VT-d pages must always be _smaller_ than MM pages. Otherwise things
115    are never going to work. */
116 static inline unsigned long mm_to_dma_pfn(unsigned long mm_pfn)
117 {
118         return mm_pfn << (PAGE_SHIFT - VTD_PAGE_SHIFT);
119 }
120 static inline unsigned long page_to_dma_pfn(struct page *pg)
121 {
122         return mm_to_dma_pfn(page_to_pfn(pg));
123 }
124 static inline unsigned long virt_to_dma_pfn(void *p)
125 {
126         return page_to_dma_pfn(virt_to_page(p));
127 }
128
129 static void __init check_tylersburg_isoch(void);
130 static int rwbf_quirk;
131
132 /*
133  * set to 1 to panic kernel if can't successfully enable VT-d
134  * (used when kernel is launched w/ TXT)
135  */
136 static int force_on = 0;
137 static int intel_iommu_tboot_noforce;
138 static int no_platform_optin;
139
140 #define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
141
142 /*
143  * Take a root_entry and return the Lower Context Table Pointer (LCTP)
144  * if marked present.
145  */
146 static phys_addr_t root_entry_lctp(struct root_entry *re)
147 {
148         if (!(re->lo & 1))
149                 return 0;
150
151         return re->lo & VTD_PAGE_MASK;
152 }
153
154 /*
155  * Take a root_entry and return the Upper Context Table Pointer (UCTP)
156  * if marked present.
157  */
158 static phys_addr_t root_entry_uctp(struct root_entry *re)
159 {
160         if (!(re->hi & 1))
161                 return 0;
162
163         return re->hi & VTD_PAGE_MASK;
164 }
165
166 static inline void context_set_present(struct context_entry *context)
167 {
168         context->lo |= 1;
169 }
170
171 static inline void context_set_fault_enable(struct context_entry *context)
172 {
173         context->lo &= (((u64)-1) << 2) | 1;
174 }
175
176 static inline void context_set_translation_type(struct context_entry *context,
177                                                 unsigned long value)
178 {
179         context->lo &= (((u64)-1) << 4) | 3;
180         context->lo |= (value & 3) << 2;
181 }
182
183 static inline void context_set_address_root(struct context_entry *context,
184                                             unsigned long value)
185 {
186         context->lo &= ~VTD_PAGE_MASK;
187         context->lo |= value & VTD_PAGE_MASK;
188 }
189
190 static inline void context_set_address_width(struct context_entry *context,
191                                              unsigned long value)
192 {
193         context->hi |= value & 7;
194 }
195
196 static inline void context_set_domain_id(struct context_entry *context,
197                                          unsigned long value)
198 {
199         context->hi |= (value & ((1 << 16) - 1)) << 8;
200 }
201
202 static inline void context_set_pasid(struct context_entry *context)
203 {
204         context->lo |= CONTEXT_PASIDE;
205 }
206
207 static inline int context_domain_id(struct context_entry *c)
208 {
209         return((c->hi >> 8) & 0xffff);
210 }
211
212 static inline void context_clear_entry(struct context_entry *context)
213 {
214         context->lo = 0;
215         context->hi = 0;
216 }
217
218 static inline bool context_copied(struct intel_iommu *iommu, u8 bus, u8 devfn)
219 {
220         if (!iommu->copied_tables)
221                 return false;
222
223         return test_bit(((long)bus << 8) | devfn, iommu->copied_tables);
224 }
225
226 static inline void
227 set_context_copied(struct intel_iommu *iommu, u8 bus, u8 devfn)
228 {
229         set_bit(((long)bus << 8) | devfn, iommu->copied_tables);
230 }
231
232 static inline void
233 clear_context_copied(struct intel_iommu *iommu, u8 bus, u8 devfn)
234 {
235         clear_bit(((long)bus << 8) | devfn, iommu->copied_tables);
236 }
237
238 /*
239  * This domain is a statically identity mapping domain.
240  *      1. This domain creats a static 1:1 mapping to all usable memory.
241  *      2. It maps to each iommu if successful.
242  *      3. Each iommu mapps to this domain if successful.
243  */
244 static struct dmar_domain *si_domain;
245 static int hw_pass_through = 1;
246
247 struct dmar_rmrr_unit {
248         struct list_head list;          /* list of rmrr units   */
249         struct acpi_dmar_header *hdr;   /* ACPI header          */
250         u64     base_address;           /* reserved base address*/
251         u64     end_address;            /* reserved end address */
252         struct dmar_dev_scope *devices; /* target devices */
253         int     devices_cnt;            /* target device count */
254 };
255
256 struct dmar_atsr_unit {
257         struct list_head list;          /* list of ATSR units */
258         struct acpi_dmar_header *hdr;   /* ACPI header */
259         struct dmar_dev_scope *devices; /* target devices */
260         int devices_cnt;                /* target device count */
261         u8 include_all:1;               /* include all ports */
262 };
263
264 struct dmar_satc_unit {
265         struct list_head list;          /* list of SATC units */
266         struct acpi_dmar_header *hdr;   /* ACPI header */
267         struct dmar_dev_scope *devices; /* target devices */
268         struct intel_iommu *iommu;      /* the corresponding iommu */
269         int devices_cnt;                /* target device count */
270         u8 atc_required:1;              /* ATS is required */
271 };
272
273 static LIST_HEAD(dmar_atsr_units);
274 static LIST_HEAD(dmar_rmrr_units);
275 static LIST_HEAD(dmar_satc_units);
276
277 #define for_each_rmrr_units(rmrr) \
278         list_for_each_entry(rmrr, &dmar_rmrr_units, list)
279
280 static void dmar_remove_one_dev_info(struct device *dev);
281
282 int dmar_disabled = !IS_ENABLED(CONFIG_INTEL_IOMMU_DEFAULT_ON);
283 int intel_iommu_sm = IS_ENABLED(CONFIG_INTEL_IOMMU_SCALABLE_MODE_DEFAULT_ON);
284
285 int intel_iommu_enabled = 0;
286 EXPORT_SYMBOL_GPL(intel_iommu_enabled);
287
288 static int dmar_map_gfx = 1;
289 static int intel_iommu_superpage = 1;
290 static int iommu_identity_mapping;
291 static int iommu_skip_te_disable;
292
293 #define IDENTMAP_GFX            2
294 #define IDENTMAP_AZALIA         4
295
296 const struct iommu_ops intel_iommu_ops;
297
298 static bool translation_pre_enabled(struct intel_iommu *iommu)
299 {
300         return (iommu->flags & VTD_FLAG_TRANS_PRE_ENABLED);
301 }
302
303 static void clear_translation_pre_enabled(struct intel_iommu *iommu)
304 {
305         iommu->flags &= ~VTD_FLAG_TRANS_PRE_ENABLED;
306 }
307
308 static void init_translation_status(struct intel_iommu *iommu)
309 {
310         u32 gsts;
311
312         gsts = readl(iommu->reg + DMAR_GSTS_REG);
313         if (gsts & DMA_GSTS_TES)
314                 iommu->flags |= VTD_FLAG_TRANS_PRE_ENABLED;
315 }
316
317 static int __init intel_iommu_setup(char *str)
318 {
319         if (!str)
320                 return -EINVAL;
321
322         while (*str) {
323                 if (!strncmp(str, "on", 2)) {
324                         dmar_disabled = 0;
325                         pr_info("IOMMU enabled\n");
326                 } else if (!strncmp(str, "off", 3)) {
327                         dmar_disabled = 1;
328                         no_platform_optin = 1;
329                         pr_info("IOMMU disabled\n");
330                 } else if (!strncmp(str, "igfx_off", 8)) {
331                         dmar_map_gfx = 0;
332                         pr_info("Disable GFX device mapping\n");
333                 } else if (!strncmp(str, "forcedac", 8)) {
334                         pr_warn("intel_iommu=forcedac deprecated; use iommu.forcedac instead\n");
335                         iommu_dma_forcedac = true;
336                 } else if (!strncmp(str, "strict", 6)) {
337                         pr_warn("intel_iommu=strict deprecated; use iommu.strict=1 instead\n");
338                         iommu_set_dma_strict();
339                 } else if (!strncmp(str, "sp_off", 6)) {
340                         pr_info("Disable supported super page\n");
341                         intel_iommu_superpage = 0;
342                 } else if (!strncmp(str, "sm_on", 5)) {
343                         pr_info("Enable scalable mode if hardware supports\n");
344                         intel_iommu_sm = 1;
345                 } else if (!strncmp(str, "sm_off", 6)) {
346                         pr_info("Scalable mode is disallowed\n");
347                         intel_iommu_sm = 0;
348                 } else if (!strncmp(str, "tboot_noforce", 13)) {
349                         pr_info("Intel-IOMMU: not forcing on after tboot. This could expose security risk for tboot\n");
350                         intel_iommu_tboot_noforce = 1;
351                 } else {
352                         pr_notice("Unknown option - '%s'\n", str);
353                 }
354
355                 str += strcspn(str, ",");
356                 while (*str == ',')
357                         str++;
358         }
359
360         return 1;
361 }
362 __setup("intel_iommu=", intel_iommu_setup);
363
364 void *alloc_pgtable_page(int node)
365 {
366         struct page *page;
367         void *vaddr = NULL;
368
369         page = alloc_pages_node(node, GFP_ATOMIC | __GFP_ZERO, 0);
370         if (page)
371                 vaddr = page_address(page);
372         return vaddr;
373 }
374
375 void free_pgtable_page(void *vaddr)
376 {
377         free_page((unsigned long)vaddr);
378 }
379
380 static inline int domain_type_is_si(struct dmar_domain *domain)
381 {
382         return domain->domain.type == IOMMU_DOMAIN_IDENTITY;
383 }
384
385 static inline bool domain_use_first_level(struct dmar_domain *domain)
386 {
387         return domain->flags & DOMAIN_FLAG_USE_FIRST_LEVEL;
388 }
389
390 static inline int domain_pfn_supported(struct dmar_domain *domain,
391                                        unsigned long pfn)
392 {
393         int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
394
395         return !(addr_width < BITS_PER_LONG && pfn >> addr_width);
396 }
397
398 /*
399  * Calculate the Supported Adjusted Guest Address Widths of an IOMMU.
400  * Refer to 11.4.2 of the VT-d spec for the encoding of each bit of
401  * the returned SAGAW.
402  */
403 static unsigned long __iommu_calculate_sagaw(struct intel_iommu *iommu)
404 {
405         unsigned long fl_sagaw, sl_sagaw;
406
407         fl_sagaw = BIT(2) | (cap_fl5lp_support(iommu->cap) ? BIT(3) : 0);
408         sl_sagaw = cap_sagaw(iommu->cap);
409
410         /* Second level only. */
411         if (!sm_supported(iommu) || !ecap_flts(iommu->ecap))
412                 return sl_sagaw;
413
414         /* First level only. */
415         if (!ecap_slts(iommu->ecap))
416                 return fl_sagaw;
417
418         return fl_sagaw & sl_sagaw;
419 }
420
421 static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw)
422 {
423         unsigned long sagaw;
424         int agaw;
425
426         sagaw = __iommu_calculate_sagaw(iommu);
427         for (agaw = width_to_agaw(max_gaw); agaw >= 0; agaw--) {
428                 if (test_bit(agaw, &sagaw))
429                         break;
430         }
431
432         return agaw;
433 }
434
435 /*
436  * Calculate max SAGAW for each iommu.
437  */
438 int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
439 {
440         return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH);
441 }
442
443 /*
444  * calculate agaw for each iommu.
445  * "SAGAW" may be different across iommus, use a default agaw, and
446  * get a supported less agaw for iommus that don't support the default agaw.
447  */
448 int iommu_calculate_agaw(struct intel_iommu *iommu)
449 {
450         return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH);
451 }
452
453 static inline bool iommu_paging_structure_coherency(struct intel_iommu *iommu)
454 {
455         return sm_supported(iommu) ?
456                         ecap_smpwc(iommu->ecap) : ecap_coherent(iommu->ecap);
457 }
458
459 static void domain_update_iommu_coherency(struct dmar_domain *domain)
460 {
461         struct iommu_domain_info *info;
462         struct dmar_drhd_unit *drhd;
463         struct intel_iommu *iommu;
464         bool found = false;
465         unsigned long i;
466
467         domain->iommu_coherency = true;
468         xa_for_each(&domain->iommu_array, i, info) {
469                 found = true;
470                 if (!iommu_paging_structure_coherency(info->iommu)) {
471                         domain->iommu_coherency = false;
472                         break;
473                 }
474         }
475         if (found)
476                 return;
477
478         /* No hardware attached; use lowest common denominator */
479         rcu_read_lock();
480         for_each_active_iommu(iommu, drhd) {
481                 if (!iommu_paging_structure_coherency(iommu)) {
482                         domain->iommu_coherency = false;
483                         break;
484                 }
485         }
486         rcu_read_unlock();
487 }
488
489 static int domain_update_iommu_superpage(struct dmar_domain *domain,
490                                          struct intel_iommu *skip)
491 {
492         struct dmar_drhd_unit *drhd;
493         struct intel_iommu *iommu;
494         int mask = 0x3;
495
496         if (!intel_iommu_superpage)
497                 return 0;
498
499         /* set iommu_superpage to the smallest common denominator */
500         rcu_read_lock();
501         for_each_active_iommu(iommu, drhd) {
502                 if (iommu != skip) {
503                         if (domain && domain_use_first_level(domain)) {
504                                 if (!cap_fl1gp_support(iommu->cap))
505                                         mask = 0x1;
506                         } else {
507                                 mask &= cap_super_page_val(iommu->cap);
508                         }
509
510                         if (!mask)
511                                 break;
512                 }
513         }
514         rcu_read_unlock();
515
516         return fls(mask);
517 }
518
519 static int domain_update_device_node(struct dmar_domain *domain)
520 {
521         struct device_domain_info *info;
522         int nid = NUMA_NO_NODE;
523         unsigned long flags;
524
525         spin_lock_irqsave(&domain->lock, flags);
526         list_for_each_entry(info, &domain->devices, link) {
527                 /*
528                  * There could possibly be multiple device numa nodes as devices
529                  * within the same domain may sit behind different IOMMUs. There
530                  * isn't perfect answer in such situation, so we select first
531                  * come first served policy.
532                  */
533                 nid = dev_to_node(info->dev);
534                 if (nid != NUMA_NO_NODE)
535                         break;
536         }
537         spin_unlock_irqrestore(&domain->lock, flags);
538
539         return nid;
540 }
541
542 static void domain_update_iotlb(struct dmar_domain *domain);
543
544 /* Return the super pagesize bitmap if supported. */
545 static unsigned long domain_super_pgsize_bitmap(struct dmar_domain *domain)
546 {
547         unsigned long bitmap = 0;
548
549         /*
550          * 1-level super page supports page size of 2MiB, 2-level super page
551          * supports page size of both 2MiB and 1GiB.
552          */
553         if (domain->iommu_superpage == 1)
554                 bitmap |= SZ_2M;
555         else if (domain->iommu_superpage == 2)
556                 bitmap |= SZ_2M | SZ_1G;
557
558         return bitmap;
559 }
560
561 /* Some capabilities may be different across iommus */
562 static void domain_update_iommu_cap(struct dmar_domain *domain)
563 {
564         domain_update_iommu_coherency(domain);
565         domain->iommu_superpage = domain_update_iommu_superpage(domain, NULL);
566
567         /*
568          * If RHSA is missing, we should default to the device numa domain
569          * as fall back.
570          */
571         if (domain->nid == NUMA_NO_NODE)
572                 domain->nid = domain_update_device_node(domain);
573
574         /*
575          * First-level translation restricts the input-address to a
576          * canonical address (i.e., address bits 63:N have the same
577          * value as address bit [N-1], where N is 48-bits with 4-level
578          * paging and 57-bits with 5-level paging). Hence, skip bit
579          * [N-1].
580          */
581         if (domain_use_first_level(domain))
582                 domain->domain.geometry.aperture_end = __DOMAIN_MAX_ADDR(domain->gaw - 1);
583         else
584                 domain->domain.geometry.aperture_end = __DOMAIN_MAX_ADDR(domain->gaw);
585
586         domain->domain.pgsize_bitmap |= domain_super_pgsize_bitmap(domain);
587         domain_update_iotlb(domain);
588 }
589
590 struct context_entry *iommu_context_addr(struct intel_iommu *iommu, u8 bus,
591                                          u8 devfn, int alloc)
592 {
593         struct root_entry *root = &iommu->root_entry[bus];
594         struct context_entry *context;
595         u64 *entry;
596
597         /*
598          * Except that the caller requested to allocate a new entry,
599          * returning a copied context entry makes no sense.
600          */
601         if (!alloc && context_copied(iommu, bus, devfn))
602                 return NULL;
603
604         entry = &root->lo;
605         if (sm_supported(iommu)) {
606                 if (devfn >= 0x80) {
607                         devfn -= 0x80;
608                         entry = &root->hi;
609                 }
610                 devfn *= 2;
611         }
612         if (*entry & 1)
613                 context = phys_to_virt(*entry & VTD_PAGE_MASK);
614         else {
615                 unsigned long phy_addr;
616                 if (!alloc)
617                         return NULL;
618
619                 context = alloc_pgtable_page(iommu->node);
620                 if (!context)
621                         return NULL;
622
623                 __iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
624                 phy_addr = virt_to_phys((void *)context);
625                 *entry = phy_addr | 1;
626                 __iommu_flush_cache(iommu, entry, sizeof(*entry));
627         }
628         return &context[devfn];
629 }
630
631 /**
632  * is_downstream_to_pci_bridge - test if a device belongs to the PCI
633  *                               sub-hierarchy of a candidate PCI-PCI bridge
634  * @dev: candidate PCI device belonging to @bridge PCI sub-hierarchy
635  * @bridge: the candidate PCI-PCI bridge
636  *
637  * Return: true if @dev belongs to @bridge PCI sub-hierarchy, else false.
638  */
639 static bool
640 is_downstream_to_pci_bridge(struct device *dev, struct device *bridge)
641 {
642         struct pci_dev *pdev, *pbridge;
643
644         if (!dev_is_pci(dev) || !dev_is_pci(bridge))
645                 return false;
646
647         pdev = to_pci_dev(dev);
648         pbridge = to_pci_dev(bridge);
649
650         if (pbridge->subordinate &&
651             pbridge->subordinate->number <= pdev->bus->number &&
652             pbridge->subordinate->busn_res.end >= pdev->bus->number)
653                 return true;
654
655         return false;
656 }
657
658 static bool quirk_ioat_snb_local_iommu(struct pci_dev *pdev)
659 {
660         struct dmar_drhd_unit *drhd;
661         u32 vtbar;
662         int rc;
663
664         /* We know that this device on this chipset has its own IOMMU.
665          * If we find it under a different IOMMU, then the BIOS is lying
666          * to us. Hope that the IOMMU for this device is actually
667          * disabled, and it needs no translation...
668          */
669         rc = pci_bus_read_config_dword(pdev->bus, PCI_DEVFN(0, 0), 0xb0, &vtbar);
670         if (rc) {
671                 /* "can't" happen */
672                 dev_info(&pdev->dev, "failed to run vt-d quirk\n");
673                 return false;
674         }
675         vtbar &= 0xffff0000;
676
677         /* we know that the this iommu should be at offset 0xa000 from vtbar */
678         drhd = dmar_find_matched_drhd_unit(pdev);
679         if (!drhd || drhd->reg_base_addr - vtbar != 0xa000) {
680                 pr_warn_once(FW_BUG "BIOS assigned incorrect VT-d unit for Intel(R) QuickData Technology device\n");
681                 add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
682                 return true;
683         }
684
685         return false;
686 }
687
688 static bool iommu_is_dummy(struct intel_iommu *iommu, struct device *dev)
689 {
690         if (!iommu || iommu->drhd->ignored)
691                 return true;
692
693         if (dev_is_pci(dev)) {
694                 struct pci_dev *pdev = to_pci_dev(dev);
695
696                 if (pdev->vendor == PCI_VENDOR_ID_INTEL &&
697                     pdev->device == PCI_DEVICE_ID_INTEL_IOAT_SNB &&
698                     quirk_ioat_snb_local_iommu(pdev))
699                         return true;
700         }
701
702         return false;
703 }
704
705 struct intel_iommu *device_to_iommu(struct device *dev, u8 *bus, u8 *devfn)
706 {
707         struct dmar_drhd_unit *drhd = NULL;
708         struct pci_dev *pdev = NULL;
709         struct intel_iommu *iommu;
710         struct device *tmp;
711         u16 segment = 0;
712         int i;
713
714         if (!dev)
715                 return NULL;
716
717         if (dev_is_pci(dev)) {
718                 struct pci_dev *pf_pdev;
719
720                 pdev = pci_real_dma_dev(to_pci_dev(dev));
721
722                 /* VFs aren't listed in scope tables; we need to look up
723                  * the PF instead to find the IOMMU. */
724                 pf_pdev = pci_physfn(pdev);
725                 dev = &pf_pdev->dev;
726                 segment = pci_domain_nr(pdev->bus);
727         } else if (has_acpi_companion(dev))
728                 dev = &ACPI_COMPANION(dev)->dev;
729
730         rcu_read_lock();
731         for_each_iommu(iommu, drhd) {
732                 if (pdev && segment != drhd->segment)
733                         continue;
734
735                 for_each_active_dev_scope(drhd->devices,
736                                           drhd->devices_cnt, i, tmp) {
737                         if (tmp == dev) {
738                                 /* For a VF use its original BDF# not that of the PF
739                                  * which we used for the IOMMU lookup. Strictly speaking
740                                  * we could do this for all PCI devices; we only need to
741                                  * get the BDF# from the scope table for ACPI matches. */
742                                 if (pdev && pdev->is_virtfn)
743                                         goto got_pdev;
744
745                                 if (bus && devfn) {
746                                         *bus = drhd->devices[i].bus;
747                                         *devfn = drhd->devices[i].devfn;
748                                 }
749                                 goto out;
750                         }
751
752                         if (is_downstream_to_pci_bridge(dev, tmp))
753                                 goto got_pdev;
754                 }
755
756                 if (pdev && drhd->include_all) {
757 got_pdev:
758                         if (bus && devfn) {
759                                 *bus = pdev->bus->number;
760                                 *devfn = pdev->devfn;
761                         }
762                         goto out;
763                 }
764         }
765         iommu = NULL;
766 out:
767         if (iommu_is_dummy(iommu, dev))
768                 iommu = NULL;
769
770         rcu_read_unlock();
771
772         return iommu;
773 }
774
775 static void domain_flush_cache(struct dmar_domain *domain,
776                                void *addr, int size)
777 {
778         if (!domain->iommu_coherency)
779                 clflush_cache_range(addr, size);
780 }
781
782 static int device_context_mapped(struct intel_iommu *iommu, u8 bus, u8 devfn)
783 {
784         struct context_entry *context;
785         int ret = 0;
786
787         spin_lock(&iommu->lock);
788         context = iommu_context_addr(iommu, bus, devfn, 0);
789         if (context)
790                 ret = context_present(context);
791         spin_unlock(&iommu->lock);
792         return ret;
793 }
794
795 static void free_context_table(struct intel_iommu *iommu)
796 {
797         struct context_entry *context;
798         int i;
799
800         if (!iommu->root_entry)
801                 return;
802
803         for (i = 0; i < ROOT_ENTRY_NR; i++) {
804                 context = iommu_context_addr(iommu, i, 0, 0);
805                 if (context)
806                         free_pgtable_page(context);
807
808                 if (!sm_supported(iommu))
809                         continue;
810
811                 context = iommu_context_addr(iommu, i, 0x80, 0);
812                 if (context)
813                         free_pgtable_page(context);
814         }
815
816         free_pgtable_page(iommu->root_entry);
817         iommu->root_entry = NULL;
818 }
819
820 #ifdef CONFIG_DMAR_DEBUG
821 static void pgtable_walk(struct intel_iommu *iommu, unsigned long pfn,
822                          u8 bus, u8 devfn, struct dma_pte *parent, int level)
823 {
824         struct dma_pte *pte;
825         int offset;
826
827         while (1) {
828                 offset = pfn_level_offset(pfn, level);
829                 pte = &parent[offset];
830                 if (!pte || (dma_pte_superpage(pte) || !dma_pte_present(pte))) {
831                         pr_info("PTE not present at level %d\n", level);
832                         break;
833                 }
834
835                 pr_info("pte level: %d, pte value: 0x%016llx\n", level, pte->val);
836
837                 if (level == 1)
838                         break;
839
840                 parent = phys_to_virt(dma_pte_addr(pte));
841                 level--;
842         }
843 }
844
845 void dmar_fault_dump_ptes(struct intel_iommu *iommu, u16 source_id,
846                           unsigned long long addr, u32 pasid)
847 {
848         struct pasid_dir_entry *dir, *pde;
849         struct pasid_entry *entries, *pte;
850         struct context_entry *ctx_entry;
851         struct root_entry *rt_entry;
852         int i, dir_index, index, level;
853         u8 devfn = source_id & 0xff;
854         u8 bus = source_id >> 8;
855         struct dma_pte *pgtable;
856
857         pr_info("Dump %s table entries for IOVA 0x%llx\n", iommu->name, addr);
858
859         /* root entry dump */
860         rt_entry = &iommu->root_entry[bus];
861         if (!rt_entry) {
862                 pr_info("root table entry is not present\n");
863                 return;
864         }
865
866         if (sm_supported(iommu))
867                 pr_info("scalable mode root entry: hi 0x%016llx, low 0x%016llx\n",
868                         rt_entry->hi, rt_entry->lo);
869         else
870                 pr_info("root entry: 0x%016llx", rt_entry->lo);
871
872         /* context entry dump */
873         ctx_entry = iommu_context_addr(iommu, bus, devfn, 0);
874         if (!ctx_entry) {
875                 pr_info("context table entry is not present\n");
876                 return;
877         }
878
879         pr_info("context entry: hi 0x%016llx, low 0x%016llx\n",
880                 ctx_entry->hi, ctx_entry->lo);
881
882         /* legacy mode does not require PASID entries */
883         if (!sm_supported(iommu)) {
884                 level = agaw_to_level(ctx_entry->hi & 7);
885                 pgtable = phys_to_virt(ctx_entry->lo & VTD_PAGE_MASK);
886                 goto pgtable_walk;
887         }
888
889         /* get the pointer to pasid directory entry */
890         dir = phys_to_virt(ctx_entry->lo & VTD_PAGE_MASK);
891         if (!dir) {
892                 pr_info("pasid directory entry is not present\n");
893                 return;
894         }
895         /* For request-without-pasid, get the pasid from context entry */
896         if (intel_iommu_sm && pasid == INVALID_IOASID)
897                 pasid = PASID_RID2PASID;
898
899         dir_index = pasid >> PASID_PDE_SHIFT;
900         pde = &dir[dir_index];
901         pr_info("pasid dir entry: 0x%016llx\n", pde->val);
902
903         /* get the pointer to the pasid table entry */
904         entries = get_pasid_table_from_pde(pde);
905         if (!entries) {
906                 pr_info("pasid table entry is not present\n");
907                 return;
908         }
909         index = pasid & PASID_PTE_MASK;
910         pte = &entries[index];
911         for (i = 0; i < ARRAY_SIZE(pte->val); i++)
912                 pr_info("pasid table entry[%d]: 0x%016llx\n", i, pte->val[i]);
913
914         if (pasid_pte_get_pgtt(pte) == PASID_ENTRY_PGTT_FL_ONLY) {
915                 level = pte->val[2] & BIT_ULL(2) ? 5 : 4;
916                 pgtable = phys_to_virt(pte->val[2] & VTD_PAGE_MASK);
917         } else {
918                 level = agaw_to_level((pte->val[0] >> 2) & 0x7);
919                 pgtable = phys_to_virt(pte->val[0] & VTD_PAGE_MASK);
920         }
921
922 pgtable_walk:
923         pgtable_walk(iommu, addr >> VTD_PAGE_SHIFT, bus, devfn, pgtable, level);
924 }
925 #endif
926
927 static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
928                                       unsigned long pfn, int *target_level)
929 {
930         struct dma_pte *parent, *pte;
931         int level = agaw_to_level(domain->agaw);
932         int offset;
933
934         BUG_ON(!domain->pgd);
935
936         if (!domain_pfn_supported(domain, pfn))
937                 /* Address beyond IOMMU's addressing capabilities. */
938                 return NULL;
939
940         parent = domain->pgd;
941
942         while (1) {
943                 void *tmp_page;
944
945                 offset = pfn_level_offset(pfn, level);
946                 pte = &parent[offset];
947                 if (!*target_level && (dma_pte_superpage(pte) || !dma_pte_present(pte)))
948                         break;
949                 if (level == *target_level)
950                         break;
951
952                 if (!dma_pte_present(pte)) {
953                         uint64_t pteval;
954
955                         tmp_page = alloc_pgtable_page(domain->nid);
956
957                         if (!tmp_page)
958                                 return NULL;
959
960                         domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE);
961                         pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE;
962                         if (domain_use_first_level(domain)) {
963                                 pteval |= DMA_FL_PTE_XD | DMA_FL_PTE_US;
964                                 if (iommu_is_dma_domain(&domain->domain))
965                                         pteval |= DMA_FL_PTE_ACCESS;
966                         }
967                         if (cmpxchg64(&pte->val, 0ULL, pteval))
968                                 /* Someone else set it while we were thinking; use theirs. */
969                                 free_pgtable_page(tmp_page);
970                         else
971                                 domain_flush_cache(domain, pte, sizeof(*pte));
972                 }
973                 if (level == 1)
974                         break;
975
976                 parent = phys_to_virt(dma_pte_addr(pte));
977                 level--;
978         }
979
980         if (!*target_level)
981                 *target_level = level;
982
983         return pte;
984 }
985
986 /* return address's pte at specific level */
987 static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain,
988                                          unsigned long pfn,
989                                          int level, int *large_page)
990 {
991         struct dma_pte *parent, *pte;
992         int total = agaw_to_level(domain->agaw);
993         int offset;
994
995         parent = domain->pgd;
996         while (level <= total) {
997                 offset = pfn_level_offset(pfn, total);
998                 pte = &parent[offset];
999                 if (level == total)
1000                         return pte;
1001
1002                 if (!dma_pte_present(pte)) {
1003                         *large_page = total;
1004                         break;
1005                 }
1006
1007                 if (dma_pte_superpage(pte)) {
1008                         *large_page = total;
1009                         return pte;
1010                 }
1011
1012                 parent = phys_to_virt(dma_pte_addr(pte));
1013                 total--;
1014         }
1015         return NULL;
1016 }
1017
1018 /* clear last level pte, a tlb flush should be followed */
1019 static void dma_pte_clear_range(struct dmar_domain *domain,
1020                                 unsigned long start_pfn,
1021                                 unsigned long last_pfn)
1022 {
1023         unsigned int large_page;
1024         struct dma_pte *first_pte, *pte;
1025
1026         BUG_ON(!domain_pfn_supported(domain, start_pfn));
1027         BUG_ON(!domain_pfn_supported(domain, last_pfn));
1028         BUG_ON(start_pfn > last_pfn);
1029
1030         /* we don't need lock here; nobody else touches the iova range */
1031         do {
1032                 large_page = 1;
1033                 first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1, &large_page);
1034                 if (!pte) {
1035                         start_pfn = align_to_level(start_pfn + 1, large_page + 1);
1036                         continue;
1037                 }
1038                 do {
1039                         dma_clear_pte(pte);
1040                         start_pfn += lvl_to_nr_pages(large_page);
1041                         pte++;
1042                 } while (start_pfn <= last_pfn && !first_pte_in_page(pte));
1043
1044                 domain_flush_cache(domain, first_pte,
1045                                    (void *)pte - (void *)first_pte);
1046
1047         } while (start_pfn && start_pfn <= last_pfn);
1048 }
1049
1050 static void dma_pte_free_level(struct dmar_domain *domain, int level,
1051                                int retain_level, struct dma_pte *pte,
1052                                unsigned long pfn, unsigned long start_pfn,
1053                                unsigned long last_pfn)
1054 {
1055         pfn = max(start_pfn, pfn);
1056         pte = &pte[pfn_level_offset(pfn, level)];
1057
1058         do {
1059                 unsigned long level_pfn;
1060                 struct dma_pte *level_pte;
1061
1062                 if (!dma_pte_present(pte) || dma_pte_superpage(pte))
1063                         goto next;
1064
1065                 level_pfn = pfn & level_mask(level);
1066                 level_pte = phys_to_virt(dma_pte_addr(pte));
1067
1068                 if (level > 2) {
1069                         dma_pte_free_level(domain, level - 1, retain_level,
1070                                            level_pte, level_pfn, start_pfn,
1071                                            last_pfn);
1072                 }
1073
1074                 /*
1075                  * Free the page table if we're below the level we want to
1076                  * retain and the range covers the entire table.
1077                  */
1078                 if (level < retain_level && !(start_pfn > level_pfn ||
1079                       last_pfn < level_pfn + level_size(level) - 1)) {
1080                         dma_clear_pte(pte);
1081                         domain_flush_cache(domain, pte, sizeof(*pte));
1082                         free_pgtable_page(level_pte);
1083                 }
1084 next:
1085                 pfn += level_size(level);
1086         } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1087 }
1088
1089 /*
1090  * clear last level (leaf) ptes and free page table pages below the
1091  * level we wish to keep intact.
1092  */
1093 static void dma_pte_free_pagetable(struct dmar_domain *domain,
1094                                    unsigned long start_pfn,
1095                                    unsigned long last_pfn,
1096                                    int retain_level)
1097 {
1098         dma_pte_clear_range(domain, start_pfn, last_pfn);
1099
1100         /* We don't need lock here; nobody else touches the iova range */
1101         dma_pte_free_level(domain, agaw_to_level(domain->agaw), retain_level,
1102                            domain->pgd, 0, start_pfn, last_pfn);
1103
1104         /* free pgd */
1105         if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1106                 free_pgtable_page(domain->pgd);
1107                 domain->pgd = NULL;
1108         }
1109 }
1110
1111 /* When a page at a given level is being unlinked from its parent, we don't
1112    need to *modify* it at all. All we need to do is make a list of all the
1113    pages which can be freed just as soon as we've flushed the IOTLB and we
1114    know the hardware page-walk will no longer touch them.
1115    The 'pte' argument is the *parent* PTE, pointing to the page that is to
1116    be freed. */
1117 static void dma_pte_list_pagetables(struct dmar_domain *domain,
1118                                     int level, struct dma_pte *pte,
1119                                     struct list_head *freelist)
1120 {
1121         struct page *pg;
1122
1123         pg = pfn_to_page(dma_pte_addr(pte) >> PAGE_SHIFT);
1124         list_add_tail(&pg->lru, freelist);
1125
1126         if (level == 1)
1127                 return;
1128
1129         pte = page_address(pg);
1130         do {
1131                 if (dma_pte_present(pte) && !dma_pte_superpage(pte))
1132                         dma_pte_list_pagetables(domain, level - 1, pte, freelist);
1133                 pte++;
1134         } while (!first_pte_in_page(pte));
1135 }
1136
1137 static void dma_pte_clear_level(struct dmar_domain *domain, int level,
1138                                 struct dma_pte *pte, unsigned long pfn,
1139                                 unsigned long start_pfn, unsigned long last_pfn,
1140                                 struct list_head *freelist)
1141 {
1142         struct dma_pte *first_pte = NULL, *last_pte = NULL;
1143
1144         pfn = max(start_pfn, pfn);
1145         pte = &pte[pfn_level_offset(pfn, level)];
1146
1147         do {
1148                 unsigned long level_pfn = pfn & level_mask(level);
1149
1150                 if (!dma_pte_present(pte))
1151                         goto next;
1152
1153                 /* If range covers entire pagetable, free it */
1154                 if (start_pfn <= level_pfn &&
1155                     last_pfn >= level_pfn + level_size(level) - 1) {
1156                         /* These suborbinate page tables are going away entirely. Don't
1157                            bother to clear them; we're just going to *free* them. */
1158                         if (level > 1 && !dma_pte_superpage(pte))
1159                                 dma_pte_list_pagetables(domain, level - 1, pte, freelist);
1160
1161                         dma_clear_pte(pte);
1162                         if (!first_pte)
1163                                 first_pte = pte;
1164                         last_pte = pte;
1165                 } else if (level > 1) {
1166                         /* Recurse down into a level that isn't *entirely* obsolete */
1167                         dma_pte_clear_level(domain, level - 1,
1168                                             phys_to_virt(dma_pte_addr(pte)),
1169                                             level_pfn, start_pfn, last_pfn,
1170                                             freelist);
1171                 }
1172 next:
1173                 pfn = level_pfn + level_size(level);
1174         } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1175
1176         if (first_pte)
1177                 domain_flush_cache(domain, first_pte,
1178                                    (void *)++last_pte - (void *)first_pte);
1179 }
1180
1181 /* We can't just free the pages because the IOMMU may still be walking
1182    the page tables, and may have cached the intermediate levels. The
1183    pages can only be freed after the IOTLB flush has been done. */
1184 static void domain_unmap(struct dmar_domain *domain, unsigned long start_pfn,
1185                          unsigned long last_pfn, struct list_head *freelist)
1186 {
1187         BUG_ON(!domain_pfn_supported(domain, start_pfn));
1188         BUG_ON(!domain_pfn_supported(domain, last_pfn));
1189         BUG_ON(start_pfn > last_pfn);
1190
1191         /* we don't need lock here; nobody else touches the iova range */
1192         dma_pte_clear_level(domain, agaw_to_level(domain->agaw),
1193                             domain->pgd, 0, start_pfn, last_pfn, freelist);
1194
1195         /* free pgd */
1196         if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1197                 struct page *pgd_page = virt_to_page(domain->pgd);
1198                 list_add_tail(&pgd_page->lru, freelist);
1199                 domain->pgd = NULL;
1200         }
1201 }
1202
1203 /* iommu handling */
1204 static int iommu_alloc_root_entry(struct intel_iommu *iommu)
1205 {
1206         struct root_entry *root;
1207
1208         root = (struct root_entry *)alloc_pgtable_page(iommu->node);
1209         if (!root) {
1210                 pr_err("Allocating root entry for %s failed\n",
1211                         iommu->name);
1212                 return -ENOMEM;
1213         }
1214
1215         __iommu_flush_cache(iommu, root, ROOT_SIZE);
1216         iommu->root_entry = root;
1217
1218         return 0;
1219 }
1220
1221 static void iommu_set_root_entry(struct intel_iommu *iommu)
1222 {
1223         u64 addr;
1224         u32 sts;
1225         unsigned long flag;
1226
1227         addr = virt_to_phys(iommu->root_entry);
1228         if (sm_supported(iommu))
1229                 addr |= DMA_RTADDR_SMT;
1230
1231         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1232         dmar_writeq(iommu->reg + DMAR_RTADDR_REG, addr);
1233
1234         writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG);
1235
1236         /* Make sure hardware complete it */
1237         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1238                       readl, (sts & DMA_GSTS_RTPS), sts);
1239
1240         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1241
1242         /*
1243          * Hardware invalidates all DMA remapping hardware translation
1244          * caches as part of SRTP flow.
1245          */
1246         if (cap_esrtps(iommu->cap))
1247                 return;
1248
1249         iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
1250         if (sm_supported(iommu))
1251                 qi_flush_pasid_cache(iommu, 0, QI_PC_GLOBAL, 0);
1252         iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
1253 }
1254
1255 void iommu_flush_write_buffer(struct intel_iommu *iommu)
1256 {
1257         u32 val;
1258         unsigned long flag;
1259
1260         if (!rwbf_quirk && !cap_rwbf(iommu->cap))
1261                 return;
1262
1263         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1264         writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG);
1265
1266         /* Make sure hardware complete it */
1267         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1268                       readl, (!(val & DMA_GSTS_WBFS)), val);
1269
1270         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1271 }
1272
1273 /* return value determine if we need a write buffer flush */
1274 static void __iommu_flush_context(struct intel_iommu *iommu,
1275                                   u16 did, u16 source_id, u8 function_mask,
1276                                   u64 type)
1277 {
1278         u64 val = 0;
1279         unsigned long flag;
1280
1281         switch (type) {
1282         case DMA_CCMD_GLOBAL_INVL:
1283                 val = DMA_CCMD_GLOBAL_INVL;
1284                 break;
1285         case DMA_CCMD_DOMAIN_INVL:
1286                 val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
1287                 break;
1288         case DMA_CCMD_DEVICE_INVL:
1289                 val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
1290                         | DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
1291                 break;
1292         default:
1293                 BUG();
1294         }
1295         val |= DMA_CCMD_ICC;
1296
1297         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1298         dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
1299
1300         /* Make sure hardware complete it */
1301         IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
1302                 dmar_readq, (!(val & DMA_CCMD_ICC)), val);
1303
1304         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1305 }
1306
1307 /* return value determine if we need a write buffer flush */
1308 static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
1309                                 u64 addr, unsigned int size_order, u64 type)
1310 {
1311         int tlb_offset = ecap_iotlb_offset(iommu->ecap);
1312         u64 val = 0, val_iva = 0;
1313         unsigned long flag;
1314
1315         switch (type) {
1316         case DMA_TLB_GLOBAL_FLUSH:
1317                 /* global flush doesn't need set IVA_REG */
1318                 val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
1319                 break;
1320         case DMA_TLB_DSI_FLUSH:
1321                 val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1322                 break;
1323         case DMA_TLB_PSI_FLUSH:
1324                 val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1325                 /* IH bit is passed in as part of address */
1326                 val_iva = size_order | addr;
1327                 break;
1328         default:
1329                 BUG();
1330         }
1331         /* Note: set drain read/write */
1332 #if 0
1333         /*
1334          * This is probably to be super secure.. Looks like we can
1335          * ignore it without any impact.
1336          */
1337         if (cap_read_drain(iommu->cap))
1338                 val |= DMA_TLB_READ_DRAIN;
1339 #endif
1340         if (cap_write_drain(iommu->cap))
1341                 val |= DMA_TLB_WRITE_DRAIN;
1342
1343         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1344         /* Note: Only uses first TLB reg currently */
1345         if (val_iva)
1346                 dmar_writeq(iommu->reg + tlb_offset, val_iva);
1347         dmar_writeq(iommu->reg + tlb_offset + 8, val);
1348
1349         /* Make sure hardware complete it */
1350         IOMMU_WAIT_OP(iommu, tlb_offset + 8,
1351                 dmar_readq, (!(val & DMA_TLB_IVT)), val);
1352
1353         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1354
1355         /* check IOTLB invalidation granularity */
1356         if (DMA_TLB_IAIG(val) == 0)
1357                 pr_err("Flush IOTLB failed\n");
1358         if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
1359                 pr_debug("TLB flush request %Lx, actual %Lx\n",
1360                         (unsigned long long)DMA_TLB_IIRG(type),
1361                         (unsigned long long)DMA_TLB_IAIG(val));
1362 }
1363
1364 static struct device_domain_info *
1365 domain_lookup_dev_info(struct dmar_domain *domain,
1366                        struct intel_iommu *iommu, u8 bus, u8 devfn)
1367 {
1368         struct device_domain_info *info;
1369         unsigned long flags;
1370
1371         spin_lock_irqsave(&domain->lock, flags);
1372         list_for_each_entry(info, &domain->devices, link) {
1373                 if (info->iommu == iommu && info->bus == bus &&
1374                     info->devfn == devfn) {
1375                         spin_unlock_irqrestore(&domain->lock, flags);
1376                         return info;
1377                 }
1378         }
1379         spin_unlock_irqrestore(&domain->lock, flags);
1380
1381         return NULL;
1382 }
1383
1384 static void domain_update_iotlb(struct dmar_domain *domain)
1385 {
1386         struct device_domain_info *info;
1387         bool has_iotlb_device = false;
1388         unsigned long flags;
1389
1390         spin_lock_irqsave(&domain->lock, flags);
1391         list_for_each_entry(info, &domain->devices, link) {
1392                 if (info->ats_enabled) {
1393                         has_iotlb_device = true;
1394                         break;
1395                 }
1396         }
1397         domain->has_iotlb_device = has_iotlb_device;
1398         spin_unlock_irqrestore(&domain->lock, flags);
1399 }
1400
1401 static void iommu_enable_pci_caps(struct device_domain_info *info)
1402 {
1403         struct pci_dev *pdev;
1404
1405         if (!info || !dev_is_pci(info->dev))
1406                 return;
1407
1408         pdev = to_pci_dev(info->dev);
1409         /* For IOMMU that supports device IOTLB throttling (DIT), we assign
1410          * PFSID to the invalidation desc of a VF such that IOMMU HW can gauge
1411          * queue depth at PF level. If DIT is not set, PFSID will be treated as
1412          * reserved, which should be set to 0.
1413          */
1414         if (!ecap_dit(info->iommu->ecap))
1415                 info->pfsid = 0;
1416         else {
1417                 struct pci_dev *pf_pdev;
1418
1419                 /* pdev will be returned if device is not a vf */
1420                 pf_pdev = pci_physfn(pdev);
1421                 info->pfsid = pci_dev_id(pf_pdev);
1422         }
1423
1424         /* The PCIe spec, in its wisdom, declares that the behaviour of
1425            the device if you enable PASID support after ATS support is
1426            undefined. So always enable PASID support on devices which
1427            have it, even if we can't yet know if we're ever going to
1428            use it. */
1429         if (info->pasid_supported && !pci_enable_pasid(pdev, info->pasid_supported & ~1))
1430                 info->pasid_enabled = 1;
1431
1432         if (info->pri_supported &&
1433             (info->pasid_enabled ? pci_prg_resp_pasid_required(pdev) : 1)  &&
1434             !pci_reset_pri(pdev) && !pci_enable_pri(pdev, PRQ_DEPTH))
1435                 info->pri_enabled = 1;
1436
1437         if (info->ats_supported && pci_ats_page_aligned(pdev) &&
1438             !pci_enable_ats(pdev, VTD_PAGE_SHIFT)) {
1439                 info->ats_enabled = 1;
1440                 domain_update_iotlb(info->domain);
1441                 info->ats_qdep = pci_ats_queue_depth(pdev);
1442         }
1443 }
1444
1445 static void iommu_disable_dev_iotlb(struct device_domain_info *info)
1446 {
1447         struct pci_dev *pdev;
1448
1449         if (!dev_is_pci(info->dev))
1450                 return;
1451
1452         pdev = to_pci_dev(info->dev);
1453
1454         if (info->ats_enabled) {
1455                 pci_disable_ats(pdev);
1456                 info->ats_enabled = 0;
1457                 domain_update_iotlb(info->domain);
1458         }
1459
1460         if (info->pri_enabled) {
1461                 pci_disable_pri(pdev);
1462                 info->pri_enabled = 0;
1463         }
1464
1465         if (info->pasid_enabled) {
1466                 pci_disable_pasid(pdev);
1467                 info->pasid_enabled = 0;
1468         }
1469 }
1470
1471 static void __iommu_flush_dev_iotlb(struct device_domain_info *info,
1472                                     u64 addr, unsigned int mask)
1473 {
1474         u16 sid, qdep;
1475
1476         if (!info || !info->ats_enabled)
1477                 return;
1478
1479         sid = info->bus << 8 | info->devfn;
1480         qdep = info->ats_qdep;
1481         qi_flush_dev_iotlb(info->iommu, sid, info->pfsid,
1482                            qdep, addr, mask);
1483 }
1484
1485 static void iommu_flush_dev_iotlb(struct dmar_domain *domain,
1486                                   u64 addr, unsigned mask)
1487 {
1488         struct device_domain_info *info;
1489         unsigned long flags;
1490
1491         if (!domain->has_iotlb_device)
1492                 return;
1493
1494         spin_lock_irqsave(&domain->lock, flags);
1495         list_for_each_entry(info, &domain->devices, link)
1496                 __iommu_flush_dev_iotlb(info, addr, mask);
1497         spin_unlock_irqrestore(&domain->lock, flags);
1498 }
1499
1500 static void iommu_flush_iotlb_psi(struct intel_iommu *iommu,
1501                                   struct dmar_domain *domain,
1502                                   unsigned long pfn, unsigned int pages,
1503                                   int ih, int map)
1504 {
1505         unsigned int aligned_pages = __roundup_pow_of_two(pages);
1506         unsigned int mask = ilog2(aligned_pages);
1507         uint64_t addr = (uint64_t)pfn << VTD_PAGE_SHIFT;
1508         u16 did = domain_id_iommu(domain, iommu);
1509
1510         BUG_ON(pages == 0);
1511
1512         if (ih)
1513                 ih = 1 << 6;
1514
1515         if (domain_use_first_level(domain)) {
1516                 qi_flush_piotlb(iommu, did, PASID_RID2PASID, addr, pages, ih);
1517         } else {
1518                 unsigned long bitmask = aligned_pages - 1;
1519
1520                 /*
1521                  * PSI masks the low order bits of the base address. If the
1522                  * address isn't aligned to the mask, then compute a mask value
1523                  * needed to ensure the target range is flushed.
1524                  */
1525                 if (unlikely(bitmask & pfn)) {
1526                         unsigned long end_pfn = pfn + pages - 1, shared_bits;
1527
1528                         /*
1529                          * Since end_pfn <= pfn + bitmask, the only way bits
1530                          * higher than bitmask can differ in pfn and end_pfn is
1531                          * by carrying. This means after masking out bitmask,
1532                          * high bits starting with the first set bit in
1533                          * shared_bits are all equal in both pfn and end_pfn.
1534                          */
1535                         shared_bits = ~(pfn ^ end_pfn) & ~bitmask;
1536                         mask = shared_bits ? __ffs(shared_bits) : BITS_PER_LONG;
1537                 }
1538
1539                 /*
1540                  * Fallback to domain selective flush if no PSI support or
1541                  * the size is too big.
1542                  */
1543                 if (!cap_pgsel_inv(iommu->cap) ||
1544                     mask > cap_max_amask_val(iommu->cap))
1545                         iommu->flush.flush_iotlb(iommu, did, 0, 0,
1546                                                         DMA_TLB_DSI_FLUSH);
1547                 else
1548                         iommu->flush.flush_iotlb(iommu, did, addr | ih, mask,
1549                                                         DMA_TLB_PSI_FLUSH);
1550         }
1551
1552         /*
1553          * In caching mode, changes of pages from non-present to present require
1554          * flush. However, device IOTLB doesn't need to be flushed in this case.
1555          */
1556         if (!cap_caching_mode(iommu->cap) || !map)
1557                 iommu_flush_dev_iotlb(domain, addr, mask);
1558 }
1559
1560 /* Notification for newly created mappings */
1561 static inline void __mapping_notify_one(struct intel_iommu *iommu,
1562                                         struct dmar_domain *domain,
1563                                         unsigned long pfn, unsigned int pages)
1564 {
1565         /*
1566          * It's a non-present to present mapping. Only flush if caching mode
1567          * and second level.
1568          */
1569         if (cap_caching_mode(iommu->cap) && !domain_use_first_level(domain))
1570                 iommu_flush_iotlb_psi(iommu, domain, pfn, pages, 0, 1);
1571         else
1572                 iommu_flush_write_buffer(iommu);
1573 }
1574
1575 static void intel_flush_iotlb_all(struct iommu_domain *domain)
1576 {
1577         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
1578         struct iommu_domain_info *info;
1579         unsigned long idx;
1580
1581         xa_for_each(&dmar_domain->iommu_array, idx, info) {
1582                 struct intel_iommu *iommu = info->iommu;
1583                 u16 did = domain_id_iommu(dmar_domain, iommu);
1584
1585                 if (domain_use_first_level(dmar_domain))
1586                         qi_flush_piotlb(iommu, did, PASID_RID2PASID, 0, -1, 0);
1587                 else
1588                         iommu->flush.flush_iotlb(iommu, did, 0, 0,
1589                                                  DMA_TLB_DSI_FLUSH);
1590
1591                 if (!cap_caching_mode(iommu->cap))
1592                         iommu_flush_dev_iotlb(dmar_domain, 0, MAX_AGAW_PFN_WIDTH);
1593         }
1594 }
1595
1596 static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
1597 {
1598         u32 pmen;
1599         unsigned long flags;
1600
1601         if (!cap_plmr(iommu->cap) && !cap_phmr(iommu->cap))
1602                 return;
1603
1604         raw_spin_lock_irqsave(&iommu->register_lock, flags);
1605         pmen = readl(iommu->reg + DMAR_PMEN_REG);
1606         pmen &= ~DMA_PMEN_EPM;
1607         writel(pmen, iommu->reg + DMAR_PMEN_REG);
1608
1609         /* wait for the protected region status bit to clear */
1610         IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
1611                 readl, !(pmen & DMA_PMEN_PRS), pmen);
1612
1613         raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1614 }
1615
1616 static void iommu_enable_translation(struct intel_iommu *iommu)
1617 {
1618         u32 sts;
1619         unsigned long flags;
1620
1621         raw_spin_lock_irqsave(&iommu->register_lock, flags);
1622         iommu->gcmd |= DMA_GCMD_TE;
1623         writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1624
1625         /* Make sure hardware complete it */
1626         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1627                       readl, (sts & DMA_GSTS_TES), sts);
1628
1629         raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1630 }
1631
1632 static void iommu_disable_translation(struct intel_iommu *iommu)
1633 {
1634         u32 sts;
1635         unsigned long flag;
1636
1637         if (iommu_skip_te_disable && iommu->drhd->gfx_dedicated &&
1638             (cap_read_drain(iommu->cap) || cap_write_drain(iommu->cap)))
1639                 return;
1640
1641         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1642         iommu->gcmd &= ~DMA_GCMD_TE;
1643         writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1644
1645         /* Make sure hardware complete it */
1646         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1647                       readl, (!(sts & DMA_GSTS_TES)), sts);
1648
1649         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1650 }
1651
1652 static int iommu_init_domains(struct intel_iommu *iommu)
1653 {
1654         u32 ndomains;
1655
1656         ndomains = cap_ndoms(iommu->cap);
1657         pr_debug("%s: Number of Domains supported <%d>\n",
1658                  iommu->name, ndomains);
1659
1660         spin_lock_init(&iommu->lock);
1661
1662         iommu->domain_ids = bitmap_zalloc(ndomains, GFP_KERNEL);
1663         if (!iommu->domain_ids)
1664                 return -ENOMEM;
1665
1666         /*
1667          * If Caching mode is set, then invalid translations are tagged
1668          * with domain-id 0, hence we need to pre-allocate it. We also
1669          * use domain-id 0 as a marker for non-allocated domain-id, so
1670          * make sure it is not used for a real domain.
1671          */
1672         set_bit(0, iommu->domain_ids);
1673
1674         /*
1675          * Vt-d spec rev3.0 (section 6.2.3.1) requires that each pasid
1676          * entry for first-level or pass-through translation modes should
1677          * be programmed with a domain id different from those used for
1678          * second-level or nested translation. We reserve a domain id for
1679          * this purpose.
1680          */
1681         if (sm_supported(iommu))
1682                 set_bit(FLPT_DEFAULT_DID, iommu->domain_ids);
1683
1684         return 0;
1685 }
1686
1687 static void disable_dmar_iommu(struct intel_iommu *iommu)
1688 {
1689         if (!iommu->domain_ids)
1690                 return;
1691
1692         /*
1693          * All iommu domains must have been detached from the devices,
1694          * hence there should be no domain IDs in use.
1695          */
1696         if (WARN_ON(bitmap_weight(iommu->domain_ids, cap_ndoms(iommu->cap))
1697                     > NUM_RESERVED_DID))
1698                 return;
1699
1700         if (iommu->gcmd & DMA_GCMD_TE)
1701                 iommu_disable_translation(iommu);
1702 }
1703
1704 static void free_dmar_iommu(struct intel_iommu *iommu)
1705 {
1706         if (iommu->domain_ids) {
1707                 bitmap_free(iommu->domain_ids);
1708                 iommu->domain_ids = NULL;
1709         }
1710
1711         if (iommu->copied_tables) {
1712                 bitmap_free(iommu->copied_tables);
1713                 iommu->copied_tables = NULL;
1714         }
1715
1716         /* free context mapping */
1717         free_context_table(iommu);
1718
1719 #ifdef CONFIG_INTEL_IOMMU_SVM
1720         if (pasid_supported(iommu)) {
1721                 if (ecap_prs(iommu->ecap))
1722                         intel_svm_finish_prq(iommu);
1723         }
1724         if (vccap_pasid(iommu->vccap))
1725                 ioasid_unregister_allocator(&iommu->pasid_allocator);
1726
1727 #endif
1728 }
1729
1730 /*
1731  * Check and return whether first level is used by default for
1732  * DMA translation.
1733  */
1734 static bool first_level_by_default(unsigned int type)
1735 {
1736         /* Only SL is available in legacy mode */
1737         if (!scalable_mode_support())
1738                 return false;
1739
1740         /* Only level (either FL or SL) is available, just use it */
1741         if (intel_cap_flts_sanity() ^ intel_cap_slts_sanity())
1742                 return intel_cap_flts_sanity();
1743
1744         /* Both levels are available, decide it based on domain type */
1745         return type != IOMMU_DOMAIN_UNMANAGED;
1746 }
1747
1748 static struct dmar_domain *alloc_domain(unsigned int type)
1749 {
1750         struct dmar_domain *domain;
1751
1752         domain = kzalloc(sizeof(*domain), GFP_KERNEL);
1753         if (!domain)
1754                 return NULL;
1755
1756         domain->nid = NUMA_NO_NODE;
1757         if (first_level_by_default(type))
1758                 domain->flags |= DOMAIN_FLAG_USE_FIRST_LEVEL;
1759         domain->has_iotlb_device = false;
1760         INIT_LIST_HEAD(&domain->devices);
1761         spin_lock_init(&domain->lock);
1762         xa_init(&domain->iommu_array);
1763
1764         return domain;
1765 }
1766
1767 static int domain_attach_iommu(struct dmar_domain *domain,
1768                                struct intel_iommu *iommu)
1769 {
1770         struct iommu_domain_info *info, *curr;
1771         unsigned long ndomains;
1772         int num, ret = -ENOSPC;
1773
1774         info = kzalloc(sizeof(*info), GFP_KERNEL);
1775         if (!info)
1776                 return -ENOMEM;
1777
1778         spin_lock(&iommu->lock);
1779         curr = xa_load(&domain->iommu_array, iommu->seq_id);
1780         if (curr) {
1781                 curr->refcnt++;
1782                 spin_unlock(&iommu->lock);
1783                 kfree(info);
1784                 return 0;
1785         }
1786
1787         ndomains = cap_ndoms(iommu->cap);
1788         num = find_first_zero_bit(iommu->domain_ids, ndomains);
1789         if (num >= ndomains) {
1790                 pr_err("%s: No free domain ids\n", iommu->name);
1791                 goto err_unlock;
1792         }
1793
1794         set_bit(num, iommu->domain_ids);
1795         info->refcnt    = 1;
1796         info->did       = num;
1797         info->iommu     = iommu;
1798         curr = xa_cmpxchg(&domain->iommu_array, iommu->seq_id,
1799                           NULL, info, GFP_ATOMIC);
1800         if (curr) {
1801                 ret = xa_err(curr) ? : -EBUSY;
1802                 goto err_clear;
1803         }
1804         domain_update_iommu_cap(domain);
1805
1806         spin_unlock(&iommu->lock);
1807         return 0;
1808
1809 err_clear:
1810         clear_bit(info->did, iommu->domain_ids);
1811 err_unlock:
1812         spin_unlock(&iommu->lock);
1813         kfree(info);
1814         return ret;
1815 }
1816
1817 static void domain_detach_iommu(struct dmar_domain *domain,
1818                                 struct intel_iommu *iommu)
1819 {
1820         struct iommu_domain_info *info;
1821
1822         spin_lock(&iommu->lock);
1823         info = xa_load(&domain->iommu_array, iommu->seq_id);
1824         if (--info->refcnt == 0) {
1825                 clear_bit(info->did, iommu->domain_ids);
1826                 xa_erase(&domain->iommu_array, iommu->seq_id);
1827                 domain->nid = NUMA_NO_NODE;
1828                 domain_update_iommu_cap(domain);
1829                 kfree(info);
1830         }
1831         spin_unlock(&iommu->lock);
1832 }
1833
1834 static inline int guestwidth_to_adjustwidth(int gaw)
1835 {
1836         int agaw;
1837         int r = (gaw - 12) % 9;
1838
1839         if (r == 0)
1840                 agaw = gaw;
1841         else
1842                 agaw = gaw + 9 - r;
1843         if (agaw > 64)
1844                 agaw = 64;
1845         return agaw;
1846 }
1847
1848 static void domain_exit(struct dmar_domain *domain)
1849 {
1850         if (domain->pgd) {
1851                 LIST_HEAD(freelist);
1852
1853                 domain_unmap(domain, 0, DOMAIN_MAX_PFN(domain->gaw), &freelist);
1854                 put_pages_list(&freelist);
1855         }
1856
1857         if (WARN_ON(!list_empty(&domain->devices)))
1858                 return;
1859
1860         kfree(domain);
1861 }
1862
1863 /*
1864  * Get the PASID directory size for scalable mode context entry.
1865  * Value of X in the PDTS field of a scalable mode context entry
1866  * indicates PASID directory with 2^(X + 7) entries.
1867  */
1868 static inline unsigned long context_get_sm_pds(struct pasid_table *table)
1869 {
1870         unsigned long pds, max_pde;
1871
1872         max_pde = table->max_pasid >> PASID_PDE_SHIFT;
1873         pds = find_first_bit(&max_pde, MAX_NR_PASID_BITS);
1874         if (pds < 7)
1875                 return 0;
1876
1877         return pds - 7;
1878 }
1879
1880 /*
1881  * Set the RID_PASID field of a scalable mode context entry. The
1882  * IOMMU hardware will use the PASID value set in this field for
1883  * DMA translations of DMA requests without PASID.
1884  */
1885 static inline void
1886 context_set_sm_rid2pasid(struct context_entry *context, unsigned long pasid)
1887 {
1888         context->hi |= pasid & ((1 << 20) - 1);
1889 }
1890
1891 /*
1892  * Set the DTE(Device-TLB Enable) field of a scalable mode context
1893  * entry.
1894  */
1895 static inline void context_set_sm_dte(struct context_entry *context)
1896 {
1897         context->lo |= (1 << 2);
1898 }
1899
1900 /*
1901  * Set the PRE(Page Request Enable) field of a scalable mode context
1902  * entry.
1903  */
1904 static inline void context_set_sm_pre(struct context_entry *context)
1905 {
1906         context->lo |= (1 << 4);
1907 }
1908
1909 /* Convert value to context PASID directory size field coding. */
1910 #define context_pdts(pds)       (((pds) & 0x7) << 9)
1911
1912 static int domain_context_mapping_one(struct dmar_domain *domain,
1913                                       struct intel_iommu *iommu,
1914                                       struct pasid_table *table,
1915                                       u8 bus, u8 devfn)
1916 {
1917         struct device_domain_info *info =
1918                         domain_lookup_dev_info(domain, iommu, bus, devfn);
1919         u16 did = domain_id_iommu(domain, iommu);
1920         int translation = CONTEXT_TT_MULTI_LEVEL;
1921         struct context_entry *context;
1922         int ret;
1923
1924         WARN_ON(did == 0);
1925
1926         if (hw_pass_through && domain_type_is_si(domain))
1927                 translation = CONTEXT_TT_PASS_THROUGH;
1928
1929         pr_debug("Set context mapping for %02x:%02x.%d\n",
1930                 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1931
1932         BUG_ON(!domain->pgd);
1933
1934         spin_lock(&iommu->lock);
1935         ret = -ENOMEM;
1936         context = iommu_context_addr(iommu, bus, devfn, 1);
1937         if (!context)
1938                 goto out_unlock;
1939
1940         ret = 0;
1941         if (context_present(context) && !context_copied(iommu, bus, devfn))
1942                 goto out_unlock;
1943
1944         /*
1945          * For kdump cases, old valid entries may be cached due to the
1946          * in-flight DMA and copied pgtable, but there is no unmapping
1947          * behaviour for them, thus we need an explicit cache flush for
1948          * the newly-mapped device. For kdump, at this point, the device
1949          * is supposed to finish reset at its driver probe stage, so no
1950          * in-flight DMA will exist, and we don't need to worry anymore
1951          * hereafter.
1952          */
1953         if (context_copied(iommu, bus, devfn)) {
1954                 u16 did_old = context_domain_id(context);
1955
1956                 if (did_old < cap_ndoms(iommu->cap)) {
1957                         iommu->flush.flush_context(iommu, did_old,
1958                                                    (((u16)bus) << 8) | devfn,
1959                                                    DMA_CCMD_MASK_NOBIT,
1960                                                    DMA_CCMD_DEVICE_INVL);
1961                         iommu->flush.flush_iotlb(iommu, did_old, 0, 0,
1962                                                  DMA_TLB_DSI_FLUSH);
1963                 }
1964
1965                 clear_context_copied(iommu, bus, devfn);
1966         }
1967
1968         context_clear_entry(context);
1969
1970         if (sm_supported(iommu)) {
1971                 unsigned long pds;
1972
1973                 WARN_ON(!table);
1974
1975                 /* Setup the PASID DIR pointer: */
1976                 pds = context_get_sm_pds(table);
1977                 context->lo = (u64)virt_to_phys(table->table) |
1978                                 context_pdts(pds);
1979
1980                 /* Setup the RID_PASID field: */
1981                 context_set_sm_rid2pasid(context, PASID_RID2PASID);
1982
1983                 /*
1984                  * Setup the Device-TLB enable bit and Page request
1985                  * Enable bit:
1986                  */
1987                 if (info && info->ats_supported)
1988                         context_set_sm_dte(context);
1989                 if (info && info->pri_supported)
1990                         context_set_sm_pre(context);
1991                 if (info && info->pasid_supported)
1992                         context_set_pasid(context);
1993         } else {
1994                 struct dma_pte *pgd = domain->pgd;
1995                 int agaw;
1996
1997                 context_set_domain_id(context, did);
1998
1999                 if (translation != CONTEXT_TT_PASS_THROUGH) {
2000                         /*
2001                          * Skip top levels of page tables for iommu which has
2002                          * less agaw than default. Unnecessary for PT mode.
2003                          */
2004                         for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) {
2005                                 ret = -ENOMEM;
2006                                 pgd = phys_to_virt(dma_pte_addr(pgd));
2007                                 if (!dma_pte_present(pgd))
2008                                         goto out_unlock;
2009                         }
2010
2011                         if (info && info->ats_supported)
2012                                 translation = CONTEXT_TT_DEV_IOTLB;
2013                         else
2014                                 translation = CONTEXT_TT_MULTI_LEVEL;
2015
2016                         context_set_address_root(context, virt_to_phys(pgd));
2017                         context_set_address_width(context, agaw);
2018                 } else {
2019                         /*
2020                          * In pass through mode, AW must be programmed to
2021                          * indicate the largest AGAW value supported by
2022                          * hardware. And ASR is ignored by hardware.
2023                          */
2024                         context_set_address_width(context, iommu->msagaw);
2025                 }
2026
2027                 context_set_translation_type(context, translation);
2028         }
2029
2030         context_set_fault_enable(context);
2031         context_set_present(context);
2032         if (!ecap_coherent(iommu->ecap))
2033                 clflush_cache_range(context, sizeof(*context));
2034
2035         /*
2036          * It's a non-present to present mapping. If hardware doesn't cache
2037          * non-present entry we only need to flush the write-buffer. If the
2038          * _does_ cache non-present entries, then it does so in the special
2039          * domain #0, which we have to flush:
2040          */
2041         if (cap_caching_mode(iommu->cap)) {
2042                 iommu->flush.flush_context(iommu, 0,
2043                                            (((u16)bus) << 8) | devfn,
2044                                            DMA_CCMD_MASK_NOBIT,
2045                                            DMA_CCMD_DEVICE_INVL);
2046                 iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH);
2047         } else {
2048                 iommu_flush_write_buffer(iommu);
2049         }
2050         iommu_enable_pci_caps(info);
2051
2052         ret = 0;
2053
2054 out_unlock:
2055         spin_unlock(&iommu->lock);
2056
2057         return ret;
2058 }
2059
2060 struct domain_context_mapping_data {
2061         struct dmar_domain *domain;
2062         struct intel_iommu *iommu;
2063         struct pasid_table *table;
2064 };
2065
2066 static int domain_context_mapping_cb(struct pci_dev *pdev,
2067                                      u16 alias, void *opaque)
2068 {
2069         struct domain_context_mapping_data *data = opaque;
2070
2071         return domain_context_mapping_one(data->domain, data->iommu,
2072                                           data->table, PCI_BUS_NUM(alias),
2073                                           alias & 0xff);
2074 }
2075
2076 static int
2077 domain_context_mapping(struct dmar_domain *domain, struct device *dev)
2078 {
2079         struct domain_context_mapping_data data;
2080         struct pasid_table *table;
2081         struct intel_iommu *iommu;
2082         u8 bus, devfn;
2083
2084         iommu = device_to_iommu(dev, &bus, &devfn);
2085         if (!iommu)
2086                 return -ENODEV;
2087
2088         table = intel_pasid_get_table(dev);
2089
2090         if (!dev_is_pci(dev))
2091                 return domain_context_mapping_one(domain, iommu, table,
2092                                                   bus, devfn);
2093
2094         data.domain = domain;
2095         data.iommu = iommu;
2096         data.table = table;
2097
2098         return pci_for_each_dma_alias(to_pci_dev(dev),
2099                                       &domain_context_mapping_cb, &data);
2100 }
2101
2102 static int domain_context_mapped_cb(struct pci_dev *pdev,
2103                                     u16 alias, void *opaque)
2104 {
2105         struct intel_iommu *iommu = opaque;
2106
2107         return !device_context_mapped(iommu, PCI_BUS_NUM(alias), alias & 0xff);
2108 }
2109
2110 static int domain_context_mapped(struct device *dev)
2111 {
2112         struct intel_iommu *iommu;
2113         u8 bus, devfn;
2114
2115         iommu = device_to_iommu(dev, &bus, &devfn);
2116         if (!iommu)
2117                 return -ENODEV;
2118
2119         if (!dev_is_pci(dev))
2120                 return device_context_mapped(iommu, bus, devfn);
2121
2122         return !pci_for_each_dma_alias(to_pci_dev(dev),
2123                                        domain_context_mapped_cb, iommu);
2124 }
2125
2126 /* Returns a number of VTD pages, but aligned to MM page size */
2127 static inline unsigned long aligned_nrpages(unsigned long host_addr,
2128                                             size_t size)
2129 {
2130         host_addr &= ~PAGE_MASK;
2131         return PAGE_ALIGN(host_addr + size) >> VTD_PAGE_SHIFT;
2132 }
2133
2134 /* Return largest possible superpage level for a given mapping */
2135 static inline int hardware_largepage_caps(struct dmar_domain *domain,
2136                                           unsigned long iov_pfn,
2137                                           unsigned long phy_pfn,
2138                                           unsigned long pages)
2139 {
2140         int support, level = 1;
2141         unsigned long pfnmerge;
2142
2143         support = domain->iommu_superpage;
2144
2145         /* To use a large page, the virtual *and* physical addresses
2146            must be aligned to 2MiB/1GiB/etc. Lower bits set in either
2147            of them will mean we have to use smaller pages. So just
2148            merge them and check both at once. */
2149         pfnmerge = iov_pfn | phy_pfn;
2150
2151         while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) {
2152                 pages >>= VTD_STRIDE_SHIFT;
2153                 if (!pages)
2154                         break;
2155                 pfnmerge >>= VTD_STRIDE_SHIFT;
2156                 level++;
2157                 support--;
2158         }
2159         return level;
2160 }
2161
2162 /*
2163  * Ensure that old small page tables are removed to make room for superpage(s).
2164  * We're going to add new large pages, so make sure we don't remove their parent
2165  * tables. The IOTLB/devTLBs should be flushed if any PDE/PTEs are cleared.
2166  */
2167 static void switch_to_super_page(struct dmar_domain *domain,
2168                                  unsigned long start_pfn,
2169                                  unsigned long end_pfn, int level)
2170 {
2171         unsigned long lvl_pages = lvl_to_nr_pages(level);
2172         struct iommu_domain_info *info;
2173         struct dma_pte *pte = NULL;
2174         unsigned long i;
2175
2176         while (start_pfn <= end_pfn) {
2177                 if (!pte)
2178                         pte = pfn_to_dma_pte(domain, start_pfn, &level);
2179
2180                 if (dma_pte_present(pte)) {
2181                         dma_pte_free_pagetable(domain, start_pfn,
2182                                                start_pfn + lvl_pages - 1,
2183                                                level + 1);
2184
2185                         xa_for_each(&domain->iommu_array, i, info)
2186                                 iommu_flush_iotlb_psi(info->iommu, domain,
2187                                                       start_pfn, lvl_pages,
2188                                                       0, 0);
2189                 }
2190
2191                 pte++;
2192                 start_pfn += lvl_pages;
2193                 if (first_pte_in_page(pte))
2194                         pte = NULL;
2195         }
2196 }
2197
2198 static int
2199 __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2200                  unsigned long phys_pfn, unsigned long nr_pages, int prot)
2201 {
2202         struct dma_pte *first_pte = NULL, *pte = NULL;
2203         unsigned int largepage_lvl = 0;
2204         unsigned long lvl_pages = 0;
2205         phys_addr_t pteval;
2206         u64 attr;
2207
2208         BUG_ON(!domain_pfn_supported(domain, iov_pfn + nr_pages - 1));
2209
2210         if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
2211                 return -EINVAL;
2212
2213         attr = prot & (DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP);
2214         attr |= DMA_FL_PTE_PRESENT;
2215         if (domain_use_first_level(domain)) {
2216                 attr |= DMA_FL_PTE_XD | DMA_FL_PTE_US | DMA_FL_PTE_ACCESS;
2217                 if (prot & DMA_PTE_WRITE)
2218                         attr |= DMA_FL_PTE_DIRTY;
2219         }
2220
2221         pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | attr;
2222
2223         while (nr_pages > 0) {
2224                 uint64_t tmp;
2225
2226                 if (!pte) {
2227                         largepage_lvl = hardware_largepage_caps(domain, iov_pfn,
2228                                         phys_pfn, nr_pages);
2229
2230                         pte = pfn_to_dma_pte(domain, iov_pfn, &largepage_lvl);
2231                         if (!pte)
2232                                 return -ENOMEM;
2233                         first_pte = pte;
2234
2235                         lvl_pages = lvl_to_nr_pages(largepage_lvl);
2236
2237                         /* It is large page*/
2238                         if (largepage_lvl > 1) {
2239                                 unsigned long end_pfn;
2240                                 unsigned long pages_to_remove;
2241
2242                                 pteval |= DMA_PTE_LARGE_PAGE;
2243                                 pages_to_remove = min_t(unsigned long, nr_pages,
2244                                                         nr_pte_to_next_page(pte) * lvl_pages);
2245                                 end_pfn = iov_pfn + pages_to_remove - 1;
2246                                 switch_to_super_page(domain, iov_pfn, end_pfn, largepage_lvl);
2247                         } else {
2248                                 pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE;
2249                         }
2250
2251                 }
2252                 /* We don't need lock here, nobody else
2253                  * touches the iova range
2254                  */
2255                 tmp = cmpxchg64_local(&pte->val, 0ULL, pteval);
2256                 if (tmp) {
2257                         static int dumps = 5;
2258                         pr_crit("ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n",
2259                                 iov_pfn, tmp, (unsigned long long)pteval);
2260                         if (dumps) {
2261                                 dumps--;
2262                                 debug_dma_dump_mappings(NULL);
2263                         }
2264                         WARN_ON(1);
2265                 }
2266
2267                 nr_pages -= lvl_pages;
2268                 iov_pfn += lvl_pages;
2269                 phys_pfn += lvl_pages;
2270                 pteval += lvl_pages * VTD_PAGE_SIZE;
2271
2272                 /* If the next PTE would be the first in a new page, then we
2273                  * need to flush the cache on the entries we've just written.
2274                  * And then we'll need to recalculate 'pte', so clear it and
2275                  * let it get set again in the if (!pte) block above.
2276                  *
2277                  * If we're done (!nr_pages) we need to flush the cache too.
2278                  *
2279                  * Also if we've been setting superpages, we may need to
2280                  * recalculate 'pte' and switch back to smaller pages for the
2281                  * end of the mapping, if the trailing size is not enough to
2282                  * use another superpage (i.e. nr_pages < lvl_pages).
2283                  */
2284                 pte++;
2285                 if (!nr_pages || first_pte_in_page(pte) ||
2286                     (largepage_lvl > 1 && nr_pages < lvl_pages)) {
2287                         domain_flush_cache(domain, first_pte,
2288                                            (void *)pte - (void *)first_pte);
2289                         pte = NULL;
2290                 }
2291         }
2292
2293         return 0;
2294 }
2295
2296 static void domain_context_clear_one(struct device_domain_info *info, u8 bus, u8 devfn)
2297 {
2298         struct intel_iommu *iommu = info->iommu;
2299         struct context_entry *context;
2300         u16 did_old;
2301
2302         if (!iommu)
2303                 return;
2304
2305         spin_lock(&iommu->lock);
2306         context = iommu_context_addr(iommu, bus, devfn, 0);
2307         if (!context) {
2308                 spin_unlock(&iommu->lock);
2309                 return;
2310         }
2311
2312         if (sm_supported(iommu)) {
2313                 if (hw_pass_through && domain_type_is_si(info->domain))
2314                         did_old = FLPT_DEFAULT_DID;
2315                 else
2316                         did_old = domain_id_iommu(info->domain, iommu);
2317         } else {
2318                 did_old = context_domain_id(context);
2319         }
2320
2321         context_clear_entry(context);
2322         __iommu_flush_cache(iommu, context, sizeof(*context));
2323         spin_unlock(&iommu->lock);
2324         iommu->flush.flush_context(iommu,
2325                                    did_old,
2326                                    (((u16)bus) << 8) | devfn,
2327                                    DMA_CCMD_MASK_NOBIT,
2328                                    DMA_CCMD_DEVICE_INVL);
2329
2330         if (sm_supported(iommu))
2331                 qi_flush_pasid_cache(iommu, did_old, QI_PC_ALL_PASIDS, 0);
2332
2333         iommu->flush.flush_iotlb(iommu,
2334                                  did_old,
2335                                  0,
2336                                  0,
2337                                  DMA_TLB_DSI_FLUSH);
2338
2339         __iommu_flush_dev_iotlb(info, 0, MAX_AGAW_PFN_WIDTH);
2340 }
2341
2342 static int domain_setup_first_level(struct intel_iommu *iommu,
2343                                     struct dmar_domain *domain,
2344                                     struct device *dev,
2345                                     u32 pasid)
2346 {
2347         struct dma_pte *pgd = domain->pgd;
2348         int agaw, level;
2349         int flags = 0;
2350
2351         /*
2352          * Skip top levels of page tables for iommu which has
2353          * less agaw than default. Unnecessary for PT mode.
2354          */
2355         for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) {
2356                 pgd = phys_to_virt(dma_pte_addr(pgd));
2357                 if (!dma_pte_present(pgd))
2358                         return -ENOMEM;
2359         }
2360
2361         level = agaw_to_level(agaw);
2362         if (level != 4 && level != 5)
2363                 return -EINVAL;
2364
2365         if (pasid != PASID_RID2PASID)
2366                 flags |= PASID_FLAG_SUPERVISOR_MODE;
2367         if (level == 5)
2368                 flags |= PASID_FLAG_FL5LP;
2369
2370         if (domain->force_snooping)
2371                 flags |= PASID_FLAG_PAGE_SNOOP;
2372
2373         return intel_pasid_setup_first_level(iommu, dev, (pgd_t *)pgd, pasid,
2374                                              domain_id_iommu(domain, iommu),
2375                                              flags);
2376 }
2377
2378 static bool dev_is_real_dma_subdevice(struct device *dev)
2379 {
2380         return dev && dev_is_pci(dev) &&
2381                pci_real_dma_dev(to_pci_dev(dev)) != to_pci_dev(dev);
2382 }
2383
2384 static int iommu_domain_identity_map(struct dmar_domain *domain,
2385                                      unsigned long first_vpfn,
2386                                      unsigned long last_vpfn)
2387 {
2388         /*
2389          * RMRR range might have overlap with physical memory range,
2390          * clear it first
2391          */
2392         dma_pte_clear_range(domain, first_vpfn, last_vpfn);
2393
2394         return __domain_mapping(domain, first_vpfn,
2395                                 first_vpfn, last_vpfn - first_vpfn + 1,
2396                                 DMA_PTE_READ|DMA_PTE_WRITE);
2397 }
2398
2399 static int md_domain_init(struct dmar_domain *domain, int guest_width);
2400
2401 static int __init si_domain_init(int hw)
2402 {
2403         struct dmar_rmrr_unit *rmrr;
2404         struct device *dev;
2405         int i, nid, ret;
2406
2407         si_domain = alloc_domain(IOMMU_DOMAIN_IDENTITY);
2408         if (!si_domain)
2409                 return -EFAULT;
2410
2411         if (md_domain_init(si_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
2412                 domain_exit(si_domain);
2413                 return -EFAULT;
2414         }
2415
2416         if (hw)
2417                 return 0;
2418
2419         for_each_online_node(nid) {
2420                 unsigned long start_pfn, end_pfn;
2421                 int i;
2422
2423                 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
2424                         ret = iommu_domain_identity_map(si_domain,
2425                                         mm_to_dma_pfn(start_pfn),
2426                                         mm_to_dma_pfn(end_pfn));
2427                         if (ret)
2428                                 return ret;
2429                 }
2430         }
2431
2432         /*
2433          * Identity map the RMRRs so that devices with RMRRs could also use
2434          * the si_domain.
2435          */
2436         for_each_rmrr_units(rmrr) {
2437                 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
2438                                           i, dev) {
2439                         unsigned long long start = rmrr->base_address;
2440                         unsigned long long end = rmrr->end_address;
2441
2442                         if (WARN_ON(end < start ||
2443                                     end >> agaw_to_width(si_domain->agaw)))
2444                                 continue;
2445
2446                         ret = iommu_domain_identity_map(si_domain,
2447                                         mm_to_dma_pfn(start >> PAGE_SHIFT),
2448                                         mm_to_dma_pfn(end >> PAGE_SHIFT));
2449                         if (ret)
2450                                 return ret;
2451                 }
2452         }
2453
2454         return 0;
2455 }
2456
2457 static int domain_add_dev_info(struct dmar_domain *domain, struct device *dev)
2458 {
2459         struct device_domain_info *info = dev_iommu_priv_get(dev);
2460         struct intel_iommu *iommu;
2461         unsigned long flags;
2462         u8 bus, devfn;
2463         int ret;
2464
2465         iommu = device_to_iommu(dev, &bus, &devfn);
2466         if (!iommu)
2467                 return -ENODEV;
2468
2469         ret = domain_attach_iommu(domain, iommu);
2470         if (ret)
2471                 return ret;
2472         info->domain = domain;
2473         spin_lock_irqsave(&domain->lock, flags);
2474         list_add(&info->link, &domain->devices);
2475         spin_unlock_irqrestore(&domain->lock, flags);
2476
2477         /* PASID table is mandatory for a PCI device in scalable mode. */
2478         if (sm_supported(iommu) && !dev_is_real_dma_subdevice(dev)) {
2479                 ret = intel_pasid_alloc_table(dev);
2480                 if (ret) {
2481                         dev_err(dev, "PASID table allocation failed\n");
2482                         dmar_remove_one_dev_info(dev);
2483                         return ret;
2484                 }
2485
2486                 /* Setup the PASID entry for requests without PASID: */
2487                 if (hw_pass_through && domain_type_is_si(domain))
2488                         ret = intel_pasid_setup_pass_through(iommu, domain,
2489                                         dev, PASID_RID2PASID);
2490                 else if (domain_use_first_level(domain))
2491                         ret = domain_setup_first_level(iommu, domain, dev,
2492                                         PASID_RID2PASID);
2493                 else
2494                         ret = intel_pasid_setup_second_level(iommu, domain,
2495                                         dev, PASID_RID2PASID);
2496                 if (ret) {
2497                         dev_err(dev, "Setup RID2PASID failed\n");
2498                         dmar_remove_one_dev_info(dev);
2499                         return ret;
2500                 }
2501         }
2502
2503         ret = domain_context_mapping(domain, dev);
2504         if (ret) {
2505                 dev_err(dev, "Domain context map failed\n");
2506                 dmar_remove_one_dev_info(dev);
2507                 return ret;
2508         }
2509
2510         return 0;
2511 }
2512
2513 static bool device_has_rmrr(struct device *dev)
2514 {
2515         struct dmar_rmrr_unit *rmrr;
2516         struct device *tmp;
2517         int i;
2518
2519         rcu_read_lock();
2520         for_each_rmrr_units(rmrr) {
2521                 /*
2522                  * Return TRUE if this RMRR contains the device that
2523                  * is passed in.
2524                  */
2525                 for_each_active_dev_scope(rmrr->devices,
2526                                           rmrr->devices_cnt, i, tmp)
2527                         if (tmp == dev ||
2528                             is_downstream_to_pci_bridge(dev, tmp)) {
2529                                 rcu_read_unlock();
2530                                 return true;
2531                         }
2532         }
2533         rcu_read_unlock();
2534         return false;
2535 }
2536
2537 /**
2538  * device_rmrr_is_relaxable - Test whether the RMRR of this device
2539  * is relaxable (ie. is allowed to be not enforced under some conditions)
2540  * @dev: device handle
2541  *
2542  * We assume that PCI USB devices with RMRRs have them largely
2543  * for historical reasons and that the RMRR space is not actively used post
2544  * boot.  This exclusion may change if vendors begin to abuse it.
2545  *
2546  * The same exception is made for graphics devices, with the requirement that
2547  * any use of the RMRR regions will be torn down before assigning the device
2548  * to a guest.
2549  *
2550  * Return: true if the RMRR is relaxable, false otherwise
2551  */
2552 static bool device_rmrr_is_relaxable(struct device *dev)
2553 {
2554         struct pci_dev *pdev;
2555
2556         if (!dev_is_pci(dev))
2557                 return false;
2558
2559         pdev = to_pci_dev(dev);
2560         if (IS_USB_DEVICE(pdev) || IS_GFX_DEVICE(pdev))
2561                 return true;
2562         else
2563                 return false;
2564 }
2565
2566 /*
2567  * There are a couple cases where we need to restrict the functionality of
2568  * devices associated with RMRRs.  The first is when evaluating a device for
2569  * identity mapping because problems exist when devices are moved in and out
2570  * of domains and their respective RMRR information is lost.  This means that
2571  * a device with associated RMRRs will never be in a "passthrough" domain.
2572  * The second is use of the device through the IOMMU API.  This interface
2573  * expects to have full control of the IOVA space for the device.  We cannot
2574  * satisfy both the requirement that RMRR access is maintained and have an
2575  * unencumbered IOVA space.  We also have no ability to quiesce the device's
2576  * use of the RMRR space or even inform the IOMMU API user of the restriction.
2577  * We therefore prevent devices associated with an RMRR from participating in
2578  * the IOMMU API, which eliminates them from device assignment.
2579  *
2580  * In both cases, devices which have relaxable RMRRs are not concerned by this
2581  * restriction. See device_rmrr_is_relaxable comment.
2582  */
2583 static bool device_is_rmrr_locked(struct device *dev)
2584 {
2585         if (!device_has_rmrr(dev))
2586                 return false;
2587
2588         if (device_rmrr_is_relaxable(dev))
2589                 return false;
2590
2591         return true;
2592 }
2593
2594 /*
2595  * Return the required default domain type for a specific device.
2596  *
2597  * @dev: the device in query
2598  * @startup: true if this is during early boot
2599  *
2600  * Returns:
2601  *  - IOMMU_DOMAIN_DMA: device requires a dynamic mapping domain
2602  *  - IOMMU_DOMAIN_IDENTITY: device requires an identical mapping domain
2603  *  - 0: both identity and dynamic domains work for this device
2604  */
2605 static int device_def_domain_type(struct device *dev)
2606 {
2607         if (dev_is_pci(dev)) {
2608                 struct pci_dev *pdev = to_pci_dev(dev);
2609
2610                 if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev))
2611                         return IOMMU_DOMAIN_IDENTITY;
2612
2613                 if ((iommu_identity_mapping & IDENTMAP_GFX) && IS_GFX_DEVICE(pdev))
2614                         return IOMMU_DOMAIN_IDENTITY;
2615         }
2616
2617         return 0;
2618 }
2619
2620 static void intel_iommu_init_qi(struct intel_iommu *iommu)
2621 {
2622         /*
2623          * Start from the sane iommu hardware state.
2624          * If the queued invalidation is already initialized by us
2625          * (for example, while enabling interrupt-remapping) then
2626          * we got the things already rolling from a sane state.
2627          */
2628         if (!iommu->qi) {
2629                 /*
2630                  * Clear any previous faults.
2631                  */
2632                 dmar_fault(-1, iommu);
2633                 /*
2634                  * Disable queued invalidation if supported and already enabled
2635                  * before OS handover.
2636                  */
2637                 dmar_disable_qi(iommu);
2638         }
2639
2640         if (dmar_enable_qi(iommu)) {
2641                 /*
2642                  * Queued Invalidate not enabled, use Register Based Invalidate
2643                  */
2644                 iommu->flush.flush_context = __iommu_flush_context;
2645                 iommu->flush.flush_iotlb = __iommu_flush_iotlb;
2646                 pr_info("%s: Using Register based invalidation\n",
2647                         iommu->name);
2648         } else {
2649                 iommu->flush.flush_context = qi_flush_context;
2650                 iommu->flush.flush_iotlb = qi_flush_iotlb;
2651                 pr_info("%s: Using Queued invalidation\n", iommu->name);
2652         }
2653 }
2654
2655 static int copy_context_table(struct intel_iommu *iommu,
2656                               struct root_entry *old_re,
2657                               struct context_entry **tbl,
2658                               int bus, bool ext)
2659 {
2660         int tbl_idx, pos = 0, idx, devfn, ret = 0, did;
2661         struct context_entry *new_ce = NULL, ce;
2662         struct context_entry *old_ce = NULL;
2663         struct root_entry re;
2664         phys_addr_t old_ce_phys;
2665
2666         tbl_idx = ext ? bus * 2 : bus;
2667         memcpy(&re, old_re, sizeof(re));
2668
2669         for (devfn = 0; devfn < 256; devfn++) {
2670                 /* First calculate the correct index */
2671                 idx = (ext ? devfn * 2 : devfn) % 256;
2672
2673                 if (idx == 0) {
2674                         /* First save what we may have and clean up */
2675                         if (new_ce) {
2676                                 tbl[tbl_idx] = new_ce;
2677                                 __iommu_flush_cache(iommu, new_ce,
2678                                                     VTD_PAGE_SIZE);
2679                                 pos = 1;
2680                         }
2681
2682                         if (old_ce)
2683                                 memunmap(old_ce);
2684
2685                         ret = 0;
2686                         if (devfn < 0x80)
2687                                 old_ce_phys = root_entry_lctp(&re);
2688                         else
2689                                 old_ce_phys = root_entry_uctp(&re);
2690
2691                         if (!old_ce_phys) {
2692                                 if (ext && devfn == 0) {
2693                                         /* No LCTP, try UCTP */
2694                                         devfn = 0x7f;
2695                                         continue;
2696                                 } else {
2697                                         goto out;
2698                                 }
2699                         }
2700
2701                         ret = -ENOMEM;
2702                         old_ce = memremap(old_ce_phys, PAGE_SIZE,
2703                                         MEMREMAP_WB);
2704                         if (!old_ce)
2705                                 goto out;
2706
2707                         new_ce = alloc_pgtable_page(iommu->node);
2708                         if (!new_ce)
2709                                 goto out_unmap;
2710
2711                         ret = 0;
2712                 }
2713
2714                 /* Now copy the context entry */
2715                 memcpy(&ce, old_ce + idx, sizeof(ce));
2716
2717                 if (!context_present(&ce))
2718                         continue;
2719
2720                 did = context_domain_id(&ce);
2721                 if (did >= 0 && did < cap_ndoms(iommu->cap))
2722                         set_bit(did, iommu->domain_ids);
2723
2724                 set_context_copied(iommu, bus, devfn);
2725                 new_ce[idx] = ce;
2726         }
2727
2728         tbl[tbl_idx + pos] = new_ce;
2729
2730         __iommu_flush_cache(iommu, new_ce, VTD_PAGE_SIZE);
2731
2732 out_unmap:
2733         memunmap(old_ce);
2734
2735 out:
2736         return ret;
2737 }
2738
2739 static int copy_translation_tables(struct intel_iommu *iommu)
2740 {
2741         struct context_entry **ctxt_tbls;
2742         struct root_entry *old_rt;
2743         phys_addr_t old_rt_phys;
2744         int ctxt_table_entries;
2745         u64 rtaddr_reg;
2746         int bus, ret;
2747         bool new_ext, ext;
2748
2749         rtaddr_reg = dmar_readq(iommu->reg + DMAR_RTADDR_REG);
2750         ext        = !!(rtaddr_reg & DMA_RTADDR_SMT);
2751         new_ext    = !!sm_supported(iommu);
2752
2753         /*
2754          * The RTT bit can only be changed when translation is disabled,
2755          * but disabling translation means to open a window for data
2756          * corruption. So bail out and don't copy anything if we would
2757          * have to change the bit.
2758          */
2759         if (new_ext != ext)
2760                 return -EINVAL;
2761
2762         iommu->copied_tables = bitmap_zalloc(BIT_ULL(16), GFP_KERNEL);
2763         if (!iommu->copied_tables)
2764                 return -ENOMEM;
2765
2766         old_rt_phys = rtaddr_reg & VTD_PAGE_MASK;
2767         if (!old_rt_phys)
2768                 return -EINVAL;
2769
2770         old_rt = memremap(old_rt_phys, PAGE_SIZE, MEMREMAP_WB);
2771         if (!old_rt)
2772                 return -ENOMEM;
2773
2774         /* This is too big for the stack - allocate it from slab */
2775         ctxt_table_entries = ext ? 512 : 256;
2776         ret = -ENOMEM;
2777         ctxt_tbls = kcalloc(ctxt_table_entries, sizeof(void *), GFP_KERNEL);
2778         if (!ctxt_tbls)
2779                 goto out_unmap;
2780
2781         for (bus = 0; bus < 256; bus++) {
2782                 ret = copy_context_table(iommu, &old_rt[bus],
2783                                          ctxt_tbls, bus, ext);
2784                 if (ret) {
2785                         pr_err("%s: Failed to copy context table for bus %d\n",
2786                                 iommu->name, bus);
2787                         continue;
2788                 }
2789         }
2790
2791         spin_lock(&iommu->lock);
2792
2793         /* Context tables are copied, now write them to the root_entry table */
2794         for (bus = 0; bus < 256; bus++) {
2795                 int idx = ext ? bus * 2 : bus;
2796                 u64 val;
2797
2798                 if (ctxt_tbls[idx]) {
2799                         val = virt_to_phys(ctxt_tbls[idx]) | 1;
2800                         iommu->root_entry[bus].lo = val;
2801                 }
2802
2803                 if (!ext || !ctxt_tbls[idx + 1])
2804                         continue;
2805
2806                 val = virt_to_phys(ctxt_tbls[idx + 1]) | 1;
2807                 iommu->root_entry[bus].hi = val;
2808         }
2809
2810         spin_unlock(&iommu->lock);
2811
2812         kfree(ctxt_tbls);
2813
2814         __iommu_flush_cache(iommu, iommu->root_entry, PAGE_SIZE);
2815
2816         ret = 0;
2817
2818 out_unmap:
2819         memunmap(old_rt);
2820
2821         return ret;
2822 }
2823
2824 #ifdef CONFIG_INTEL_IOMMU_SVM
2825 static ioasid_t intel_vcmd_ioasid_alloc(ioasid_t min, ioasid_t max, void *data)
2826 {
2827         struct intel_iommu *iommu = data;
2828         ioasid_t ioasid;
2829
2830         if (!iommu)
2831                 return INVALID_IOASID;
2832         /*
2833          * VT-d virtual command interface always uses the full 20 bit
2834          * PASID range. Host can partition guest PASID range based on
2835          * policies but it is out of guest's control.
2836          */
2837         if (min < PASID_MIN || max > intel_pasid_max_id)
2838                 return INVALID_IOASID;
2839
2840         if (vcmd_alloc_pasid(iommu, &ioasid))
2841                 return INVALID_IOASID;
2842
2843         return ioasid;
2844 }
2845
2846 static void intel_vcmd_ioasid_free(ioasid_t ioasid, void *data)
2847 {
2848         struct intel_iommu *iommu = data;
2849
2850         if (!iommu)
2851                 return;
2852         /*
2853          * Sanity check the ioasid owner is done at upper layer, e.g. VFIO
2854          * We can only free the PASID when all the devices are unbound.
2855          */
2856         if (ioasid_find(NULL, ioasid, NULL)) {
2857                 pr_alert("Cannot free active IOASID %d\n", ioasid);
2858                 return;
2859         }
2860         vcmd_free_pasid(iommu, ioasid);
2861 }
2862
2863 static void register_pasid_allocator(struct intel_iommu *iommu)
2864 {
2865         /*
2866          * If we are running in the host, no need for custom allocator
2867          * in that PASIDs are allocated from the host system-wide.
2868          */
2869         if (!cap_caching_mode(iommu->cap))
2870                 return;
2871
2872         if (!sm_supported(iommu)) {
2873                 pr_warn("VT-d Scalable Mode not enabled, no PASID allocation\n");
2874                 return;
2875         }
2876
2877         /*
2878          * Register a custom PASID allocator if we are running in a guest,
2879          * guest PASID must be obtained via virtual command interface.
2880          * There can be multiple vIOMMUs in each guest but only one allocator
2881          * is active. All vIOMMU allocators will eventually be calling the same
2882          * host allocator.
2883          */
2884         if (!vccap_pasid(iommu->vccap))
2885                 return;
2886
2887         pr_info("Register custom PASID allocator\n");
2888         iommu->pasid_allocator.alloc = intel_vcmd_ioasid_alloc;
2889         iommu->pasid_allocator.free = intel_vcmd_ioasid_free;
2890         iommu->pasid_allocator.pdata = (void *)iommu;
2891         if (ioasid_register_allocator(&iommu->pasid_allocator)) {
2892                 pr_warn("Custom PASID allocator failed, scalable mode disabled\n");
2893                 /*
2894                  * Disable scalable mode on this IOMMU if there
2895                  * is no custom allocator. Mixing SM capable vIOMMU
2896                  * and non-SM vIOMMU are not supported.
2897                  */
2898                 intel_iommu_sm = 0;
2899         }
2900 }
2901 #endif
2902
2903 static int __init init_dmars(void)
2904 {
2905         struct dmar_drhd_unit *drhd;
2906         struct intel_iommu *iommu;
2907         int ret;
2908
2909         ret = intel_cap_audit(CAP_AUDIT_STATIC_DMAR, NULL);
2910         if (ret)
2911                 goto free_iommu;
2912
2913         for_each_iommu(iommu, drhd) {
2914                 if (drhd->ignored) {
2915                         iommu_disable_translation(iommu);
2916                         continue;
2917                 }
2918
2919                 /*
2920                  * Find the max pasid size of all IOMMU's in the system.
2921                  * We need to ensure the system pasid table is no bigger
2922                  * than the smallest supported.
2923                  */
2924                 if (pasid_supported(iommu)) {
2925                         u32 temp = 2 << ecap_pss(iommu->ecap);
2926
2927                         intel_pasid_max_id = min_t(u32, temp,
2928                                                    intel_pasid_max_id);
2929                 }
2930
2931                 intel_iommu_init_qi(iommu);
2932
2933                 ret = iommu_init_domains(iommu);
2934                 if (ret)
2935                         goto free_iommu;
2936
2937                 init_translation_status(iommu);
2938
2939                 if (translation_pre_enabled(iommu) && !is_kdump_kernel()) {
2940                         iommu_disable_translation(iommu);
2941                         clear_translation_pre_enabled(iommu);
2942                         pr_warn("Translation was enabled for %s but we are not in kdump mode\n",
2943                                 iommu->name);
2944                 }
2945
2946                 /*
2947                  * TBD:
2948                  * we could share the same root & context tables
2949                  * among all IOMMU's. Need to Split it later.
2950                  */
2951                 ret = iommu_alloc_root_entry(iommu);
2952                 if (ret)
2953                         goto free_iommu;
2954
2955                 if (translation_pre_enabled(iommu)) {
2956                         pr_info("Translation already enabled - trying to copy translation structures\n");
2957
2958                         ret = copy_translation_tables(iommu);
2959                         if (ret) {
2960                                 /*
2961                                  * We found the IOMMU with translation
2962                                  * enabled - but failed to copy over the
2963                                  * old root-entry table. Try to proceed
2964                                  * by disabling translation now and
2965                                  * allocating a clean root-entry table.
2966                                  * This might cause DMAR faults, but
2967                                  * probably the dump will still succeed.
2968                                  */
2969                                 pr_err("Failed to copy translation tables from previous kernel for %s\n",
2970                                        iommu->name);
2971                                 iommu_disable_translation(iommu);
2972                                 clear_translation_pre_enabled(iommu);
2973                         } else {
2974                                 pr_info("Copied translation tables from previous kernel for %s\n",
2975                                         iommu->name);
2976                         }
2977                 }
2978
2979                 if (!ecap_pass_through(iommu->ecap))
2980                         hw_pass_through = 0;
2981                 intel_svm_check(iommu);
2982         }
2983
2984         /*
2985          * Now that qi is enabled on all iommus, set the root entry and flush
2986          * caches. This is required on some Intel X58 chipsets, otherwise the
2987          * flush_context function will loop forever and the boot hangs.
2988          */
2989         for_each_active_iommu(iommu, drhd) {
2990                 iommu_flush_write_buffer(iommu);
2991 #ifdef CONFIG_INTEL_IOMMU_SVM
2992                 register_pasid_allocator(iommu);
2993 #endif
2994                 iommu_set_root_entry(iommu);
2995         }
2996
2997 #ifdef CONFIG_INTEL_IOMMU_BROKEN_GFX_WA
2998         dmar_map_gfx = 0;
2999 #endif
3000
3001         if (!dmar_map_gfx)
3002                 iommu_identity_mapping |= IDENTMAP_GFX;
3003
3004         check_tylersburg_isoch();
3005
3006         ret = si_domain_init(hw_pass_through);
3007         if (ret)
3008                 goto free_iommu;
3009
3010         /*
3011          * for each drhd
3012          *   enable fault log
3013          *   global invalidate context cache
3014          *   global invalidate iotlb
3015          *   enable translation
3016          */
3017         for_each_iommu(iommu, drhd) {
3018                 if (drhd->ignored) {
3019                         /*
3020                          * we always have to disable PMRs or DMA may fail on
3021                          * this device
3022                          */
3023                         if (force_on)
3024                                 iommu_disable_protect_mem_regions(iommu);
3025                         continue;
3026                 }
3027
3028                 iommu_flush_write_buffer(iommu);
3029
3030 #ifdef CONFIG_INTEL_IOMMU_SVM
3031                 if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
3032                         /*
3033                          * Call dmar_alloc_hwirq() with dmar_global_lock held,
3034                          * could cause possible lock race condition.
3035                          */
3036                         up_write(&dmar_global_lock);
3037                         ret = intel_svm_enable_prq(iommu);
3038                         down_write(&dmar_global_lock);
3039                         if (ret)
3040                                 goto free_iommu;
3041                 }
3042 #endif
3043                 ret = dmar_set_interrupt(iommu);
3044                 if (ret)
3045                         goto free_iommu;
3046         }
3047
3048         return 0;
3049
3050 free_iommu:
3051         for_each_active_iommu(iommu, drhd) {
3052                 disable_dmar_iommu(iommu);
3053                 free_dmar_iommu(iommu);
3054         }
3055
3056         return ret;
3057 }
3058
3059 static void __init init_no_remapping_devices(void)
3060 {
3061         struct dmar_drhd_unit *drhd;
3062         struct device *dev;
3063         int i;
3064
3065         for_each_drhd_unit(drhd) {
3066                 if (!drhd->include_all) {
3067                         for_each_active_dev_scope(drhd->devices,
3068                                                   drhd->devices_cnt, i, dev)
3069                                 break;
3070                         /* ignore DMAR unit if no devices exist */
3071                         if (i == drhd->devices_cnt)
3072                                 drhd->ignored = 1;
3073                 }
3074         }
3075
3076         for_each_active_drhd_unit(drhd) {
3077                 if (drhd->include_all)
3078                         continue;
3079
3080                 for_each_active_dev_scope(drhd->devices,
3081                                           drhd->devices_cnt, i, dev)
3082                         if (!dev_is_pci(dev) || !IS_GFX_DEVICE(to_pci_dev(dev)))
3083                                 break;
3084                 if (i < drhd->devices_cnt)
3085                         continue;
3086
3087                 /* This IOMMU has *only* gfx devices. Either bypass it or
3088                    set the gfx_mapped flag, as appropriate */
3089                 drhd->gfx_dedicated = 1;
3090                 if (!dmar_map_gfx)
3091                         drhd->ignored = 1;
3092         }
3093 }
3094
3095 #ifdef CONFIG_SUSPEND
3096 static int init_iommu_hw(void)
3097 {
3098         struct dmar_drhd_unit *drhd;
3099         struct intel_iommu *iommu = NULL;
3100
3101         for_each_active_iommu(iommu, drhd)
3102                 if (iommu->qi)
3103                         dmar_reenable_qi(iommu);
3104
3105         for_each_iommu(iommu, drhd) {
3106                 if (drhd->ignored) {
3107                         /*
3108                          * we always have to disable PMRs or DMA may fail on
3109                          * this device
3110                          */
3111                         if (force_on)
3112                                 iommu_disable_protect_mem_regions(iommu);
3113                         continue;
3114                 }
3115
3116                 iommu_flush_write_buffer(iommu);
3117                 iommu_set_root_entry(iommu);
3118                 iommu_enable_translation(iommu);
3119                 iommu_disable_protect_mem_regions(iommu);
3120         }
3121
3122         return 0;
3123 }
3124
3125 static void iommu_flush_all(void)
3126 {
3127         struct dmar_drhd_unit *drhd;
3128         struct intel_iommu *iommu;
3129
3130         for_each_active_iommu(iommu, drhd) {
3131                 iommu->flush.flush_context(iommu, 0, 0, 0,
3132                                            DMA_CCMD_GLOBAL_INVL);
3133                 iommu->flush.flush_iotlb(iommu, 0, 0, 0,
3134                                          DMA_TLB_GLOBAL_FLUSH);
3135         }
3136 }
3137
3138 static int iommu_suspend(void)
3139 {
3140         struct dmar_drhd_unit *drhd;
3141         struct intel_iommu *iommu = NULL;
3142         unsigned long flag;
3143
3144         for_each_active_iommu(iommu, drhd) {
3145                 iommu->iommu_state = kcalloc(MAX_SR_DMAR_REGS, sizeof(u32),
3146                                              GFP_KERNEL);
3147                 if (!iommu->iommu_state)
3148                         goto nomem;
3149         }
3150
3151         iommu_flush_all();
3152
3153         for_each_active_iommu(iommu, drhd) {
3154                 iommu_disable_translation(iommu);
3155
3156                 raw_spin_lock_irqsave(&iommu->register_lock, flag);
3157
3158                 iommu->iommu_state[SR_DMAR_FECTL_REG] =
3159                         readl(iommu->reg + DMAR_FECTL_REG);
3160                 iommu->iommu_state[SR_DMAR_FEDATA_REG] =
3161                         readl(iommu->reg + DMAR_FEDATA_REG);
3162                 iommu->iommu_state[SR_DMAR_FEADDR_REG] =
3163                         readl(iommu->reg + DMAR_FEADDR_REG);
3164                 iommu->iommu_state[SR_DMAR_FEUADDR_REG] =
3165                         readl(iommu->reg + DMAR_FEUADDR_REG);
3166
3167                 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
3168         }
3169         return 0;
3170
3171 nomem:
3172         for_each_active_iommu(iommu, drhd)
3173                 kfree(iommu->iommu_state);
3174
3175         return -ENOMEM;
3176 }
3177
3178 static void iommu_resume(void)
3179 {
3180         struct dmar_drhd_unit *drhd;
3181         struct intel_iommu *iommu = NULL;
3182         unsigned long flag;
3183
3184         if (init_iommu_hw()) {
3185                 if (force_on)
3186                         panic("tboot: IOMMU setup failed, DMAR can not resume!\n");
3187                 else
3188                         WARN(1, "IOMMU setup failed, DMAR can not resume!\n");
3189                 return;
3190         }
3191
3192         for_each_active_iommu(iommu, drhd) {
3193
3194                 raw_spin_lock_irqsave(&iommu->register_lock, flag);
3195
3196                 writel(iommu->iommu_state[SR_DMAR_FECTL_REG],
3197                         iommu->reg + DMAR_FECTL_REG);
3198                 writel(iommu->iommu_state[SR_DMAR_FEDATA_REG],
3199                         iommu->reg + DMAR_FEDATA_REG);
3200                 writel(iommu->iommu_state[SR_DMAR_FEADDR_REG],
3201                         iommu->reg + DMAR_FEADDR_REG);
3202                 writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG],
3203                         iommu->reg + DMAR_FEUADDR_REG);
3204
3205                 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
3206         }
3207
3208         for_each_active_iommu(iommu, drhd)
3209                 kfree(iommu->iommu_state);
3210 }
3211
3212 static struct syscore_ops iommu_syscore_ops = {
3213         .resume         = iommu_resume,
3214         .suspend        = iommu_suspend,
3215 };
3216
3217 static void __init init_iommu_pm_ops(void)
3218 {
3219         register_syscore_ops(&iommu_syscore_ops);
3220 }
3221
3222 #else
3223 static inline void init_iommu_pm_ops(void) {}
3224 #endif  /* CONFIG_PM */
3225
3226 static int __init rmrr_sanity_check(struct acpi_dmar_reserved_memory *rmrr)
3227 {
3228         if (!IS_ALIGNED(rmrr->base_address, PAGE_SIZE) ||
3229             !IS_ALIGNED(rmrr->end_address + 1, PAGE_SIZE) ||
3230             rmrr->end_address <= rmrr->base_address ||
3231             arch_rmrr_sanity_check(rmrr))
3232                 return -EINVAL;
3233
3234         return 0;
3235 }
3236
3237 int __init dmar_parse_one_rmrr(struct acpi_dmar_header *header, void *arg)
3238 {
3239         struct acpi_dmar_reserved_memory *rmrr;
3240         struct dmar_rmrr_unit *rmrru;
3241
3242         rmrr = (struct acpi_dmar_reserved_memory *)header;
3243         if (rmrr_sanity_check(rmrr)) {
3244                 pr_warn(FW_BUG
3245                            "Your BIOS is broken; bad RMRR [%#018Lx-%#018Lx]\n"
3246                            "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
3247                            rmrr->base_address, rmrr->end_address,
3248                            dmi_get_system_info(DMI_BIOS_VENDOR),
3249                            dmi_get_system_info(DMI_BIOS_VERSION),
3250                            dmi_get_system_info(DMI_PRODUCT_VERSION));
3251                 add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
3252         }
3253
3254         rmrru = kzalloc(sizeof(*rmrru), GFP_KERNEL);
3255         if (!rmrru)
3256                 goto out;
3257
3258         rmrru->hdr = header;
3259
3260         rmrru->base_address = rmrr->base_address;
3261         rmrru->end_address = rmrr->end_address;
3262
3263         rmrru->devices = dmar_alloc_dev_scope((void *)(rmrr + 1),
3264                                 ((void *)rmrr) + rmrr->header.length,
3265                                 &rmrru->devices_cnt);
3266         if (rmrru->devices_cnt && rmrru->devices == NULL)
3267                 goto free_rmrru;
3268
3269         list_add(&rmrru->list, &dmar_rmrr_units);
3270
3271         return 0;
3272 free_rmrru:
3273         kfree(rmrru);
3274 out:
3275         return -ENOMEM;
3276 }
3277
3278 static struct dmar_atsr_unit *dmar_find_atsr(struct acpi_dmar_atsr *atsr)
3279 {
3280         struct dmar_atsr_unit *atsru;
3281         struct acpi_dmar_atsr *tmp;
3282
3283         list_for_each_entry_rcu(atsru, &dmar_atsr_units, list,
3284                                 dmar_rcu_check()) {
3285                 tmp = (struct acpi_dmar_atsr *)atsru->hdr;
3286                 if (atsr->segment != tmp->segment)
3287                         continue;
3288                 if (atsr->header.length != tmp->header.length)
3289                         continue;
3290                 if (memcmp(atsr, tmp, atsr->header.length) == 0)
3291                         return atsru;
3292         }
3293
3294         return NULL;
3295 }
3296
3297 int dmar_parse_one_atsr(struct acpi_dmar_header *hdr, void *arg)
3298 {
3299         struct acpi_dmar_atsr *atsr;
3300         struct dmar_atsr_unit *atsru;
3301
3302         if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled)
3303                 return 0;
3304
3305         atsr = container_of(hdr, struct acpi_dmar_atsr, header);
3306         atsru = dmar_find_atsr(atsr);
3307         if (atsru)
3308                 return 0;
3309
3310         atsru = kzalloc(sizeof(*atsru) + hdr->length, GFP_KERNEL);
3311         if (!atsru)
3312                 return -ENOMEM;
3313
3314         /*
3315          * If memory is allocated from slab by ACPI _DSM method, we need to
3316          * copy the memory content because the memory buffer will be freed
3317          * on return.
3318          */
3319         atsru->hdr = (void *)(atsru + 1);
3320         memcpy(atsru->hdr, hdr, hdr->length);
3321         atsru->include_all = atsr->flags & 0x1;
3322         if (!atsru->include_all) {
3323                 atsru->devices = dmar_alloc_dev_scope((void *)(atsr + 1),
3324                                 (void *)atsr + atsr->header.length,
3325                                 &atsru->devices_cnt);
3326                 if (atsru->devices_cnt && atsru->devices == NULL) {
3327                         kfree(atsru);
3328                         return -ENOMEM;
3329                 }
3330         }
3331
3332         list_add_rcu(&atsru->list, &dmar_atsr_units);
3333
3334         return 0;
3335 }
3336
3337 static void intel_iommu_free_atsr(struct dmar_atsr_unit *atsru)
3338 {
3339         dmar_free_dev_scope(&atsru->devices, &atsru->devices_cnt);
3340         kfree(atsru);
3341 }
3342
3343 int dmar_release_one_atsr(struct acpi_dmar_header *hdr, void *arg)
3344 {
3345         struct acpi_dmar_atsr *atsr;
3346         struct dmar_atsr_unit *atsru;
3347
3348         atsr = container_of(hdr, struct acpi_dmar_atsr, header);
3349         atsru = dmar_find_atsr(atsr);
3350         if (atsru) {
3351                 list_del_rcu(&atsru->list);
3352                 synchronize_rcu();
3353                 intel_iommu_free_atsr(atsru);
3354         }
3355
3356         return 0;
3357 }
3358
3359 int dmar_check_one_atsr(struct acpi_dmar_header *hdr, void *arg)
3360 {
3361         int i;
3362         struct device *dev;
3363         struct acpi_dmar_atsr *atsr;
3364         struct dmar_atsr_unit *atsru;
3365
3366         atsr = container_of(hdr, struct acpi_dmar_atsr, header);
3367         atsru = dmar_find_atsr(atsr);
3368         if (!atsru)
3369                 return 0;
3370
3371         if (!atsru->include_all && atsru->devices && atsru->devices_cnt) {
3372                 for_each_active_dev_scope(atsru->devices, atsru->devices_cnt,
3373                                           i, dev)
3374                         return -EBUSY;
3375         }
3376
3377         return 0;
3378 }
3379
3380 static struct dmar_satc_unit *dmar_find_satc(struct acpi_dmar_satc *satc)
3381 {
3382         struct dmar_satc_unit *satcu;
3383         struct acpi_dmar_satc *tmp;
3384
3385         list_for_each_entry_rcu(satcu, &dmar_satc_units, list,
3386                                 dmar_rcu_check()) {
3387                 tmp = (struct acpi_dmar_satc *)satcu->hdr;
3388                 if (satc->segment != tmp->segment)
3389                         continue;
3390                 if (satc->header.length != tmp->header.length)
3391                         continue;
3392                 if (memcmp(satc, tmp, satc->header.length) == 0)
3393                         return satcu;
3394         }
3395
3396         return NULL;
3397 }
3398
3399 int dmar_parse_one_satc(struct acpi_dmar_header *hdr, void *arg)
3400 {
3401         struct acpi_dmar_satc *satc;
3402         struct dmar_satc_unit *satcu;
3403
3404         if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled)
3405                 return 0;
3406
3407         satc = container_of(hdr, struct acpi_dmar_satc, header);
3408         satcu = dmar_find_satc(satc);
3409         if (satcu)
3410                 return 0;
3411
3412         satcu = kzalloc(sizeof(*satcu) + hdr->length, GFP_KERNEL);
3413         if (!satcu)
3414                 return -ENOMEM;
3415
3416         satcu->hdr = (void *)(satcu + 1);
3417         memcpy(satcu->hdr, hdr, hdr->length);
3418         satcu->atc_required = satc->flags & 0x1;
3419         satcu->devices = dmar_alloc_dev_scope((void *)(satc + 1),
3420                                               (void *)satc + satc->header.length,
3421                                               &satcu->devices_cnt);
3422         if (satcu->devices_cnt && !satcu->devices) {
3423                 kfree(satcu);
3424                 return -ENOMEM;
3425         }
3426         list_add_rcu(&satcu->list, &dmar_satc_units);
3427
3428         return 0;
3429 }
3430
3431 static int intel_iommu_add(struct dmar_drhd_unit *dmaru)
3432 {
3433         int sp, ret;
3434         struct intel_iommu *iommu = dmaru->iommu;
3435
3436         ret = intel_cap_audit(CAP_AUDIT_HOTPLUG_DMAR, iommu);
3437         if (ret)
3438                 goto out;
3439
3440         if (hw_pass_through && !ecap_pass_through(iommu->ecap)) {
3441                 pr_warn("%s: Doesn't support hardware pass through.\n",
3442                         iommu->name);
3443                 return -ENXIO;
3444         }
3445
3446         sp = domain_update_iommu_superpage(NULL, iommu) - 1;
3447         if (sp >= 0 && !(cap_super_page_val(iommu->cap) & (1 << sp))) {
3448                 pr_warn("%s: Doesn't support large page.\n",
3449                         iommu->name);
3450                 return -ENXIO;
3451         }
3452
3453         /*
3454          * Disable translation if already enabled prior to OS handover.
3455          */
3456         if (iommu->gcmd & DMA_GCMD_TE)
3457                 iommu_disable_translation(iommu);
3458
3459         ret = iommu_init_domains(iommu);
3460         if (ret == 0)
3461                 ret = iommu_alloc_root_entry(iommu);
3462         if (ret)
3463                 goto out;
3464
3465         intel_svm_check(iommu);
3466
3467         if (dmaru->ignored) {
3468                 /*
3469                  * we always have to disable PMRs or DMA may fail on this device
3470                  */
3471                 if (force_on)
3472                         iommu_disable_protect_mem_regions(iommu);
3473                 return 0;
3474         }
3475
3476         intel_iommu_init_qi(iommu);
3477         iommu_flush_write_buffer(iommu);
3478
3479 #ifdef CONFIG_INTEL_IOMMU_SVM
3480         if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
3481                 ret = intel_svm_enable_prq(iommu);
3482                 if (ret)
3483                         goto disable_iommu;
3484         }
3485 #endif
3486         ret = dmar_set_interrupt(iommu);
3487         if (ret)
3488                 goto disable_iommu;
3489
3490         iommu_set_root_entry(iommu);
3491         iommu_enable_translation(iommu);
3492
3493         iommu_disable_protect_mem_regions(iommu);
3494         return 0;
3495
3496 disable_iommu:
3497         disable_dmar_iommu(iommu);
3498 out:
3499         free_dmar_iommu(iommu);
3500         return ret;
3501 }
3502
3503 int dmar_iommu_hotplug(struct dmar_drhd_unit *dmaru, bool insert)
3504 {
3505         int ret = 0;
3506         struct intel_iommu *iommu = dmaru->iommu;
3507
3508         if (!intel_iommu_enabled)
3509                 return 0;
3510         if (iommu == NULL)
3511                 return -EINVAL;
3512
3513         if (insert) {
3514                 ret = intel_iommu_add(dmaru);
3515         } else {
3516                 disable_dmar_iommu(iommu);
3517                 free_dmar_iommu(iommu);
3518         }
3519
3520         return ret;
3521 }
3522
3523 static void intel_iommu_free_dmars(void)
3524 {
3525         struct dmar_rmrr_unit *rmrru, *rmrr_n;
3526         struct dmar_atsr_unit *atsru, *atsr_n;
3527         struct dmar_satc_unit *satcu, *satc_n;
3528
3529         list_for_each_entry_safe(rmrru, rmrr_n, &dmar_rmrr_units, list) {
3530                 list_del(&rmrru->list);
3531                 dmar_free_dev_scope(&rmrru->devices, &rmrru->devices_cnt);
3532                 kfree(rmrru);
3533         }
3534
3535         list_for_each_entry_safe(atsru, atsr_n, &dmar_atsr_units, list) {
3536                 list_del(&atsru->list);
3537                 intel_iommu_free_atsr(atsru);
3538         }
3539         list_for_each_entry_safe(satcu, satc_n, &dmar_satc_units, list) {
3540                 list_del(&satcu->list);
3541                 dmar_free_dev_scope(&satcu->devices, &satcu->devices_cnt);
3542                 kfree(satcu);
3543         }
3544 }
3545
3546 static struct dmar_satc_unit *dmar_find_matched_satc_unit(struct pci_dev *dev)
3547 {
3548         struct dmar_satc_unit *satcu;
3549         struct acpi_dmar_satc *satc;
3550         struct device *tmp;
3551         int i;
3552
3553         dev = pci_physfn(dev);
3554         rcu_read_lock();
3555
3556         list_for_each_entry_rcu(satcu, &dmar_satc_units, list) {
3557                 satc = container_of(satcu->hdr, struct acpi_dmar_satc, header);
3558                 if (satc->segment != pci_domain_nr(dev->bus))
3559                         continue;
3560                 for_each_dev_scope(satcu->devices, satcu->devices_cnt, i, tmp)
3561                         if (to_pci_dev(tmp) == dev)
3562                                 goto out;
3563         }
3564         satcu = NULL;
3565 out:
3566         rcu_read_unlock();
3567         return satcu;
3568 }
3569
3570 static int dmar_ats_supported(struct pci_dev *dev, struct intel_iommu *iommu)
3571 {
3572         int i, ret = 1;
3573         struct pci_bus *bus;
3574         struct pci_dev *bridge = NULL;
3575         struct device *tmp;
3576         struct acpi_dmar_atsr *atsr;
3577         struct dmar_atsr_unit *atsru;
3578         struct dmar_satc_unit *satcu;
3579
3580         dev = pci_physfn(dev);
3581         satcu = dmar_find_matched_satc_unit(dev);
3582         if (satcu)
3583                 /*
3584                  * This device supports ATS as it is in SATC table.
3585                  * When IOMMU is in legacy mode, enabling ATS is done
3586                  * automatically by HW for the device that requires
3587                  * ATS, hence OS should not enable this device ATS
3588                  * to avoid duplicated TLB invalidation.
3589                  */
3590                 return !(satcu->atc_required && !sm_supported(iommu));
3591
3592         for (bus = dev->bus; bus; bus = bus->parent) {
3593                 bridge = bus->self;
3594                 /* If it's an integrated device, allow ATS */
3595                 if (!bridge)
3596                         return 1;
3597                 /* Connected via non-PCIe: no ATS */
3598                 if (!pci_is_pcie(bridge) ||
3599                     pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE)
3600                         return 0;
3601                 /* If we found the root port, look it up in the ATSR */
3602                 if (pci_pcie_type(bridge) == PCI_EXP_TYPE_ROOT_PORT)
3603                         break;
3604         }
3605
3606         rcu_read_lock();
3607         list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) {
3608                 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
3609                 if (atsr->segment != pci_domain_nr(dev->bus))
3610                         continue;
3611
3612                 for_each_dev_scope(atsru->devices, atsru->devices_cnt, i, tmp)
3613                         if (tmp == &bridge->dev)
3614                                 goto out;
3615
3616                 if (atsru->include_all)
3617                         goto out;
3618         }
3619         ret = 0;
3620 out:
3621         rcu_read_unlock();
3622
3623         return ret;
3624 }
3625
3626 int dmar_iommu_notify_scope_dev(struct dmar_pci_notify_info *info)
3627 {
3628         int ret;
3629         struct dmar_rmrr_unit *rmrru;
3630         struct dmar_atsr_unit *atsru;
3631         struct dmar_satc_unit *satcu;
3632         struct acpi_dmar_atsr *atsr;
3633         struct acpi_dmar_reserved_memory *rmrr;
3634         struct acpi_dmar_satc *satc;
3635
3636         if (!intel_iommu_enabled && system_state >= SYSTEM_RUNNING)
3637                 return 0;
3638
3639         list_for_each_entry(rmrru, &dmar_rmrr_units, list) {
3640                 rmrr = container_of(rmrru->hdr,
3641                                     struct acpi_dmar_reserved_memory, header);
3642                 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
3643                         ret = dmar_insert_dev_scope(info, (void *)(rmrr + 1),
3644                                 ((void *)rmrr) + rmrr->header.length,
3645                                 rmrr->segment, rmrru->devices,
3646                                 rmrru->devices_cnt);
3647                         if (ret < 0)
3648                                 return ret;
3649                 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
3650                         dmar_remove_dev_scope(info, rmrr->segment,
3651                                 rmrru->devices, rmrru->devices_cnt);
3652                 }
3653         }
3654
3655         list_for_each_entry(atsru, &dmar_atsr_units, list) {
3656                 if (atsru->include_all)
3657                         continue;
3658
3659                 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
3660                 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
3661                         ret = dmar_insert_dev_scope(info, (void *)(atsr + 1),
3662                                         (void *)atsr + atsr->header.length,
3663                                         atsr->segment, atsru->devices,
3664                                         atsru->devices_cnt);
3665                         if (ret > 0)
3666                                 break;
3667                         else if (ret < 0)
3668                                 return ret;
3669                 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
3670                         if (dmar_remove_dev_scope(info, atsr->segment,
3671                                         atsru->devices, atsru->devices_cnt))
3672                                 break;
3673                 }
3674         }
3675         list_for_each_entry(satcu, &dmar_satc_units, list) {
3676                 satc = container_of(satcu->hdr, struct acpi_dmar_satc, header);
3677                 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
3678                         ret = dmar_insert_dev_scope(info, (void *)(satc + 1),
3679                                         (void *)satc + satc->header.length,
3680                                         satc->segment, satcu->devices,
3681                                         satcu->devices_cnt);
3682                         if (ret > 0)
3683                                 break;
3684                         else if (ret < 0)
3685                                 return ret;
3686                 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
3687                         if (dmar_remove_dev_scope(info, satc->segment,
3688                                         satcu->devices, satcu->devices_cnt))
3689                                 break;
3690                 }
3691         }
3692
3693         return 0;
3694 }
3695
3696 static int intel_iommu_memory_notifier(struct notifier_block *nb,
3697                                        unsigned long val, void *v)
3698 {
3699         struct memory_notify *mhp = v;
3700         unsigned long start_vpfn = mm_to_dma_pfn(mhp->start_pfn);
3701         unsigned long last_vpfn = mm_to_dma_pfn(mhp->start_pfn +
3702                         mhp->nr_pages - 1);
3703
3704         switch (val) {
3705         case MEM_GOING_ONLINE:
3706                 if (iommu_domain_identity_map(si_domain,
3707                                               start_vpfn, last_vpfn)) {
3708                         pr_warn("Failed to build identity map for [%lx-%lx]\n",
3709                                 start_vpfn, last_vpfn);
3710                         return NOTIFY_BAD;
3711                 }
3712                 break;
3713
3714         case MEM_OFFLINE:
3715         case MEM_CANCEL_ONLINE:
3716                 {
3717                         struct dmar_drhd_unit *drhd;
3718                         struct intel_iommu *iommu;
3719                         LIST_HEAD(freelist);
3720
3721                         domain_unmap(si_domain, start_vpfn, last_vpfn, &freelist);
3722
3723                         rcu_read_lock();
3724                         for_each_active_iommu(iommu, drhd)
3725                                 iommu_flush_iotlb_psi(iommu, si_domain,
3726                                         start_vpfn, mhp->nr_pages,
3727                                         list_empty(&freelist), 0);
3728                         rcu_read_unlock();
3729                         put_pages_list(&freelist);
3730                 }
3731                 break;
3732         }
3733
3734         return NOTIFY_OK;
3735 }
3736
3737 static struct notifier_block intel_iommu_memory_nb = {
3738         .notifier_call = intel_iommu_memory_notifier,
3739         .priority = 0
3740 };
3741
3742 static void intel_disable_iommus(void)
3743 {
3744         struct intel_iommu *iommu = NULL;
3745         struct dmar_drhd_unit *drhd;
3746
3747         for_each_iommu(iommu, drhd)
3748                 iommu_disable_translation(iommu);
3749 }
3750
3751 void intel_iommu_shutdown(void)
3752 {
3753         struct dmar_drhd_unit *drhd;
3754         struct intel_iommu *iommu = NULL;
3755
3756         if (no_iommu || dmar_disabled)
3757                 return;
3758
3759         down_write(&dmar_global_lock);
3760
3761         /* Disable PMRs explicitly here. */
3762         for_each_iommu(iommu, drhd)
3763                 iommu_disable_protect_mem_regions(iommu);
3764
3765         /* Make sure the IOMMUs are switched off */
3766         intel_disable_iommus();
3767
3768         up_write(&dmar_global_lock);
3769 }
3770
3771 static inline struct intel_iommu *dev_to_intel_iommu(struct device *dev)
3772 {
3773         struct iommu_device *iommu_dev = dev_to_iommu_device(dev);
3774
3775         return container_of(iommu_dev, struct intel_iommu, iommu);
3776 }
3777
3778 static ssize_t version_show(struct device *dev,
3779                             struct device_attribute *attr, char *buf)
3780 {
3781         struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3782         u32 ver = readl(iommu->reg + DMAR_VER_REG);
3783         return sprintf(buf, "%d:%d\n",
3784                        DMAR_VER_MAJOR(ver), DMAR_VER_MINOR(ver));
3785 }
3786 static DEVICE_ATTR_RO(version);
3787
3788 static ssize_t address_show(struct device *dev,
3789                             struct device_attribute *attr, char *buf)
3790 {
3791         struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3792         return sprintf(buf, "%llx\n", iommu->reg_phys);
3793 }
3794 static DEVICE_ATTR_RO(address);
3795
3796 static ssize_t cap_show(struct device *dev,
3797                         struct device_attribute *attr, char *buf)
3798 {
3799         struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3800         return sprintf(buf, "%llx\n", iommu->cap);
3801 }
3802 static DEVICE_ATTR_RO(cap);
3803
3804 static ssize_t ecap_show(struct device *dev,
3805                          struct device_attribute *attr, char *buf)
3806 {
3807         struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3808         return sprintf(buf, "%llx\n", iommu->ecap);
3809 }
3810 static DEVICE_ATTR_RO(ecap);
3811
3812 static ssize_t domains_supported_show(struct device *dev,
3813                                       struct device_attribute *attr, char *buf)
3814 {
3815         struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3816         return sprintf(buf, "%ld\n", cap_ndoms(iommu->cap));
3817 }
3818 static DEVICE_ATTR_RO(domains_supported);
3819
3820 static ssize_t domains_used_show(struct device *dev,
3821                                  struct device_attribute *attr, char *buf)
3822 {
3823         struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3824         return sprintf(buf, "%d\n", bitmap_weight(iommu->domain_ids,
3825                                                   cap_ndoms(iommu->cap)));
3826 }
3827 static DEVICE_ATTR_RO(domains_used);
3828
3829 static struct attribute *intel_iommu_attrs[] = {
3830         &dev_attr_version.attr,
3831         &dev_attr_address.attr,
3832         &dev_attr_cap.attr,
3833         &dev_attr_ecap.attr,
3834         &dev_attr_domains_supported.attr,
3835         &dev_attr_domains_used.attr,
3836         NULL,
3837 };
3838
3839 static struct attribute_group intel_iommu_group = {
3840         .name = "intel-iommu",
3841         .attrs = intel_iommu_attrs,
3842 };
3843
3844 const struct attribute_group *intel_iommu_groups[] = {
3845         &intel_iommu_group,
3846         NULL,
3847 };
3848
3849 static inline bool has_external_pci(void)
3850 {
3851         struct pci_dev *pdev = NULL;
3852
3853         for_each_pci_dev(pdev)
3854                 if (pdev->external_facing)
3855                         return true;
3856
3857         return false;
3858 }
3859
3860 static int __init platform_optin_force_iommu(void)
3861 {
3862         if (!dmar_platform_optin() || no_platform_optin || !has_external_pci())
3863                 return 0;
3864
3865         if (no_iommu || dmar_disabled)
3866                 pr_info("Intel-IOMMU force enabled due to platform opt in\n");
3867
3868         /*
3869          * If Intel-IOMMU is disabled by default, we will apply identity
3870          * map for all devices except those marked as being untrusted.
3871          */
3872         if (dmar_disabled)
3873                 iommu_set_default_passthrough(false);
3874
3875         dmar_disabled = 0;
3876         no_iommu = 0;
3877
3878         return 1;
3879 }
3880
3881 static int __init probe_acpi_namespace_devices(void)
3882 {
3883         struct dmar_drhd_unit *drhd;
3884         /* To avoid a -Wunused-but-set-variable warning. */
3885         struct intel_iommu *iommu __maybe_unused;
3886         struct device *dev;
3887         int i, ret = 0;
3888
3889         for_each_active_iommu(iommu, drhd) {
3890                 for_each_active_dev_scope(drhd->devices,
3891                                           drhd->devices_cnt, i, dev) {
3892                         struct acpi_device_physical_node *pn;
3893                         struct iommu_group *group;
3894                         struct acpi_device *adev;
3895
3896                         if (dev->bus != &acpi_bus_type)
3897                                 continue;
3898
3899                         adev = to_acpi_device(dev);
3900                         mutex_lock(&adev->physical_node_lock);
3901                         list_for_each_entry(pn,
3902                                             &adev->physical_node_list, node) {
3903                                 group = iommu_group_get(pn->dev);
3904                                 if (group) {
3905                                         iommu_group_put(group);
3906                                         continue;
3907                                 }
3908
3909                                 ret = iommu_probe_device(pn->dev);
3910                                 if (ret)
3911                                         break;
3912                         }
3913                         mutex_unlock(&adev->physical_node_lock);
3914
3915                         if (ret)
3916                                 return ret;
3917                 }
3918         }
3919
3920         return 0;
3921 }
3922
3923 static __init int tboot_force_iommu(void)
3924 {
3925         if (!tboot_enabled())
3926                 return 0;
3927
3928         if (no_iommu || dmar_disabled)
3929                 pr_warn("Forcing Intel-IOMMU to enabled\n");
3930
3931         dmar_disabled = 0;
3932         no_iommu = 0;
3933
3934         return 1;
3935 }
3936
3937 int __init intel_iommu_init(void)
3938 {
3939         int ret = -ENODEV;
3940         struct dmar_drhd_unit *drhd;
3941         struct intel_iommu *iommu;
3942
3943         /*
3944          * Intel IOMMU is required for a TXT/tboot launch or platform
3945          * opt in, so enforce that.
3946          */
3947         force_on = (!intel_iommu_tboot_noforce && tboot_force_iommu()) ||
3948                     platform_optin_force_iommu();
3949
3950         down_write(&dmar_global_lock);
3951         if (dmar_table_init()) {
3952                 if (force_on)
3953                         panic("tboot: Failed to initialize DMAR table\n");
3954                 goto out_free_dmar;
3955         }
3956
3957         if (dmar_dev_scope_init() < 0) {
3958                 if (force_on)
3959                         panic("tboot: Failed to initialize DMAR device scope\n");
3960                 goto out_free_dmar;
3961         }
3962
3963         up_write(&dmar_global_lock);
3964
3965         /*
3966          * The bus notifier takes the dmar_global_lock, so lockdep will
3967          * complain later when we register it under the lock.
3968          */
3969         dmar_register_bus_notifier();
3970
3971         down_write(&dmar_global_lock);
3972
3973         if (!no_iommu)
3974                 intel_iommu_debugfs_init();
3975
3976         if (no_iommu || dmar_disabled) {
3977                 /*
3978                  * We exit the function here to ensure IOMMU's remapping and
3979                  * mempool aren't setup, which means that the IOMMU's PMRs
3980                  * won't be disabled via the call to init_dmars(). So disable
3981                  * it explicitly here. The PMRs were setup by tboot prior to
3982                  * calling SENTER, but the kernel is expected to reset/tear
3983                  * down the PMRs.
3984                  */
3985                 if (intel_iommu_tboot_noforce) {
3986                         for_each_iommu(iommu, drhd)
3987                                 iommu_disable_protect_mem_regions(iommu);
3988                 }
3989
3990                 /*
3991                  * Make sure the IOMMUs are switched off, even when we
3992                  * boot into a kexec kernel and the previous kernel left
3993                  * them enabled
3994                  */
3995                 intel_disable_iommus();
3996                 goto out_free_dmar;
3997         }
3998
3999         if (list_empty(&dmar_rmrr_units))
4000                 pr_info("No RMRR found\n");
4001
4002         if (list_empty(&dmar_atsr_units))
4003                 pr_info("No ATSR found\n");
4004
4005         if (list_empty(&dmar_satc_units))
4006                 pr_info("No SATC found\n");
4007
4008         init_no_remapping_devices();
4009
4010         ret = init_dmars();
4011         if (ret) {
4012                 if (force_on)
4013                         panic("tboot: Failed to initialize DMARs\n");
4014                 pr_err("Initialization failed\n");
4015                 goto out_free_dmar;
4016         }
4017         up_write(&dmar_global_lock);
4018
4019         init_iommu_pm_ops();
4020
4021         down_read(&dmar_global_lock);
4022         for_each_active_iommu(iommu, drhd) {
4023                 /*
4024                  * The flush queue implementation does not perform
4025                  * page-selective invalidations that are required for efficient
4026                  * TLB flushes in virtual environments.  The benefit of batching
4027                  * is likely to be much lower than the overhead of synchronizing
4028                  * the virtual and physical IOMMU page-tables.
4029                  */
4030                 if (cap_caching_mode(iommu->cap)) {
4031                         pr_info_once("IOMMU batching disallowed due to virtualization\n");
4032                         iommu_set_dma_strict();
4033                 }
4034                 iommu_device_sysfs_add(&iommu->iommu, NULL,
4035                                        intel_iommu_groups,
4036                                        "%s", iommu->name);
4037                 iommu_device_register(&iommu->iommu, &intel_iommu_ops, NULL);
4038         }
4039         up_read(&dmar_global_lock);
4040
4041         if (si_domain && !hw_pass_through)
4042                 register_memory_notifier(&intel_iommu_memory_nb);
4043
4044         down_read(&dmar_global_lock);
4045         if (probe_acpi_namespace_devices())
4046                 pr_warn("ACPI name space devices didn't probe correctly\n");
4047
4048         /* Finally, we enable the DMA remapping hardware. */
4049         for_each_iommu(iommu, drhd) {
4050                 if (!drhd->ignored && !translation_pre_enabled(iommu))
4051                         iommu_enable_translation(iommu);
4052
4053                 iommu_disable_protect_mem_regions(iommu);
4054         }
4055         up_read(&dmar_global_lock);
4056
4057         pr_info("Intel(R) Virtualization Technology for Directed I/O\n");
4058
4059         intel_iommu_enabled = 1;
4060
4061         return 0;
4062
4063 out_free_dmar:
4064         intel_iommu_free_dmars();
4065         up_write(&dmar_global_lock);
4066         return ret;
4067 }
4068
4069 static int domain_context_clear_one_cb(struct pci_dev *pdev, u16 alias, void *opaque)
4070 {
4071         struct device_domain_info *info = opaque;
4072
4073         domain_context_clear_one(info, PCI_BUS_NUM(alias), alias & 0xff);
4074         return 0;
4075 }
4076
4077 /*
4078  * NB - intel-iommu lacks any sort of reference counting for the users of
4079  * dependent devices.  If multiple endpoints have intersecting dependent
4080  * devices, unbinding the driver from any one of them will possibly leave
4081  * the others unable to operate.
4082  */
4083 static void domain_context_clear(struct device_domain_info *info)
4084 {
4085         if (!info->iommu || !info->dev || !dev_is_pci(info->dev))
4086                 return;
4087
4088         pci_for_each_dma_alias(to_pci_dev(info->dev),
4089                                &domain_context_clear_one_cb, info);
4090 }
4091
4092 static void dmar_remove_one_dev_info(struct device *dev)
4093 {
4094         struct device_domain_info *info = dev_iommu_priv_get(dev);
4095         struct dmar_domain *domain = info->domain;
4096         struct intel_iommu *iommu = info->iommu;
4097         unsigned long flags;
4098
4099         if (!dev_is_real_dma_subdevice(info->dev)) {
4100                 if (dev_is_pci(info->dev) && sm_supported(iommu))
4101                         intel_pasid_tear_down_entry(iommu, info->dev,
4102                                         PASID_RID2PASID, false);
4103
4104                 iommu_disable_dev_iotlb(info);
4105                 domain_context_clear(info);
4106                 intel_pasid_free_table(info->dev);
4107         }
4108
4109         spin_lock_irqsave(&domain->lock, flags);
4110         list_del(&info->link);
4111         spin_unlock_irqrestore(&domain->lock, flags);
4112
4113         domain_detach_iommu(domain, iommu);
4114         info->domain = NULL;
4115 }
4116
4117 static int md_domain_init(struct dmar_domain *domain, int guest_width)
4118 {
4119         int adjust_width;
4120
4121         /* calculate AGAW */
4122         domain->gaw = guest_width;
4123         adjust_width = guestwidth_to_adjustwidth(guest_width);
4124         domain->agaw = width_to_agaw(adjust_width);
4125
4126         domain->iommu_coherency = false;
4127         domain->iommu_superpage = 0;
4128         domain->max_addr = 0;
4129
4130         /* always allocate the top pgd */
4131         domain->pgd = alloc_pgtable_page(domain->nid);
4132         if (!domain->pgd)
4133                 return -ENOMEM;
4134         domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
4135         return 0;
4136 }
4137
4138 static struct iommu_domain *intel_iommu_domain_alloc(unsigned type)
4139 {
4140         struct dmar_domain *dmar_domain;
4141         struct iommu_domain *domain;
4142
4143         switch (type) {
4144         case IOMMU_DOMAIN_DMA:
4145         case IOMMU_DOMAIN_DMA_FQ:
4146         case IOMMU_DOMAIN_UNMANAGED:
4147                 dmar_domain = alloc_domain(type);
4148                 if (!dmar_domain) {
4149                         pr_err("Can't allocate dmar_domain\n");
4150                         return NULL;
4151                 }
4152                 if (md_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
4153                         pr_err("Domain initialization failed\n");
4154                         domain_exit(dmar_domain);
4155                         return NULL;
4156                 }
4157
4158                 domain = &dmar_domain->domain;
4159                 domain->geometry.aperture_start = 0;
4160                 domain->geometry.aperture_end   =
4161                                 __DOMAIN_MAX_ADDR(dmar_domain->gaw);
4162                 domain->geometry.force_aperture = true;
4163
4164                 return domain;
4165         case IOMMU_DOMAIN_IDENTITY:
4166                 return &si_domain->domain;
4167         default:
4168                 return NULL;
4169         }
4170
4171         return NULL;
4172 }
4173
4174 static void intel_iommu_domain_free(struct iommu_domain *domain)
4175 {
4176         if (domain != &si_domain->domain)
4177                 domain_exit(to_dmar_domain(domain));
4178 }
4179
4180 static int prepare_domain_attach_device(struct iommu_domain *domain,
4181                                         struct device *dev)
4182 {
4183         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4184         struct intel_iommu *iommu;
4185         int addr_width;
4186
4187         iommu = device_to_iommu(dev, NULL, NULL);
4188         if (!iommu)
4189                 return -ENODEV;
4190
4191         if (dmar_domain->force_snooping && !ecap_sc_support(iommu->ecap))
4192                 return -EOPNOTSUPP;
4193
4194         /* check if this iommu agaw is sufficient for max mapped address */
4195         addr_width = agaw_to_width(iommu->agaw);
4196         if (addr_width > cap_mgaw(iommu->cap))
4197                 addr_width = cap_mgaw(iommu->cap);
4198
4199         if (dmar_domain->max_addr > (1LL << addr_width)) {
4200                 dev_err(dev, "%s: iommu width (%d) is not "
4201                         "sufficient for the mapped address (%llx)\n",
4202                         __func__, addr_width, dmar_domain->max_addr);
4203                 return -EFAULT;
4204         }
4205         dmar_domain->gaw = addr_width;
4206
4207         /*
4208          * Knock out extra levels of page tables if necessary
4209          */
4210         while (iommu->agaw < dmar_domain->agaw) {
4211                 struct dma_pte *pte;
4212
4213                 pte = dmar_domain->pgd;
4214                 if (dma_pte_present(pte)) {
4215                         dmar_domain->pgd = phys_to_virt(dma_pte_addr(pte));
4216                         free_pgtable_page(pte);
4217                 }
4218                 dmar_domain->agaw--;
4219         }
4220
4221         return 0;
4222 }
4223
4224 static int intel_iommu_attach_device(struct iommu_domain *domain,
4225                                      struct device *dev)
4226 {
4227         int ret;
4228
4229         if (domain->type == IOMMU_DOMAIN_UNMANAGED &&
4230             device_is_rmrr_locked(dev)) {
4231                 dev_warn(dev, "Device is ineligible for IOMMU domain attach due to platform RMRR requirement.  Contact your platform vendor.\n");
4232                 return -EPERM;
4233         }
4234
4235         /* normally dev is not mapped */
4236         if (unlikely(domain_context_mapped(dev))) {
4237                 struct device_domain_info *info = dev_iommu_priv_get(dev);
4238
4239                 if (info->domain)
4240                         dmar_remove_one_dev_info(dev);
4241         }
4242
4243         ret = prepare_domain_attach_device(domain, dev);
4244         if (ret)
4245                 return ret;
4246
4247         return domain_add_dev_info(to_dmar_domain(domain), dev);
4248 }
4249
4250 static void intel_iommu_detach_device(struct iommu_domain *domain,
4251                                       struct device *dev)
4252 {
4253         dmar_remove_one_dev_info(dev);
4254 }
4255
4256 static int intel_iommu_map(struct iommu_domain *domain,
4257                            unsigned long iova, phys_addr_t hpa,
4258                            size_t size, int iommu_prot, gfp_t gfp)
4259 {
4260         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4261         u64 max_addr;
4262         int prot = 0;
4263
4264         if (iommu_prot & IOMMU_READ)
4265                 prot |= DMA_PTE_READ;
4266         if (iommu_prot & IOMMU_WRITE)
4267                 prot |= DMA_PTE_WRITE;
4268         if (dmar_domain->set_pte_snp)
4269                 prot |= DMA_PTE_SNP;
4270
4271         max_addr = iova + size;
4272         if (dmar_domain->max_addr < max_addr) {
4273                 u64 end;
4274
4275                 /* check if minimum agaw is sufficient for mapped address */
4276                 end = __DOMAIN_MAX_ADDR(dmar_domain->gaw) + 1;
4277                 if (end < max_addr) {
4278                         pr_err("%s: iommu width (%d) is not "
4279                                "sufficient for the mapped address (%llx)\n",
4280                                __func__, dmar_domain->gaw, max_addr);
4281                         return -EFAULT;
4282                 }
4283                 dmar_domain->max_addr = max_addr;
4284         }
4285         /* Round up size to next multiple of PAGE_SIZE, if it and
4286            the low bits of hpa would take us onto the next page */
4287         size = aligned_nrpages(hpa, size);
4288         return __domain_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT,
4289                                 hpa >> VTD_PAGE_SHIFT, size, prot);
4290 }
4291
4292 static int intel_iommu_map_pages(struct iommu_domain *domain,
4293                                  unsigned long iova, phys_addr_t paddr,
4294                                  size_t pgsize, size_t pgcount,
4295                                  int prot, gfp_t gfp, size_t *mapped)
4296 {
4297         unsigned long pgshift = __ffs(pgsize);
4298         size_t size = pgcount << pgshift;
4299         int ret;
4300
4301         if (pgsize != SZ_4K && pgsize != SZ_2M && pgsize != SZ_1G)
4302                 return -EINVAL;
4303
4304         if (!IS_ALIGNED(iova | paddr, pgsize))
4305                 return -EINVAL;
4306
4307         ret = intel_iommu_map(domain, iova, paddr, size, prot, gfp);
4308         if (!ret && mapped)
4309                 *mapped = size;
4310
4311         return ret;
4312 }
4313
4314 static size_t intel_iommu_unmap(struct iommu_domain *domain,
4315                                 unsigned long iova, size_t size,
4316                                 struct iommu_iotlb_gather *gather)
4317 {
4318         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4319         unsigned long start_pfn, last_pfn;
4320         int level = 0;
4321
4322         /* Cope with horrid API which requires us to unmap more than the
4323            size argument if it happens to be a large-page mapping. */
4324         BUG_ON(!pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level));
4325
4326         if (size < VTD_PAGE_SIZE << level_to_offset_bits(level))
4327                 size = VTD_PAGE_SIZE << level_to_offset_bits(level);
4328
4329         start_pfn = iova >> VTD_PAGE_SHIFT;
4330         last_pfn = (iova + size - 1) >> VTD_PAGE_SHIFT;
4331
4332         domain_unmap(dmar_domain, start_pfn, last_pfn, &gather->freelist);
4333
4334         if (dmar_domain->max_addr == iova + size)
4335                 dmar_domain->max_addr = iova;
4336
4337         iommu_iotlb_gather_add_page(domain, gather, iova, size);
4338
4339         return size;
4340 }
4341
4342 static size_t intel_iommu_unmap_pages(struct iommu_domain *domain,
4343                                       unsigned long iova,
4344                                       size_t pgsize, size_t pgcount,
4345                                       struct iommu_iotlb_gather *gather)
4346 {
4347         unsigned long pgshift = __ffs(pgsize);
4348         size_t size = pgcount << pgshift;
4349
4350         return intel_iommu_unmap(domain, iova, size, gather);
4351 }
4352
4353 static void intel_iommu_tlb_sync(struct iommu_domain *domain,
4354                                  struct iommu_iotlb_gather *gather)
4355 {
4356         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4357         unsigned long iova_pfn = IOVA_PFN(gather->start);
4358         size_t size = gather->end - gather->start;
4359         struct iommu_domain_info *info;
4360         unsigned long start_pfn;
4361         unsigned long nrpages;
4362         unsigned long i;
4363
4364         nrpages = aligned_nrpages(gather->start, size);
4365         start_pfn = mm_to_dma_pfn(iova_pfn);
4366
4367         xa_for_each(&dmar_domain->iommu_array, i, info)
4368                 iommu_flush_iotlb_psi(info->iommu, dmar_domain,
4369                                       start_pfn, nrpages,
4370                                       list_empty(&gather->freelist), 0);
4371
4372         put_pages_list(&gather->freelist);
4373 }
4374
4375 static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
4376                                             dma_addr_t iova)
4377 {
4378         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4379         struct dma_pte *pte;
4380         int level = 0;
4381         u64 phys = 0;
4382
4383         pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level);
4384         if (pte && dma_pte_present(pte))
4385                 phys = dma_pte_addr(pte) +
4386                         (iova & (BIT_MASK(level_to_offset_bits(level) +
4387                                                 VTD_PAGE_SHIFT) - 1));
4388
4389         return phys;
4390 }
4391
4392 static bool domain_support_force_snooping(struct dmar_domain *domain)
4393 {
4394         struct device_domain_info *info;
4395         bool support = true;
4396
4397         assert_spin_locked(&domain->lock);
4398         list_for_each_entry(info, &domain->devices, link) {
4399                 if (!ecap_sc_support(info->iommu->ecap)) {
4400                         support = false;
4401                         break;
4402                 }
4403         }
4404
4405         return support;
4406 }
4407
4408 static void domain_set_force_snooping(struct dmar_domain *domain)
4409 {
4410         struct device_domain_info *info;
4411
4412         assert_spin_locked(&domain->lock);
4413         /*
4414          * Second level page table supports per-PTE snoop control. The
4415          * iommu_map() interface will handle this by setting SNP bit.
4416          */
4417         if (!domain_use_first_level(domain)) {
4418                 domain->set_pte_snp = true;
4419                 return;
4420         }
4421
4422         list_for_each_entry(info, &domain->devices, link)
4423                 intel_pasid_setup_page_snoop_control(info->iommu, info->dev,
4424                                                      PASID_RID2PASID);
4425 }
4426
4427 static bool intel_iommu_enforce_cache_coherency(struct iommu_domain *domain)
4428 {
4429         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4430         unsigned long flags;
4431
4432         if (dmar_domain->force_snooping)
4433                 return true;
4434
4435         spin_lock_irqsave(&dmar_domain->lock, flags);
4436         if (!domain_support_force_snooping(dmar_domain)) {
4437                 spin_unlock_irqrestore(&dmar_domain->lock, flags);
4438                 return false;
4439         }
4440
4441         domain_set_force_snooping(dmar_domain);
4442         dmar_domain->force_snooping = true;
4443         spin_unlock_irqrestore(&dmar_domain->lock, flags);
4444
4445         return true;
4446 }
4447
4448 static bool intel_iommu_capable(struct device *dev, enum iommu_cap cap)
4449 {
4450         if (cap == IOMMU_CAP_CACHE_COHERENCY)
4451                 return true;
4452         if (cap == IOMMU_CAP_INTR_REMAP)
4453                 return irq_remapping_enabled == 1;
4454         if (cap == IOMMU_CAP_PRE_BOOT_PROTECTION)
4455                 return dmar_platform_optin();
4456
4457         return false;
4458 }
4459
4460 static struct iommu_device *intel_iommu_probe_device(struct device *dev)
4461 {
4462         struct pci_dev *pdev = dev_is_pci(dev) ? to_pci_dev(dev) : NULL;
4463         struct device_domain_info *info;
4464         struct intel_iommu *iommu;
4465         u8 bus, devfn;
4466
4467         iommu = device_to_iommu(dev, &bus, &devfn);
4468         if (!iommu || !iommu->iommu.ops)
4469                 return ERR_PTR(-ENODEV);
4470
4471         info = kzalloc(sizeof(*info), GFP_KERNEL);
4472         if (!info)
4473                 return ERR_PTR(-ENOMEM);
4474
4475         if (dev_is_real_dma_subdevice(dev)) {
4476                 info->bus = pdev->bus->number;
4477                 info->devfn = pdev->devfn;
4478                 info->segment = pci_domain_nr(pdev->bus);
4479         } else {
4480                 info->bus = bus;
4481                 info->devfn = devfn;
4482                 info->segment = iommu->segment;
4483         }
4484
4485         info->dev = dev;
4486         info->iommu = iommu;
4487         if (dev_is_pci(dev)) {
4488                 if (ecap_dev_iotlb_support(iommu->ecap) &&
4489                     pci_ats_supported(pdev) &&
4490                     dmar_ats_supported(pdev, iommu))
4491                         info->ats_supported = 1;
4492
4493                 if (sm_supported(iommu)) {
4494                         if (pasid_supported(iommu)) {
4495                                 int features = pci_pasid_features(pdev);
4496
4497                                 if (features >= 0)
4498                                         info->pasid_supported = features | 1;
4499                         }
4500
4501                         if (info->ats_supported && ecap_prs(iommu->ecap) &&
4502                             pci_pri_supported(pdev))
4503                                 info->pri_supported = 1;
4504                 }
4505         }
4506
4507         dev_iommu_priv_set(dev, info);
4508
4509         return &iommu->iommu;
4510 }
4511
4512 static void intel_iommu_release_device(struct device *dev)
4513 {
4514         struct device_domain_info *info = dev_iommu_priv_get(dev);
4515
4516         dmar_remove_one_dev_info(dev);
4517         dev_iommu_priv_set(dev, NULL);
4518         kfree(info);
4519         set_dma_ops(dev, NULL);
4520 }
4521
4522 static void intel_iommu_probe_finalize(struct device *dev)
4523 {
4524         set_dma_ops(dev, NULL);
4525         iommu_setup_dma_ops(dev, 0, U64_MAX);
4526 }
4527
4528 static void intel_iommu_get_resv_regions(struct device *device,
4529                                          struct list_head *head)
4530 {
4531         int prot = DMA_PTE_READ | DMA_PTE_WRITE;
4532         struct iommu_resv_region *reg;
4533         struct dmar_rmrr_unit *rmrr;
4534         struct device *i_dev;
4535         int i;
4536
4537         down_read(&dmar_global_lock);
4538         for_each_rmrr_units(rmrr) {
4539                 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
4540                                           i, i_dev) {
4541                         struct iommu_resv_region *resv;
4542                         enum iommu_resv_type type;
4543                         size_t length;
4544
4545                         if (i_dev != device &&
4546                             !is_downstream_to_pci_bridge(device, i_dev))
4547                                 continue;
4548
4549                         length = rmrr->end_address - rmrr->base_address + 1;
4550
4551                         type = device_rmrr_is_relaxable(device) ?
4552                                 IOMMU_RESV_DIRECT_RELAXABLE : IOMMU_RESV_DIRECT;
4553
4554                         resv = iommu_alloc_resv_region(rmrr->base_address,
4555                                                        length, prot, type);
4556                         if (!resv)
4557                                 break;
4558
4559                         list_add_tail(&resv->list, head);
4560                 }
4561         }
4562         up_read(&dmar_global_lock);
4563
4564 #ifdef CONFIG_INTEL_IOMMU_FLOPPY_WA
4565         if (dev_is_pci(device)) {
4566                 struct pci_dev *pdev = to_pci_dev(device);
4567
4568                 if ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA) {
4569                         reg = iommu_alloc_resv_region(0, 1UL << 24, prot,
4570                                                    IOMMU_RESV_DIRECT_RELAXABLE);
4571                         if (reg)
4572                                 list_add_tail(&reg->list, head);
4573                 }
4574         }
4575 #endif /* CONFIG_INTEL_IOMMU_FLOPPY_WA */
4576
4577         reg = iommu_alloc_resv_region(IOAPIC_RANGE_START,
4578                                       IOAPIC_RANGE_END - IOAPIC_RANGE_START + 1,
4579                                       0, IOMMU_RESV_MSI);
4580         if (!reg)
4581                 return;
4582         list_add_tail(&reg->list, head);
4583 }
4584
4585 static struct iommu_group *intel_iommu_device_group(struct device *dev)
4586 {
4587         if (dev_is_pci(dev))
4588                 return pci_device_group(dev);
4589         return generic_device_group(dev);
4590 }
4591
4592 static int intel_iommu_enable_sva(struct device *dev)
4593 {
4594         struct device_domain_info *info = dev_iommu_priv_get(dev);
4595         struct intel_iommu *iommu;
4596         int ret;
4597
4598         if (!info || dmar_disabled)
4599                 return -EINVAL;
4600
4601         iommu = info->iommu;
4602         if (!iommu)
4603                 return -EINVAL;
4604
4605         if (!(iommu->flags & VTD_FLAG_SVM_CAPABLE))
4606                 return -ENODEV;
4607
4608         if (!info->pasid_enabled || !info->pri_enabled || !info->ats_enabled)
4609                 return -EINVAL;
4610
4611         ret = iopf_queue_add_device(iommu->iopf_queue, dev);
4612         if (!ret)
4613                 ret = iommu_register_device_fault_handler(dev, iommu_queue_iopf, dev);
4614
4615         return ret;
4616 }
4617
4618 static int intel_iommu_disable_sva(struct device *dev)
4619 {
4620         struct device_domain_info *info = dev_iommu_priv_get(dev);
4621         struct intel_iommu *iommu = info->iommu;
4622         int ret;
4623
4624         ret = iommu_unregister_device_fault_handler(dev);
4625         if (!ret)
4626                 ret = iopf_queue_remove_device(iommu->iopf_queue, dev);
4627
4628         return ret;
4629 }
4630
4631 static int intel_iommu_enable_iopf(struct device *dev)
4632 {
4633         struct device_domain_info *info = dev_iommu_priv_get(dev);
4634
4635         if (info && info->pri_supported)
4636                 return 0;
4637
4638         return -ENODEV;
4639 }
4640
4641 static int
4642 intel_iommu_dev_enable_feat(struct device *dev, enum iommu_dev_features feat)
4643 {
4644         switch (feat) {
4645         case IOMMU_DEV_FEAT_IOPF:
4646                 return intel_iommu_enable_iopf(dev);
4647
4648         case IOMMU_DEV_FEAT_SVA:
4649                 return intel_iommu_enable_sva(dev);
4650
4651         default:
4652                 return -ENODEV;
4653         }
4654 }
4655
4656 static int
4657 intel_iommu_dev_disable_feat(struct device *dev, enum iommu_dev_features feat)
4658 {
4659         switch (feat) {
4660         case IOMMU_DEV_FEAT_IOPF:
4661                 return 0;
4662
4663         case IOMMU_DEV_FEAT_SVA:
4664                 return intel_iommu_disable_sva(dev);
4665
4666         default:
4667                 return -ENODEV;
4668         }
4669 }
4670
4671 static bool intel_iommu_is_attach_deferred(struct device *dev)
4672 {
4673         struct device_domain_info *info = dev_iommu_priv_get(dev);
4674
4675         return translation_pre_enabled(info->iommu) && !info->domain;
4676 }
4677
4678 /*
4679  * Check that the device does not live on an external facing PCI port that is
4680  * marked as untrusted. Such devices should not be able to apply quirks and
4681  * thus not be able to bypass the IOMMU restrictions.
4682  */
4683 static bool risky_device(struct pci_dev *pdev)
4684 {
4685         if (pdev->untrusted) {
4686                 pci_info(pdev,
4687                          "Skipping IOMMU quirk for dev [%04X:%04X] on untrusted PCI link\n",
4688                          pdev->vendor, pdev->device);
4689                 pci_info(pdev, "Please check with your BIOS/Platform vendor about this\n");
4690                 return true;
4691         }
4692         return false;
4693 }
4694
4695 static void intel_iommu_iotlb_sync_map(struct iommu_domain *domain,
4696                                        unsigned long iova, size_t size)
4697 {
4698         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4699         unsigned long pages = aligned_nrpages(iova, size);
4700         unsigned long pfn = iova >> VTD_PAGE_SHIFT;
4701         struct iommu_domain_info *info;
4702         unsigned long i;
4703
4704         xa_for_each(&dmar_domain->iommu_array, i, info)
4705                 __mapping_notify_one(info->iommu, dmar_domain, pfn, pages);
4706 }
4707
4708 const struct iommu_ops intel_iommu_ops = {
4709         .capable                = intel_iommu_capable,
4710         .domain_alloc           = intel_iommu_domain_alloc,
4711         .probe_device           = intel_iommu_probe_device,
4712         .probe_finalize         = intel_iommu_probe_finalize,
4713         .release_device         = intel_iommu_release_device,
4714         .get_resv_regions       = intel_iommu_get_resv_regions,
4715         .device_group           = intel_iommu_device_group,
4716         .dev_enable_feat        = intel_iommu_dev_enable_feat,
4717         .dev_disable_feat       = intel_iommu_dev_disable_feat,
4718         .is_attach_deferred     = intel_iommu_is_attach_deferred,
4719         .def_domain_type        = device_def_domain_type,
4720         .pgsize_bitmap          = SZ_4K,
4721 #ifdef CONFIG_INTEL_IOMMU_SVM
4722         .sva_bind               = intel_svm_bind,
4723         .sva_unbind             = intel_svm_unbind,
4724         .sva_get_pasid          = intel_svm_get_pasid,
4725         .page_response          = intel_svm_page_response,
4726 #endif
4727         .default_domain_ops = &(const struct iommu_domain_ops) {
4728                 .attach_dev             = intel_iommu_attach_device,
4729                 .detach_dev             = intel_iommu_detach_device,
4730                 .map_pages              = intel_iommu_map_pages,
4731                 .unmap_pages            = intel_iommu_unmap_pages,
4732                 .iotlb_sync_map         = intel_iommu_iotlb_sync_map,
4733                 .flush_iotlb_all        = intel_flush_iotlb_all,
4734                 .iotlb_sync             = intel_iommu_tlb_sync,
4735                 .iova_to_phys           = intel_iommu_iova_to_phys,
4736                 .free                   = intel_iommu_domain_free,
4737                 .enforce_cache_coherency = intel_iommu_enforce_cache_coherency,
4738         }
4739 };
4740
4741 static void quirk_iommu_igfx(struct pci_dev *dev)
4742 {
4743         if (risky_device(dev))
4744                 return;
4745
4746         pci_info(dev, "Disabling IOMMU for graphics on this chipset\n");
4747         dmar_map_gfx = 0;
4748 }
4749
4750 /* G4x/GM45 integrated gfx dmar support is totally busted. */
4751 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_igfx);
4752 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_igfx);
4753 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_igfx);
4754 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_igfx);
4755 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_igfx);
4756 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_igfx);
4757 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_igfx);
4758
4759 /* Broadwell igfx malfunctions with dmar */
4760 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1606, quirk_iommu_igfx);
4761 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160B, quirk_iommu_igfx);
4762 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160E, quirk_iommu_igfx);
4763 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1602, quirk_iommu_igfx);
4764 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160A, quirk_iommu_igfx);
4765 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160D, quirk_iommu_igfx);
4766 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1616, quirk_iommu_igfx);
4767 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161B, quirk_iommu_igfx);
4768 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161E, quirk_iommu_igfx);
4769 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1612, quirk_iommu_igfx);
4770 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161A, quirk_iommu_igfx);
4771 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161D, quirk_iommu_igfx);
4772 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1626, quirk_iommu_igfx);
4773 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162B, quirk_iommu_igfx);
4774 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162E, quirk_iommu_igfx);
4775 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1622, quirk_iommu_igfx);
4776 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162A, quirk_iommu_igfx);
4777 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162D, quirk_iommu_igfx);
4778 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1636, quirk_iommu_igfx);
4779 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163B, quirk_iommu_igfx);
4780 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163E, quirk_iommu_igfx);
4781 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1632, quirk_iommu_igfx);
4782 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163A, quirk_iommu_igfx);
4783 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163D, quirk_iommu_igfx);
4784
4785 static void quirk_iommu_rwbf(struct pci_dev *dev)
4786 {
4787         if (risky_device(dev))
4788                 return;
4789
4790         /*
4791          * Mobile 4 Series Chipset neglects to set RWBF capability,
4792          * but needs it. Same seems to hold for the desktop versions.
4793          */
4794         pci_info(dev, "Forcing write-buffer flush capability\n");
4795         rwbf_quirk = 1;
4796 }
4797
4798 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf);
4799 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_rwbf);
4800 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_rwbf);
4801 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_rwbf);
4802 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_rwbf);
4803 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_rwbf);
4804 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_rwbf);
4805
4806 #define GGC 0x52
4807 #define GGC_MEMORY_SIZE_MASK    (0xf << 8)
4808 #define GGC_MEMORY_SIZE_NONE    (0x0 << 8)
4809 #define GGC_MEMORY_SIZE_1M      (0x1 << 8)
4810 #define GGC_MEMORY_SIZE_2M      (0x3 << 8)
4811 #define GGC_MEMORY_VT_ENABLED   (0x8 << 8)
4812 #define GGC_MEMORY_SIZE_2M_VT   (0x9 << 8)
4813 #define GGC_MEMORY_SIZE_3M_VT   (0xa << 8)
4814 #define GGC_MEMORY_SIZE_4M_VT   (0xb << 8)
4815
4816 static void quirk_calpella_no_shadow_gtt(struct pci_dev *dev)
4817 {
4818         unsigned short ggc;
4819
4820         if (risky_device(dev))
4821                 return;
4822
4823         if (pci_read_config_word(dev, GGC, &ggc))
4824                 return;
4825
4826         if (!(ggc & GGC_MEMORY_VT_ENABLED)) {
4827                 pci_info(dev, "BIOS has allocated no shadow GTT; disabling IOMMU for graphics\n");
4828                 dmar_map_gfx = 0;
4829         } else if (dmar_map_gfx) {
4830                 /* we have to ensure the gfx device is idle before we flush */
4831                 pci_info(dev, "Disabling batched IOTLB flush on Ironlake\n");
4832                 iommu_set_dma_strict();
4833         }
4834 }
4835 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0040, quirk_calpella_no_shadow_gtt);
4836 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0044, quirk_calpella_no_shadow_gtt);
4837 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0062, quirk_calpella_no_shadow_gtt);
4838 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x006a, quirk_calpella_no_shadow_gtt);
4839
4840 static void quirk_igfx_skip_te_disable(struct pci_dev *dev)
4841 {
4842         unsigned short ver;
4843
4844         if (!IS_GFX_DEVICE(dev))
4845                 return;
4846
4847         ver = (dev->device >> 8) & 0xff;
4848         if (ver != 0x45 && ver != 0x46 && ver != 0x4c &&
4849             ver != 0x4e && ver != 0x8a && ver != 0x98 &&
4850             ver != 0x9a && ver != 0xa7)
4851                 return;
4852
4853         if (risky_device(dev))
4854                 return;
4855
4856         pci_info(dev, "Skip IOMMU disabling for graphics\n");
4857         iommu_skip_te_disable = 1;
4858 }
4859 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_ANY_ID, quirk_igfx_skip_te_disable);
4860
4861 /* On Tylersburg chipsets, some BIOSes have been known to enable the
4862    ISOCH DMAR unit for the Azalia sound device, but not give it any
4863    TLB entries, which causes it to deadlock. Check for that.  We do
4864    this in a function called from init_dmars(), instead of in a PCI
4865    quirk, because we don't want to print the obnoxious "BIOS broken"
4866    message if VT-d is actually disabled.
4867 */
4868 static void __init check_tylersburg_isoch(void)
4869 {
4870         struct pci_dev *pdev;
4871         uint32_t vtisochctrl;
4872
4873         /* If there's no Azalia in the system anyway, forget it. */
4874         pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3a3e, NULL);
4875         if (!pdev)
4876                 return;
4877
4878         if (risky_device(pdev)) {
4879                 pci_dev_put(pdev);
4880                 return;
4881         }
4882
4883         pci_dev_put(pdev);
4884
4885         /* System Management Registers. Might be hidden, in which case
4886            we can't do the sanity check. But that's OK, because the
4887            known-broken BIOSes _don't_ actually hide it, so far. */
4888         pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x342e, NULL);
4889         if (!pdev)
4890                 return;
4891
4892         if (risky_device(pdev)) {
4893                 pci_dev_put(pdev);
4894                 return;
4895         }
4896
4897         if (pci_read_config_dword(pdev, 0x188, &vtisochctrl)) {
4898                 pci_dev_put(pdev);
4899                 return;
4900         }
4901
4902         pci_dev_put(pdev);
4903
4904         /* If Azalia DMA is routed to the non-isoch DMAR unit, fine. */
4905         if (vtisochctrl & 1)
4906                 return;
4907
4908         /* Drop all bits other than the number of TLB entries */
4909         vtisochctrl &= 0x1c;
4910
4911         /* If we have the recommended number of TLB entries (16), fine. */
4912         if (vtisochctrl == 0x10)
4913                 return;
4914
4915         /* Zero TLB entries? You get to ride the short bus to school. */
4916         if (!vtisochctrl) {
4917                 WARN(1, "Your BIOS is broken; DMA routed to ISOCH DMAR unit but no TLB space.\n"
4918                      "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
4919                      dmi_get_system_info(DMI_BIOS_VENDOR),
4920                      dmi_get_system_info(DMI_BIOS_VERSION),
4921                      dmi_get_system_info(DMI_PRODUCT_VERSION));
4922                 iommu_identity_mapping |= IDENTMAP_AZALIA;
4923                 return;
4924         }
4925
4926         pr_warn("Recommended TLB entries for ISOCH unit is 16; your BIOS set %d\n",
4927                vtisochctrl);
4928 }