Merge branches 'iommu/fixes', 'arm/tegra', 'arm/smmu', 'virtio', 'x86/vt-d', 'x86...
[linux-2.6-block.git] / drivers / iommu / intel / iommu.c
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * Copyright © 2006-2014 Intel Corporation.
4  *
5  * Authors: David Woodhouse <dwmw2@infradead.org>,
6  *          Ashok Raj <ashok.raj@intel.com>,
7  *          Shaohua Li <shaohua.li@intel.com>,
8  *          Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>,
9  *          Fenghua Yu <fenghua.yu@intel.com>
10  *          Joerg Roedel <jroedel@suse.de>
11  */
12
13 #define pr_fmt(fmt)     "DMAR: " fmt
14 #define dev_fmt(fmt)    pr_fmt(fmt)
15
16 #include <linux/crash_dump.h>
17 #include <linux/dma-direct.h>
18 #include <linux/dmi.h>
19 #include <linux/memory.h>
20 #include <linux/pci.h>
21 #include <linux/pci-ats.h>
22 #include <linux/spinlock.h>
23 #include <linux/syscore_ops.h>
24 #include <linux/tboot.h>
25 #include <uapi/linux/iommufd.h>
26
27 #include "iommu.h"
28 #include "../dma-iommu.h"
29 #include "../irq_remapping.h"
30 #include "../iommu-sva.h"
31 #include "pasid.h"
32 #include "cap_audit.h"
33 #include "perfmon.h"
34
/* The root table and each context table occupy one VT-d page. */
#define ROOT_SIZE		VTD_PAGE_SIZE
#define CONTEXT_SIZE		VTD_PAGE_SIZE

/*
 * PCI device classification helpers.  The argument is parenthesised in
 * every expansion so that any pointer-valued expression (e.g. "p + 1")
 * can be passed safely.
 */
#define IS_GFX_DEVICE(pdev) (((pdev)->class >> 16) == PCI_BASE_CLASS_DISPLAY)
#define IS_USB_DEVICE(pdev) (((pdev)->class >> 8) == PCI_CLASS_SERIAL_USB)
#define IS_ISA_DEVICE(pdev) (((pdev)->class >> 8) == PCI_CLASS_BRIDGE_ISA)
#define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e)

#define IOAPIC_RANGE_START	(0xfee00000)
#define IOAPIC_RANGE_END	(0xfeefffff)
#define IOVA_START_ADDR		(0x1000)

#define DEFAULT_DOMAIN_ADDRESS_WIDTH 57

#define MAX_AGAW_WIDTH 64
/* Widest PFN, in bits, that a MAX_AGAW_WIDTH-bit address can carry. */
#define MAX_AGAW_PFN_WIDTH	(MAX_AGAW_WIDTH - VTD_PAGE_SHIFT)
51
/* Largest page frame number / byte address expressible in @gaw address bits. */
#define __DOMAIN_MAX_PFN(gaw)	(((uint64_t)1 << ((gaw) - VTD_PAGE_SHIFT)) - 1)
#define __DOMAIN_MAX_ADDR(gaw)	(((uint64_t)1 << (gaw)) - 1)

/*
 * DOMAIN_MAX_PFN is clamped so that it always fits in an unsigned long, and
 * DOMAIN_MAX_ADDR is derived from the unclamped PFN.  PFNs can therefore be
 * carried in 'unsigned long' throughout.
 */
#define DOMAIN_MAX_PFN(gaw)						\
	((unsigned long)min_t(uint64_t, __DOMAIN_MAX_PFN(gaw), ~0UL))
#define DOMAIN_MAX_ADDR(gaw)						\
	((uint64_t)__DOMAIN_MAX_PFN(gaw) << VTD_PAGE_SHIFT)

/* First page frame number handed out for IO virtual addresses. */
#define IOVA_START_PFN		(1)

/* Convert an address to an MM-page frame number. */
#define IOVA_PFN(addr)		((addr) >> PAGE_SHIFT)

/* Page-table geometry: each level decodes LEVEL_STRIDE bits of the PFN. */
#define LEVEL_STRIDE		(9)
#define LEVEL_MASK		(((u64)1 << LEVEL_STRIDE) - 1)
69
/* Number of page-table levels for an AGAW value: always @agaw + 2. */
static inline int agaw_to_level(int agaw)
{
	int levels = agaw + 2;

	return levels;
}
74
75 static inline int agaw_to_width(int agaw)
76 {
77         return min_t(int, 30 + agaw * LEVEL_STRIDE, MAX_AGAW_WIDTH);
78 }
79
80 static inline int width_to_agaw(int width)
81 {
82         return DIV_ROUND_UP(width - 30, LEVEL_STRIDE);
83 }
84
85 static inline unsigned int level_to_offset_bits(int level)
86 {
87         return (level - 1) * LEVEL_STRIDE;
88 }
89
90 static inline int pfn_level_offset(u64 pfn, int level)
91 {
92         return (pfn >> level_to_offset_bits(level)) & LEVEL_MASK;
93 }
94
95 static inline u64 level_mask(int level)
96 {
97         return -1ULL << level_to_offset_bits(level);
98 }
99
100 static inline u64 level_size(int level)
101 {
102         return 1ULL << level_to_offset_bits(level);
103 }
104
105 static inline u64 align_to_level(u64 pfn, int level)
106 {
107         return (pfn + level_size(level) - 1) & level_mask(level);
108 }
109
110 static inline unsigned long lvl_to_nr_pages(unsigned int lvl)
111 {
112         return 1UL << min_t(int, (lvl - 1) * LEVEL_STRIDE, MAX_AGAW_PFN_WIDTH);
113 }
114
115 /* VT-d pages must always be _smaller_ than MM pages. Otherwise things
116    are never going to work. */
117 static inline unsigned long mm_to_dma_pfn_start(unsigned long mm_pfn)
118 {
119         return mm_pfn << (PAGE_SHIFT - VTD_PAGE_SHIFT);
120 }
121 static inline unsigned long mm_to_dma_pfn_end(unsigned long mm_pfn)
122 {
123         return ((mm_pfn + 1) << (PAGE_SHIFT - VTD_PAGE_SHIFT)) - 1;
124 }
/* Return the first VT-d PFN of the MM page described by @pg. */
static inline unsigned long page_to_dma_pfn(struct page *pg)
{
	unsigned long mm_pfn = page_to_pfn(pg);

	return mm_to_dma_pfn_start(mm_pfn);
}
/* Return the first VT-d PFN of the MM page that backs virtual address @p. */
static inline unsigned long virt_to_dma_pfn(void *p)
{
	struct page *pg = virt_to_page(p);

	return page_to_dma_pfn(pg);
}
133
134 static void __init check_tylersburg_isoch(void);
135 static int rwbf_quirk;
136
137 /*
138  * set to 1 to panic kernel if can't successfully enable VT-d
139  * (used when kernel is launched w/ TXT)
140  */
141 static int force_on = 0;
142 static int intel_iommu_tboot_noforce;
143 static int no_platform_optin;
144
145 #define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
146
147 /*
148  * Take a root_entry and return the Lower Context Table Pointer (LCTP)
149  * if marked present.
150  */
151 static phys_addr_t root_entry_lctp(struct root_entry *re)
152 {
153         if (!(re->lo & 1))
154                 return 0;
155
156         return re->lo & VTD_PAGE_MASK;
157 }
158
159 /*
160  * Take a root_entry and return the Upper Context Table Pointer (UCTP)
161  * if marked present.
162  */
163 static phys_addr_t root_entry_uctp(struct root_entry *re)
164 {
165         if (!(re->hi & 1))
166                 return 0;
167
168         return re->hi & VTD_PAGE_MASK;
169 }
170
171 static inline void context_set_present(struct context_entry *context)
172 {
173         context->lo |= 1;
174 }
175
176 static inline void context_set_fault_enable(struct context_entry *context)
177 {
178         context->lo &= (((u64)-1) << 2) | 1;
179 }
180
181 static inline void context_set_translation_type(struct context_entry *context,
182                                                 unsigned long value)
183 {
184         context->lo &= (((u64)-1) << 4) | 3;
185         context->lo |= (value & 3) << 2;
186 }
187
188 static inline void context_set_address_root(struct context_entry *context,
189                                             unsigned long value)
190 {
191         context->lo &= ~VTD_PAGE_MASK;
192         context->lo |= value & VTD_PAGE_MASK;
193 }
194
195 static inline void context_set_address_width(struct context_entry *context,
196                                              unsigned long value)
197 {
198         context->hi |= value & 7;
199 }
200
201 static inline void context_set_domain_id(struct context_entry *context,
202                                          unsigned long value)
203 {
204         context->hi |= (value & ((1 << 16) - 1)) << 8;
205 }
206
207 static inline void context_set_pasid(struct context_entry *context)
208 {
209         context->lo |= CONTEXT_PASIDE;
210 }
211
212 static inline int context_domain_id(struct context_entry *c)
213 {
214         return((c->hi >> 8) & 0xffff);
215 }
216
217 static inline void context_clear_entry(struct context_entry *context)
218 {
219         context->lo = 0;
220         context->hi = 0;
221 }
222
223 static inline bool context_copied(struct intel_iommu *iommu, u8 bus, u8 devfn)
224 {
225         if (!iommu->copied_tables)
226                 return false;
227
228         return test_bit(((long)bus << 8) | devfn, iommu->copied_tables);
229 }
230
231 static inline void
232 set_context_copied(struct intel_iommu *iommu, u8 bus, u8 devfn)
233 {
234         set_bit(((long)bus << 8) | devfn, iommu->copied_tables);
235 }
236
237 static inline void
238 clear_context_copied(struct intel_iommu *iommu, u8 bus, u8 devfn)
239 {
240         clear_bit(((long)bus << 8) | devfn, iommu->copied_tables);
241 }
242
/*
 * This domain is a static identity-mapping domain.
 *	1. This domain creates a static 1:1 mapping to all usable memory.
 *	2. It maps to each iommu if successful.
 *	3. Each iommu maps to this domain if successful.
 */
static struct dmar_domain *si_domain;

static int hw_pass_through = 1;
251
252 struct dmar_rmrr_unit {
253         struct list_head list;          /* list of rmrr units   */
254         struct acpi_dmar_header *hdr;   /* ACPI header          */
255         u64     base_address;           /* reserved base address*/
256         u64     end_address;            /* reserved end address */
257         struct dmar_dev_scope *devices; /* target devices */
258         int     devices_cnt;            /* target device count */
259 };
260
261 struct dmar_atsr_unit {
262         struct list_head list;          /* list of ATSR units */
263         struct acpi_dmar_header *hdr;   /* ACPI header */
264         struct dmar_dev_scope *devices; /* target devices */
265         int devices_cnt;                /* target device count */
266         u8 include_all:1;               /* include all ports */
267 };
268
269 struct dmar_satc_unit {
270         struct list_head list;          /* list of SATC units */
271         struct acpi_dmar_header *hdr;   /* ACPI header */
272         struct dmar_dev_scope *devices; /* target devices */
273         struct intel_iommu *iommu;      /* the corresponding iommu */
274         int devices_cnt;                /* target device count */
275         u8 atc_required:1;              /* ATS is required */
276 };
277
278 static LIST_HEAD(dmar_atsr_units);
279 static LIST_HEAD(dmar_rmrr_units);
280 static LIST_HEAD(dmar_satc_units);
281
282 #define for_each_rmrr_units(rmrr) \
283         list_for_each_entry(rmrr, &dmar_rmrr_units, list)
284
285 static void device_block_translation(struct device *dev);
286 static void intel_iommu_domain_free(struct iommu_domain *domain);
287
288 int dmar_disabled = !IS_ENABLED(CONFIG_INTEL_IOMMU_DEFAULT_ON);
289 int intel_iommu_sm = IS_ENABLED(CONFIG_INTEL_IOMMU_SCALABLE_MODE_DEFAULT_ON);
290
291 int intel_iommu_enabled = 0;
292 EXPORT_SYMBOL_GPL(intel_iommu_enabled);
293
294 static int dmar_map_gfx = 1;
295 static int intel_iommu_superpage = 1;
296 static int iommu_identity_mapping;
297 static int iommu_skip_te_disable;
298
299 #define IDENTMAP_GFX            2
300 #define IDENTMAP_AZALIA         4
301
302 const struct iommu_ops intel_iommu_ops;
303
304 static bool translation_pre_enabled(struct intel_iommu *iommu)
305 {
306         return (iommu->flags & VTD_FLAG_TRANS_PRE_ENABLED);
307 }
308
309 static void clear_translation_pre_enabled(struct intel_iommu *iommu)
310 {
311         iommu->flags &= ~VTD_FLAG_TRANS_PRE_ENABLED;
312 }
313
314 static void init_translation_status(struct intel_iommu *iommu)
315 {
316         u32 gsts;
317
318         gsts = readl(iommu->reg + DMAR_GSTS_REG);
319         if (gsts & DMA_GSTS_TES)
320                 iommu->flags |= VTD_FLAG_TRANS_PRE_ENABLED;
321 }
322
323 static int __init intel_iommu_setup(char *str)
324 {
325         if (!str)
326                 return -EINVAL;
327
328         while (*str) {
329                 if (!strncmp(str, "on", 2)) {
330                         dmar_disabled = 0;
331                         pr_info("IOMMU enabled\n");
332                 } else if (!strncmp(str, "off", 3)) {
333                         dmar_disabled = 1;
334                         no_platform_optin = 1;
335                         pr_info("IOMMU disabled\n");
336                 } else if (!strncmp(str, "igfx_off", 8)) {
337                         dmar_map_gfx = 0;
338                         pr_info("Disable GFX device mapping\n");
339                 } else if (!strncmp(str, "forcedac", 8)) {
340                         pr_warn("intel_iommu=forcedac deprecated; use iommu.forcedac instead\n");
341                         iommu_dma_forcedac = true;
342                 } else if (!strncmp(str, "strict", 6)) {
343                         pr_warn("intel_iommu=strict deprecated; use iommu.strict=1 instead\n");
344                         iommu_set_dma_strict();
345                 } else if (!strncmp(str, "sp_off", 6)) {
346                         pr_info("Disable supported super page\n");
347                         intel_iommu_superpage = 0;
348                 } else if (!strncmp(str, "sm_on", 5)) {
349                         pr_info("Enable scalable mode if hardware supports\n");
350                         intel_iommu_sm = 1;
351                 } else if (!strncmp(str, "sm_off", 6)) {
352                         pr_info("Scalable mode is disallowed\n");
353                         intel_iommu_sm = 0;
354                 } else if (!strncmp(str, "tboot_noforce", 13)) {
355                         pr_info("Intel-IOMMU: not forcing on after tboot. This could expose security risk for tboot\n");
356                         intel_iommu_tboot_noforce = 1;
357                 } else {
358                         pr_notice("Unknown option - '%s'\n", str);
359                 }
360
361                 str += strcspn(str, ",");
362                 while (*str == ',')
363                         str++;
364         }
365
366         return 1;
367 }
368 __setup("intel_iommu=", intel_iommu_setup);
369
370 void *alloc_pgtable_page(int node, gfp_t gfp)
371 {
372         struct page *page;
373         void *vaddr = NULL;
374
375         page = alloc_pages_node(node, gfp | __GFP_ZERO, 0);
376         if (page)
377                 vaddr = page_address(page);
378         return vaddr;
379 }
380
/* Release the page at virtual address @vaddr via free_page(). */
void free_pgtable_page(void *vaddr)
{
	unsigned long addr = (unsigned long)vaddr;

	free_page(addr);
}
385
386 static inline int domain_type_is_si(struct dmar_domain *domain)
387 {
388         return domain->domain.type == IOMMU_DOMAIN_IDENTITY;
389 }
390
391 static inline int domain_pfn_supported(struct dmar_domain *domain,
392                                        unsigned long pfn)
393 {
394         int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
395
396         return !(addr_width < BITS_PER_LONG && pfn >> addr_width);
397 }
398
399 /*
400  * Calculate the Supported Adjusted Guest Address Widths of an IOMMU.
401  * Refer to 11.4.2 of the VT-d spec for the encoding of each bit of
402  * the returned SAGAW.
403  */
404 static unsigned long __iommu_calculate_sagaw(struct intel_iommu *iommu)
405 {
406         unsigned long fl_sagaw, sl_sagaw;
407
408         fl_sagaw = BIT(2) | (cap_fl5lp_support(iommu->cap) ? BIT(3) : 0);
409         sl_sagaw = cap_sagaw(iommu->cap);
410
411         /* Second level only. */
412         if (!sm_supported(iommu) || !ecap_flts(iommu->ecap))
413                 return sl_sagaw;
414
415         /* First level only. */
416         if (!ecap_slts(iommu->ecap))
417                 return fl_sagaw;
418
419         return fl_sagaw & sl_sagaw;
420 }
421
/*
 * Return the largest AGAW value not exceeding width_to_agaw(@max_gaw)
 * whose bit is set in the IOMMU's SAGAW bitmap, or -1 when none is.
 */
static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw)
{
	unsigned long supported = __iommu_calculate_sagaw(iommu);
	int agaw = width_to_agaw(max_gaw);

	while (agaw >= 0 && !test_bit(agaw, &supported))
		agaw--;

	return agaw;
}
435
436 /*
437  * Calculate max SAGAW for each iommu.
438  */
439 int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
440 {
441         return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH);
442 }
443
444 /*
445  * calculate agaw for each iommu.
446  * "SAGAW" may be different across iommus, use a default agaw, and
447  * get a supported less agaw for iommus that don't support the default agaw.
448  */
449 int iommu_calculate_agaw(struct intel_iommu *iommu)
450 {
451         return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH);
452 }
453
454 static inline bool iommu_paging_structure_coherency(struct intel_iommu *iommu)
455 {
456         return sm_supported(iommu) ?
457                         ecap_smpwc(iommu->ecap) : ecap_coherent(iommu->ecap);
458 }
459
460 static void domain_update_iommu_coherency(struct dmar_domain *domain)
461 {
462         struct iommu_domain_info *info;
463         struct dmar_drhd_unit *drhd;
464         struct intel_iommu *iommu;
465         bool found = false;
466         unsigned long i;
467
468         domain->iommu_coherency = true;
469         xa_for_each(&domain->iommu_array, i, info) {
470                 found = true;
471                 if (!iommu_paging_structure_coherency(info->iommu)) {
472                         domain->iommu_coherency = false;
473                         break;
474                 }
475         }
476         if (found)
477                 return;
478
479         /* No hardware attached; use lowest common denominator */
480         rcu_read_lock();
481         for_each_active_iommu(iommu, drhd) {
482                 if (!iommu_paging_structure_coherency(iommu)) {
483                         domain->iommu_coherency = false;
484                         break;
485                 }
486         }
487         rcu_read_unlock();
488 }
489
490 static int domain_update_iommu_superpage(struct dmar_domain *domain,
491                                          struct intel_iommu *skip)
492 {
493         struct dmar_drhd_unit *drhd;
494         struct intel_iommu *iommu;
495         int mask = 0x3;
496
497         if (!intel_iommu_superpage)
498                 return 0;
499
500         /* set iommu_superpage to the smallest common denominator */
501         rcu_read_lock();
502         for_each_active_iommu(iommu, drhd) {
503                 if (iommu != skip) {
504                         if (domain && domain->use_first_level) {
505                                 if (!cap_fl1gp_support(iommu->cap))
506                                         mask = 0x1;
507                         } else {
508                                 mask &= cap_super_page_val(iommu->cap);
509                         }
510
511                         if (!mask)
512                                 break;
513                 }
514         }
515         rcu_read_unlock();
516
517         return fls(mask);
518 }
519
520 static int domain_update_device_node(struct dmar_domain *domain)
521 {
522         struct device_domain_info *info;
523         int nid = NUMA_NO_NODE;
524         unsigned long flags;
525
526         spin_lock_irqsave(&domain->lock, flags);
527         list_for_each_entry(info, &domain->devices, link) {
528                 /*
529                  * There could possibly be multiple device numa nodes as devices
530                  * within the same domain may sit behind different IOMMUs. There
531                  * isn't perfect answer in such situation, so we select first
532                  * come first served policy.
533                  */
534                 nid = dev_to_node(info->dev);
535                 if (nid != NUMA_NO_NODE)
536                         break;
537         }
538         spin_unlock_irqrestore(&domain->lock, flags);
539
540         return nid;
541 }
542
543 static void domain_update_iotlb(struct dmar_domain *domain);
544
545 /* Return the super pagesize bitmap if supported. */
546 static unsigned long domain_super_pgsize_bitmap(struct dmar_domain *domain)
547 {
548         unsigned long bitmap = 0;
549
550         /*
551          * 1-level super page supports page size of 2MiB, 2-level super page
552          * supports page size of both 2MiB and 1GiB.
553          */
554         if (domain->iommu_superpage == 1)
555                 bitmap |= SZ_2M;
556         else if (domain->iommu_superpage == 2)
557                 bitmap |= SZ_2M | SZ_1G;
558
559         return bitmap;
560 }
561
562 /* Some capabilities may be different across iommus */
563 static void domain_update_iommu_cap(struct dmar_domain *domain)
564 {
565         domain_update_iommu_coherency(domain);
566         domain->iommu_superpage = domain_update_iommu_superpage(domain, NULL);
567
568         /*
569          * If RHSA is missing, we should default to the device numa domain
570          * as fall back.
571          */
572         if (domain->nid == NUMA_NO_NODE)
573                 domain->nid = domain_update_device_node(domain);
574
575         /*
576          * First-level translation restricts the input-address to a
577          * canonical address (i.e., address bits 63:N have the same
578          * value as address bit [N-1], where N is 48-bits with 4-level
579          * paging and 57-bits with 5-level paging). Hence, skip bit
580          * [N-1].
581          */
582         if (domain->use_first_level)
583                 domain->domain.geometry.aperture_end = __DOMAIN_MAX_ADDR(domain->gaw - 1);
584         else
585                 domain->domain.geometry.aperture_end = __DOMAIN_MAX_ADDR(domain->gaw);
586
587         domain->domain.pgsize_bitmap |= domain_super_pgsize_bitmap(domain);
588         domain_update_iotlb(domain);
589 }
590
591 struct context_entry *iommu_context_addr(struct intel_iommu *iommu, u8 bus,
592                                          u8 devfn, int alloc)
593 {
594         struct root_entry *root = &iommu->root_entry[bus];
595         struct context_entry *context;
596         u64 *entry;
597
598         /*
599          * Except that the caller requested to allocate a new entry,
600          * returning a copied context entry makes no sense.
601          */
602         if (!alloc && context_copied(iommu, bus, devfn))
603                 return NULL;
604
605         entry = &root->lo;
606         if (sm_supported(iommu)) {
607                 if (devfn >= 0x80) {
608                         devfn -= 0x80;
609                         entry = &root->hi;
610                 }
611                 devfn *= 2;
612         }
613         if (*entry & 1)
614                 context = phys_to_virt(*entry & VTD_PAGE_MASK);
615         else {
616                 unsigned long phy_addr;
617                 if (!alloc)
618                         return NULL;
619
620                 context = alloc_pgtable_page(iommu->node, GFP_ATOMIC);
621                 if (!context)
622                         return NULL;
623
624                 __iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
625                 phy_addr = virt_to_phys((void *)context);
626                 *entry = phy_addr | 1;
627                 __iommu_flush_cache(iommu, entry, sizeof(*entry));
628         }
629         return &context[devfn];
630 }
631
632 /**
633  * is_downstream_to_pci_bridge - test if a device belongs to the PCI
634  *                               sub-hierarchy of a candidate PCI-PCI bridge
635  * @dev: candidate PCI device belonging to @bridge PCI sub-hierarchy
636  * @bridge: the candidate PCI-PCI bridge
637  *
638  * Return: true if @dev belongs to @bridge PCI sub-hierarchy, else false.
639  */
640 static bool
641 is_downstream_to_pci_bridge(struct device *dev, struct device *bridge)
642 {
643         struct pci_dev *pdev, *pbridge;
644
645         if (!dev_is_pci(dev) || !dev_is_pci(bridge))
646                 return false;
647
648         pdev = to_pci_dev(dev);
649         pbridge = to_pci_dev(bridge);
650
651         if (pbridge->subordinate &&
652             pbridge->subordinate->number <= pdev->bus->number &&
653             pbridge->subordinate->busn_res.end >= pdev->bus->number)
654                 return true;
655
656         return false;
657 }
658
659 static bool quirk_ioat_snb_local_iommu(struct pci_dev *pdev)
660 {
661         struct dmar_drhd_unit *drhd;
662         u32 vtbar;
663         int rc;
664
665         /* We know that this device on this chipset has its own IOMMU.
666          * If we find it under a different IOMMU, then the BIOS is lying
667          * to us. Hope that the IOMMU for this device is actually
668          * disabled, and it needs no translation...
669          */
670         rc = pci_bus_read_config_dword(pdev->bus, PCI_DEVFN(0, 0), 0xb0, &vtbar);
671         if (rc) {
672                 /* "can't" happen */
673                 dev_info(&pdev->dev, "failed to run vt-d quirk\n");
674                 return false;
675         }
676         vtbar &= 0xffff0000;
677
678         /* we know that the this iommu should be at offset 0xa000 from vtbar */
679         drhd = dmar_find_matched_drhd_unit(pdev);
680         if (!drhd || drhd->reg_base_addr - vtbar != 0xa000) {
681                 pr_warn_once(FW_BUG "BIOS assigned incorrect VT-d unit for Intel(R) QuickData Technology device\n");
682                 add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
683                 return true;
684         }
685
686         return false;
687 }
688
689 static bool iommu_is_dummy(struct intel_iommu *iommu, struct device *dev)
690 {
691         if (!iommu || iommu->drhd->ignored)
692                 return true;
693
694         if (dev_is_pci(dev)) {
695                 struct pci_dev *pdev = to_pci_dev(dev);
696
697                 if (pdev->vendor == PCI_VENDOR_ID_INTEL &&
698                     pdev->device == PCI_DEVICE_ID_INTEL_IOAT_SNB &&
699                     quirk_ioat_snb_local_iommu(pdev))
700                         return true;
701         }
702
703         return false;
704 }
705
706 struct intel_iommu *device_to_iommu(struct device *dev, u8 *bus, u8 *devfn)
707 {
708         struct dmar_drhd_unit *drhd = NULL;
709         struct pci_dev *pdev = NULL;
710         struct intel_iommu *iommu;
711         struct device *tmp;
712         u16 segment = 0;
713         int i;
714
715         if (!dev)
716                 return NULL;
717
718         if (dev_is_pci(dev)) {
719                 struct pci_dev *pf_pdev;
720
721                 pdev = pci_real_dma_dev(to_pci_dev(dev));
722
723                 /* VFs aren't listed in scope tables; we need to look up
724                  * the PF instead to find the IOMMU. */
725                 pf_pdev = pci_physfn(pdev);
726                 dev = &pf_pdev->dev;
727                 segment = pci_domain_nr(pdev->bus);
728         } else if (has_acpi_companion(dev))
729                 dev = &ACPI_COMPANION(dev)->dev;
730
731         rcu_read_lock();
732         for_each_iommu(iommu, drhd) {
733                 if (pdev && segment != drhd->segment)
734                         continue;
735
736                 for_each_active_dev_scope(drhd->devices,
737                                           drhd->devices_cnt, i, tmp) {
738                         if (tmp == dev) {
739                                 /* For a VF use its original BDF# not that of the PF
740                                  * which we used for the IOMMU lookup. Strictly speaking
741                                  * we could do this for all PCI devices; we only need to
742                                  * get the BDF# from the scope table for ACPI matches. */
743                                 if (pdev && pdev->is_virtfn)
744                                         goto got_pdev;
745
746                                 if (bus && devfn) {
747                                         *bus = drhd->devices[i].bus;
748                                         *devfn = drhd->devices[i].devfn;
749                                 }
750                                 goto out;
751                         }
752
753                         if (is_downstream_to_pci_bridge(dev, tmp))
754                                 goto got_pdev;
755                 }
756
757                 if (pdev && drhd->include_all) {
758 got_pdev:
759                         if (bus && devfn) {
760                                 *bus = pdev->bus->number;
761                                 *devfn = pdev->devfn;
762                         }
763                         goto out;
764                 }
765         }
766         iommu = NULL;
767 out:
768         if (iommu_is_dummy(iommu, dev))
769                 iommu = NULL;
770
771         rcu_read_unlock();
772
773         return iommu;
774 }
775
776 static void domain_flush_cache(struct dmar_domain *domain,
777                                void *addr, int size)
778 {
779         if (!domain->iommu_coherency)
780                 clflush_cache_range(addr, size);
781 }
782
783 static void free_context_table(struct intel_iommu *iommu)
784 {
785         struct context_entry *context;
786         int i;
787
788         if (!iommu->root_entry)
789                 return;
790
791         for (i = 0; i < ROOT_ENTRY_NR; i++) {
792                 context = iommu_context_addr(iommu, i, 0, 0);
793                 if (context)
794                         free_pgtable_page(context);
795
796                 if (!sm_supported(iommu))
797                         continue;
798
799                 context = iommu_context_addr(iommu, i, 0x80, 0);
800                 if (context)
801                         free_pgtable_page(context);
802         }
803
804         free_pgtable_page(iommu->root_entry);
805         iommu->root_entry = NULL;
806 }
807
808 #ifdef CONFIG_DMAR_DEBUG
809 static void pgtable_walk(struct intel_iommu *iommu, unsigned long pfn,
810                          u8 bus, u8 devfn, struct dma_pte *parent, int level)
811 {
812         struct dma_pte *pte;
813         int offset;
814
815         while (1) {
816                 offset = pfn_level_offset(pfn, level);
817                 pte = &parent[offset];
818                 if (!pte || (dma_pte_superpage(pte) || !dma_pte_present(pte))) {
819                         pr_info("PTE not present at level %d\n", level);
820                         break;
821                 }
822
823                 pr_info("pte level: %d, pte value: 0x%016llx\n", level, pte->val);
824
825                 if (level == 1)
826                         break;
827
828                 parent = phys_to_virt(dma_pte_addr(pte));
829                 level--;
830         }
831 }
832
833 void dmar_fault_dump_ptes(struct intel_iommu *iommu, u16 source_id,
834                           unsigned long long addr, u32 pasid)
835 {
836         struct pasid_dir_entry *dir, *pde;
837         struct pasid_entry *entries, *pte;
838         struct context_entry *ctx_entry;
839         struct root_entry *rt_entry;
840         int i, dir_index, index, level;
841         u8 devfn = source_id & 0xff;
842         u8 bus = source_id >> 8;
843         struct dma_pte *pgtable;
844
845         pr_info("Dump %s table entries for IOVA 0x%llx\n", iommu->name, addr);
846
847         /* root entry dump */
848         rt_entry = &iommu->root_entry[bus];
849         if (!rt_entry) {
850                 pr_info("root table entry is not present\n");
851                 return;
852         }
853
854         if (sm_supported(iommu))
855                 pr_info("scalable mode root entry: hi 0x%016llx, low 0x%016llx\n",
856                         rt_entry->hi, rt_entry->lo);
857         else
858                 pr_info("root entry: 0x%016llx", rt_entry->lo);
859
860         /* context entry dump */
861         ctx_entry = iommu_context_addr(iommu, bus, devfn, 0);
862         if (!ctx_entry) {
863                 pr_info("context table entry is not present\n");
864                 return;
865         }
866
867         pr_info("context entry: hi 0x%016llx, low 0x%016llx\n",
868                 ctx_entry->hi, ctx_entry->lo);
869
870         /* legacy mode does not require PASID entries */
871         if (!sm_supported(iommu)) {
872                 level = agaw_to_level(ctx_entry->hi & 7);
873                 pgtable = phys_to_virt(ctx_entry->lo & VTD_PAGE_MASK);
874                 goto pgtable_walk;
875         }
876
877         /* get the pointer to pasid directory entry */
878         dir = phys_to_virt(ctx_entry->lo & VTD_PAGE_MASK);
879         if (!dir) {
880                 pr_info("pasid directory entry is not present\n");
881                 return;
882         }
883         /* For request-without-pasid, get the pasid from context entry */
884         if (intel_iommu_sm && pasid == IOMMU_PASID_INVALID)
885                 pasid = IOMMU_NO_PASID;
886
887         dir_index = pasid >> PASID_PDE_SHIFT;
888         pde = &dir[dir_index];
889         pr_info("pasid dir entry: 0x%016llx\n", pde->val);
890
891         /* get the pointer to the pasid table entry */
892         entries = get_pasid_table_from_pde(pde);
893         if (!entries) {
894                 pr_info("pasid table entry is not present\n");
895                 return;
896         }
897         index = pasid & PASID_PTE_MASK;
898         pte = &entries[index];
899         for (i = 0; i < ARRAY_SIZE(pte->val); i++)
900                 pr_info("pasid table entry[%d]: 0x%016llx\n", i, pte->val[i]);
901
902         if (pasid_pte_get_pgtt(pte) == PASID_ENTRY_PGTT_FL_ONLY) {
903                 level = pte->val[2] & BIT_ULL(2) ? 5 : 4;
904                 pgtable = phys_to_virt(pte->val[2] & VTD_PAGE_MASK);
905         } else {
906                 level = agaw_to_level((pte->val[0] >> 2) & 0x7);
907                 pgtable = phys_to_virt(pte->val[0] & VTD_PAGE_MASK);
908         }
909
910 pgtable_walk:
911         pgtable_walk(iommu, addr >> VTD_PAGE_SHIFT, bus, devfn, pgtable, level);
912 }
913 #endif
914
915 static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
916                                       unsigned long pfn, int *target_level,
917                                       gfp_t gfp)
918 {
919         struct dma_pte *parent, *pte;
920         int level = agaw_to_level(domain->agaw);
921         int offset;
922
923         if (!domain_pfn_supported(domain, pfn))
924                 /* Address beyond IOMMU's addressing capabilities. */
925                 return NULL;
926
927         parent = domain->pgd;
928
929         while (1) {
930                 void *tmp_page;
931
932                 offset = pfn_level_offset(pfn, level);
933                 pte = &parent[offset];
934                 if (!*target_level && (dma_pte_superpage(pte) || !dma_pte_present(pte)))
935                         break;
936                 if (level == *target_level)
937                         break;
938
939                 if (!dma_pte_present(pte)) {
940                         uint64_t pteval;
941
942                         tmp_page = alloc_pgtable_page(domain->nid, gfp);
943
944                         if (!tmp_page)
945                                 return NULL;
946
947                         domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE);
948                         pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE;
949                         if (domain->use_first_level)
950                                 pteval |= DMA_FL_PTE_XD | DMA_FL_PTE_US | DMA_FL_PTE_ACCESS;
951
952                         if (cmpxchg64(&pte->val, 0ULL, pteval))
953                                 /* Someone else set it while we were thinking; use theirs. */
954                                 free_pgtable_page(tmp_page);
955                         else
956                                 domain_flush_cache(domain, pte, sizeof(*pte));
957                 }
958                 if (level == 1)
959                         break;
960
961                 parent = phys_to_virt(dma_pte_addr(pte));
962                 level--;
963         }
964
965         if (!*target_level)
966                 *target_level = level;
967
968         return pte;
969 }
970
971 /* return address's pte at specific level */
972 static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain,
973                                          unsigned long pfn,
974                                          int level, int *large_page)
975 {
976         struct dma_pte *parent, *pte;
977         int total = agaw_to_level(domain->agaw);
978         int offset;
979
980         parent = domain->pgd;
981         while (level <= total) {
982                 offset = pfn_level_offset(pfn, total);
983                 pte = &parent[offset];
984                 if (level == total)
985                         return pte;
986
987                 if (!dma_pte_present(pte)) {
988                         *large_page = total;
989                         break;
990                 }
991
992                 if (dma_pte_superpage(pte)) {
993                         *large_page = total;
994                         return pte;
995                 }
996
997                 parent = phys_to_virt(dma_pte_addr(pte));
998                 total--;
999         }
1000         return NULL;
1001 }
1002
1003 /* clear last level pte, a tlb flush should be followed */
1004 static void dma_pte_clear_range(struct dmar_domain *domain,
1005                                 unsigned long start_pfn,
1006                                 unsigned long last_pfn)
1007 {
1008         unsigned int large_page;
1009         struct dma_pte *first_pte, *pte;
1010
1011         if (WARN_ON(!domain_pfn_supported(domain, last_pfn)) ||
1012             WARN_ON(start_pfn > last_pfn))
1013                 return;
1014
1015         /* we don't need lock here; nobody else touches the iova range */
1016         do {
1017                 large_page = 1;
1018                 first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1, &large_page);
1019                 if (!pte) {
1020                         start_pfn = align_to_level(start_pfn + 1, large_page + 1);
1021                         continue;
1022                 }
1023                 do {
1024                         dma_clear_pte(pte);
1025                         start_pfn += lvl_to_nr_pages(large_page);
1026                         pte++;
1027                 } while (start_pfn <= last_pfn && !first_pte_in_page(pte));
1028
1029                 domain_flush_cache(domain, first_pte,
1030                                    (void *)pte - (void *)first_pte);
1031
1032         } while (start_pfn && start_pfn <= last_pfn);
1033 }
1034
1035 static void dma_pte_free_level(struct dmar_domain *domain, int level,
1036                                int retain_level, struct dma_pte *pte,
1037                                unsigned long pfn, unsigned long start_pfn,
1038                                unsigned long last_pfn)
1039 {
1040         pfn = max(start_pfn, pfn);
1041         pte = &pte[pfn_level_offset(pfn, level)];
1042
1043         do {
1044                 unsigned long level_pfn;
1045                 struct dma_pte *level_pte;
1046
1047                 if (!dma_pte_present(pte) || dma_pte_superpage(pte))
1048                         goto next;
1049
1050                 level_pfn = pfn & level_mask(level);
1051                 level_pte = phys_to_virt(dma_pte_addr(pte));
1052
1053                 if (level > 2) {
1054                         dma_pte_free_level(domain, level - 1, retain_level,
1055                                            level_pte, level_pfn, start_pfn,
1056                                            last_pfn);
1057                 }
1058
1059                 /*
1060                  * Free the page table if we're below the level we want to
1061                  * retain and the range covers the entire table.
1062                  */
1063                 if (level < retain_level && !(start_pfn > level_pfn ||
1064                       last_pfn < level_pfn + level_size(level) - 1)) {
1065                         dma_clear_pte(pte);
1066                         domain_flush_cache(domain, pte, sizeof(*pte));
1067                         free_pgtable_page(level_pte);
1068                 }
1069 next:
1070                 pfn += level_size(level);
1071         } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1072 }
1073
1074 /*
1075  * clear last level (leaf) ptes and free page table pages below the
1076  * level we wish to keep intact.
1077  */
1078 static void dma_pte_free_pagetable(struct dmar_domain *domain,
1079                                    unsigned long start_pfn,
1080                                    unsigned long last_pfn,
1081                                    int retain_level)
1082 {
1083         dma_pte_clear_range(domain, start_pfn, last_pfn);
1084
1085         /* We don't need lock here; nobody else touches the iova range */
1086         dma_pte_free_level(domain, agaw_to_level(domain->agaw), retain_level,
1087                            domain->pgd, 0, start_pfn, last_pfn);
1088
1089         /* free pgd */
1090         if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1091                 free_pgtable_page(domain->pgd);
1092                 domain->pgd = NULL;
1093         }
1094 }
1095
1096 /* When a page at a given level is being unlinked from its parent, we don't
1097    need to *modify* it at all. All we need to do is make a list of all the
1098    pages which can be freed just as soon as we've flushed the IOTLB and we
1099    know the hardware page-walk will no longer touch them.
1100    The 'pte' argument is the *parent* PTE, pointing to the page that is to
1101    be freed. */
1102 static void dma_pte_list_pagetables(struct dmar_domain *domain,
1103                                     int level, struct dma_pte *pte,
1104                                     struct list_head *freelist)
1105 {
1106         struct page *pg;
1107
1108         pg = pfn_to_page(dma_pte_addr(pte) >> PAGE_SHIFT);
1109         list_add_tail(&pg->lru, freelist);
1110
1111         if (level == 1)
1112                 return;
1113
1114         pte = page_address(pg);
1115         do {
1116                 if (dma_pte_present(pte) && !dma_pte_superpage(pte))
1117                         dma_pte_list_pagetables(domain, level - 1, pte, freelist);
1118                 pte++;
1119         } while (!first_pte_in_page(pte));
1120 }
1121
1122 static void dma_pte_clear_level(struct dmar_domain *domain, int level,
1123                                 struct dma_pte *pte, unsigned long pfn,
1124                                 unsigned long start_pfn, unsigned long last_pfn,
1125                                 struct list_head *freelist)
1126 {
1127         struct dma_pte *first_pte = NULL, *last_pte = NULL;
1128
1129         pfn = max(start_pfn, pfn);
1130         pte = &pte[pfn_level_offset(pfn, level)];
1131
1132         do {
1133                 unsigned long level_pfn = pfn & level_mask(level);
1134
1135                 if (!dma_pte_present(pte))
1136                         goto next;
1137
1138                 /* If range covers entire pagetable, free it */
1139                 if (start_pfn <= level_pfn &&
1140                     last_pfn >= level_pfn + level_size(level) - 1) {
1141                         /* These suborbinate page tables are going away entirely. Don't
1142                            bother to clear them; we're just going to *free* them. */
1143                         if (level > 1 && !dma_pte_superpage(pte))
1144                                 dma_pte_list_pagetables(domain, level - 1, pte, freelist);
1145
1146                         dma_clear_pte(pte);
1147                         if (!first_pte)
1148                                 first_pte = pte;
1149                         last_pte = pte;
1150                 } else if (level > 1) {
1151                         /* Recurse down into a level that isn't *entirely* obsolete */
1152                         dma_pte_clear_level(domain, level - 1,
1153                                             phys_to_virt(dma_pte_addr(pte)),
1154                                             level_pfn, start_pfn, last_pfn,
1155                                             freelist);
1156                 }
1157 next:
1158                 pfn = level_pfn + level_size(level);
1159         } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1160
1161         if (first_pte)
1162                 domain_flush_cache(domain, first_pte,
1163                                    (void *)++last_pte - (void *)first_pte);
1164 }
1165
1166 /* We can't just free the pages because the IOMMU may still be walking
1167    the page tables, and may have cached the intermediate levels. The
1168    pages can only be freed after the IOTLB flush has been done. */
1169 static void domain_unmap(struct dmar_domain *domain, unsigned long start_pfn,
1170                          unsigned long last_pfn, struct list_head *freelist)
1171 {
1172         if (WARN_ON(!domain_pfn_supported(domain, last_pfn)) ||
1173             WARN_ON(start_pfn > last_pfn))
1174                 return;
1175
1176         /* we don't need lock here; nobody else touches the iova range */
1177         dma_pte_clear_level(domain, agaw_to_level(domain->agaw),
1178                             domain->pgd, 0, start_pfn, last_pfn, freelist);
1179
1180         /* free pgd */
1181         if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1182                 struct page *pgd_page = virt_to_page(domain->pgd);
1183                 list_add_tail(&pgd_page->lru, freelist);
1184                 domain->pgd = NULL;
1185         }
1186 }
1187
1188 /* iommu handling */
1189 static int iommu_alloc_root_entry(struct intel_iommu *iommu)
1190 {
1191         struct root_entry *root;
1192
1193         root = alloc_pgtable_page(iommu->node, GFP_ATOMIC);
1194         if (!root) {
1195                 pr_err("Allocating root entry for %s failed\n",
1196                         iommu->name);
1197                 return -ENOMEM;
1198         }
1199
1200         __iommu_flush_cache(iommu, root, ROOT_SIZE);
1201         iommu->root_entry = root;
1202
1203         return 0;
1204 }
1205
1206 static void iommu_set_root_entry(struct intel_iommu *iommu)
1207 {
1208         u64 addr;
1209         u32 sts;
1210         unsigned long flag;
1211
1212         addr = virt_to_phys(iommu->root_entry);
1213         if (sm_supported(iommu))
1214                 addr |= DMA_RTADDR_SMT;
1215
1216         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1217         dmar_writeq(iommu->reg + DMAR_RTADDR_REG, addr);
1218
1219         writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG);
1220
1221         /* Make sure hardware complete it */
1222         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1223                       readl, (sts & DMA_GSTS_RTPS), sts);
1224
1225         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1226
1227         /*
1228          * Hardware invalidates all DMA remapping hardware translation
1229          * caches as part of SRTP flow.
1230          */
1231         if (cap_esrtps(iommu->cap))
1232                 return;
1233
1234         iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
1235         if (sm_supported(iommu))
1236                 qi_flush_pasid_cache(iommu, 0, QI_PC_GLOBAL, 0);
1237         iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
1238 }
1239
1240 void iommu_flush_write_buffer(struct intel_iommu *iommu)
1241 {
1242         u32 val;
1243         unsigned long flag;
1244
1245         if (!rwbf_quirk && !cap_rwbf(iommu->cap))
1246                 return;
1247
1248         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1249         writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG);
1250
1251         /* Make sure hardware complete it */
1252         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1253                       readl, (!(val & DMA_GSTS_WBFS)), val);
1254
1255         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1256 }
1257
1258 /* return value determine if we need a write buffer flush */
1259 static void __iommu_flush_context(struct intel_iommu *iommu,
1260                                   u16 did, u16 source_id, u8 function_mask,
1261                                   u64 type)
1262 {
1263         u64 val = 0;
1264         unsigned long flag;
1265
1266         switch (type) {
1267         case DMA_CCMD_GLOBAL_INVL:
1268                 val = DMA_CCMD_GLOBAL_INVL;
1269                 break;
1270         case DMA_CCMD_DOMAIN_INVL:
1271                 val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
1272                 break;
1273         case DMA_CCMD_DEVICE_INVL:
1274                 val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
1275                         | DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
1276                 break;
1277         default:
1278                 pr_warn("%s: Unexpected context-cache invalidation type 0x%llx\n",
1279                         iommu->name, type);
1280                 return;
1281         }
1282         val |= DMA_CCMD_ICC;
1283
1284         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1285         dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
1286
1287         /* Make sure hardware complete it */
1288         IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
1289                 dmar_readq, (!(val & DMA_CCMD_ICC)), val);
1290
1291         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1292 }
1293
1294 /* return value determine if we need a write buffer flush */
1295 static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
1296                                 u64 addr, unsigned int size_order, u64 type)
1297 {
1298         int tlb_offset = ecap_iotlb_offset(iommu->ecap);
1299         u64 val = 0, val_iva = 0;
1300         unsigned long flag;
1301
1302         switch (type) {
1303         case DMA_TLB_GLOBAL_FLUSH:
1304                 /* global flush doesn't need set IVA_REG */
1305                 val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
1306                 break;
1307         case DMA_TLB_DSI_FLUSH:
1308                 val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1309                 break;
1310         case DMA_TLB_PSI_FLUSH:
1311                 val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1312                 /* IH bit is passed in as part of address */
1313                 val_iva = size_order | addr;
1314                 break;
1315         default:
1316                 pr_warn("%s: Unexpected iotlb invalidation type 0x%llx\n",
1317                         iommu->name, type);
1318                 return;
1319         }
1320
1321         if (cap_write_drain(iommu->cap))
1322                 val |= DMA_TLB_WRITE_DRAIN;
1323
1324         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1325         /* Note: Only uses first TLB reg currently */
1326         if (val_iva)
1327                 dmar_writeq(iommu->reg + tlb_offset, val_iva);
1328         dmar_writeq(iommu->reg + tlb_offset + 8, val);
1329
1330         /* Make sure hardware complete it */
1331         IOMMU_WAIT_OP(iommu, tlb_offset + 8,
1332                 dmar_readq, (!(val & DMA_TLB_IVT)), val);
1333
1334         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1335
1336         /* check IOTLB invalidation granularity */
1337         if (DMA_TLB_IAIG(val) == 0)
1338                 pr_err("Flush IOTLB failed\n");
1339         if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
1340                 pr_debug("TLB flush request %Lx, actual %Lx\n",
1341                         (unsigned long long)DMA_TLB_IIRG(type),
1342                         (unsigned long long)DMA_TLB_IAIG(val));
1343 }
1344
1345 static struct device_domain_info *
1346 domain_lookup_dev_info(struct dmar_domain *domain,
1347                        struct intel_iommu *iommu, u8 bus, u8 devfn)
1348 {
1349         struct device_domain_info *info;
1350         unsigned long flags;
1351
1352         spin_lock_irqsave(&domain->lock, flags);
1353         list_for_each_entry(info, &domain->devices, link) {
1354                 if (info->iommu == iommu && info->bus == bus &&
1355                     info->devfn == devfn) {
1356                         spin_unlock_irqrestore(&domain->lock, flags);
1357                         return info;
1358                 }
1359         }
1360         spin_unlock_irqrestore(&domain->lock, flags);
1361
1362         return NULL;
1363 }
1364
1365 static void domain_update_iotlb(struct dmar_domain *domain)
1366 {
1367         struct dev_pasid_info *dev_pasid;
1368         struct device_domain_info *info;
1369         bool has_iotlb_device = false;
1370         unsigned long flags;
1371
1372         spin_lock_irqsave(&domain->lock, flags);
1373         list_for_each_entry(info, &domain->devices, link) {
1374                 if (info->ats_enabled) {
1375                         has_iotlb_device = true;
1376                         break;
1377                 }
1378         }
1379
1380         list_for_each_entry(dev_pasid, &domain->dev_pasids, link_domain) {
1381                 info = dev_iommu_priv_get(dev_pasid->dev);
1382                 if (info->ats_enabled) {
1383                         has_iotlb_device = true;
1384                         break;
1385                 }
1386         }
1387         domain->has_iotlb_device = has_iotlb_device;
1388         spin_unlock_irqrestore(&domain->lock, flags);
1389 }
1390
1391 /*
1392  * The extra devTLB flush quirk impacts those QAT devices with PCI device
1393  * IDs ranging from 0x4940 to 0x4943. It is exempted from risky_device()
1394  * check because it applies only to the built-in QAT devices and it doesn't
1395  * grant additional privileges.
1396  */
1397 #define BUGGY_QAT_DEVID_MASK 0x4940
1398 static bool dev_needs_extra_dtlb_flush(struct pci_dev *pdev)
1399 {
1400         if (pdev->vendor != PCI_VENDOR_ID_INTEL)
1401                 return false;
1402
1403         if ((pdev->device & 0xfffc) != BUGGY_QAT_DEVID_MASK)
1404                 return false;
1405
1406         return true;
1407 }
1408
1409 static void iommu_enable_pci_caps(struct device_domain_info *info)
1410 {
1411         struct pci_dev *pdev;
1412
1413         if (!dev_is_pci(info->dev))
1414                 return;
1415
1416         pdev = to_pci_dev(info->dev);
1417
1418         /* The PCIe spec, in its wisdom, declares that the behaviour of
1419            the device if you enable PASID support after ATS support is
1420            undefined. So always enable PASID support on devices which
1421            have it, even if we can't yet know if we're ever going to
1422            use it. */
1423         if (info->pasid_supported && !pci_enable_pasid(pdev, info->pasid_supported & ~1))
1424                 info->pasid_enabled = 1;
1425
1426         if (info->ats_supported && pci_ats_page_aligned(pdev) &&
1427             !pci_enable_ats(pdev, VTD_PAGE_SHIFT)) {
1428                 info->ats_enabled = 1;
1429                 domain_update_iotlb(info->domain);
1430         }
1431 }
1432
1433 static void iommu_disable_pci_caps(struct device_domain_info *info)
1434 {
1435         struct pci_dev *pdev;
1436
1437         if (!dev_is_pci(info->dev))
1438                 return;
1439
1440         pdev = to_pci_dev(info->dev);
1441
1442         if (info->ats_enabled) {
1443                 pci_disable_ats(pdev);
1444                 info->ats_enabled = 0;
1445                 domain_update_iotlb(info->domain);
1446         }
1447
1448         if (info->pasid_enabled) {
1449                 pci_disable_pasid(pdev);
1450                 info->pasid_enabled = 0;
1451         }
1452 }
1453
1454 static void __iommu_flush_dev_iotlb(struct device_domain_info *info,
1455                                     u64 addr, unsigned int mask)
1456 {
1457         u16 sid, qdep;
1458
1459         if (!info || !info->ats_enabled)
1460                 return;
1461
1462         sid = info->bus << 8 | info->devfn;
1463         qdep = info->ats_qdep;
1464         qi_flush_dev_iotlb(info->iommu, sid, info->pfsid,
1465                            qdep, addr, mask);
1466         quirk_extra_dev_tlb_flush(info, addr, mask, IOMMU_NO_PASID, qdep);
1467 }
1468
1469 static void iommu_flush_dev_iotlb(struct dmar_domain *domain,
1470                                   u64 addr, unsigned mask)
1471 {
1472         struct dev_pasid_info *dev_pasid;
1473         struct device_domain_info *info;
1474         unsigned long flags;
1475
1476         if (!domain->has_iotlb_device)
1477                 return;
1478
1479         spin_lock_irqsave(&domain->lock, flags);
1480         list_for_each_entry(info, &domain->devices, link)
1481                 __iommu_flush_dev_iotlb(info, addr, mask);
1482
1483         list_for_each_entry(dev_pasid, &domain->dev_pasids, link_domain) {
1484                 info = dev_iommu_priv_get(dev_pasid->dev);
1485
1486                 if (!info->ats_enabled)
1487                         continue;
1488
1489                 qi_flush_dev_iotlb_pasid(info->iommu,
1490                                          PCI_DEVID(info->bus, info->devfn),
1491                                          info->pfsid, dev_pasid->pasid,
1492                                          info->ats_qdep, addr,
1493                                          mask);
1494         }
1495         spin_unlock_irqrestore(&domain->lock, flags);
1496 }
1497
1498 static void domain_flush_pasid_iotlb(struct intel_iommu *iommu,
1499                                      struct dmar_domain *domain, u64 addr,
1500                                      unsigned long npages, bool ih)
1501 {
1502         u16 did = domain_id_iommu(domain, iommu);
1503         struct dev_pasid_info *dev_pasid;
1504         unsigned long flags;
1505
1506         spin_lock_irqsave(&domain->lock, flags);
1507         list_for_each_entry(dev_pasid, &domain->dev_pasids, link_domain)
1508                 qi_flush_piotlb(iommu, did, dev_pasid->pasid, addr, npages, ih);
1509
1510         if (!list_empty(&domain->devices))
1511                 qi_flush_piotlb(iommu, did, IOMMU_NO_PASID, addr, npages, ih);
1512         spin_unlock_irqrestore(&domain->lock, flags);
1513 }
1514
1515 static void iommu_flush_iotlb_psi(struct intel_iommu *iommu,
1516                                   struct dmar_domain *domain,
1517                                   unsigned long pfn, unsigned int pages,
1518                                   int ih, int map)
1519 {
1520         unsigned int aligned_pages = __roundup_pow_of_two(pages);
1521         unsigned int mask = ilog2(aligned_pages);
1522         uint64_t addr = (uint64_t)pfn << VTD_PAGE_SHIFT;
1523         u16 did = domain_id_iommu(domain, iommu);
1524
1525         if (WARN_ON(!pages))
1526                 return;
1527
1528         if (ih)
1529                 ih = 1 << 6;
1530
1531         if (domain->use_first_level) {
1532                 domain_flush_pasid_iotlb(iommu, domain, addr, pages, ih);
1533         } else {
1534                 unsigned long bitmask = aligned_pages - 1;
1535
1536                 /*
1537                  * PSI masks the low order bits of the base address. If the
1538                  * address isn't aligned to the mask, then compute a mask value
1539                  * needed to ensure the target range is flushed.
1540                  */
1541                 if (unlikely(bitmask & pfn)) {
1542                         unsigned long end_pfn = pfn + pages - 1, shared_bits;
1543
1544                         /*
1545                          * Since end_pfn <= pfn + bitmask, the only way bits
1546                          * higher than bitmask can differ in pfn and end_pfn is
1547                          * by carrying. This means after masking out bitmask,
1548                          * high bits starting with the first set bit in
1549                          * shared_bits are all equal in both pfn and end_pfn.
1550                          */
1551                         shared_bits = ~(pfn ^ end_pfn) & ~bitmask;
1552                         mask = shared_bits ? __ffs(shared_bits) : BITS_PER_LONG;
1553                 }
1554
1555                 /*
1556                  * Fallback to domain selective flush if no PSI support or
1557                  * the size is too big.
1558                  */
1559                 if (!cap_pgsel_inv(iommu->cap) ||
1560                     mask > cap_max_amask_val(iommu->cap))
1561                         iommu->flush.flush_iotlb(iommu, did, 0, 0,
1562                                                         DMA_TLB_DSI_FLUSH);
1563                 else
1564                         iommu->flush.flush_iotlb(iommu, did, addr | ih, mask,
1565                                                         DMA_TLB_PSI_FLUSH);
1566         }
1567
1568         /*
1569          * In caching mode, changes of pages from non-present to present require
1570          * flush. However, device IOTLB doesn't need to be flushed in this case.
1571          */
1572         if (!cap_caching_mode(iommu->cap) || !map)
1573                 iommu_flush_dev_iotlb(domain, addr, mask);
1574 }
1575
1576 /* Notification for newly created mappings */
1577 static inline void __mapping_notify_one(struct intel_iommu *iommu,
1578                                         struct dmar_domain *domain,
1579                                         unsigned long pfn, unsigned int pages)
1580 {
1581         /*
1582          * It's a non-present to present mapping. Only flush if caching mode
1583          * and second level.
1584          */
1585         if (cap_caching_mode(iommu->cap) && !domain->use_first_level)
1586                 iommu_flush_iotlb_psi(iommu, domain, pfn, pages, 0, 1);
1587         else
1588                 iommu_flush_write_buffer(iommu);
1589 }
1590
1591 static void intel_flush_iotlb_all(struct iommu_domain *domain)
1592 {
1593         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
1594         struct iommu_domain_info *info;
1595         unsigned long idx;
1596
1597         xa_for_each(&dmar_domain->iommu_array, idx, info) {
1598                 struct intel_iommu *iommu = info->iommu;
1599                 u16 did = domain_id_iommu(dmar_domain, iommu);
1600
1601                 if (dmar_domain->use_first_level)
1602                         domain_flush_pasid_iotlb(iommu, dmar_domain, 0, -1, 0);
1603                 else
1604                         iommu->flush.flush_iotlb(iommu, did, 0, 0,
1605                                                  DMA_TLB_DSI_FLUSH);
1606
1607                 if (!cap_caching_mode(iommu->cap))
1608                         iommu_flush_dev_iotlb(dmar_domain, 0, MAX_AGAW_PFN_WIDTH);
1609         }
1610 }
1611
1612 static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
1613 {
1614         u32 pmen;
1615         unsigned long flags;
1616
1617         if (!cap_plmr(iommu->cap) && !cap_phmr(iommu->cap))
1618                 return;
1619
1620         raw_spin_lock_irqsave(&iommu->register_lock, flags);
1621         pmen = readl(iommu->reg + DMAR_PMEN_REG);
1622         pmen &= ~DMA_PMEN_EPM;
1623         writel(pmen, iommu->reg + DMAR_PMEN_REG);
1624
1625         /* wait for the protected region status bit to clear */
1626         IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
1627                 readl, !(pmen & DMA_PMEN_PRS), pmen);
1628
1629         raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1630 }
1631
1632 static void iommu_enable_translation(struct intel_iommu *iommu)
1633 {
1634         u32 sts;
1635         unsigned long flags;
1636
1637         raw_spin_lock_irqsave(&iommu->register_lock, flags);
1638         iommu->gcmd |= DMA_GCMD_TE;
1639         writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1640
1641         /* Make sure hardware complete it */
1642         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1643                       readl, (sts & DMA_GSTS_TES), sts);
1644
1645         raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1646 }
1647
1648 static void iommu_disable_translation(struct intel_iommu *iommu)
1649 {
1650         u32 sts;
1651         unsigned long flag;
1652
1653         if (iommu_skip_te_disable && iommu->drhd->gfx_dedicated &&
1654             (cap_read_drain(iommu->cap) || cap_write_drain(iommu->cap)))
1655                 return;
1656
1657         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1658         iommu->gcmd &= ~DMA_GCMD_TE;
1659         writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1660
1661         /* Make sure hardware complete it */
1662         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1663                       readl, (!(sts & DMA_GSTS_TES)), sts);
1664
1665         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1666 }
1667
1668 static int iommu_init_domains(struct intel_iommu *iommu)
1669 {
1670         u32 ndomains;
1671
1672         ndomains = cap_ndoms(iommu->cap);
1673         pr_debug("%s: Number of Domains supported <%d>\n",
1674                  iommu->name, ndomains);
1675
1676         spin_lock_init(&iommu->lock);
1677
1678         iommu->domain_ids = bitmap_zalloc(ndomains, GFP_KERNEL);
1679         if (!iommu->domain_ids)
1680                 return -ENOMEM;
1681
1682         /*
1683          * If Caching mode is set, then invalid translations are tagged
1684          * with domain-id 0, hence we need to pre-allocate it. We also
1685          * use domain-id 0 as a marker for non-allocated domain-id, so
1686          * make sure it is not used for a real domain.
1687          */
1688         set_bit(0, iommu->domain_ids);
1689
1690         /*
1691          * Vt-d spec rev3.0 (section 6.2.3.1) requires that each pasid
1692          * entry for first-level or pass-through translation modes should
1693          * be programmed with a domain id different from those used for
1694          * second-level or nested translation. We reserve a domain id for
1695          * this purpose.
1696          */
1697         if (sm_supported(iommu))
1698                 set_bit(FLPT_DEFAULT_DID, iommu->domain_ids);
1699
1700         return 0;
1701 }
1702
1703 static void disable_dmar_iommu(struct intel_iommu *iommu)
1704 {
1705         if (!iommu->domain_ids)
1706                 return;
1707
1708         /*
1709          * All iommu domains must have been detached from the devices,
1710          * hence there should be no domain IDs in use.
1711          */
1712         if (WARN_ON(bitmap_weight(iommu->domain_ids, cap_ndoms(iommu->cap))
1713                     > NUM_RESERVED_DID))
1714                 return;
1715
1716         if (iommu->gcmd & DMA_GCMD_TE)
1717                 iommu_disable_translation(iommu);
1718 }
1719
1720 static void free_dmar_iommu(struct intel_iommu *iommu)
1721 {
1722         if (iommu->domain_ids) {
1723                 bitmap_free(iommu->domain_ids);
1724                 iommu->domain_ids = NULL;
1725         }
1726
1727         if (iommu->copied_tables) {
1728                 bitmap_free(iommu->copied_tables);
1729                 iommu->copied_tables = NULL;
1730         }
1731
1732         /* free context mapping */
1733         free_context_table(iommu);
1734
1735 #ifdef CONFIG_INTEL_IOMMU_SVM
1736         if (pasid_supported(iommu)) {
1737                 if (ecap_prs(iommu->ecap))
1738                         intel_svm_finish_prq(iommu);
1739         }
1740 #endif
1741 }
1742
1743 /*
1744  * Check and return whether first level is used by default for
1745  * DMA translation.
1746  */
1747 static bool first_level_by_default(unsigned int type)
1748 {
1749         /* Only SL is available in legacy mode */
1750         if (!scalable_mode_support())
1751                 return false;
1752
1753         /* Only level (either FL or SL) is available, just use it */
1754         if (intel_cap_flts_sanity() ^ intel_cap_slts_sanity())
1755                 return intel_cap_flts_sanity();
1756
1757         /* Both levels are available, decide it based on domain type */
1758         return type != IOMMU_DOMAIN_UNMANAGED;
1759 }
1760
1761 static struct dmar_domain *alloc_domain(unsigned int type)
1762 {
1763         struct dmar_domain *domain;
1764
1765         domain = kzalloc(sizeof(*domain), GFP_KERNEL);
1766         if (!domain)
1767                 return NULL;
1768
1769         domain->nid = NUMA_NO_NODE;
1770         if (first_level_by_default(type))
1771                 domain->use_first_level = true;
1772         domain->has_iotlb_device = false;
1773         INIT_LIST_HEAD(&domain->devices);
1774         INIT_LIST_HEAD(&domain->dev_pasids);
1775         spin_lock_init(&domain->lock);
1776         xa_init(&domain->iommu_array);
1777
1778         return domain;
1779 }
1780
1781 static int domain_attach_iommu(struct dmar_domain *domain,
1782                                struct intel_iommu *iommu)
1783 {
1784         struct iommu_domain_info *info, *curr;
1785         unsigned long ndomains;
1786         int num, ret = -ENOSPC;
1787
1788         info = kzalloc(sizeof(*info), GFP_KERNEL);
1789         if (!info)
1790                 return -ENOMEM;
1791
1792         spin_lock(&iommu->lock);
1793         curr = xa_load(&domain->iommu_array, iommu->seq_id);
1794         if (curr) {
1795                 curr->refcnt++;
1796                 spin_unlock(&iommu->lock);
1797                 kfree(info);
1798                 return 0;
1799         }
1800
1801         ndomains = cap_ndoms(iommu->cap);
1802         num = find_first_zero_bit(iommu->domain_ids, ndomains);
1803         if (num >= ndomains) {
1804                 pr_err("%s: No free domain ids\n", iommu->name);
1805                 goto err_unlock;
1806         }
1807
1808         set_bit(num, iommu->domain_ids);
1809         info->refcnt    = 1;
1810         info->did       = num;
1811         info->iommu     = iommu;
1812         curr = xa_cmpxchg(&domain->iommu_array, iommu->seq_id,
1813                           NULL, info, GFP_ATOMIC);
1814         if (curr) {
1815                 ret = xa_err(curr) ? : -EBUSY;
1816                 goto err_clear;
1817         }
1818         domain_update_iommu_cap(domain);
1819
1820         spin_unlock(&iommu->lock);
1821         return 0;
1822
1823 err_clear:
1824         clear_bit(info->did, iommu->domain_ids);
1825 err_unlock:
1826         spin_unlock(&iommu->lock);
1827         kfree(info);
1828         return ret;
1829 }
1830
1831 static void domain_detach_iommu(struct dmar_domain *domain,
1832                                 struct intel_iommu *iommu)
1833 {
1834         struct iommu_domain_info *info;
1835
1836         spin_lock(&iommu->lock);
1837         info = xa_load(&domain->iommu_array, iommu->seq_id);
1838         if (--info->refcnt == 0) {
1839                 clear_bit(info->did, iommu->domain_ids);
1840                 xa_erase(&domain->iommu_array, iommu->seq_id);
1841                 domain->nid = NUMA_NO_NODE;
1842                 domain_update_iommu_cap(domain);
1843                 kfree(info);
1844         }
1845         spin_unlock(&iommu->lock);
1846 }
1847
/*
 * Round a guest address width up to the next width of the form
 * 12 + 9 * k, capped at 64.
 */
static inline int guestwidth_to_adjustwidth(int gaw)
{
        int rem = (gaw - 12) % 9;
        int width = gaw;

        if (rem != 0)
                width += 9 - rem;

        return width > 64 ? 64 : width;
}
1861
1862 static void domain_exit(struct dmar_domain *domain)
1863 {
1864         if (domain->pgd) {
1865                 LIST_HEAD(freelist);
1866
1867                 domain_unmap(domain, 0, DOMAIN_MAX_PFN(domain->gaw), &freelist);
1868                 put_pages_list(&freelist);
1869         }
1870
1871         if (WARN_ON(!list_empty(&domain->devices)))
1872                 return;
1873
1874         kfree(domain);
1875 }
1876
1877 /*
1878  * Get the PASID directory size for scalable mode context entry.
1879  * Value of X in the PDTS field of a scalable mode context entry
1880  * indicates PASID directory with 2^(X + 7) entries.
1881  */
1882 static inline unsigned long context_get_sm_pds(struct pasid_table *table)
1883 {
1884         unsigned long pds, max_pde;
1885
1886         max_pde = table->max_pasid >> PASID_PDE_SHIFT;
1887         pds = find_first_bit(&max_pde, MAX_NR_PASID_BITS);
1888         if (pds < 7)
1889                 return 0;
1890
1891         return pds - 7;
1892 }
1893
1894 /*
1895  * Set the RID_PASID field of a scalable mode context entry. The
1896  * IOMMU hardware will use the PASID value set in this field for
1897  * DMA translations of DMA requests without PASID.
1898  */
1899 static inline void
1900 context_set_sm_rid2pasid(struct context_entry *context, unsigned long pasid)
1901 {
1902         context->hi |= pasid & ((1 << 20) - 1);
1903 }
1904
1905 /*
1906  * Set the DTE(Device-TLB Enable) field of a scalable mode context
1907  * entry.
1908  */
1909 static inline void context_set_sm_dte(struct context_entry *context)
1910 {
1911         context->lo |= BIT_ULL(2);
1912 }
1913
1914 /*
1915  * Set the PRE(Page Request Enable) field of a scalable mode context
1916  * entry.
1917  */
1918 static inline void context_set_sm_pre(struct context_entry *context)
1919 {
1920         context->lo |= BIT_ULL(4);
1921 }
1922
/*
 * Convert value to context PASID directory size field coding: keep the
 * low three bits of @pds and place them at bit 9, ready to be OR-ed into
 * the low word of a scalable mode context entry.
 */
#define context_pdts(pds)       (((pds) & 0x7) << 9)
1925
/*
 * Program the context entry for @bus/@devfn on @iommu so that the device
 * is translated through @domain.
 *
 * In scalable mode the entry is pointed at the PASID directory in @table;
 * in legacy mode it carries the domain id and either the domain's page
 * table root or a pass-through setting.  The whole update runs under
 * iommu->lock.
 *
 * Returns 0 on success, and also when a present entry that is not marked
 * as copied already exists (that entry is left untouched).  Returns
 * -ENOMEM when no context entry can be obtained or when a page-table
 * level is not present while skipping top levels in legacy mode.
 */
static int domain_context_mapping_one(struct dmar_domain *domain,
                                      struct intel_iommu *iommu,
                                      struct pasid_table *table,
                                      u8 bus, u8 devfn)
{
        struct device_domain_info *info =
                        domain_lookup_dev_info(domain, iommu, bus, devfn);
        u16 did = domain_id_iommu(domain, iommu);
        int translation = CONTEXT_TT_MULTI_LEVEL;
        struct context_entry *context;
        int ret;

        if (hw_pass_through && domain_type_is_si(domain))
                translation = CONTEXT_TT_PASS_THROUGH;

        pr_debug("Set context mapping for %02x:%02x.%d\n",
                bus, PCI_SLOT(devfn), PCI_FUNC(devfn));

        spin_lock(&iommu->lock);
        ret = -ENOMEM;
        /* NOTE(review): the final argument (1) presumably asks for the
         * context table to be allocated if missing - confirm against
         * iommu_context_addr().
         */
        context = iommu_context_addr(iommu, bus, devfn, 1);
        if (!context)
                goto out_unlock;

        /* An entry that is already present and not copied is kept as is. */
        ret = 0;
        if (context_present(context) && !context_copied(iommu, bus, devfn))
                goto out_unlock;

        /*
         * For kdump cases, old valid entries may be cached due to the
         * in-flight DMA and copied pgtable, but there is no unmapping
         * behaviour for them, thus we need an explicit cache flush for
         * the newly-mapped device. For kdump, at this point, the device
         * is supposed to finish reset at its driver probe stage, so no
         * in-flight DMA will exist, and we don't need to worry anymore
         * hereafter.
         */
        if (context_copied(iommu, bus, devfn)) {
                u16 did_old = context_domain_id(context);

                /* Only flush for a domain id this IOMMU can actually hold. */
                if (did_old < cap_ndoms(iommu->cap)) {
                        iommu->flush.flush_context(iommu, did_old,
                                                   (((u16)bus) << 8) | devfn,
                                                   DMA_CCMD_MASK_NOBIT,
                                                   DMA_CCMD_DEVICE_INVL);
                        iommu->flush.flush_iotlb(iommu, did_old, 0, 0,
                                                 DMA_TLB_DSI_FLUSH);
                }

                clear_context_copied(iommu, bus, devfn);
        }

        /* Start from a cleared entry before filling in the new contents. */
        context_clear_entry(context);

        if (sm_supported(iommu)) {
                unsigned long pds;

                /* Setup the PASID DIR pointer: */
                pds = context_get_sm_pds(table);
                context->lo = (u64)virt_to_phys(table->table) |
                                context_pdts(pds);

                /* Setup the RID_PASID field: */
                context_set_sm_rid2pasid(context, IOMMU_NO_PASID);

                /*
                 * Setup the Device-TLB enable bit and Page request
                 * Enable bit:
                 */
                if (info && info->ats_supported)
                        context_set_sm_dte(context);
                if (info && info->pri_supported)
                        context_set_sm_pre(context);
                if (info && info->pasid_supported)
                        context_set_pasid(context);
        } else {
                struct dma_pte *pgd = domain->pgd;
                int agaw;

                context_set_domain_id(context, did);

                if (translation != CONTEXT_TT_PASS_THROUGH) {
                        /*
                         * Skip top levels of page tables for iommu which has
                         * less agaw than default. Unnecessary for PT mode.
                         */
                        for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) {
                                ret = -ENOMEM;
                                pgd = phys_to_virt(dma_pte_addr(pgd));
                                if (!dma_pte_present(pgd))
                                        goto out_unlock;
                        }

                        /* Use the device-IOTLB translation type only with ATS. */
                        if (info && info->ats_supported)
                                translation = CONTEXT_TT_DEV_IOTLB;
                        else
                                translation = CONTEXT_TT_MULTI_LEVEL;

                        /* agaw now holds the level the loop above stopped at. */
                        context_set_address_root(context, virt_to_phys(pgd));
                        context_set_address_width(context, agaw);
                } else {
                        /*
                         * In pass through mode, AW must be programmed to
                         * indicate the largest AGAW value supported by
                         * hardware. And ASR is ignored by hardware.
                         */
                        context_set_address_width(context, iommu->msagaw);
                }

                context_set_translation_type(context, translation);
        }

        context_set_fault_enable(context);
        context_set_present(context);
        /* Without coherent table walks the CPU cache line must be flushed. */
        if (!ecap_coherent(iommu->ecap))
                clflush_cache_range(context, sizeof(*context));

        /*
         * It's a non-present to present mapping. If hardware doesn't cache
         * non-present entry we only need to flush the write-buffer. If it
         * _does_ cache non-present entries, then it does so in the special
         * domain #0, which we have to flush:
         */
        if (cap_caching_mode(iommu->cap)) {
                iommu->flush.flush_context(iommu, 0,
                                           (((u16)bus) << 8) | devfn,
                                           DMA_CCMD_MASK_NOBIT,
                                           DMA_CCMD_DEVICE_INVL);
                iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH);
        } else {
                iommu_flush_write_buffer(iommu);
        }

        ret = 0;

out_unlock:
        spin_unlock(&iommu->lock);

        return ret;
}
2066
/*
 * Arguments carried through the opaque pointer of
 * pci_for_each_dma_alias() to domain_context_mapping_cb(), which forwards
 * them to domain_context_mapping_one() for every DMA alias.
 */
struct domain_context_mapping_data {
        struct dmar_domain *domain;     /* domain being attached */
        struct intel_iommu *iommu;      /* IOMMU whose context table is programmed */
        struct pasid_table *table;      /* PASID table of the device */
};
2072
2073 static int domain_context_mapping_cb(struct pci_dev *pdev,
2074                                      u16 alias, void *opaque)
2075 {
2076         struct domain_context_mapping_data *data = opaque;
2077
2078         return domain_context_mapping_one(data->domain, data->iommu,
2079                                           data->table, PCI_BUS_NUM(alias),
2080                                           alias & 0xff);
2081 }
2082
2083 static int
2084 domain_context_mapping(struct dmar_domain *domain, struct device *dev)
2085 {
2086         struct domain_context_mapping_data data;
2087         struct pasid_table *table;
2088         struct intel_iommu *iommu;
2089         u8 bus, devfn;
2090
2091         iommu = device_to_iommu(dev, &bus, &devfn);
2092         if (!iommu)
2093                 return -ENODEV;
2094
2095         table = intel_pasid_get_table(dev);
2096
2097         if (!dev_is_pci(dev))
2098                 return domain_context_mapping_one(domain, iommu, table,
2099                                                   bus, devfn);
2100
2101         data.domain = domain;
2102         data.iommu = iommu;
2103         data.table = table;
2104
2105         return pci_for_each_dma_alias(to_pci_dev(dev),
2106                                       &domain_context_mapping_cb, &data);
2107 }
2108
2109 /* Returns a number of VTD pages, but aligned to MM page size */
2110 static inline unsigned long aligned_nrpages(unsigned long host_addr,
2111                                             size_t size)
2112 {
2113         host_addr &= ~PAGE_MASK;
2114         return PAGE_ALIGN(host_addr + size) >> VTD_PAGE_SHIFT;
2115 }
2116
2117 /* Return largest possible superpage level for a given mapping */
2118 static inline int hardware_largepage_caps(struct dmar_domain *domain,
2119                                           unsigned long iov_pfn,
2120                                           unsigned long phy_pfn,
2121                                           unsigned long pages)
2122 {
2123         int support, level = 1;
2124         unsigned long pfnmerge;
2125
2126         support = domain->iommu_superpage;
2127
2128         /* To use a large page, the virtual *and* physical addresses
2129            must be aligned to 2MiB/1GiB/etc. Lower bits set in either
2130            of them will mean we have to use smaller pages. So just
2131            merge them and check both at once. */
2132         pfnmerge = iov_pfn | phy_pfn;
2133
2134         while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) {
2135                 pages >>= VTD_STRIDE_SHIFT;
2136                 if (!pages)
2137                         break;
2138                 pfnmerge >>= VTD_STRIDE_SHIFT;
2139                 level++;
2140                 support--;
2141         }
2142         return level;
2143 }
2144
2145 /*
2146  * Ensure that old small page tables are removed to make room for superpage(s).
2147  * We're going to add new large pages, so make sure we don't remove their parent
2148  * tables. The IOTLB/devTLBs should be flushed if any PDE/PTEs are cleared.
2149  */
2150 static void switch_to_super_page(struct dmar_domain *domain,
2151                                  unsigned long start_pfn,
2152                                  unsigned long end_pfn, int level)
2153 {
2154         unsigned long lvl_pages = lvl_to_nr_pages(level);
2155         struct iommu_domain_info *info;
2156         struct dma_pte *pte = NULL;
2157         unsigned long i;
2158
2159         while (start_pfn <= end_pfn) {
2160                 if (!pte)
2161                         pte = pfn_to_dma_pte(domain, start_pfn, &level,
2162                                              GFP_ATOMIC);
2163
2164                 if (dma_pte_present(pte)) {
2165                         dma_pte_free_pagetable(domain, start_pfn,
2166                                                start_pfn + lvl_pages - 1,
2167                                                level + 1);
2168
2169                         xa_for_each(&domain->iommu_array, i, info)
2170                                 iommu_flush_iotlb_psi(info->iommu, domain,
2171                                                       start_pfn, lvl_pages,
2172                                                       0, 0);
2173                 }
2174
2175                 pte++;
2176                 start_pfn += lvl_pages;
2177                 if (first_pte_in_page(pte))
2178                         pte = NULL;
2179         }
2180 }
2181
/*
 * Write the PTEs that map @nr_pages VT-d pages starting at @iov_pfn onto
 * the physical range starting at @phys_pfn in @domain, using superpages
 * where hardware_largepage_caps() allows it.
 *
 * @prot is a mask of DMA_PTE_READ/DMA_PTE_WRITE/DMA_PTE_SNP; at least one
 * of read or write must be set.  @gfp is passed to pfn_to_dma_pte().
 *
 * A PTE slot that is already non-zero is reported (pr_crit + WARN) and
 * left as it is; the function then carries on with the next entry.
 *
 * Returns 0 on success, -EINVAL if the last pfn is not supported by the
 * domain or @prot grants neither read nor write, and -ENOMEM if
 * pfn_to_dma_pte() returns no PTE.
 */
static int
__domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
                 unsigned long phys_pfn, unsigned long nr_pages, int prot,
                 gfp_t gfp)
{
        struct dma_pte *first_pte = NULL, *pte = NULL;
        unsigned int largepage_lvl = 0;
        unsigned long lvl_pages = 0;
        phys_addr_t pteval;
        u64 attr;

        if (unlikely(!domain_pfn_supported(domain, iov_pfn + nr_pages - 1)))
                return -EINVAL;

        if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
                return -EINVAL;

        /* Only the read/write/snoop bits of @prot reach the PTE. */
        attr = prot & (DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP);
        attr |= DMA_FL_PTE_PRESENT;
        if (domain->use_first_level) {
                attr |= DMA_FL_PTE_XD | DMA_FL_PTE_US | DMA_FL_PTE_ACCESS;
                if (prot & DMA_PTE_WRITE)
                        attr |= DMA_FL_PTE_DIRTY;
        }

        pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | attr;

        while (nr_pages > 0) {
                uint64_t tmp;

                /*
                 * pte is NULL on the first pass and whenever the previous
                 * run of entries was finished below: pick the page size and
                 * look up the PTE for the current iov_pfn again.
                 */
                if (!pte) {
                        largepage_lvl = hardware_largepage_caps(domain, iov_pfn,
                                        phys_pfn, nr_pages);

                        pte = pfn_to_dma_pte(domain, iov_pfn, &largepage_lvl,
                                             gfp);
                        if (!pte)
                                return -ENOMEM;
                        first_pte = pte;

                        lvl_pages = lvl_to_nr_pages(largepage_lvl);

                        /* Superpage: tag the PTE value and drop old small-page tables. */
                        if (largepage_lvl > 1) {
                                unsigned long end_pfn;
                                unsigned long pages_to_remove;

                                pteval |= DMA_PTE_LARGE_PAGE;
                                /* Limit the cleanup to the entries left in this table page. */
                                pages_to_remove = min_t(unsigned long, nr_pages,
                                                        nr_pte_to_next_page(pte) * lvl_pages);
                                end_pfn = iov_pfn + pages_to_remove - 1;
                                switch_to_super_page(domain, iov_pfn, end_pfn, largepage_lvl);
                        } else {
                                pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE;
                        }

                }
                /* We don't need lock here, nobody else
                 * touches the iova range
                 */
                /* The store only happens if the slot currently holds 0. */
                tmp = cmpxchg64_local(&pte->val, 0ULL, pteval);
                if (tmp) {
                        /* Cap the number of mapping dumps emitted over time. */
                        static int dumps = 5;
                        pr_crit("ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n",
                                iov_pfn, tmp, (unsigned long long)pteval);
                        if (dumps) {
                                dumps--;
                                debug_dma_dump_mappings(NULL);
                        }
                        WARN_ON(1);
                }

                /* Advance all cursors by the size of the page just written. */
                nr_pages -= lvl_pages;
                iov_pfn += lvl_pages;
                phys_pfn += lvl_pages;
                pteval += lvl_pages * VTD_PAGE_SIZE;

                /* If the next PTE would be the first in a new page, then we
                 * need to flush the cache on the entries we've just written.
                 * And then we'll need to recalculate 'pte', so clear it and
                 * let it get set again in the if (!pte) block above.
                 *
                 * If we're done (!nr_pages) we need to flush the cache too.
                 *
                 * Also if we've been setting superpages, we may need to
                 * recalculate 'pte' and switch back to smaller pages for the
                 * end of the mapping, if the trailing size is not enough to
                 * use another superpage (i.e. nr_pages < lvl_pages).
                 */
                pte++;
                if (!nr_pages || first_pte_in_page(pte) ||
                    (largepage_lvl > 1 && nr_pages < lvl_pages)) {
                        domain_flush_cache(domain, first_pte,
                                           (void *)pte - (void *)first_pte);
                        pte = NULL;
                }
        }

        return 0;
}
2282
2283 static void domain_context_clear_one(struct device_domain_info *info, u8 bus, u8 devfn)
2284 {
2285         struct intel_iommu *iommu = info->iommu;
2286         struct context_entry *context;
2287         u16 did_old;
2288
2289         if (!iommu)
2290                 return;
2291
2292         spin_lock(&iommu->lock);
2293         context = iommu_context_addr(iommu, bus, devfn, 0);
2294         if (!context) {
2295                 spin_unlock(&iommu->lock);
2296                 return;
2297         }
2298
2299         if (sm_supported(iommu)) {
2300                 if (hw_pass_through && domain_type_is_si(info->domain))
2301                         did_old = FLPT_DEFAULT_DID;
2302                 else
2303                         did_old = domain_id_iommu(info->domain, iommu);
2304         } else {
2305                 did_old = context_domain_id(context);
2306         }
2307
2308         context_clear_entry(context);
2309         __iommu_flush_cache(iommu, context, sizeof(*context));
2310         spin_unlock(&iommu->lock);
2311         iommu->flush.flush_context(iommu,
2312                                    did_old,
2313                                    (((u16)bus) << 8) | devfn,
2314                                    DMA_CCMD_MASK_NOBIT,
2315                                    DMA_CCMD_DEVICE_INVL);
2316
2317         if (sm_supported(iommu))
2318                 qi_flush_pasid_cache(iommu, did_old, QI_PC_ALL_PASIDS, 0);
2319
2320         iommu->flush.flush_iotlb(iommu,
2321                                  did_old,
2322                                  0,
2323                                  0,
2324                                  DMA_TLB_DSI_FLUSH);
2325
2326         __iommu_flush_dev_iotlb(info, 0, MAX_AGAW_PFN_WIDTH);
2327 }
2328
2329 static int domain_setup_first_level(struct intel_iommu *iommu,
2330                                     struct dmar_domain *domain,
2331                                     struct device *dev,
2332                                     u32 pasid)
2333 {
2334         struct dma_pte *pgd = domain->pgd;
2335         int agaw, level;
2336         int flags = 0;
2337
2338         /*
2339          * Skip top levels of page tables for iommu which has
2340          * less agaw than default. Unnecessary for PT mode.
2341          */
2342         for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) {
2343                 pgd = phys_to_virt(dma_pte_addr(pgd));
2344                 if (!dma_pte_present(pgd))
2345                         return -ENOMEM;
2346         }
2347
2348         level = agaw_to_level(agaw);
2349         if (level != 4 && level != 5)
2350                 return -EINVAL;
2351
2352         if (level == 5)
2353                 flags |= PASID_FLAG_FL5LP;
2354
2355         if (domain->force_snooping)
2356                 flags |= PASID_FLAG_PAGE_SNOOP;
2357
2358         return intel_pasid_setup_first_level(iommu, dev, (pgd_t *)pgd, pasid,
2359                                              domain_id_iommu(domain, iommu),
2360                                              flags);
2361 }
2362
2363 static bool dev_is_real_dma_subdevice(struct device *dev)
2364 {
2365         return dev && dev_is_pci(dev) &&
2366                pci_real_dma_dev(to_pci_dev(dev)) != to_pci_dev(dev);
2367 }
2368
2369 static int iommu_domain_identity_map(struct dmar_domain *domain,
2370                                      unsigned long first_vpfn,
2371                                      unsigned long last_vpfn)
2372 {
2373         /*
2374          * RMRR range might have overlap with physical memory range,
2375          * clear it first
2376          */
2377         dma_pte_clear_range(domain, first_vpfn, last_vpfn);
2378
2379         return __domain_mapping(domain, first_vpfn,
2380                                 first_vpfn, last_vpfn - first_vpfn + 1,
2381                                 DMA_PTE_READ|DMA_PTE_WRITE, GFP_KERNEL);
2382 }
2383
/* Declared ahead of its use in si_domain_init(). */
static int md_domain_init(struct dmar_domain *domain, int guest_width);
2385
2386 static int __init si_domain_init(int hw)
2387 {
2388         struct dmar_rmrr_unit *rmrr;
2389         struct device *dev;
2390         int i, nid, ret;
2391
2392         si_domain = alloc_domain(IOMMU_DOMAIN_IDENTITY);
2393         if (!si_domain)
2394                 return -EFAULT;
2395
2396         if (md_domain_init(si_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
2397                 domain_exit(si_domain);
2398                 si_domain = NULL;
2399                 return -EFAULT;
2400         }
2401
2402         if (hw)
2403                 return 0;
2404
2405         for_each_online_node(nid) {
2406                 unsigned long start_pfn, end_pfn;
2407                 int i;
2408
2409                 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
2410                         ret = iommu_domain_identity_map(si_domain,
2411                                         mm_to_dma_pfn_start(start_pfn),
2412                                         mm_to_dma_pfn_end(end_pfn));
2413                         if (ret)
2414                                 return ret;
2415                 }
2416         }
2417
2418         /*
2419          * Identity map the RMRRs so that devices with RMRRs could also use
2420          * the si_domain.
2421          */
2422         for_each_rmrr_units(rmrr) {
2423                 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
2424                                           i, dev) {
2425                         unsigned long long start = rmrr->base_address;
2426                         unsigned long long end = rmrr->end_address;
2427
2428                         if (WARN_ON(end < start ||
2429                                     end >> agaw_to_width(si_domain->agaw)))
2430                                 continue;
2431
2432                         ret = iommu_domain_identity_map(si_domain,
2433                                         mm_to_dma_pfn_start(start >> PAGE_SHIFT),
2434                                         mm_to_dma_pfn_end(end >> PAGE_SHIFT));
2435                         if (ret)
2436                                 return ret;
2437                 }
2438         }
2439
2440         return 0;
2441 }
2442
2443 static int dmar_domain_attach_device(struct dmar_domain *domain,
2444                                      struct device *dev)
2445 {
2446         struct device_domain_info *info = dev_iommu_priv_get(dev);
2447         struct intel_iommu *iommu;
2448         unsigned long flags;
2449         u8 bus, devfn;
2450         int ret;
2451
2452         iommu = device_to_iommu(dev, &bus, &devfn);
2453         if (!iommu)
2454                 return -ENODEV;
2455
2456         ret = domain_attach_iommu(domain, iommu);
2457         if (ret)
2458                 return ret;
2459         info->domain = domain;
2460         spin_lock_irqsave(&domain->lock, flags);
2461         list_add(&info->link, &domain->devices);
2462         spin_unlock_irqrestore(&domain->lock, flags);
2463
2464         /* PASID table is mandatory for a PCI device in scalable mode. */
2465         if (sm_supported(iommu) && !dev_is_real_dma_subdevice(dev)) {
2466                 /* Setup the PASID entry for requests without PASID: */
2467                 if (hw_pass_through && domain_type_is_si(domain))
2468                         ret = intel_pasid_setup_pass_through(iommu, domain,
2469                                         dev, IOMMU_NO_PASID);
2470                 else if (domain->use_first_level)
2471                         ret = domain_setup_first_level(iommu, domain, dev,
2472                                         IOMMU_NO_PASID);
2473                 else
2474                         ret = intel_pasid_setup_second_level(iommu, domain,
2475                                         dev, IOMMU_NO_PASID);
2476                 if (ret) {
2477                         dev_err(dev, "Setup RID2PASID failed\n");
2478                         device_block_translation(dev);
2479                         return ret;
2480                 }
2481         }
2482
2483         ret = domain_context_mapping(domain, dev);
2484         if (ret) {
2485                 dev_err(dev, "Domain context map failed\n");
2486                 device_block_translation(dev);
2487                 return ret;
2488         }
2489
2490         iommu_enable_pci_caps(info);
2491
2492         return 0;
2493 }
2494
2495 /**
2496  * device_rmrr_is_relaxable - Test whether the RMRR of this device
2497  * is relaxable (ie. is allowed to be not enforced under some conditions)
2498  * @dev: device handle
2499  *
2500  * We assume that PCI USB devices with RMRRs have them largely
2501  * for historical reasons and that the RMRR space is not actively used post
2502  * boot.  This exclusion may change if vendors begin to abuse it.
2503  *
2504  * The same exception is made for graphics devices, with the requirement that
2505  * any use of the RMRR regions will be torn down before assigning the device
2506  * to a guest.
2507  *
2508  * Return: true if the RMRR is relaxable, false otherwise
2509  */
2510 static bool device_rmrr_is_relaxable(struct device *dev)
2511 {
2512         struct pci_dev *pdev;
2513
2514         if (!dev_is_pci(dev))
2515                 return false;
2516
2517         pdev = to_pci_dev(dev);
2518         if (IS_USB_DEVICE(pdev) || IS_GFX_DEVICE(pdev))
2519                 return true;
2520         else
2521                 return false;
2522 }
2523
2524 /*
2525  * Return the required default domain type for a specific device.
2526  *
2527  * @dev: the device in query
2528  * @startup: true if this is during early boot
2529  *
2530  * Returns:
2531  *  - IOMMU_DOMAIN_DMA: device requires a dynamic mapping domain
2532  *  - IOMMU_DOMAIN_IDENTITY: device requires an identical mapping domain
2533  *  - 0: both identity and dynamic domains work for this device
2534  */
2535 static int device_def_domain_type(struct device *dev)
2536 {
2537         if (dev_is_pci(dev)) {
2538                 struct pci_dev *pdev = to_pci_dev(dev);
2539
2540                 if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev))
2541                         return IOMMU_DOMAIN_IDENTITY;
2542
2543                 if ((iommu_identity_mapping & IDENTMAP_GFX) && IS_GFX_DEVICE(pdev))
2544                         return IOMMU_DOMAIN_IDENTITY;
2545         }
2546
2547         return 0;
2548 }
2549
2550 static void intel_iommu_init_qi(struct intel_iommu *iommu)
2551 {
2552         /*
2553          * Start from the sane iommu hardware state.
2554          * If the queued invalidation is already initialized by us
2555          * (for example, while enabling interrupt-remapping) then
2556          * we got the things already rolling from a sane state.
2557          */
2558         if (!iommu->qi) {
2559                 /*
2560                  * Clear any previous faults.
2561                  */
2562                 dmar_fault(-1, iommu);
2563                 /*
2564                  * Disable queued invalidation if supported and already enabled
2565                  * before OS handover.
2566                  */
2567                 dmar_disable_qi(iommu);
2568         }
2569
2570         if (dmar_enable_qi(iommu)) {
2571                 /*
2572                  * Queued Invalidate not enabled, use Register Based Invalidate
2573                  */
2574                 iommu->flush.flush_context = __iommu_flush_context;
2575                 iommu->flush.flush_iotlb = __iommu_flush_iotlb;
2576                 pr_info("%s: Using Register based invalidation\n",
2577                         iommu->name);
2578         } else {
2579                 iommu->flush.flush_context = qi_flush_context;
2580                 iommu->flush.flush_iotlb = qi_flush_iotlb;
2581                 pr_info("%s: Using Queued invalidation\n", iommu->name);
2582         }
2583 }
2584
2585 static int copy_context_table(struct intel_iommu *iommu,
2586                               struct root_entry *old_re,
2587                               struct context_entry **tbl,
2588                               int bus, bool ext)
2589 {
2590         int tbl_idx, pos = 0, idx, devfn, ret = 0, did;
2591         struct context_entry *new_ce = NULL, ce;
2592         struct context_entry *old_ce = NULL;
2593         struct root_entry re;
2594         phys_addr_t old_ce_phys;
2595
2596         tbl_idx = ext ? bus * 2 : bus;
2597         memcpy(&re, old_re, sizeof(re));
2598
2599         for (devfn = 0; devfn < 256; devfn++) {
2600                 /* First calculate the correct index */
2601                 idx = (ext ? devfn * 2 : devfn) % 256;
2602
2603                 if (idx == 0) {
2604                         /* First save what we may have and clean up */
2605                         if (new_ce) {
2606                                 tbl[tbl_idx] = new_ce;
2607                                 __iommu_flush_cache(iommu, new_ce,
2608                                                     VTD_PAGE_SIZE);
2609                                 pos = 1;
2610                         }
2611
2612                         if (old_ce)
2613                                 memunmap(old_ce);
2614
2615                         ret = 0;
2616                         if (devfn < 0x80)
2617                                 old_ce_phys = root_entry_lctp(&re);
2618                         else
2619                                 old_ce_phys = root_entry_uctp(&re);
2620
2621                         if (!old_ce_phys) {
2622                                 if (ext && devfn == 0) {
2623                                         /* No LCTP, try UCTP */
2624                                         devfn = 0x7f;
2625                                         continue;
2626                                 } else {
2627                                         goto out;
2628                                 }
2629                         }
2630
2631                         ret = -ENOMEM;
2632                         old_ce = memremap(old_ce_phys, PAGE_SIZE,
2633                                         MEMREMAP_WB);
2634                         if (!old_ce)
2635                                 goto out;
2636
2637                         new_ce = alloc_pgtable_page(iommu->node, GFP_KERNEL);
2638                         if (!new_ce)
2639                                 goto out_unmap;
2640
2641                         ret = 0;
2642                 }
2643
2644                 /* Now copy the context entry */
2645                 memcpy(&ce, old_ce + idx, sizeof(ce));
2646
2647                 if (!context_present(&ce))
2648                         continue;
2649
2650                 did = context_domain_id(&ce);
2651                 if (did >= 0 && did < cap_ndoms(iommu->cap))
2652                         set_bit(did, iommu->domain_ids);
2653
2654                 set_context_copied(iommu, bus, devfn);
2655                 new_ce[idx] = ce;
2656         }
2657
2658         tbl[tbl_idx + pos] = new_ce;
2659
2660         __iommu_flush_cache(iommu, new_ce, VTD_PAGE_SIZE);
2661
2662 out_unmap:
2663         memunmap(old_ce);
2664
2665 out:
2666         return ret;
2667 }
2668
2669 static int copy_translation_tables(struct intel_iommu *iommu)
2670 {
2671         struct context_entry **ctxt_tbls;
2672         struct root_entry *old_rt;
2673         phys_addr_t old_rt_phys;
2674         int ctxt_table_entries;
2675         u64 rtaddr_reg;
2676         int bus, ret;
2677         bool new_ext, ext;
2678
2679         rtaddr_reg = dmar_readq(iommu->reg + DMAR_RTADDR_REG);
2680         ext        = !!(rtaddr_reg & DMA_RTADDR_SMT);
2681         new_ext    = !!sm_supported(iommu);
2682
2683         /*
2684          * The RTT bit can only be changed when translation is disabled,
2685          * but disabling translation means to open a window for data
2686          * corruption. So bail out and don't copy anything if we would
2687          * have to change the bit.
2688          */
2689         if (new_ext != ext)
2690                 return -EINVAL;
2691
2692         iommu->copied_tables = bitmap_zalloc(BIT_ULL(16), GFP_KERNEL);
2693         if (!iommu->copied_tables)
2694                 return -ENOMEM;
2695
2696         old_rt_phys = rtaddr_reg & VTD_PAGE_MASK;
2697         if (!old_rt_phys)
2698                 return -EINVAL;
2699
2700         old_rt = memremap(old_rt_phys, PAGE_SIZE, MEMREMAP_WB);
2701         if (!old_rt)
2702                 return -ENOMEM;
2703
2704         /* This is too big for the stack - allocate it from slab */
2705         ctxt_table_entries = ext ? 512 : 256;
2706         ret = -ENOMEM;
2707         ctxt_tbls = kcalloc(ctxt_table_entries, sizeof(void *), GFP_KERNEL);
2708         if (!ctxt_tbls)
2709                 goto out_unmap;
2710
2711         for (bus = 0; bus < 256; bus++) {
2712                 ret = copy_context_table(iommu, &old_rt[bus],
2713                                          ctxt_tbls, bus, ext);
2714                 if (ret) {
2715                         pr_err("%s: Failed to copy context table for bus %d\n",
2716                                 iommu->name, bus);
2717                         continue;
2718                 }
2719         }
2720
2721         spin_lock(&iommu->lock);
2722
2723         /* Context tables are copied, now write them to the root_entry table */
2724         for (bus = 0; bus < 256; bus++) {
2725                 int idx = ext ? bus * 2 : bus;
2726                 u64 val;
2727
2728                 if (ctxt_tbls[idx]) {
2729                         val = virt_to_phys(ctxt_tbls[idx]) | 1;
2730                         iommu->root_entry[bus].lo = val;
2731                 }
2732
2733                 if (!ext || !ctxt_tbls[idx + 1])
2734                         continue;
2735
2736                 val = virt_to_phys(ctxt_tbls[idx + 1]) | 1;
2737                 iommu->root_entry[bus].hi = val;
2738         }
2739
2740         spin_unlock(&iommu->lock);
2741
2742         kfree(ctxt_tbls);
2743
2744         __iommu_flush_cache(iommu, iommu->root_entry, PAGE_SIZE);
2745
2746         ret = 0;
2747
2748 out_unmap:
2749         memunmap(old_rt);
2750
2751         return ret;
2752 }
2753
2754 static int __init init_dmars(void)
2755 {
2756         struct dmar_drhd_unit *drhd;
2757         struct intel_iommu *iommu;
2758         int ret;
2759
2760         ret = intel_cap_audit(CAP_AUDIT_STATIC_DMAR, NULL);
2761         if (ret)
2762                 goto free_iommu;
2763
2764         for_each_iommu(iommu, drhd) {
2765                 if (drhd->ignored) {
2766                         iommu_disable_translation(iommu);
2767                         continue;
2768                 }
2769
2770                 /*
2771                  * Find the max pasid size of all IOMMU's in the system.
2772                  * We need to ensure the system pasid table is no bigger
2773                  * than the smallest supported.
2774                  */
2775                 if (pasid_supported(iommu)) {
2776                         u32 temp = 2 << ecap_pss(iommu->ecap);
2777
2778                         intel_pasid_max_id = min_t(u32, temp,
2779                                                    intel_pasid_max_id);
2780                 }
2781
2782                 intel_iommu_init_qi(iommu);
2783
2784                 ret = iommu_init_domains(iommu);
2785                 if (ret)
2786                         goto free_iommu;
2787
2788                 init_translation_status(iommu);
2789
2790                 if (translation_pre_enabled(iommu) && !is_kdump_kernel()) {
2791                         iommu_disable_translation(iommu);
2792                         clear_translation_pre_enabled(iommu);
2793                         pr_warn("Translation was enabled for %s but we are not in kdump mode\n",
2794                                 iommu->name);
2795                 }
2796
2797                 /*
2798                  * TBD:
2799                  * we could share the same root & context tables
2800                  * among all IOMMU's. Need to Split it later.
2801                  */
2802                 ret = iommu_alloc_root_entry(iommu);
2803                 if (ret)
2804                         goto free_iommu;
2805
2806                 if (translation_pre_enabled(iommu)) {
2807                         pr_info("Translation already enabled - trying to copy translation structures\n");
2808
2809                         ret = copy_translation_tables(iommu);
2810                         if (ret) {
2811                                 /*
2812                                  * We found the IOMMU with translation
2813                                  * enabled - but failed to copy over the
2814                                  * old root-entry table. Try to proceed
2815                                  * by disabling translation now and
2816                                  * allocating a clean root-entry table.
2817                                  * This might cause DMAR faults, but
2818                                  * probably the dump will still succeed.
2819                                  */
2820                                 pr_err("Failed to copy translation tables from previous kernel for %s\n",
2821                                        iommu->name);
2822                                 iommu_disable_translation(iommu);
2823                                 clear_translation_pre_enabled(iommu);
2824                         } else {
2825                                 pr_info("Copied translation tables from previous kernel for %s\n",
2826                                         iommu->name);
2827                         }
2828                 }
2829
2830                 if (!ecap_pass_through(iommu->ecap))
2831                         hw_pass_through = 0;
2832                 intel_svm_check(iommu);
2833         }
2834
2835         /*
2836          * Now that qi is enabled on all iommus, set the root entry and flush
2837          * caches. This is required on some Intel X58 chipsets, otherwise the
2838          * flush_context function will loop forever and the boot hangs.
2839          */
2840         for_each_active_iommu(iommu, drhd) {
2841                 iommu_flush_write_buffer(iommu);
2842                 iommu_set_root_entry(iommu);
2843         }
2844
2845 #ifdef CONFIG_INTEL_IOMMU_BROKEN_GFX_WA
2846         dmar_map_gfx = 0;
2847 #endif
2848
2849         if (!dmar_map_gfx)
2850                 iommu_identity_mapping |= IDENTMAP_GFX;
2851
2852         check_tylersburg_isoch();
2853
2854         ret = si_domain_init(hw_pass_through);
2855         if (ret)
2856                 goto free_iommu;
2857
2858         /*
2859          * for each drhd
2860          *   enable fault log
2861          *   global invalidate context cache
2862          *   global invalidate iotlb
2863          *   enable translation
2864          */
2865         for_each_iommu(iommu, drhd) {
2866                 if (drhd->ignored) {
2867                         /*
2868                          * we always have to disable PMRs or DMA may fail on
2869                          * this device
2870                          */
2871                         if (force_on)
2872                                 iommu_disable_protect_mem_regions(iommu);
2873                         continue;
2874                 }
2875
2876                 iommu_flush_write_buffer(iommu);
2877
2878 #ifdef CONFIG_INTEL_IOMMU_SVM
2879                 if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
2880                         /*
2881                          * Call dmar_alloc_hwirq() with dmar_global_lock held,
2882                          * could cause possible lock race condition.
2883                          */
2884                         up_write(&dmar_global_lock);
2885                         ret = intel_svm_enable_prq(iommu);
2886                         down_write(&dmar_global_lock);
2887                         if (ret)
2888                                 goto free_iommu;
2889                 }
2890 #endif
2891                 ret = dmar_set_interrupt(iommu);
2892                 if (ret)
2893                         goto free_iommu;
2894         }
2895
2896         return 0;
2897
2898 free_iommu:
2899         for_each_active_iommu(iommu, drhd) {
2900                 disable_dmar_iommu(iommu);
2901                 free_dmar_iommu(iommu);
2902         }
2903         if (si_domain) {
2904                 domain_exit(si_domain);
2905                 si_domain = NULL;
2906         }
2907
2908         return ret;
2909 }
2910
2911 static void __init init_no_remapping_devices(void)
2912 {
2913         struct dmar_drhd_unit *drhd;
2914         struct device *dev;
2915         int i;
2916
2917         for_each_drhd_unit(drhd) {
2918                 if (!drhd->include_all) {
2919                         for_each_active_dev_scope(drhd->devices,
2920                                                   drhd->devices_cnt, i, dev)
2921                                 break;
2922                         /* ignore DMAR unit if no devices exist */
2923                         if (i == drhd->devices_cnt)
2924                                 drhd->ignored = 1;
2925                 }
2926         }
2927
2928         for_each_active_drhd_unit(drhd) {
2929                 if (drhd->include_all)
2930                         continue;
2931
2932                 for_each_active_dev_scope(drhd->devices,
2933                                           drhd->devices_cnt, i, dev)
2934                         if (!dev_is_pci(dev) || !IS_GFX_DEVICE(to_pci_dev(dev)))
2935                                 break;
2936                 if (i < drhd->devices_cnt)
2937                         continue;
2938
2939                 /* This IOMMU has *only* gfx devices. Either bypass it or
2940                    set the gfx_mapped flag, as appropriate */
2941                 drhd->gfx_dedicated = 1;
2942                 if (!dmar_map_gfx)
2943                         drhd->ignored = 1;
2944         }
2945 }
2946
2947 #ifdef CONFIG_SUSPEND
2948 static int init_iommu_hw(void)
2949 {
2950         struct dmar_drhd_unit *drhd;
2951         struct intel_iommu *iommu = NULL;
2952         int ret;
2953
2954         for_each_active_iommu(iommu, drhd) {
2955                 if (iommu->qi) {
2956                         ret = dmar_reenable_qi(iommu);
2957                         if (ret)
2958                                 return ret;
2959                 }
2960         }
2961
2962         for_each_iommu(iommu, drhd) {
2963                 if (drhd->ignored) {
2964                         /*
2965                          * we always have to disable PMRs or DMA may fail on
2966                          * this device
2967                          */
2968                         if (force_on)
2969                                 iommu_disable_protect_mem_regions(iommu);
2970                         continue;
2971                 }
2972
2973                 iommu_flush_write_buffer(iommu);
2974                 iommu_set_root_entry(iommu);
2975                 iommu_enable_translation(iommu);
2976                 iommu_disable_protect_mem_regions(iommu);
2977         }
2978
2979         return 0;
2980 }
2981
2982 static void iommu_flush_all(void)
2983 {
2984         struct dmar_drhd_unit *drhd;
2985         struct intel_iommu *iommu;
2986
2987         for_each_active_iommu(iommu, drhd) {
2988                 iommu->flush.flush_context(iommu, 0, 0, 0,
2989                                            DMA_CCMD_GLOBAL_INVL);
2990                 iommu->flush.flush_iotlb(iommu, 0, 0, 0,
2991                                          DMA_TLB_GLOBAL_FLUSH);
2992         }
2993 }
2994
2995 static int iommu_suspend(void)
2996 {
2997         struct dmar_drhd_unit *drhd;
2998         struct intel_iommu *iommu = NULL;
2999         unsigned long flag;
3000
3001         iommu_flush_all();
3002
3003         for_each_active_iommu(iommu, drhd) {
3004                 iommu_disable_translation(iommu);
3005
3006                 raw_spin_lock_irqsave(&iommu->register_lock, flag);
3007
3008                 iommu->iommu_state[SR_DMAR_FECTL_REG] =
3009                         readl(iommu->reg + DMAR_FECTL_REG);
3010                 iommu->iommu_state[SR_DMAR_FEDATA_REG] =
3011                         readl(iommu->reg + DMAR_FEDATA_REG);
3012                 iommu->iommu_state[SR_DMAR_FEADDR_REG] =
3013                         readl(iommu->reg + DMAR_FEADDR_REG);
3014                 iommu->iommu_state[SR_DMAR_FEUADDR_REG] =
3015                         readl(iommu->reg + DMAR_FEUADDR_REG);
3016
3017                 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
3018         }
3019         return 0;
3020 }
3021
3022 static void iommu_resume(void)
3023 {
3024         struct dmar_drhd_unit *drhd;
3025         struct intel_iommu *iommu = NULL;
3026         unsigned long flag;
3027
3028         if (init_iommu_hw()) {
3029                 if (force_on)
3030                         panic("tboot: IOMMU setup failed, DMAR can not resume!\n");
3031                 else
3032                         WARN(1, "IOMMU setup failed, DMAR can not resume!\n");
3033                 return;
3034         }
3035
3036         for_each_active_iommu(iommu, drhd) {
3037
3038                 raw_spin_lock_irqsave(&iommu->register_lock, flag);
3039
3040                 writel(iommu->iommu_state[SR_DMAR_FECTL_REG],
3041                         iommu->reg + DMAR_FECTL_REG);
3042                 writel(iommu->iommu_state[SR_DMAR_FEDATA_REG],
3043                         iommu->reg + DMAR_FEDATA_REG);
3044                 writel(iommu->iommu_state[SR_DMAR_FEADDR_REG],
3045                         iommu->reg + DMAR_FEADDR_REG);
3046                 writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG],
3047                         iommu->reg + DMAR_FEUADDR_REG);
3048
3049                 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
3050         }
3051 }
3052
3053 static struct syscore_ops iommu_syscore_ops = {
3054         .resume         = iommu_resume,
3055         .suspend        = iommu_suspend,
3056 };
3057
3058 static void __init init_iommu_pm_ops(void)
3059 {
3060         register_syscore_ops(&iommu_syscore_ops);
3061 }
3062
3063 #else
/* Without CONFIG_SUSPEND there is nothing to register. */
static inline void init_iommu_pm_ops(void)
{
}
#endif  /* CONFIG_SUSPEND */
3066
3067 static int __init rmrr_sanity_check(struct acpi_dmar_reserved_memory *rmrr)
3068 {
3069         if (!IS_ALIGNED(rmrr->base_address, PAGE_SIZE) ||
3070             !IS_ALIGNED(rmrr->end_address + 1, PAGE_SIZE) ||
3071             rmrr->end_address <= rmrr->base_address ||
3072             arch_rmrr_sanity_check(rmrr))
3073                 return -EINVAL;
3074
3075         return 0;
3076 }
3077
3078 int __init dmar_parse_one_rmrr(struct acpi_dmar_header *header, void *arg)
3079 {
3080         struct acpi_dmar_reserved_memory *rmrr;
3081         struct dmar_rmrr_unit *rmrru;
3082
3083         rmrr = (struct acpi_dmar_reserved_memory *)header;
3084         if (rmrr_sanity_check(rmrr)) {
3085                 pr_warn(FW_BUG
3086                            "Your BIOS is broken; bad RMRR [%#018Lx-%#018Lx]\n"
3087                            "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
3088                            rmrr->base_address, rmrr->end_address,
3089                            dmi_get_system_info(DMI_BIOS_VENDOR),
3090                            dmi_get_system_info(DMI_BIOS_VERSION),
3091                            dmi_get_system_info(DMI_PRODUCT_VERSION));
3092                 add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
3093         }
3094
3095         rmrru = kzalloc(sizeof(*rmrru), GFP_KERNEL);
3096         if (!rmrru)
3097                 goto out;
3098
3099         rmrru->hdr = header;
3100
3101         rmrru->base_address = rmrr->base_address;
3102         rmrru->end_address = rmrr->end_address;
3103
3104         rmrru->devices = dmar_alloc_dev_scope((void *)(rmrr + 1),
3105                                 ((void *)rmrr) + rmrr->header.length,
3106                                 &rmrru->devices_cnt);
3107         if (rmrru->devices_cnt && rmrru->devices == NULL)
3108                 goto free_rmrru;
3109
3110         list_add(&rmrru->list, &dmar_rmrr_units);
3111
3112         return 0;
3113 free_rmrru:
3114         kfree(rmrru);
3115 out:
3116         return -ENOMEM;
3117 }
3118
3119 static struct dmar_atsr_unit *dmar_find_atsr(struct acpi_dmar_atsr *atsr)
3120 {
3121         struct dmar_atsr_unit *atsru;
3122         struct acpi_dmar_atsr *tmp;
3123
3124         list_for_each_entry_rcu(atsru, &dmar_atsr_units, list,
3125                                 dmar_rcu_check()) {
3126                 tmp = (struct acpi_dmar_atsr *)atsru->hdr;
3127                 if (atsr->segment != tmp->segment)
3128                         continue;
3129                 if (atsr->header.length != tmp->header.length)
3130                         continue;
3131                 if (memcmp(atsr, tmp, atsr->header.length) == 0)
3132                         return atsru;
3133         }
3134
3135         return NULL;
3136 }
3137
3138 int dmar_parse_one_atsr(struct acpi_dmar_header *hdr, void *arg)
3139 {
3140         struct acpi_dmar_atsr *atsr;
3141         struct dmar_atsr_unit *atsru;
3142
3143         if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled)
3144                 return 0;
3145
3146         atsr = container_of(hdr, struct acpi_dmar_atsr, header);
3147         atsru = dmar_find_atsr(atsr);
3148         if (atsru)
3149                 return 0;
3150
3151         atsru = kzalloc(sizeof(*atsru) + hdr->length, GFP_KERNEL);
3152         if (!atsru)
3153                 return -ENOMEM;
3154
3155         /*
3156          * If memory is allocated from slab by ACPI _DSM method, we need to
3157          * copy the memory content because the memory buffer will be freed
3158          * on return.
3159          */
3160         atsru->hdr = (void *)(atsru + 1);
3161         memcpy(atsru->hdr, hdr, hdr->length);
3162         atsru->include_all = atsr->flags & 0x1;
3163         if (!atsru->include_all) {
3164                 atsru->devices = dmar_alloc_dev_scope((void *)(atsr + 1),
3165                                 (void *)atsr + atsr->header.length,
3166                                 &atsru->devices_cnt);
3167                 if (atsru->devices_cnt && atsru->devices == NULL) {
3168                         kfree(atsru);
3169                         return -ENOMEM;
3170                 }
3171         }
3172
3173         list_add_rcu(&atsru->list, &dmar_atsr_units);
3174
3175         return 0;
3176 }
3177
3178 static void intel_iommu_free_atsr(struct dmar_atsr_unit *atsru)
3179 {
3180         dmar_free_dev_scope(&atsru->devices, &atsru->devices_cnt);
3181         kfree(atsru);
3182 }
3183
3184 int dmar_release_one_atsr(struct acpi_dmar_header *hdr, void *arg)
3185 {
3186         struct acpi_dmar_atsr *atsr;
3187         struct dmar_atsr_unit *atsru;
3188
3189         atsr = container_of(hdr, struct acpi_dmar_atsr, header);
3190         atsru = dmar_find_atsr(atsr);
3191         if (atsru) {
3192                 list_del_rcu(&atsru->list);
3193                 synchronize_rcu();
3194                 intel_iommu_free_atsr(atsru);
3195         }
3196
3197         return 0;
3198 }
3199
3200 int dmar_check_one_atsr(struct acpi_dmar_header *hdr, void *arg)
3201 {
3202         int i;
3203         struct device *dev;
3204         struct acpi_dmar_atsr *atsr;
3205         struct dmar_atsr_unit *atsru;
3206
3207         atsr = container_of(hdr, struct acpi_dmar_atsr, header);
3208         atsru = dmar_find_atsr(atsr);
3209         if (!atsru)
3210                 return 0;
3211
3212         if (!atsru->include_all && atsru->devices && atsru->devices_cnt) {
3213                 for_each_active_dev_scope(atsru->devices, atsru->devices_cnt,
3214                                           i, dev)
3215                         return -EBUSY;
3216         }
3217
3218         return 0;
3219 }
3220
3221 static struct dmar_satc_unit *dmar_find_satc(struct acpi_dmar_satc *satc)
3222 {
3223         struct dmar_satc_unit *satcu;
3224         struct acpi_dmar_satc *tmp;
3225
3226         list_for_each_entry_rcu(satcu, &dmar_satc_units, list,
3227                                 dmar_rcu_check()) {
3228                 tmp = (struct acpi_dmar_satc *)satcu->hdr;
3229                 if (satc->segment != tmp->segment)
3230                         continue;
3231                 if (satc->header.length != tmp->header.length)
3232                         continue;
3233                 if (memcmp(satc, tmp, satc->header.length) == 0)
3234                         return satcu;
3235         }
3236
3237         return NULL;
3238 }
3239
3240 int dmar_parse_one_satc(struct acpi_dmar_header *hdr, void *arg)
3241 {
3242         struct acpi_dmar_satc *satc;
3243         struct dmar_satc_unit *satcu;
3244
3245         if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled)
3246                 return 0;
3247
3248         satc = container_of(hdr, struct acpi_dmar_satc, header);
3249         satcu = dmar_find_satc(satc);
3250         if (satcu)
3251                 return 0;
3252
3253         satcu = kzalloc(sizeof(*satcu) + hdr->length, GFP_KERNEL);
3254         if (!satcu)
3255                 return -ENOMEM;
3256
3257         satcu->hdr = (void *)(satcu + 1);
3258         memcpy(satcu->hdr, hdr, hdr->length);
3259         satcu->atc_required = satc->flags & 0x1;
3260         satcu->devices = dmar_alloc_dev_scope((void *)(satc + 1),
3261                                               (void *)satc + satc->header.length,
3262                                               &satcu->devices_cnt);
3263         if (satcu->devices_cnt && !satcu->devices) {
3264                 kfree(satcu);
3265                 return -ENOMEM;
3266         }
3267         list_add_rcu(&satcu->list, &dmar_satc_units);
3268
3269         return 0;
3270 }
3271
3272 static int intel_iommu_add(struct dmar_drhd_unit *dmaru)
3273 {
3274         int sp, ret;
3275         struct intel_iommu *iommu = dmaru->iommu;
3276
3277         ret = intel_cap_audit(CAP_AUDIT_HOTPLUG_DMAR, iommu);
3278         if (ret)
3279                 goto out;
3280
3281         if (hw_pass_through && !ecap_pass_through(iommu->ecap)) {
3282                 pr_warn("%s: Doesn't support hardware pass through.\n",
3283                         iommu->name);
3284                 return -ENXIO;
3285         }
3286
3287         sp = domain_update_iommu_superpage(NULL, iommu) - 1;
3288         if (sp >= 0 && !(cap_super_page_val(iommu->cap) & (1 << sp))) {
3289                 pr_warn("%s: Doesn't support large page.\n",
3290                         iommu->name);
3291                 return -ENXIO;
3292         }
3293
3294         /*
3295          * Disable translation if already enabled prior to OS handover.
3296          */
3297         if (iommu->gcmd & DMA_GCMD_TE)
3298                 iommu_disable_translation(iommu);
3299
3300         ret = iommu_init_domains(iommu);
3301         if (ret == 0)
3302                 ret = iommu_alloc_root_entry(iommu);
3303         if (ret)
3304                 goto out;
3305
3306         intel_svm_check(iommu);
3307
3308         if (dmaru->ignored) {
3309                 /*
3310                  * we always have to disable PMRs or DMA may fail on this device
3311                  */
3312                 if (force_on)
3313                         iommu_disable_protect_mem_regions(iommu);
3314                 return 0;
3315         }
3316
3317         intel_iommu_init_qi(iommu);
3318         iommu_flush_write_buffer(iommu);
3319
3320 #ifdef CONFIG_INTEL_IOMMU_SVM
3321         if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
3322                 ret = intel_svm_enable_prq(iommu);
3323                 if (ret)
3324                         goto disable_iommu;
3325         }
3326 #endif
3327         ret = dmar_set_interrupt(iommu);
3328         if (ret)
3329                 goto disable_iommu;
3330
3331         iommu_set_root_entry(iommu);
3332         iommu_enable_translation(iommu);
3333
3334         iommu_disable_protect_mem_regions(iommu);
3335         return 0;
3336
3337 disable_iommu:
3338         disable_dmar_iommu(iommu);
3339 out:
3340         free_dmar_iommu(iommu);
3341         return ret;
3342 }
3343
3344 int dmar_iommu_hotplug(struct dmar_drhd_unit *dmaru, bool insert)
3345 {
3346         int ret = 0;
3347         struct intel_iommu *iommu = dmaru->iommu;
3348
3349         if (!intel_iommu_enabled)
3350                 return 0;
3351         if (iommu == NULL)
3352                 return -EINVAL;
3353
3354         if (insert) {
3355                 ret = intel_iommu_add(dmaru);
3356         } else {
3357                 disable_dmar_iommu(iommu);
3358                 free_dmar_iommu(iommu);
3359         }
3360
3361         return ret;
3362 }
3363
3364 static void intel_iommu_free_dmars(void)
3365 {
3366         struct dmar_rmrr_unit *rmrru, *rmrr_n;
3367         struct dmar_atsr_unit *atsru, *atsr_n;
3368         struct dmar_satc_unit *satcu, *satc_n;
3369
3370         list_for_each_entry_safe(rmrru, rmrr_n, &dmar_rmrr_units, list) {
3371                 list_del(&rmrru->list);
3372                 dmar_free_dev_scope(&rmrru->devices, &rmrru->devices_cnt);
3373                 kfree(rmrru);
3374         }
3375
3376         list_for_each_entry_safe(atsru, atsr_n, &dmar_atsr_units, list) {
3377                 list_del(&atsru->list);
3378                 intel_iommu_free_atsr(atsru);
3379         }
3380         list_for_each_entry_safe(satcu, satc_n, &dmar_satc_units, list) {
3381                 list_del(&satcu->list);
3382                 dmar_free_dev_scope(&satcu->devices, &satcu->devices_cnt);
3383                 kfree(satcu);
3384         }
3385 }
3386
3387 static struct dmar_satc_unit *dmar_find_matched_satc_unit(struct pci_dev *dev)
3388 {
3389         struct dmar_satc_unit *satcu;
3390         struct acpi_dmar_satc *satc;
3391         struct device *tmp;
3392         int i;
3393
3394         dev = pci_physfn(dev);
3395         rcu_read_lock();
3396
3397         list_for_each_entry_rcu(satcu, &dmar_satc_units, list) {
3398                 satc = container_of(satcu->hdr, struct acpi_dmar_satc, header);
3399                 if (satc->segment != pci_domain_nr(dev->bus))
3400                         continue;
3401                 for_each_dev_scope(satcu->devices, satcu->devices_cnt, i, tmp)
3402                         if (to_pci_dev(tmp) == dev)
3403                                 goto out;
3404         }
3405         satcu = NULL;
3406 out:
3407         rcu_read_unlock();
3408         return satcu;
3409 }
3410
3411 static int dmar_ats_supported(struct pci_dev *dev, struct intel_iommu *iommu)
3412 {
3413         int i, ret = 1;
3414         struct pci_bus *bus;
3415         struct pci_dev *bridge = NULL;
3416         struct device *tmp;
3417         struct acpi_dmar_atsr *atsr;
3418         struct dmar_atsr_unit *atsru;
3419         struct dmar_satc_unit *satcu;
3420
3421         dev = pci_physfn(dev);
3422         satcu = dmar_find_matched_satc_unit(dev);
3423         if (satcu)
3424                 /*
3425                  * This device supports ATS as it is in SATC table.
3426                  * When IOMMU is in legacy mode, enabling ATS is done
3427                  * automatically by HW for the device that requires
3428                  * ATS, hence OS should not enable this device ATS
3429                  * to avoid duplicated TLB invalidation.
3430                  */
3431                 return !(satcu->atc_required && !sm_supported(iommu));
3432
3433         for (bus = dev->bus; bus; bus = bus->parent) {
3434                 bridge = bus->self;
3435                 /* If it's an integrated device, allow ATS */
3436                 if (!bridge)
3437                         return 1;
3438                 /* Connected via non-PCIe: no ATS */
3439                 if (!pci_is_pcie(bridge) ||
3440                     pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE)
3441                         return 0;
3442                 /* If we found the root port, look it up in the ATSR */
3443                 if (pci_pcie_type(bridge) == PCI_EXP_TYPE_ROOT_PORT)
3444                         break;
3445         }
3446
3447         rcu_read_lock();
3448         list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) {
3449                 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
3450                 if (atsr->segment != pci_domain_nr(dev->bus))
3451                         continue;
3452
3453                 for_each_dev_scope(atsru->devices, atsru->devices_cnt, i, tmp)
3454                         if (tmp == &bridge->dev)
3455                                 goto out;
3456
3457                 if (atsru->include_all)
3458                         goto out;
3459         }
3460         ret = 0;
3461 out:
3462         rcu_read_unlock();
3463
3464         return ret;
3465 }
3466
3467 int dmar_iommu_notify_scope_dev(struct dmar_pci_notify_info *info)
3468 {
3469         int ret;
3470         struct dmar_rmrr_unit *rmrru;
3471         struct dmar_atsr_unit *atsru;
3472         struct dmar_satc_unit *satcu;
3473         struct acpi_dmar_atsr *atsr;
3474         struct acpi_dmar_reserved_memory *rmrr;
3475         struct acpi_dmar_satc *satc;
3476
3477         if (!intel_iommu_enabled && system_state >= SYSTEM_RUNNING)
3478                 return 0;
3479
3480         list_for_each_entry(rmrru, &dmar_rmrr_units, list) {
3481                 rmrr = container_of(rmrru->hdr,
3482                                     struct acpi_dmar_reserved_memory, header);
3483                 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
3484                         ret = dmar_insert_dev_scope(info, (void *)(rmrr + 1),
3485                                 ((void *)rmrr) + rmrr->header.length,
3486                                 rmrr->segment, rmrru->devices,
3487                                 rmrru->devices_cnt);
3488                         if (ret < 0)
3489                                 return ret;
3490                 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
3491                         dmar_remove_dev_scope(info, rmrr->segment,
3492                                 rmrru->devices, rmrru->devices_cnt);
3493                 }
3494         }
3495
3496         list_for_each_entry(atsru, &dmar_atsr_units, list) {
3497                 if (atsru->include_all)
3498                         continue;
3499
3500                 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
3501                 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
3502                         ret = dmar_insert_dev_scope(info, (void *)(atsr + 1),
3503                                         (void *)atsr + atsr->header.length,
3504                                         atsr->segment, atsru->devices,
3505                                         atsru->devices_cnt);
3506                         if (ret > 0)
3507                                 break;
3508                         else if (ret < 0)
3509                                 return ret;
3510                 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
3511                         if (dmar_remove_dev_scope(info, atsr->segment,
3512                                         atsru->devices, atsru->devices_cnt))
3513                                 break;
3514                 }
3515         }
3516         list_for_each_entry(satcu, &dmar_satc_units, list) {
3517                 satc = container_of(satcu->hdr, struct acpi_dmar_satc, header);
3518                 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
3519                         ret = dmar_insert_dev_scope(info, (void *)(satc + 1),
3520                                         (void *)satc + satc->header.length,
3521                                         satc->segment, satcu->devices,
3522                                         satcu->devices_cnt);
3523                         if (ret > 0)
3524                                 break;
3525                         else if (ret < 0)
3526                                 return ret;
3527                 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
3528                         if (dmar_remove_dev_scope(info, satc->segment,
3529                                         satcu->devices, satcu->devices_cnt))
3530                                 break;
3531                 }
3532         }
3533
3534         return 0;
3535 }
3536
3537 static int intel_iommu_memory_notifier(struct notifier_block *nb,
3538                                        unsigned long val, void *v)
3539 {
3540         struct memory_notify *mhp = v;
3541         unsigned long start_vpfn = mm_to_dma_pfn_start(mhp->start_pfn);
3542         unsigned long last_vpfn = mm_to_dma_pfn_end(mhp->start_pfn +
3543                         mhp->nr_pages - 1);
3544
3545         switch (val) {
3546         case MEM_GOING_ONLINE:
3547                 if (iommu_domain_identity_map(si_domain,
3548                                               start_vpfn, last_vpfn)) {
3549                         pr_warn("Failed to build identity map for [%lx-%lx]\n",
3550                                 start_vpfn, last_vpfn);
3551                         return NOTIFY_BAD;
3552                 }
3553                 break;
3554
3555         case MEM_OFFLINE:
3556         case MEM_CANCEL_ONLINE:
3557                 {
3558                         struct dmar_drhd_unit *drhd;
3559                         struct intel_iommu *iommu;
3560                         LIST_HEAD(freelist);
3561
3562                         domain_unmap(si_domain, start_vpfn, last_vpfn, &freelist);
3563
3564                         rcu_read_lock();
3565                         for_each_active_iommu(iommu, drhd)
3566                                 iommu_flush_iotlb_psi(iommu, si_domain,
3567                                         start_vpfn, mhp->nr_pages,
3568                                         list_empty(&freelist), 0);
3569                         rcu_read_unlock();
3570                         put_pages_list(&freelist);
3571                 }
3572                 break;
3573         }
3574
3575         return NOTIFY_OK;
3576 }
3577
3578 static struct notifier_block intel_iommu_memory_nb = {
3579         .notifier_call = intel_iommu_memory_notifier,
3580         .priority = 0
3581 };
3582
3583 static void intel_disable_iommus(void)
3584 {
3585         struct intel_iommu *iommu = NULL;
3586         struct dmar_drhd_unit *drhd;
3587
3588         for_each_iommu(iommu, drhd)
3589                 iommu_disable_translation(iommu);
3590 }
3591
3592 void intel_iommu_shutdown(void)
3593 {
3594         struct dmar_drhd_unit *drhd;
3595         struct intel_iommu *iommu = NULL;
3596
3597         if (no_iommu || dmar_disabled)
3598                 return;
3599
3600         down_write(&dmar_global_lock);
3601
3602         /* Disable PMRs explicitly here. */
3603         for_each_iommu(iommu, drhd)
3604                 iommu_disable_protect_mem_regions(iommu);
3605
3606         /* Make sure the IOMMUs are switched off */
3607         intel_disable_iommus();
3608
3609         up_write(&dmar_global_lock);
3610 }
3611
3612 static inline struct intel_iommu *dev_to_intel_iommu(struct device *dev)
3613 {
3614         struct iommu_device *iommu_dev = dev_to_iommu_device(dev);
3615
3616         return container_of(iommu_dev, struct intel_iommu, iommu);
3617 }
3618
3619 static ssize_t version_show(struct device *dev,
3620                             struct device_attribute *attr, char *buf)
3621 {
3622         struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3623         u32 ver = readl(iommu->reg + DMAR_VER_REG);
3624         return sysfs_emit(buf, "%d:%d\n",
3625                           DMAR_VER_MAJOR(ver), DMAR_VER_MINOR(ver));
3626 }
3627 static DEVICE_ATTR_RO(version);
3628
3629 static ssize_t address_show(struct device *dev,
3630                             struct device_attribute *attr, char *buf)
3631 {
3632         struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3633         return sysfs_emit(buf, "%llx\n", iommu->reg_phys);
3634 }
3635 static DEVICE_ATTR_RO(address);
3636
3637 static ssize_t cap_show(struct device *dev,
3638                         struct device_attribute *attr, char *buf)
3639 {
3640         struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3641         return sysfs_emit(buf, "%llx\n", iommu->cap);
3642 }
3643 static DEVICE_ATTR_RO(cap);
3644
3645 static ssize_t ecap_show(struct device *dev,
3646                          struct device_attribute *attr, char *buf)
3647 {
3648         struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3649         return sysfs_emit(buf, "%llx\n", iommu->ecap);
3650 }
3651 static DEVICE_ATTR_RO(ecap);
3652
3653 static ssize_t domains_supported_show(struct device *dev,
3654                                       struct device_attribute *attr, char *buf)
3655 {
3656         struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3657         return sysfs_emit(buf, "%ld\n", cap_ndoms(iommu->cap));
3658 }
3659 static DEVICE_ATTR_RO(domains_supported);
3660
3661 static ssize_t domains_used_show(struct device *dev,
3662                                  struct device_attribute *attr, char *buf)
3663 {
3664         struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3665         return sysfs_emit(buf, "%d\n",
3666                           bitmap_weight(iommu->domain_ids,
3667                                         cap_ndoms(iommu->cap)));
3668 }
3669 static DEVICE_ATTR_RO(domains_used);
3670
3671 static struct attribute *intel_iommu_attrs[] = {
3672         &dev_attr_version.attr,
3673         &dev_attr_address.attr,
3674         &dev_attr_cap.attr,
3675         &dev_attr_ecap.attr,
3676         &dev_attr_domains_supported.attr,
3677         &dev_attr_domains_used.attr,
3678         NULL,
3679 };
3680
3681 static struct attribute_group intel_iommu_group = {
3682         .name = "intel-iommu",
3683         .attrs = intel_iommu_attrs,
3684 };
3685
3686 const struct attribute_group *intel_iommu_groups[] = {
3687         &intel_iommu_group,
3688         NULL,
3689 };
3690
3691 static inline bool has_external_pci(void)
3692 {
3693         struct pci_dev *pdev = NULL;
3694
3695         for_each_pci_dev(pdev)
3696                 if (pdev->external_facing) {
3697                         pci_dev_put(pdev);
3698                         return true;
3699                 }
3700
3701         return false;
3702 }
3703
3704 static int __init platform_optin_force_iommu(void)
3705 {
3706         if (!dmar_platform_optin() || no_platform_optin || !has_external_pci())
3707                 return 0;
3708
3709         if (no_iommu || dmar_disabled)
3710                 pr_info("Intel-IOMMU force enabled due to platform opt in\n");
3711
3712         /*
3713          * If Intel-IOMMU is disabled by default, we will apply identity
3714          * map for all devices except those marked as being untrusted.
3715          */
3716         if (dmar_disabled)
3717                 iommu_set_default_passthrough(false);
3718
3719         dmar_disabled = 0;
3720         no_iommu = 0;
3721
3722         return 1;
3723 }
3724
3725 static int __init probe_acpi_namespace_devices(void)
3726 {
3727         struct dmar_drhd_unit *drhd;
3728         /* To avoid a -Wunused-but-set-variable warning. */
3729         struct intel_iommu *iommu __maybe_unused;
3730         struct device *dev;
3731         int i, ret = 0;
3732
3733         for_each_active_iommu(iommu, drhd) {
3734                 for_each_active_dev_scope(drhd->devices,
3735                                           drhd->devices_cnt, i, dev) {
3736                         struct acpi_device_physical_node *pn;
3737                         struct acpi_device *adev;
3738
3739                         if (dev->bus != &acpi_bus_type)
3740                                 continue;
3741
3742                         adev = to_acpi_device(dev);
3743                         mutex_lock(&adev->physical_node_lock);
3744                         list_for_each_entry(pn,
3745                                             &adev->physical_node_list, node) {
3746                                 ret = iommu_probe_device(pn->dev);
3747                                 if (ret)
3748                                         break;
3749                         }
3750                         mutex_unlock(&adev->physical_node_lock);
3751
3752                         if (ret)
3753                                 return ret;
3754                 }
3755         }
3756
3757         return 0;
3758 }
3759
3760 static __init int tboot_force_iommu(void)
3761 {
3762         if (!tboot_enabled())
3763                 return 0;
3764
3765         if (no_iommu || dmar_disabled)
3766                 pr_warn("Forcing Intel-IOMMU to enabled\n");
3767
3768         dmar_disabled = 0;
3769         no_iommu = 0;
3770
3771         return 1;
3772 }
3773
3774 int __init intel_iommu_init(void)
3775 {
3776         int ret = -ENODEV;
3777         struct dmar_drhd_unit *drhd;
3778         struct intel_iommu *iommu;
3779
3780         /*
3781          * Intel IOMMU is required for a TXT/tboot launch or platform
3782          * opt in, so enforce that.
3783          */
3784         force_on = (!intel_iommu_tboot_noforce && tboot_force_iommu()) ||
3785                     platform_optin_force_iommu();
3786
3787         down_write(&dmar_global_lock);
3788         if (dmar_table_init()) {
3789                 if (force_on)
3790                         panic("tboot: Failed to initialize DMAR table\n");
3791                 goto out_free_dmar;
3792         }
3793
3794         if (dmar_dev_scope_init() < 0) {
3795                 if (force_on)
3796                         panic("tboot: Failed to initialize DMAR device scope\n");
3797                 goto out_free_dmar;
3798         }
3799
3800         up_write(&dmar_global_lock);
3801
3802         /*
3803          * The bus notifier takes the dmar_global_lock, so lockdep will
3804          * complain later when we register it under the lock.
3805          */
3806         dmar_register_bus_notifier();
3807
3808         down_write(&dmar_global_lock);
3809
3810         if (!no_iommu)
3811                 intel_iommu_debugfs_init();
3812
3813         if (no_iommu || dmar_disabled) {
3814                 /*
3815                  * We exit the function here to ensure IOMMU's remapping and
3816                  * mempool aren't setup, which means that the IOMMU's PMRs
3817                  * won't be disabled via the call to init_dmars(). So disable
3818                  * it explicitly here. The PMRs were setup by tboot prior to
3819                  * calling SENTER, but the kernel is expected to reset/tear
3820                  * down the PMRs.
3821                  */
3822                 if (intel_iommu_tboot_noforce) {
3823                         for_each_iommu(iommu, drhd)
3824                                 iommu_disable_protect_mem_regions(iommu);
3825                 }
3826
3827                 /*
3828                  * Make sure the IOMMUs are switched off, even when we
3829                  * boot into a kexec kernel and the previous kernel left
3830                  * them enabled
3831                  */
3832                 intel_disable_iommus();
3833                 goto out_free_dmar;
3834         }
3835
3836         if (list_empty(&dmar_rmrr_units))
3837                 pr_info("No RMRR found\n");
3838
3839         if (list_empty(&dmar_atsr_units))
3840                 pr_info("No ATSR found\n");
3841
3842         if (list_empty(&dmar_satc_units))
3843                 pr_info("No SATC found\n");
3844
3845         init_no_remapping_devices();
3846
3847         ret = init_dmars();
3848         if (ret) {
3849                 if (force_on)
3850                         panic("tboot: Failed to initialize DMARs\n");
3851                 pr_err("Initialization failed\n");
3852                 goto out_free_dmar;
3853         }
3854         up_write(&dmar_global_lock);
3855
3856         init_iommu_pm_ops();
3857
3858         down_read(&dmar_global_lock);
3859         for_each_active_iommu(iommu, drhd) {
3860                 /*
3861                  * The flush queue implementation does not perform
3862                  * page-selective invalidations that are required for efficient
3863                  * TLB flushes in virtual environments.  The benefit of batching
3864                  * is likely to be much lower than the overhead of synchronizing
3865                  * the virtual and physical IOMMU page-tables.
3866                  */
3867                 if (cap_caching_mode(iommu->cap) &&
3868                     !first_level_by_default(IOMMU_DOMAIN_DMA)) {
3869                         pr_info_once("IOMMU batching disallowed due to virtualization\n");
3870                         iommu_set_dma_strict();
3871                 }
3872                 iommu_device_sysfs_add(&iommu->iommu, NULL,
3873                                        intel_iommu_groups,
3874                                        "%s", iommu->name);
3875                 iommu_device_register(&iommu->iommu, &intel_iommu_ops, NULL);
3876
3877                 iommu_pmu_register(iommu);
3878         }
3879         up_read(&dmar_global_lock);
3880
3881         if (si_domain && !hw_pass_through)
3882                 register_memory_notifier(&intel_iommu_memory_nb);
3883
3884         down_read(&dmar_global_lock);
3885         if (probe_acpi_namespace_devices())
3886                 pr_warn("ACPI name space devices didn't probe correctly\n");
3887
3888         /* Finally, we enable the DMA remapping hardware. */
3889         for_each_iommu(iommu, drhd) {
3890                 if (!drhd->ignored && !translation_pre_enabled(iommu))
3891                         iommu_enable_translation(iommu);
3892
3893                 iommu_disable_protect_mem_regions(iommu);
3894         }
3895         up_read(&dmar_global_lock);
3896
3897         pr_info("Intel(R) Virtualization Technology for Directed I/O\n");
3898
3899         intel_iommu_enabled = 1;
3900
3901         return 0;
3902
3903 out_free_dmar:
3904         intel_iommu_free_dmars();
3905         up_write(&dmar_global_lock);
3906         return ret;
3907 }
3908
3909 static int domain_context_clear_one_cb(struct pci_dev *pdev, u16 alias, void *opaque)
3910 {
3911         struct device_domain_info *info = opaque;
3912
3913         domain_context_clear_one(info, PCI_BUS_NUM(alias), alias & 0xff);
3914         return 0;
3915 }
3916
3917 /*
3918  * NB - intel-iommu lacks any sort of reference counting for the users of
3919  * dependent devices.  If multiple endpoints have intersecting dependent
3920  * devices, unbinding the driver from any one of them will possibly leave
3921  * the others unable to operate.
3922  */
3923 static void domain_context_clear(struct device_domain_info *info)
3924 {
3925         if (!info->iommu || !info->dev || !dev_is_pci(info->dev))
3926                 return;
3927
3928         pci_for_each_dma_alias(to_pci_dev(info->dev),
3929                                &domain_context_clear_one_cb, info);
3930 }
3931
3932 static void dmar_remove_one_dev_info(struct device *dev)
3933 {
3934         struct device_domain_info *info = dev_iommu_priv_get(dev);
3935         struct dmar_domain *domain = info->domain;
3936         struct intel_iommu *iommu = info->iommu;
3937         unsigned long flags;
3938
3939         if (!dev_is_real_dma_subdevice(info->dev)) {
3940                 if (dev_is_pci(info->dev) && sm_supported(iommu))
3941                         intel_pasid_tear_down_entry(iommu, info->dev,
3942                                         IOMMU_NO_PASID, false);
3943
3944                 iommu_disable_pci_caps(info);
3945                 domain_context_clear(info);
3946         }
3947
3948         spin_lock_irqsave(&domain->lock, flags);
3949         list_del(&info->link);
3950         spin_unlock_irqrestore(&domain->lock, flags);
3951
3952         domain_detach_iommu(domain, iommu);
3953         info->domain = NULL;
3954 }
3955
3956 /*
3957  * Clear the page table pointer in context or pasid table entries so that
3958  * all DMA requests without PASID from the device are blocked. If the page
3959  * table has been set, clean up the data structures.
3960  */
3961 static void device_block_translation(struct device *dev)
3962 {
3963         struct device_domain_info *info = dev_iommu_priv_get(dev);
3964         struct intel_iommu *iommu = info->iommu;
3965         unsigned long flags;
3966
3967         iommu_disable_pci_caps(info);
3968         if (!dev_is_real_dma_subdevice(dev)) {
3969                 if (sm_supported(iommu))
3970                         intel_pasid_tear_down_entry(iommu, dev,
3971                                                     IOMMU_NO_PASID, false);
3972                 else
3973                         domain_context_clear(info);
3974         }
3975
3976         if (!info->domain)
3977                 return;
3978
3979         spin_lock_irqsave(&info->domain->lock, flags);
3980         list_del(&info->link);
3981         spin_unlock_irqrestore(&info->domain->lock, flags);
3982
3983         domain_detach_iommu(info->domain, iommu);
3984         info->domain = NULL;
3985 }
3986
3987 static int md_domain_init(struct dmar_domain *domain, int guest_width)
3988 {
3989         int adjust_width;
3990
3991         /* calculate AGAW */
3992         domain->gaw = guest_width;
3993         adjust_width = guestwidth_to_adjustwidth(guest_width);
3994         domain->agaw = width_to_agaw(adjust_width);
3995
3996         domain->iommu_coherency = false;
3997         domain->iommu_superpage = 0;
3998         domain->max_addr = 0;
3999
4000         /* always allocate the top pgd */
4001         domain->pgd = alloc_pgtable_page(domain->nid, GFP_ATOMIC);
4002         if (!domain->pgd)
4003                 return -ENOMEM;
4004         domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
4005         return 0;
4006 }
4007
/*
 * attach_dev handler of the blocking domain: blocks translation for @dev.
 * @domain is not used. Always returns 0.
 */
static int blocking_domain_attach_dev(struct iommu_domain *domain,
				      struct device *dev)
{
	device_block_translation(dev);

	return 0;
}
4014
4015 static struct iommu_domain blocking_domain = {
4016         .type = IOMMU_DOMAIN_BLOCKED,
4017         .ops = &(const struct iommu_domain_ops) {
4018                 .attach_dev     = blocking_domain_attach_dev,
4019         }
4020 };
4021
4022 static struct iommu_domain *intel_iommu_domain_alloc(unsigned type)
4023 {
4024         struct dmar_domain *dmar_domain;
4025         struct iommu_domain *domain;
4026
4027         switch (type) {
4028         case IOMMU_DOMAIN_DMA:
4029         case IOMMU_DOMAIN_UNMANAGED:
4030                 dmar_domain = alloc_domain(type);
4031                 if (!dmar_domain) {
4032                         pr_err("Can't allocate dmar_domain\n");
4033                         return NULL;
4034                 }
4035                 if (md_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
4036                         pr_err("Domain initialization failed\n");
4037                         domain_exit(dmar_domain);
4038                         return NULL;
4039                 }
4040
4041                 domain = &dmar_domain->domain;
4042                 domain->geometry.aperture_start = 0;
4043                 domain->geometry.aperture_end   =
4044                                 __DOMAIN_MAX_ADDR(dmar_domain->gaw);
4045                 domain->geometry.force_aperture = true;
4046
4047                 return domain;
4048         case IOMMU_DOMAIN_IDENTITY:
4049                 return &si_domain->domain;
4050         case IOMMU_DOMAIN_SVA:
4051                 return intel_svm_domain_alloc();
4052         default:
4053                 return NULL;
4054         }
4055
4056         return NULL;
4057 }
4058
4059 static void intel_iommu_domain_free(struct iommu_domain *domain)
4060 {
4061         if (domain != &si_domain->domain)
4062                 domain_exit(to_dmar_domain(domain));
4063 }
4064
4065 static int prepare_domain_attach_device(struct iommu_domain *domain,
4066                                         struct device *dev)
4067 {
4068         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4069         struct intel_iommu *iommu;
4070         int addr_width;
4071
4072         iommu = device_to_iommu(dev, NULL, NULL);
4073         if (!iommu)
4074                 return -ENODEV;
4075
4076         if (dmar_domain->force_snooping && !ecap_sc_support(iommu->ecap))
4077                 return -EINVAL;
4078
4079         /* check if this iommu agaw is sufficient for max mapped address */
4080         addr_width = agaw_to_width(iommu->agaw);
4081         if (addr_width > cap_mgaw(iommu->cap))
4082                 addr_width = cap_mgaw(iommu->cap);
4083
4084         if (dmar_domain->max_addr > (1LL << addr_width))
4085                 return -EINVAL;
4086         dmar_domain->gaw = addr_width;
4087
4088         /*
4089          * Knock out extra levels of page tables if necessary
4090          */
4091         while (iommu->agaw < dmar_domain->agaw) {
4092                 struct dma_pte *pte;
4093
4094                 pte = dmar_domain->pgd;
4095                 if (dma_pte_present(pte)) {
4096                         dmar_domain->pgd = phys_to_virt(dma_pte_addr(pte));
4097                         free_pgtable_page(pte);
4098                 }
4099                 dmar_domain->agaw--;
4100         }
4101
4102         return 0;
4103 }
4104
4105 static int intel_iommu_attach_device(struct iommu_domain *domain,
4106                                      struct device *dev)
4107 {
4108         struct device_domain_info *info = dev_iommu_priv_get(dev);
4109         int ret;
4110
4111         if (info->domain)
4112                 device_block_translation(dev);
4113
4114         ret = prepare_domain_attach_device(domain, dev);
4115         if (ret)
4116                 return ret;
4117
4118         return dmar_domain_attach_device(to_dmar_domain(domain), dev);
4119 }
4120
4121 static int intel_iommu_map(struct iommu_domain *domain,
4122                            unsigned long iova, phys_addr_t hpa,
4123                            size_t size, int iommu_prot, gfp_t gfp)
4124 {
4125         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4126         u64 max_addr;
4127         int prot = 0;
4128
4129         if (iommu_prot & IOMMU_READ)
4130                 prot |= DMA_PTE_READ;
4131         if (iommu_prot & IOMMU_WRITE)
4132                 prot |= DMA_PTE_WRITE;
4133         if (dmar_domain->set_pte_snp)
4134                 prot |= DMA_PTE_SNP;
4135
4136         max_addr = iova + size;
4137         if (dmar_domain->max_addr < max_addr) {
4138                 u64 end;
4139
4140                 /* check if minimum agaw is sufficient for mapped address */
4141                 end = __DOMAIN_MAX_ADDR(dmar_domain->gaw) + 1;
4142                 if (end < max_addr) {
4143                         pr_err("%s: iommu width (%d) is not "
4144                                "sufficient for the mapped address (%llx)\n",
4145                                __func__, dmar_domain->gaw, max_addr);
4146                         return -EFAULT;
4147                 }
4148                 dmar_domain->max_addr = max_addr;
4149         }
4150         /* Round up size to next multiple of PAGE_SIZE, if it and
4151            the low bits of hpa would take us onto the next page */
4152         size = aligned_nrpages(hpa, size);
4153         return __domain_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT,
4154                                 hpa >> VTD_PAGE_SHIFT, size, prot, gfp);
4155 }
4156
4157 static int intel_iommu_map_pages(struct iommu_domain *domain,
4158                                  unsigned long iova, phys_addr_t paddr,
4159                                  size_t pgsize, size_t pgcount,
4160                                  int prot, gfp_t gfp, size_t *mapped)
4161 {
4162         unsigned long pgshift = __ffs(pgsize);
4163         size_t size = pgcount << pgshift;
4164         int ret;
4165
4166         if (pgsize != SZ_4K && pgsize != SZ_2M && pgsize != SZ_1G)
4167                 return -EINVAL;
4168
4169         if (!IS_ALIGNED(iova | paddr, pgsize))
4170                 return -EINVAL;
4171
4172         ret = intel_iommu_map(domain, iova, paddr, size, prot, gfp);
4173         if (!ret && mapped)
4174                 *mapped = size;
4175
4176         return ret;
4177 }
4178
4179 static size_t intel_iommu_unmap(struct iommu_domain *domain,
4180                                 unsigned long iova, size_t size,
4181                                 struct iommu_iotlb_gather *gather)
4182 {
4183         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4184         unsigned long start_pfn, last_pfn;
4185         int level = 0;
4186
4187         /* Cope with horrid API which requires us to unmap more than the
4188            size argument if it happens to be a large-page mapping. */
4189         if (unlikely(!pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT,
4190                                      &level, GFP_ATOMIC)))
4191                 return 0;
4192
4193         if (size < VTD_PAGE_SIZE << level_to_offset_bits(level))
4194                 size = VTD_PAGE_SIZE << level_to_offset_bits(level);
4195
4196         start_pfn = iova >> VTD_PAGE_SHIFT;
4197         last_pfn = (iova + size - 1) >> VTD_PAGE_SHIFT;
4198
4199         domain_unmap(dmar_domain, start_pfn, last_pfn, &gather->freelist);
4200
4201         if (dmar_domain->max_addr == iova + size)
4202                 dmar_domain->max_addr = iova;
4203
4204         /*
4205          * We do not use page-selective IOTLB invalidation in flush queue,
4206          * so there is no need to track page and sync iotlb.
4207          */
4208         if (!iommu_iotlb_gather_queued(gather))
4209                 iommu_iotlb_gather_add_page(domain, gather, iova, size);
4210
4211         return size;
4212 }
4213
4214 static size_t intel_iommu_unmap_pages(struct iommu_domain *domain,
4215                                       unsigned long iova,
4216                                       size_t pgsize, size_t pgcount,
4217                                       struct iommu_iotlb_gather *gather)
4218 {
4219         unsigned long pgshift = __ffs(pgsize);
4220         size_t size = pgcount << pgshift;
4221
4222         return intel_iommu_unmap(domain, iova, size, gather);
4223 }
4224
4225 static void intel_iommu_tlb_sync(struct iommu_domain *domain,
4226                                  struct iommu_iotlb_gather *gather)
4227 {
4228         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4229         unsigned long iova_pfn = IOVA_PFN(gather->start);
4230         size_t size = gather->end - gather->start;
4231         struct iommu_domain_info *info;
4232         unsigned long start_pfn;
4233         unsigned long nrpages;
4234         unsigned long i;
4235
4236         nrpages = aligned_nrpages(gather->start, size);
4237         start_pfn = mm_to_dma_pfn_start(iova_pfn);
4238
4239         xa_for_each(&dmar_domain->iommu_array, i, info)
4240                 iommu_flush_iotlb_psi(info->iommu, dmar_domain,
4241                                       start_pfn, nrpages,
4242                                       list_empty(&gather->freelist), 0);
4243
4244         put_pages_list(&gather->freelist);
4245 }
4246
4247 static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
4248                                             dma_addr_t iova)
4249 {
4250         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4251         struct dma_pte *pte;
4252         int level = 0;
4253         u64 phys = 0;
4254
4255         pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level,
4256                              GFP_ATOMIC);
4257         if (pte && dma_pte_present(pte))
4258                 phys = dma_pte_addr(pte) +
4259                         (iova & (BIT_MASK(level_to_offset_bits(level) +
4260                                                 VTD_PAGE_SHIFT) - 1));
4261
4262         return phys;
4263 }
4264
4265 static bool domain_support_force_snooping(struct dmar_domain *domain)
4266 {
4267         struct device_domain_info *info;
4268         bool support = true;
4269
4270         assert_spin_locked(&domain->lock);
4271         list_for_each_entry(info, &domain->devices, link) {
4272                 if (!ecap_sc_support(info->iommu->ecap)) {
4273                         support = false;
4274                         break;
4275                 }
4276         }
4277
4278         return support;
4279 }
4280
4281 static void domain_set_force_snooping(struct dmar_domain *domain)
4282 {
4283         struct device_domain_info *info;
4284
4285         assert_spin_locked(&domain->lock);
4286         /*
4287          * Second level page table supports per-PTE snoop control. The
4288          * iommu_map() interface will handle this by setting SNP bit.
4289          */
4290         if (!domain->use_first_level) {
4291                 domain->set_pte_snp = true;
4292                 return;
4293         }
4294
4295         list_for_each_entry(info, &domain->devices, link)
4296                 intel_pasid_setup_page_snoop_control(info->iommu, info->dev,
4297                                                      IOMMU_NO_PASID);
4298 }
4299
4300 static bool intel_iommu_enforce_cache_coherency(struct iommu_domain *domain)
4301 {
4302         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4303         unsigned long flags;
4304
4305         if (dmar_domain->force_snooping)
4306                 return true;
4307
4308         spin_lock_irqsave(&dmar_domain->lock, flags);
4309         if (!domain_support_force_snooping(dmar_domain)) {
4310                 spin_unlock_irqrestore(&dmar_domain->lock, flags);
4311                 return false;
4312         }
4313
4314         domain_set_force_snooping(dmar_domain);
4315         dmar_domain->force_snooping = true;
4316         spin_unlock_irqrestore(&dmar_domain->lock, flags);
4317
4318         return true;
4319 }
4320
4321 static bool intel_iommu_capable(struct device *dev, enum iommu_cap cap)
4322 {
4323         struct device_domain_info *info = dev_iommu_priv_get(dev);
4324
4325         switch (cap) {
4326         case IOMMU_CAP_CACHE_COHERENCY:
4327         case IOMMU_CAP_DEFERRED_FLUSH:
4328                 return true;
4329         case IOMMU_CAP_PRE_BOOT_PROTECTION:
4330                 return dmar_platform_optin();
4331         case IOMMU_CAP_ENFORCE_CACHE_COHERENCY:
4332                 return ecap_sc_support(info->iommu->ecap);
4333         default:
4334                 return false;
4335         }
4336 }
4337
4338 static struct iommu_device *intel_iommu_probe_device(struct device *dev)
4339 {
4340         struct pci_dev *pdev = dev_is_pci(dev) ? to_pci_dev(dev) : NULL;
4341         struct device_domain_info *info;
4342         struct intel_iommu *iommu;
4343         u8 bus, devfn;
4344         int ret;
4345
4346         iommu = device_to_iommu(dev, &bus, &devfn);
4347         if (!iommu || !iommu->iommu.ops)
4348                 return ERR_PTR(-ENODEV);
4349
4350         info = kzalloc(sizeof(*info), GFP_KERNEL);
4351         if (!info)
4352                 return ERR_PTR(-ENOMEM);
4353
4354         if (dev_is_real_dma_subdevice(dev)) {
4355                 info->bus = pdev->bus->number;
4356                 info->devfn = pdev->devfn;
4357                 info->segment = pci_domain_nr(pdev->bus);
4358         } else {
4359                 info->bus = bus;
4360                 info->devfn = devfn;
4361                 info->segment = iommu->segment;
4362         }
4363
4364         info->dev = dev;
4365         info->iommu = iommu;
4366         if (dev_is_pci(dev)) {
4367                 if (ecap_dev_iotlb_support(iommu->ecap) &&
4368                     pci_ats_supported(pdev) &&
4369                     dmar_ats_supported(pdev, iommu)) {
4370                         info->ats_supported = 1;
4371                         info->dtlb_extra_inval = dev_needs_extra_dtlb_flush(pdev);
4372
4373                         /*
4374                          * For IOMMU that supports device IOTLB throttling
4375                          * (DIT), we assign PFSID to the invalidation desc
4376                          * of a VF such that IOMMU HW can gauge queue depth
4377                          * at PF level. If DIT is not set, PFSID will be
4378                          * treated as reserved, which should be set to 0.
4379                          */
4380                         if (ecap_dit(iommu->ecap))
4381                                 info->pfsid = pci_dev_id(pci_physfn(pdev));
4382                         info->ats_qdep = pci_ats_queue_depth(pdev);
4383                 }
4384                 if (sm_supported(iommu)) {
4385                         if (pasid_supported(iommu)) {
4386                                 int features = pci_pasid_features(pdev);
4387
4388                                 if (features >= 0)
4389                                         info->pasid_supported = features | 1;
4390                         }
4391
4392                         if (info->ats_supported && ecap_prs(iommu->ecap) &&
4393                             pci_pri_supported(pdev))
4394                                 info->pri_supported = 1;
4395                 }
4396         }
4397
4398         dev_iommu_priv_set(dev, info);
4399
4400         if (sm_supported(iommu) && !dev_is_real_dma_subdevice(dev)) {
4401                 ret = intel_pasid_alloc_table(dev);
4402                 if (ret) {
4403                         dev_err(dev, "PASID table allocation failed\n");
4404                         dev_iommu_priv_set(dev, NULL);
4405                         kfree(info);
4406                         return ERR_PTR(ret);
4407                 }
4408         }
4409
4410         intel_iommu_debugfs_create_dev(info);
4411
4412         return &iommu->iommu;
4413 }
4414
4415 static void intel_iommu_release_device(struct device *dev)
4416 {
4417         struct device_domain_info *info = dev_iommu_priv_get(dev);
4418
4419         dmar_remove_one_dev_info(dev);
4420         intel_pasid_free_table(dev);
4421         intel_iommu_debugfs_remove_dev(info);
4422         dev_iommu_priv_set(dev, NULL);
4423         kfree(info);
4424         set_dma_ops(dev, NULL);
4425 }
4426
4427 static void intel_iommu_probe_finalize(struct device *dev)
4428 {
4429         set_dma_ops(dev, NULL);
4430         iommu_setup_dma_ops(dev, 0, U64_MAX);
4431 }
4432
4433 static void intel_iommu_get_resv_regions(struct device *device,
4434                                          struct list_head *head)
4435 {
4436         int prot = DMA_PTE_READ | DMA_PTE_WRITE;
4437         struct iommu_resv_region *reg;
4438         struct dmar_rmrr_unit *rmrr;
4439         struct device *i_dev;
4440         int i;
4441
4442         rcu_read_lock();
4443         for_each_rmrr_units(rmrr) {
4444                 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
4445                                           i, i_dev) {
4446                         struct iommu_resv_region *resv;
4447                         enum iommu_resv_type type;
4448                         size_t length;
4449
4450                         if (i_dev != device &&
4451                             !is_downstream_to_pci_bridge(device, i_dev))
4452                                 continue;
4453
4454                         length = rmrr->end_address - rmrr->base_address + 1;
4455
4456                         type = device_rmrr_is_relaxable(device) ?
4457                                 IOMMU_RESV_DIRECT_RELAXABLE : IOMMU_RESV_DIRECT;
4458
4459                         resv = iommu_alloc_resv_region(rmrr->base_address,
4460                                                        length, prot, type,
4461                                                        GFP_ATOMIC);
4462                         if (!resv)
4463                                 break;
4464
4465                         list_add_tail(&resv->list, head);
4466                 }
4467         }
4468         rcu_read_unlock();
4469
4470 #ifdef CONFIG_INTEL_IOMMU_FLOPPY_WA
4471         if (dev_is_pci(device)) {
4472                 struct pci_dev *pdev = to_pci_dev(device);
4473
4474                 if ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA) {
4475                         reg = iommu_alloc_resv_region(0, 1UL << 24, prot,
4476                                         IOMMU_RESV_DIRECT_RELAXABLE,
4477                                         GFP_KERNEL);
4478                         if (reg)
4479                                 list_add_tail(&reg->list, head);
4480                 }
4481         }
4482 #endif /* CONFIG_INTEL_IOMMU_FLOPPY_WA */
4483
4484         reg = iommu_alloc_resv_region(IOAPIC_RANGE_START,
4485                                       IOAPIC_RANGE_END - IOAPIC_RANGE_START + 1,
4486                                       0, IOMMU_RESV_MSI, GFP_KERNEL);
4487         if (!reg)
4488                 return;
4489         list_add_tail(&reg->list, head);
4490 }
4491
/*
 * Pick the IOMMU group for @dev: PCI devices use the PCI grouping helper,
 * everything else gets a generic per-device group.
 */
static struct iommu_group *intel_iommu_device_group(struct device *dev)
{
        return dev_is_pci(dev) ? pci_device_group(dev)
                               : generic_device_group(dev);
}
4498
4499 static int intel_iommu_enable_sva(struct device *dev)
4500 {
4501         struct device_domain_info *info = dev_iommu_priv_get(dev);
4502         struct intel_iommu *iommu;
4503
4504         if (!info || dmar_disabled)
4505                 return -EINVAL;
4506
4507         iommu = info->iommu;
4508         if (!iommu)
4509                 return -EINVAL;
4510
4511         if (!(iommu->flags & VTD_FLAG_SVM_CAPABLE))
4512                 return -ENODEV;
4513
4514         if (!info->pasid_enabled || !info->ats_enabled)
4515                 return -EINVAL;
4516
4517         /*
4518          * Devices having device-specific I/O fault handling should not
4519          * support PCI/PRI. The IOMMU side has no means to check the
4520          * capability of device-specific IOPF.  Therefore, IOMMU can only
4521          * default that if the device driver enables SVA on a non-PRI
4522          * device, it will handle IOPF in its own way.
4523          */
4524         if (!info->pri_supported)
4525                 return 0;
4526
4527         /* Devices supporting PRI should have it enabled. */
4528         if (!info->pri_enabled)
4529                 return -EINVAL;
4530
4531         return 0;
4532 }
4533
4534 static int intel_iommu_enable_iopf(struct device *dev)
4535 {
4536         struct pci_dev *pdev = dev_is_pci(dev) ? to_pci_dev(dev) : NULL;
4537         struct device_domain_info *info = dev_iommu_priv_get(dev);
4538         struct intel_iommu *iommu;
4539         int ret;
4540
4541         if (!pdev || !info || !info->ats_enabled || !info->pri_supported)
4542                 return -ENODEV;
4543
4544         if (info->pri_enabled)
4545                 return -EBUSY;
4546
4547         iommu = info->iommu;
4548         if (!iommu)
4549                 return -EINVAL;
4550
4551         /* PASID is required in PRG Response Message. */
4552         if (info->pasid_enabled && !pci_prg_resp_pasid_required(pdev))
4553                 return -EINVAL;
4554
4555         ret = pci_reset_pri(pdev);
4556         if (ret)
4557                 return ret;
4558
4559         ret = iopf_queue_add_device(iommu->iopf_queue, dev);
4560         if (ret)
4561                 return ret;
4562
4563         ret = iommu_register_device_fault_handler(dev, iommu_queue_iopf, dev);
4564         if (ret)
4565                 goto iopf_remove_device;
4566
4567         ret = pci_enable_pri(pdev, PRQ_DEPTH);
4568         if (ret)
4569                 goto iopf_unregister_handler;
4570         info->pri_enabled = 1;
4571
4572         return 0;
4573
4574 iopf_unregister_handler:
4575         iommu_unregister_device_fault_handler(dev);
4576 iopf_remove_device:
4577         iopf_queue_remove_device(iommu->iopf_queue, dev);
4578
4579         return ret;
4580 }
4581
4582 static int intel_iommu_disable_iopf(struct device *dev)
4583 {
4584         struct device_domain_info *info = dev_iommu_priv_get(dev);
4585         struct intel_iommu *iommu = info->iommu;
4586
4587         if (!info->pri_enabled)
4588                 return -EINVAL;
4589
4590         /*
4591          * PCIe spec states that by clearing PRI enable bit, the Page
4592          * Request Interface will not issue new page requests, but has
4593          * outstanding page requests that have been transmitted or are
4594          * queued for transmission. This is supposed to be called after
4595          * the device driver has stopped DMA, all PASIDs have been
4596          * unbound and the outstanding PRQs have been drained.
4597          */
4598         pci_disable_pri(to_pci_dev(dev));
4599         info->pri_enabled = 0;
4600
4601         /*
4602          * With PRI disabled and outstanding PRQs drained, unregistering
4603          * fault handler and removing device from iopf queue should never
4604          * fail.
4605          */
4606         WARN_ON(iommu_unregister_device_fault_handler(dev));
4607         WARN_ON(iopf_queue_remove_device(iommu->iopf_queue, dev));
4608
4609         return 0;
4610 }
4611
4612 static int
4613 intel_iommu_dev_enable_feat(struct device *dev, enum iommu_dev_features feat)
4614 {
4615         switch (feat) {
4616         case IOMMU_DEV_FEAT_IOPF:
4617                 return intel_iommu_enable_iopf(dev);
4618
4619         case IOMMU_DEV_FEAT_SVA:
4620                 return intel_iommu_enable_sva(dev);
4621
4622         default:
4623                 return -ENODEV;
4624         }
4625 }
4626
4627 static int
4628 intel_iommu_dev_disable_feat(struct device *dev, enum iommu_dev_features feat)
4629 {
4630         switch (feat) {
4631         case IOMMU_DEV_FEAT_IOPF:
4632                 return intel_iommu_disable_iopf(dev);
4633
4634         case IOMMU_DEV_FEAT_SVA:
4635                 return 0;
4636
4637         default:
4638                 return -ENODEV;
4639         }
4640 }
4641
4642 static bool intel_iommu_is_attach_deferred(struct device *dev)
4643 {
4644         struct device_domain_info *info = dev_iommu_priv_get(dev);
4645
4646         return translation_pre_enabled(info->iommu) && !info->domain;
4647 }
4648
4649 /*
4650  * Check that the device does not live on an external facing PCI port that is
4651  * marked as untrusted. Such devices should not be able to apply quirks and
4652  * thus not be able to bypass the IOMMU restrictions.
4653  */
4654 static bool risky_device(struct pci_dev *pdev)
4655 {
4656         if (pdev->untrusted) {
4657                 pci_info(pdev,
4658                          "Skipping IOMMU quirk for dev [%04X:%04X] on untrusted PCI link\n",
4659                          pdev->vendor, pdev->device);
4660                 pci_info(pdev, "Please check with your BIOS/Platform vendor about this\n");
4661                 return true;
4662         }
4663         return false;
4664 }
4665
4666 static int intel_iommu_iotlb_sync_map(struct iommu_domain *domain,
4667                                       unsigned long iova, size_t size)
4668 {
4669         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4670         unsigned long pages = aligned_nrpages(iova, size);
4671         unsigned long pfn = iova >> VTD_PAGE_SHIFT;
4672         struct iommu_domain_info *info;
4673         unsigned long i;
4674
4675         xa_for_each(&dmar_domain->iommu_array, i, info)
4676                 __mapping_notify_one(info->iommu, dmar_domain, pfn, pages);
4677         return 0;
4678 }
4679
4680 static void intel_iommu_remove_dev_pasid(struct device *dev, ioasid_t pasid)
4681 {
4682         struct intel_iommu *iommu = device_to_iommu(dev, NULL, NULL);
4683         struct dev_pasid_info *curr, *dev_pasid = NULL;
4684         struct dmar_domain *dmar_domain;
4685         struct iommu_domain *domain;
4686         unsigned long flags;
4687
4688         domain = iommu_get_domain_for_dev_pasid(dev, pasid, 0);
4689         if (WARN_ON_ONCE(!domain))
4690                 goto out_tear_down;
4691
4692         /*
4693          * The SVA implementation needs to handle its own stuffs like the mm
4694          * notification. Before consolidating that code into iommu core, let
4695          * the intel sva code handle it.
4696          */
4697         if (domain->type == IOMMU_DOMAIN_SVA) {
4698                 intel_svm_remove_dev_pasid(dev, pasid);
4699                 goto out_tear_down;
4700         }
4701
4702         dmar_domain = to_dmar_domain(domain);
4703         spin_lock_irqsave(&dmar_domain->lock, flags);
4704         list_for_each_entry(curr, &dmar_domain->dev_pasids, link_domain) {
4705                 if (curr->dev == dev && curr->pasid == pasid) {
4706                         list_del(&curr->link_domain);
4707                         dev_pasid = curr;
4708                         break;
4709                 }
4710         }
4711         WARN_ON_ONCE(!dev_pasid);
4712         spin_unlock_irqrestore(&dmar_domain->lock, flags);
4713
4714         domain_detach_iommu(dmar_domain, iommu);
4715         intel_iommu_debugfs_remove_dev_pasid(dev_pasid);
4716         kfree(dev_pasid);
4717 out_tear_down:
4718         intel_pasid_tear_down_entry(iommu, dev, pasid, false);
4719         intel_drain_pasid_prq(dev, pasid);
4720 }
4721
4722 static int intel_iommu_set_dev_pasid(struct iommu_domain *domain,
4723                                      struct device *dev, ioasid_t pasid)
4724 {
4725         struct device_domain_info *info = dev_iommu_priv_get(dev);
4726         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4727         struct intel_iommu *iommu = info->iommu;
4728         struct dev_pasid_info *dev_pasid;
4729         unsigned long flags;
4730         int ret;
4731
4732         if (!pasid_supported(iommu) || dev_is_real_dma_subdevice(dev))
4733                 return -EOPNOTSUPP;
4734
4735         if (context_copied(iommu, info->bus, info->devfn))
4736                 return -EBUSY;
4737
4738         ret = prepare_domain_attach_device(domain, dev);
4739         if (ret)
4740                 return ret;
4741
4742         dev_pasid = kzalloc(sizeof(*dev_pasid), GFP_KERNEL);
4743         if (!dev_pasid)
4744                 return -ENOMEM;
4745
4746         ret = domain_attach_iommu(dmar_domain, iommu);
4747         if (ret)
4748                 goto out_free;
4749
4750         if (domain_type_is_si(dmar_domain))
4751                 ret = intel_pasid_setup_pass_through(iommu, dmar_domain,
4752                                                      dev, pasid);
4753         else if (dmar_domain->use_first_level)
4754                 ret = domain_setup_first_level(iommu, dmar_domain,
4755                                                dev, pasid);
4756         else
4757                 ret = intel_pasid_setup_second_level(iommu, dmar_domain,
4758                                                      dev, pasid);
4759         if (ret)
4760                 goto out_detach_iommu;
4761
4762         dev_pasid->dev = dev;
4763         dev_pasid->pasid = pasid;
4764         spin_lock_irqsave(&dmar_domain->lock, flags);
4765         list_add(&dev_pasid->link_domain, &dmar_domain->dev_pasids);
4766         spin_unlock_irqrestore(&dmar_domain->lock, flags);
4767
4768         if (domain->type & __IOMMU_DOMAIN_PAGING)
4769                 intel_iommu_debugfs_create_dev_pasid(dev_pasid);
4770
4771         return 0;
4772 out_detach_iommu:
4773         domain_detach_iommu(dmar_domain, iommu);
4774 out_free:
4775         kfree(dev_pasid);
4776         return ret;
4777 }
4778
4779 static void *intel_iommu_hw_info(struct device *dev, u32 *length, u32 *type)
4780 {
4781         struct device_domain_info *info = dev_iommu_priv_get(dev);
4782         struct intel_iommu *iommu = info->iommu;
4783         struct iommu_hw_info_vtd *vtd;
4784
4785         vtd = kzalloc(sizeof(*vtd), GFP_KERNEL);
4786         if (!vtd)
4787                 return ERR_PTR(-ENOMEM);
4788
4789         vtd->cap_reg = iommu->cap;
4790         vtd->ecap_reg = iommu->ecap;
4791         *length = sizeof(*vtd);
4792         *type = IOMMU_HW_INFO_TYPE_INTEL_VTD;
4793         return vtd;
4794 }
4795
4796 const struct iommu_ops intel_iommu_ops = {
4797         .blocked_domain         = &blocking_domain,
4798         .capable                = intel_iommu_capable,
4799         .hw_info                = intel_iommu_hw_info,
4800         .domain_alloc           = intel_iommu_domain_alloc,
4801         .probe_device           = intel_iommu_probe_device,
4802         .probe_finalize         = intel_iommu_probe_finalize,
4803         .release_device         = intel_iommu_release_device,
4804         .get_resv_regions       = intel_iommu_get_resv_regions,
4805         .device_group           = intel_iommu_device_group,
4806         .dev_enable_feat        = intel_iommu_dev_enable_feat,
4807         .dev_disable_feat       = intel_iommu_dev_disable_feat,
4808         .is_attach_deferred     = intel_iommu_is_attach_deferred,
4809         .def_domain_type        = device_def_domain_type,
4810         .remove_dev_pasid       = intel_iommu_remove_dev_pasid,
4811         .pgsize_bitmap          = SZ_4K,
4812 #ifdef CONFIG_INTEL_IOMMU_SVM
4813         .page_response          = intel_svm_page_response,
4814 #endif
4815         .default_domain_ops = &(const struct iommu_domain_ops) {
4816                 .attach_dev             = intel_iommu_attach_device,
4817                 .set_dev_pasid          = intel_iommu_set_dev_pasid,
4818                 .map_pages              = intel_iommu_map_pages,
4819                 .unmap_pages            = intel_iommu_unmap_pages,
4820                 .iotlb_sync_map         = intel_iommu_iotlb_sync_map,
4821                 .flush_iotlb_all        = intel_flush_iotlb_all,
4822                 .iotlb_sync             = intel_iommu_tlb_sync,
4823                 .iova_to_phys           = intel_iommu_iova_to_phys,
4824                 .free                   = intel_iommu_domain_free,
4825                 .enforce_cache_coherency = intel_iommu_enforce_cache_coherency,
4826         }
4827 };
4828
4829 static void quirk_iommu_igfx(struct pci_dev *dev)
4830 {
4831         if (risky_device(dev))
4832                 return;
4833
4834         pci_info(dev, "Disabling IOMMU for graphics on this chipset\n");
4835         dmar_map_gfx = 0;
4836 }
4837
4838 /* G4x/GM45 integrated gfx dmar support is totally busted. */
4839 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_igfx);
4840 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_igfx);
4841 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_igfx);
4842 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_igfx);
4843 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_igfx);
4844 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_igfx);
4845 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_igfx);
4846
4847 /* Broadwell igfx malfunctions with dmar */
4848 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1606, quirk_iommu_igfx);
4849 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160B, quirk_iommu_igfx);
4850 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160E, quirk_iommu_igfx);
4851 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1602, quirk_iommu_igfx);
4852 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160A, quirk_iommu_igfx);
4853 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160D, quirk_iommu_igfx);
4854 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1616, quirk_iommu_igfx);
4855 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161B, quirk_iommu_igfx);
4856 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161E, quirk_iommu_igfx);
4857 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1612, quirk_iommu_igfx);
4858 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161A, quirk_iommu_igfx);
4859 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161D, quirk_iommu_igfx);
4860 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1626, quirk_iommu_igfx);
4861 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162B, quirk_iommu_igfx);
4862 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162E, quirk_iommu_igfx);
4863 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1622, quirk_iommu_igfx);
4864 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162A, quirk_iommu_igfx);
4865 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162D, quirk_iommu_igfx);
4866 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1636, quirk_iommu_igfx);
4867 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163B, quirk_iommu_igfx);
4868 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163E, quirk_iommu_igfx);
4869 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1632, quirk_iommu_igfx);
4870 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163A, quirk_iommu_igfx);
4871 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163D, quirk_iommu_igfx);
4872
4873 static void quirk_iommu_rwbf(struct pci_dev *dev)
4874 {
4875         if (risky_device(dev))
4876                 return;
4877
4878         /*
4879          * Mobile 4 Series Chipset neglects to set RWBF capability,
4880          * but needs it. Same seems to hold for the desktop versions.
4881          */
4882         pci_info(dev, "Forcing write-buffer flush capability\n");
4883         rwbf_quirk = 1;
4884 }
4885
4886 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf);
4887 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_rwbf);
4888 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_rwbf);
4889 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_rwbf);
4890 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_rwbf);
4891 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_rwbf);
4892 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_rwbf);
4893
/*
 * GGC: 16-bit PCI config register at offset 0x52. The memory-size field
 * occupies bits 11:8; encodings with bit 3 of the field set
 * (GGC_MEMORY_VT_ENABLED) are the "_VT" variants.
 */
#define GGC 0x52
#define GGC_MEMORY_SIZE_SHIFT   8
#define GGC_MEMORY_SIZE_MASK    (0xf << GGC_MEMORY_SIZE_SHIFT)
#define GGC_MEMORY_SIZE_NONE    (0x0 << GGC_MEMORY_SIZE_SHIFT)
#define GGC_MEMORY_SIZE_1M      (0x1 << GGC_MEMORY_SIZE_SHIFT)
#define GGC_MEMORY_SIZE_2M      (0x3 << GGC_MEMORY_SIZE_SHIFT)
#define GGC_MEMORY_VT_ENABLED   (0x8 << GGC_MEMORY_SIZE_SHIFT)
#define GGC_MEMORY_SIZE_2M_VT   (0x9 << GGC_MEMORY_SIZE_SHIFT)
#define GGC_MEMORY_SIZE_3M_VT   (0xa << GGC_MEMORY_SIZE_SHIFT)
#define GGC_MEMORY_SIZE_4M_VT   (0xb << GGC_MEMORY_SIZE_SHIFT)
4903
4904 static void quirk_calpella_no_shadow_gtt(struct pci_dev *dev)
4905 {
4906         unsigned short ggc;
4907
4908         if (risky_device(dev))
4909                 return;
4910
4911         if (pci_read_config_word(dev, GGC, &ggc))
4912                 return;
4913
4914         if (!(ggc & GGC_MEMORY_VT_ENABLED)) {
4915                 pci_info(dev, "BIOS has allocated no shadow GTT; disabling IOMMU for graphics\n");
4916                 dmar_map_gfx = 0;
4917         } else if (dmar_map_gfx) {
4918                 /* we have to ensure the gfx device is idle before we flush */
4919                 pci_info(dev, "Disabling batched IOTLB flush on Ironlake\n");
4920                 iommu_set_dma_strict();
4921         }
4922 }
4923 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0040, quirk_calpella_no_shadow_gtt);
4924 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0044, quirk_calpella_no_shadow_gtt);
4925 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0062, quirk_calpella_no_shadow_gtt);
4926 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x006a, quirk_calpella_no_shadow_gtt);
4927
4928 static void quirk_igfx_skip_te_disable(struct pci_dev *dev)
4929 {
4930         unsigned short ver;
4931
4932         if (!IS_GFX_DEVICE(dev))
4933                 return;
4934
4935         ver = (dev->device >> 8) & 0xff;
4936         if (ver != 0x45 && ver != 0x46 && ver != 0x4c &&
4937             ver != 0x4e && ver != 0x8a && ver != 0x98 &&
4938             ver != 0x9a && ver != 0xa7)
4939                 return;
4940
4941         if (risky_device(dev))
4942                 return;
4943
4944         pci_info(dev, "Skip IOMMU disabling for graphics\n");
4945         iommu_skip_te_disable = 1;
4946 }
4947 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_ANY_ID, quirk_igfx_skip_te_disable);
4948
4949 /* On Tylersburg chipsets, some BIOSes have been known to enable the
4950    ISOCH DMAR unit for the Azalia sound device, but not give it any
4951    TLB entries, which causes it to deadlock. Check for that.  We do
4952    this in a function called from init_dmars(), instead of in a PCI
4953    quirk, because we don't want to print the obnoxious "BIOS broken"
4954    message if VT-d is actually disabled.
4955 */
4956 static void __init check_tylersburg_isoch(void)
4957 {
4958         struct pci_dev *pdev;
4959         uint32_t vtisochctrl;
4960
4961         /* If there's no Azalia in the system anyway, forget it. */
4962         pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3a3e, NULL);
4963         if (!pdev)
4964                 return;
4965
4966         if (risky_device(pdev)) {
4967                 pci_dev_put(pdev);
4968                 return;
4969         }
4970
4971         pci_dev_put(pdev);
4972
4973         /* System Management Registers. Might be hidden, in which case
4974            we can't do the sanity check. But that's OK, because the
4975            known-broken BIOSes _don't_ actually hide it, so far. */
4976         pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x342e, NULL);
4977         if (!pdev)
4978                 return;
4979
4980         if (risky_device(pdev)) {
4981                 pci_dev_put(pdev);
4982                 return;
4983         }
4984
4985         if (pci_read_config_dword(pdev, 0x188, &vtisochctrl)) {
4986                 pci_dev_put(pdev);
4987                 return;
4988         }
4989
4990         pci_dev_put(pdev);
4991
4992         /* If Azalia DMA is routed to the non-isoch DMAR unit, fine. */
4993         if (vtisochctrl & 1)
4994                 return;
4995
4996         /* Drop all bits other than the number of TLB entries */
4997         vtisochctrl &= 0x1c;
4998
4999         /* If we have the recommended number of TLB entries (16), fine. */
5000         if (vtisochctrl == 0x10)
5001                 return;
5002
5003         /* Zero TLB entries? You get to ride the short bus to school. */
5004         if (!vtisochctrl) {
5005                 WARN(1, "Your BIOS is broken; DMA routed to ISOCH DMAR unit but no TLB space.\n"
5006                      "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
5007                      dmi_get_system_info(DMI_BIOS_VENDOR),
5008                      dmi_get_system_info(DMI_BIOS_VERSION),
5009                      dmi_get_system_info(DMI_PRODUCT_VERSION));
5010                 iommu_identity_mapping |= IDENTMAP_AZALIA;
5011                 return;
5012         }
5013
5014         pr_warn("Recommended TLB entries for ISOCH unit is 16; your BIOS set %d\n",
5015                vtisochctrl);
5016 }
5017
5018 /*
5019  * Here we deal with a device TLB defect where device may inadvertently issue ATS
5020  * invalidation completion before posted writes initiated with translated address
5021  * that utilized translations matching the invalidation address range, violating
5022  * the invalidation completion ordering.
5023  * Therefore, any use cases that cannot guarantee DMA is stopped before unmap is
5024  * vulnerable to this defect. In other words, any dTLB invalidation initiated not
5025  * under the control of the trusted/privileged host device driver must use this
5026  * quirk.
5027  * Device TLBs are invalidated under the following six conditions:
5028  * 1. Device driver does DMA API unmap IOVA
5029  * 2. Device driver unbind a PASID from a process, sva_unbind_device()
5030  * 3. PASID is torn down, after PASID cache is flushed. e.g. process
5031  *    exit_mmap() due to crash
5032  * 4. Under SVA usage, called by mmu_notifier.invalidate_range() where
5033  *    VM has to free pages that were unmapped
5034  * 5. Userspace driver unmaps a DMA buffer
5035  * 6. Cache invalidation in vSVA usage (upcoming)
5036  *
5037  * For #1 and #2, device drivers are responsible for stopping DMA traffic
5038  * before unmap/unbind. For #3, iommu driver gets mmu_notifier to
5039  * invalidate TLB the same way as normal user unmap which will use this quirk.
5040  * The dTLB invalidation after PASID cache flush does not need this quirk.
5041  *
5042  * As a reminder, #6 will *NEED* this quirk as we enable nested translation.
5043  */
5044 void quirk_extra_dev_tlb_flush(struct device_domain_info *info,
5045                                unsigned long address, unsigned long mask,
5046                                u32 pasid, u16 qdep)
5047 {
5048         u16 sid;
5049
5050         if (likely(!info->dtlb_extra_inval))
5051                 return;
5052
5053         sid = PCI_DEVID(info->bus, info->devfn);
5054         if (pasid == IOMMU_NO_PASID) {
5055                 qi_flush_dev_iotlb(info->iommu, sid, info->pfsid,
5056                                    qdep, address, mask);
5057         } else {
5058                 qi_flush_dev_iotlb_pasid(info->iommu, sid, info->pfsid,
5059                                          pasid, qdep, address, mask);
5060         }
5061 }
5062
/* Extract bits 7:1 of @res (bit 0 is dropped), i.e. a 7-bit status code. */
#define ecmd_get_status_code(res)	(((res) >> 1) & 0x7f)
5064
5065 /*
5066  * Function to submit a command to the enhanced command interface. The
5067  * valid enhanced command descriptions are defined in Table 47 of the
5068  * VT-d spec. The VT-d hardware implementation may support some but not
5069  * all commands, which can be determined by checking the Enhanced
5070  * Command Capability Register.
5071  *
5072  * Return values:
5073  *  - 0: Command successful without any error;
5074  *  - Negative: software error value;
5075  *  - Nonzero positive: failure status code defined in Table 48.
5076  */
5077 int ecmd_submit_sync(struct intel_iommu *iommu, u8 ecmd, u64 oa, u64 ob)
5078 {
5079         unsigned long flags;
5080         u64 res;
5081         int ret;
5082
5083         if (!cap_ecmds(iommu->cap))
5084                 return -ENODEV;
5085
5086         raw_spin_lock_irqsave(&iommu->register_lock, flags);
5087
5088         res = dmar_readq(iommu->reg + DMAR_ECRSP_REG);
5089         if (res & DMA_ECMD_ECRSP_IP) {
5090                 ret = -EBUSY;
5091                 goto err;
5092         }
5093
5094         /*
5095          * Unconditionally write the operand B, because
5096          * - There is no side effect if an ecmd doesn't require an
5097          *   operand B, but we set the register to some value.
5098          * - It's not invoked in any critical path. The extra MMIO
5099          *   write doesn't bring any performance concerns.
5100          */
5101         dmar_writeq(iommu->reg + DMAR_ECEO_REG, ob);
5102         dmar_writeq(iommu->reg + DMAR_ECMD_REG, ecmd | (oa << DMA_ECMD_OA_SHIFT));
5103
5104         IOMMU_WAIT_OP(iommu, DMAR_ECRSP_REG, dmar_readq,
5105                       !(res & DMA_ECMD_ECRSP_IP), res);
5106
5107         if (res & DMA_ECMD_ECRSP_IP) {
5108                 ret = -ETIMEDOUT;
5109                 goto err;
5110         }
5111
5112         ret = ecmd_get_status_code(res);
5113 err:
5114         raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
5115
5116         return ret;
5117 }