drivers/iommu/intel/iommu.c
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * Copyright © 2006-2014 Intel Corporation.
4  *
5  * Authors: David Woodhouse <dwmw2@infradead.org>,
6  *          Ashok Raj <ashok.raj@intel.com>,
7  *          Shaohua Li <shaohua.li@intel.com>,
8  *          Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>,
9  *          Fenghua Yu <fenghua.yu@intel.com>
10  *          Joerg Roedel <jroedel@suse.de>
11  */
12
13 #define pr_fmt(fmt)     "DMAR: " fmt
14 #define dev_fmt(fmt)    pr_fmt(fmt)
15
16 #include <linux/crash_dump.h>
17 #include <linux/dma-direct.h>
18 #include <linux/dmi.h>
19 #include <linux/memory.h>
20 #include <linux/pci.h>
21 #include <linux/pci-ats.h>
22 #include <linux/spinlock.h>
23 #include <linux/syscore_ops.h>
24 #include <linux/tboot.h>
25 #include <uapi/linux/iommufd.h>
26
27 #include "iommu.h"
28 #include "../dma-iommu.h"
29 #include "../irq_remapping.h"
30 #include "../iommu-pages.h"
31 #include "pasid.h"
32 #include "cap_audit.h"
33 #include "perfmon.h"
34
35 #define ROOT_SIZE               VTD_PAGE_SIZE
36 #define CONTEXT_SIZE            VTD_PAGE_SIZE
37
38 #define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
39 #define IS_USB_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_SERIAL_USB)
40 #define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
41 #define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e)
42
43 #define IOAPIC_RANGE_START      (0xfee00000)
44 #define IOAPIC_RANGE_END        (0xfeefffff)
45 #define IOVA_START_ADDR         (0x1000)
46
47 #define DEFAULT_DOMAIN_ADDRESS_WIDTH 57
48
49 #define __DOMAIN_MAX_PFN(gaw)  ((((uint64_t)1) << ((gaw) - VTD_PAGE_SHIFT)) - 1)
50 #define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << (gaw)) - 1)
51
52 /* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR
53    to match. That way, we can use 'unsigned long' for PFNs with impunity. */
54 #define DOMAIN_MAX_PFN(gaw)     ((unsigned long) min_t(uint64_t, \
55                                 __DOMAIN_MAX_PFN(gaw), (unsigned long)-1))
56 #define DOMAIN_MAX_ADDR(gaw)    (((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT)
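/*
 * A worked example (illustrative, assuming the 4 KiB VT-d base page,
 * i.e. VTD_PAGE_SHIFT == 12): for gaw = 48,
 *
 *   __DOMAIN_MAX_PFN(48)  = (1ULL << 36) - 1     (PFN of the highest page)
 *   __DOMAIN_MAX_ADDR(48) = (1ULL << 48) - 1     (highest byte address)
 *
 * On a 64-bit kernel DOMAIN_MAX_PFN(48) is the same 2^36 - 1; on a 32-bit
 * kernel the min_t() above clamps it to ULONG_MAX so that PFN arithmetic
 * stays within 'unsigned long'.
 */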
57
58 static void __init check_tylersburg_isoch(void);
59 static int rwbf_quirk;
60
61 /*
62  * Set to 1 to panic the kernel if VT-d can't be enabled successfully
63  * (used when the kernel is launched with TXT)
64  */
65 static int force_on = 0;
66 static int intel_iommu_tboot_noforce;
67 static int no_platform_optin;
68
69 #define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
70
71 /*
72  * Take a root_entry and return the Lower Context Table Pointer (LCTP)
73  * if marked present.
74  */
75 static phys_addr_t root_entry_lctp(struct root_entry *re)
76 {
77         if (!(re->lo & 1))
78                 return 0;
79
80         return re->lo & VTD_PAGE_MASK;
81 }
82
83 /*
84  * Take a root_entry and return the Upper Context Table Pointer (UCTP)
85  * if marked present.
86  */
87 static phys_addr_t root_entry_uctp(struct root_entry *re)
88 {
89         if (!(re->hi & 1))
90                 return 0;
91
92         return re->hi & VTD_PAGE_MASK;
93 }
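/*
 * Layout note (added for illustration, per the VT-d root-table format):
 * bit 0 of each half is the present bit and bits 12+ hold a 4 KiB-aligned
 * table pointer. In legacy mode only the low 64 bits (re->lo) are used and
 * point to a single 256-entry context table. In scalable mode re->lo (LCTP)
 * covers devfn 0x00-0x7f and re->hi (UCTP) covers devfn 0x80-0xff; see
 * iommu_context_addr() below for how the split is indexed.
 */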
94
95 static int device_rid_cmp_key(const void *key, const struct rb_node *node)
96 {
97         struct device_domain_info *info =
98                 rb_entry(node, struct device_domain_info, node);
99         const u16 *rid_lhs = key;
100
101         if (*rid_lhs < PCI_DEVID(info->bus, info->devfn))
102                 return -1;
103
104         if (*rid_lhs > PCI_DEVID(info->bus, info->devfn))
105                 return 1;
106
107         return 0;
108 }
109
110 static int device_rid_cmp(struct rb_node *lhs, const struct rb_node *rhs)
111 {
112         struct device_domain_info *info =
113                 rb_entry(lhs, struct device_domain_info, node);
114         u16 key = PCI_DEVID(info->bus, info->devfn);
115
116         return device_rid_cmp_key(&key, rhs);
117 }
118
119 /*
120  * Looks up an IOMMU-probed device using its source ID.
121  *
122  * Returns the pointer to the device if there is a match. Otherwise,
123  * returns NULL.
124  *
125  * Note that this helper doesn't guarantee that the device won't be
126  * released by the iommu subsystem after being returned. The caller
127  * should use its own synchronization mechanism to avoid the device
128  * being released while it is still in use.
129  */
130 struct device *device_rbtree_find(struct intel_iommu *iommu, u16 rid)
131 {
132         struct device_domain_info *info = NULL;
133         struct rb_node *node;
134         unsigned long flags;
135
136         spin_lock_irqsave(&iommu->device_rbtree_lock, flags);
137         node = rb_find(&rid, &iommu->device_rbtree, device_rid_cmp_key);
138         if (node)
139                 info = rb_entry(node, struct device_domain_info, node);
140         spin_unlock_irqrestore(&iommu->device_rbtree_lock, flags);
141
142         return info ? info->dev : NULL;
143 }
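/*
 * Illustrative usage sketch (not part of the driver): resolve the source ID
 * reported in a fault record to the probed device. The helper name below is
 * hypothetical; a real caller must additionally synchronize against the
 * device being released, as noted in the comment above.
 */
static inline struct device *
example_dev_from_source_id(struct intel_iommu *iommu, u16 source_id)
{
        /* A PCI source ID is already a requester ID: (bus << 8) | devfn. */
        return device_rbtree_find(iommu, source_id);
}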
144
145 static int device_rbtree_insert(struct intel_iommu *iommu,
146                                 struct device_domain_info *info)
147 {
148         struct rb_node *curr;
149         unsigned long flags;
150
151         spin_lock_irqsave(&iommu->device_rbtree_lock, flags);
152         curr = rb_find_add(&info->node, &iommu->device_rbtree, device_rid_cmp);
153         spin_unlock_irqrestore(&iommu->device_rbtree_lock, flags);
154         if (WARN_ON(curr))
155                 return -EEXIST;
156
157         return 0;
158 }
159
160 static void device_rbtree_remove(struct device_domain_info *info)
161 {
162         struct intel_iommu *iommu = info->iommu;
163         unsigned long flags;
164
165         spin_lock_irqsave(&iommu->device_rbtree_lock, flags);
166         rb_erase(&info->node, &iommu->device_rbtree);
167         spin_unlock_irqrestore(&iommu->device_rbtree_lock, flags);
168 }
169
170 struct dmar_rmrr_unit {
171         struct list_head list;          /* list of rmrr units   */
172         struct acpi_dmar_header *hdr;   /* ACPI header          */
173         u64     base_address;           /* reserved base address*/
174         u64     end_address;            /* reserved end address */
175         struct dmar_dev_scope *devices; /* target devices */
176         int     devices_cnt;            /* target device count */
177 };
178
179 struct dmar_atsr_unit {
180         struct list_head list;          /* list of ATSR units */
181         struct acpi_dmar_header *hdr;   /* ACPI header */
182         struct dmar_dev_scope *devices; /* target devices */
183         int devices_cnt;                /* target device count */
184         u8 include_all:1;               /* include all ports */
185 };
186
187 struct dmar_satc_unit {
188         struct list_head list;          /* list of SATC units */
189         struct acpi_dmar_header *hdr;   /* ACPI header */
190         struct dmar_dev_scope *devices; /* target devices */
191         struct intel_iommu *iommu;      /* the corresponding iommu */
192         int devices_cnt;                /* target device count */
193         u8 atc_required:1;              /* ATS is required */
194 };
195
196 static LIST_HEAD(dmar_atsr_units);
197 static LIST_HEAD(dmar_rmrr_units);
198 static LIST_HEAD(dmar_satc_units);
199
200 #define for_each_rmrr_units(rmrr) \
201         list_for_each_entry(rmrr, &dmar_rmrr_units, list)
202
203 static void intel_iommu_domain_free(struct iommu_domain *domain);
204
205 int dmar_disabled = !IS_ENABLED(CONFIG_INTEL_IOMMU_DEFAULT_ON);
206 int intel_iommu_sm = IS_ENABLED(CONFIG_INTEL_IOMMU_SCALABLE_MODE_DEFAULT_ON);
207
208 int intel_iommu_enabled = 0;
209 EXPORT_SYMBOL_GPL(intel_iommu_enabled);
210
211 static int intel_iommu_superpage = 1;
212 static int iommu_identity_mapping;
213 static int iommu_skip_te_disable;
214 static int disable_igfx_iommu;
215
216 #define IDENTMAP_AZALIA         4
217
218 const struct iommu_ops intel_iommu_ops;
219 static const struct iommu_dirty_ops intel_dirty_ops;
220
221 static bool translation_pre_enabled(struct intel_iommu *iommu)
222 {
223         return (iommu->flags & VTD_FLAG_TRANS_PRE_ENABLED);
224 }
225
226 static void clear_translation_pre_enabled(struct intel_iommu *iommu)
227 {
228         iommu->flags &= ~VTD_FLAG_TRANS_PRE_ENABLED;
229 }
230
231 static void init_translation_status(struct intel_iommu *iommu)
232 {
233         u32 gsts;
234
235         gsts = readl(iommu->reg + DMAR_GSTS_REG);
236         if (gsts & DMA_GSTS_TES)
237                 iommu->flags |= VTD_FLAG_TRANS_PRE_ENABLED;
238 }
239
240 static int __init intel_iommu_setup(char *str)
241 {
242         if (!str)
243                 return -EINVAL;
244
245         while (*str) {
246                 if (!strncmp(str, "on", 2)) {
247                         dmar_disabled = 0;
248                         pr_info("IOMMU enabled\n");
249                 } else if (!strncmp(str, "off", 3)) {
250                         dmar_disabled = 1;
251                         no_platform_optin = 1;
252                         pr_info("IOMMU disabled\n");
253                 } else if (!strncmp(str, "igfx_off", 8)) {
254                         disable_igfx_iommu = 1;
255                         pr_info("Disable GFX device mapping\n");
256                 } else if (!strncmp(str, "forcedac", 8)) {
257                         pr_warn("intel_iommu=forcedac deprecated; use iommu.forcedac instead\n");
258                         iommu_dma_forcedac = true;
259                 } else if (!strncmp(str, "strict", 6)) {
260                         pr_warn("intel_iommu=strict deprecated; use iommu.strict=1 instead\n");
261                         iommu_set_dma_strict();
262                 } else if (!strncmp(str, "sp_off", 6)) {
263                         pr_info("Disable supported super page\n");
264                         intel_iommu_superpage = 0;
265                 } else if (!strncmp(str, "sm_on", 5)) {
266                         pr_info("Enable scalable mode if hardware supports\n");
267                         intel_iommu_sm = 1;
268                 } else if (!strncmp(str, "sm_off", 6)) {
269                         pr_info("Scalable mode is disallowed\n");
270                         intel_iommu_sm = 0;
271                 } else if (!strncmp(str, "tboot_noforce", 13)) {
272                         pr_info("Intel-IOMMU: not forcing on after tboot. This could expose security risk for tboot\n");
273                         intel_iommu_tboot_noforce = 1;
274                 } else {
275                         pr_notice("Unknown option - '%s'\n", str);
276                 }
277
278                 str += strcspn(str, ",");
279                 while (*str == ',')
280                         str++;
281         }
282
283         return 1;
284 }
285 __setup("intel_iommu=", intel_iommu_setup);
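/*
 * Example (illustrative): options are comma-separated and consumed by the
 * strcspn() loop above, so booting with
 *
 *     intel_iommu=on,sm_on
 *
 * enables the IOMMU and requests scalable mode if the hardware supports it.
 * Unrecognized options are reported with pr_notice() and otherwise ignored.
 */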
286
287 static int domain_pfn_supported(struct dmar_domain *domain, unsigned long pfn)
288 {
289         int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
290
291         return !(addr_width < BITS_PER_LONG && pfn >> addr_width);
292 }
293
294 /*
295  * Calculate the Supported Adjusted Guest Address Widths of an IOMMU.
296  * Refer to 11.4.2 of the VT-d spec for the encoding of each bit of
297  * the returned SAGAW.
298  */
299 static unsigned long __iommu_calculate_sagaw(struct intel_iommu *iommu)
300 {
301         unsigned long fl_sagaw, sl_sagaw;
302
303         fl_sagaw = BIT(2) | (cap_fl5lp_support(iommu->cap) ? BIT(3) : 0);
304         sl_sagaw = cap_sagaw(iommu->cap);
305
306         /* Second level only. */
307         if (!sm_supported(iommu) || !ecap_flts(iommu->ecap))
308                 return sl_sagaw;
309
310         /* First level only. */
311         if (!ecap_slts(iommu->ecap))
312                 return fl_sagaw;
313
314         return fl_sagaw & sl_sagaw;
315 }
316
317 static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw)
318 {
319         unsigned long sagaw;
320         int agaw;
321
322         sagaw = __iommu_calculate_sagaw(iommu);
323         for (agaw = width_to_agaw(max_gaw); agaw >= 0; agaw--) {
324                 if (test_bit(agaw, &sagaw))
325                         break;
326         }
327
328         return agaw;
329 }
330
331 /*
332  * Calculate max SAGAW for each iommu.
333  */
334 int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
335 {
336         return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH);
337 }
338
339 /*
340  * calculate agaw for each iommu.
341  * "SAGAW" may be different across iommus, use a default agaw, and
342  * fall back to a smaller supported agaw for iommus that don't support the default agaw.
343  */
344 int iommu_calculate_agaw(struct intel_iommu *iommu)
345 {
346         return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH);
347 }
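/*
 * Worked example (illustrative, using the usual AGAW encoding where AGAW n
 * corresponds to a 30 + 9*n bit address width): iommu_calculate_agaw()
 * above starts at width_to_agaw(57) == 3 (5-level tables). If the effective
 * SAGAW computed by __iommu_calculate_sagaw() only has BIT(2) set (48-bit,
 * 4-level), the loop steps down and returns agaw == 2, so the domain is
 * limited to a 48-bit address width despite the 57-bit default.
 */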
348
349 static bool iommu_paging_structure_coherency(struct intel_iommu *iommu)
350 {
351         return sm_supported(iommu) ?
352                         ecap_smpwc(iommu->ecap) : ecap_coherent(iommu->ecap);
353 }
354
355 static void domain_update_iommu_coherency(struct dmar_domain *domain)
356 {
357         struct iommu_domain_info *info;
358         struct dmar_drhd_unit *drhd;
359         struct intel_iommu *iommu;
360         bool found = false;
361         unsigned long i;
362
363         domain->iommu_coherency = true;
364         xa_for_each(&domain->iommu_array, i, info) {
365                 found = true;
366                 if (!iommu_paging_structure_coherency(info->iommu)) {
367                         domain->iommu_coherency = false;
368                         break;
369                 }
370         }
371         if (found)
372                 return;
373
374         /* No hardware attached; use lowest common denominator */
375         rcu_read_lock();
376         for_each_active_iommu(iommu, drhd) {
377                 if (!iommu_paging_structure_coherency(iommu)) {
378                         domain->iommu_coherency = false;
379                         break;
380                 }
381         }
382         rcu_read_unlock();
383 }
384
385 static int domain_update_iommu_superpage(struct dmar_domain *domain,
386                                          struct intel_iommu *skip)
387 {
388         struct dmar_drhd_unit *drhd;
389         struct intel_iommu *iommu;
390         int mask = 0x3;
391
392         if (!intel_iommu_superpage)
393                 return 0;
394
395         /* set iommu_superpage to the smallest common denominator */
396         rcu_read_lock();
397         for_each_active_iommu(iommu, drhd) {
398                 if (iommu != skip) {
399                         if (domain && domain->use_first_level) {
400                                 if (!cap_fl1gp_support(iommu->cap))
401                                         mask = 0x1;
402                         } else {
403                                 mask &= cap_super_page_val(iommu->cap);
404                         }
405
406                         if (!mask)
407                                 break;
408                 }
409         }
410         rcu_read_unlock();
411
412         return fls(mask);
413 }
414
415 static int domain_update_device_node(struct dmar_domain *domain)
416 {
417         struct device_domain_info *info;
418         int nid = NUMA_NO_NODE;
419         unsigned long flags;
420
421         spin_lock_irqsave(&domain->lock, flags);
422         list_for_each_entry(info, &domain->devices, link) {
423                 /*
424                  * There may be multiple device NUMA nodes, as devices
425                  * within the same domain can sit behind different IOMMUs.
426                  * There is no perfect answer in such a situation, so we use
427                  * a first-come, first-served policy.
428                  */
429                 nid = dev_to_node(info->dev);
430                 if (nid != NUMA_NO_NODE)
431                         break;
432         }
433         spin_unlock_irqrestore(&domain->lock, flags);
434
435         return nid;
436 }
437
438 /* Return the super pagesize bitmap if supported. */
439 static unsigned long domain_super_pgsize_bitmap(struct dmar_domain *domain)
440 {
441         unsigned long bitmap = 0;
442
443         /*
444          * 1-level super page supports page size of 2MiB, 2-level super page
445          * supports page size of both 2MiB and 1GiB.
446          */
447         if (domain->iommu_superpage == 1)
448                 bitmap |= SZ_2M;
449         else if (domain->iommu_superpage == 2)
450                 bitmap |= SZ_2M | SZ_1G;
451
452         return bitmap;
453 }
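/*
 * Worked example (illustrative; CAP.SLLPS bit 0 advertises 2 MiB and bit 1
 * advertises 1 GiB second-level superpages): with two active IOMMUs, one
 * supporting 2M+1G (0x3) and one supporting only 2M (0x1),
 * domain_update_iommu_superpage() computes mask = 0x3 & 0x1 = 0x1 and
 * returns fls(0x1) = 1, so domain_super_pgsize_bitmap() adds only SZ_2M to
 * the domain's pgsize bitmap. For first-level domains, 1 GiB additionally
 * depends on cap_fl1gp_support().
 */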
454
455 /* Some capabilities may be different across iommus */
456 void domain_update_iommu_cap(struct dmar_domain *domain)
457 {
458         domain_update_iommu_coherency(domain);
459         domain->iommu_superpage = domain_update_iommu_superpage(domain, NULL);
460
461         /*
462          * If RHSA is missing, we should default to the device numa domain
463          * as fall back.
464          */
465         if (domain->nid == NUMA_NO_NODE)
466                 domain->nid = domain_update_device_node(domain);
467
468         /*
469          * First-level translation restricts the input-address to a
470          * canonical address (i.e., address bits 63:N have the same
471          * value as address bit [N-1], where N is 48-bits with 4-level
472          * paging and 57-bits with 5-level paging). Hence, skip bit
473          * [N-1].
474          */
475         if (domain->use_first_level)
476                 domain->domain.geometry.aperture_end = __DOMAIN_MAX_ADDR(domain->gaw - 1);
477         else
478                 domain->domain.geometry.aperture_end = __DOMAIN_MAX_ADDR(domain->gaw);
479
480         domain->domain.pgsize_bitmap |= domain_super_pgsize_bitmap(domain);
481 }
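/*
 * Worked example (illustrative): for a domain with gaw == 48, second-level
 * translation gives aperture_end = __DOMAIN_MAX_ADDR(48) = 2^48 - 1, while
 * first-level translation gives __DOMAIN_MAX_ADDR(47) = 2^47 - 1, because
 * bit 47 is the sign-extension bit of a canonical 48-bit address and must
 * not be used as a regular input-address bit.
 */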
482
483 struct context_entry *iommu_context_addr(struct intel_iommu *iommu, u8 bus,
484                                          u8 devfn, int alloc)
485 {
486         struct root_entry *root = &iommu->root_entry[bus];
487         struct context_entry *context;
488         u64 *entry;
489
490         /*
491          * Unless the caller requested to allocate a new entry,
492          * returning a copied context entry makes no sense.
493          */
494         if (!alloc && context_copied(iommu, bus, devfn))
495                 return NULL;
496
497         entry = &root->lo;
498         if (sm_supported(iommu)) {
499                 if (devfn >= 0x80) {
500                         devfn -= 0x80;
501                         entry = &root->hi;
502                 }
503                 devfn *= 2;
504         }
505         if (*entry & 1)
506                 context = phys_to_virt(*entry & VTD_PAGE_MASK);
507         else {
508                 unsigned long phy_addr;
509                 if (!alloc)
510                         return NULL;
511
512                 context = iommu_alloc_page_node(iommu->node, GFP_ATOMIC);
513                 if (!context)
514                         return NULL;
515
516                 __iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
517                 phy_addr = virt_to_phys((void *)context);
518                 *entry = phy_addr | 1;
519                 __iommu_flush_cache(iommu, entry, sizeof(*entry));
520         }
521         return &context[devfn];
522 }
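/*
 * Worked example (illustrative): in legacy mode root->lo points to one
 * 4 KiB context table of 256 16-byte entries indexed directly by devfn.
 * In scalable mode each entry is twice as large, so the table is split:
 * devfn 0x85 selects root->hi, devfn becomes 0x05 and is doubled to 0x0a,
 * and &context[0x0a] is the start of that device's 32-byte scalable-mode
 * context entry.
 */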
523
524 /**
525  * is_downstream_to_pci_bridge - test if a device belongs to the PCI
526  *                               sub-hierarchy of a candidate PCI-PCI bridge
527  * @dev: candidate PCI device belonging to @bridge PCI sub-hierarchy
528  * @bridge: the candidate PCI-PCI bridge
529  *
530  * Return: true if @dev belongs to @bridge PCI sub-hierarchy, else false.
531  */
532 static bool
533 is_downstream_to_pci_bridge(struct device *dev, struct device *bridge)
534 {
535         struct pci_dev *pdev, *pbridge;
536
537         if (!dev_is_pci(dev) || !dev_is_pci(bridge))
538                 return false;
539
540         pdev = to_pci_dev(dev);
541         pbridge = to_pci_dev(bridge);
542
543         if (pbridge->subordinate &&
544             pbridge->subordinate->number <= pdev->bus->number &&
545             pbridge->subordinate->busn_res.end >= pdev->bus->number)
546                 return true;
547
548         return false;
549 }
550
551 static bool quirk_ioat_snb_local_iommu(struct pci_dev *pdev)
552 {
553         struct dmar_drhd_unit *drhd;
554         u32 vtbar;
555         int rc;
556
557         /* We know that this device on this chipset has its own IOMMU.
558          * If we find it under a different IOMMU, then the BIOS is lying
559          * to us. Hope that the IOMMU for this device is actually
560          * disabled, and it needs no translation...
561          */
562         rc = pci_bus_read_config_dword(pdev->bus, PCI_DEVFN(0, 0), 0xb0, &vtbar);
563         if (rc) {
564                 /* "can't" happen */
565                 dev_info(&pdev->dev, "failed to run vt-d quirk\n");
566                 return false;
567         }
568         vtbar &= 0xffff0000;
569
570         /* we know that this iommu should be at offset 0xa000 from vtbar */
571         drhd = dmar_find_matched_drhd_unit(pdev);
572         if (!drhd || drhd->reg_base_addr - vtbar != 0xa000) {
573                 pr_warn_once(FW_BUG "BIOS assigned incorrect VT-d unit for Intel(R) QuickData Technology device\n");
574                 add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
575                 return true;
576         }
577
578         return false;
579 }
580
581 static bool iommu_is_dummy(struct intel_iommu *iommu, struct device *dev)
582 {
583         if (!iommu || iommu->drhd->ignored)
584                 return true;
585
586         if (dev_is_pci(dev)) {
587                 struct pci_dev *pdev = to_pci_dev(dev);
588
589                 if (pdev->vendor == PCI_VENDOR_ID_INTEL &&
590                     pdev->device == PCI_DEVICE_ID_INTEL_IOAT_SNB &&
591                     quirk_ioat_snb_local_iommu(pdev))
592                         return true;
593         }
594
595         return false;
596 }
597
598 static struct intel_iommu *device_lookup_iommu(struct device *dev, u8 *bus, u8 *devfn)
599 {
600         struct dmar_drhd_unit *drhd = NULL;
601         struct pci_dev *pdev = NULL;
602         struct intel_iommu *iommu;
603         struct device *tmp;
604         u16 segment = 0;
605         int i;
606
607         if (!dev)
608                 return NULL;
609
610         if (dev_is_pci(dev)) {
611                 struct pci_dev *pf_pdev;
612
613                 pdev = pci_real_dma_dev(to_pci_dev(dev));
614
615                 /* VFs aren't listed in scope tables; we need to look up
616                  * the PF instead to find the IOMMU. */
617                 pf_pdev = pci_physfn(pdev);
618                 dev = &pf_pdev->dev;
619                 segment = pci_domain_nr(pdev->bus);
620         } else if (has_acpi_companion(dev))
621                 dev = &ACPI_COMPANION(dev)->dev;
622
623         rcu_read_lock();
624         for_each_iommu(iommu, drhd) {
625                 if (pdev && segment != drhd->segment)
626                         continue;
627
628                 for_each_active_dev_scope(drhd->devices,
629                                           drhd->devices_cnt, i, tmp) {
630                         if (tmp == dev) {
631                                 /* For a VF use its original BDF# not that of the PF
632                                  * which we used for the IOMMU lookup. Strictly speaking
633                                  * we could do this for all PCI devices; we only need to
634                                  * get the BDF# from the scope table for ACPI matches. */
635                                 if (pdev && pdev->is_virtfn)
636                                         goto got_pdev;
637
638                                 if (bus && devfn) {
639                                         *bus = drhd->devices[i].bus;
640                                         *devfn = drhd->devices[i].devfn;
641                                 }
642                                 goto out;
643                         }
644
645                         if (is_downstream_to_pci_bridge(dev, tmp))
646                                 goto got_pdev;
647                 }
648
649                 if (pdev && drhd->include_all) {
650 got_pdev:
651                         if (bus && devfn) {
652                                 *bus = pdev->bus->number;
653                                 *devfn = pdev->devfn;
654                         }
655                         goto out;
656                 }
657         }
658         iommu = NULL;
659 out:
660         if (iommu_is_dummy(iommu, dev))
661                 iommu = NULL;
662
663         rcu_read_unlock();
664
665         return iommu;
666 }
667
668 static void domain_flush_cache(struct dmar_domain *domain,
669                                void *addr, int size)
670 {
671         if (!domain->iommu_coherency)
672                 clflush_cache_range(addr, size);
673 }
674
675 static void free_context_table(struct intel_iommu *iommu)
676 {
677         struct context_entry *context;
678         int i;
679
680         if (!iommu->root_entry)
681                 return;
682
683         for (i = 0; i < ROOT_ENTRY_NR; i++) {
684                 context = iommu_context_addr(iommu, i, 0, 0);
685                 if (context)
686                         iommu_free_page(context);
687
688                 if (!sm_supported(iommu))
689                         continue;
690
691                 context = iommu_context_addr(iommu, i, 0x80, 0);
692                 if (context)
693                         iommu_free_page(context);
694         }
695
696         iommu_free_page(iommu->root_entry);
697         iommu->root_entry = NULL;
698 }
699
700 #ifdef CONFIG_DMAR_DEBUG
701 static void pgtable_walk(struct intel_iommu *iommu, unsigned long pfn,
702                          u8 bus, u8 devfn, struct dma_pte *parent, int level)
703 {
704         struct dma_pte *pte;
705         int offset;
706
707         while (1) {
708                 offset = pfn_level_offset(pfn, level);
709                 pte = &parent[offset];
710                 if (!pte || (dma_pte_superpage(pte) || !dma_pte_present(pte))) {
711                         pr_info("PTE not present at level %d\n", level);
712                         break;
713                 }
714
715                 pr_info("pte level: %d, pte value: 0x%016llx\n", level, pte->val);
716
717                 if (level == 1)
718                         break;
719
720                 parent = phys_to_virt(dma_pte_addr(pte));
721                 level--;
722         }
723 }
724
725 void dmar_fault_dump_ptes(struct intel_iommu *iommu, u16 source_id,
726                           unsigned long long addr, u32 pasid)
727 {
728         struct pasid_dir_entry *dir, *pde;
729         struct pasid_entry *entries, *pte;
730         struct context_entry *ctx_entry;
731         struct root_entry *rt_entry;
732         int i, dir_index, index, level;
733         u8 devfn = source_id & 0xff;
734         u8 bus = source_id >> 8;
735         struct dma_pte *pgtable;
736
737         pr_info("Dump %s table entries for IOVA 0x%llx\n", iommu->name, addr);
738
739         /* root entry dump */
740         rt_entry = &iommu->root_entry[bus];
741         if (!rt_entry) {
742                 pr_info("root table entry is not present\n");
743                 return;
744         }
745
746         if (sm_supported(iommu))
747                 pr_info("scalable mode root entry: hi 0x%016llx, low 0x%016llx\n",
748                         rt_entry->hi, rt_entry->lo);
749         else
750                 pr_info("root entry: 0x%016llx\n", rt_entry->lo);
751
752         /* context entry dump */
753         ctx_entry = iommu_context_addr(iommu, bus, devfn, 0);
754         if (!ctx_entry) {
755                 pr_info("context table entry is not present\n");
756                 return;
757         }
758
759         pr_info("context entry: hi 0x%016llx, low 0x%016llx\n",
760                 ctx_entry->hi, ctx_entry->lo);
761
762         /* legacy mode does not require PASID entries */
763         if (!sm_supported(iommu)) {
764                 level = agaw_to_level(ctx_entry->hi & 7);
765                 pgtable = phys_to_virt(ctx_entry->lo & VTD_PAGE_MASK);
766                 goto pgtable_walk;
767         }
768
769         /* get the pointer to pasid directory entry */
770         dir = phys_to_virt(ctx_entry->lo & VTD_PAGE_MASK);
771         if (!dir) {
772                 pr_info("pasid directory entry is not present\n");
773                 return;
774         }
775         /* For request-without-pasid, get the pasid from context entry */
776         if (intel_iommu_sm && pasid == IOMMU_PASID_INVALID)
777                 pasid = IOMMU_NO_PASID;
778
779         dir_index = pasid >> PASID_PDE_SHIFT;
780         pde = &dir[dir_index];
781         pr_info("pasid dir entry: 0x%016llx\n", pde->val);
782
783         /* get the pointer to the pasid table entry */
784         entries = get_pasid_table_from_pde(pde);
785         if (!entries) {
786                 pr_info("pasid table entry is not present\n");
787                 return;
788         }
789         index = pasid & PASID_PTE_MASK;
790         pte = &entries[index];
791         for (i = 0; i < ARRAY_SIZE(pte->val); i++)
792                 pr_info("pasid table entry[%d]: 0x%016llx\n", i, pte->val[i]);
793
794         if (pasid_pte_get_pgtt(pte) == PASID_ENTRY_PGTT_FL_ONLY) {
795                 level = pte->val[2] & BIT_ULL(2) ? 5 : 4;
796                 pgtable = phys_to_virt(pte->val[2] & VTD_PAGE_MASK);
797         } else {
798                 level = agaw_to_level((pte->val[0] >> 2) & 0x7);
799                 pgtable = phys_to_virt(pte->val[0] & VTD_PAGE_MASK);
800         }
801
802 pgtable_walk:
803         pgtable_walk(iommu, addr >> VTD_PAGE_SHIFT, bus, devfn, pgtable, level);
804 }
805 #endif
806
807 static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
808                                       unsigned long pfn, int *target_level,
809                                       gfp_t gfp)
810 {
811         struct dma_pte *parent, *pte;
812         int level = agaw_to_level(domain->agaw);
813         int offset;
814
815         if (!domain_pfn_supported(domain, pfn))
816                 /* Address beyond IOMMU's addressing capabilities. */
817                 return NULL;
818
819         parent = domain->pgd;
820
821         while (1) {
822                 void *tmp_page;
823
824                 offset = pfn_level_offset(pfn, level);
825                 pte = &parent[offset];
826                 if (!*target_level && (dma_pte_superpage(pte) || !dma_pte_present(pte)))
827                         break;
828                 if (level == *target_level)
829                         break;
830
831                 if (!dma_pte_present(pte)) {
832                         uint64_t pteval, tmp;
833
834                         tmp_page = iommu_alloc_page_node(domain->nid, gfp);
835
836                         if (!tmp_page)
837                                 return NULL;
838
839                         domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE);
840                         pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE;
841                         if (domain->use_first_level)
842                                 pteval |= DMA_FL_PTE_US | DMA_FL_PTE_ACCESS;
843
844                         tmp = 0ULL;
845                         if (!try_cmpxchg64(&pte->val, &tmp, pteval))
846                                 /* Someone else set it while we were thinking; use theirs. */
847                                 iommu_free_page(tmp_page);
848                         else
849                                 domain_flush_cache(domain, pte, sizeof(*pte));
850                 }
851                 if (level == 1)
852                         break;
853
854                 parent = phys_to_virt(dma_pte_addr(pte));
855                 level--;
856         }
857
858         if (!*target_level)
859                 *target_level = level;
860
861         return pte;
862 }
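/*
 * Worked example (illustrative, assuming the driver's usual 9-bit-per-level
 * layout, i.e. LEVEL_STRIDE == 9): a domain with agaw == 2 uses
 * agaw_to_level(2) == 4 levels. For pfn 0x12345 (IOVA 0x12345000) the walk
 * uses pfn_level_offset() indices 0, 0, 0x91 and 0x145 at levels 4 down to
 * 1, allocating any missing intermediate tables as it descends.
 */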
863
864 /* return address's pte at specific level */
865 static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain,
866                                          unsigned long pfn,
867                                          int level, int *large_page)
868 {
869         struct dma_pte *parent, *pte;
870         int total = agaw_to_level(domain->agaw);
871         int offset;
872
873         parent = domain->pgd;
874         while (level <= total) {
875                 offset = pfn_level_offset(pfn, total);
876                 pte = &parent[offset];
877                 if (level == total)
878                         return pte;
879
880                 if (!dma_pte_present(pte)) {
881                         *large_page = total;
882                         break;
883                 }
884
885                 if (dma_pte_superpage(pte)) {
886                         *large_page = total;
887                         return pte;
888                 }
889
890                 parent = phys_to_virt(dma_pte_addr(pte));
891                 total--;
892         }
893         return NULL;
894 }
895
896 /* clear last level pte, a tlb flush should follow */
897 static void dma_pte_clear_range(struct dmar_domain *domain,
898                                 unsigned long start_pfn,
899                                 unsigned long last_pfn)
900 {
901         unsigned int large_page;
902         struct dma_pte *first_pte, *pte;
903
904         if (WARN_ON(!domain_pfn_supported(domain, last_pfn)) ||
905             WARN_ON(start_pfn > last_pfn))
906                 return;
907
908         /* we don't need lock here; nobody else touches the iova range */
909         do {
910                 large_page = 1;
911                 first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1, &large_page);
912                 if (!pte) {
913                         start_pfn = align_to_level(start_pfn + 1, large_page + 1);
914                         continue;
915                 }
916                 do {
917                         dma_clear_pte(pte);
918                         start_pfn += lvl_to_nr_pages(large_page);
919                         pte++;
920                 } while (start_pfn <= last_pfn && !first_pte_in_page(pte));
921
922                 domain_flush_cache(domain, first_pte,
923                                    (void *)pte - (void *)first_pte);
924
925         } while (start_pfn && start_pfn <= last_pfn);
926 }
927
928 static void dma_pte_free_level(struct dmar_domain *domain, int level,
929                                int retain_level, struct dma_pte *pte,
930                                unsigned long pfn, unsigned long start_pfn,
931                                unsigned long last_pfn)
932 {
933         pfn = max(start_pfn, pfn);
934         pte = &pte[pfn_level_offset(pfn, level)];
935
936         do {
937                 unsigned long level_pfn;
938                 struct dma_pte *level_pte;
939
940                 if (!dma_pte_present(pte) || dma_pte_superpage(pte))
941                         goto next;
942
943                 level_pfn = pfn & level_mask(level);
944                 level_pte = phys_to_virt(dma_pte_addr(pte));
945
946                 if (level > 2) {
947                         dma_pte_free_level(domain, level - 1, retain_level,
948                                            level_pte, level_pfn, start_pfn,
949                                            last_pfn);
950                 }
951
952                 /*
953                  * Free the page table if we're below the level we want to
954                  * retain and the range covers the entire table.
955                  */
956                 if (level < retain_level && !(start_pfn > level_pfn ||
957                       last_pfn < level_pfn + level_size(level) - 1)) {
958                         dma_clear_pte(pte);
959                         domain_flush_cache(domain, pte, sizeof(*pte));
960                         iommu_free_page(level_pte);
961                 }
962 next:
963                 pfn += level_size(level);
964         } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
965 }
966
967 /*
968  * clear last level (leaf) ptes and free page table pages below the
969  * level we wish to keep intact.
970  */
971 static void dma_pte_free_pagetable(struct dmar_domain *domain,
972                                    unsigned long start_pfn,
973                                    unsigned long last_pfn,
974                                    int retain_level)
975 {
976         dma_pte_clear_range(domain, start_pfn, last_pfn);
977
978         /* We don't need lock here; nobody else touches the iova range */
979         dma_pte_free_level(domain, agaw_to_level(domain->agaw), retain_level,
980                            domain->pgd, 0, start_pfn, last_pfn);
981
982         /* free pgd */
983         if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
984                 iommu_free_page(domain->pgd);
985                 domain->pgd = NULL;
986         }
987 }
988
989 /* When a page at a given level is being unlinked from its parent, we don't
990    need to *modify* it at all. All we need to do is make a list of all the
991    pages which can be freed just as soon as we've flushed the IOTLB and we
992    know the hardware page-walk will no longer touch them.
993    The 'pte' argument is the *parent* PTE, pointing to the page that is to
994    be freed. */
995 static void dma_pte_list_pagetables(struct dmar_domain *domain,
996                                     int level, struct dma_pte *pte,
997                                     struct list_head *freelist)
998 {
999         struct page *pg;
1000
1001         pg = pfn_to_page(dma_pte_addr(pte) >> PAGE_SHIFT);
1002         list_add_tail(&pg->lru, freelist);
1003
1004         if (level == 1)
1005                 return;
1006
1007         pte = page_address(pg);
1008         do {
1009                 if (dma_pte_present(pte) && !dma_pte_superpage(pte))
1010                         dma_pte_list_pagetables(domain, level - 1, pte, freelist);
1011                 pte++;
1012         } while (!first_pte_in_page(pte));
1013 }
1014
1015 static void dma_pte_clear_level(struct dmar_domain *domain, int level,
1016                                 struct dma_pte *pte, unsigned long pfn,
1017                                 unsigned long start_pfn, unsigned long last_pfn,
1018                                 struct list_head *freelist)
1019 {
1020         struct dma_pte *first_pte = NULL, *last_pte = NULL;
1021
1022         pfn = max(start_pfn, pfn);
1023         pte = &pte[pfn_level_offset(pfn, level)];
1024
1025         do {
1026                 unsigned long level_pfn = pfn & level_mask(level);
1027
1028                 if (!dma_pte_present(pte))
1029                         goto next;
1030
1031                 /* If range covers entire pagetable, free it */
1032                 if (start_pfn <= level_pfn &&
1033                     last_pfn >= level_pfn + level_size(level) - 1) {
1034                         /* These subordinate page tables are going away entirely. Don't
1035                            bother to clear them; we're just going to *free* them. */
1036                         if (level > 1 && !dma_pte_superpage(pte))
1037                                 dma_pte_list_pagetables(domain, level - 1, pte, freelist);
1038
1039                         dma_clear_pte(pte);
1040                         if (!first_pte)
1041                                 first_pte = pte;
1042                         last_pte = pte;
1043                 } else if (level > 1) {
1044                         /* Recurse down into a level that isn't *entirely* obsolete */
1045                         dma_pte_clear_level(domain, level - 1,
1046                                             phys_to_virt(dma_pte_addr(pte)),
1047                                             level_pfn, start_pfn, last_pfn,
1048                                             freelist);
1049                 }
1050 next:
1051                 pfn = level_pfn + level_size(level);
1052         } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1053
1054         if (first_pte)
1055                 domain_flush_cache(domain, first_pte,
1056                                    (void *)++last_pte - (void *)first_pte);
1057 }
1058
1059 /* We can't just free the pages because the IOMMU may still be walking
1060    the page tables, and may have cached the intermediate levels. The
1061    pages can only be freed after the IOTLB flush has been done. */
1062 static void domain_unmap(struct dmar_domain *domain, unsigned long start_pfn,
1063                          unsigned long last_pfn, struct list_head *freelist)
1064 {
1065         if (WARN_ON(!domain_pfn_supported(domain, last_pfn)) ||
1066             WARN_ON(start_pfn > last_pfn))
1067                 return;
1068
1069         /* we don't need lock here; nobody else touches the iova range */
1070         dma_pte_clear_level(domain, agaw_to_level(domain->agaw),
1071                             domain->pgd, 0, start_pfn, last_pfn, freelist);
1072
1073         /* free pgd */
1074         if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1075                 struct page *pgd_page = virt_to_page(domain->pgd);
1076                 list_add_tail(&pgd_page->lru, freelist);
1077                 domain->pgd = NULL;
1078         }
1079 }
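/*
 * Typical flow (illustrative): the caller first collects all page-table
 * pages for the range on a local freelist via domain_unmap(), then flushes
 * the IOTLB (or tears the whole domain down, as domain_exit() does below),
 * and only then returns the pages with iommu_put_pages_list(&freelist).
 * Freeing before the flush would let the hardware walk freed memory.
 */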
1080
1081 /* iommu handling */
1082 static int iommu_alloc_root_entry(struct intel_iommu *iommu)
1083 {
1084         struct root_entry *root;
1085
1086         root = iommu_alloc_page_node(iommu->node, GFP_ATOMIC);
1087         if (!root) {
1088                 pr_err("Allocating root entry for %s failed\n",
1089                         iommu->name);
1090                 return -ENOMEM;
1091         }
1092
1093         __iommu_flush_cache(iommu, root, ROOT_SIZE);
1094         iommu->root_entry = root;
1095
1096         return 0;
1097 }
1098
1099 static void iommu_set_root_entry(struct intel_iommu *iommu)
1100 {
1101         u64 addr;
1102         u32 sts;
1103         unsigned long flag;
1104
1105         addr = virt_to_phys(iommu->root_entry);
1106         if (sm_supported(iommu))
1107                 addr |= DMA_RTADDR_SMT;
1108
1109         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1110         dmar_writeq(iommu->reg + DMAR_RTADDR_REG, addr);
1111
1112         writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG);
1113
1114         /* Make sure hardware completes it */
1115         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1116                       readl, (sts & DMA_GSTS_RTPS), sts);
1117
1118         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1119
1120         /*
1121          * Hardware invalidates all DMA remapping hardware translation
1122          * caches as part of SRTP flow.
1123          */
1124         if (cap_esrtps(iommu->cap))
1125                 return;
1126
1127         iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
1128         if (sm_supported(iommu))
1129                 qi_flush_pasid_cache(iommu, 0, QI_PC_GLOBAL, 0);
1130         iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
1131 }
1132
1133 void iommu_flush_write_buffer(struct intel_iommu *iommu)
1134 {
1135         u32 val;
1136         unsigned long flag;
1137
1138         if (!rwbf_quirk && !cap_rwbf(iommu->cap))
1139                 return;
1140
1141         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1142         writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG);
1143
1144         /* Make sure hardware completes it */
1145         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1146                       readl, (!(val & DMA_GSTS_WBFS)), val);
1147
1148         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1149 }
1150
1151 /* return value determines if we need a write buffer flush */
1152 static void __iommu_flush_context(struct intel_iommu *iommu,
1153                                   u16 did, u16 source_id, u8 function_mask,
1154                                   u64 type)
1155 {
1156         u64 val = 0;
1157         unsigned long flag;
1158
1159         switch (type) {
1160         case DMA_CCMD_GLOBAL_INVL:
1161                 val = DMA_CCMD_GLOBAL_INVL;
1162                 break;
1163         case DMA_CCMD_DOMAIN_INVL:
1164                 val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
1165                 break;
1166         case DMA_CCMD_DEVICE_INVL:
1167                 val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
1168                         | DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
1169                 break;
1170         default:
1171                 pr_warn("%s: Unexpected context-cache invalidation type 0x%llx\n",
1172                         iommu->name, type);
1173                 return;
1174         }
1175         val |= DMA_CCMD_ICC;
1176
1177         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1178         dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
1179
1180         /* Make sure hardware completes it */
1181         IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
1182                 dmar_readq, (!(val & DMA_CCMD_ICC)), val);
1183
1184         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1185 }
1186
1187 void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did, u64 addr,
1188                          unsigned int size_order, u64 type)
1189 {
1190         int tlb_offset = ecap_iotlb_offset(iommu->ecap);
1191         u64 val = 0, val_iva = 0;
1192         unsigned long flag;
1193
1194         switch (type) {
1195         case DMA_TLB_GLOBAL_FLUSH:
1196                 /* global flush doesn't need to set IVA_REG */
1197                 val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
1198                 break;
1199         case DMA_TLB_DSI_FLUSH:
1200                 val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1201                 break;
1202         case DMA_TLB_PSI_FLUSH:
1203                 val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1204                 /* IH bit is passed in as part of address */
1205                 val_iva = size_order | addr;
1206                 break;
1207         default:
1208                 pr_warn("%s: Unexpected iotlb invalidation type 0x%llx\n",
1209                         iommu->name, type);
1210                 return;
1211         }
1212
1213         if (cap_write_drain(iommu->cap))
1214                 val |= DMA_TLB_WRITE_DRAIN;
1215
1216         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1217         /* Note: Only uses first TLB reg currently */
1218         if (val_iva)
1219                 dmar_writeq(iommu->reg + tlb_offset, val_iva);
1220         dmar_writeq(iommu->reg + tlb_offset + 8, val);
1221
1222         /* Make sure hardware completes it */
1223         IOMMU_WAIT_OP(iommu, tlb_offset + 8,
1224                 dmar_readq, (!(val & DMA_TLB_IVT)), val);
1225
1226         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1227
1228         /* check IOTLB invalidation granularity */
1229         if (DMA_TLB_IAIG(val) == 0)
1230                 pr_err("Flush IOTLB failed\n");
1231         if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
1232                 pr_debug("TLB flush request %Lx, actual %Lx\n",
1233                         (unsigned long long)DMA_TLB_IIRG(type),
1234                         (unsigned long long)DMA_TLB_IAIG(val));
1235 }
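/*
 * Worked example (illustrative, per the IVA register layout with the
 * address-mask order in the low bits and the page-aligned address above):
 * a page-selective flush of 16 pages starting at IOVA 0x100000 passes
 * size_order = 4 (2^4 pages) and addr = 0x100000, so val_iva = 0x100000 | 4.
 * The caller may also OR the IH hint into addr, as noted in the
 * DMA_TLB_PSI_FLUSH case above.
 */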
1236
1237 static struct device_domain_info *
1238 domain_lookup_dev_info(struct dmar_domain *domain,
1239                        struct intel_iommu *iommu, u8 bus, u8 devfn)
1240 {
1241         struct device_domain_info *info;
1242         unsigned long flags;
1243
1244         spin_lock_irqsave(&domain->lock, flags);
1245         list_for_each_entry(info, &domain->devices, link) {
1246                 if (info->iommu == iommu && info->bus == bus &&
1247                     info->devfn == devfn) {
1248                         spin_unlock_irqrestore(&domain->lock, flags);
1249                         return info;
1250                 }
1251         }
1252         spin_unlock_irqrestore(&domain->lock, flags);
1253
1254         return NULL;
1255 }
1256
1257 /*
1258  * The extra devTLB flush quirk impacts those QAT devices with PCI device
1259  * IDs ranging from 0x4940 to 0x4943. It is exempted from the risky_device()
1260  * check because it applies only to the built-in QAT devices and it doesn't
1261  * grant additional privileges.
1262  */
1263 #define BUGGY_QAT_DEVID_MASK 0x4940
1264 static bool dev_needs_extra_dtlb_flush(struct pci_dev *pdev)
1265 {
1266         if (pdev->vendor != PCI_VENDOR_ID_INTEL)
1267                 return false;
1268
1269         if ((pdev->device & 0xfffc) != BUGGY_QAT_DEVID_MASK)
1270                 return false;
1271
1272         return true;
1273 }
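/*
 * Worked example (illustrative): masking with 0xfffc clears the two low
 * bits, so (device & 0xfffc) == 0x4940 matches exactly the device IDs
 * 0x4940, 0x4941, 0x4942 and 0x4943 mentioned above.
 */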
1274
1275 static void iommu_enable_pci_caps(struct device_domain_info *info)
1276 {
1277         struct pci_dev *pdev;
1278
1279         if (!dev_is_pci(info->dev))
1280                 return;
1281
1282         pdev = to_pci_dev(info->dev);
1283         if (info->ats_supported && pci_ats_page_aligned(pdev) &&
1284             !pci_enable_ats(pdev, VTD_PAGE_SHIFT))
1285                 info->ats_enabled = 1;
1286 }
1287
1288 static void iommu_disable_pci_caps(struct device_domain_info *info)
1289 {
1290         struct pci_dev *pdev;
1291
1292         if (!dev_is_pci(info->dev))
1293                 return;
1294
1295         pdev = to_pci_dev(info->dev);
1296
1297         if (info->ats_enabled) {
1298                 pci_disable_ats(pdev);
1299                 info->ats_enabled = 0;
1300         }
1301 }
1302
1303 static void intel_flush_iotlb_all(struct iommu_domain *domain)
1304 {
1305         cache_tag_flush_all(to_dmar_domain(domain));
1306 }
1307
1308 static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
1309 {
1310         u32 pmen;
1311         unsigned long flags;
1312
1313         if (!cap_plmr(iommu->cap) && !cap_phmr(iommu->cap))
1314                 return;
1315
1316         raw_spin_lock_irqsave(&iommu->register_lock, flags);
1317         pmen = readl(iommu->reg + DMAR_PMEN_REG);
1318         pmen &= ~DMA_PMEN_EPM;
1319         writel(pmen, iommu->reg + DMAR_PMEN_REG);
1320
1321         /* wait for the protected region status bit to clear */
1322         IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
1323                 readl, !(pmen & DMA_PMEN_PRS), pmen);
1324
1325         raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1326 }
1327
1328 static void iommu_enable_translation(struct intel_iommu *iommu)
1329 {
1330         u32 sts;
1331         unsigned long flags;
1332
1333         raw_spin_lock_irqsave(&iommu->register_lock, flags);
1334         iommu->gcmd |= DMA_GCMD_TE;
1335         writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1336
1337         /* Make sure hardware completes it */
1338         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1339                       readl, (sts & DMA_GSTS_TES), sts);
1340
1341         raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1342 }
1343
1344 static void iommu_disable_translation(struct intel_iommu *iommu)
1345 {
1346         u32 sts;
1347         unsigned long flag;
1348
1349         if (iommu_skip_te_disable && iommu->drhd->gfx_dedicated &&
1350             (cap_read_drain(iommu->cap) || cap_write_drain(iommu->cap)))
1351                 return;
1352
1353         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1354         iommu->gcmd &= ~DMA_GCMD_TE;
1355         writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1356
1357         /* Make sure hardware completes it */
1358         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1359                       readl, (!(sts & DMA_GSTS_TES)), sts);
1360
1361         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1362 }
1363
1364 static int iommu_init_domains(struct intel_iommu *iommu)
1365 {
1366         u32 ndomains;
1367
1368         ndomains = cap_ndoms(iommu->cap);
1369         pr_debug("%s: Number of Domains supported <%d>\n",
1370                  iommu->name, ndomains);
1371
1372         spin_lock_init(&iommu->lock);
1373
1374         iommu->domain_ids = bitmap_zalloc(ndomains, GFP_KERNEL);
1375         if (!iommu->domain_ids)
1376                 return -ENOMEM;
1377
1378         /*
1379          * If Caching mode is set, then invalid translations are tagged
1380          * with domain-id 0, hence we need to pre-allocate it. We also
1381          * use domain-id 0 as a marker for non-allocated domain-id, so
1382          * make sure it is not used for a real domain.
1383          */
1384         set_bit(0, iommu->domain_ids);
1385
1386         /*
1387          * VT-d spec rev 3.0 (section 6.2.3.1) requires that each pasid
1388          * entry for first-level or pass-through translation modes should
1389          * be programmed with a domain id different from those used for
1390          * second-level or nested translation. We reserve a domain id for
1391          * this purpose. This domain id is also used for identity domain
1392          * in legacy mode.
1393          */
1394         set_bit(FLPT_DEFAULT_DID, iommu->domain_ids);
1395
1396         return 0;
1397 }
1398
1399 static void disable_dmar_iommu(struct intel_iommu *iommu)
1400 {
1401         if (!iommu->domain_ids)
1402                 return;
1403
1404         /*
1405          * All iommu domains must have been detached from the devices,
1406          * hence there should be no domain IDs in use.
1407          */
1408         if (WARN_ON(bitmap_weight(iommu->domain_ids, cap_ndoms(iommu->cap))
1409                     > NUM_RESERVED_DID))
1410                 return;
1411
1412         if (iommu->gcmd & DMA_GCMD_TE)
1413                 iommu_disable_translation(iommu);
1414 }
1415
1416 static void free_dmar_iommu(struct intel_iommu *iommu)
1417 {
1418         if (iommu->domain_ids) {
1419                 bitmap_free(iommu->domain_ids);
1420                 iommu->domain_ids = NULL;
1421         }
1422
1423         if (iommu->copied_tables) {
1424                 bitmap_free(iommu->copied_tables);
1425                 iommu->copied_tables = NULL;
1426         }
1427
1428         /* free context mapping */
1429         free_context_table(iommu);
1430
1431 #ifdef CONFIG_INTEL_IOMMU_SVM
1432         if (pasid_supported(iommu)) {
1433                 if (ecap_prs(iommu->ecap))
1434                         intel_svm_finish_prq(iommu);
1435         }
1436 #endif
1437 }
1438
1439 /*
1440  * Check and return whether first level is used by default for
1441  * DMA translation.
1442  */
1443 static bool first_level_by_default(unsigned int type)
1444 {
1445         /* Only SL is available in legacy mode */
1446         if (!scalable_mode_support())
1447                 return false;
1448
1449         /* Only one level (either FL or SL) is available, just use it */
1450         if (intel_cap_flts_sanity() ^ intel_cap_slts_sanity())
1451                 return intel_cap_flts_sanity();
1452
1453         /* Both levels are available, decide it based on domain type */
1454         return type != IOMMU_DOMAIN_UNMANAGED;
1455 }
1456
1457 static struct dmar_domain *alloc_domain(unsigned int type)
1458 {
1459         struct dmar_domain *domain;
1460
1461         domain = kzalloc(sizeof(*domain), GFP_KERNEL);
1462         if (!domain)
1463                 return NULL;
1464
1465         domain->nid = NUMA_NO_NODE;
1466         if (first_level_by_default(type))
1467                 domain->use_first_level = true;
1468         INIT_LIST_HEAD(&domain->devices);
1469         INIT_LIST_HEAD(&domain->dev_pasids);
1470         INIT_LIST_HEAD(&domain->cache_tags);
1471         spin_lock_init(&domain->lock);
1472         spin_lock_init(&domain->cache_lock);
1473         xa_init(&domain->iommu_array);
1474
1475         return domain;
1476 }
1477
1478 int domain_attach_iommu(struct dmar_domain *domain, struct intel_iommu *iommu)
1479 {
1480         struct iommu_domain_info *info, *curr;
1481         unsigned long ndomains;
1482         int num, ret = -ENOSPC;
1483
1484         if (domain->domain.type == IOMMU_DOMAIN_SVA)
1485                 return 0;
1486
1487         info = kzalloc(sizeof(*info), GFP_KERNEL);
1488         if (!info)
1489                 return -ENOMEM;
1490
1491         spin_lock(&iommu->lock);
1492         curr = xa_load(&domain->iommu_array, iommu->seq_id);
1493         if (curr) {
1494                 curr->refcnt++;
1495                 spin_unlock(&iommu->lock);
1496                 kfree(info);
1497                 return 0;
1498         }
1499
1500         ndomains = cap_ndoms(iommu->cap);
1501         num = find_first_zero_bit(iommu->domain_ids, ndomains);
1502         if (num >= ndomains) {
1503                 pr_err("%s: No free domain ids\n", iommu->name);
1504                 goto err_unlock;
1505         }
1506
1507         set_bit(num, iommu->domain_ids);
1508         info->refcnt    = 1;
1509         info->did       = num;
1510         info->iommu     = iommu;
1511         curr = xa_cmpxchg(&domain->iommu_array, iommu->seq_id,
1512                           NULL, info, GFP_ATOMIC);
1513         if (curr) {
1514                 ret = xa_err(curr) ? : -EBUSY;
1515                 goto err_clear;
1516         }
1517         domain_update_iommu_cap(domain);
1518
1519         spin_unlock(&iommu->lock);
1520         return 0;
1521
1522 err_clear:
1523         clear_bit(info->did, iommu->domain_ids);
1524 err_unlock:
1525         spin_unlock(&iommu->lock);
1526         kfree(info);
1527         return ret;
1528 }
1529
1530 void domain_detach_iommu(struct dmar_domain *domain, struct intel_iommu *iommu)
1531 {
1532         struct iommu_domain_info *info;
1533
1534         if (domain->domain.type == IOMMU_DOMAIN_SVA)
1535                 return;
1536
1537         spin_lock(&iommu->lock);
1538         info = xa_load(&domain->iommu_array, iommu->seq_id);
1539         if (--info->refcnt == 0) {
1540                 clear_bit(info->did, iommu->domain_ids);
1541                 xa_erase(&domain->iommu_array, iommu->seq_id);
1542                 domain->nid = NUMA_NO_NODE;
1543                 domain_update_iommu_cap(domain);
1544                 kfree(info);
1545         }
1546         spin_unlock(&iommu->lock);
1547 }
1548
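/*
 * Round a guest address width up to the next adjusted width of the form
 * 12 + 9 * n (30, 39, 48, 57, ...), capped at 64. For example, gaw = 40
 * gives r = (40 - 12) % 9 = 1, so agaw = 40 + 9 - 1 = 48.
 */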
1549 static int guestwidth_to_adjustwidth(int gaw)
1550 {
1551         int agaw;
1552         int r = (gaw - 12) % 9;
1553
1554         if (r == 0)
1555                 agaw = gaw;
1556         else
1557                 agaw = gaw + 9 - r;
1558         if (agaw > 64)
1559                 agaw = 64;
1560         return agaw;
1561 }
1562
1563 static void domain_exit(struct dmar_domain *domain)
1564 {
1565         if (domain->pgd) {
1566                 LIST_HEAD(freelist);
1567
1568                 domain_unmap(domain, 0, DOMAIN_MAX_PFN(domain->gaw), &freelist);
1569                 iommu_put_pages_list(&freelist);
1570         }
1571
1572         if (WARN_ON(!list_empty(&domain->devices)))
1573                 return;
1574
1575         kfree(domain->qi_batch);
1576         kfree(domain);
1577 }
1578
1579 /*
1580  * In a kdump kernel, old valid entries may still be cached because
1581  * of in-flight DMA against the copied page tables, and nothing has
1582  * ever unmapped them, so the newly mapped device needs an explicit
1583  * cache flush. At this point the device is expected to have already
1584  * completed reset during its driver's probe, so no in-flight DMA
1585  * remains and nothing more needs to be done for this device
1586  * hereafter.
1587  */
1588 static void copied_context_tear_down(struct intel_iommu *iommu,
1589                                      struct context_entry *context,
1590                                      u8 bus, u8 devfn)
1591 {
1592         u16 did_old;
1593
1594         if (!context_copied(iommu, bus, devfn))
1595                 return;
1596
1597         assert_spin_locked(&iommu->lock);
1598
1599         did_old = context_domain_id(context);
1600         context_clear_entry(context);
1601
1602         if (did_old < cap_ndoms(iommu->cap)) {
1603                 iommu->flush.flush_context(iommu, did_old,
1604                                            (((u16)bus) << 8) | devfn,
1605                                            DMA_CCMD_MASK_NOBIT,
1606                                            DMA_CCMD_DEVICE_INVL);
1607                 iommu->flush.flush_iotlb(iommu, did_old, 0, 0,
1608                                          DMA_TLB_DSI_FLUSH);
1609         }
1610
1611         clear_context_copied(iommu, bus, devfn);
1612 }
1613
1614 /*
1615  * It's a non-present to present mapping. If hardware doesn't cache
1616  * non-present entries, we only need to flush the write buffer. If it
1617  * _does_ cache non-present entries, then it does so in the special
1618  * domain #0, which we have to flush:
1619  */
1620 static void context_present_cache_flush(struct intel_iommu *iommu, u16 did,
1621                                         u8 bus, u8 devfn)
1622 {
1623         if (cap_caching_mode(iommu->cap)) {
1624                 iommu->flush.flush_context(iommu, 0,
1625                                            (((u16)bus) << 8) | devfn,
1626                                            DMA_CCMD_MASK_NOBIT,
1627                                            DMA_CCMD_DEVICE_INVL);
1628                 iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH);
1629         } else {
1630                 iommu_flush_write_buffer(iommu);
1631         }
1632 }
1633
1634 static int domain_context_mapping_one(struct dmar_domain *domain,
1635                                       struct intel_iommu *iommu,
1636                                       u8 bus, u8 devfn)
1637 {
1638         struct device_domain_info *info =
1639                         domain_lookup_dev_info(domain, iommu, bus, devfn);
1640         u16 did = domain_id_iommu(domain, iommu);
1641         int translation = CONTEXT_TT_MULTI_LEVEL;
1642         struct dma_pte *pgd = domain->pgd;
1643         struct context_entry *context;
1644         int agaw, ret;
1645
1646         pr_debug("Set context mapping for %02x:%02x.%d\n",
1647                 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1648
1649         spin_lock(&iommu->lock);
1650         ret = -ENOMEM;
1651         context = iommu_context_addr(iommu, bus, devfn, 1);
1652         if (!context)
1653                 goto out_unlock;
1654
1655         ret = 0;
1656         if (context_present(context) && !context_copied(iommu, bus, devfn))
1657                 goto out_unlock;
1658
1659         copied_context_tear_down(iommu, context, bus, devfn);
1660         context_clear_entry(context);
1661
1662         context_set_domain_id(context, did);
1663
1664         /*
1665          * Skip the top levels of the page table for an IOMMU with a
1666          * smaller agaw than the domain's. Unnecessary for PT mode.
1667          */
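        /*
         * For example, if the domain's page table has one more level than
         * this IOMMU can walk, step down one level so the context entry
         * points at a table of the supported depth.
         */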
1668         for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) {
1669                 ret = -ENOMEM;
1670                 pgd = phys_to_virt(dma_pte_addr(pgd));
1671                 if (!dma_pte_present(pgd))
1672                         goto out_unlock;
1673         }
1674
1675         if (info && info->ats_supported)
1676                 translation = CONTEXT_TT_DEV_IOTLB;
1677         else
1678                 translation = CONTEXT_TT_MULTI_LEVEL;
1679
1680         context_set_address_root(context, virt_to_phys(pgd));
1681         context_set_address_width(context, agaw);
1682         context_set_translation_type(context, translation);
1683         context_set_fault_enable(context);
1684         context_set_present(context);
1685         if (!ecap_coherent(iommu->ecap))
1686                 clflush_cache_range(context, sizeof(*context));
1687         context_present_cache_flush(iommu, did, bus, devfn);
1688         ret = 0;
1689
1690 out_unlock:
1691         spin_unlock(&iommu->lock);
1692
1693         return ret;
1694 }
1695
1696 static int domain_context_mapping_cb(struct pci_dev *pdev,
1697                                      u16 alias, void *opaque)
1698 {
1699         struct device_domain_info *info = dev_iommu_priv_get(&pdev->dev);
1700         struct intel_iommu *iommu = info->iommu;
1701         struct dmar_domain *domain = opaque;
1702
1703         return domain_context_mapping_one(domain, iommu,
1704                                           PCI_BUS_NUM(alias), alias & 0xff);
1705 }
1706
1707 static int
1708 domain_context_mapping(struct dmar_domain *domain, struct device *dev)
1709 {
1710         struct device_domain_info *info = dev_iommu_priv_get(dev);
1711         struct intel_iommu *iommu = info->iommu;
1712         u8 bus = info->bus, devfn = info->devfn;
1713
1714         if (!dev_is_pci(dev))
1715                 return domain_context_mapping_one(domain, iommu, bus, devfn);
1716
1717         return pci_for_each_dma_alias(to_pci_dev(dev),
1718                                       domain_context_mapping_cb, domain);
1719 }
1720
1721 /* Return the largest possible superpage level for a given mapping */
1722 static int hardware_largepage_caps(struct dmar_domain *domain, unsigned long iov_pfn,
1723                                    unsigned long phy_pfn, unsigned long pages)
1724 {
1725         int support, level = 1;
1726         unsigned long pfnmerge;
1727
1728         support = domain->iommu_superpage;
1729
1730         /* To use a large page, the virtual *and* physical addresses
1731            must be aligned to 2MiB/1GiB/etc. Lower bits set in either
1732            of them will mean we have to use smaller pages. So just
1733            merge them and check both at once. */
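        /*
         * Example: with the 9-bit VTD stride, a 2MiB (level 2) superpage
         * needs the low 9 bits of both PFNs clear and at least 512 pages
         * remaining; each further level shifts by another 9 bits.
         */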
1734         pfnmerge = iov_pfn | phy_pfn;
1735
1736         while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) {
1737                 pages >>= VTD_STRIDE_SHIFT;
1738                 if (!pages)
1739                         break;
1740                 pfnmerge >>= VTD_STRIDE_SHIFT;
1741                 level++;
1742                 support--;
1743         }
1744         return level;
1745 }
1746
1747 /*
1748  * Ensure that old small page tables are removed to make room for superpage(s).
1749  * We're going to add new large pages, so make sure we don't remove their parent
1750  * tables. The IOTLB/devTLBs should be flushed if any PDE/PTEs are cleared.
1751  */
1752 static void switch_to_super_page(struct dmar_domain *domain,
1753                                  unsigned long start_pfn,
1754                                  unsigned long end_pfn, int level)
1755 {
1756         unsigned long lvl_pages = lvl_to_nr_pages(level);
1757         struct dma_pte *pte = NULL;
1758
1759         while (start_pfn <= end_pfn) {
1760                 if (!pte)
1761                         pte = pfn_to_dma_pte(domain, start_pfn, &level,
1762                                              GFP_ATOMIC);
1763
1764                 if (dma_pte_present(pte)) {
1765                         dma_pte_free_pagetable(domain, start_pfn,
1766                                                start_pfn + lvl_pages - 1,
1767                                                level + 1);
1768
1769                         cache_tag_flush_range(domain, start_pfn << VTD_PAGE_SHIFT,
1770                                               end_pfn << VTD_PAGE_SHIFT, 0);
1771                 }
1772
1773                 pte++;
1774                 start_pfn += lvl_pages;
1775                 if (first_pte_in_page(pte))
1776                         pte = NULL;
1777         }
1778 }
1779
1780 static int
1781 __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
1782                  unsigned long phys_pfn, unsigned long nr_pages, int prot,
1783                  gfp_t gfp)
1784 {
1785         struct dma_pte *first_pte = NULL, *pte = NULL;
1786         unsigned int largepage_lvl = 0;
1787         unsigned long lvl_pages = 0;
1788         phys_addr_t pteval;
1789         u64 attr;
1790
1791         if (unlikely(!domain_pfn_supported(domain, iov_pfn + nr_pages - 1)))
1792                 return -EINVAL;
1793
1794         if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
1795                 return -EINVAL;
1796
1797         if (!(prot & DMA_PTE_WRITE) && domain->nested_parent) {
1798                 pr_err_ratelimited("Read-only mapping is disallowed on the domain which serves as the parent in a nested configuration, due to HW errata (ERRATA_772415_SPR17)\n");
1799                 return -EINVAL;
1800         }
1801
1802         attr = prot & (DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP);
1803         attr |= DMA_FL_PTE_PRESENT;
1804         if (domain->use_first_level) {
1805                 attr |= DMA_FL_PTE_US | DMA_FL_PTE_ACCESS;
1806                 if (prot & DMA_PTE_WRITE)
1807                         attr |= DMA_FL_PTE_DIRTY;
1808         }
1809
1810         domain->has_mappings = true;
1811
1812         pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | attr;
1813
1814         while (nr_pages > 0) {
1815                 uint64_t tmp;
1816
1817                 if (!pte) {
1818                         largepage_lvl = hardware_largepage_caps(domain, iov_pfn,
1819                                         phys_pfn, nr_pages);
1820
1821                         pte = pfn_to_dma_pte(domain, iov_pfn, &largepage_lvl,
1822                                              gfp);
1823                         if (!pte)
1824                                 return -ENOMEM;
1825                         first_pte = pte;
1826
1827                         lvl_pages = lvl_to_nr_pages(largepage_lvl);
1828
1829                         /* It is a large page */
1830                         if (largepage_lvl > 1) {
1831                                 unsigned long end_pfn;
1832                                 unsigned long pages_to_remove;
1833
1834                                 pteval |= DMA_PTE_LARGE_PAGE;
1835                                 pages_to_remove = min_t(unsigned long, nr_pages,
1836                                                         nr_pte_to_next_page(pte) * lvl_pages);
1837                                 end_pfn = iov_pfn + pages_to_remove - 1;
1838                                 switch_to_super_page(domain, iov_pfn, end_pfn, largepage_lvl);
1839                         } else {
1840                                 pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE;
1841                         }
1842
1843                 }
1844                 /* No lock is needed here; nobody else
1845                  * touches this IOVA range.
1846                  */
1847                 tmp = 0ULL;
1848                 if (!try_cmpxchg64_local(&pte->val, &tmp, pteval)) {
1849                         static int dumps = 5;
1850                         pr_crit("ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n",
1851                                 iov_pfn, tmp, (unsigned long long)pteval);
1852                         if (dumps) {
1853                                 dumps--;
1854                                 debug_dma_dump_mappings(NULL);
1855                         }
1856                         WARN_ON(1);
1857                 }
1858
1859                 nr_pages -= lvl_pages;
1860                 iov_pfn += lvl_pages;
1861                 phys_pfn += lvl_pages;
1862                 pteval += lvl_pages * VTD_PAGE_SIZE;
1863
1864                 /* If the next PTE would be the first in a new page, then we
1865                  * need to flush the cache on the entries we've just written.
1866                  * And then we'll need to recalculate 'pte', so clear it and
1867                  * let it get set again in the if (!pte) block above.
1868                  *
1869                  * If we're done (!nr_pages) we need to flush the cache too.
1870                  *
1871                  * Also if we've been setting superpages, we may need to
1872                  * recalculate 'pte' and switch back to smaller pages for the
1873                  * end of the mapping, if the trailing size is not enough to
1874                  * use another superpage (i.e. nr_pages < lvl_pages).
1875                  */
1876                 pte++;
1877                 if (!nr_pages || first_pte_in_page(pte) ||
1878                     (largepage_lvl > 1 && nr_pages < lvl_pages)) {
1879                         domain_flush_cache(domain, first_pte,
1880                                            (void *)pte - (void *)first_pte);
1881                         pte = NULL;
1882                 }
1883         }
1884
1885         return 0;
1886 }
1887
1888 static void domain_context_clear_one(struct device_domain_info *info, u8 bus, u8 devfn)
1889 {
1890         struct intel_iommu *iommu = info->iommu;
1891         struct context_entry *context;
1892         u16 did;
1893
1894         spin_lock(&iommu->lock);
1895         context = iommu_context_addr(iommu, bus, devfn, 0);
1896         if (!context) {
1897                 spin_unlock(&iommu->lock);
1898                 return;
1899         }
1900
1901         did = context_domain_id(context);
1902         context_clear_entry(context);
1903         __iommu_flush_cache(iommu, context, sizeof(*context));
1904         spin_unlock(&iommu->lock);
1905         intel_context_flush_present(info, context, did, true);
1906 }
1907
1908 static int domain_setup_first_level(struct intel_iommu *iommu,
1909                                     struct dmar_domain *domain,
1910                                     struct device *dev,
1911                                     u32 pasid)
1912 {
1913         struct dma_pte *pgd = domain->pgd;
1914         int agaw, level;
1915         int flags = 0;
1916
1917         /*
1918          * Skip the top levels of the page table for an IOMMU with a
1919          * smaller agaw than the domain's. Unnecessary for PT mode.
1920          */
1921         for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) {
1922                 pgd = phys_to_virt(dma_pte_addr(pgd));
1923                 if (!dma_pte_present(pgd))
1924                         return -ENOMEM;
1925         }
1926
1927         level = agaw_to_level(agaw);
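        /* First-level (first-stage) page tables are only defined for 4- or 5-level paging. */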
1928         if (level != 4 && level != 5)
1929                 return -EINVAL;
1930
1931         if (level == 5)
1932                 flags |= PASID_FLAG_FL5LP;
1933
1934         if (domain->force_snooping)
1935                 flags |= PASID_FLAG_PAGE_SNOOP;
1936
1937         return intel_pasid_setup_first_level(iommu, dev, (pgd_t *)pgd, pasid,
1938                                              domain_id_iommu(domain, iommu),
1939                                              flags);
1940 }
1941
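/*
 * Returns true when DMA for @dev is actually issued by a different PCI
 * device (pci_real_dma_dev() != the device itself). Callers skip
 * per-device context programming for such subdevices.
 */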
1942 static bool dev_is_real_dma_subdevice(struct device *dev)
1943 {
1944         return dev && dev_is_pci(dev) &&
1945                pci_real_dma_dev(to_pci_dev(dev)) != to_pci_dev(dev);
1946 }
1947
1948 static int dmar_domain_attach_device(struct dmar_domain *domain,
1949                                      struct device *dev)
1950 {
1951         struct device_domain_info *info = dev_iommu_priv_get(dev);
1952         struct intel_iommu *iommu = info->iommu;
1953         unsigned long flags;
1954         int ret;
1955
1956         ret = domain_attach_iommu(domain, iommu);
1957         if (ret)
1958                 return ret;
1959
1960         info->domain = domain;
1961         spin_lock_irqsave(&domain->lock, flags);
1962         list_add(&info->link, &domain->devices);
1963         spin_unlock_irqrestore(&domain->lock, flags);
1964
1965         if (dev_is_real_dma_subdevice(dev))
1966                 return 0;
1967
1968         if (!sm_supported(iommu))
1969                 ret = domain_context_mapping(domain, dev);
1970         else if (domain->use_first_level)
1971                 ret = domain_setup_first_level(iommu, domain, dev, IOMMU_NO_PASID);
1972         else
1973                 ret = intel_pasid_setup_second_level(iommu, domain, dev, IOMMU_NO_PASID);
1974
1975         if (ret)
1976                 goto out_block_translation;
1977
1978         iommu_enable_pci_caps(info);
1979
1980         ret = cache_tag_assign_domain(domain, dev, IOMMU_NO_PASID);
1981         if (ret)
1982                 goto out_block_translation;
1983
1984         return 0;
1985
1986 out_block_translation:
1987         device_block_translation(dev);
1988         return ret;
1989 }
1990
1991 /**
1992  * device_rmrr_is_relaxable - Test whether the RMRR of this device
1993  * is relaxable (i.e. is allowed to be left unenforced under some conditions)
1994  * @dev: device handle
1995  *
1996  * We assume that PCI USB devices with RMRRs have them largely
1997  * for historical reasons and that the RMRR space is not actively used post
1998  * boot.  This exclusion may change if vendors begin to abuse it.
1999  *
2000  * The same exception is made for graphics devices, with the requirement that
2001  * any use of the RMRR regions will be torn down before assigning the device
2002  * to a guest.
2003  *
2004  * Return: true if the RMRR is relaxable, false otherwise
2005  */
2006 static bool device_rmrr_is_relaxable(struct device *dev)
2007 {
2008         struct pci_dev *pdev;
2009
2010         if (!dev_is_pci(dev))
2011                 return false;
2012
2013         pdev = to_pci_dev(dev);
2014         if (IS_USB_DEVICE(pdev) || IS_GFX_DEVICE(pdev))
2015                 return true;
2016         else
2017                 return false;
2018 }
2019
2020 static int device_def_domain_type(struct device *dev)
2021 {
2022         struct device_domain_info *info = dev_iommu_priv_get(dev);
2023         struct intel_iommu *iommu = info->iommu;
2024
2025         /*
2026          * Hardware does not support the pass-through translation mode.
2027          * Always use a dynamic mapping domain.
2028          */
2029         if (!ecap_pass_through(iommu->ecap))
2030                 return IOMMU_DOMAIN_DMA;
2031
2032         if (dev_is_pci(dev)) {
2033                 struct pci_dev *pdev = to_pci_dev(dev);
2034
2035                 if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev))
2036                         return IOMMU_DOMAIN_IDENTITY;
2037         }
2038
2039         return 0;
2040 }
2041
2042 static void intel_iommu_init_qi(struct intel_iommu *iommu)
2043 {
2044         /*
2045          * Start from a sane IOMMU hardware state.
2046          * If queued invalidation has already been initialized by us
2047          * (for example, while enabling interrupt remapping), then
2048          * things are already rolling from a sane state.
2049          */
2050         if (!iommu->qi) {
2051                 /*
2052                  * Clear any previous faults.
2053                  */
2054                 dmar_fault(-1, iommu);
2055                 /*
2056                  * Disable queued invalidation if supported and already enabled
2057                  * before OS handover.
2058                  */
2059                 dmar_disable_qi(iommu);
2060         }
2061
2062         if (dmar_enable_qi(iommu)) {
2063                 /*
2064                  * Queued invalidation could not be enabled; use register-based invalidation
2065                  */
2066                 iommu->flush.flush_context = __iommu_flush_context;
2067                 iommu->flush.flush_iotlb = __iommu_flush_iotlb;
2068                 pr_info("%s: Using Register based invalidation\n",
2069                         iommu->name);
2070         } else {
2071                 iommu->flush.flush_context = qi_flush_context;
2072                 iommu->flush.flush_iotlb = qi_flush_iotlb;
2073                 pr_info("%s: Using Queued invalidation\n", iommu->name);
2074         }
2075 }
2076
2077 static int copy_context_table(struct intel_iommu *iommu,
2078                               struct root_entry *old_re,
2079                               struct context_entry **tbl,
2080                               int bus, bool ext)
2081 {
2082         int tbl_idx, pos = 0, idx, devfn, ret = 0, did;
2083         struct context_entry *new_ce = NULL, ce;
2084         struct context_entry *old_ce = NULL;
2085         struct root_entry re;
2086         phys_addr_t old_ce_phys;
2087
2088         tbl_idx = ext ? bus * 2 : bus;
2089         memcpy(&re, old_re, sizeof(re));
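        /*
         * In scalable (extended) mode each root entry points to two context
         * tables: the lower one (root_entry.lo) covers devfn 0x00-0x7f and
         * the upper one (root_entry.hi) covers devfn 0x80-0xff, and each
         * scalable-mode context entry is twice the legacy size, hence the
         * doubled bus and devfn indices below.
         */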
2090
2091         for (devfn = 0; devfn < 256; devfn++) {
2092                 /* First calculate the correct index */
2093                 idx = (ext ? devfn * 2 : devfn) % 256;
2094
2095                 if (idx == 0) {
2096                         /* First save what we may have and clean up */
2097                         if (new_ce) {
2098                                 tbl[tbl_idx] = new_ce;
2099                                 __iommu_flush_cache(iommu, new_ce,
2100                                                     VTD_PAGE_SIZE);
2101                                 pos = 1;
2102                         }
2103
2104                         if (old_ce)
2105                                 memunmap(old_ce);
2106
2107                         ret = 0;
2108                         if (devfn < 0x80)
2109                                 old_ce_phys = root_entry_lctp(&re);
2110                         else
2111                                 old_ce_phys = root_entry_uctp(&re);
2112
2113                         if (!old_ce_phys) {
2114                                 if (ext && devfn == 0) {
2115                                         /* No LCTP, try UCTP */
2116                                         devfn = 0x7f;
2117                                         continue;
2118                                 } else {
2119                                         goto out;
2120                                 }
2121                         }
2122
2123                         ret = -ENOMEM;
2124                         old_ce = memremap(old_ce_phys, PAGE_SIZE,
2125                                         MEMREMAP_WB);
2126                         if (!old_ce)
2127                                 goto out;
2128
2129                         new_ce = iommu_alloc_page_node(iommu->node, GFP_KERNEL);
2130                         if (!new_ce)
2131                                 goto out_unmap;
2132
2133                         ret = 0;
2134                 }
2135
2136                 /* Now copy the context entry */
2137                 memcpy(&ce, old_ce + idx, sizeof(ce));
2138
2139                 if (!context_present(&ce))
2140                         continue;
2141
2142                 did = context_domain_id(&ce);
2143                 if (did >= 0 && did < cap_ndoms(iommu->cap))
2144                         set_bit(did, iommu->domain_ids);
2145
2146                 set_context_copied(iommu, bus, devfn);
2147                 new_ce[idx] = ce;
2148         }
2149
2150         tbl[tbl_idx + pos] = new_ce;
2151
2152         __iommu_flush_cache(iommu, new_ce, VTD_PAGE_SIZE);
2153
2154 out_unmap:
2155         memunmap(old_ce);
2156
2157 out:
2158         return ret;
2159 }
2160
2161 static int copy_translation_tables(struct intel_iommu *iommu)
2162 {
2163         struct context_entry **ctxt_tbls;
2164         struct root_entry *old_rt;
2165         phys_addr_t old_rt_phys;
2166         int ctxt_table_entries;
2167         u64 rtaddr_reg;
2168         int bus, ret;
2169         bool new_ext, ext;
2170
2171         rtaddr_reg = dmar_readq(iommu->reg + DMAR_RTADDR_REG);
2172         ext        = !!(rtaddr_reg & DMA_RTADDR_SMT);
2173         new_ext    = !!sm_supported(iommu);
2174
2175         /*
2176          * The RTT bit can only be changed when translation is disabled,
2177          * but disabling translation would open a window for data
2178          * corruption. So bail out and don't copy anything if we would
2179          * have to change the bit.
2180          */
2181         if (new_ext != ext)
2182                 return -EINVAL;
2183
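        /* One bit per possible 16-bit source-id (bus << 8 | devfn). */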
2184         iommu->copied_tables = bitmap_zalloc(BIT_ULL(16), GFP_KERNEL);
2185         if (!iommu->copied_tables)
2186                 return -ENOMEM;
2187
2188         old_rt_phys = rtaddr_reg & VTD_PAGE_MASK;
2189         if (!old_rt_phys)
2190                 return -EINVAL;
2191
2192         old_rt = memremap(old_rt_phys, PAGE_SIZE, MEMREMAP_WB);
2193         if (!old_rt)
2194                 return -ENOMEM;
2195
2196         /* This is too big for the stack - allocate it from slab */
2197         ctxt_table_entries = ext ? 512 : 256;
2198         ret = -ENOMEM;
2199         ctxt_tbls = kcalloc(ctxt_table_entries, sizeof(void *), GFP_KERNEL);
2200         if (!ctxt_tbls)
2201                 goto out_unmap;
2202
2203         for (bus = 0; bus < 256; bus++) {
2204                 ret = copy_context_table(iommu, &old_rt[bus],
2205                                          ctxt_tbls, bus, ext);
2206                 if (ret) {
2207                         pr_err("%s: Failed to copy context table for bus %d\n",
2208                                 iommu->name, bus);
2209                         continue;
2210                 }
2211         }
2212
2213         spin_lock(&iommu->lock);
2214
2215         /* Context tables are copied, now write them to the root_entry table */
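        /* Bit 0 of the root entry pointer is its present bit. */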
2216         for (bus = 0; bus < 256; bus++) {
2217                 int idx = ext ? bus * 2 : bus;
2218                 u64 val;
2219
2220                 if (ctxt_tbls[idx]) {
2221                         val = virt_to_phys(ctxt_tbls[idx]) | 1;
2222                         iommu->root_entry[bus].lo = val;
2223                 }
2224
2225                 if (!ext || !ctxt_tbls[idx + 1])
2226                         continue;
2227
2228                 val = virt_to_phys(ctxt_tbls[idx + 1]) | 1;
2229                 iommu->root_entry[bus].hi = val;
2230         }
2231
2232         spin_unlock(&iommu->lock);
2233
2234         kfree(ctxt_tbls);
2235
2236         __iommu_flush_cache(iommu, iommu->root_entry, PAGE_SIZE);
2237
2238         ret = 0;
2239
2240 out_unmap:
2241         memunmap(old_rt);
2242
2243         return ret;
2244 }
2245
2246 static int __init init_dmars(void)
2247 {
2248         struct dmar_drhd_unit *drhd;
2249         struct intel_iommu *iommu;
2250         int ret;
2251
2252         ret = intel_cap_audit(CAP_AUDIT_STATIC_DMAR, NULL);
2253         if (ret)
2254                 goto free_iommu;
2255
2256         for_each_iommu(iommu, drhd) {
2257                 if (drhd->ignored) {
2258                         iommu_disable_translation(iommu);
2259                         continue;
2260                 }
2261
2262                 /*
2263                  * Find the max PASID size of all IOMMUs in the system.
2264                  * We need to ensure the system PASID table is no bigger
2265                  * than the smallest supported size.
2266                  */
2267                 if (pasid_supported(iommu)) {
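                        /*
                         * ecap_pss() encodes the supported PASID width minus
                         * one, so this IOMMU supports 2 << pss PASIDs.
                         */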
2268                         u32 temp = 2 << ecap_pss(iommu->ecap);
2269
2270                         intel_pasid_max_id = min_t(u32, temp,
2271                                                    intel_pasid_max_id);
2272                 }
2273
2274                 intel_iommu_init_qi(iommu);
2275
2276                 ret = iommu_init_domains(iommu);
2277                 if (ret)
2278                         goto free_iommu;
2279
2280                 init_translation_status(iommu);
2281
2282                 if (translation_pre_enabled(iommu) && !is_kdump_kernel()) {
2283                         iommu_disable_translation(iommu);
2284                         clear_translation_pre_enabled(iommu);
2285                         pr_warn("Translation was enabled for %s but we are not in kdump mode\n",
2286                                 iommu->name);
2287                 }
2288
2289                 /*
2290                  * TBD:
2291          * we could share the same root & context tables
2292          * among all IOMMUs. Needs to be split out later.
2293                  */
2294                 ret = iommu_alloc_root_entry(iommu);
2295                 if (ret)
2296                         goto free_iommu;
2297
2298                 if (translation_pre_enabled(iommu)) {
2299                         pr_info("Translation already enabled - trying to copy translation structures\n");
2300
2301                         ret = copy_translation_tables(iommu);
2302                         if (ret) {
2303                                 /*
2304                                  * We found the IOMMU with translation
2305                                  * enabled - but failed to copy over the
2306                                  * old root-entry table. Try to proceed
2307                                  * by disabling translation now and
2308                                  * allocating a clean root-entry table.
2309                                  * This might cause DMAR faults, but
2310                                  * probably the dump will still succeed.
2311                                  */
2312                                 pr_err("Failed to copy translation tables from previous kernel for %s\n",
2313                                        iommu->name);
2314                                 iommu_disable_translation(iommu);
2315                                 clear_translation_pre_enabled(iommu);
2316                         } else {
2317                                 pr_info("Copied translation tables from previous kernel for %s\n",
2318                                         iommu->name);
2319                         }
2320                 }
2321
2322                 intel_svm_check(iommu);
2323         }
2324
2325         /*
2326          * Now that qi is enabled on all iommus, set the root entry and flush
2327          * caches. This is required on some Intel X58 chipsets, otherwise the
2328          * flush_context function will loop forever and the boot hangs.
2329          */
2330         for_each_active_iommu(iommu, drhd) {
2331                 iommu_flush_write_buffer(iommu);
2332                 iommu_set_root_entry(iommu);
2333         }
2334
2335         check_tylersburg_isoch();
2336
2337         /*
2338          * for each drhd
2339          *   enable fault log
2340          *   global invalidate context cache
2341          *   global invalidate iotlb
2342          *   enable translation
2343          */
2344         for_each_iommu(iommu, drhd) {
2345                 if (drhd->ignored) {
2346                         /*
2347                          * we always have to disable PMRs or DMA may fail on
2348                          * this device
2349                          */
2350                         if (force_on)
2351                                 iommu_disable_protect_mem_regions(iommu);
2352                         continue;
2353                 }
2354
2355                 iommu_flush_write_buffer(iommu);
2356
2357 #ifdef CONFIG_INTEL_IOMMU_SVM
2358                 if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
2359                         /*
2360                          * Calling dmar_alloc_hwirq() with dmar_global_lock held
2361                          * could cause a locking race, so temporarily drop the lock.
2362                          */
2363                         up_write(&dmar_global_lock);
2364                         ret = intel_svm_enable_prq(iommu);
2365                         down_write(&dmar_global_lock);
2366                         if (ret)
2367                                 goto free_iommu;
2368                 }
2369 #endif
2370                 ret = dmar_set_interrupt(iommu);
2371                 if (ret)
2372                         goto free_iommu;
2373         }
2374
2375         return 0;
2376
2377 free_iommu:
2378         for_each_active_iommu(iommu, drhd) {
2379                 disable_dmar_iommu(iommu);
2380                 free_dmar_iommu(iommu);
2381         }
2382
2383         return ret;
2384 }
2385
2386 static void __init init_no_remapping_devices(void)
2387 {
2388         struct dmar_drhd_unit *drhd;
2389         struct device *dev;
2390         int i;
2391
2392         for_each_drhd_unit(drhd) {
2393                 if (!drhd->include_all) {
2394                         for_each_active_dev_scope(drhd->devices,
2395                                                   drhd->devices_cnt, i, dev)
2396                                 break;
2397                         /* ignore DMAR unit if no devices exist */
2398                         if (i == drhd->devices_cnt)
2399                                 drhd->ignored = 1;
2400                 }
2401         }
2402
2403         for_each_active_drhd_unit(drhd) {
2404                 if (drhd->include_all)
2405                         continue;
2406
2407                 for_each_active_dev_scope(drhd->devices,
2408                                           drhd->devices_cnt, i, dev)
2409                         if (!dev_is_pci(dev) || !IS_GFX_DEVICE(to_pci_dev(dev)))
2410                                 break;
2411                 if (i < drhd->devices_cnt)
2412                         continue;
2413
2414                 /* This IOMMU has *only* gfx devices. Either bypass it or
2415                    mark it as gfx-dedicated, as appropriate. */
2416                 drhd->gfx_dedicated = 1;
2417                 if (disable_igfx_iommu)
2418                         drhd->ignored = 1;
2419         }
2420 }
2421
2422 #ifdef CONFIG_SUSPEND
2423 static int init_iommu_hw(void)
2424 {
2425         struct dmar_drhd_unit *drhd;
2426         struct intel_iommu *iommu = NULL;
2427         int ret;
2428
2429         for_each_active_iommu(iommu, drhd) {
2430                 if (iommu->qi) {
2431                         ret = dmar_reenable_qi(iommu);
2432                         if (ret)
2433                                 return ret;
2434                 }
2435         }
2436
2437         for_each_iommu(iommu, drhd) {
2438                 if (drhd->ignored) {
2439                         /*
2440                          * we always have to disable PMRs or DMA may fail on
2441                          * this device
2442                          */
2443                         if (force_on)
2444                                 iommu_disable_protect_mem_regions(iommu);
2445                         continue;
2446                 }
2447
2448                 iommu_flush_write_buffer(iommu);
2449                 iommu_set_root_entry(iommu);
2450                 iommu_enable_translation(iommu);
2451                 iommu_disable_protect_mem_regions(iommu);
2452         }
2453
2454         return 0;
2455 }
2456
2457 static void iommu_flush_all(void)
2458 {
2459         struct dmar_drhd_unit *drhd;
2460         struct intel_iommu *iommu;
2461
2462         for_each_active_iommu(iommu, drhd) {
2463                 iommu->flush.flush_context(iommu, 0, 0, 0,
2464                                            DMA_CCMD_GLOBAL_INVL);
2465                 iommu->flush.flush_iotlb(iommu, 0, 0, 0,
2466                                          DMA_TLB_GLOBAL_FLUSH);
2467         }
2468 }
2469
2470 static int iommu_suspend(void)
2471 {
2472         struct dmar_drhd_unit *drhd;
2473         struct intel_iommu *iommu = NULL;
2474         unsigned long flag;
2475
2476         iommu_flush_all();
2477
2478         for_each_active_iommu(iommu, drhd) {
2479                 iommu_disable_translation(iommu);
2480
2481                 raw_spin_lock_irqsave(&iommu->register_lock, flag);
2482
2483                 iommu->iommu_state[SR_DMAR_FECTL_REG] =
2484                         readl(iommu->reg + DMAR_FECTL_REG);
2485                 iommu->iommu_state[SR_DMAR_FEDATA_REG] =
2486                         readl(iommu->reg + DMAR_FEDATA_REG);
2487                 iommu->iommu_state[SR_DMAR_FEADDR_REG] =
2488                         readl(iommu->reg + DMAR_FEADDR_REG);
2489                 iommu->iommu_state[SR_DMAR_FEUADDR_REG] =
2490                         readl(iommu->reg + DMAR_FEUADDR_REG);
2491
2492                 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
2493         }
2494         return 0;
2495 }
2496
2497 static void iommu_resume(void)
2498 {
2499         struct dmar_drhd_unit *drhd;
2500         struct intel_iommu *iommu = NULL;
2501         unsigned long flag;
2502
2503         if (init_iommu_hw()) {
2504                 if (force_on)
2505                         panic("tboot: IOMMU setup failed, DMAR can not resume!\n");
2506                 else
2507                         WARN(1, "IOMMU setup failed, DMAR can not resume!\n");
2508                 return;
2509         }
2510
2511         for_each_active_iommu(iommu, drhd) {
2512
2513                 raw_spin_lock_irqsave(&iommu->register_lock, flag);
2514
2515                 writel(iommu->iommu_state[SR_DMAR_FECTL_REG],
2516                         iommu->reg + DMAR_FECTL_REG);
2517                 writel(iommu->iommu_state[SR_DMAR_FEDATA_REG],
2518                         iommu->reg + DMAR_FEDATA_REG);
2519                 writel(iommu->iommu_state[SR_DMAR_FEADDR_REG],
2520                         iommu->reg + DMAR_FEADDR_REG);
2521                 writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG],
2522                         iommu->reg + DMAR_FEUADDR_REG);
2523
2524                 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
2525         }
2526 }
2527
2528 static struct syscore_ops iommu_syscore_ops = {
2529         .resume         = iommu_resume,
2530         .suspend        = iommu_suspend,
2531 };
2532
2533 static void __init init_iommu_pm_ops(void)
2534 {
2535         register_syscore_ops(&iommu_syscore_ops);
2536 }
2537
2538 #else
2539 static inline void init_iommu_pm_ops(void) {}
2540 #endif  /* CONFIG_SUSPEND */
2541
2542 static int __init rmrr_sanity_check(struct acpi_dmar_reserved_memory *rmrr)
2543 {
2544         if (!IS_ALIGNED(rmrr->base_address, PAGE_SIZE) ||
2545             !IS_ALIGNED(rmrr->end_address + 1, PAGE_SIZE) ||
2546             rmrr->end_address <= rmrr->base_address ||
2547             arch_rmrr_sanity_check(rmrr))
2548                 return -EINVAL;
2549
2550         return 0;
2551 }
2552
2553 int __init dmar_parse_one_rmrr(struct acpi_dmar_header *header, void *arg)
2554 {
2555         struct acpi_dmar_reserved_memory *rmrr;
2556         struct dmar_rmrr_unit *rmrru;
2557
2558         rmrr = (struct acpi_dmar_reserved_memory *)header;
2559         if (rmrr_sanity_check(rmrr)) {
2560                 pr_warn(FW_BUG
2561                            "Your BIOS is broken; bad RMRR [%#018Lx-%#018Lx]\n"
2562                            "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
2563                            rmrr->base_address, rmrr->end_address,
2564                            dmi_get_system_info(DMI_BIOS_VENDOR),
2565                            dmi_get_system_info(DMI_BIOS_VERSION),
2566                            dmi_get_system_info(DMI_PRODUCT_VERSION));
2567                 add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
2568         }
2569
2570         rmrru = kzalloc(sizeof(*rmrru), GFP_KERNEL);
2571         if (!rmrru)
2572                 goto out;
2573
2574         rmrru->hdr = header;
2575
2576         rmrru->base_address = rmrr->base_address;
2577         rmrru->end_address = rmrr->end_address;
2578
2579         rmrru->devices = dmar_alloc_dev_scope((void *)(rmrr + 1),
2580                                 ((void *)rmrr) + rmrr->header.length,
2581                                 &rmrru->devices_cnt);
2582         if (rmrru->devices_cnt && rmrru->devices == NULL)
2583                 goto free_rmrru;
2584
2585         list_add(&rmrru->list, &dmar_rmrr_units);
2586
2587         return 0;
2588 free_rmrru:
2589         kfree(rmrru);
2590 out:
2591         return -ENOMEM;
2592 }
2593
2594 static struct dmar_atsr_unit *dmar_find_atsr(struct acpi_dmar_atsr *atsr)
2595 {
2596         struct dmar_atsr_unit *atsru;
2597         struct acpi_dmar_atsr *tmp;
2598
2599         list_for_each_entry_rcu(atsru, &dmar_atsr_units, list,
2600                                 dmar_rcu_check()) {
2601                 tmp = (struct acpi_dmar_atsr *)atsru->hdr;
2602                 if (atsr->segment != tmp->segment)
2603                         continue;
2604                 if (atsr->header.length != tmp->header.length)
2605                         continue;
2606                 if (memcmp(atsr, tmp, atsr->header.length) == 0)
2607                         return atsru;
2608         }
2609
2610         return NULL;
2611 }
2612
2613 int dmar_parse_one_atsr(struct acpi_dmar_header *hdr, void *arg)
2614 {
2615         struct acpi_dmar_atsr *atsr;
2616         struct dmar_atsr_unit *atsru;
2617
2618         if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled)
2619                 return 0;
2620
2621         atsr = container_of(hdr, struct acpi_dmar_atsr, header);
2622         atsru = dmar_find_atsr(atsr);
2623         if (atsru)
2624                 return 0;
2625
2626         atsru = kzalloc(sizeof(*atsru) + hdr->length, GFP_KERNEL);
2627         if (!atsru)
2628                 return -ENOMEM;
2629
2630         /*
2631          * If memory is allocated from slab by ACPI _DSM method, we need to
2632          * copy the memory content because the memory buffer will be freed
2633          * on return.
2634          */
2635         atsru->hdr = (void *)(atsru + 1);
2636         memcpy(atsru->hdr, hdr, hdr->length);
2637         atsru->include_all = atsr->flags & 0x1;
2638         if (!atsru->include_all) {
2639                 atsru->devices = dmar_alloc_dev_scope((void *)(atsr + 1),
2640                                 (void *)atsr + atsr->header.length,
2641                                 &atsru->devices_cnt);
2642                 if (atsru->devices_cnt && atsru->devices == NULL) {
2643                         kfree(atsru);
2644                         return -ENOMEM;
2645                 }
2646         }
2647
2648         list_add_rcu(&atsru->list, &dmar_atsr_units);
2649
2650         return 0;
2651 }
2652
2653 static void intel_iommu_free_atsr(struct dmar_atsr_unit *atsru)
2654 {
2655         dmar_free_dev_scope(&atsru->devices, &atsru->devices_cnt);
2656         kfree(atsru);
2657 }
2658
2659 int dmar_release_one_atsr(struct acpi_dmar_header *hdr, void *arg)
2660 {
2661         struct acpi_dmar_atsr *atsr;
2662         struct dmar_atsr_unit *atsru;
2663
2664         atsr = container_of(hdr, struct acpi_dmar_atsr, header);
2665         atsru = dmar_find_atsr(atsr);
2666         if (atsru) {
2667                 list_del_rcu(&atsru->list);
2668                 synchronize_rcu();
2669                 intel_iommu_free_atsr(atsru);
2670         }
2671
2672         return 0;
2673 }
2674
2675 int dmar_check_one_atsr(struct acpi_dmar_header *hdr, void *arg)
2676 {
2677         int i;
2678         struct device *dev;
2679         struct acpi_dmar_atsr *atsr;
2680         struct dmar_atsr_unit *atsru;
2681
2682         atsr = container_of(hdr, struct acpi_dmar_atsr, header);
2683         atsru = dmar_find_atsr(atsr);
2684         if (!atsru)
2685                 return 0;
2686
2687         if (!atsru->include_all && atsru->devices && atsru->devices_cnt) {
2688                 for_each_active_dev_scope(atsru->devices, atsru->devices_cnt,
2689                                           i, dev)
2690                         return -EBUSY;
2691         }
2692
2693         return 0;
2694 }
2695
2696 static struct dmar_satc_unit *dmar_find_satc(struct acpi_dmar_satc *satc)
2697 {
2698         struct dmar_satc_unit *satcu;
2699         struct acpi_dmar_satc *tmp;
2700
2701         list_for_each_entry_rcu(satcu, &dmar_satc_units, list,
2702                                 dmar_rcu_check()) {
2703                 tmp = (struct acpi_dmar_satc *)satcu->hdr;
2704                 if (satc->segment != tmp->segment)
2705                         continue;
2706                 if (satc->header.length != tmp->header.length)
2707                         continue;
2708                 if (memcmp(satc, tmp, satc->header.length) == 0)
2709                         return satcu;
2710         }
2711
2712         return NULL;
2713 }
2714
2715 int dmar_parse_one_satc(struct acpi_dmar_header *hdr, void *arg)
2716 {
2717         struct acpi_dmar_satc *satc;
2718         struct dmar_satc_unit *satcu;
2719
2720         if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled)
2721                 return 0;
2722
2723         satc = container_of(hdr, struct acpi_dmar_satc, header);
2724         satcu = dmar_find_satc(satc);
2725         if (satcu)
2726                 return 0;
2727
2728         satcu = kzalloc(sizeof(*satcu) + hdr->length, GFP_KERNEL);
2729         if (!satcu)
2730                 return -ENOMEM;
2731
2732         satcu->hdr = (void *)(satcu + 1);
2733         memcpy(satcu->hdr, hdr, hdr->length);
2734         satcu->atc_required = satc->flags & 0x1;
2735         satcu->devices = dmar_alloc_dev_scope((void *)(satc + 1),
2736                                               (void *)satc + satc->header.length,
2737                                               &satcu->devices_cnt);
2738         if (satcu->devices_cnt && !satcu->devices) {
2739                 kfree(satcu);
2740                 return -ENOMEM;
2741         }
2742         list_add_rcu(&satcu->list, &dmar_satc_units);
2743
2744         return 0;
2745 }
2746
2747 static int intel_iommu_add(struct dmar_drhd_unit *dmaru)
2748 {
2749         int sp, ret;
2750         struct intel_iommu *iommu = dmaru->iommu;
2751
2752         ret = intel_cap_audit(CAP_AUDIT_HOTPLUG_DMAR, iommu);
2753         if (ret)
2754                 goto out;
2755
2756         sp = domain_update_iommu_superpage(NULL, iommu) - 1;
2757         if (sp >= 0 && !(cap_super_page_val(iommu->cap) & (1 << sp))) {
2758                 pr_warn("%s: Doesn't support large page.\n",
2759                         iommu->name);
2760                 return -ENXIO;
2761         }
2762
2763         /*
2764          * Disable translation if already enabled prior to OS handover.
2765          */
2766         if (iommu->gcmd & DMA_GCMD_TE)
2767                 iommu_disable_translation(iommu);
2768
2769         ret = iommu_init_domains(iommu);
2770         if (ret == 0)
2771                 ret = iommu_alloc_root_entry(iommu);
2772         if (ret)
2773                 goto out;
2774
2775         intel_svm_check(iommu);
2776
2777         if (dmaru->ignored) {
2778                 /*
2779                  * we always have to disable PMRs or DMA may fail on this device
2780                  */
2781                 if (force_on)
2782                         iommu_disable_protect_mem_regions(iommu);
2783                 return 0;
2784         }
2785
2786         intel_iommu_init_qi(iommu);
2787         iommu_flush_write_buffer(iommu);
2788
2789 #ifdef CONFIG_INTEL_IOMMU_SVM
2790         if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
2791                 ret = intel_svm_enable_prq(iommu);
2792                 if (ret)
2793                         goto disable_iommu;
2794         }
2795 #endif
2796         ret = dmar_set_interrupt(iommu);
2797         if (ret)
2798                 goto disable_iommu;
2799
2800         iommu_set_root_entry(iommu);
2801         iommu_enable_translation(iommu);
2802
2803         iommu_disable_protect_mem_regions(iommu);
2804         return 0;
2805
2806 disable_iommu:
2807         disable_dmar_iommu(iommu);
2808 out:
2809         free_dmar_iommu(iommu);
2810         return ret;
2811 }
2812
2813 int dmar_iommu_hotplug(struct dmar_drhd_unit *dmaru, bool insert)
2814 {
2815         int ret = 0;
2816         struct intel_iommu *iommu = dmaru->iommu;
2817
2818         if (!intel_iommu_enabled)
2819                 return 0;
2820         if (iommu == NULL)
2821                 return -EINVAL;
2822
2823         if (insert) {
2824                 ret = intel_iommu_add(dmaru);
2825         } else {
2826                 disable_dmar_iommu(iommu);
2827                 free_dmar_iommu(iommu);
2828         }
2829
2830         return ret;
2831 }
2832
2833 static void intel_iommu_free_dmars(void)
2834 {
2835         struct dmar_rmrr_unit *rmrru, *rmrr_n;
2836         struct dmar_atsr_unit *atsru, *atsr_n;
2837         struct dmar_satc_unit *satcu, *satc_n;
2838
2839         list_for_each_entry_safe(rmrru, rmrr_n, &dmar_rmrr_units, list) {
2840                 list_del(&rmrru->list);
2841                 dmar_free_dev_scope(&rmrru->devices, &rmrru->devices_cnt);
2842                 kfree(rmrru);
2843         }
2844
2845         list_for_each_entry_safe(atsru, atsr_n, &dmar_atsr_units, list) {
2846                 list_del(&atsru->list);
2847                 intel_iommu_free_atsr(atsru);
2848         }
2849         list_for_each_entry_safe(satcu, satc_n, &dmar_satc_units, list) {
2850                 list_del(&satcu->list);
2851                 dmar_free_dev_scope(&satcu->devices, &satcu->devices_cnt);
2852                 kfree(satcu);
2853         }
2854 }
2855
2856 static struct dmar_satc_unit *dmar_find_matched_satc_unit(struct pci_dev *dev)
2857 {
2858         struct dmar_satc_unit *satcu;
2859         struct acpi_dmar_satc *satc;
2860         struct device *tmp;
2861         int i;
2862
2863         dev = pci_physfn(dev);
2864         rcu_read_lock();
2865
2866         list_for_each_entry_rcu(satcu, &dmar_satc_units, list) {
2867                 satc = container_of(satcu->hdr, struct acpi_dmar_satc, header);
2868                 if (satc->segment != pci_domain_nr(dev->bus))
2869                         continue;
2870                 for_each_dev_scope(satcu->devices, satcu->devices_cnt, i, tmp)
2871                         if (to_pci_dev(tmp) == dev)
2872                                 goto out;
2873         }
2874         satcu = NULL;
2875 out:
2876         rcu_read_unlock();
2877         return satcu;
2878 }
2879
2880 static int dmar_ats_supported(struct pci_dev *dev, struct intel_iommu *iommu)
2881 {
2882         int i, ret = 1;
2883         struct pci_bus *bus;
2884         struct pci_dev *bridge = NULL;
2885         struct device *tmp;
2886         struct acpi_dmar_atsr *atsr;
2887         struct dmar_atsr_unit *atsru;
2888         struct dmar_satc_unit *satcu;
2889
2890         dev = pci_physfn(dev);
2891         satcu = dmar_find_matched_satc_unit(dev);
2892         if (satcu)
2893                 /*
2894                  * This device supports ATS because it is listed in the SATC
2895                  * table. When the IOMMU is in legacy mode, the hardware enables
2896                  * ATS automatically for devices that require it, so the OS
2897                  * should not enable ATS on the device itself, to avoid
2898                  * duplicated TLB invalidations.
2899                  */
2900                 return !(satcu->atc_required && !sm_supported(iommu));
2901
2902         for (bus = dev->bus; bus; bus = bus->parent) {
2903                 bridge = bus->self;
2904                 /* If it's an integrated device, allow ATS */
2905                 if (!bridge)
2906                         return 1;
2907                 /* Connected via non-PCIe: no ATS */
2908                 if (!pci_is_pcie(bridge) ||
2909                     pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE)
2910                         return 0;
2911                 /* If we found the root port, look it up in the ATSR */
2912                 if (pci_pcie_type(bridge) == PCI_EXP_TYPE_ROOT_PORT)
2913                         break;
2914         }
2915
2916         rcu_read_lock();
2917         list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) {
2918                 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
2919                 if (atsr->segment != pci_domain_nr(dev->bus))
2920                         continue;
2921
2922                 for_each_dev_scope(atsru->devices, atsru->devices_cnt, i, tmp)
2923                         if (tmp == &bridge->dev)
2924                                 goto out;
2925
2926                 if (atsru->include_all)
2927                         goto out;
2928         }
2929         ret = 0;
2930 out:
2931         rcu_read_unlock();
2932
2933         return ret;
2934 }
2935
2936 int dmar_iommu_notify_scope_dev(struct dmar_pci_notify_info *info)
2937 {
2938         int ret;
2939         struct dmar_rmrr_unit *rmrru;
2940         struct dmar_atsr_unit *atsru;
2941         struct dmar_satc_unit *satcu;
2942         struct acpi_dmar_atsr *atsr;
2943         struct acpi_dmar_reserved_memory *rmrr;
2944         struct acpi_dmar_satc *satc;
2945
2946         if (!intel_iommu_enabled && system_state >= SYSTEM_RUNNING)
2947                 return 0;
2948
2949         list_for_each_entry(rmrru, &dmar_rmrr_units, list) {
2950                 rmrr = container_of(rmrru->hdr,
2951                                     struct acpi_dmar_reserved_memory, header);
2952                 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
2953                         ret = dmar_insert_dev_scope(info, (void *)(rmrr + 1),
2954                                 ((void *)rmrr) + rmrr->header.length,
2955                                 rmrr->segment, rmrru->devices,
2956                                 rmrru->devices_cnt);
2957                         if (ret < 0)
2958                                 return ret;
2959                 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
2960                         dmar_remove_dev_scope(info, rmrr->segment,
2961                                 rmrru->devices, rmrru->devices_cnt);
2962                 }
2963         }
2964
2965         list_for_each_entry(atsru, &dmar_atsr_units, list) {
2966                 if (atsru->include_all)
2967                         continue;
2968
2969                 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
2970                 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
2971                         ret = dmar_insert_dev_scope(info, (void *)(atsr + 1),
2972                                         (void *)atsr + atsr->header.length,
2973                                         atsr->segment, atsru->devices,
2974                                         atsru->devices_cnt);
2975                         if (ret > 0)
2976                                 break;
2977                         else if (ret < 0)
2978                                 return ret;
2979                 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
2980                         if (dmar_remove_dev_scope(info, atsr->segment,
2981                                         atsru->devices, atsru->devices_cnt))
2982                                 break;
2983                 }
2984         }
2985         list_for_each_entry(satcu, &dmar_satc_units, list) {
2986                 satc = container_of(satcu->hdr, struct acpi_dmar_satc, header);
2987                 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
2988                         ret = dmar_insert_dev_scope(info, (void *)(satc + 1),
2989                                         (void *)satc + satc->header.length,
2990                                         satc->segment, satcu->devices,
2991                                         satcu->devices_cnt);
2992                         if (ret > 0)
2993                                 break;
2994                         else if (ret < 0)
2995                                 return ret;
2996                 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
2997                         if (dmar_remove_dev_scope(info, satc->segment,
2998                                         satcu->devices, satcu->devices_cnt))
2999                                 break;
3000                 }
3001         }
3002
3003         return 0;
3004 }
3005
3006 static void intel_disable_iommus(void)
3007 {
3008         struct intel_iommu *iommu = NULL;
3009         struct dmar_drhd_unit *drhd;
3010
3011         for_each_iommu(iommu, drhd)
3012                 iommu_disable_translation(iommu);
3013 }
3014
3015 void intel_iommu_shutdown(void)
3016 {
3017         struct dmar_drhd_unit *drhd;
3018         struct intel_iommu *iommu = NULL;
3019
3020         if (no_iommu || dmar_disabled)
3021                 return;
3022
3023         down_write(&dmar_global_lock);
3024
3025         /* Disable PMRs explicitly here. */
3026         for_each_iommu(iommu, drhd)
3027                 iommu_disable_protect_mem_regions(iommu);
3028
3029         /* Make sure the IOMMUs are switched off */
3030         intel_disable_iommus();
3031
3032         up_write(&dmar_global_lock);
3033 }
3034
3035 static struct intel_iommu *dev_to_intel_iommu(struct device *dev)
3036 {
3037         struct iommu_device *iommu_dev = dev_to_iommu_device(dev);
3038
3039         return container_of(iommu_dev, struct intel_iommu, iommu);
3040 }
3041
3042 static ssize_t version_show(struct device *dev,
3043                             struct device_attribute *attr, char *buf)
3044 {
3045         struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3046         u32 ver = readl(iommu->reg + DMAR_VER_REG);
3047         return sysfs_emit(buf, "%d:%d\n",
3048                           DMAR_VER_MAJOR(ver), DMAR_VER_MINOR(ver));
3049 }
3050 static DEVICE_ATTR_RO(version);
3051
3052 static ssize_t address_show(struct device *dev,
3053                             struct device_attribute *attr, char *buf)
3054 {
3055         struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3056         return sysfs_emit(buf, "%llx\n", iommu->reg_phys);
3057 }
3058 static DEVICE_ATTR_RO(address);
3059
3060 static ssize_t cap_show(struct device *dev,
3061                         struct device_attribute *attr, char *buf)
3062 {
3063         struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3064         return sysfs_emit(buf, "%llx\n", iommu->cap);
3065 }
3066 static DEVICE_ATTR_RO(cap);
3067
3068 static ssize_t ecap_show(struct device *dev,
3069                          struct device_attribute *attr, char *buf)
3070 {
3071         struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3072         return sysfs_emit(buf, "%llx\n", iommu->ecap);
3073 }
3074 static DEVICE_ATTR_RO(ecap);
3075
3076 static ssize_t domains_supported_show(struct device *dev,
3077                                       struct device_attribute *attr, char *buf)
3078 {
3079         struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3080         return sysfs_emit(buf, "%ld\n", cap_ndoms(iommu->cap));
3081 }
3082 static DEVICE_ATTR_RO(domains_supported);
3083
3084 static ssize_t domains_used_show(struct device *dev,
3085                                  struct device_attribute *attr, char *buf)
3086 {
3087         struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3088         return sysfs_emit(buf, "%d\n",
3089                           bitmap_weight(iommu->domain_ids,
3090                                         cap_ndoms(iommu->cap)));
3091 }
3092 static DEVICE_ATTR_RO(domains_used);
3093
3094 static struct attribute *intel_iommu_attrs[] = {
3095         &dev_attr_version.attr,
3096         &dev_attr_address.attr,
3097         &dev_attr_cap.attr,
3098         &dev_attr_ecap.attr,
3099         &dev_attr_domains_supported.attr,
3100         &dev_attr_domains_used.attr,
3101         NULL,
3102 };
3103
3104 static struct attribute_group intel_iommu_group = {
3105         .name = "intel-iommu",
3106         .attrs = intel_iommu_attrs,
3107 };
3108
3109 const struct attribute_group *intel_iommu_groups[] = {
3110         &intel_iommu_group,
3111         NULL,
3112 };
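     /*
      * These attributes are published via iommu_device_sysfs_add() in
      * intel_iommu_init() and typically appear under
      * /sys/class/iommu/dmar<N>/intel-iommu/ (path shown for illustration).
      */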
3113
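     /*
      * Return true if any PCI device is marked external-facing (e.g. behind
      * a Thunderbolt/USB4 port). The external_facing flag is normally set
      * from firmware, typically the ACPI "ExternalFacingPort" or devicetree
      * "external-facing" property.
      */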
3114 static bool has_external_pci(void)
3115 {
3116         struct pci_dev *pdev = NULL;
3117
3118         for_each_pci_dev(pdev)
3119                 if (pdev->external_facing) {
3120                         pci_dev_put(pdev);
3121                         return true;
3122                 }
3123
3124         return false;
3125 }
3126
3127 static int __init platform_optin_force_iommu(void)
3128 {
3129         if (!dmar_platform_optin() || no_platform_optin || !has_external_pci())
3130                 return 0;
3131
3132         if (no_iommu || dmar_disabled)
3133                 pr_info("Intel-IOMMU force enabled due to platform opt in\n");
3134
3135         /*
3136          * If Intel-IOMMU is disabled by default, we will apply identity
3137          * map for all devices except those marked as being untrusted.
3138          */
3139         if (dmar_disabled)
3140                 iommu_set_default_passthrough(false);
3141
3142         dmar_disabled = 0;
3143         no_iommu = 0;
3144
3145         return 1;
3146 }
3147
3148 static int __init probe_acpi_namespace_devices(void)
3149 {
3150         struct dmar_drhd_unit *drhd;
3151         /* To avoid a -Wunused-but-set-variable warning. */
3152         struct intel_iommu *iommu __maybe_unused;
3153         struct device *dev;
3154         int i, ret = 0;
3155
3156         for_each_active_iommu(iommu, drhd) {
3157                 for_each_active_dev_scope(drhd->devices,
3158                                           drhd->devices_cnt, i, dev) {
3159                         struct acpi_device_physical_node *pn;
3160                         struct acpi_device *adev;
3161
3162                         if (dev->bus != &acpi_bus_type)
3163                                 continue;
3164
3165                         adev = to_acpi_device(dev);
3166                         mutex_lock(&adev->physical_node_lock);
3167                         list_for_each_entry(pn,
3168                                             &adev->physical_node_list, node) {
3169                                 ret = iommu_probe_device(pn->dev);
3170                                 if (ret)
3171                                         break;
3172                         }
3173                         mutex_unlock(&adev->physical_node_lock);
3174
3175                         if (ret)
3176                                 return ret;
3177                 }
3178         }
3179
3180         return 0;
3181 }
3182
3183 static __init int tboot_force_iommu(void)
3184 {
3185         if (!tboot_enabled())
3186                 return 0;
3187
3188         if (no_iommu || dmar_disabled)
3189                 pr_warn("Forcing Intel-IOMMU to enabled\n");
3190
3191         dmar_disabled = 0;
3192         no_iommu = 0;
3193
3194         return 1;
3195 }
3196
3197 int __init intel_iommu_init(void)
3198 {
3199         int ret = -ENODEV;
3200         struct dmar_drhd_unit *drhd;
3201         struct intel_iommu *iommu;
3202
3203         /*
3204          * Intel IOMMU is required for a TXT/tboot launch or platform
3205          * opt in, so enforce that.
3206          */
3207         force_on = (!intel_iommu_tboot_noforce && tboot_force_iommu()) ||
3208                     platform_optin_force_iommu();
3209
3210         down_write(&dmar_global_lock);
3211         if (dmar_table_init()) {
3212                 if (force_on)
3213                         panic("tboot: Failed to initialize DMAR table\n");
3214                 goto out_free_dmar;
3215         }
3216
3217         if (dmar_dev_scope_init() < 0) {
3218                 if (force_on)
3219                         panic("tboot: Failed to initialize DMAR device scope\n");
3220                 goto out_free_dmar;
3221         }
3222
3223         up_write(&dmar_global_lock);
3224
3225         /*
3226          * The bus notifier takes the dmar_global_lock, so lockdep will
3227          * complain later when we register it under the lock.
3228          */
3229         dmar_register_bus_notifier();
3230
3231         down_write(&dmar_global_lock);
3232
3233         if (!no_iommu)
3234                 intel_iommu_debugfs_init();
3235
3236         if (no_iommu || dmar_disabled) {
3237                 /*
3238                  * We exit the function here to ensure the IOMMU's remapping
3239                  * and mempool aren't set up, which means that the IOMMU's
3240                  * PMRs won't be disabled via the call to init_dmars(). So
3241                  * disable them explicitly here. The PMRs were set up by
3242                  * tboot prior to calling SENTER, but the kernel is expected
3243                  * to reset/tear down the PMRs.
3244                  */
3245                 if (intel_iommu_tboot_noforce) {
3246                         for_each_iommu(iommu, drhd)
3247                                 iommu_disable_protect_mem_regions(iommu);
3248                 }
3249
3250                 /*
3251                  * Make sure the IOMMUs are switched off, even when we
3252                  * boot into a kexec kernel and the previous kernel left
3253                  * them enabled
3254                  */
3255                 intel_disable_iommus();
3256                 goto out_free_dmar;
3257         }
3258
3259         if (list_empty(&dmar_rmrr_units))
3260                 pr_info("No RMRR found\n");
3261
3262         if (list_empty(&dmar_atsr_units))
3263                 pr_info("No ATSR found\n");
3264
3265         if (list_empty(&dmar_satc_units))
3266                 pr_info("No SATC found\n");
3267
3268         init_no_remapping_devices();
3269
3270         ret = init_dmars();
3271         if (ret) {
3272                 if (force_on)
3273                         panic("tboot: Failed to initialize DMARs\n");
3274                 pr_err("Initialization failed\n");
3275                 goto out_free_dmar;
3276         }
3277         up_write(&dmar_global_lock);
3278
3279         init_iommu_pm_ops();
3280
3281         down_read(&dmar_global_lock);
3282         for_each_active_iommu(iommu, drhd) {
3283                 /*
3284                  * The flush queue implementation does not perform
3285                  * page-selective invalidations that are required for efficient
3286                  * TLB flushes in virtual environments.  The benefit of batching
3287                  * is likely to be much lower than the overhead of synchronizing
3288                  * the virtual and physical IOMMU page-tables.
3289                  */
3290                 if (cap_caching_mode(iommu->cap) &&
3291                     !first_level_by_default(IOMMU_DOMAIN_DMA)) {
3292                         pr_info_once("IOMMU batching disallowed due to virtualization\n");
3293                         iommu_set_dma_strict();
3294                 }
3295                 iommu_device_sysfs_add(&iommu->iommu, NULL,
3296                                        intel_iommu_groups,
3297                                        "%s", iommu->name);
3298                 iommu_device_register(&iommu->iommu, &intel_iommu_ops, NULL);
3299
3300                 iommu_pmu_register(iommu);
3301         }
3302
3303         if (probe_acpi_namespace_devices())
3304                 pr_warn("ACPI name space devices didn't probe correctly\n");
3305
3306         /* Finally, we enable the DMA remapping hardware. */
3307         for_each_iommu(iommu, drhd) {
3308                 if (!drhd->ignored && !translation_pre_enabled(iommu))
3309                         iommu_enable_translation(iommu);
3310
3311                 iommu_disable_protect_mem_regions(iommu);
3312         }
3313         up_read(&dmar_global_lock);
3314
3315         pr_info("Intel(R) Virtualization Technology for Directed I/O\n");
3316
3317         intel_iommu_enabled = 1;
3318
3319         return 0;
3320
3321 out_free_dmar:
3322         intel_iommu_free_dmars();
3323         up_write(&dmar_global_lock);
3324         return ret;
3325 }
3326
3327 static int domain_context_clear_one_cb(struct pci_dev *pdev, u16 alias, void *opaque)
3328 {
3329         struct device_domain_info *info = opaque;
3330
3331         domain_context_clear_one(info, PCI_BUS_NUM(alias), alias & 0xff);
3332         return 0;
3333 }
3334
3335 /*
3336  * NB - intel-iommu lacks any sort of reference counting for the users of
3337  * dependent devices.  If multiple endpoints have intersecting dependent
3338  * devices, unbinding the driver from any one of them will possibly leave
3339  * the others unable to operate.
3340  */
3341 static void domain_context_clear(struct device_domain_info *info)
3342 {
3343         if (dev_is_pci(info->dev))
3344                 pci_for_each_dma_alias(to_pci_dev(info->dev),
3345                                        &domain_context_clear_one_cb, info);
3346         else
3347                 domain_context_clear_one(info, info->bus, info->devfn);
3348 }
3349
3350 /*
3351  * Clear the page table pointer in context or pasid table entries so that
3352  * all DMA requests without PASID from the device are blocked. If the page
3353  * table has been set, clean up the data structures.
3354  */
3355 void device_block_translation(struct device *dev)
3356 {
3357         struct device_domain_info *info = dev_iommu_priv_get(dev);
3358         struct intel_iommu *iommu = info->iommu;
3359         unsigned long flags;
3360
3361         iommu_disable_pci_caps(info);
3362         if (!dev_is_real_dma_subdevice(dev)) {
3363                 if (sm_supported(iommu))
3364                         intel_pasid_tear_down_entry(iommu, dev,
3365                                                     IOMMU_NO_PASID, false);
3366                 else
3367                         domain_context_clear(info);
3368         }
3369
3370         if (!info->domain)
3371                 return;
3372
3373         spin_lock_irqsave(&info->domain->lock, flags);
3374         list_del(&info->link);
3375         spin_unlock_irqrestore(&info->domain->lock, flags);
3376
3377         cache_tag_unassign_domain(info->domain, dev, IOMMU_NO_PASID);
3378         domain_detach_iommu(info->domain, iommu);
3379         info->domain = NULL;
3380 }
3381
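     /*
      * Initialize the address-width fields of a legacy UNMANAGED/DMA domain
      * for the given guest address width and allocate its top-level page
      * directory. Used by intel_iommu_domain_alloc() below.
      */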
3382 static int md_domain_init(struct dmar_domain *domain, int guest_width)
3383 {
3384         int adjust_width;
3385
3386         /* calculate AGAW */
3387         domain->gaw = guest_width;
3388         adjust_width = guestwidth_to_adjustwidth(guest_width);
3389         domain->agaw = width_to_agaw(adjust_width);
3390
3391         domain->iommu_coherency = false;
3392         domain->iommu_superpage = 0;
3393         domain->max_addr = 0;
3394
3395         /* always allocate the top pgd */
3396         domain->pgd = iommu_alloc_page_node(domain->nid, GFP_ATOMIC);
3397         if (!domain->pgd)
3398                 return -ENOMEM;
3399         domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
3400         return 0;
3401 }
3402
3403 static int blocking_domain_attach_dev(struct iommu_domain *domain,
3404                                       struct device *dev)
3405 {
3406         device_block_translation(dev);
3407         return 0;
3408 }
3409
3410 static struct iommu_domain blocking_domain = {
3411         .type = IOMMU_DOMAIN_BLOCKED,
3412         .ops = &(const struct iommu_domain_ops) {
3413                 .attach_dev     = blocking_domain_attach_dev,
3414         }
3415 };
3416
3417 static int iommu_superpage_capability(struct intel_iommu *iommu, bool first_stage)
3418 {
3419         if (!intel_iommu_superpage)
3420                 return 0;
3421
3422         if (first_stage)
3423                 return cap_fl1gp_support(iommu->cap) ? 2 : 1;
3424
3425         return fls(cap_super_page_val(iommu->cap));
3426 }
3427
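     /*
      * Allocate a paging domain suitable for @dev. @first_stage selects
      * first-stage vs. second-stage translation, which in turn determines
      * the supported superpage sizes and the IOVA aperture computed below.
      */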
3428 static struct dmar_domain *paging_domain_alloc(struct device *dev, bool first_stage)
3429 {
3430         struct device_domain_info *info = dev_iommu_priv_get(dev);
3431         struct intel_iommu *iommu = info->iommu;
3432         struct dmar_domain *domain;
3433         int addr_width;
3434
3435         domain = kzalloc(sizeof(*domain), GFP_KERNEL);
3436         if (!domain)
3437                 return ERR_PTR(-ENOMEM);
3438
3439         INIT_LIST_HEAD(&domain->devices);
3440         INIT_LIST_HEAD(&domain->dev_pasids);
3441         INIT_LIST_HEAD(&domain->cache_tags);
3442         spin_lock_init(&domain->lock);
3443         spin_lock_init(&domain->cache_lock);
3444         xa_init(&domain->iommu_array);
3445
3446         domain->nid = dev_to_node(dev);
3447         domain->use_first_level = first_stage;
3448
3449         /* calculate the address width */
3450         addr_width = agaw_to_width(iommu->agaw);
3451         if (addr_width > cap_mgaw(iommu->cap))
3452                 addr_width = cap_mgaw(iommu->cap);
3453         domain->gaw = addr_width;
3454         domain->agaw = iommu->agaw;
3455         domain->max_addr = __DOMAIN_MAX_ADDR(addr_width);
3456
3457         /* iommu memory access coherency */
3458         domain->iommu_coherency = iommu_paging_structure_coherency(iommu);
3459
3460         /* pagesize bitmap */
3461         domain->domain.pgsize_bitmap = SZ_4K;
3462         domain->iommu_superpage = iommu_superpage_capability(iommu, first_stage);
3463         domain->domain.pgsize_bitmap |= domain_super_pgsize_bitmap(domain);
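             /*
              * For example (assuming domain_super_pgsize_bitmap() derives the
              * bitmap from iommu_superpage): a value of 0 leaves only SZ_4K,
              * 1 adds SZ_2M, and 2 adds SZ_2M | SZ_1G.
              */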
3464
3465         /*
3466          * IOVA aperture: First-level translation restricts the input-address
3467          * to a canonical address (i.e., address bits 63:N have the same value
3468          * as address bit [N-1], where N is 48-bits with 4-level paging and
3469          * 57-bits with 5-level paging). Hence, skip bit [N-1].
3470          */
3471         domain->domain.geometry.force_aperture = true;
3472         domain->domain.geometry.aperture_start = 0;
3473         if (first_stage)
3474                 domain->domain.geometry.aperture_end = __DOMAIN_MAX_ADDR(domain->gaw - 1);
3475         else
3476                 domain->domain.geometry.aperture_end = __DOMAIN_MAX_ADDR(domain->gaw);
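             /*
              * For example, with 4-level first-stage paging (gaw == 48) the
              * aperture ends at (1ULL << 47) - 1, while a second-stage domain
              * with the same gaw can use addresses up to (1ULL << 48) - 1.
              */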
3477
3478         /* always allocate the top pgd */
3479         domain->pgd = iommu_alloc_page_node(domain->nid, GFP_KERNEL);
3480         if (!domain->pgd) {
3481                 kfree(domain);
3482                 return ERR_PTR(-ENOMEM);
3483         }
3484         domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
3485
3486         return domain;
3487 }
3488
3489 static struct iommu_domain *intel_iommu_domain_alloc(unsigned type)
3490 {
3491         struct dmar_domain *dmar_domain;
3492         struct iommu_domain *domain;
3493
3494         switch (type) {
3495         case IOMMU_DOMAIN_DMA:
3496         case IOMMU_DOMAIN_UNMANAGED:
3497                 dmar_domain = alloc_domain(type);
3498                 if (!dmar_domain) {
3499                         pr_err("Can't allocate dmar_domain\n");
3500                         return NULL;
3501                 }
3502                 if (md_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
3503                         pr_err("Domain initialization failed\n");
3504                         domain_exit(dmar_domain);
3505                         return NULL;
3506                 }
3507
3508                 domain = &dmar_domain->domain;
3509                 domain->geometry.aperture_start = 0;
3510                 domain->geometry.aperture_end   =
3511                                 __DOMAIN_MAX_ADDR(dmar_domain->gaw);
3512                 domain->geometry.force_aperture = true;
3513
3514                 return domain;
3515         default:
3516                 return NULL;
3517         }
3518
3519         return NULL;
3520 }
3521
3522 static struct iommu_domain *
3523 intel_iommu_domain_alloc_user(struct device *dev, u32 flags,
3524                               struct iommu_domain *parent,
3525                               const struct iommu_user_data *user_data)
3526 {
3527         struct device_domain_info *info = dev_iommu_priv_get(dev);
3528         bool dirty_tracking = flags & IOMMU_HWPT_ALLOC_DIRTY_TRACKING;
3529         bool nested_parent = flags & IOMMU_HWPT_ALLOC_NEST_PARENT;
3530         struct intel_iommu *iommu = info->iommu;
3531         struct dmar_domain *dmar_domain;
3532         struct iommu_domain *domain;
3533
3534         /* Must be NESTING domain */
3535         if (parent) {
3536                 if (!nested_supported(iommu) || flags)
3537                         return ERR_PTR(-EOPNOTSUPP);
3538                 return intel_nested_domain_alloc(parent, user_data);
3539         }
3540
3541         if (flags &
3542             (~(IOMMU_HWPT_ALLOC_NEST_PARENT | IOMMU_HWPT_ALLOC_DIRTY_TRACKING)))
3543                 return ERR_PTR(-EOPNOTSUPP);
3544         if (nested_parent && !nested_supported(iommu))
3545                 return ERR_PTR(-EOPNOTSUPP);
3546         if (user_data || (dirty_tracking && !ssads_supported(iommu)))
3547                 return ERR_PTR(-EOPNOTSUPP);
3548
3549         /* Do not use first stage for user domain translation. */
3550         dmar_domain = paging_domain_alloc(dev, false);
3551         if (IS_ERR(dmar_domain))
3552                 return ERR_CAST(dmar_domain);
3553         domain = &dmar_domain->domain;
3554         domain->type = IOMMU_DOMAIN_UNMANAGED;
3555         domain->owner = &intel_iommu_ops;
3556         domain->ops = intel_iommu_ops.default_domain_ops;
3557
3558         if (nested_parent) {
3559                 dmar_domain->nested_parent = true;
3560                 INIT_LIST_HEAD(&dmar_domain->s1_domains);
3561                 spin_lock_init(&dmar_domain->s1_lock);
3562         }
3563
3564         if (dirty_tracking) {
3565                 if (dmar_domain->use_first_level) {
3566                         iommu_domain_free(domain);
3567                         return ERR_PTR(-EOPNOTSUPP);
3568                 }
3569                 domain->dirty_ops = &intel_dirty_ops;
3570         }
3571
3572         return domain;
3573 }
3574
3575 static void intel_iommu_domain_free(struct iommu_domain *domain)
3576 {
3577         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
3578
3579         WARN_ON(dmar_domain->nested_parent &&
3580                 !list_empty(&dmar_domain->s1_domains));
3581         domain_exit(dmar_domain);
3582 }
3583
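     /*
      * Check that @domain is compatible with @dev's IOMMU: force snooping
      * and dirty tracking must be supported by the IOMMU if the domain
      * requires them, and the domain's maximum address must fit within the
      * IOMMU's address width. Surplus page-table levels are trimmed and, in
      * scalable mode with a context copied from a previous kernel, the
      * PASID context for the device is set up.
      */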
3584 int prepare_domain_attach_device(struct iommu_domain *domain,
3585                                  struct device *dev)
3586 {
3587         struct device_domain_info *info = dev_iommu_priv_get(dev);
3588         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
3589         struct intel_iommu *iommu = info->iommu;
3590         int addr_width;
3591
3592         if (dmar_domain->force_snooping && !ecap_sc_support(iommu->ecap))
3593                 return -EINVAL;
3594
3595         if (domain->dirty_ops && !ssads_supported(iommu))
3596                 return -EINVAL;
3597
3598         /* check if this iommu agaw is sufficient for max mapped address */
3599         addr_width = agaw_to_width(iommu->agaw);
3600         if (addr_width > cap_mgaw(iommu->cap))
3601                 addr_width = cap_mgaw(iommu->cap);
3602
3603         if (dmar_domain->max_addr > (1LL << addr_width))
3604                 return -EINVAL;
3605         dmar_domain->gaw = addr_width;
3606
3607         /*
3608          * Knock out extra levels of page tables if necessary
3609          */
3610         while (iommu->agaw < dmar_domain->agaw) {
3611                 struct dma_pte *pte;
3612
3613                 pte = dmar_domain->pgd;
3614                 if (dma_pte_present(pte)) {
3615                         dmar_domain->pgd = phys_to_virt(dma_pte_addr(pte));
3616                         iommu_free_page(pte);
3617                 }
3618                 dmar_domain->agaw--;
3619         }
3620
3621         if (sm_supported(iommu) && !dev_is_real_dma_subdevice(dev) &&
3622             context_copied(iommu, info->bus, info->devfn))
3623                 return intel_pasid_setup_sm_context(dev);
3624
3625         return 0;
3626 }
3627
3628 static int intel_iommu_attach_device(struct iommu_domain *domain,
3629                                      struct device *dev)
3630 {
3631         int ret;
3632
3633         device_block_translation(dev);
3634
3635         ret = prepare_domain_attach_device(domain, dev);
3636         if (ret)
3637                 return ret;
3638
3639         return dmar_domain_attach_device(to_dmar_domain(domain), dev);
3640 }
3641
3642 static int intel_iommu_map(struct iommu_domain *domain,
3643                            unsigned long iova, phys_addr_t hpa,
3644                            size_t size, int iommu_prot, gfp_t gfp)
3645 {
3646         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
3647         u64 max_addr;
3648         int prot = 0;
3649
3650         if (iommu_prot & IOMMU_READ)
3651                 prot |= DMA_PTE_READ;
3652         if (iommu_prot & IOMMU_WRITE)
3653                 prot |= DMA_PTE_WRITE;
3654         if (dmar_domain->set_pte_snp)
3655                 prot |= DMA_PTE_SNP;
3656
3657         max_addr = iova + size;
3658         if (dmar_domain->max_addr < max_addr) {
3659                 u64 end;
3660
3661                 /* check if minimum agaw is sufficient for mapped address */
3662                 end = __DOMAIN_MAX_ADDR(dmar_domain->gaw) + 1;
3663                 if (end < max_addr) {
3664                         pr_err("%s: iommu width (%d) is not "
3665                                "sufficient for the mapped address (%llx)\n",
3666                                __func__, dmar_domain->gaw, max_addr);
3667                         return -EFAULT;
3668                 }
3669                 dmar_domain->max_addr = max_addr;
3670         }
3671         /* Round up size to next multiple of PAGE_SIZE, if it and
3672            the low bits of hpa would take us onto the next page */
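             /*
              * e.g. hpa == 0x1800 with size == 0x1000 crosses a page boundary,
              * so (assuming aligned_nrpages() rounds offset + size up to whole
              * pages) the mapping below covers two 4KiB pages.
              */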
3673         size = aligned_nrpages(hpa, size);
3674         return __domain_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT,
3675                                 hpa >> VTD_PAGE_SHIFT, size, prot, gfp);
3676 }
3677
3678 static int intel_iommu_map_pages(struct iommu_domain *domain,
3679                                  unsigned long iova, phys_addr_t paddr,
3680                                  size_t pgsize, size_t pgcount,
3681                                  int prot, gfp_t gfp, size_t *mapped)
3682 {
3683         unsigned long pgshift = __ffs(pgsize);
3684         size_t size = pgcount << pgshift;
3685         int ret;
3686
3687         if (pgsize != SZ_4K && pgsize != SZ_2M && pgsize != SZ_1G)
3688                 return -EINVAL;
3689
3690         if (!IS_ALIGNED(iova | paddr, pgsize))
3691                 return -EINVAL;
3692
3693         ret = intel_iommu_map(domain, iova, paddr, size, prot, gfp);
3694         if (!ret && mapped)
3695                 *mapped = size;
3696
3697         return ret;
3698 }
3699
3700 static size_t intel_iommu_unmap(struct iommu_domain *domain,
3701                                 unsigned long iova, size_t size,
3702                                 struct iommu_iotlb_gather *gather)
3703 {
3704         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
3705         unsigned long start_pfn, last_pfn;
3706         int level = 0;
3707
3708         /* Cope with horrid API which requires us to unmap more than the
3709            size argument if it happens to be a large-page mapping. */
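             /*
              * e.g. a 4KiB unmap request that lands inside a 2MiB superpage
              * mapping is widened below to the full 2MiB, and that larger
              * size is what gets returned to the caller.
              */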
3710         if (unlikely(!pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT,
3711                                      &level, GFP_ATOMIC)))
3712                 return 0;
3713
3714         if (size < VTD_PAGE_SIZE << level_to_offset_bits(level))
3715                 size = VTD_PAGE_SIZE << level_to_offset_bits(level);
3716
3717         start_pfn = iova >> VTD_PAGE_SHIFT;
3718         last_pfn = (iova + size - 1) >> VTD_PAGE_SHIFT;
3719
3720         domain_unmap(dmar_domain, start_pfn, last_pfn, &gather->freelist);
3721
3722         if (dmar_domain->max_addr == iova + size)
3723                 dmar_domain->max_addr = iova;
3724
3725         /*
3726          * We do not use page-selective IOTLB invalidation in the flush
3727          * queue, so there is no need to track pages and sync the IOTLB here.
3728          */
3729         if (!iommu_iotlb_gather_queued(gather))
3730                 iommu_iotlb_gather_add_page(domain, gather, iova, size);
3731
3732         return size;
3733 }
3734
3735 static size_t intel_iommu_unmap_pages(struct iommu_domain *domain,
3736                                       unsigned long iova,
3737                                       size_t pgsize, size_t pgcount,
3738                                       struct iommu_iotlb_gather *gather)
3739 {
3740         unsigned long pgshift = __ffs(pgsize);
3741         size_t size = pgcount << pgshift;
3742
3743         return intel_iommu_unmap(domain, iova, size, gather);
3744 }
3745
3746 static void intel_iommu_tlb_sync(struct iommu_domain *domain,
3747                                  struct iommu_iotlb_gather *gather)
3748 {
3749         cache_tag_flush_range(to_dmar_domain(domain), gather->start,
3750                               gather->end, list_empty(&gather->freelist));
3751         iommu_put_pages_list(&gather->freelist);
3752 }
3753
3754 static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
3755                                             dma_addr_t iova)
3756 {
3757         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
3758         struct dma_pte *pte;
3759         int level = 0;
3760         u64 phys = 0;
3761
3762         pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level,
3763                              GFP_ATOMIC);
3764         if (pte && dma_pte_present(pte))
3765                 phys = dma_pte_addr(pte) +
3766                         (iova & (BIT_MASK(level_to_offset_bits(level) +
3767                                                 VTD_PAGE_SHIFT) - 1));
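             /*
              * e.g. for a 2MiB superpage (level 2), level_to_offset_bits(2) +
              * VTD_PAGE_SHIFT is 21 (assuming a 9-bit stride per level), so
              * the low 21 bits of the IOVA are kept as the offset above.
              */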
3768
3769         return phys;
3770 }
3771
3772 static bool domain_support_force_snooping(struct dmar_domain *domain)
3773 {
3774         struct device_domain_info *info;
3775         bool support = true;
3776
3777         assert_spin_locked(&domain->lock);
3778         list_for_each_entry(info, &domain->devices, link) {
3779                 if (!ecap_sc_support(info->iommu->ecap)) {
3780                         support = false;
3781                         break;
3782                 }
3783         }
3784
3785         return support;
3786 }
3787
3788 static void domain_set_force_snooping(struct dmar_domain *domain)
3789 {
3790         struct device_domain_info *info;
3791
3792         assert_spin_locked(&domain->lock);
3793         /*
3794          * Second level page table supports per-PTE snoop control. The
3795          * iommu_map() interface will handle this by setting SNP bit.
3796          */
3797         if (!domain->use_first_level) {
3798                 domain->set_pte_snp = true;
3799                 return;
3800         }
3801
3802         list_for_each_entry(info, &domain->devices, link)
3803                 intel_pasid_setup_page_snoop_control(info->iommu, info->dev,
3804                                                      IOMMU_NO_PASID);
3805 }
3806
3807 static bool intel_iommu_enforce_cache_coherency(struct iommu_domain *domain)
3808 {
3809         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
3810         unsigned long flags;
3811
3812         if (dmar_domain->force_snooping)
3813                 return true;
3814
3815         spin_lock_irqsave(&dmar_domain->lock, flags);
3816         if (!domain_support_force_snooping(dmar_domain) ||
3817             (!dmar_domain->use_first_level && dmar_domain->has_mappings)) {
3818                 spin_unlock_irqrestore(&dmar_domain->lock, flags);
3819                 return false;
3820         }
3821
3822         domain_set_force_snooping(dmar_domain);
3823         dmar_domain->force_snooping = true;
3824         spin_unlock_irqrestore(&dmar_domain->lock, flags);
3825
3826         return true;
3827 }
3828
3829 static bool intel_iommu_capable(struct device *dev, enum iommu_cap cap)
3830 {
3831         struct device_domain_info *info = dev_iommu_priv_get(dev);
3832
3833         switch (cap) {
3834         case IOMMU_CAP_CACHE_COHERENCY:
3835         case IOMMU_CAP_DEFERRED_FLUSH:
3836                 return true;
3837         case IOMMU_CAP_PRE_BOOT_PROTECTION:
3838                 return dmar_platform_optin();
3839         case IOMMU_CAP_ENFORCE_CACHE_COHERENCY:
3840                 return ecap_sc_support(info->iommu->ecap);
3841         case IOMMU_CAP_DIRTY_TRACKING:
3842                 return ssads_supported(info->iommu);
3843         default:
3844                 return false;
3845         }
3846 }
3847
3848 static struct iommu_device *intel_iommu_probe_device(struct device *dev)
3849 {
3850         struct pci_dev *pdev = dev_is_pci(dev) ? to_pci_dev(dev) : NULL;
3851         struct device_domain_info *info;
3852         struct intel_iommu *iommu;
3853         u8 bus, devfn;
3854         int ret;
3855
3856         iommu = device_lookup_iommu(dev, &bus, &devfn);
3857         if (!iommu || !iommu->iommu.ops)
3858                 return ERR_PTR(-ENODEV);
3859
3860         info = kzalloc(sizeof(*info), GFP_KERNEL);
3861         if (!info)
3862                 return ERR_PTR(-ENOMEM);
3863
3864         if (dev_is_real_dma_subdevice(dev)) {
3865                 info->bus = pdev->bus->number;
3866                 info->devfn = pdev->devfn;
3867                 info->segment = pci_domain_nr(pdev->bus);
3868         } else {
3869                 info->bus = bus;
3870                 info->devfn = devfn;
3871                 info->segment = iommu->segment;
3872         }
3873
3874         info->dev = dev;
3875         info->iommu = iommu;
3876         if (dev_is_pci(dev)) {
3877                 if (ecap_dev_iotlb_support(iommu->ecap) &&
3878                     pci_ats_supported(pdev) &&
3879                     dmar_ats_supported(pdev, iommu)) {
3880                         info->ats_supported = 1;
3881                         info->dtlb_extra_inval = dev_needs_extra_dtlb_flush(pdev);
3882
3883                         /*
3884                          * For IOMMUs that support Device-TLB invalidation
3885                          * throttling (DIT), assign the PFSID in a VF's
3886                          * invalidation descriptors so that the IOMMU HW can
3887                          * gauge queue depth at the PF level. If DIT is not
3888                          * supported, PFSID is treated as reserved and must be 0.
3889                          */
3890                         if (ecap_dit(iommu->ecap))
3891                                 info->pfsid = pci_dev_id(pci_physfn(pdev));
3892                         info->ats_qdep = pci_ats_queue_depth(pdev);
3893                 }
3894                 if (sm_supported(iommu)) {
3895                         if (pasid_supported(iommu)) {
3896                                 int features = pci_pasid_features(pdev);
3897
3898                                 if (features >= 0)
3899                                         info->pasid_supported = features | 1;
3900                         }
3901
3902                         if (info->ats_supported && ecap_prs(iommu->ecap) &&
3903                             pci_pri_supported(pdev))
3904                                 info->pri_supported = 1;
3905                 }
3906         }
3907
3908         dev_iommu_priv_set(dev, info);
3909         if (pdev && pci_ats_supported(pdev)) {
3910                 pci_prepare_ats(pdev, VTD_PAGE_SHIFT);
3911                 ret = device_rbtree_insert(iommu, info);
3912                 if (ret)
3913                         goto free;
3914         }
3915
3916         if (sm_supported(iommu) && !dev_is_real_dma_subdevice(dev)) {
3917                 ret = intel_pasid_alloc_table(dev);
3918                 if (ret) {
3919                         dev_err(dev, "PASID table allocation failed\n");
3920                         goto clear_rbtree;
3921                 }
3922
3923                 if (!context_copied(iommu, info->bus, info->devfn)) {
3924                         ret = intel_pasid_setup_sm_context(dev);
3925                         if (ret)
3926                                 goto free_table;
3927                 }
3928         }
3929
3930         intel_iommu_debugfs_create_dev(info);
3931
3932         /*
3933          * The PCIe spec, in its wisdom, declares that the behaviour of the
3934          * device is undefined if you enable PASID support after ATS support.
3935          * So always enable PASID support on devices which have it, even if
3936          * we can't yet know if we're ever going to use it.
3937          */
3938         if (info->pasid_supported &&
3939             !pci_enable_pasid(pdev, info->pasid_supported & ~1))
3940                 info->pasid_enabled = 1;
3941
3942         return &iommu->iommu;
3943 free_table:
3944         intel_pasid_free_table(dev);
3945 clear_rbtree:
3946         device_rbtree_remove(info);
3947 free:
3948         kfree(info);
3949
3950         return ERR_PTR(ret);
3951 }
3952
3953 static void intel_iommu_release_device(struct device *dev)
3954 {
3955         struct device_domain_info *info = dev_iommu_priv_get(dev);
3956         struct intel_iommu *iommu = info->iommu;
3957
3958         if (info->pasid_enabled) {
3959                 pci_disable_pasid(to_pci_dev(dev));
3960                 info->pasid_enabled = 0;
3961         }
3962
3963         mutex_lock(&iommu->iopf_lock);
3964         if (dev_is_pci(dev) && pci_ats_supported(to_pci_dev(dev)))
3965                 device_rbtree_remove(info);
3966         mutex_unlock(&iommu->iopf_lock);
3967
3968         if (sm_supported(iommu) && !dev_is_real_dma_subdevice(dev) &&
3969             !context_copied(iommu, info->bus, info->devfn))
3970                 intel_pasid_teardown_sm_context(dev);
3971
3972         intel_pasid_free_table(dev);
3973         intel_iommu_debugfs_remove_dev(info);
3974         kfree(info);
3975         set_dma_ops(dev, NULL);
3976 }
3977
3978 static void intel_iommu_get_resv_regions(struct device *device,
3979                                          struct list_head *head)
3980 {
3981         int prot = DMA_PTE_READ | DMA_PTE_WRITE;
3982         struct iommu_resv_region *reg;
3983         struct dmar_rmrr_unit *rmrr;
3984         struct device *i_dev;
3985         int i;
3986
3987         rcu_read_lock();
3988         for_each_rmrr_units(rmrr) {
3989                 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
3990                                           i, i_dev) {
3991                         struct iommu_resv_region *resv;
3992                         enum iommu_resv_type type;
3993                         size_t length;
3994
3995                         if (i_dev != device &&
3996                             !is_downstream_to_pci_bridge(device, i_dev))
3997                                 continue;
3998
3999                         length = rmrr->end_address - rmrr->base_address + 1;
4000
4001                         type = device_rmrr_is_relaxable(device) ?
4002                                 IOMMU_RESV_DIRECT_RELAXABLE : IOMMU_RESV_DIRECT;
4003
4004                         resv = iommu_alloc_resv_region(rmrr->base_address,
4005                                                        length, prot, type,
4006                                                        GFP_ATOMIC);
4007                         if (!resv)
4008                                 break;
4009
4010                         list_add_tail(&resv->list, head);
4011                 }
4012         }
4013         rcu_read_unlock();
4014
4015 #ifdef CONFIG_INTEL_IOMMU_FLOPPY_WA
4016         if (dev_is_pci(device)) {
4017                 struct pci_dev *pdev = to_pci_dev(device);
4018
4019                 if ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA) {
4020                         reg = iommu_alloc_resv_region(0, 1UL << 24, prot,
4021                                         IOMMU_RESV_DIRECT_RELAXABLE,
4022                                         GFP_KERNEL);
4023                         if (reg)
4024                                 list_add_tail(&reg->list, head);
4025                 }
4026         }
4027 #endif /* CONFIG_INTEL_IOMMU_FLOPPY_WA */
4028
4029         reg = iommu_alloc_resv_region(IOAPIC_RANGE_START,
4030                                       IOAPIC_RANGE_END - IOAPIC_RANGE_START + 1,
4031                                       0, IOMMU_RESV_MSI, GFP_KERNEL);
4032         if (!reg)
4033                 return;
4034         list_add_tail(&reg->list, head);
4035 }
4036
4037 static struct iommu_group *intel_iommu_device_group(struct device *dev)
4038 {
4039         if (dev_is_pci(dev))
4040                 return pci_device_group(dev);
4041         return generic_device_group(dev);
4042 }
4043
4044 static int intel_iommu_enable_sva(struct device *dev)
4045 {
4046         struct device_domain_info *info = dev_iommu_priv_get(dev);
4047         struct intel_iommu *iommu;
4048
4049         if (!info || dmar_disabled)
4050                 return -EINVAL;
4051
4052         iommu = info->iommu;
4053         if (!iommu)
4054                 return -EINVAL;
4055
4056         if (!(iommu->flags & VTD_FLAG_SVM_CAPABLE))
4057                 return -ENODEV;
4058
4059         if (!info->pasid_enabled || !info->ats_enabled)
4060                 return -EINVAL;
4061
4062         /*
4063          * Devices with device-specific I/O fault handling are not expected
4064          * to support PCI PRI, and the IOMMU has no means to check for such
4065          * device-specific IOPF capability. Therefore the IOMMU can only
4066          * assume that if the device driver enables SVA on a non-PRI
4067          * device, it will handle I/O page faults in its own way.
4068          */
4069         if (!info->pri_supported)
4070                 return 0;
4071
4072         /* Devices supporting PRI should have it enabled. */
4073         if (!info->pri_enabled)
4074                 return -EINVAL;
4075
4076         return 0;
4077 }
4078
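     /*
      * Toggle the PRE (Page Request Enable) bit in the device's
      * scalable-mode context entry and flush the entry so the change takes
      * effect. Fails if the context was copied from a previous kernel or is
      * not present.
      */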
4079 static int context_flip_pri(struct device_domain_info *info, bool enable)
4080 {
4081         struct intel_iommu *iommu = info->iommu;
4082         u8 bus = info->bus, devfn = info->devfn;
4083         struct context_entry *context;
4084         u16 did;
4085
4086         spin_lock(&iommu->lock);
4087         if (context_copied(iommu, bus, devfn)) {
4088                 spin_unlock(&iommu->lock);
4089                 return -EINVAL;
4090         }
4091
4092         context = iommu_context_addr(iommu, bus, devfn, false);
4093         if (!context || !context_present(context)) {
4094                 spin_unlock(&iommu->lock);
4095                 return -ENODEV;
4096         }
4097         did = context_domain_id(context);
4098
4099         if (enable)
4100                 context_set_sm_pre(context);
4101         else
4102                 context_clear_sm_pre(context);
4103
4104         if (!ecap_coherent(iommu->ecap))
4105                 clflush_cache_range(context, sizeof(*context));
4106         intel_context_flush_present(info, context, did, true);
4107         spin_unlock(&iommu->lock);
4108
4109         return 0;
4110 }
4111
4112 static int intel_iommu_enable_iopf(struct device *dev)
4113 {
4114         struct pci_dev *pdev = dev_is_pci(dev) ? to_pci_dev(dev) : NULL;
4115         struct device_domain_info *info = dev_iommu_priv_get(dev);
4116         struct intel_iommu *iommu;
4117         int ret;
4118
4119         if (!pdev || !info || !info->ats_enabled || !info->pri_supported)
4120                 return -ENODEV;
4121
4122         if (info->pri_enabled)
4123                 return -EBUSY;
4124
4125         iommu = info->iommu;
4126         if (!iommu)
4127                 return -EINVAL;
4128
4129         /* PASID is required in PRG Response Message. */
4130         if (info->pasid_enabled && !pci_prg_resp_pasid_required(pdev))
4131                 return -EINVAL;
4132
4133         ret = pci_reset_pri(pdev);
4134         if (ret)
4135                 return ret;
4136
4137         ret = iopf_queue_add_device(iommu->iopf_queue, dev);
4138         if (ret)
4139                 return ret;
4140
4141         ret = context_flip_pri(info, true);
4142         if (ret)
4143                 goto err_remove_device;
4144
4145         ret = pci_enable_pri(pdev, PRQ_DEPTH);
4146         if (ret)
4147                 goto err_clear_pri;
4148
4149         info->pri_enabled = 1;
4150
4151         return 0;
4152 err_clear_pri:
4153         context_flip_pri(info, false);
4154 err_remove_device:
4155         iopf_queue_remove_device(iommu->iopf_queue, dev);
4156
4157         return ret;
4158 }
4159
4160 static int intel_iommu_disable_iopf(struct device *dev)
4161 {
4162         struct device_domain_info *info = dev_iommu_priv_get(dev);
4163         struct intel_iommu *iommu = info->iommu;
4164
4165         if (!info->pri_enabled)
4166                 return -EINVAL;
4167
4168         /* Disable new PRI reception: */
4169         context_flip_pri(info, false);
4170
4171         /*
4172          * Remove device from fault queue and acknowledge all outstanding
4173          * PRQs to the device:
4174          */
4175         iopf_queue_remove_device(iommu->iopf_queue, dev);
4176
4177         /*
4178          * The PCIe spec states that after the PRI Enable bit is cleared,
4179          * the Page Request Interface will not issue new page requests, but
4180          * may still have outstanding page requests that have been
4181          * transmitted or are queued for transmission. This is supposed to
4182          * be called after the device driver has stopped DMA, all PASIDs
4183          * have been unbound and the outstanding PRQs have been drained.
4184          */
4185         pci_disable_pri(to_pci_dev(dev));
4186         info->pri_enabled = 0;
4187
4188         return 0;
4189 }
4190
4191 static int
4192 intel_iommu_dev_enable_feat(struct device *dev, enum iommu_dev_features feat)
4193 {
4194         switch (feat) {
4195         case IOMMU_DEV_FEAT_IOPF:
4196                 return intel_iommu_enable_iopf(dev);
4197
4198         case IOMMU_DEV_FEAT_SVA:
4199                 return intel_iommu_enable_sva(dev);
4200
4201         default:
4202                 return -ENODEV;
4203         }
4204 }
4205
4206 static int
4207 intel_iommu_dev_disable_feat(struct device *dev, enum iommu_dev_features feat)
4208 {
4209         switch (feat) {
4210         case IOMMU_DEV_FEAT_IOPF:
4211                 return intel_iommu_disable_iopf(dev);
4212
4213         case IOMMU_DEV_FEAT_SVA:
4214                 return 0;
4215
4216         default:
4217                 return -ENODEV;
4218         }
4219 }
4220
4221 static bool intel_iommu_is_attach_deferred(struct device *dev)
4222 {
4223         struct device_domain_info *info = dev_iommu_priv_get(dev);
4224
4225         return translation_pre_enabled(info->iommu) && !info->domain;
4226 }
4227
4228 /*
4229  * Check that the device does not live on an external facing PCI port that is
4230  * marked as untrusted. Such devices should not be able to apply quirks and
4231  * thus not be able to bypass the IOMMU restrictions.
4232  */
4233 static bool risky_device(struct pci_dev *pdev)
4234 {
4235         if (pdev->untrusted) {
4236                 pci_info(pdev,
4237                          "Skipping IOMMU quirk for dev [%04X:%04X] on untrusted PCI link\n",
4238                          pdev->vendor, pdev->device);
4239                 pci_info(pdev, "Please check with your BIOS/Platform vendor about this\n");
4240                 return true;
4241         }
4242         return false;
4243 }
4244
4245 static int intel_iommu_iotlb_sync_map(struct iommu_domain *domain,
4246                                       unsigned long iova, size_t size)
4247 {
4248         cache_tag_flush_range_np(to_dmar_domain(domain), iova, iova + size - 1);
4249
4250         return 0;
4251 }
4252
4253 static void intel_iommu_remove_dev_pasid(struct device *dev, ioasid_t pasid,
4254                                          struct iommu_domain *domain)
4255 {
4256         struct device_domain_info *info = dev_iommu_priv_get(dev);
4257         struct dev_pasid_info *curr, *dev_pasid = NULL;
4258         struct intel_iommu *iommu = info->iommu;
4259         struct dmar_domain *dmar_domain;
4260         unsigned long flags;
4261
4262         if (domain->type == IOMMU_DOMAIN_IDENTITY) {
4263                 intel_pasid_tear_down_entry(iommu, dev, pasid, false);
4264                 return;
4265         }
4266
4267         dmar_domain = to_dmar_domain(domain);
4268         spin_lock_irqsave(&dmar_domain->lock, flags);
4269         list_for_each_entry(curr, &dmar_domain->dev_pasids, link_domain) {
4270                 if (curr->dev == dev && curr->pasid == pasid) {
4271                         list_del(&curr->link_domain);
4272                         dev_pasid = curr;
4273                         break;
4274                 }
4275         }
4276         WARN_ON_ONCE(!dev_pasid);
4277         spin_unlock_irqrestore(&dmar_domain->lock, flags);
4278
4279         cache_tag_unassign_domain(dmar_domain, dev, pasid);
4280         domain_detach_iommu(dmar_domain, iommu);
4281         intel_iommu_debugfs_remove_dev_pasid(dev_pasid);
4282         kfree(dev_pasid);
4283         intel_pasid_tear_down_entry(iommu, dev, pasid, false);
4284         intel_drain_pasid_prq(dev, pasid);
4285 }
4286
4287 static int intel_iommu_set_dev_pasid(struct iommu_domain *domain,
4288                                      struct device *dev, ioasid_t pasid)
4289 {
4290         struct device_domain_info *info = dev_iommu_priv_get(dev);
4291         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4292         struct intel_iommu *iommu = info->iommu;
4293         struct dev_pasid_info *dev_pasid;
4294         unsigned long flags;
4295         int ret;
4296
4297         if (!pasid_supported(iommu) || dev_is_real_dma_subdevice(dev))
4298                 return -EOPNOTSUPP;
4299
4300         if (domain->dirty_ops)
4301                 return -EINVAL;
4302
4303         if (context_copied(iommu, info->bus, info->devfn))
4304                 return -EBUSY;
4305
4306         ret = prepare_domain_attach_device(domain, dev);
4307         if (ret)
4308                 return ret;
4309
4310         dev_pasid = kzalloc(sizeof(*dev_pasid), GFP_KERNEL);
4311         if (!dev_pasid)
4312                 return -ENOMEM;
4313
4314         ret = domain_attach_iommu(dmar_domain, iommu);
4315         if (ret)
4316                 goto out_free;
4317
4318         ret = cache_tag_assign_domain(dmar_domain, dev, pasid);
4319         if (ret)
4320                 goto out_detach_iommu;
4321
4322         if (dmar_domain->use_first_level)
4323                 ret = domain_setup_first_level(iommu, dmar_domain,
4324                                                dev, pasid);
4325         else
4326                 ret = intel_pasid_setup_second_level(iommu, dmar_domain,
4327                                                      dev, pasid);
4328         if (ret)
4329                 goto out_unassign_tag;
4330
4331         dev_pasid->dev = dev;
4332         dev_pasid->pasid = pasid;
4333         spin_lock_irqsave(&dmar_domain->lock, flags);
4334         list_add(&dev_pasid->link_domain, &dmar_domain->dev_pasids);
4335         spin_unlock_irqrestore(&dmar_domain->lock, flags);
4336
4337         if (domain->type & __IOMMU_DOMAIN_PAGING)
4338                 intel_iommu_debugfs_create_dev_pasid(dev_pasid);
4339
4340         return 0;
4341 out_unassign_tag:
4342         cache_tag_unassign_domain(dmar_domain, dev, pasid);
4343 out_detach_iommu:
4344         domain_detach_iommu(dmar_domain, iommu);
4345 out_free:
4346         kfree(dev_pasid);
4347         return ret;
4348 }
4349
4350 static void *intel_iommu_hw_info(struct device *dev, u32 *length, u32 *type)
4351 {
4352         struct device_domain_info *info = dev_iommu_priv_get(dev);
4353         struct intel_iommu *iommu = info->iommu;
4354         struct iommu_hw_info_vtd *vtd;
4355
4356         vtd = kzalloc(sizeof(*vtd), GFP_KERNEL);
4357         if (!vtd)
4358                 return ERR_PTR(-ENOMEM);
4359
4360         vtd->flags = IOMMU_HW_INFO_VTD_ERRATA_772415_SPR17;
4361         vtd->cap_reg = iommu->cap;
4362         vtd->ecap_reg = iommu->ecap;
4363         *length = sizeof(*vtd);
4364         *type = IOMMU_HW_INFO_TYPE_INTEL_VTD;
4365         return vtd;
4366 }
4367
4368 /*
4369  * Set dirty tracking for the device list of a domain. The caller must
4370  * hold the domain->lock when calling it.
4371  */
4372 static int device_set_dirty_tracking(struct list_head *devices, bool enable)
4373 {
4374         struct device_domain_info *info;
4375         int ret = 0;
4376
4377         list_for_each_entry(info, devices, link) {
4378                 ret = intel_pasid_setup_dirty_tracking(info->iommu, info->dev,
4379                                                        IOMMU_NO_PASID, enable);
4380                 if (ret)
4381                         break;
4382         }
4383
4384         return ret;
4385 }
4386
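     /*
      * Propagate a dirty-tracking change on a nested parent (second-stage)
      * domain to the devices of every first-stage domain nested on it,
      * unwinding to the previous state on failure.
      */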
4387 static int parent_domain_set_dirty_tracking(struct dmar_domain *domain,
4388                                             bool enable)
4389 {
4390         struct dmar_domain *s1_domain;
4391         unsigned long flags;
4392         int ret;
4393
4394         spin_lock(&domain->s1_lock);
4395         list_for_each_entry(s1_domain, &domain->s1_domains, s2_link) {
4396                 spin_lock_irqsave(&s1_domain->lock, flags);
4397                 ret = device_set_dirty_tracking(&s1_domain->devices, enable);
4398                 spin_unlock_irqrestore(&s1_domain->lock, flags);
4399                 if (ret)
4400                         goto err_unwind;
4401         }
4402         spin_unlock(&domain->s1_lock);
4403         return 0;
4404
4405 err_unwind:
4406         list_for_each_entry(s1_domain, &domain->s1_domains, s2_link) {
4407                 spin_lock_irqsave(&s1_domain->lock, flags);
4408                 device_set_dirty_tracking(&s1_domain->devices,
4409                                           domain->dirty_tracking);
4410                 spin_unlock_irqrestore(&s1_domain->lock, flags);
4411         }
4412         spin_unlock(&domain->s1_lock);
4413         return ret;
4414 }
4415
4416 static int intel_iommu_set_dirty_tracking(struct iommu_domain *domain,
4417                                           bool enable)
4418 {
4419         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4420         int ret;
4421
4422         spin_lock(&dmar_domain->lock);
4423         if (dmar_domain->dirty_tracking == enable)
4424                 goto out_unlock;
4425
4426         ret = device_set_dirty_tracking(&dmar_domain->devices, enable);
4427         if (ret)
4428                 goto err_unwind;
4429
4430         if (dmar_domain->nested_parent) {
4431                 ret = parent_domain_set_dirty_tracking(dmar_domain, enable);
4432                 if (ret)
4433                         goto err_unwind;
4434         }
4435
4436         dmar_domain->dirty_tracking = enable;
4437 out_unlock:
4438         spin_unlock(&dmar_domain->lock);
4439
4440         return 0;
4441
4442 err_unwind:
4443         device_set_dirty_tracking(&dmar_domain->devices,
4444                                   dmar_domain->dirty_tracking);
4445         spin_unlock(&dmar_domain->lock);
4446         return ret;
4447 }
4448
4449 static int intel_iommu_read_and_clear_dirty(struct iommu_domain *domain,
4450                                             unsigned long iova, size_t size,
4451                                             unsigned long flags,
4452                                             struct iommu_dirty_bitmap *dirty)
4453 {
4454         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4455         unsigned long end = iova + size - 1;
4456         unsigned long pgsize;
4457
4458         /*
4459          * The IOMMUFD core calls into a dirty-tracking-disabled domain without
4460          * an IOVA bitmap set in order to clear any dirty bits that may have
4461          * been set in the PTEs while dirty tracking was being stopped. This
4462          * ensures that we never inherit dirty bits from a previous cycle.
4463          */
4464         if (!dmar_domain->dirty_tracking && dirty->bitmap)
4465                 return -EINVAL;
4466
4467         do {
4468                 struct dma_pte *pte;
4469                 int lvl = 0;
4470
4471                 pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &lvl,
4472                                      GFP_ATOMIC);
4473                 pgsize = level_size(lvl) << VTD_PAGE_SHIFT;
4474                 if (!pte || !dma_pte_present(pte)) {
4475                         iova += pgsize;
4476                         continue;
4477                 }
4478
4479                 if (dma_sl_pte_test_and_clear_dirty(pte, flags))
4480                         iommu_dirty_bitmap_record(dirty, iova, pgsize);
4481                 iova += pgsize;
4482         } while (iova < end);
4483
4484         return 0;
4485 }
4486
4487 static const struct iommu_dirty_ops intel_dirty_ops = {
4488         .set_dirty_tracking = intel_iommu_set_dirty_tracking,
4489         .read_and_clear_dirty = intel_iommu_read_and_clear_dirty,
4490 };
4491
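/*
 * Hedged usage sketch: these ops are attached to user domains allocated with
 * dirty tracking requested (e.g. IOMMU_HWPT_ALLOC_DIRTY_TRACKING), and the
 * iommufd core is their expected consumer. Helper and struct names below are
 * from include/linux/iommu.h; the exact call sites live under
 * drivers/iommu/iommufd and may differ:
 *
 *	struct iommu_iotlb_gather gather;
 *	struct iommu_dirty_bitmap dirty;
 *
 *	domain->dirty_ops->set_dirty_tracking(domain, true);
 *
 *	iommu_iotlb_gather_init(&gather);
 *	iommu_dirty_bitmap_init(&dirty, iova_bitmap, &gather);	// iova_bitmap: caller-provided struct iova_bitmap *
 *	domain->dirty_ops->read_and_clear_dirty(domain, iova, size, 0, &dirty);
 *	iommu_iotlb_sync(domain, &gather);
 *
 * When tracking is turned off, the core also makes a bitmap-less
 * read_and_clear pass (dirty.bitmap == NULL) so that stale dirty bits in the
 * PTEs are not inherited by the next enable cycle, which is why
 * intel_iommu_read_and_clear_dirty() above accepts that combination.
 */
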
4492 static int context_setup_pass_through(struct device *dev, u8 bus, u8 devfn)
4493 {
4494         struct device_domain_info *info = dev_iommu_priv_get(dev);
4495         struct intel_iommu *iommu = info->iommu;
4496         struct context_entry *context;
4497
4498         spin_lock(&iommu->lock);
4499         context = iommu_context_addr(iommu, bus, devfn, 1);
4500         if (!context) {
4501                 spin_unlock(&iommu->lock);
4502                 return -ENOMEM;
4503         }
4504
4505         if (context_present(context) && !context_copied(iommu, bus, devfn)) {
4506                 spin_unlock(&iommu->lock);
4507                 return 0;
4508         }
4509
4510         copied_context_tear_down(iommu, context, bus, devfn);
4511         context_clear_entry(context);
4512         context_set_domain_id(context, FLPT_DEFAULT_DID);
4513
4514         /*
4515          * In pass-through mode, AW must be programmed with the largest AGAW
4516          * value supported by the hardware; ASR is ignored by the hardware.
4517          */
4518         context_set_address_width(context, iommu->msagaw);
4519         context_set_translation_type(context, CONTEXT_TT_PASS_THROUGH);
4520         context_set_fault_enable(context);
4521         context_set_present(context);
4522         if (!ecap_coherent(iommu->ecap))
4523                 clflush_cache_range(context, sizeof(*context));
4524         context_present_cache_flush(iommu, FLPT_DEFAULT_DID, bus, devfn);
4525         spin_unlock(&iommu->lock);
4526
4527         return 0;
4528 }
4529
4530 static int context_setup_pass_through_cb(struct pci_dev *pdev, u16 alias, void *data)
4531 {
4532         struct device *dev = data;
4533
4534         if (dev != &pdev->dev)
4535                 return 0;
4536
4537         return context_setup_pass_through(dev, PCI_BUS_NUM(alias), alias & 0xff);
4538 }
4539
4540 static int device_setup_pass_through(struct device *dev)
4541 {
4542         struct device_domain_info *info = dev_iommu_priv_get(dev);
4543
4544         if (!dev_is_pci(dev))
4545                 return context_setup_pass_through(dev, info->bus, info->devfn);
4546
4547         return pci_for_each_dma_alias(to_pci_dev(dev),
4548                                       context_setup_pass_through_cb, dev);
4549 }
4550
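/*
 * Hedged illustration: for a PCI device at, say, 0000:02:00.0,
 * pci_for_each_dma_alias() first reports the device's own RID with
 * pdev == the device, so the callback above reduces to roughly:
 *
 *	u16 alias = PCI_DEVID(2, PCI_DEVFN(0, 0));
 *
 *	context_setup_pass_through(dev, PCI_BUS_NUM(alias), alias & 0xff);
 *
 * Further callbacks may follow for quirk DMA aliases of the same device.
 * Alias RIDs attributed to upstream bridges arrive with a different pdev
 * and are filtered out by the dev != &pdev->dev check in the callback.
 */
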
4551 static int identity_domain_attach_dev(struct iommu_domain *domain, struct device *dev)
4552 {
4553         struct device_domain_info *info = dev_iommu_priv_get(dev);
4554         struct intel_iommu *iommu = info->iommu;
4555         int ret;
4556
4557         device_block_translation(dev);
4558
4559         if (dev_is_real_dma_subdevice(dev))
4560                 return 0;
4561
4562         if (sm_supported(iommu)) {
4563                 ret = intel_pasid_setup_pass_through(iommu, dev, IOMMU_NO_PASID);
4564                 if (!ret)
4565                         iommu_enable_pci_caps(info);
4566         } else {
4567                 ret = device_setup_pass_through(dev);
4568         }
4569
4570         return ret;
4571 }
4572
4573 static int identity_domain_set_dev_pasid(struct iommu_domain *domain,
4574                                          struct device *dev, ioasid_t pasid)
4575 {
4576         struct device_domain_info *info = dev_iommu_priv_get(dev);
4577         struct intel_iommu *iommu = info->iommu;
4578
4579         if (!pasid_supported(iommu) || dev_is_real_dma_subdevice(dev))
4580                 return -EOPNOTSUPP;
4581
4582         return intel_pasid_setup_pass_through(iommu, dev, pasid);
4583 }
4584
4585 static struct iommu_domain identity_domain = {
4586         .type = IOMMU_DOMAIN_IDENTITY,
4587         .ops = &(const struct iommu_domain_ops) {
4588                 .attach_dev     = identity_domain_attach_dev,
4589                 .set_dev_pasid  = identity_domain_set_dev_pasid,
4590         },
4591 };
4592
4593 const struct iommu_ops intel_iommu_ops = {
4594         .blocked_domain         = &blocking_domain,
4595         .release_domain         = &blocking_domain,
4596         .identity_domain        = &identity_domain,
4597         .capable                = intel_iommu_capable,
4598         .hw_info                = intel_iommu_hw_info,
4599         .domain_alloc           = intel_iommu_domain_alloc,
4600         .domain_alloc_user      = intel_iommu_domain_alloc_user,
4601         .domain_alloc_sva       = intel_svm_domain_alloc,
4602         .probe_device           = intel_iommu_probe_device,
4603         .release_device         = intel_iommu_release_device,
4604         .get_resv_regions       = intel_iommu_get_resv_regions,
4605         .device_group           = intel_iommu_device_group,
4606         .dev_enable_feat        = intel_iommu_dev_enable_feat,
4607         .dev_disable_feat       = intel_iommu_dev_disable_feat,
4608         .is_attach_deferred     = intel_iommu_is_attach_deferred,
4609         .def_domain_type        = device_def_domain_type,
4610         .remove_dev_pasid       = intel_iommu_remove_dev_pasid,
4611         .pgsize_bitmap          = SZ_4K,
4612 #ifdef CONFIG_INTEL_IOMMU_SVM
4613         .page_response          = intel_svm_page_response,
4614 #endif
4615         .default_domain_ops = &(const struct iommu_domain_ops) {
4616                 .attach_dev             = intel_iommu_attach_device,
4617                 .set_dev_pasid          = intel_iommu_set_dev_pasid,
4618                 .map_pages              = intel_iommu_map_pages,
4619                 .unmap_pages            = intel_iommu_unmap_pages,
4620                 .iotlb_sync_map         = intel_iommu_iotlb_sync_map,
4621                 .flush_iotlb_all        = intel_flush_iotlb_all,
4622                 .iotlb_sync             = intel_iommu_tlb_sync,
4623                 .iova_to_phys           = intel_iommu_iova_to_phys,
4624                 .free                   = intel_iommu_domain_free,
4625                 .enforce_cache_coherency = intel_iommu_enforce_cache_coherency,
4626         }
4627 };
4628
4629 static void quirk_iommu_igfx(struct pci_dev *dev)
4630 {
4631         if (risky_device(dev))
4632                 return;
4633
4634         pci_info(dev, "Disabling IOMMU for graphics on this chipset\n");
4635         disable_igfx_iommu = 1;
4636 }
4637
4638 /* G4x/GM45 integrated gfx dmar support is totally busted. */
4639 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_igfx);
4640 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_igfx);
4641 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_igfx);
4642 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_igfx);
4643 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_igfx);
4644 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_igfx);
4645 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_igfx);
4646
4647 /* Broadwell igfx malfunctions with dmar */
4648 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1606, quirk_iommu_igfx);
4649 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160B, quirk_iommu_igfx);
4650 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160E, quirk_iommu_igfx);
4651 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1602, quirk_iommu_igfx);
4652 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160A, quirk_iommu_igfx);
4653 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160D, quirk_iommu_igfx);
4654 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1616, quirk_iommu_igfx);
4655 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161B, quirk_iommu_igfx);
4656 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161E, quirk_iommu_igfx);
4657 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1612, quirk_iommu_igfx);
4658 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161A, quirk_iommu_igfx);
4659 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161D, quirk_iommu_igfx);
4660 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1626, quirk_iommu_igfx);
4661 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162B, quirk_iommu_igfx);
4662 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162E, quirk_iommu_igfx);
4663 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1622, quirk_iommu_igfx);
4664 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162A, quirk_iommu_igfx);
4665 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162D, quirk_iommu_igfx);
4666 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1636, quirk_iommu_igfx);
4667 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163B, quirk_iommu_igfx);
4668 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163E, quirk_iommu_igfx);
4669 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1632, quirk_iommu_igfx);
4670 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163A, quirk_iommu_igfx);
4671 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163D, quirk_iommu_igfx);
4672
4673 static void quirk_iommu_rwbf(struct pci_dev *dev)
4674 {
4675         if (risky_device(dev))
4676                 return;
4677
4678         /*
4679          * The Mobile 4 Series Chipset neglects to set the RWBF capability
4680          * but needs it. The same seems to hold for the desktop versions.
4681          */
4682         pci_info(dev, "Forcing write-buffer flush capability\n");
4683         rwbf_quirk = 1;
4684 }
4685
4686 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf);
4687 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_rwbf);
4688 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_rwbf);
4689 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_rwbf);
4690 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_rwbf);
4691 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_rwbf);
4692 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_rwbf);
4693
4694 #define GGC 0x52
4695 #define GGC_MEMORY_SIZE_MASK    (0xf << 8)
4696 #define GGC_MEMORY_SIZE_NONE    (0x0 << 8)
4697 #define GGC_MEMORY_SIZE_1M      (0x1 << 8)
4698 #define GGC_MEMORY_SIZE_2M      (0x3 << 8)
4699 #define GGC_MEMORY_VT_ENABLED   (0x8 << 8)
4700 #define GGC_MEMORY_SIZE_2M_VT   (0x9 << 8)
4701 #define GGC_MEMORY_SIZE_3M_VT   (0xa << 8)
4702 #define GGC_MEMORY_SIZE_4M_VT   (0xb << 8)
4703
4704 static void quirk_calpella_no_shadow_gtt(struct pci_dev *dev)
4705 {
4706         unsigned short ggc;
4707
4708         if (risky_device(dev))
4709                 return;
4710
4711         if (pci_read_config_word(dev, GGC, &ggc))
4712                 return;
4713
4714         if (!(ggc & GGC_MEMORY_VT_ENABLED)) {
4715                 pci_info(dev, "BIOS has allocated no shadow GTT; disabling IOMMU for graphics\n");
4716                 disable_igfx_iommu = 1;
4717         } else if (!disable_igfx_iommu) {
4718                 /* we have to ensure the gfx device is idle before we flush */
4719                 pci_info(dev, "Disabling batched IOTLB flush on Ironlake\n");
4720                 iommu_set_dma_strict();
4721         }
4722 }
4723 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0040, quirk_calpella_no_shadow_gtt);
4724 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0044, quirk_calpella_no_shadow_gtt);
4725 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0062, quirk_calpella_no_shadow_gtt);
4726 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x006a, quirk_calpella_no_shadow_gtt);
4727
4728 static void quirk_igfx_skip_te_disable(struct pci_dev *dev)
4729 {
4730         unsigned short ver;
4731
4732         if (!IS_GFX_DEVICE(dev))
4733                 return;
4734
4735         ver = (dev->device >> 8) & 0xff;
4736         if (ver != 0x45 && ver != 0x46 && ver != 0x4c &&
4737             ver != 0x4e && ver != 0x8a && ver != 0x98 &&
4738             ver != 0x9a && ver != 0xa7 && ver != 0x7d)
4739                 return;
4740
4741         if (risky_device(dev))
4742                 return;
4743
4744         pci_info(dev, "Skip IOMMU disabling for graphics\n");
4745         iommu_skip_te_disable = 1;
4746 }
4747 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_ANY_ID, quirk_igfx_skip_te_disable);
4748
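/*
 * Hedged illustration of the generation check above: @ver is the upper byte
 * of the PCI device ID, so for example an integrated GPU with device ID
 * 0x9a49 (assumed Tiger Lake part) yields:
 *
 *	unsigned short ver = (0x9a49 >> 8) & 0xff;	// 0x9a, matched above
 */
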
4749 /* On Tylersburg chipsets, some BIOSes have been known to enable the
4750    ISOCH DMAR unit for the Azalia sound device, but not give it any
4751    TLB entries, which causes it to deadlock. Check for that.  We do
4752    this in a function called from init_dmars(), instead of in a PCI
4753    quirk, because we don't want to print the obnoxious "BIOS broken"
4754    message if VT-d is actually disabled.
4755 */
4756 static void __init check_tylersburg_isoch(void)
4757 {
4758         struct pci_dev *pdev;
4759         uint32_t vtisochctrl;
4760
4761         /* If there's no Azalia in the system anyway, forget it. */
4762         pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3a3e, NULL);
4763         if (!pdev)
4764                 return;
4765
4766         if (risky_device(pdev)) {
4767                 pci_dev_put(pdev);
4768                 return;
4769         }
4770
4771         pci_dev_put(pdev);
4772
4773         /* System Management Registers. Might be hidden, in which case
4774            we can't do the sanity check. But that's OK, because the
4775            known-broken BIOSes _don't_ actually hide it, so far. */
4776         pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x342e, NULL);
4777         if (!pdev)
4778                 return;
4779
4780         if (risky_device(pdev)) {
4781                 pci_dev_put(pdev);
4782                 return;
4783         }
4784
4785         if (pci_read_config_dword(pdev, 0x188, &vtisochctrl)) {
4786                 pci_dev_put(pdev);
4787                 return;
4788         }
4789
4790         pci_dev_put(pdev);
4791
4792         /* If Azalia DMA is routed to the non-isoch DMAR unit, fine. */
4793         if (vtisochctrl & 1)
4794                 return;
4795
4796         /* Drop all bits other than the number of TLB entries */
4797         vtisochctrl &= 0x1c;
4798
4799         /* If we have the recommended number of TLB entries (16), fine. */
4800         if (vtisochctrl == 0x10)
4801                 return;
4802
4803         /* Zero TLB entries is broken; warn and force identity mapping for Azalia. */
4804         if (!vtisochctrl) {
4805                 WARN(1, "Your BIOS is broken; DMA routed to ISOCH DMAR unit but no TLB space.\n"
4806                      "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
4807                      dmi_get_system_info(DMI_BIOS_VENDOR),
4808                      dmi_get_system_info(DMI_BIOS_VERSION),
4809                      dmi_get_system_info(DMI_PRODUCT_VERSION));
4810                 iommu_identity_mapping |= IDENTMAP_AZALIA;
4811                 return;
4812         }
4813
4814         pr_warn("Recommended TLB entries for ISOCH unit is 16; your BIOS set %d\n",
4815                vtisochctrl);
4816 }
4817
4818 /*
4819  * Here we deal with a device TLB defect where the device may inadvertently
4820  * issue an ATS invalidation completion before posted writes that were
4821  * initiated with a translated address and used translations matching the
4822  * invalidation address range, violating the invalidation completion ordering.
4823  * Therefore, any use case that cannot guarantee DMA is stopped before unmap
4824  * is vulnerable to this defect. In other words, any dTLB invalidation
4825  * initiated outside the control of the trusted/privileged host device driver
4826  * must use this quirk.
4827  * Device TLBs are invalidated under the following six conditions:
4828  * 1. Device driver does a DMA API unmap of an IOVA
4829  * 2. Device driver unbinds a PASID from a process, sva_unbind_device()
4830  * 3. A PASID is torn down after the PASID cache is flushed, e.g. process
4831  *    exit_mmap() due to a crash
4832  * 4. Under SVA usage, invoked by mmu_notifier.invalidate_range() when the
4833  *    VM has to free pages that were unmapped
4834  * 5. A userspace driver unmaps a DMA buffer
4835  * 6. Cache invalidation in vSVA usage (upcoming)
4836  *
4837  * For #1 and #2, device drivers are responsible for stopping DMA traffic
4838  * before unmap/unbind. For #3, the iommu driver gets the mmu_notifier to
4839  * invalidate the TLB as for a normal user unmap, which will use this quirk.
4840  * The dTLB invalidation after the PASID cache flush does not need this quirk.
4841  *
4842  * As a reminder, #6 will *NEED* this quirk as we enable nested translation.
4843  */
4844 void quirk_extra_dev_tlb_flush(struct device_domain_info *info,
4845                                unsigned long address, unsigned long mask,
4846                                u32 pasid, u16 qdep)
4847 {
4848         u16 sid;
4849
4850         if (likely(!info->dtlb_extra_inval))
4851                 return;
4852
4853         sid = PCI_DEVID(info->bus, info->devfn);
4854         if (pasid == IOMMU_NO_PASID) {
4855                 qi_flush_dev_iotlb(info->iommu, sid, info->pfsid,
4856                                    qdep, address, mask);
4857         } else {
4858                 qi_flush_dev_iotlb_pasid(info->iommu, sid, info->pfsid,
4859                                          pasid, qdep, address, mask);
4860         }
4861 }
4862
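/*
 * Hedged usage sketch: callers in the cache-invalidation paths issue this
 * extra flush right after the normal device-TLB flush and mirror its
 * arguments; @mask is the page-order of the (aligned, power-of-two) range,
 * roughly:
 *
 *	unsigned long mask = ilog2(__roundup_pow_of_two(nrpages));
 *	u16 sid = PCI_DEVID(info->bus, info->devfn);
 *	u16 qdep = info->ats_qdep;
 *
 *	qi_flush_dev_iotlb(info->iommu, sid, info->pfsid, qdep, addr, mask);
 *	quirk_extra_dev_tlb_flush(info, addr, mask, IOMMU_NO_PASID, qdep);
 *
 * Here nrpages/addr stand for the page count and aligned start of the
 * flushed range; the exact helpers used to compute them vary by call site.
 */
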
4863 #define ecmd_get_status_code(res)       (((res) & 0xff) >> 1)
4864
4865 /*
4866  * Function to submit a command to the enhanced command interface. The
4867  * valid enhanced command descriptions are defined in Table 47 of the
4868  * VT-d spec. The VT-d hardware implementation may support some but not
4869  * all commands, which can be determined by checking the Enhanced
4870  * Command Capability Register.
4871  *
4872  * Return values:
4873  *  - 0: Command successful without any error;
4874  *  - Negative: software error value;
4875  *  - Nonzero positive: failure status code defined in Table 48.
4876  */
4877 int ecmd_submit_sync(struct intel_iommu *iommu, u8 ecmd, u64 oa, u64 ob)
4878 {
4879         unsigned long flags;
4880         u64 res;
4881         int ret;
4882
4883         if (!cap_ecmds(iommu->cap))
4884                 return -ENODEV;
4885
4886         raw_spin_lock_irqsave(&iommu->register_lock, flags);
4887
4888         res = dmar_readq(iommu->reg + DMAR_ECRSP_REG);
4889         if (res & DMA_ECMD_ECRSP_IP) {
4890                 ret = -EBUSY;
4891                 goto err;
4892         }
4893
4894         /*
4895          * Unconditionally write operand B, because:
4896          * - There is no side effect if an ecmd doesn't require operand B
4897          *   but the register is still set to some value.
4898          * - This is not invoked in any critical path, so the extra MMIO
4899          *   write raises no performance concerns.
4900          */
4901         dmar_writeq(iommu->reg + DMAR_ECEO_REG, ob);
4902         dmar_writeq(iommu->reg + DMAR_ECMD_REG, ecmd | (oa << DMA_ECMD_OA_SHIFT));
4903
4904         IOMMU_WAIT_OP(iommu, DMAR_ECRSP_REG, dmar_readq,
4905                       !(res & DMA_ECMD_ECRSP_IP), res);
4906
4907         if (res & DMA_ECMD_ECRSP_IP) {
4908                 ret = -ETIMEDOUT;
4909                 goto err;
4910         }
4911
4912         ret = ecmd_get_status_code(res);
4913 err:
4914         raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
4915
4916         return ret;
4917 }
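
/*
 * Hedged usage sketch: the perfmon code is one in-tree caller, roughly
 * freezing and unfreezing the IOMMU PMU counters through this interface
 * (DMA_ECMD_* encodings come from iommu.h; exact call sites may differ):
 *
 *	int ret = ecmd_submit_sync(iommu, DMA_ECMD_FREEZE, 0, 0);
 *
 *	if (ret)	// negative: software error; positive: Table 48 status
 *		pr_warn("ecmd freeze failed: %d\n", ret);
 *
 * Callers must also tolerate -ENODEV when cap_ecmds() is clear.
 */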