1 // SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright © 2006-2014 Intel Corporation.
 *
 * Authors: David Woodhouse <dwmw2@infradead.org>,
 *          Ashok Raj <ashok.raj@intel.com>,
 *          Shaohua Li <shaohua.li@intel.com>,
 *          Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>,
 *          Fenghua Yu <fenghua.yu@intel.com>
 *          Joerg Roedel <jroedel@suse.de>
 */
13 #define pr_fmt(fmt) "DMAR: " fmt
14 #define dev_fmt(fmt) pr_fmt(fmt)
16 #include <linux/crash_dump.h>
17 #include <linux/dma-direct.h>
18 #include <linux/dmi.h>
19 #include <linux/memory.h>
20 #include <linux/pci.h>
21 #include <linux/pci-ats.h>
22 #include <linux/spinlock.h>
23 #include <linux/syscore_ops.h>
24 #include <linux/tboot.h>
25 #include <uapi/linux/iommufd.h>
28 #include "../dma-iommu.h"
29 #include "../irq_remapping.h"
30 #include "../iommu-pages.h"
32 #include "cap_audit.h"
35 #define ROOT_SIZE VTD_PAGE_SIZE
36 #define CONTEXT_SIZE VTD_PAGE_SIZE
38 #define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
39 #define IS_USB_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_SERIAL_USB)
40 #define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
41 #define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e)
43 #define IOAPIC_RANGE_START (0xfee00000)
44 #define IOAPIC_RANGE_END (0xfeefffff)
45 #define IOVA_START_ADDR (0x1000)
47 #define DEFAULT_DOMAIN_ADDRESS_WIDTH 57
49 #define __DOMAIN_MAX_PFN(gaw) ((((uint64_t)1) << ((gaw) - VTD_PAGE_SHIFT)) - 1)
50 #define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << (gaw)) - 1)
52 /* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR
53 to match. That way, we can use 'unsigned long' for PFNs with impunity. */
54 #define DOMAIN_MAX_PFN(gaw) ((unsigned long) min_t(uint64_t, \
55 __DOMAIN_MAX_PFN(gaw), (unsigned long)-1))
56 #define DOMAIN_MAX_ADDR(gaw) (((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT)
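/*
 * Worked example (for illustration only): with a guest address width of
 * 48 bits and VTD_PAGE_SHIFT == 12,
 *
 *	__DOMAIN_MAX_PFN(48)  == (1ULL << 36) - 1
 *	__DOMAIN_MAX_ADDR(48) == (1ULL << 48) - 1
 *
 * On a 64-bit build DOMAIN_MAX_PFN(48) keeps that value; on a 32-bit build
 * it is clamped to ULONG_MAX so that PFN arithmetic can safely stay in
 * 'unsigned long', which is exactly what the comment above promises.
 */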
58 static void __init check_tylersburg_isoch(void);
59 static int rwbf_quirk;
/*
 * set to 1 to panic kernel if can't successfully enable VT-d
 * (used when kernel is launched w/ TXT)
 */
65 static int force_on = 0;
66 static int intel_iommu_tboot_noforce;
67 static int no_platform_optin;
69 #define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
/*
 * Take a root_entry and return the Lower Context Table Pointer (LCTP)
 * if marked present.
 */
static phys_addr_t root_entry_lctp(struct root_entry *re)
{
	if (!(re->lo & 1))
		return 0;

	return re->lo & VTD_PAGE_MASK;
}

/*
 * Take a root_entry and return the Upper Context Table Pointer (UCTP)
 * if marked present.
 */
static phys_addr_t root_entry_uctp(struct root_entry *re)
{
	if (!(re->hi & 1))
		return 0;

	return re->hi & VTD_PAGE_MASK;
}
static int device_rid_cmp_key(const void *key, const struct rb_node *node)
{
	struct device_domain_info *info =
		rb_entry(node, struct device_domain_info, node);
	const u16 *rid_lhs = key;

	if (*rid_lhs < PCI_DEVID(info->bus, info->devfn))
		return -1;

	if (*rid_lhs > PCI_DEVID(info->bus, info->devfn))
		return 1;

	return 0;
}

static int device_rid_cmp(struct rb_node *lhs, const struct rb_node *rhs)
{
	struct device_domain_info *info =
		rb_entry(lhs, struct device_domain_info, node);
	u16 key = PCI_DEVID(info->bus, info->devfn);

	return device_rid_cmp_key(&key, rhs);
}
/*
 * Looks up an IOMMU-probed device using its source ID.
 *
 * Returns the pointer to the device if there is a match. Otherwise,
 * returns NULL.
 *
 * Note that this helper doesn't guarantee that the device won't be
 * released by the iommu subsystem after being returned. The caller
 * should use its own synchronization mechanism to avoid the device
 * being released during its use if that is possibly the case.
 */
struct device *device_rbtree_find(struct intel_iommu *iommu, u16 rid)
{
	struct device_domain_info *info = NULL;
	struct rb_node *node;
	unsigned long flags;

	spin_lock_irqsave(&iommu->device_rbtree_lock, flags);
	node = rb_find(&rid, &iommu->device_rbtree, device_rid_cmp_key);
	if (node)
		info = rb_entry(node, struct device_domain_info, node);
	spin_unlock_irqrestore(&iommu->device_rbtree_lock, flags);

	return info ? info->dev : NULL;
}
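/*
 * Example (sketch only, not part of the driver): a caller that has extracted
 * a source ID from a fault record could resolve it to a device like this,
 * keeping the lifetime caveat above in mind:
 *
 *	struct device *dev = device_rbtree_find(iommu, source_id);
 *
 *	if (dev)
 *		dev_warn(dev, "DMA fault reported for this device\n");
 */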
static int device_rbtree_insert(struct intel_iommu *iommu,
				struct device_domain_info *info)
{
	struct rb_node *curr;
	unsigned long flags;

	spin_lock_irqsave(&iommu->device_rbtree_lock, flags);
	curr = rb_find_add(&info->node, &iommu->device_rbtree, device_rid_cmp);
	spin_unlock_irqrestore(&iommu->device_rbtree_lock, flags);
	if (WARN_ON(curr))
		return -EEXIST;

	return 0;
}

static void device_rbtree_remove(struct device_domain_info *info)
{
	struct intel_iommu *iommu = info->iommu;
	unsigned long flags;

	spin_lock_irqsave(&iommu->device_rbtree_lock, flags);
	rb_erase(&info->node, &iommu->device_rbtree);
	spin_unlock_irqrestore(&iommu->device_rbtree_lock, flags);
}
170 struct dmar_rmrr_unit {
171 struct list_head list; /* list of rmrr units */
172 struct acpi_dmar_header *hdr; /* ACPI header */
173 u64 base_address; /* reserved base address*/
174 u64 end_address; /* reserved end address */
175 struct dmar_dev_scope *devices; /* target devices */
176 int devices_cnt; /* target device count */
179 struct dmar_atsr_unit {
180 struct list_head list; /* list of ATSR units */
181 struct acpi_dmar_header *hdr; /* ACPI header */
182 struct dmar_dev_scope *devices; /* target devices */
183 int devices_cnt; /* target device count */
184 u8 include_all:1; /* include all ports */
187 struct dmar_satc_unit {
188 struct list_head list; /* list of SATC units */
189 struct acpi_dmar_header *hdr; /* ACPI header */
190 struct dmar_dev_scope *devices; /* target devices */
191 struct intel_iommu *iommu; /* the corresponding iommu */
192 int devices_cnt; /* target device count */
193 u8 atc_required:1; /* ATS is required */
196 static LIST_HEAD(dmar_atsr_units);
197 static LIST_HEAD(dmar_rmrr_units);
198 static LIST_HEAD(dmar_satc_units);
200 #define for_each_rmrr_units(rmrr) \
201 list_for_each_entry(rmrr, &dmar_rmrr_units, list)
203 static void intel_iommu_domain_free(struct iommu_domain *domain);
205 int dmar_disabled = !IS_ENABLED(CONFIG_INTEL_IOMMU_DEFAULT_ON);
206 int intel_iommu_sm = IS_ENABLED(CONFIG_INTEL_IOMMU_SCALABLE_MODE_DEFAULT_ON);
208 int intel_iommu_enabled = 0;
209 EXPORT_SYMBOL_GPL(intel_iommu_enabled);
211 static int intel_iommu_superpage = 1;
212 static int iommu_identity_mapping;
213 static int iommu_skip_te_disable;
214 static int disable_igfx_iommu;
216 #define IDENTMAP_AZALIA 4
218 const struct iommu_ops intel_iommu_ops;
219 static const struct iommu_dirty_ops intel_dirty_ops;
221 static bool translation_pre_enabled(struct intel_iommu *iommu)
223 return (iommu->flags & VTD_FLAG_TRANS_PRE_ENABLED);
226 static void clear_translation_pre_enabled(struct intel_iommu *iommu)
228 iommu->flags &= ~VTD_FLAG_TRANS_PRE_ENABLED;
231 static void init_translation_status(struct intel_iommu *iommu)
235 gsts = readl(iommu->reg + DMAR_GSTS_REG);
236 if (gsts & DMA_GSTS_TES)
237 iommu->flags |= VTD_FLAG_TRANS_PRE_ENABLED;
240 static int __init intel_iommu_setup(char *str)
246 if (!strncmp(str, "on", 2)) {
248 pr_info("IOMMU enabled\n");
249 } else if (!strncmp(str, "off", 3)) {
251 no_platform_optin = 1;
252 pr_info("IOMMU disabled\n");
253 } else if (!strncmp(str, "igfx_off", 8)) {
254 disable_igfx_iommu = 1;
255 pr_info("Disable GFX device mapping\n");
256 } else if (!strncmp(str, "forcedac", 8)) {
257 pr_warn("intel_iommu=forcedac deprecated; use iommu.forcedac instead\n");
258 iommu_dma_forcedac = true;
259 } else if (!strncmp(str, "strict", 6)) {
260 pr_warn("intel_iommu=strict deprecated; use iommu.strict=1 instead\n");
261 iommu_set_dma_strict();
262 } else if (!strncmp(str, "sp_off", 6)) {
263 pr_info("Disable supported super page\n");
264 intel_iommu_superpage = 0;
265 } else if (!strncmp(str, "sm_on", 5)) {
266 pr_info("Enable scalable mode if hardware supports\n");
268 } else if (!strncmp(str, "sm_off", 6)) {
269 pr_info("Scalable mode is disallowed\n");
271 } else if (!strncmp(str, "tboot_noforce", 13)) {
272 pr_info("Intel-IOMMU: not forcing on after tboot. This could expose security risk for tboot\n");
273 intel_iommu_tboot_noforce = 1;
275 pr_notice("Unknown option - '%s'\n", str);
278 str += strcspn(str, ",");
285 __setup("intel_iommu=", intel_iommu_setup);
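/*
 * Example (documentation only): the parser above takes a comma-separated
 * option list on the kernel command line, e.g.
 *
 *	intel_iommu=on,sm_on,sp_off
 *
 * which enables the IOMMU, requests scalable mode if the hardware supports
 * it, and disables superpage use.
 */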
287 static int domain_pfn_supported(struct dmar_domain *domain, unsigned long pfn)
289 int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
291 return !(addr_width < BITS_PER_LONG && pfn >> addr_width);
/*
 * Calculate the Supported Adjusted Guest Address Widths of an IOMMU.
 * Refer to 11.4.2 of the VT-d spec for the encoding of each bit of
 * the returned SAGAW.
 */
299 static unsigned long __iommu_calculate_sagaw(struct intel_iommu *iommu)
301 unsigned long fl_sagaw, sl_sagaw;
303 fl_sagaw = BIT(2) | (cap_fl5lp_support(iommu->cap) ? BIT(3) : 0);
304 sl_sagaw = cap_sagaw(iommu->cap);
	/* Second level only. */
	if (!sm_supported(iommu) || !ecap_flts(iommu->ecap))
		return sl_sagaw;

	/* First level only. */
	if (!ecap_slts(iommu->ecap))
		return fl_sagaw;

	return fl_sagaw & sl_sagaw;
}
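/*
 * Example (per the SAGAW encoding in section 11.4.2 of the VT-d spec):
 * BIT(2) corresponds to 4-level (48-bit) and BIT(3) to 5-level (57-bit)
 * page tables. A scalable-mode IOMMU that supports both translation types
 * but lacks cap_fl5lp_support() therefore ends up with at most BIT(2) set
 * here, i.e. only a 48-bit AGAW can be chosen for its domains.
 */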
317 static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw)
322 sagaw = __iommu_calculate_sagaw(iommu);
323 for (agaw = width_to_agaw(max_gaw); agaw >= 0; agaw--) {
324 if (test_bit(agaw, &sagaw))
/*
 * Calculate max SAGAW for each iommu.
 */
334 int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
336 return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH);
/*
 * Calculate agaw for each iommu.
 * "SAGAW" may be different across iommus; use a default agaw, and
 * fall back to a smaller supported agaw for iommus that don't support
 * the default agaw.
 */
344 int iommu_calculate_agaw(struct intel_iommu *iommu)
346 return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH);
349 static bool iommu_paging_structure_coherency(struct intel_iommu *iommu)
351 return sm_supported(iommu) ?
352 ecap_smpwc(iommu->ecap) : ecap_coherent(iommu->ecap);
355 static void domain_update_iommu_coherency(struct dmar_domain *domain)
357 struct iommu_domain_info *info;
358 struct dmar_drhd_unit *drhd;
359 struct intel_iommu *iommu;
363 domain->iommu_coherency = true;
364 xa_for_each(&domain->iommu_array, i, info) {
366 if (!iommu_paging_structure_coherency(info->iommu)) {
367 domain->iommu_coherency = false;
374 /* No hardware attached; use lowest common denominator */
376 for_each_active_iommu(iommu, drhd) {
377 if (!iommu_paging_structure_coherency(iommu)) {
378 domain->iommu_coherency = false;
385 static int domain_update_iommu_superpage(struct dmar_domain *domain,
386 struct intel_iommu *skip)
388 struct dmar_drhd_unit *drhd;
389 struct intel_iommu *iommu;
392 if (!intel_iommu_superpage)
395 /* set iommu_superpage to the smallest common denominator */
397 for_each_active_iommu(iommu, drhd) {
399 if (domain && domain->use_first_level) {
400 if (!cap_fl1gp_support(iommu->cap))
403 mask &= cap_super_page_val(iommu->cap);
415 static int domain_update_device_node(struct dmar_domain *domain)
417 struct device_domain_info *info;
418 int nid = NUMA_NO_NODE;
421 spin_lock_irqsave(&domain->lock, flags);
422 list_for_each_entry(info, &domain->devices, link) {
		/*
		 * There could possibly be multiple device numa nodes as devices
		 * within the same domain may sit behind different IOMMUs. There
		 * isn't a perfect answer in such a situation, so we select a
		 * first come, first served policy.
		 */
429 nid = dev_to_node(info->dev);
430 if (nid != NUMA_NO_NODE)
433 spin_unlock_irqrestore(&domain->lock, flags);
438 /* Return the super pagesize bitmap if supported. */
439 static unsigned long domain_super_pgsize_bitmap(struct dmar_domain *domain)
441 unsigned long bitmap = 0;
	/*
	 * 1-level super page supports page size of 2MiB, 2-level super page
	 * supports page size of both 2MiB and 1GiB.
	 */
	if (domain->iommu_superpage == 1)
		bitmap |= SZ_2M;
	else if (domain->iommu_superpage == 2)
		bitmap |= SZ_2M | SZ_1G;

	return bitmap;
}
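/*
 * Example (illustration only): an IOMMU reporting 2-level superpage support
 * gives iommu_superpage == 2, so the helper above returns SZ_2M | SZ_1G and
 * the domain ends up advertising 4KiB, 2MiB and 1GiB page sizes.
 */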
455 /* Some capabilities may be different across iommus */
456 void domain_update_iommu_cap(struct dmar_domain *domain)
458 domain_update_iommu_coherency(domain);
459 domain->iommu_superpage = domain_update_iommu_superpage(domain, NULL);
	/*
	 * If RHSA is missing, we should default to the device numa domain
	 * as well.
	 */
465 if (domain->nid == NUMA_NO_NODE)
466 domain->nid = domain_update_device_node(domain);
	/*
	 * First-level translation restricts the input-address to a
	 * canonical address (i.e., address bits 63:N have the same
	 * value as address bit [N-1], where N is 48-bits with 4-level
	 * paging and 57-bits with 5-level paging). Hence, skip bit
	 * [N-1].
	 */
475 if (domain->use_first_level)
476 domain->domain.geometry.aperture_end = __DOMAIN_MAX_ADDR(domain->gaw - 1);
478 domain->domain.geometry.aperture_end = __DOMAIN_MAX_ADDR(domain->gaw);
480 domain->domain.pgsize_bitmap |= domain_super_pgsize_bitmap(domain);
483 struct context_entry *iommu_context_addr(struct intel_iommu *iommu, u8 bus,
486 struct root_entry *root = &iommu->root_entry[bus];
487 struct context_entry *context;
	/*
	 * Unless the caller requested to allocate a new entry,
	 * returning a copied context entry makes no sense.
	 */
494 if (!alloc && context_copied(iommu, bus, devfn))
498 if (sm_supported(iommu)) {
506 context = phys_to_virt(*entry & VTD_PAGE_MASK);
508 unsigned long phy_addr;
512 context = iommu_alloc_page_node(iommu->node, GFP_ATOMIC);
516 __iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
517 phy_addr = virt_to_phys((void *)context);
518 *entry = phy_addr | 1;
519 __iommu_flush_cache(iommu, entry, sizeof(*entry));
521 return &context[devfn];
/**
 * is_downstream_to_pci_bridge - test if a device belongs to the PCI
 *				 sub-hierarchy of a candidate PCI-PCI bridge
 * @dev: candidate PCI device belonging to @bridge PCI sub-hierarchy
 * @bridge: the candidate PCI-PCI bridge
 *
 * Return: true if @dev belongs to @bridge PCI sub-hierarchy, else false.
 */
533 is_downstream_to_pci_bridge(struct device *dev, struct device *bridge)
535 struct pci_dev *pdev, *pbridge;
	if (!dev_is_pci(dev) || !dev_is_pci(bridge))
		return false;

	pdev = to_pci_dev(dev);
	pbridge = to_pci_dev(bridge);

	if (pbridge->subordinate &&
	    pbridge->subordinate->number <= pdev->bus->number &&
	    pbridge->subordinate->busn_res.end >= pdev->bus->number)
		return true;

	return false;
}
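/*
 * Example (illustration only): a bridge whose secondary/subordinate bus
 * range is [0x20, 0x3f] is an ancestor of any device on buses 0x20-0x3f,
 * so a device at 25:00.0 in the same segment makes the check above return
 * true.
 */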
551 static bool quirk_ioat_snb_local_iommu(struct pci_dev *pdev)
553 struct dmar_drhd_unit *drhd;
	/* We know that this device on this chipset has its own IOMMU.
	 * If we find it under a different IOMMU, then the BIOS is lying
	 * to us. Hope that the IOMMU for this device is actually
	 * disabled, and it needs no translation...
	 */
562 rc = pci_bus_read_config_dword(pdev->bus, PCI_DEVFN(0, 0), 0xb0, &vtbar);
565 dev_info(&pdev->dev, "failed to run vt-d quirk\n");
	/* we know that this iommu should be at offset 0xa000 from vtbar */
571 drhd = dmar_find_matched_drhd_unit(pdev);
572 if (!drhd || drhd->reg_base_addr - vtbar != 0xa000) {
573 pr_warn_once(FW_BUG "BIOS assigned incorrect VT-d unit for Intel(R) QuickData Technology device\n");
574 add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
581 static bool iommu_is_dummy(struct intel_iommu *iommu, struct device *dev)
583 if (!iommu || iommu->drhd->ignored)
586 if (dev_is_pci(dev)) {
587 struct pci_dev *pdev = to_pci_dev(dev);
589 if (pdev->vendor == PCI_VENDOR_ID_INTEL &&
590 pdev->device == PCI_DEVICE_ID_INTEL_IOAT_SNB &&
591 quirk_ioat_snb_local_iommu(pdev))
598 static struct intel_iommu *device_lookup_iommu(struct device *dev, u8 *bus, u8 *devfn)
600 struct dmar_drhd_unit *drhd = NULL;
601 struct pci_dev *pdev = NULL;
602 struct intel_iommu *iommu;
610 if (dev_is_pci(dev)) {
611 struct pci_dev *pf_pdev;
613 pdev = pci_real_dma_dev(to_pci_dev(dev));
615 /* VFs aren't listed in scope tables; we need to look up
616 * the PF instead to find the IOMMU. */
617 pf_pdev = pci_physfn(pdev);
619 segment = pci_domain_nr(pdev->bus);
620 } else if (has_acpi_companion(dev))
621 dev = &ACPI_COMPANION(dev)->dev;
624 for_each_iommu(iommu, drhd) {
625 if (pdev && segment != drhd->segment)
628 for_each_active_dev_scope(drhd->devices,
629 drhd->devices_cnt, i, tmp) {
631 /* For a VF use its original BDF# not that of the PF
632 * which we used for the IOMMU lookup. Strictly speaking
633 * we could do this for all PCI devices; we only need to
634 * get the BDF# from the scope table for ACPI matches. */
635 if (pdev && pdev->is_virtfn)
639 *bus = drhd->devices[i].bus;
640 *devfn = drhd->devices[i].devfn;
645 if (is_downstream_to_pci_bridge(dev, tmp))
649 if (pdev && drhd->include_all) {
652 *bus = pdev->bus->number;
653 *devfn = pdev->devfn;
660 if (iommu_is_dummy(iommu, dev))
668 static void domain_flush_cache(struct dmar_domain *domain,
669 void *addr, int size)
671 if (!domain->iommu_coherency)
672 clflush_cache_range(addr, size);
675 static void free_context_table(struct intel_iommu *iommu)
677 struct context_entry *context;
680 if (!iommu->root_entry)
683 for (i = 0; i < ROOT_ENTRY_NR; i++) {
684 context = iommu_context_addr(iommu, i, 0, 0);
686 iommu_free_page(context);
688 if (!sm_supported(iommu))
691 context = iommu_context_addr(iommu, i, 0x80, 0);
693 iommu_free_page(context);
696 iommu_free_page(iommu->root_entry);
697 iommu->root_entry = NULL;
700 #ifdef CONFIG_DMAR_DEBUG
701 static void pgtable_walk(struct intel_iommu *iommu, unsigned long pfn,
702 u8 bus, u8 devfn, struct dma_pte *parent, int level)
708 offset = pfn_level_offset(pfn, level);
709 pte = &parent[offset];
710 if (!pte || (dma_pte_superpage(pte) || !dma_pte_present(pte))) {
711 pr_info("PTE not present at level %d\n", level);
715 pr_info("pte level: %d, pte value: 0x%016llx\n", level, pte->val);
720 parent = phys_to_virt(dma_pte_addr(pte));
725 void dmar_fault_dump_ptes(struct intel_iommu *iommu, u16 source_id,
726 unsigned long long addr, u32 pasid)
728 struct pasid_dir_entry *dir, *pde;
729 struct pasid_entry *entries, *pte;
730 struct context_entry *ctx_entry;
731 struct root_entry *rt_entry;
732 int i, dir_index, index, level;
733 u8 devfn = source_id & 0xff;
734 u8 bus = source_id >> 8;
735 struct dma_pte *pgtable;
737 pr_info("Dump %s table entries for IOVA 0x%llx\n", iommu->name, addr);
739 /* root entry dump */
740 rt_entry = &iommu->root_entry[bus];
742 pr_info("root table entry is not present\n");
746 if (sm_supported(iommu))
747 pr_info("scalable mode root entry: hi 0x%016llx, low 0x%016llx\n",
748 rt_entry->hi, rt_entry->lo);
750 pr_info("root entry: 0x%016llx", rt_entry->lo);
752 /* context entry dump */
753 ctx_entry = iommu_context_addr(iommu, bus, devfn, 0);
755 pr_info("context table entry is not present\n");
759 pr_info("context entry: hi 0x%016llx, low 0x%016llx\n",
760 ctx_entry->hi, ctx_entry->lo);
762 /* legacy mode does not require PASID entries */
763 if (!sm_supported(iommu)) {
764 level = agaw_to_level(ctx_entry->hi & 7);
765 pgtable = phys_to_virt(ctx_entry->lo & VTD_PAGE_MASK);
769 /* get the pointer to pasid directory entry */
770 dir = phys_to_virt(ctx_entry->lo & VTD_PAGE_MASK);
772 pr_info("pasid directory entry is not present\n");
775 /* For request-without-pasid, get the pasid from context entry */
776 if (intel_iommu_sm && pasid == IOMMU_PASID_INVALID)
777 pasid = IOMMU_NO_PASID;
779 dir_index = pasid >> PASID_PDE_SHIFT;
780 pde = &dir[dir_index];
781 pr_info("pasid dir entry: 0x%016llx\n", pde->val);
783 /* get the pointer to the pasid table entry */
784 entries = get_pasid_table_from_pde(pde);
786 pr_info("pasid table entry is not present\n");
789 index = pasid & PASID_PTE_MASK;
790 pte = &entries[index];
791 for (i = 0; i < ARRAY_SIZE(pte->val); i++)
792 pr_info("pasid table entry[%d]: 0x%016llx\n", i, pte->val[i]);
794 if (pasid_pte_get_pgtt(pte) == PASID_ENTRY_PGTT_FL_ONLY) {
795 level = pte->val[2] & BIT_ULL(2) ? 5 : 4;
796 pgtable = phys_to_virt(pte->val[2] & VTD_PAGE_MASK);
798 level = agaw_to_level((pte->val[0] >> 2) & 0x7);
799 pgtable = phys_to_virt(pte->val[0] & VTD_PAGE_MASK);
803 pgtable_walk(iommu, addr >> VTD_PAGE_SHIFT, bus, devfn, pgtable, level);
807 static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
808 unsigned long pfn, int *target_level,
811 struct dma_pte *parent, *pte;
812 int level = agaw_to_level(domain->agaw);
815 if (!domain_pfn_supported(domain, pfn))
816 /* Address beyond IOMMU's addressing capabilities. */
819 parent = domain->pgd;
824 offset = pfn_level_offset(pfn, level);
825 pte = &parent[offset];
826 if (!*target_level && (dma_pte_superpage(pte) || !dma_pte_present(pte)))
828 if (level == *target_level)
831 if (!dma_pte_present(pte)) {
832 uint64_t pteval, tmp;
834 tmp_page = iommu_alloc_page_node(domain->nid, gfp);
839 domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE);
840 pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE;
841 if (domain->use_first_level)
842 pteval |= DMA_FL_PTE_US | DMA_FL_PTE_ACCESS;
845 if (!try_cmpxchg64(&pte->val, &tmp, pteval))
846 /* Someone else set it while we were thinking; use theirs. */
847 iommu_free_page(tmp_page);
849 domain_flush_cache(domain, pte, sizeof(*pte));
854 parent = phys_to_virt(dma_pte_addr(pte));
859 *target_level = level;
864 /* return address's pte at specific level */
865 static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain,
867 int level, int *large_page)
869 struct dma_pte *parent, *pte;
870 int total = agaw_to_level(domain->agaw);
873 parent = domain->pgd;
874 while (level <= total) {
875 offset = pfn_level_offset(pfn, total);
876 pte = &parent[offset];
880 if (!dma_pte_present(pte)) {
885 if (dma_pte_superpage(pte)) {
890 parent = phys_to_virt(dma_pte_addr(pte));
/* clear last level pte; a tlb flush should follow */
897 static void dma_pte_clear_range(struct dmar_domain *domain,
898 unsigned long start_pfn,
899 unsigned long last_pfn)
901 unsigned int large_page;
902 struct dma_pte *first_pte, *pte;
904 if (WARN_ON(!domain_pfn_supported(domain, last_pfn)) ||
905 WARN_ON(start_pfn > last_pfn))
908 /* we don't need lock here; nobody else touches the iova range */
911 first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1, &large_page);
913 start_pfn = align_to_level(start_pfn + 1, large_page + 1);
918 start_pfn += lvl_to_nr_pages(large_page);
920 } while (start_pfn <= last_pfn && !first_pte_in_page(pte));
922 domain_flush_cache(domain, first_pte,
923 (void *)pte - (void *)first_pte);
925 } while (start_pfn && start_pfn <= last_pfn);
928 static void dma_pte_free_level(struct dmar_domain *domain, int level,
929 int retain_level, struct dma_pte *pte,
930 unsigned long pfn, unsigned long start_pfn,
931 unsigned long last_pfn)
933 pfn = max(start_pfn, pfn);
934 pte = &pte[pfn_level_offset(pfn, level)];
937 unsigned long level_pfn;
938 struct dma_pte *level_pte;
940 if (!dma_pte_present(pte) || dma_pte_superpage(pte))
943 level_pfn = pfn & level_mask(level);
944 level_pte = phys_to_virt(dma_pte_addr(pte));
947 dma_pte_free_level(domain, level - 1, retain_level,
948 level_pte, level_pfn, start_pfn,
		/*
		 * Free the page table if we're below the level we want to
		 * retain and the range covers the entire table.
		 */
956 if (level < retain_level && !(start_pfn > level_pfn ||
957 last_pfn < level_pfn + level_size(level) - 1)) {
959 domain_flush_cache(domain, pte, sizeof(*pte));
960 iommu_free_page(level_pte);
963 pfn += level_size(level);
964 } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
/*
 * clear last level (leaf) ptes and free page table pages below the
 * level we wish to keep intact.
 */
971 static void dma_pte_free_pagetable(struct dmar_domain *domain,
972 unsigned long start_pfn,
973 unsigned long last_pfn,
976 dma_pte_clear_range(domain, start_pfn, last_pfn);
978 /* We don't need lock here; nobody else touches the iova range */
979 dma_pte_free_level(domain, agaw_to_level(domain->agaw), retain_level,
980 domain->pgd, 0, start_pfn, last_pfn);
983 if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
984 iommu_free_page(domain->pgd);
989 /* When a page at a given level is being unlinked from its parent, we don't
990 need to *modify* it at all. All we need to do is make a list of all the
991 pages which can be freed just as soon as we've flushed the IOTLB and we
992 know the hardware page-walk will no longer touch them.
   The 'pte' argument is the *parent* PTE, pointing to the page that is to
   be freed. */
995 static void dma_pte_list_pagetables(struct dmar_domain *domain,
996 int level, struct dma_pte *pte,
997 struct list_head *freelist)
1001 pg = pfn_to_page(dma_pte_addr(pte) >> PAGE_SHIFT);
1002 list_add_tail(&pg->lru, freelist);
1007 pte = page_address(pg);
1009 if (dma_pte_present(pte) && !dma_pte_superpage(pte))
1010 dma_pte_list_pagetables(domain, level - 1, pte, freelist);
1012 } while (!first_pte_in_page(pte));
1015 static void dma_pte_clear_level(struct dmar_domain *domain, int level,
1016 struct dma_pte *pte, unsigned long pfn,
1017 unsigned long start_pfn, unsigned long last_pfn,
1018 struct list_head *freelist)
1020 struct dma_pte *first_pte = NULL, *last_pte = NULL;
1022 pfn = max(start_pfn, pfn);
1023 pte = &pte[pfn_level_offset(pfn, level)];
1026 unsigned long level_pfn = pfn & level_mask(level);
1028 if (!dma_pte_present(pte))
1031 /* If range covers entire pagetable, free it */
1032 if (start_pfn <= level_pfn &&
1033 last_pfn >= level_pfn + level_size(level) - 1) {
			/* These subordinate page tables are going away entirely. Don't
			   bother to clear them; we're just going to *free* them. */
1036 if (level > 1 && !dma_pte_superpage(pte))
1037 dma_pte_list_pagetables(domain, level - 1, pte, freelist);
1043 } else if (level > 1) {
1044 /* Recurse down into a level that isn't *entirely* obsolete */
1045 dma_pte_clear_level(domain, level - 1,
1046 phys_to_virt(dma_pte_addr(pte)),
1047 level_pfn, start_pfn, last_pfn,
1051 pfn = level_pfn + level_size(level);
1052 } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1055 domain_flush_cache(domain, first_pte,
1056 (void *)++last_pte - (void *)first_pte);
1059 /* We can't just free the pages because the IOMMU may still be walking
1060 the page tables, and may have cached the intermediate levels. The
1061 pages can only be freed after the IOTLB flush has been done. */
1062 static void domain_unmap(struct dmar_domain *domain, unsigned long start_pfn,
1063 unsigned long last_pfn, struct list_head *freelist)
1065 if (WARN_ON(!domain_pfn_supported(domain, last_pfn)) ||
1066 WARN_ON(start_pfn > last_pfn))
1069 /* we don't need lock here; nobody else touches the iova range */
1070 dma_pte_clear_level(domain, agaw_to_level(domain->agaw),
1071 domain->pgd, 0, start_pfn, last_pfn, freelist);
1074 if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1075 struct page *pgd_page = virt_to_page(domain->pgd);
1076 list_add_tail(&pgd_page->lru, freelist);
1081 /* iommu handling */
1082 static int iommu_alloc_root_entry(struct intel_iommu *iommu)
1084 struct root_entry *root;
1086 root = iommu_alloc_page_node(iommu->node, GFP_ATOMIC);
1088 pr_err("Allocating root entry for %s failed\n",
1093 __iommu_flush_cache(iommu, root, ROOT_SIZE);
1094 iommu->root_entry = root;
1099 static void iommu_set_root_entry(struct intel_iommu *iommu)
1105 addr = virt_to_phys(iommu->root_entry);
1106 if (sm_supported(iommu))
1107 addr |= DMA_RTADDR_SMT;
1109 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1110 dmar_writeq(iommu->reg + DMAR_RTADDR_REG, addr);
1112 writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG);
	/* Make sure hardware completes it */
1115 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1116 readl, (sts & DMA_GSTS_RTPS), sts);
1118 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
	/*
	 * Hardware invalidates all DMA remapping hardware translation
	 * caches as part of SRTP flow.
	 */
	if (cap_esrtps(iommu->cap))
		return;
1127 iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
1128 if (sm_supported(iommu))
1129 qi_flush_pasid_cache(iommu, 0, QI_PC_GLOBAL, 0);
1130 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
1133 void iommu_flush_write_buffer(struct intel_iommu *iommu)
1138 if (!rwbf_quirk && !cap_rwbf(iommu->cap))
1141 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1142 writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG);
	/* Make sure hardware completes it */
1145 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1146 readl, (!(val & DMA_GSTS_WBFS)), val);
1148 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
/* return value determines if we need a write buffer flush */
1152 static void __iommu_flush_context(struct intel_iommu *iommu,
1153 u16 did, u16 source_id, u8 function_mask,
1160 case DMA_CCMD_GLOBAL_INVL:
1161 val = DMA_CCMD_GLOBAL_INVL;
1163 case DMA_CCMD_DOMAIN_INVL:
1164 val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
1166 case DMA_CCMD_DEVICE_INVL:
1167 val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
1168 | DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
1171 pr_warn("%s: Unexpected context-cache invalidation type 0x%llx\n",
1175 val |= DMA_CCMD_ICC;
1177 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1178 dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
	/* Make sure hardware completes it */
1181 IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
1182 dmar_readq, (!(val & DMA_CCMD_ICC)), val);
1184 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1187 void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did, u64 addr,
1188 unsigned int size_order, u64 type)
1190 int tlb_offset = ecap_iotlb_offset(iommu->ecap);
1191 u64 val = 0, val_iva = 0;
1195 case DMA_TLB_GLOBAL_FLUSH:
		/* global flush doesn't need to set IVA_REG */
1197 val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
1199 case DMA_TLB_DSI_FLUSH:
1200 val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1202 case DMA_TLB_PSI_FLUSH:
1203 val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1204 /* IH bit is passed in as part of address */
1205 val_iva = size_order | addr;
1208 pr_warn("%s: Unexpected iotlb invalidation type 0x%llx\n",
1213 if (cap_write_drain(iommu->cap))
1214 val |= DMA_TLB_WRITE_DRAIN;
1216 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1217 /* Note: Only uses first TLB reg currently */
1219 dmar_writeq(iommu->reg + tlb_offset, val_iva);
1220 dmar_writeq(iommu->reg + tlb_offset + 8, val);
	/* Make sure hardware completes it */
1223 IOMMU_WAIT_OP(iommu, tlb_offset + 8,
1224 dmar_readq, (!(val & DMA_TLB_IVT)), val);
1226 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1228 /* check IOTLB invalidation granularity */
1229 if (DMA_TLB_IAIG(val) == 0)
1230 pr_err("Flush IOTLB failed\n");
1231 if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
1232 pr_debug("TLB flush request %Lx, actual %Lx\n",
1233 (unsigned long long)DMA_TLB_IIRG(type),
1234 (unsigned long long)DMA_TLB_IAIG(val));
1237 static struct device_domain_info *
1238 domain_lookup_dev_info(struct dmar_domain *domain,
1239 struct intel_iommu *iommu, u8 bus, u8 devfn)
1241 struct device_domain_info *info;
1242 unsigned long flags;
1244 spin_lock_irqsave(&domain->lock, flags);
1245 list_for_each_entry(info, &domain->devices, link) {
1246 if (info->iommu == iommu && info->bus == bus &&
1247 info->devfn == devfn) {
1248 spin_unlock_irqrestore(&domain->lock, flags);
1252 spin_unlock_irqrestore(&domain->lock, flags);
/*
 * The extra devTLB flush quirk impacts those QAT devices with PCI device
 * IDs ranging from 0x4940 to 0x4943. It is exempted from risky_device()
 * check because it applies only to the built-in QAT devices and it doesn't
 * grant additional privileges.
 */
1263 #define BUGGY_QAT_DEVID_MASK 0x4940
static bool dev_needs_extra_dtlb_flush(struct pci_dev *pdev)
{
	if (pdev->vendor != PCI_VENDOR_ID_INTEL)
		return false;

	if ((pdev->device & 0xfffc) != BUGGY_QAT_DEVID_MASK)
		return false;

	return true;
}
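/*
 * Example (illustration only): device IDs 0x4940, 0x4941, 0x4942 and 0x4943
 * all satisfy (id & 0xfffc) == BUGGY_QAT_DEVID_MASK, so the helper above
 * covers exactly the four-ID range named in the comment; 0x4944 does not.
 */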
1275 static void iommu_enable_pci_caps(struct device_domain_info *info)
1277 struct pci_dev *pdev;
1279 if (!dev_is_pci(info->dev))
1282 pdev = to_pci_dev(info->dev);
1283 if (info->ats_supported && pci_ats_page_aligned(pdev) &&
1284 !pci_enable_ats(pdev, VTD_PAGE_SHIFT))
1285 info->ats_enabled = 1;
1288 static void iommu_disable_pci_caps(struct device_domain_info *info)
1290 struct pci_dev *pdev;
1292 if (!dev_is_pci(info->dev))
1295 pdev = to_pci_dev(info->dev);
1297 if (info->ats_enabled) {
1298 pci_disable_ats(pdev);
1299 info->ats_enabled = 0;
1303 static void intel_flush_iotlb_all(struct iommu_domain *domain)
1305 cache_tag_flush_all(to_dmar_domain(domain));
1308 static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
1311 unsigned long flags;
1313 if (!cap_plmr(iommu->cap) && !cap_phmr(iommu->cap))
1316 raw_spin_lock_irqsave(&iommu->register_lock, flags);
1317 pmen = readl(iommu->reg + DMAR_PMEN_REG);
1318 pmen &= ~DMA_PMEN_EPM;
1319 writel(pmen, iommu->reg + DMAR_PMEN_REG);
1321 /* wait for the protected region status bit to clear */
1322 IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
1323 readl, !(pmen & DMA_PMEN_PRS), pmen);
1325 raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1328 static void iommu_enable_translation(struct intel_iommu *iommu)
1331 unsigned long flags;
1333 raw_spin_lock_irqsave(&iommu->register_lock, flags);
1334 iommu->gcmd |= DMA_GCMD_TE;
1335 writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
	/* Make sure hardware completes it */
1338 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1339 readl, (sts & DMA_GSTS_TES), sts);
1341 raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1344 static void iommu_disable_translation(struct intel_iommu *iommu)
1349 if (iommu_skip_te_disable && iommu->drhd->gfx_dedicated &&
1350 (cap_read_drain(iommu->cap) || cap_write_drain(iommu->cap)))
1353 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1354 iommu->gcmd &= ~DMA_GCMD_TE;
1355 writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
	/* Make sure hardware completes it */
1358 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1359 readl, (!(sts & DMA_GSTS_TES)), sts);
1361 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1364 static int iommu_init_domains(struct intel_iommu *iommu)
1368 ndomains = cap_ndoms(iommu->cap);
1369 pr_debug("%s: Number of Domains supported <%d>\n",
1370 iommu->name, ndomains);
1372 spin_lock_init(&iommu->lock);
1374 iommu->domain_ids = bitmap_zalloc(ndomains, GFP_KERNEL);
1375 if (!iommu->domain_ids)
	/*
	 * If Caching mode is set, then invalid translations are tagged
	 * with domain-id 0, hence we need to pre-allocate it. We also
	 * use domain-id 0 as a marker for non-allocated domain-id, so
	 * make sure it is not used for a real domain.
	 */
1384 set_bit(0, iommu->domain_ids);
	/*
	 * Vt-d spec rev3.0 (section 6.2.3.1) requires that each pasid
	 * entry for first-level or pass-through translation modes should
	 * be programmed with a domain id different from those used for
	 * second-level or nested translation. We reserve a domain id for
	 * this purpose. This domain id is also used for the identity domain.
	 */
1394 set_bit(FLPT_DEFAULT_DID, iommu->domain_ids);
1399 static void disable_dmar_iommu(struct intel_iommu *iommu)
1401 if (!iommu->domain_ids)
	/*
	 * All iommu domains must have been detached from the devices,
	 * hence there should be no domain IDs in use.
	 */
1408 if (WARN_ON(bitmap_weight(iommu->domain_ids, cap_ndoms(iommu->cap))
1409 > NUM_RESERVED_DID))
1412 if (iommu->gcmd & DMA_GCMD_TE)
1413 iommu_disable_translation(iommu);
1416 static void free_dmar_iommu(struct intel_iommu *iommu)
1418 if (iommu->domain_ids) {
1419 bitmap_free(iommu->domain_ids);
1420 iommu->domain_ids = NULL;
1423 if (iommu->copied_tables) {
1424 bitmap_free(iommu->copied_tables);
1425 iommu->copied_tables = NULL;
1428 /* free context mapping */
1429 free_context_table(iommu);
1431 #ifdef CONFIG_INTEL_IOMMU_SVM
1432 if (pasid_supported(iommu)) {
1433 if (ecap_prs(iommu->ecap))
1434 intel_svm_finish_prq(iommu);
/*
 * Check and return whether first level is used by default for
 * DMA translation.
 */
1443 static bool first_level_by_default(unsigned int type)
1445 /* Only SL is available in legacy mode */
1446 if (!scalable_mode_support())
1449 /* Only level (either FL or SL) is available, just use it */
1450 if (intel_cap_flts_sanity() ^ intel_cap_slts_sanity())
1451 return intel_cap_flts_sanity();
1453 /* Both levels are available, decide it based on domain type */
1454 return type != IOMMU_DOMAIN_UNMANAGED;
1457 static struct dmar_domain *alloc_domain(unsigned int type)
1459 struct dmar_domain *domain;
1461 domain = kzalloc(sizeof(*domain), GFP_KERNEL);
1465 domain->nid = NUMA_NO_NODE;
1466 if (first_level_by_default(type))
1467 domain->use_first_level = true;
1468 INIT_LIST_HEAD(&domain->devices);
1469 INIT_LIST_HEAD(&domain->dev_pasids);
1470 INIT_LIST_HEAD(&domain->cache_tags);
1471 spin_lock_init(&domain->lock);
1472 spin_lock_init(&domain->cache_lock);
1473 xa_init(&domain->iommu_array);
1478 int domain_attach_iommu(struct dmar_domain *domain, struct intel_iommu *iommu)
1480 struct iommu_domain_info *info, *curr;
1481 unsigned long ndomains;
1482 int num, ret = -ENOSPC;
1484 if (domain->domain.type == IOMMU_DOMAIN_SVA)
1487 info = kzalloc(sizeof(*info), GFP_KERNEL);
1491 spin_lock(&iommu->lock);
1492 curr = xa_load(&domain->iommu_array, iommu->seq_id);
1495 spin_unlock(&iommu->lock);
1500 ndomains = cap_ndoms(iommu->cap);
1501 num = find_first_zero_bit(iommu->domain_ids, ndomains);
1502 if (num >= ndomains) {
1503 pr_err("%s: No free domain ids\n", iommu->name);
1507 set_bit(num, iommu->domain_ids);
1510 info->iommu = iommu;
1511 curr = xa_cmpxchg(&domain->iommu_array, iommu->seq_id,
1512 NULL, info, GFP_ATOMIC);
1514 ret = xa_err(curr) ? : -EBUSY;
1517 domain_update_iommu_cap(domain);
1519 spin_unlock(&iommu->lock);
1523 clear_bit(info->did, iommu->domain_ids);
1525 spin_unlock(&iommu->lock);
1530 void domain_detach_iommu(struct dmar_domain *domain, struct intel_iommu *iommu)
1532 struct iommu_domain_info *info;
1534 if (domain->domain.type == IOMMU_DOMAIN_SVA)
1537 spin_lock(&iommu->lock);
1538 info = xa_load(&domain->iommu_array, iommu->seq_id);
1539 if (--info->refcnt == 0) {
1540 clear_bit(info->did, iommu->domain_ids);
1541 xa_erase(&domain->iommu_array, iommu->seq_id);
1542 domain->nid = NUMA_NO_NODE;
1543 domain_update_iommu_cap(domain);
1546 spin_unlock(&iommu->lock);
1549 static int guestwidth_to_adjustwidth(int gaw)
1552 int r = (gaw - 12) % 9;
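/*
 * Example (illustration only, assuming the remainder above is used to round
 * the guest width up to the next 9-bit page-table stride): 39-, 48- and
 * 57-bit guest widths already sit on a stride boundary and map to
 * themselves, while an in-between value such as 50 rounds up to 57.
 */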
1563 static void domain_exit(struct dmar_domain *domain)
1566 LIST_HEAD(freelist);
1568 domain_unmap(domain, 0, DOMAIN_MAX_PFN(domain->gaw), &freelist);
1569 iommu_put_pages_list(&freelist);
1572 if (WARN_ON(!list_empty(&domain->devices)))
1575 kfree(domain->qi_batch);
/*
 * For kdump cases, old valid entries may be cached due to the
 * in-flight DMA and copied pgtable, but there is no unmapping
 * behaviour for them, thus we need an explicit cache flush for
 * the newly-mapped device. For kdump, at this point, the device
 * is supposed to finish reset at its driver probe stage, so no
 * in-flight DMA will exist, and we don't need to worry anymore
 * hereafter.
 */
1588 static void copied_context_tear_down(struct intel_iommu *iommu,
1589 struct context_entry *context,
1594 if (!context_copied(iommu, bus, devfn))
1597 assert_spin_locked(&iommu->lock);
1599 did_old = context_domain_id(context);
1600 context_clear_entry(context);
1602 if (did_old < cap_ndoms(iommu->cap)) {
1603 iommu->flush.flush_context(iommu, did_old,
1604 (((u16)bus) << 8) | devfn,
1605 DMA_CCMD_MASK_NOBIT,
1606 DMA_CCMD_DEVICE_INVL);
1607 iommu->flush.flush_iotlb(iommu, did_old, 0, 0,
1611 clear_context_copied(iommu, bus, devfn);
/*
 * It's a non-present to present mapping. If hardware doesn't cache
 * non-present entries, we only need to flush the write-buffer. If it
 * _does_ cache non-present entries, then it does so in the special
 * domain #0, which we have to flush:
 */
1620 static void context_present_cache_flush(struct intel_iommu *iommu, u16 did,
1623 if (cap_caching_mode(iommu->cap)) {
1624 iommu->flush.flush_context(iommu, 0,
1625 (((u16)bus) << 8) | devfn,
1626 DMA_CCMD_MASK_NOBIT,
1627 DMA_CCMD_DEVICE_INVL);
1628 iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH);
1630 iommu_flush_write_buffer(iommu);
1634 static int domain_context_mapping_one(struct dmar_domain *domain,
1635 struct intel_iommu *iommu,
1638 struct device_domain_info *info =
1639 domain_lookup_dev_info(domain, iommu, bus, devfn);
1640 u16 did = domain_id_iommu(domain, iommu);
1641 int translation = CONTEXT_TT_MULTI_LEVEL;
1642 struct dma_pte *pgd = domain->pgd;
1643 struct context_entry *context;
1646 pr_debug("Set context mapping for %02x:%02x.%d\n",
1647 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1649 spin_lock(&iommu->lock);
1651 context = iommu_context_addr(iommu, bus, devfn, 1);
1656 if (context_present(context) && !context_copied(iommu, bus, devfn))
1659 copied_context_tear_down(iommu, context, bus, devfn);
1660 context_clear_entry(context);
1662 context_set_domain_id(context, did);
	/*
	 * Skip top levels of page tables for iommu which has
	 * less agaw than default. Unnecessary for PT mode.
	 */
1668 for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) {
1670 pgd = phys_to_virt(dma_pte_addr(pgd));
1671 if (!dma_pte_present(pgd))
1675 if (info && info->ats_supported)
1676 translation = CONTEXT_TT_DEV_IOTLB;
1678 translation = CONTEXT_TT_MULTI_LEVEL;
1680 context_set_address_root(context, virt_to_phys(pgd));
1681 context_set_address_width(context, agaw);
1682 context_set_translation_type(context, translation);
1683 context_set_fault_enable(context);
1684 context_set_present(context);
1685 if (!ecap_coherent(iommu->ecap))
1686 clflush_cache_range(context, sizeof(*context));
1687 context_present_cache_flush(iommu, did, bus, devfn);
1691 spin_unlock(&iommu->lock);
1696 static int domain_context_mapping_cb(struct pci_dev *pdev,
1697 u16 alias, void *opaque)
1699 struct device_domain_info *info = dev_iommu_priv_get(&pdev->dev);
1700 struct intel_iommu *iommu = info->iommu;
1701 struct dmar_domain *domain = opaque;
1703 return domain_context_mapping_one(domain, iommu,
1704 PCI_BUS_NUM(alias), alias & 0xff);
1708 domain_context_mapping(struct dmar_domain *domain, struct device *dev)
1710 struct device_domain_info *info = dev_iommu_priv_get(dev);
1711 struct intel_iommu *iommu = info->iommu;
1712 u8 bus = info->bus, devfn = info->devfn;
1714 if (!dev_is_pci(dev))
1715 return domain_context_mapping_one(domain, iommu, bus, devfn);
1717 return pci_for_each_dma_alias(to_pci_dev(dev),
1718 domain_context_mapping_cb, domain);
1721 /* Return largest possible superpage level for a given mapping */
1722 static int hardware_largepage_caps(struct dmar_domain *domain, unsigned long iov_pfn,
1723 unsigned long phy_pfn, unsigned long pages)
1725 int support, level = 1;
1726 unsigned long pfnmerge;
1728 support = domain->iommu_superpage;
1730 /* To use a large page, the virtual *and* physical addresses
1731 must be aligned to 2MiB/1GiB/etc. Lower bits set in either
1732 of them will mean we have to use smaller pages. So just
1733 merge them and check both at once. */
1734 pfnmerge = iov_pfn | phy_pfn;
1736 while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) {
1737 pages >>= VTD_STRIDE_SHIFT;
1740 pfnmerge >>= VTD_STRIDE_SHIFT;
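/*
 * Example (illustration only, assuming the loop above bumps the level once
 * per 9-bit stride): an IOVA pfn and a physical pfn that are both 2MiB
 * aligned (low 9 bits clear) with a mapping of at least 512 pages allow one
 * iteration, so a level-2 (2MiB) superpage can be used if the domain's
 * iommu_superpage setting permits it.
 */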
/*
 * Ensure that old small page tables are removed to make room for superpage(s).
 * We're going to add new large pages, so make sure we don't remove their parent
 * tables. The IOTLB/devTLBs should be flushed if any PDE/PTEs are cleared.
 */
1752 static void switch_to_super_page(struct dmar_domain *domain,
1753 unsigned long start_pfn,
1754 unsigned long end_pfn, int level)
1756 unsigned long lvl_pages = lvl_to_nr_pages(level);
1757 struct dma_pte *pte = NULL;
1759 while (start_pfn <= end_pfn) {
1761 pte = pfn_to_dma_pte(domain, start_pfn, &level,
1764 if (dma_pte_present(pte)) {
1765 dma_pte_free_pagetable(domain, start_pfn,
1766 start_pfn + lvl_pages - 1,
1769 cache_tag_flush_range(domain, start_pfn << VTD_PAGE_SHIFT,
1770 end_pfn << VTD_PAGE_SHIFT, 0);
1774 start_pfn += lvl_pages;
1775 if (first_pte_in_page(pte))
1781 __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
1782 unsigned long phys_pfn, unsigned long nr_pages, int prot,
1785 struct dma_pte *first_pte = NULL, *pte = NULL;
1786 unsigned int largepage_lvl = 0;
1787 unsigned long lvl_pages = 0;
1791 if (unlikely(!domain_pfn_supported(domain, iov_pfn + nr_pages - 1)))
1794 if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
1797 if (!(prot & DMA_PTE_WRITE) && domain->nested_parent) {
1798 pr_err_ratelimited("Read-only mapping is disallowed on the domain which serves as the parent in a nested configuration, due to HW errata (ERRATA_772415_SPR17)\n");
1802 attr = prot & (DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP);
1803 attr |= DMA_FL_PTE_PRESENT;
1804 if (domain->use_first_level) {
1805 attr |= DMA_FL_PTE_US | DMA_FL_PTE_ACCESS;
1806 if (prot & DMA_PTE_WRITE)
1807 attr |= DMA_FL_PTE_DIRTY;
1810 domain->has_mappings = true;
1812 pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | attr;
1814 while (nr_pages > 0) {
1818 largepage_lvl = hardware_largepage_caps(domain, iov_pfn,
1819 phys_pfn, nr_pages);
1821 pte = pfn_to_dma_pte(domain, iov_pfn, &largepage_lvl,
1827 lvl_pages = lvl_to_nr_pages(largepage_lvl);
		/* It is a large page */
1830 if (largepage_lvl > 1) {
1831 unsigned long end_pfn;
1832 unsigned long pages_to_remove;
1834 pteval |= DMA_PTE_LARGE_PAGE;
1835 pages_to_remove = min_t(unsigned long, nr_pages,
1836 nr_pte_to_next_page(pte) * lvl_pages);
1837 end_pfn = iov_pfn + pages_to_remove - 1;
1838 switch_to_super_page(domain, iov_pfn, end_pfn, largepage_lvl);
1840 pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE;
			/* We don't need lock here, nobody else
			 * touches the iova range
			 */
1848 if (!try_cmpxchg64_local(&pte->val, &tmp, pteval)) {
1849 static int dumps = 5;
1850 pr_crit("ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n",
1851 iov_pfn, tmp, (unsigned long long)pteval);
1854 debug_dma_dump_mappings(NULL);
1859 nr_pages -= lvl_pages;
1860 iov_pfn += lvl_pages;
1861 phys_pfn += lvl_pages;
1862 pteval += lvl_pages * VTD_PAGE_SIZE;
1864 /* If the next PTE would be the first in a new page, then we
1865 * need to flush the cache on the entries we've just written.
1866 * And then we'll need to recalculate 'pte', so clear it and
1867 * let it get set again in the if (!pte) block above.
1869 * If we're done (!nr_pages) we need to flush the cache too.
1871 * Also if we've been setting superpages, we may need to
1872 * recalculate 'pte' and switch back to smaller pages for the
1873 * end of the mapping, if the trailing size is not enough to
	 * use another superpage (i.e. nr_pages < lvl_pages).
	 */
1877 if (!nr_pages || first_pte_in_page(pte) ||
1878 (largepage_lvl > 1 && nr_pages < lvl_pages)) {
1879 domain_flush_cache(domain, first_pte,
1880 (void *)pte - (void *)first_pte);
1888 static void domain_context_clear_one(struct device_domain_info *info, u8 bus, u8 devfn)
1890 struct intel_iommu *iommu = info->iommu;
1891 struct context_entry *context;
1894 spin_lock(&iommu->lock);
1895 context = iommu_context_addr(iommu, bus, devfn, 0);
1897 spin_unlock(&iommu->lock);
1901 did = context_domain_id(context);
1902 context_clear_entry(context);
1903 __iommu_flush_cache(iommu, context, sizeof(*context));
1904 spin_unlock(&iommu->lock);
1905 intel_context_flush_present(info, context, did, true);
1908 static int domain_setup_first_level(struct intel_iommu *iommu,
1909 struct dmar_domain *domain,
1913 struct dma_pte *pgd = domain->pgd;
	/*
	 * Skip top levels of page tables for iommu which has
	 * less agaw than default. Unnecessary for PT mode.
	 */
1921 for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) {
1922 pgd = phys_to_virt(dma_pte_addr(pgd));
1923 if (!dma_pte_present(pgd))
1927 level = agaw_to_level(agaw);
1928 if (level != 4 && level != 5)
1932 flags |= PASID_FLAG_FL5LP;
1934 if (domain->force_snooping)
1935 flags |= PASID_FLAG_PAGE_SNOOP;
1937 return intel_pasid_setup_first_level(iommu, dev, (pgd_t *)pgd, pasid,
1938 domain_id_iommu(domain, iommu),
1942 static bool dev_is_real_dma_subdevice(struct device *dev)
1944 return dev && dev_is_pci(dev) &&
1945 pci_real_dma_dev(to_pci_dev(dev)) != to_pci_dev(dev);
1948 static int dmar_domain_attach_device(struct dmar_domain *domain,
1951 struct device_domain_info *info = dev_iommu_priv_get(dev);
1952 struct intel_iommu *iommu = info->iommu;
1953 unsigned long flags;
1956 ret = domain_attach_iommu(domain, iommu);
1960 info->domain = domain;
1961 spin_lock_irqsave(&domain->lock, flags);
1962 list_add(&info->link, &domain->devices);
1963 spin_unlock_irqrestore(&domain->lock, flags);
1965 if (dev_is_real_dma_subdevice(dev))
1968 if (!sm_supported(iommu))
1969 ret = domain_context_mapping(domain, dev);
1970 else if (domain->use_first_level)
1971 ret = domain_setup_first_level(iommu, domain, dev, IOMMU_NO_PASID);
1973 ret = intel_pasid_setup_second_level(iommu, domain, dev, IOMMU_NO_PASID);
1976 goto out_block_translation;
1978 iommu_enable_pci_caps(info);
1980 ret = cache_tag_assign_domain(domain, dev, IOMMU_NO_PASID);
1982 goto out_block_translation;
1986 out_block_translation:
1987 device_block_translation(dev);
/**
 * device_rmrr_is_relaxable - Test whether the RMRR of this device
 * is relaxable (ie. is allowed to be not enforced under some conditions)
 * @dev: device handle
 *
 * We assume that PCI USB devices with RMRRs have them largely
 * for historical reasons and that the RMRR space is not actively used post
 * boot. This exclusion may change if vendors begin to abuse it.
 *
 * The same exception is made for graphics devices, with the requirement that
 * any use of the RMRR regions will be torn down before assigning the device
 * to a guest.
 *
 * Return: true if the RMRR is relaxable, false otherwise
 */
2006 static bool device_rmrr_is_relaxable(struct device *dev)
2008 struct pci_dev *pdev;
2010 if (!dev_is_pci(dev))
2013 pdev = to_pci_dev(dev);
2014 if (IS_USB_DEVICE(pdev) || IS_GFX_DEVICE(pdev))
2020 static int device_def_domain_type(struct device *dev)
2022 struct device_domain_info *info = dev_iommu_priv_get(dev);
2023 struct intel_iommu *iommu = info->iommu;
	/*
	 * Hardware does not support the passthrough translation mode.
	 * Always use a dynamic mapping domain.
	 */
2029 if (!ecap_pass_through(iommu->ecap))
2030 return IOMMU_DOMAIN_DMA;
2032 if (dev_is_pci(dev)) {
2033 struct pci_dev *pdev = to_pci_dev(dev);
2035 if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev))
2036 return IOMMU_DOMAIN_IDENTITY;
2042 static void intel_iommu_init_qi(struct intel_iommu *iommu)
	/*
	 * Start from the sane iommu hardware state.
	 * If the queued invalidation is already initialized by us
	 * (for example, while enabling interrupt-remapping) then
	 * we got the things already rolling from a sane state.
	 */
	if (!iommu->qi) {
		/*
		 * Clear any previous faults.
		 */
		dmar_fault(-1, iommu);
		/*
		 * Disable queued invalidation if supported and already enabled
		 * before OS handover.
		 */
		dmar_disable_qi(iommu);
	}
2062 if (dmar_enable_qi(iommu)) {
		/*
		 * Queued Invalidate not enabled, use Register Based Invalidate
		 */
2066 iommu->flush.flush_context = __iommu_flush_context;
2067 iommu->flush.flush_iotlb = __iommu_flush_iotlb;
2068 pr_info("%s: Using Register based invalidation\n",
2071 iommu->flush.flush_context = qi_flush_context;
2072 iommu->flush.flush_iotlb = qi_flush_iotlb;
2073 pr_info("%s: Using Queued invalidation\n", iommu->name);
2077 static int copy_context_table(struct intel_iommu *iommu,
2078 struct root_entry *old_re,
2079 struct context_entry **tbl,
2082 int tbl_idx, pos = 0, idx, devfn, ret = 0, did;
2083 struct context_entry *new_ce = NULL, ce;
2084 struct context_entry *old_ce = NULL;
2085 struct root_entry re;
2086 phys_addr_t old_ce_phys;
2088 tbl_idx = ext ? bus * 2 : bus;
2089 memcpy(&re, old_re, sizeof(re));
2091 for (devfn = 0; devfn < 256; devfn++) {
2092 /* First calculate the correct index */
2093 idx = (ext ? devfn * 2 : devfn) % 256;
2096 /* First save what we may have and clean up */
2098 tbl[tbl_idx] = new_ce;
2099 __iommu_flush_cache(iommu, new_ce,
2109 old_ce_phys = root_entry_lctp(&re);
2111 old_ce_phys = root_entry_uctp(&re);
2114 if (ext && devfn == 0) {
2115 /* No LCTP, try UCTP */
2124 old_ce = memremap(old_ce_phys, PAGE_SIZE,
2129 new_ce = iommu_alloc_page_node(iommu->node, GFP_KERNEL);
2136 /* Now copy the context entry */
2137 memcpy(&ce, old_ce + idx, sizeof(ce));
2139 if (!context_present(&ce))
2142 did = context_domain_id(&ce);
2143 if (did >= 0 && did < cap_ndoms(iommu->cap))
2144 set_bit(did, iommu->domain_ids);
2146 set_context_copied(iommu, bus, devfn);
2150 tbl[tbl_idx + pos] = new_ce;
2152 __iommu_flush_cache(iommu, new_ce, VTD_PAGE_SIZE);
2161 static int copy_translation_tables(struct intel_iommu *iommu)
2163 struct context_entry **ctxt_tbls;
2164 struct root_entry *old_rt;
2165 phys_addr_t old_rt_phys;
2166 int ctxt_table_entries;
2171 rtaddr_reg = dmar_readq(iommu->reg + DMAR_RTADDR_REG);
2172 ext = !!(rtaddr_reg & DMA_RTADDR_SMT);
2173 new_ext = !!sm_supported(iommu);
	/*
	 * The RTT bit can only be changed when translation is disabled,
	 * but disabling translation means to open a window for data
	 * corruption. So bail out and don't copy anything if we would
	 * have to change the bit.
	 */
2184 iommu->copied_tables = bitmap_zalloc(BIT_ULL(16), GFP_KERNEL);
2185 if (!iommu->copied_tables)
2188 old_rt_phys = rtaddr_reg & VTD_PAGE_MASK;
2192 old_rt = memremap(old_rt_phys, PAGE_SIZE, MEMREMAP_WB);
2196 /* This is too big for the stack - allocate it from slab */
2197 ctxt_table_entries = ext ? 512 : 256;
2199 ctxt_tbls = kcalloc(ctxt_table_entries, sizeof(void *), GFP_KERNEL);
2203 for (bus = 0; bus < 256; bus++) {
2204 ret = copy_context_table(iommu, &old_rt[bus],
2205 ctxt_tbls, bus, ext);
2207 pr_err("%s: Failed to copy context table for bus %d\n",
2213 spin_lock(&iommu->lock);
2215 /* Context tables are copied, now write them to the root_entry table */
2216 for (bus = 0; bus < 256; bus++) {
2217 int idx = ext ? bus * 2 : bus;
2220 if (ctxt_tbls[idx]) {
2221 val = virt_to_phys(ctxt_tbls[idx]) | 1;
2222 iommu->root_entry[bus].lo = val;
2225 if (!ext || !ctxt_tbls[idx + 1])
2228 val = virt_to_phys(ctxt_tbls[idx + 1]) | 1;
2229 iommu->root_entry[bus].hi = val;
2232 spin_unlock(&iommu->lock);
2236 __iommu_flush_cache(iommu, iommu->root_entry, PAGE_SIZE);
2246 static int __init init_dmars(void)
2248 struct dmar_drhd_unit *drhd;
2249 struct intel_iommu *iommu;
2252 ret = intel_cap_audit(CAP_AUDIT_STATIC_DMAR, NULL);
2256 for_each_iommu(iommu, drhd) {
2257 if (drhd->ignored) {
2258 iommu_disable_translation(iommu);
		/*
		 * Find the max pasid size of all IOMMU's in the system.
		 * We need to ensure the system pasid table is no bigger
		 * than the smallest supported.
		 */
2267 if (pasid_supported(iommu)) {
2268 u32 temp = 2 << ecap_pss(iommu->ecap);
2270 intel_pasid_max_id = min_t(u32, temp,
2271 intel_pasid_max_id);
2274 intel_iommu_init_qi(iommu);
2276 ret = iommu_init_domains(iommu);
2280 init_translation_status(iommu);
2282 if (translation_pre_enabled(iommu) && !is_kdump_kernel()) {
2283 iommu_disable_translation(iommu);
2284 clear_translation_pre_enabled(iommu);
2285 pr_warn("Translation was enabled for %s but we are not in kdump mode\n",
		/*
		 * We could share the same root & context tables
		 * among all IOMMUs. Need to split this later.
		 */
2294 ret = iommu_alloc_root_entry(iommu);
2298 if (translation_pre_enabled(iommu)) {
2299 pr_info("Translation already enabled - trying to copy translation structures\n");
2301 ret = copy_translation_tables(iommu);
				/*
				 * We found the IOMMU with translation
				 * enabled - but failed to copy over the
				 * old root-entry table. Try to proceed
				 * by disabling translation now and
				 * allocating a clean root-entry table.
				 * This might cause DMAR faults, but
				 * probably the dump will still succeed.
				 */
2312 pr_err("Failed to copy translation tables from previous kernel for %s\n",
2314 iommu_disable_translation(iommu);
2315 clear_translation_pre_enabled(iommu);
2317 pr_info("Copied translation tables from previous kernel for %s\n",
2322 intel_svm_check(iommu);
	/*
	 * Now that qi is enabled on all iommus, set the root entry and flush
	 * caches. This is required on some Intel X58 chipsets, otherwise the
	 * flush_context function will loop forever and the boot hangs.
	 */
2330 for_each_active_iommu(iommu, drhd) {
2331 iommu_flush_write_buffer(iommu);
2332 iommu_set_root_entry(iommu);
2335 check_tylersburg_isoch();
	/*
	 * global invalidate context cache
	 * global invalidate iotlb
	 * enable translation
	 */
2344 for_each_iommu(iommu, drhd) {
2345 if (drhd->ignored) {
			/*
			 * we always have to disable PMRs or DMA may fail on
			 * this device
			 */
2351 iommu_disable_protect_mem_regions(iommu);
2355 iommu_flush_write_buffer(iommu);
2357 #ifdef CONFIG_INTEL_IOMMU_SVM
2358 if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
			/*
			 * Call dmar_alloc_hwirq() with dmar_global_lock held,
			 * which could cause a possible lock race condition.
			 */
2363 up_write(&dmar_global_lock);
2364 ret = intel_svm_enable_prq(iommu);
2365 down_write(&dmar_global_lock);
2370 ret = dmar_set_interrupt(iommu);
2378 for_each_active_iommu(iommu, drhd) {
2379 disable_dmar_iommu(iommu);
2380 free_dmar_iommu(iommu);
2386 static void __init init_no_remapping_devices(void)
2388 struct dmar_drhd_unit *drhd;
2392 for_each_drhd_unit(drhd) {
2393 if (!drhd->include_all) {
2394 for_each_active_dev_scope(drhd->devices,
2395 drhd->devices_cnt, i, dev)
2397 /* ignore DMAR unit if no devices exist */
2398 if (i == drhd->devices_cnt)
2403 for_each_active_drhd_unit(drhd) {
2404 if (drhd->include_all)
2407 for_each_active_dev_scope(drhd->devices,
2408 drhd->devices_cnt, i, dev)
2409 if (!dev_is_pci(dev) || !IS_GFX_DEVICE(to_pci_dev(dev)))
2411 if (i < drhd->devices_cnt)
2414 /* This IOMMU has *only* gfx devices. Mark it as dedicated to
2415 graphics and bypass it if the graphics IOMMU is disabled. */
2416 drhd->gfx_dedicated = 1;
2417 if (disable_igfx_iommu)
2422 #ifdef CONFIG_SUSPEND
2423 static int init_iommu_hw(void)
2425 struct dmar_drhd_unit *drhd;
2426 struct intel_iommu *iommu = NULL;
2429 for_each_active_iommu(iommu, drhd) {
2431 ret = dmar_reenable_qi(iommu);
2437 for_each_iommu(iommu, drhd) {
2438 if (drhd->ignored) {
2440 * we always have to disable PMRs or DMA may fail on this device
2444 iommu_disable_protect_mem_regions(iommu);
2448 iommu_flush_write_buffer(iommu);
2449 iommu_set_root_entry(iommu);
2450 iommu_enable_translation(iommu);
2451 iommu_disable_protect_mem_regions(iommu);
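/*
 * Globally invalidate the context cache and IOTLB on every active
 * IOMMU. Used on the suspend path before register state is saved.
 */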
2457 static void iommu_flush_all(void)
2459 struct dmar_drhd_unit *drhd;
2460 struct intel_iommu *iommu;
2462 for_each_active_iommu(iommu, drhd) {
2463 iommu->flush.flush_context(iommu, 0, 0, 0,
2464 DMA_CCMD_GLOBAL_INVL);
2465 iommu->flush.flush_iotlb(iommu, 0, 0, 0,
2466 DMA_TLB_GLOBAL_FLUSH);
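/*
 * Suspend handler: disable translation and save the fault-event
 * control/data/address registers of each active IOMMU so they can be
 * restored on resume.
 */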
2470 static int iommu_suspend(void)
2472 struct dmar_drhd_unit *drhd;
2473 struct intel_iommu *iommu = NULL;
2478 for_each_active_iommu(iommu, drhd) {
2479 iommu_disable_translation(iommu);
2481 raw_spin_lock_irqsave(&iommu->register_lock, flag);
2483 iommu->iommu_state[SR_DMAR_FECTL_REG] =
2484 readl(iommu->reg + DMAR_FECTL_REG);
2485 iommu->iommu_state[SR_DMAR_FEDATA_REG] =
2486 readl(iommu->reg + DMAR_FEDATA_REG);
2487 iommu->iommu_state[SR_DMAR_FEADDR_REG] =
2488 readl(iommu->reg + DMAR_FEADDR_REG);
2489 iommu->iommu_state[SR_DMAR_FEUADDR_REG] =
2490 readl(iommu->reg + DMAR_FEUADDR_REG);
2492 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
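/*
 * Resume handler: re-initialize the hardware via init_iommu_hw() and
 * restore the saved fault-event registers on each active IOMMU.
 */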
2497 static void iommu_resume(void)
2499 struct dmar_drhd_unit *drhd;
2500 struct intel_iommu *iommu = NULL;
2503 if (init_iommu_hw()) {
2505 panic("tboot: IOMMU setup failed, DMAR can not resume!\n");
2507 WARN(1, "IOMMU setup failed, DMAR can not resume!\n");
2511 for_each_active_iommu(iommu, drhd) {
2513 raw_spin_lock_irqsave(&iommu->register_lock, flag);
2515 writel(iommu->iommu_state[SR_DMAR_FECTL_REG],
2516 iommu->reg + DMAR_FECTL_REG);
2517 writel(iommu->iommu_state[SR_DMAR_FEDATA_REG],
2518 iommu->reg + DMAR_FEDATA_REG);
2519 writel(iommu->iommu_state[SR_DMAR_FEADDR_REG],
2520 iommu->reg + DMAR_FEADDR_REG);
2521 writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG],
2522 iommu->reg + DMAR_FEUADDR_REG);
2524 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
2528 static struct syscore_ops iommu_syscore_ops = {
2529 .resume = iommu_resume,
2530 .suspend = iommu_suspend,
2533 static void __init init_iommu_pm_ops(void)
2535 register_syscore_ops(&iommu_syscore_ops);
2539 static inline void init_iommu_pm_ops(void) {}
2540 #endif /* CONFIG_PM */
2542 static int __init rmrr_sanity_check(struct acpi_dmar_reserved_memory *rmrr)
2544 if (!IS_ALIGNED(rmrr->base_address, PAGE_SIZE) ||
2545 !IS_ALIGNED(rmrr->end_address + 1, PAGE_SIZE) ||
2546 rmrr->end_address <= rmrr->base_address ||
2547 arch_rmrr_sanity_check(rmrr))
2553 int __init dmar_parse_one_rmrr(struct acpi_dmar_header *header, void *arg)
2555 struct acpi_dmar_reserved_memory *rmrr;
2556 struct dmar_rmrr_unit *rmrru;
2558 rmrr = (struct acpi_dmar_reserved_memory *)header;
2559 if (rmrr_sanity_check(rmrr)) {
2561 "Your BIOS is broken; bad RMRR [%#018Lx-%#018Lx]\n"
2562 "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
2563 rmrr->base_address, rmrr->end_address,
2564 dmi_get_system_info(DMI_BIOS_VENDOR),
2565 dmi_get_system_info(DMI_BIOS_VERSION),
2566 dmi_get_system_info(DMI_PRODUCT_VERSION));
2567 add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
2570 rmrru = kzalloc(sizeof(*rmrru), GFP_KERNEL);
2574 rmrru->hdr = header;
2576 rmrru->base_address = rmrr->base_address;
2577 rmrru->end_address = rmrr->end_address;
2579 rmrru->devices = dmar_alloc_dev_scope((void *)(rmrr + 1),
2580 ((void *)rmrr) + rmrr->header.length,
2581 &rmrru->devices_cnt);
2582 if (rmrru->devices_cnt && rmrru->devices == NULL)
2585 list_add(&rmrru->list, &dmar_rmrr_units);
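/*
 * Find an already-registered ATSR unit that matches @atsr (same PCI
 * segment, same length and identical contents).
 */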
2594 static struct dmar_atsr_unit *dmar_find_atsr(struct acpi_dmar_atsr *atsr)
2596 struct dmar_atsr_unit *atsru;
2597 struct acpi_dmar_atsr *tmp;
2599 list_for_each_entry_rcu(atsru, &dmar_atsr_units, list,
2601 tmp = (struct acpi_dmar_atsr *)atsru->hdr;
2602 if (atsr->segment != tmp->segment)
2604 if (atsr->header.length != tmp->header.length)
2606 if (memcmp(atsr, tmp, atsr->header.length) == 0)
2613 int dmar_parse_one_atsr(struct acpi_dmar_header *hdr, void *arg)
2615 struct acpi_dmar_atsr *atsr;
2616 struct dmar_atsr_unit *atsru;
2618 if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled)
2621 atsr = container_of(hdr, struct acpi_dmar_atsr, header);
2622 atsru = dmar_find_atsr(atsr);
2626 atsru = kzalloc(sizeof(*atsru) + hdr->length, GFP_KERNEL);
2631 * If memory is allocated from slab by the ACPI _DSM method, we need to
2632 * copy the memory content because the memory buffer will be freed on return.
2635 atsru->hdr = (void *)(atsru + 1);
2636 memcpy(atsru->hdr, hdr, hdr->length);
2637 atsru->include_all = atsr->flags & 0x1;
2638 if (!atsru->include_all) {
2639 atsru->devices = dmar_alloc_dev_scope((void *)(atsr + 1),
2640 (void *)atsr + atsr->header.length,
2641 &atsru->devices_cnt);
2642 if (atsru->devices_cnt && atsru->devices == NULL) {
2648 list_add_rcu(&atsru->list, &dmar_atsr_units);
2653 static void intel_iommu_free_atsr(struct dmar_atsr_unit *atsru)
2655 dmar_free_dev_scope(&atsru->devices, &atsru->devices_cnt);
2659 int dmar_release_one_atsr(struct acpi_dmar_header *hdr, void *arg)
2661 struct acpi_dmar_atsr *atsr;
2662 struct dmar_atsr_unit *atsru;
2664 atsr = container_of(hdr, struct acpi_dmar_atsr, header);
2665 atsru = dmar_find_atsr(atsr);
2667 list_del_rcu(&atsru->list);
2669 intel_iommu_free_atsr(atsru);
2675 int dmar_check_one_atsr(struct acpi_dmar_header *hdr, void *arg)
2679 struct acpi_dmar_atsr *atsr;
2680 struct dmar_atsr_unit *atsru;
2682 atsr = container_of(hdr, struct acpi_dmar_atsr, header);
2683 atsru = dmar_find_atsr(atsr);
2687 if (!atsru->include_all && atsru->devices && atsru->devices_cnt) {
2688 for_each_active_dev_scope(atsru->devices, atsru->devices_cnt,
2696 static struct dmar_satc_unit *dmar_find_satc(struct acpi_dmar_satc *satc)
2698 struct dmar_satc_unit *satcu;
2699 struct acpi_dmar_satc *tmp;
2701 list_for_each_entry_rcu(satcu, &dmar_satc_units, list,
2703 tmp = (struct acpi_dmar_satc *)satcu->hdr;
2704 if (satc->segment != tmp->segment)
2706 if (satc->header.length != tmp->header.length)
2708 if (memcmp(satc, tmp, satc->header.length) == 0)
2715 int dmar_parse_one_satc(struct acpi_dmar_header *hdr, void *arg)
2717 struct acpi_dmar_satc *satc;
2718 struct dmar_satc_unit *satcu;
2720 if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled)
2723 satc = container_of(hdr, struct acpi_dmar_satc, header);
2724 satcu = dmar_find_satc(satc);
2728 satcu = kzalloc(sizeof(*satcu) + hdr->length, GFP_KERNEL);
2732 satcu->hdr = (void *)(satcu + 1);
2733 memcpy(satcu->hdr, hdr, hdr->length);
2734 satcu->atc_required = satc->flags & 0x1;
2735 satcu->devices = dmar_alloc_dev_scope((void *)(satc + 1),
2736 (void *)satc + satc->header.length,
2737 &satcu->devices_cnt);
2738 if (satcu->devices_cnt && !satcu->devices) {
2742 list_add_rcu(&satcu->list, &dmar_satc_units);
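/*
 * Initialize a hot-added DMAR unit: audit its capabilities, allocate
 * domain IDs and a root entry, enable queued invalidation, the
 * page-request queue and the fault interrupt, then program the root
 * entry and turn on translation. If the unit is ignored, only its
 * protected memory regions are disabled.
 */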
2747 static int intel_iommu_add(struct dmar_drhd_unit *dmaru)
2750 struct intel_iommu *iommu = dmaru->iommu;
2752 ret = intel_cap_audit(CAP_AUDIT_HOTPLUG_DMAR, iommu);
2756 sp = domain_update_iommu_superpage(NULL, iommu) - 1;
2757 if (sp >= 0 && !(cap_super_page_val(iommu->cap) & (1 << sp))) {
2758 pr_warn("%s: Doesn't support large page.\n",
2764 * Disable translation if already enabled prior to OS handover.
2766 if (iommu->gcmd & DMA_GCMD_TE)
2767 iommu_disable_translation(iommu);
2769 ret = iommu_init_domains(iommu);
2771 ret = iommu_alloc_root_entry(iommu);
2775 intel_svm_check(iommu);
2777 if (dmaru->ignored) {
2779 * we always have to disable PMRs or DMA may fail on this device
2782 iommu_disable_protect_mem_regions(iommu);
2786 intel_iommu_init_qi(iommu);
2787 iommu_flush_write_buffer(iommu);
2789 #ifdef CONFIG_INTEL_IOMMU_SVM
2790 if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
2791 ret = intel_svm_enable_prq(iommu);
2796 ret = dmar_set_interrupt(iommu);
2800 iommu_set_root_entry(iommu);
2801 iommu_enable_translation(iommu);
2803 iommu_disable_protect_mem_regions(iommu);
2807 disable_dmar_iommu(iommu);
2809 free_dmar_iommu(iommu);
2813 int dmar_iommu_hotplug(struct dmar_drhd_unit *dmaru, bool insert)
2816 struct intel_iommu *iommu = dmaru->iommu;
2818 if (!intel_iommu_enabled)
2824 ret = intel_iommu_add(dmaru);
2826 disable_dmar_iommu(iommu);
2827 free_dmar_iommu(iommu);
2833 static void intel_iommu_free_dmars(void)
2835 struct dmar_rmrr_unit *rmrru, *rmrr_n;
2836 struct dmar_atsr_unit *atsru, *atsr_n;
2837 struct dmar_satc_unit *satcu, *satc_n;
2839 list_for_each_entry_safe(rmrru, rmrr_n, &dmar_rmrr_units, list) {
2840 list_del(&rmrru->list);
2841 dmar_free_dev_scope(&rmrru->devices, &rmrru->devices_cnt);
2845 list_for_each_entry_safe(atsru, atsr_n, &dmar_atsr_units, list) {
2846 list_del(&atsru->list);
2847 intel_iommu_free_atsr(atsru);
2849 list_for_each_entry_safe(satcu, satc_n, &dmar_satc_units, list) {
2850 list_del(&satcu->list);
2851 dmar_free_dev_scope(&satcu->devices, &satcu->devices_cnt);
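/*
 * Find the SATC unit whose device scope contains @dev; VFs are matched
 * through their physical function.
 */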
2856 static struct dmar_satc_unit *dmar_find_matched_satc_unit(struct pci_dev *dev)
2858 struct dmar_satc_unit *satcu;
2859 struct acpi_dmar_satc *satc;
2863 dev = pci_physfn(dev);
2866 list_for_each_entry_rcu(satcu, &dmar_satc_units, list) {
2867 satc = container_of(satcu->hdr, struct acpi_dmar_satc, header);
2868 if (satc->segment != pci_domain_nr(dev->bus))
2870 for_each_dev_scope(satcu->devices, satcu->devices_cnt, i, tmp)
2871 if (to_pci_dev(tmp) == dev)
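/*
 * Decide whether the OS may enable ATS for @dev: devices covered by a
 * SATC unit are handled according to the atc_required flag, otherwise
 * walk up to the root port and match it against the ATSR units of the
 * same PCI segment.
 */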
2880 static int dmar_ats_supported(struct pci_dev *dev, struct intel_iommu *iommu)
2883 struct pci_bus *bus;
2884 struct pci_dev *bridge = NULL;
2886 struct acpi_dmar_atsr *atsr;
2887 struct dmar_atsr_unit *atsru;
2888 struct dmar_satc_unit *satcu;
2890 dev = pci_physfn(dev);
2891 satcu = dmar_find_matched_satc_unit(dev);
2894 * This device supports ATS because it is listed in the SATC table.
2895 * When the IOMMU is in legacy mode, ATS is enabled automatically
2896 * by hardware for devices that require it, so the OS should not
2897 * enable ATS for this device as well, to avoid duplicated TLB
2898 * invalidations.
2900 return !(satcu->atc_required && !sm_supported(iommu));
2902 for (bus = dev->bus; bus; bus = bus->parent) {
2904 /* If it's an integrated device, allow ATS */
2907 /* Connected via non-PCIe: no ATS */
2908 if (!pci_is_pcie(bridge) ||
2909 pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE)
2911 /* If we found the root port, look it up in the ATSR */
2912 if (pci_pcie_type(bridge) == PCI_EXP_TYPE_ROOT_PORT)
2917 list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) {
2918 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
2919 if (atsr->segment != pci_domain_nr(dev->bus))
2922 for_each_dev_scope(atsru->devices, atsru->devices_cnt, i, tmp)
2923 if (tmp == &bridge->dev)
2926 if (atsru->include_all)
2936 int dmar_iommu_notify_scope_dev(struct dmar_pci_notify_info *info)
2939 struct dmar_rmrr_unit *rmrru;
2940 struct dmar_atsr_unit *atsru;
2941 struct dmar_satc_unit *satcu;
2942 struct acpi_dmar_atsr *atsr;
2943 struct acpi_dmar_reserved_memory *rmrr;
2944 struct acpi_dmar_satc *satc;
2946 if (!intel_iommu_enabled && system_state >= SYSTEM_RUNNING)
2949 list_for_each_entry(rmrru, &dmar_rmrr_units, list) {
2950 rmrr = container_of(rmrru->hdr,
2951 struct acpi_dmar_reserved_memory, header);
2952 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
2953 ret = dmar_insert_dev_scope(info, (void *)(rmrr + 1),
2954 ((void *)rmrr) + rmrr->header.length,
2955 rmrr->segment, rmrru->devices,
2956 rmrru->devices_cnt);
2959 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
2960 dmar_remove_dev_scope(info, rmrr->segment,
2961 rmrru->devices, rmrru->devices_cnt);
2965 list_for_each_entry(atsru, &dmar_atsr_units, list) {
2966 if (atsru->include_all)
2969 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
2970 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
2971 ret = dmar_insert_dev_scope(info, (void *)(atsr + 1),
2972 (void *)atsr + atsr->header.length,
2973 atsr->segment, atsru->devices,
2974 atsru->devices_cnt);
2979 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
2980 if (dmar_remove_dev_scope(info, atsr->segment,
2981 atsru->devices, atsru->devices_cnt))
2985 list_for_each_entry(satcu, &dmar_satc_units, list) {
2986 satc = container_of(satcu->hdr, struct acpi_dmar_satc, header);
2987 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
2988 ret = dmar_insert_dev_scope(info, (void *)(satc + 1),
2989 (void *)satc + satc->header.length,
2990 satc->segment, satcu->devices,
2991 satcu->devices_cnt);
2996 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
2997 if (dmar_remove_dev_scope(info, satc->segment,
2998 satcu->devices, satcu->devices_cnt))
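/* Turn off DMA translation on every DMAR unit, including ignored ones. */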
3006 static void intel_disable_iommus(void)
3008 struct intel_iommu *iommu = NULL;
3009 struct dmar_drhd_unit *drhd;
3011 for_each_iommu(iommu, drhd)
3012 iommu_disable_translation(iommu);
3015 void intel_iommu_shutdown(void)
3017 struct dmar_drhd_unit *drhd;
3018 struct intel_iommu *iommu = NULL;
3020 if (no_iommu || dmar_disabled)
3023 down_write(&dmar_global_lock);
3025 /* Disable PMRs explicitly here. */
3026 for_each_iommu(iommu, drhd)
3027 iommu_disable_protect_mem_regions(iommu);
3029 /* Make sure the IOMMUs are switched off */
3030 intel_disable_iommus();
3032 up_write(&dmar_global_lock);
3035 static struct intel_iommu *dev_to_intel_iommu(struct device *dev)
3037 struct iommu_device *iommu_dev = dev_to_iommu_device(dev);
3039 return container_of(iommu_dev, struct intel_iommu, iommu);
3042 static ssize_t version_show(struct device *dev,
3043 struct device_attribute *attr, char *buf)
3045 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3046 u32 ver = readl(iommu->reg + DMAR_VER_REG);
3047 return sysfs_emit(buf, "%d:%d\n",
3048 DMAR_VER_MAJOR(ver), DMAR_VER_MINOR(ver));
3050 static DEVICE_ATTR_RO(version);
3052 static ssize_t address_show(struct device *dev,
3053 struct device_attribute *attr, char *buf)
3055 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3056 return sysfs_emit(buf, "%llx\n", iommu->reg_phys);
3058 static DEVICE_ATTR_RO(address);
3060 static ssize_t cap_show(struct device *dev,
3061 struct device_attribute *attr, char *buf)
3063 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3064 return sysfs_emit(buf, "%llx\n", iommu->cap);
3066 static DEVICE_ATTR_RO(cap);
3068 static ssize_t ecap_show(struct device *dev,
3069 struct device_attribute *attr, char *buf)
3071 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3072 return sysfs_emit(buf, "%llx\n", iommu->ecap);
3074 static DEVICE_ATTR_RO(ecap);
3076 static ssize_t domains_supported_show(struct device *dev,
3077 struct device_attribute *attr, char *buf)
3079 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3080 return sysfs_emit(buf, "%ld\n", cap_ndoms(iommu->cap));
3082 static DEVICE_ATTR_RO(domains_supported);
3084 static ssize_t domains_used_show(struct device *dev,
3085 struct device_attribute *attr, char *buf)
3087 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3088 return sysfs_emit(buf, "%d\n",
3089 bitmap_weight(iommu->domain_ids,
3090 cap_ndoms(iommu->cap)));
3092 static DEVICE_ATTR_RO(domains_used);
3094 static struct attribute *intel_iommu_attrs[] = {
3095 &dev_attr_version.attr,
3096 &dev_attr_address.attr,
3098 &dev_attr_ecap.attr,
3099 &dev_attr_domains_supported.attr,
3100 &dev_attr_domains_used.attr,
3104 static struct attribute_group intel_iommu_group = {
3105 .name = "intel-iommu",
3106 .attrs = intel_iommu_attrs,
3109 const struct attribute_group *intel_iommu_groups[] = {
3114 static bool has_external_pci(void)
3116 struct pci_dev *pdev = NULL;
3118 for_each_pci_dev(pdev)
3119 if (pdev->external_facing) {
3127 static int __init platform_optin_force_iommu(void)
3129 if (!dmar_platform_optin() || no_platform_optin || !has_external_pci())
3132 if (no_iommu || dmar_disabled)
3133 pr_info("Intel-IOMMU force enabled due to platform opt in\n");
3136 * If Intel-IOMMU is disabled by default, we will apply identity
3137 * map for all devices except those marked as being untrusted.
3140 iommu_set_default_passthrough(false);
3148 static int __init probe_acpi_namespace_devices(void)
3150 struct dmar_drhd_unit *drhd;
3151 /* To avoid a -Wunused-but-set-variable warning. */
3152 struct intel_iommu *iommu __maybe_unused;
3156 for_each_active_iommu(iommu, drhd) {
3157 for_each_active_dev_scope(drhd->devices,
3158 drhd->devices_cnt, i, dev) {
3159 struct acpi_device_physical_node *pn;
3160 struct acpi_device *adev;
3162 if (dev->bus != &acpi_bus_type)
3165 adev = to_acpi_device(dev);
3166 mutex_lock(&adev->physical_node_lock);
3167 list_for_each_entry(pn,
3168 &adev->physical_node_list, node) {
3169 ret = iommu_probe_device(pn->dev);
3173 mutex_unlock(&adev->physical_node_lock);
3183 static __init int tboot_force_iommu(void)
3185 if (!tboot_enabled())
3188 if (no_iommu || dmar_disabled)
3189 pr_warn("Forcing Intel-IOMMU to enabled\n");
3197 int __init intel_iommu_init(void)
3200 struct dmar_drhd_unit *drhd;
3201 struct intel_iommu *iommu;
3204 * Intel IOMMU is required for a TXT/tboot launch or platform
3205 * opt in, so enforce that.
3207 force_on = (!intel_iommu_tboot_noforce && tboot_force_iommu()) ||
3208 platform_optin_force_iommu();
3210 down_write(&dmar_global_lock);
3211 if (dmar_table_init()) {
3213 panic("tboot: Failed to initialize DMAR table\n");
3217 if (dmar_dev_scope_init() < 0) {
3219 panic("tboot: Failed to initialize DMAR device scope\n");
3223 up_write(&dmar_global_lock);
3226 * The bus notifier takes the dmar_global_lock, so lockdep will
3227 * complain later when we register it under the lock.
3229 dmar_register_bus_notifier();
3231 down_write(&dmar_global_lock);
3234 intel_iommu_debugfs_init();
3236 if (no_iommu || dmar_disabled) {
3238 * We exit the function here to ensure the IOMMU's remapping and
3239 * mempool aren't set up, which means that the IOMMU's PMRs
3240 * won't be disabled via the call to init_dmars(). So disable
3241 * them explicitly here. The PMRs were set up by tboot prior to
3242 * calling SENTER, but the kernel is expected to reset/tear them down.
3245 if (intel_iommu_tboot_noforce) {
3246 for_each_iommu(iommu, drhd)
3247 iommu_disable_protect_mem_regions(iommu);
3251 * Make sure the IOMMUs are switched off, even when we
3252 * boot into a kexec kernel and the previous kernel left
3255 intel_disable_iommus();
3259 if (list_empty(&dmar_rmrr_units))
3260 pr_info("No RMRR found\n");
3262 if (list_empty(&dmar_atsr_units))
3263 pr_info("No ATSR found\n");
3265 if (list_empty(&dmar_satc_units))
3266 pr_info("No SATC found\n");
3268 init_no_remapping_devices();
3273 panic("tboot: Failed to initialize DMARs\n");
3274 pr_err("Initialization failed\n");
3277 up_write(&dmar_global_lock);
3279 init_iommu_pm_ops();
3281 down_read(&dmar_global_lock);
3282 for_each_active_iommu(iommu, drhd) {
3284 * The flush queue implementation does not perform
3285 * page-selective invalidations that are required for efficient
3286 * TLB flushes in virtual environments. The benefit of batching
3287 * is likely to be much lower than the overhead of synchronizing
3288 * the virtual and physical IOMMU page-tables.
3290 if (cap_caching_mode(iommu->cap) &&
3291 !first_level_by_default(IOMMU_DOMAIN_DMA)) {
3292 pr_info_once("IOMMU batching disallowed due to virtualization\n");
3293 iommu_set_dma_strict();
3295 iommu_device_sysfs_add(&iommu->iommu, NULL,
3298 iommu_device_register(&iommu->iommu, &intel_iommu_ops, NULL);
3300 iommu_pmu_register(iommu);
3303 if (probe_acpi_namespace_devices())
3304 pr_warn("ACPI name space devices didn't probe correctly\n");
3306 /* Finally, we enable the DMA remapping hardware. */
3307 for_each_iommu(iommu, drhd) {
3308 if (!drhd->ignored && !translation_pre_enabled(iommu))
3309 iommu_enable_translation(iommu);
3311 iommu_disable_protect_mem_regions(iommu);
3313 up_read(&dmar_global_lock);
3315 pr_info("Intel(R) Virtualization Technology for Directed I/O\n");
3317 intel_iommu_enabled = 1;
3322 intel_iommu_free_dmars();
3323 up_write(&dmar_global_lock);
3327 static int domain_context_clear_one_cb(struct pci_dev *pdev, u16 alias, void *opaque)
3329 struct device_domain_info *info = opaque;
3331 domain_context_clear_one(info, PCI_BUS_NUM(alias), alias & 0xff);
3336 * NB - intel-iommu lacks any sort of reference counting for the users of
3337 * dependent devices. If multiple endpoints have intersecting dependent
3338 * devices, unbinding the driver from any one of them will possibly leave
3339 * the others unable to operate.
3341 static void domain_context_clear(struct device_domain_info *info)
3343 if (!dev_is_pci(info->dev))
3344 domain_context_clear_one(info, info->bus, info->devfn);
3346 pci_for_each_dma_alias(to_pci_dev(info->dev),
3347 &domain_context_clear_one_cb, info);
3351 * Clear the page table pointer in context or pasid table entries so that
3352 * all DMA requests without PASID from the device are blocked. If the page
3353 * table has been set, clean up the data structures.
3355 void device_block_translation(struct device *dev)
3357 struct device_domain_info *info = dev_iommu_priv_get(dev);
3358 struct intel_iommu *iommu = info->iommu;
3359 unsigned long flags;
3361 iommu_disable_pci_caps(info);
3362 if (!dev_is_real_dma_subdevice(dev)) {
3363 if (sm_supported(iommu))
3364 intel_pasid_tear_down_entry(iommu, dev,
3365 IOMMU_NO_PASID, false);
3367 domain_context_clear(info);
3373 spin_lock_irqsave(&info->domain->lock, flags);
3374 list_del(&info->link);
3375 spin_unlock_irqrestore(&info->domain->lock, flags);
3377 cache_tag_unassign_domain(info->domain, dev, IOMMU_NO_PASID);
3378 domain_detach_iommu(info->domain, iommu);
3379 info->domain = NULL;
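/*
 * Initialize an UNMANAGED domain for the given guest address width:
 * derive the adjusted guest address width (AGAW) and allocate the
 * top-level page directory.
 */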
3382 static int md_domain_init(struct dmar_domain *domain, int guest_width)
3386 /* calculate AGAW */
3387 domain->gaw = guest_width;
3388 adjust_width = guestwidth_to_adjustwidth(guest_width);
3389 domain->agaw = width_to_agaw(adjust_width);
3391 domain->iommu_coherency = false;
3392 domain->iommu_superpage = 0;
3393 domain->max_addr = 0;
3395 /* always allocate the top pgd */
3396 domain->pgd = iommu_alloc_page_node(domain->nid, GFP_ATOMIC);
3399 domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
3403 static int blocking_domain_attach_dev(struct iommu_domain *domain,
3406 device_block_translation(dev);
3410 static struct iommu_domain blocking_domain = {
3411 .type = IOMMU_DOMAIN_BLOCKED,
3412 .ops = &(const struct iommu_domain_ops) {
3413 .attach_dev = blocking_domain_attach_dev,
3417 static int iommu_superpage_capability(struct intel_iommu *iommu, bool first_stage)
3419 if (!intel_iommu_superpage)
3423 return cap_fl1gp_support(iommu->cap) ? 2 : 1;
3425 return fls(cap_super_page_val(iommu->cap));
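/*
 * Allocate a paging domain bound to @dev's IOMMU. Address width,
 * coherency, superpage support and the IOVA aperture are derived from
 * the IOMMU capabilities; @first_stage selects first-level translation.
 */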
3428 static struct dmar_domain *paging_domain_alloc(struct device *dev, bool first_stage)
3430 struct device_domain_info *info = dev_iommu_priv_get(dev);
3431 struct intel_iommu *iommu = info->iommu;
3432 struct dmar_domain *domain;
3435 domain = kzalloc(sizeof(*domain), GFP_KERNEL);
3437 return ERR_PTR(-ENOMEM);
3439 INIT_LIST_HEAD(&domain->devices);
3440 INIT_LIST_HEAD(&domain->dev_pasids);
3441 INIT_LIST_HEAD(&domain->cache_tags);
3442 spin_lock_init(&domain->lock);
3443 spin_lock_init(&domain->cache_lock);
3444 xa_init(&domain->iommu_array);
3446 domain->nid = dev_to_node(dev);
3447 domain->use_first_level = first_stage;
3449 /* calculate the address width */
3450 addr_width = agaw_to_width(iommu->agaw);
3451 if (addr_width > cap_mgaw(iommu->cap))
3452 addr_width = cap_mgaw(iommu->cap);
3453 domain->gaw = addr_width;
3454 domain->agaw = iommu->agaw;
3455 domain->max_addr = __DOMAIN_MAX_ADDR(addr_width);
3457 /* iommu memory access coherency */
3458 domain->iommu_coherency = iommu_paging_structure_coherency(iommu);
3460 /* pagesize bitmap */
3461 domain->domain.pgsize_bitmap = SZ_4K;
3462 domain->iommu_superpage = iommu_superpage_capability(iommu, first_stage);
3463 domain->domain.pgsize_bitmap |= domain_super_pgsize_bitmap(domain);
3466 * IOVA aperture: First-level translation restricts the input-address
3467 * to a canonical address (i.e., address bits 63:N have the same value
3468 * as address bit [N-1], where N is 48-bits with 4-level paging and
3469 * 57-bits with 5-level paging). Hence, skip bit [N-1].
3471 domain->domain.geometry.force_aperture = true;
3472 domain->domain.geometry.aperture_start = 0;
3474 domain->domain.geometry.aperture_end = __DOMAIN_MAX_ADDR(domain->gaw - 1);
3476 domain->domain.geometry.aperture_end = __DOMAIN_MAX_ADDR(domain->gaw);
3478 /* always allocate the top pgd */
3479 domain->pgd = iommu_alloc_page_node(domain->nid, GFP_KERNEL);
3482 return ERR_PTR(-ENOMEM);
3484 domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
3489 static struct iommu_domain *intel_iommu_domain_alloc(unsigned type)
3491 struct dmar_domain *dmar_domain;
3492 struct iommu_domain *domain;
3495 case IOMMU_DOMAIN_DMA:
3496 case IOMMU_DOMAIN_UNMANAGED:
3497 dmar_domain = alloc_domain(type);
3499 pr_err("Can't allocate dmar_domain\n");
3502 if (md_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
3503 pr_err("Domain initialization failed\n");
3504 domain_exit(dmar_domain);
3508 domain = &dmar_domain->domain;
3509 domain->geometry.aperture_start = 0;
3510 domain->geometry.aperture_end =
3511 __DOMAIN_MAX_ADDR(dmar_domain->gaw);
3512 domain->geometry.force_aperture = true;
3522 static struct iommu_domain *
3523 intel_iommu_domain_alloc_user(struct device *dev, u32 flags,
3524 struct iommu_domain *parent,
3525 const struct iommu_user_data *user_data)
3527 struct device_domain_info *info = dev_iommu_priv_get(dev);
3528 bool dirty_tracking = flags & IOMMU_HWPT_ALLOC_DIRTY_TRACKING;
3529 bool nested_parent = flags & IOMMU_HWPT_ALLOC_NEST_PARENT;
3530 struct intel_iommu *iommu = info->iommu;
3531 struct dmar_domain *dmar_domain;
3532 struct iommu_domain *domain;
3534 /* Must be NESTING domain */
3536 if (!nested_supported(iommu) || flags)
3537 return ERR_PTR(-EOPNOTSUPP);
3538 return intel_nested_domain_alloc(parent, user_data);
3542 (~(IOMMU_HWPT_ALLOC_NEST_PARENT | IOMMU_HWPT_ALLOC_DIRTY_TRACKING)))
3543 return ERR_PTR(-EOPNOTSUPP);
3544 if (nested_parent && !nested_supported(iommu))
3545 return ERR_PTR(-EOPNOTSUPP);
3546 if (user_data || (dirty_tracking && !ssads_supported(iommu)))
3547 return ERR_PTR(-EOPNOTSUPP);
3549 /* Do not use first stage for user domain translation. */
3550 dmar_domain = paging_domain_alloc(dev, false);
3551 if (IS_ERR(dmar_domain))
3552 return ERR_CAST(dmar_domain);
3553 domain = &dmar_domain->domain;
3554 domain->type = IOMMU_DOMAIN_UNMANAGED;
3555 domain->owner = &intel_iommu_ops;
3556 domain->ops = intel_iommu_ops.default_domain_ops;
3558 if (nested_parent) {
3559 dmar_domain->nested_parent = true;
3560 INIT_LIST_HEAD(&dmar_domain->s1_domains);
3561 spin_lock_init(&dmar_domain->s1_lock);
3564 if (dirty_tracking) {
3565 if (dmar_domain->use_first_level) {
3566 iommu_domain_free(domain);
3567 return ERR_PTR(-EOPNOTSUPP);
3569 domain->dirty_ops = &intel_dirty_ops;
3575 static void intel_iommu_domain_free(struct iommu_domain *domain)
3577 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
3579 WARN_ON(dmar_domain->nested_parent &&
3580 !list_empty(&dmar_domain->s1_domains));
3581 domain_exit(dmar_domain);
3584 int prepare_domain_attach_device(struct iommu_domain *domain,
3587 struct device_domain_info *info = dev_iommu_priv_get(dev);
3588 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
3589 struct intel_iommu *iommu = info->iommu;
3592 if (dmar_domain->force_snooping && !ecap_sc_support(iommu->ecap))
3595 if (domain->dirty_ops && !ssads_supported(iommu))
3598 /* check if this iommu agaw is sufficient for max mapped address */
3599 addr_width = agaw_to_width(iommu->agaw);
3600 if (addr_width > cap_mgaw(iommu->cap))
3601 addr_width = cap_mgaw(iommu->cap);
3603 if (dmar_domain->max_addr > (1LL << addr_width))
3605 dmar_domain->gaw = addr_width;
3608 * Knock out extra levels of page tables if necessary
3610 while (iommu->agaw < dmar_domain->agaw) {
3611 struct dma_pte *pte;
3613 pte = dmar_domain->pgd;
3614 if (dma_pte_present(pte)) {
3615 dmar_domain->pgd = phys_to_virt(dma_pte_addr(pte));
3616 iommu_free_page(pte);
3618 dmar_domain->agaw--;
3621 if (sm_supported(iommu) && !dev_is_real_dma_subdevice(dev) &&
3622 context_copied(iommu, info->bus, info->devfn))
3623 return intel_pasid_setup_sm_context(dev);
3628 static int intel_iommu_attach_device(struct iommu_domain *domain,
3633 device_block_translation(dev);
3635 ret = prepare_domain_attach_device(domain, dev);
3639 return dmar_domain_attach_device(to_dmar_domain(domain), dev);
3642 static int intel_iommu_map(struct iommu_domain *domain,
3643 unsigned long iova, phys_addr_t hpa,
3644 size_t size, int iommu_prot, gfp_t gfp)
3646 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
3650 if (iommu_prot & IOMMU_READ)
3651 prot |= DMA_PTE_READ;
3652 if (iommu_prot & IOMMU_WRITE)
3653 prot |= DMA_PTE_WRITE;
3654 if (dmar_domain->set_pte_snp)
3655 prot |= DMA_PTE_SNP;
3657 max_addr = iova + size;
3658 if (dmar_domain->max_addr < max_addr) {
3661 /* check if minimum agaw is sufficient for mapped address */
3662 end = __DOMAIN_MAX_ADDR(dmar_domain->gaw) + 1;
3663 if (end < max_addr) {
3664 pr_err("%s: iommu width (%d) is not "
3665 "sufficient for the mapped address (%llx)\n",
3666 __func__, dmar_domain->gaw, max_addr);
3669 dmar_domain->max_addr = max_addr;
3671 /* Round up size to next multiple of PAGE_SIZE, if it and
3672 the low bits of hpa would take us onto the next page */
3673 size = aligned_nrpages(hpa, size);
3674 return __domain_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT,
3675 hpa >> VTD_PAGE_SHIFT, size, prot, gfp);
3678 static int intel_iommu_map_pages(struct iommu_domain *domain,
3679 unsigned long iova, phys_addr_t paddr,
3680 size_t pgsize, size_t pgcount,
3681 int prot, gfp_t gfp, size_t *mapped)
3683 unsigned long pgshift = __ffs(pgsize);
3684 size_t size = pgcount << pgshift;
3687 if (pgsize != SZ_4K && pgsize != SZ_2M && pgsize != SZ_1G)
3690 if (!IS_ALIGNED(iova | paddr, pgsize))
3693 ret = intel_iommu_map(domain, iova, paddr, size, prot, gfp);
3700 static size_t intel_iommu_unmap(struct iommu_domain *domain,
3701 unsigned long iova, size_t size,
3702 struct iommu_iotlb_gather *gather)
3704 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
3705 unsigned long start_pfn, last_pfn;
3708 /* Cope with horrid API which requires us to unmap more than the
3709 size argument if it happens to be a large-page mapping. */
3710 if (unlikely(!pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT,
3711 &level, GFP_ATOMIC)))
3714 if (size < VTD_PAGE_SIZE << level_to_offset_bits(level))
3715 size = VTD_PAGE_SIZE << level_to_offset_bits(level);
3717 start_pfn = iova >> VTD_PAGE_SHIFT;
3718 last_pfn = (iova + size - 1) >> VTD_PAGE_SHIFT;
3720 domain_unmap(dmar_domain, start_pfn, last_pfn, &gather->freelist);
3722 if (dmar_domain->max_addr == iova + size)
3723 dmar_domain->max_addr = iova;
3726 * We do not use page-selective IOTLB invalidation in flush queue,
3727 * so there is no need to track page and sync iotlb.
3729 if (!iommu_iotlb_gather_queued(gather))
3730 iommu_iotlb_gather_add_page(domain, gather, iova, size);
3735 static size_t intel_iommu_unmap_pages(struct iommu_domain *domain,
3737 size_t pgsize, size_t pgcount,
3738 struct iommu_iotlb_gather *gather)
3740 unsigned long pgshift = __ffs(pgsize);
3741 size_t size = pgcount << pgshift;
3743 return intel_iommu_unmap(domain, iova, size, gather);
3746 static void intel_iommu_tlb_sync(struct iommu_domain *domain,
3747 struct iommu_iotlb_gather *gather)
3749 cache_tag_flush_range(to_dmar_domain(domain), gather->start,
3750 gather->end, list_empty(&gather->freelist));
3751 iommu_put_pages_list(&gather->freelist);
3754 static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
3757 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
3758 struct dma_pte *pte;
3762 pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level,
3764 if (pte && dma_pte_present(pte))
3765 phys = dma_pte_addr(pte) +
3766 (iova & (BIT_MASK(level_to_offset_bits(level) +
3767 VTD_PAGE_SHIFT) - 1));
3772 static bool domain_support_force_snooping(struct dmar_domain *domain)
3774 struct device_domain_info *info;
3775 bool support = true;
3777 assert_spin_locked(&domain->lock);
3778 list_for_each_entry(info, &domain->devices, link) {
3779 if (!ecap_sc_support(info->iommu->ecap)) {
3788 static void domain_set_force_snooping(struct dmar_domain *domain)
3790 struct device_domain_info *info;
3792 assert_spin_locked(&domain->lock);
3794 * Second level page table supports per-PTE snoop control. The
3795 * iommu_map() interface will handle this by setting SNP bit.
3797 if (!domain->use_first_level) {
3798 domain->set_pte_snp = true;
3802 list_for_each_entry(info, &domain->devices, link)
3803 intel_pasid_setup_page_snoop_control(info->iommu, info->dev,
3807 static bool intel_iommu_enforce_cache_coherency(struct iommu_domain *domain)
3809 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
3810 unsigned long flags;
3812 if (dmar_domain->force_snooping)
3815 spin_lock_irqsave(&dmar_domain->lock, flags);
3816 if (!domain_support_force_snooping(dmar_domain) ||
3817 (!dmar_domain->use_first_level && dmar_domain->has_mappings)) {
3818 spin_unlock_irqrestore(&dmar_domain->lock, flags);
3822 domain_set_force_snooping(dmar_domain);
3823 dmar_domain->force_snooping = true;
3824 spin_unlock_irqrestore(&dmar_domain->lock, flags);
3829 static bool intel_iommu_capable(struct device *dev, enum iommu_cap cap)
3831 struct device_domain_info *info = dev_iommu_priv_get(dev);
3834 case IOMMU_CAP_CACHE_COHERENCY:
3835 case IOMMU_CAP_DEFERRED_FLUSH:
3837 case IOMMU_CAP_PRE_BOOT_PROTECTION:
3838 return dmar_platform_optin();
3839 case IOMMU_CAP_ENFORCE_CACHE_COHERENCY:
3840 return ecap_sc_support(info->iommu->ecap);
3841 case IOMMU_CAP_DIRTY_TRACKING:
3842 return ssads_supported(info->iommu);
3848 static struct iommu_device *intel_iommu_probe_device(struct device *dev)
3850 struct pci_dev *pdev = dev_is_pci(dev) ? to_pci_dev(dev) : NULL;
3851 struct device_domain_info *info;
3852 struct intel_iommu *iommu;
3856 iommu = device_lookup_iommu(dev, &bus, &devfn);
3857 if (!iommu || !iommu->iommu.ops)
3858 return ERR_PTR(-ENODEV);
3860 info = kzalloc(sizeof(*info), GFP_KERNEL);
3862 return ERR_PTR(-ENOMEM);
3864 if (dev_is_real_dma_subdevice(dev)) {
3865 info->bus = pdev->bus->number;
3866 info->devfn = pdev->devfn;
3867 info->segment = pci_domain_nr(pdev->bus);
3870 info->devfn = devfn;
3871 info->segment = iommu->segment;
3875 info->iommu = iommu;
3876 if (dev_is_pci(dev)) {
3877 if (ecap_dev_iotlb_support(iommu->ecap) &&
3878 pci_ats_supported(pdev) &&
3879 dmar_ats_supported(pdev, iommu)) {
3880 info->ats_supported = 1;
3881 info->dtlb_extra_inval = dev_needs_extra_dtlb_flush(pdev);
3884 * For IOMMU that supports device IOTLB throttling
3885 * (DIT), we assign PFSID to the invalidation desc
3886 * of a VF such that IOMMU HW can gauge queue depth
3887 * at PF level. If DIT is not set, PFSID will be
3888 * treated as reserved, which should be set to 0.
3890 if (ecap_dit(iommu->ecap))
3891 info->pfsid = pci_dev_id(pci_physfn(pdev));
3892 info->ats_qdep = pci_ats_queue_depth(pdev);
3894 if (sm_supported(iommu)) {
3895 if (pasid_supported(iommu)) {
3896 int features = pci_pasid_features(pdev);
3899 info->pasid_supported = features | 1;
3902 if (info->ats_supported && ecap_prs(iommu->ecap) &&
3903 pci_pri_supported(pdev))
3904 info->pri_supported = 1;
3908 dev_iommu_priv_set(dev, info);
3909 if (pdev && pci_ats_supported(pdev)) {
3910 pci_prepare_ats(pdev, VTD_PAGE_SHIFT);
3911 ret = device_rbtree_insert(iommu, info);
3916 if (sm_supported(iommu) && !dev_is_real_dma_subdevice(dev)) {
3917 ret = intel_pasid_alloc_table(dev);
3919 dev_err(dev, "PASID table allocation failed\n");
3923 if (!context_copied(iommu, info->bus, info->devfn)) {
3924 ret = intel_pasid_setup_sm_context(dev);
3930 intel_iommu_debugfs_create_dev(info);
3933 * The PCIe spec, in its wisdom, declares that the behaviour of the
3934 * device is undefined if you enable PASID support after ATS support.
3935 * So always enable PASID support on devices which have it, even if
3936 * we can't yet know if we're ever going to use it.
3938 if (info->pasid_supported &&
3939 !pci_enable_pasid(pdev, info->pasid_supported & ~1))
3940 info->pasid_enabled = 1;
3942 return &iommu->iommu;
3944 intel_pasid_free_table(dev);
3946 device_rbtree_remove(info);
3950 return ERR_PTR(ret);
3953 static void intel_iommu_release_device(struct device *dev)
3955 struct device_domain_info *info = dev_iommu_priv_get(dev);
3956 struct intel_iommu *iommu = info->iommu;
3958 if (info->pasid_enabled) {
3959 pci_disable_pasid(to_pci_dev(dev));
3960 info->pasid_enabled = 0;
3963 mutex_lock(&iommu->iopf_lock);
3964 if (dev_is_pci(dev) && pci_ats_supported(to_pci_dev(dev)))
3965 device_rbtree_remove(info);
3966 mutex_unlock(&iommu->iopf_lock);
3968 if (sm_supported(iommu) && !dev_is_real_dma_subdevice(dev) &&
3969 !context_copied(iommu, info->bus, info->devfn))
3970 intel_pasid_teardown_sm_context(dev);
3972 intel_pasid_free_table(dev);
3973 intel_iommu_debugfs_remove_dev(info);
3975 set_dma_ops(dev, NULL);
3978 static void intel_iommu_get_resv_regions(struct device *device,
3979 struct list_head *head)
3981 int prot = DMA_PTE_READ | DMA_PTE_WRITE;
3982 struct iommu_resv_region *reg;
3983 struct dmar_rmrr_unit *rmrr;
3984 struct device *i_dev;
3988 for_each_rmrr_units(rmrr) {
3989 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
3991 struct iommu_resv_region *resv;
3992 enum iommu_resv_type type;
3995 if (i_dev != device &&
3996 !is_downstream_to_pci_bridge(device, i_dev))
3999 length = rmrr->end_address - rmrr->base_address + 1;
4001 type = device_rmrr_is_relaxable(device) ?
4002 IOMMU_RESV_DIRECT_RELAXABLE : IOMMU_RESV_DIRECT;
4004 resv = iommu_alloc_resv_region(rmrr->base_address,
4010 list_add_tail(&resv->list, head);
4015 #ifdef CONFIG_INTEL_IOMMU_FLOPPY_WA
4016 if (dev_is_pci(device)) {
4017 struct pci_dev *pdev = to_pci_dev(device);
4019 if ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA) {
4020 reg = iommu_alloc_resv_region(0, 1UL << 24, prot,
4021 IOMMU_RESV_DIRECT_RELAXABLE,
4024 list_add_tail(®->list, head);
4027 #endif /* CONFIG_INTEL_IOMMU_FLOPPY_WA */
4029 reg = iommu_alloc_resv_region(IOAPIC_RANGE_START,
4030 IOAPIC_RANGE_END - IOAPIC_RANGE_START + 1,
4031 0, IOMMU_RESV_MSI, GFP_KERNEL);
4034 list_add_tail(®->list, head);
4037 static struct iommu_group *intel_iommu_device_group(struct device *dev)
4039 if (dev_is_pci(dev))
4040 return pci_device_group(dev);
4041 return generic_device_group(dev);
4044 static int intel_iommu_enable_sva(struct device *dev)
4046 struct device_domain_info *info = dev_iommu_priv_get(dev);
4047 struct intel_iommu *iommu;
4049 if (!info || dmar_disabled)
4052 iommu = info->iommu;
4056 if (!(iommu->flags & VTD_FLAG_SVM_CAPABLE))
4059 if (!info->pasid_enabled || !info->ats_enabled)
4063 * Devices having device-specific I/O fault handling should not
4064 * support PCI/PRI. The IOMMU side has no means to check the
4065 * capability of device-specific IOPF. Therefore, IOMMU can only
4066 * default that if the device driver enables SVA on a non-PRI
4067 * device, it will handle IOPF in its own way.
4069 if (!info->pri_supported)
4072 /* Devices supporting PRI should have it enabled. */
4073 if (!info->pri_enabled)
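/*
 * Set or clear the PRE (page request enable) bit in the scalable-mode
 * context entry of @info's device and flush the context entry.
 */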
4079 static int context_flip_pri(struct device_domain_info *info, bool enable)
4081 struct intel_iommu *iommu = info->iommu;
4082 u8 bus = info->bus, devfn = info->devfn;
4083 struct context_entry *context;
4086 spin_lock(&iommu->lock);
4087 if (context_copied(iommu, bus, devfn)) {
4088 spin_unlock(&iommu->lock);
4092 context = iommu_context_addr(iommu, bus, devfn, false);
4093 if (!context || !context_present(context)) {
4094 spin_unlock(&iommu->lock);
4097 did = context_domain_id(context);
4100 context_set_sm_pre(context);
4102 context_clear_sm_pre(context);
4104 if (!ecap_coherent(iommu->ecap))
4105 clflush_cache_range(context, sizeof(*context));
4106 intel_context_flush_present(info, context, did, true);
4107 spin_unlock(&iommu->lock);
4112 static int intel_iommu_enable_iopf(struct device *dev)
4114 struct pci_dev *pdev = dev_is_pci(dev) ? to_pci_dev(dev) : NULL;
4115 struct device_domain_info *info = dev_iommu_priv_get(dev);
4116 struct intel_iommu *iommu;
4119 if (!pdev || !info || !info->ats_enabled || !info->pri_supported)
4122 if (info->pri_enabled)
4125 iommu = info->iommu;
4129 /* PASID is required in PRG Response Message. */
4130 if (info->pasid_enabled && !pci_prg_resp_pasid_required(pdev))
4133 ret = pci_reset_pri(pdev);
4137 ret = iopf_queue_add_device(iommu->iopf_queue, dev);
4141 ret = context_flip_pri(info, true);
4143 goto err_remove_device;
4145 ret = pci_enable_pri(pdev, PRQ_DEPTH);
4149 info->pri_enabled = 1;
4153 context_flip_pri(info, false);
4155 iopf_queue_remove_device(iommu->iopf_queue, dev);
4160 static int intel_iommu_disable_iopf(struct device *dev)
4162 struct device_domain_info *info = dev_iommu_priv_get(dev);
4163 struct intel_iommu *iommu = info->iommu;
4165 if (!info->pri_enabled)
4168 /* Disable new PRI reception: */
4169 context_flip_pri(info, false);
4172 * Remove device from fault queue and acknowledge all outstanding
4173 * PRQs to the device:
4175 iopf_queue_remove_device(iommu->iopf_queue, dev);
4178 * The PCIe spec states that by clearing the PRI enable bit, the Page
4179 * Request Interface will not issue new page requests, but may still
4180 * have outstanding page requests that have been transmitted or are
4181 * queued for transmission. This is supposed to be called after
4182 * the device driver has stopped DMA, all PASIDs have been
4183 * unbound and the outstanding PRQs have been drained.
4185 pci_disable_pri(to_pci_dev(dev));
4186 info->pri_enabled = 0;
4192 intel_iommu_dev_enable_feat(struct device *dev, enum iommu_dev_features feat)
4195 case IOMMU_DEV_FEAT_IOPF:
4196 return intel_iommu_enable_iopf(dev);
4198 case IOMMU_DEV_FEAT_SVA:
4199 return intel_iommu_enable_sva(dev);
4207 intel_iommu_dev_disable_feat(struct device *dev, enum iommu_dev_features feat)
4210 case IOMMU_DEV_FEAT_IOPF:
4211 return intel_iommu_disable_iopf(dev);
4213 case IOMMU_DEV_FEAT_SVA:
4221 static bool intel_iommu_is_attach_deferred(struct device *dev)
4223 struct device_domain_info *info = dev_iommu_priv_get(dev);
4225 return translation_pre_enabled(info->iommu) && !info->domain;
4229 * Check that the device does not live on an external facing PCI port that is
4230 * marked as untrusted. Such devices should not be able to apply quirks and
4231 * thus not be able to bypass the IOMMU restrictions.
4233 static bool risky_device(struct pci_dev *pdev)
4235 if (pdev->untrusted) {
4237 "Skipping IOMMU quirk for dev [%04X:%04X] on untrusted PCI link\n",
4238 pdev->vendor, pdev->device);
4239 pci_info(pdev, "Please check with your BIOS/Platform vendor about this\n");
4245 static int intel_iommu_iotlb_sync_map(struct iommu_domain *domain,
4246 unsigned long iova, size_t size)
4248 cache_tag_flush_range_np(to_dmar_domain(domain), iova, iova + size - 1);
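/*
 * Detach @pasid of @dev from @domain: drop the dev_pasid tracking
 * entry, cache tag and domain reference, tear down the PASID table
 * entry and drain any in-flight page requests.
 */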
4253 static void intel_iommu_remove_dev_pasid(struct device *dev, ioasid_t pasid,
4254 struct iommu_domain *domain)
4256 struct device_domain_info *info = dev_iommu_priv_get(dev);
4257 struct dev_pasid_info *curr, *dev_pasid = NULL;
4258 struct intel_iommu *iommu = info->iommu;
4259 struct dmar_domain *dmar_domain;
4260 unsigned long flags;
4262 if (domain->type == IOMMU_DOMAIN_IDENTITY) {
4263 intel_pasid_tear_down_entry(iommu, dev, pasid, false);
4267 dmar_domain = to_dmar_domain(domain);
4268 spin_lock_irqsave(&dmar_domain->lock, flags);
4269 list_for_each_entry(curr, &dmar_domain->dev_pasids, link_domain) {
4270 if (curr->dev == dev && curr->pasid == pasid) {
4271 list_del(&curr->link_domain);
4276 WARN_ON_ONCE(!dev_pasid);
4277 spin_unlock_irqrestore(&dmar_domain->lock, flags);
4279 cache_tag_unassign_domain(dmar_domain, dev, pasid);
4280 domain_detach_iommu(dmar_domain, iommu);
4281 intel_iommu_debugfs_remove_dev_pasid(dev_pasid);
4283 intel_pasid_tear_down_entry(iommu, dev, pasid, false);
4284 intel_drain_pasid_prq(dev, pasid);
4287 static int intel_iommu_set_dev_pasid(struct iommu_domain *domain,
4288 struct device *dev, ioasid_t pasid)
4290 struct device_domain_info *info = dev_iommu_priv_get(dev);
4291 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4292 struct intel_iommu *iommu = info->iommu;
4293 struct dev_pasid_info *dev_pasid;
4294 unsigned long flags;
4297 if (!pasid_supported(iommu) || dev_is_real_dma_subdevice(dev))
4300 if (domain->dirty_ops)
4303 if (context_copied(iommu, info->bus, info->devfn))
4306 ret = prepare_domain_attach_device(domain, dev);
4310 dev_pasid = kzalloc(sizeof(*dev_pasid), GFP_KERNEL);
4314 ret = domain_attach_iommu(dmar_domain, iommu);
4318 ret = cache_tag_assign_domain(dmar_domain, dev, pasid);
4320 goto out_detach_iommu;
4322 if (dmar_domain->use_first_level)
4323 ret = domain_setup_first_level(iommu, dmar_domain,
4326 ret = intel_pasid_setup_second_level(iommu, dmar_domain,
4329 goto out_unassign_tag;
4331 dev_pasid->dev = dev;
4332 dev_pasid->pasid = pasid;
4333 spin_lock_irqsave(&dmar_domain->lock, flags);
4334 list_add(&dev_pasid->link_domain, &dmar_domain->dev_pasids);
4335 spin_unlock_irqrestore(&dmar_domain->lock, flags);
4337 if (domain->type & __IOMMU_DOMAIN_PAGING)
4338 intel_iommu_debugfs_create_dev_pasid(dev_pasid);
4342 cache_tag_unassign_domain(dmar_domain, dev, pasid);
4344 domain_detach_iommu(dmar_domain, iommu);
4350 static void *intel_iommu_hw_info(struct device *dev, u32 *length, u32 *type)
4352 struct device_domain_info *info = dev_iommu_priv_get(dev);
4353 struct intel_iommu *iommu = info->iommu;
4354 struct iommu_hw_info_vtd *vtd;
4356 vtd = kzalloc(sizeof(*vtd), GFP_KERNEL);
4358 return ERR_PTR(-ENOMEM);
4360 vtd->flags = IOMMU_HW_INFO_VTD_ERRATA_772415_SPR17;
4361 vtd->cap_reg = iommu->cap;
4362 vtd->ecap_reg = iommu->ecap;
4363 *length = sizeof(*vtd);
4364 *type = IOMMU_HW_INFO_TYPE_INTEL_VTD;
4369 * Set dirty tracking for the device list of a domain. The caller must
4370 * hold the domain->lock when calling it.
4372 static int device_set_dirty_tracking(struct list_head *devices, bool enable)
4374 struct device_domain_info *info;
4377 list_for_each_entry(info, devices, link) {
4378 ret = intel_pasid_setup_dirty_tracking(info->iommu, info->dev,
4379 IOMMU_NO_PASID, enable);
4387 static int parent_domain_set_dirty_tracking(struct dmar_domain *domain,
4390 struct dmar_domain *s1_domain;
4391 unsigned long flags;
4394 spin_lock(&domain->s1_lock);
4395 list_for_each_entry(s1_domain, &domain->s1_domains, s2_link) {
4396 spin_lock_irqsave(&s1_domain->lock, flags);
4397 ret = device_set_dirty_tracking(&s1_domain->devices, enable);
4398 spin_unlock_irqrestore(&s1_domain->lock, flags);
4402 spin_unlock(&domain->s1_lock);
4406 list_for_each_entry(s1_domain, &domain->s1_domains, s2_link) {
4407 spin_lock_irqsave(&s1_domain->lock, flags);
4408 device_set_dirty_tracking(&s1_domain->devices,
4409 domain->dirty_tracking);
4410 spin_unlock_irqrestore(&s1_domain->lock, flags);
4412 spin_unlock(&domain->s1_lock);
4416 static int intel_iommu_set_dirty_tracking(struct iommu_domain *domain,
4419 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4422 spin_lock(&dmar_domain->lock);
4423 if (dmar_domain->dirty_tracking == enable)
4426 ret = device_set_dirty_tracking(&dmar_domain->devices, enable);
4430 if (dmar_domain->nested_parent) {
4431 ret = parent_domain_set_dirty_tracking(dmar_domain, enable);
4436 dmar_domain->dirty_tracking = enable;
4438 spin_unlock(&dmar_domain->lock);
4443 device_set_dirty_tracking(&dmar_domain->devices,
4444 dmar_domain->dirty_tracking);
4445 spin_unlock(&dmar_domain->lock);
4449 static int intel_iommu_read_and_clear_dirty(struct iommu_domain *domain,
4450 unsigned long iova, size_t size,
4451 unsigned long flags,
4452 struct iommu_dirty_bitmap *dirty)
4454 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4455 unsigned long end = iova + size - 1;
4456 unsigned long pgsize;
4459 * The IOMMUFD core calls into a dirty-tracking-disabled domain without
4460 * an IOVA bitmap set in order to clear dirty bits in all PTEs that
4461 * might have been set while dirty tracking was stopped. This ensures
4462 * that we never inherit dirtied bits from a previous cycle.
4464 if (!dmar_domain->dirty_tracking && dirty->bitmap)
4468 struct dma_pte *pte;
4471 pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &lvl,
4473 pgsize = level_size(lvl) << VTD_PAGE_SHIFT;
4474 if (!pte || !dma_pte_present(pte)) {
4479 if (dma_sl_pte_test_and_clear_dirty(pte, flags))
4480 iommu_dirty_bitmap_record(dirty, iova, pgsize);
4482 } while (iova < end);
4487 static const struct iommu_dirty_ops intel_dirty_ops = {
4488 .set_dirty_tracking = intel_iommu_set_dirty_tracking,
4489 .read_and_clear_dirty = intel_iommu_read_and_clear_dirty,
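/*
 * Program a legacy-mode context entry for pass-through translation
 * using the default domain ID (FLPT_DEFAULT_DID).
 */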
4492 static int context_setup_pass_through(struct device *dev, u8 bus, u8 devfn)
4494 struct device_domain_info *info = dev_iommu_priv_get(dev);
4495 struct intel_iommu *iommu = info->iommu;
4496 struct context_entry *context;
4498 spin_lock(&iommu->lock);
4499 context = iommu_context_addr(iommu, bus, devfn, 1);
4501 spin_unlock(&iommu->lock);
4505 if (context_present(context) && !context_copied(iommu, bus, devfn)) {
4506 spin_unlock(&iommu->lock);
4510 copied_context_tear_down(iommu, context, bus, devfn);
4511 context_clear_entry(context);
4512 context_set_domain_id(context, FLPT_DEFAULT_DID);
4515 * In pass through mode, AW must be programmed to indicate the largest
4516 * AGAW value supported by hardware. And ASR is ignored by hardware.
4518 context_set_address_width(context, iommu->msagaw);
4519 context_set_translation_type(context, CONTEXT_TT_PASS_THROUGH);
4520 context_set_fault_enable(context);
4521 context_set_present(context);
4522 if (!ecap_coherent(iommu->ecap))
4523 clflush_cache_range(context, sizeof(*context));
4524 context_present_cache_flush(iommu, FLPT_DEFAULT_DID, bus, devfn);
4525 spin_unlock(&iommu->lock);
4530 static int context_setup_pass_through_cb(struct pci_dev *pdev, u16 alias, void *data)
4532 struct device *dev = data;
4534 if (dev != &pdev->dev)
4537 return context_setup_pass_through(dev, PCI_BUS_NUM(alias), alias & 0xff);
4540 static int device_setup_pass_through(struct device *dev)
4542 struct device_domain_info *info = dev_iommu_priv_get(dev);
4544 if (!dev_is_pci(dev))
4545 return context_setup_pass_through(dev, info->bus, info->devfn);
4547 return pci_for_each_dma_alias(to_pci_dev(dev),
4548 context_setup_pass_through_cb, dev);
4551 static int identity_domain_attach_dev(struct iommu_domain *domain, struct device *dev)
4553 struct device_domain_info *info = dev_iommu_priv_get(dev);
4554 struct intel_iommu *iommu = info->iommu;
4557 device_block_translation(dev);
4559 if (dev_is_real_dma_subdevice(dev))
4562 if (sm_supported(iommu)) {
4563 ret = intel_pasid_setup_pass_through(iommu, dev, IOMMU_NO_PASID);
4565 iommu_enable_pci_caps(info);
4567 ret = device_setup_pass_through(dev);
4573 static int identity_domain_set_dev_pasid(struct iommu_domain *domain,
4574 struct device *dev, ioasid_t pasid)
4576 struct device_domain_info *info = dev_iommu_priv_get(dev);
4577 struct intel_iommu *iommu = info->iommu;
4579 if (!pasid_supported(iommu) || dev_is_real_dma_subdevice(dev))
4582 return intel_pasid_setup_pass_through(iommu, dev, pasid);
4585 static struct iommu_domain identity_domain = {
4586 .type = IOMMU_DOMAIN_IDENTITY,
4587 .ops = &(const struct iommu_domain_ops) {
4588 .attach_dev = identity_domain_attach_dev,
4589 .set_dev_pasid = identity_domain_set_dev_pasid,
4593 const struct iommu_ops intel_iommu_ops = {
4594 .blocked_domain = &blocking_domain,
4595 .release_domain = &blocking_domain,
4596 .identity_domain = &identity_domain,
4597 .capable = intel_iommu_capable,
4598 .hw_info = intel_iommu_hw_info,
4599 .domain_alloc = intel_iommu_domain_alloc,
4600 .domain_alloc_user = intel_iommu_domain_alloc_user,
4601 .domain_alloc_sva = intel_svm_domain_alloc,
4602 .probe_device = intel_iommu_probe_device,
4603 .release_device = intel_iommu_release_device,
4604 .get_resv_regions = intel_iommu_get_resv_regions,
4605 .device_group = intel_iommu_device_group,
4606 .dev_enable_feat = intel_iommu_dev_enable_feat,
4607 .dev_disable_feat = intel_iommu_dev_disable_feat,
4608 .is_attach_deferred = intel_iommu_is_attach_deferred,
4609 .def_domain_type = device_def_domain_type,
4610 .remove_dev_pasid = intel_iommu_remove_dev_pasid,
4611 .pgsize_bitmap = SZ_4K,
4612 #ifdef CONFIG_INTEL_IOMMU_SVM
4613 .page_response = intel_svm_page_response,
4615 .default_domain_ops = &(const struct iommu_domain_ops) {
4616 .attach_dev = intel_iommu_attach_device,
4617 .set_dev_pasid = intel_iommu_set_dev_pasid,
4618 .map_pages = intel_iommu_map_pages,
4619 .unmap_pages = intel_iommu_unmap_pages,
4620 .iotlb_sync_map = intel_iommu_iotlb_sync_map,
4621 .flush_iotlb_all = intel_flush_iotlb_all,
4622 .iotlb_sync = intel_iommu_tlb_sync,
4623 .iova_to_phys = intel_iommu_iova_to_phys,
4624 .free = intel_iommu_domain_free,
4625 .enforce_cache_coherency = intel_iommu_enforce_cache_coherency,
4629 static void quirk_iommu_igfx(struct pci_dev *dev)
4631 if (risky_device(dev))
4634 pci_info(dev, "Disabling IOMMU for graphics on this chipset\n");
4635 disable_igfx_iommu = 1;
4638 /* G4x/GM45 integrated gfx dmar support is totally busted. */
4639 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_igfx);
4640 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_igfx);
4641 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_igfx);
4642 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_igfx);
4643 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_igfx);
4644 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_igfx);
4645 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_igfx);
4647 /* Broadwell igfx malfunctions with dmar */
4648 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1606, quirk_iommu_igfx);
4649 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160B, quirk_iommu_igfx);
4650 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160E, quirk_iommu_igfx);
4651 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1602, quirk_iommu_igfx);
4652 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160A, quirk_iommu_igfx);
4653 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160D, quirk_iommu_igfx);
4654 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1616, quirk_iommu_igfx);
4655 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161B, quirk_iommu_igfx);
4656 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161E, quirk_iommu_igfx);
4657 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1612, quirk_iommu_igfx);
4658 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161A, quirk_iommu_igfx);
4659 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161D, quirk_iommu_igfx);
4660 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1626, quirk_iommu_igfx);
4661 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162B, quirk_iommu_igfx);
4662 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162E, quirk_iommu_igfx);
4663 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1622, quirk_iommu_igfx);
4664 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162A, quirk_iommu_igfx);
4665 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162D, quirk_iommu_igfx);
4666 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1636, quirk_iommu_igfx);
4667 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163B, quirk_iommu_igfx);
4668 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163E, quirk_iommu_igfx);
4669 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1632, quirk_iommu_igfx);
4670 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163A, quirk_iommu_igfx);
4671 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163D, quirk_iommu_igfx);
4673 static void quirk_iommu_rwbf(struct pci_dev *dev)
4675 if (risky_device(dev))
4679 * Mobile 4 Series Chipset neglects to set RWBF capability,
4680 * but needs it. Same seems to hold for the desktop versions.
4682 pci_info(dev, "Forcing write-buffer flush capability\n");
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_rwbf);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_rwbf);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_rwbf);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_rwbf);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_rwbf);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_rwbf);
#define GGC 0x52
#define GGC_MEMORY_SIZE_MASK	(0xf << 8)
#define GGC_MEMORY_SIZE_NONE	(0x0 << 8)
#define GGC_MEMORY_SIZE_1M	(0x1 << 8)
#define GGC_MEMORY_SIZE_2M	(0x3 << 8)
#define GGC_MEMORY_VT_ENABLED	(0x8 << 8)
#define GGC_MEMORY_SIZE_2M_VT	(0x9 << 8)
#define GGC_MEMORY_SIZE_3M_VT	(0xa << 8)
#define GGC_MEMORY_SIZE_4M_VT	(0xb << 8)
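
/*
 * Illustrative decode example only, based on the macro names above and not
 * part of the driver logic: the graphics stolen-memory size field occupies
 * bits 11:8 of GGC, so a raw value can be classified by masking that field.
 *
 *	u16 ggc;
 *
 *	if (!pci_read_config_word(pdev, GGC, &ggc)) {
 *		if ((ggc & GGC_MEMORY_SIZE_MASK) == GGC_MEMORY_SIZE_2M_VT)
 *			;	// 2M of stolen memory with VT-enabled GTT
 *		if (!(ggc & GGC_MEMORY_VT_ENABLED))
 *			;	// BIOS allocated no VT-enabled GTT space
 *	}
 */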
static void quirk_calpella_no_shadow_gtt(struct pci_dev *dev)
{
	unsigned short ggc;

	if (risky_device(dev))
		return;

	if (pci_read_config_word(dev, GGC, &ggc))
		return;

	if (!(ggc & GGC_MEMORY_VT_ENABLED)) {
		pci_info(dev, "BIOS has allocated no shadow GTT; disabling IOMMU for graphics\n");
		disable_igfx_iommu = 1;
	} else if (!disable_igfx_iommu) {
		/* we have to ensure the gfx device is idle before we flush */
		pci_info(dev, "Disabling batched IOTLB flush on Ironlake\n");
		iommu_set_dma_strict();
	}
}
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0040, quirk_calpella_no_shadow_gtt);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0044, quirk_calpella_no_shadow_gtt);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0062, quirk_calpella_no_shadow_gtt);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x006a, quirk_calpella_no_shadow_gtt);
static void quirk_igfx_skip_te_disable(struct pci_dev *dev)
{
	u16 ver;

	if (!IS_GFX_DEVICE(dev))
		return;

	ver = (dev->device >> 8) & 0xff;
	if (ver != 0x45 && ver != 0x46 && ver != 0x4c &&
	    ver != 0x4e && ver != 0x8a && ver != 0x98 &&
	    ver != 0x9a && ver != 0xa7 && ver != 0x7d)
		return;

	if (risky_device(dev))
		return;

	pci_info(dev, "Skip IOMMU disabling for graphics\n");
	iommu_skip_te_disable = 1;
}
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_ANY_ID, quirk_igfx_skip_te_disable);
/* On Tylersburg chipsets, some BIOSes have been known to enable the
   ISOCH DMAR unit for the Azalia sound device, but not give it any
   TLB entries, which causes it to deadlock. Check for that. We do
   this in a function called from init_dmars(), instead of in a PCI
   quirk, because we don't want to print the obnoxious "BIOS broken"
   message if VT-d is actually disabled.
*/
static void __init check_tylersburg_isoch(void)
{
	struct pci_dev *pdev;
	uint32_t vtisochctrl;

	/* If there's no Azalia in the system anyway, forget it. */
	pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3a3e, NULL);
	if (!pdev)
		return;
	if (risky_device(pdev)) {
		pci_dev_put(pdev);
		return;
	}
	pci_dev_put(pdev);

	/* System Management Registers. Might be hidden, in which case
	   we can't do the sanity check. But that's OK, because the
	   known-broken BIOSes _don't_ actually hide it, so far. */
	pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x342e, NULL);
	if (!pdev)
		return;
	if (risky_device(pdev)) {
		pci_dev_put(pdev);
		return;
	}
	if (pci_read_config_dword(pdev, 0x188, &vtisochctrl)) {
		pci_dev_put(pdev);
		return;
	}
	pci_dev_put(pdev);

	/* If Azalia DMA is routed to the non-isoch DMAR unit, fine. */
	if (vtisochctrl & 1)
		return;

	/* Drop all bits other than the number of TLB entries */
	vtisochctrl &= 0x1c;

	/* If we have the recommended number of TLB entries (16), fine. */
	if (vtisochctrl == 0x10)
		return;

	/* Zero TLB entries? You get to ride the short bus to school. */
	if (!vtisochctrl) {
		WARN(1, "Your BIOS is broken; DMA routed to ISOCH DMAR unit but no TLB space.\n"
		     "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
		     dmi_get_system_info(DMI_BIOS_VENDOR),
		     dmi_get_system_info(DMI_BIOS_VERSION),
		     dmi_get_system_info(DMI_PRODUCT_VERSION));
		iommu_identity_mapping |= IDENTMAP_AZALIA;
		return;
	}

	pr_warn("Recommended TLB entries for ISOCH unit is 16; your BIOS set %d\n",
		vtisochctrl);
}
/*
 * Here we deal with a device TLB defect where the device may inadvertently
 * issue an ATS invalidation completion ahead of posted writes that were
 * initiated with translated addresses matching the invalidation address
 * range, violating the invalidation completion ordering.
 * Therefore, any use case that cannot guarantee DMA is stopped before unmap
 * is vulnerable to this defect. In other words, any dTLB invalidation
 * initiated not under the control of the trusted/privileged host device
 * driver must use this quirk.
 * Device TLBs are invalidated under the following six conditions:
 * 1. Device driver does DMA API unmap IOVA
 * 2. Device driver unbinds a PASID from a process, sva_unbind_device()
 * 3. PASID is torn down, after PASID cache is flushed. e.g. process
 *    exit_mmap() due to crash
 * 4. Under SVA usage, called by mmu_notifier.invalidate_range() where
 *    VM has to free pages that were unmapped
 * 5. Userspace driver unmaps a DMA buffer
 * 6. Cache invalidation in vSVA usage (upcoming)
 *
 * For #1 and #2, device drivers are responsible for stopping DMA traffic
 * before unmap/unbind. For #3, the iommu driver gets the mmu_notifier to
 * invalidate the TLB the same way as a normal user unmap, which will use
 * this quirk. The dTLB invalidation after a PASID cache flush does not
 * need this quirk.
 *
 * As a reminder, #6 will *NEED* this quirk as we enable nested translation.
 * (An illustrative, hypothetical caller sketch follows the function below.)
 */
void quirk_extra_dev_tlb_flush(struct device_domain_info *info,
			       unsigned long address, unsigned long mask,
			       u32 pasid, u16 qdep)
{
	u16 sid;

	if (likely(!info->dtlb_extra_inval))
		return;

	sid = PCI_DEVID(info->bus, info->devfn);
	if (pasid == IOMMU_NO_PASID) {
		qi_flush_dev_iotlb(info->iommu, sid, info->pfsid,
				   qdep, address, mask);
	} else {
		qi_flush_dev_iotlb_pasid(info->iommu, sid, info->pfsid,
					 pasid, qdep, address, mask);
	}
}
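
/*
 * Illustrative sketch only, not called anywhere in this file: how a flush
 * path might pair a regular device-IOTLB invalidation with the extra
 * invalidation required by the defect described above. The function name
 * and parameter choices here are hypothetical; the real callers live in
 * the driver's IOTLB flush and SVA paths.
 */
static void __maybe_unused example_flush_dev_iotlb(struct device_domain_info *info,
						   unsigned long addr,
						   unsigned long mask)
{
	u16 sid = PCI_DEVID(info->bus, info->devfn);

	/* Normal device-IOTLB invalidation for a non-PASID request. */
	qi_flush_dev_iotlb(info->iommu, sid, info->pfsid,
			   info->ats_qdep, addr, mask);

	/*
	 * On affected hardware (info->dtlb_extra_inval set), issue the
	 * additional invalidation; elsewhere this call is a no-op.
	 */
	quirk_extra_dev_tlb_flush(info, addr, mask, IOMMU_NO_PASID,
				  info->ats_qdep);
}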
#define ecmd_get_status_code(res)	(((res) & 0xff) >> 1)

/*
 * Function to submit a command to the enhanced command interface. The
 * valid enhanced command descriptions are defined in Table 47 of the
 * VT-d spec. The VT-d hardware implementation may support some but not
 * all commands, which can be determined by checking the Enhanced
 * Command Capability Register.
 *
 * Return values:
 *  - 0: Command successful without any error;
 *  - Negative: software error value;
 *  - Nonzero positive: failure status code defined in Table 48.
 */
int ecmd_submit_sync(struct intel_iommu *iommu, u8 ecmd, u64 oa, u64 ob)
{
	unsigned long flags;
	u64 res;
	int ret;

	if (!cap_ecmds(iommu->cap))
		return -ENODEV;

	raw_spin_lock_irqsave(&iommu->register_lock, flags);
	res = dmar_readq(iommu->reg + DMAR_ECRSP_REG);
	if (res & DMA_ECMD_ECRSP_IP) {
		ret = -EBUSY;
		goto err;
	}

	/*
	 * Unconditionally write the operand B, because
	 * - There is no side effect if an ecmd doesn't require an
	 *   operand B, but we set the register to some value.
	 * - It's not invoked in any critical path. The extra MMIO
	 *   write doesn't bring any performance concerns.
	 */
	dmar_writeq(iommu->reg + DMAR_ECEO_REG, ob);
	dmar_writeq(iommu->reg + DMAR_ECMD_REG, ecmd | (oa << DMA_ECMD_OA_SHIFT));
	IOMMU_WAIT_OP(iommu, DMAR_ECRSP_REG, dmar_readq,
		      !(res & DMA_ECMD_ECRSP_IP), res);
	if (res & DMA_ECMD_ECRSP_IP) {
		ret = -ETIMEDOUT;
		goto err;
	}

	ret = ecmd_get_status_code(res);
err:
	raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
	return ret;
}
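
/*
 * Illustrative sketch only: how a caller might distinguish the three classes
 * of return value documented above. The opcode macro used here
 * (EXAMPLE_ECMD_OPCODE) is a placeholder standing in for one of the commands
 * in Table 47, not a real definition from this driver.
 */
#define EXAMPLE_ECMD_OPCODE	0	/* placeholder opcode for the sketch */

static int __maybe_unused example_issue_ecmd(struct intel_iommu *iommu)
{
	int ret;

	ret = ecmd_submit_sync(iommu, EXAMPLE_ECMD_OPCODE, 0, 0);
	if (ret < 0)		/* software error, e.g. -ENODEV/-EBUSY/-ETIMEDOUT */
		return ret;
	if (ret > 0) {		/* hardware failure status code from Table 48 */
		pr_warn("ecmd failed with status code %d\n", ret);
		return -EIO;
	}

	return 0;		/* command completed successfully */
}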