// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright © 2006-2014 Intel Corporation.
 *
 * Authors: David Woodhouse <dwmw2@infradead.org>,
 *          Ashok Raj <ashok.raj@intel.com>,
 *          Shaohua Li <shaohua.li@intel.com>,
 *          Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>,
 *          Fenghua Yu <fenghua.yu@intel.com>
 *          Joerg Roedel <jroedel@suse.de>
 */

#define pr_fmt(fmt)     "DMAR: " fmt
#define dev_fmt(fmt)    pr_fmt(fmt)
#include <linux/init.h>
#include <linux/bitmap.h>
#include <linux/debugfs.h>
#include <linux/export.h>
#include <linux/slab.h>
#include <linux/irq.h>
#include <linux/interrupt.h>
#include <linux/spinlock.h>
#include <linux/pci.h>
#include <linux/dmar.h>
#include <linux/dma-map-ops.h>
#include <linux/mempool.h>
#include <linux/memory.h>
#include <linux/cpu.h>
#include <linux/timer.h>
#include <linux/iova.h>
#include <linux/iommu.h>
#include <linux/dma-iommu.h>
#include <linux/intel-iommu.h>
#include <linux/syscore_ops.h>
#include <linux/tboot.h>
#include <linux/dmi.h>
#include <linux/pci-ats.h>
#include <linux/memblock.h>
#include <linux/dma-direct.h>
#include <linux/crash_dump.h>
#include <linux/numa.h>
#include <asm/irq_remapping.h>
#include <asm/cacheflush.h>
#include <asm/iommu.h>
#include <trace/events/intel_iommu.h>

#include "../irq_remapping.h"
#define ROOT_SIZE		VTD_PAGE_SIZE
#define CONTEXT_SIZE		VTD_PAGE_SIZE

#define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
#define IS_USB_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_SERIAL_USB)
#define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
#define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e)
#define IOAPIC_RANGE_START	(0xfee00000)
#define IOAPIC_RANGE_END	(0xfeefffff)
#define IOVA_START_ADDR		(0x1000)

#define DEFAULT_DOMAIN_ADDRESS_WIDTH 57

#define MAX_AGAW_WIDTH 64
#define MAX_AGAW_PFN_WIDTH	(MAX_AGAW_WIDTH - VTD_PAGE_SHIFT)

#define __DOMAIN_MAX_PFN(gaw)  ((((uint64_t)1) << ((gaw) - VTD_PAGE_SHIFT)) - 1)
#define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << (gaw)) - 1)
/* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR
   to match. That way, we can use 'unsigned long' for PFNs with impunity. */
#define DOMAIN_MAX_PFN(gaw)	((unsigned long) min_t(uint64_t, \
				__DOMAIN_MAX_PFN(gaw), (unsigned long)-1))
#define DOMAIN_MAX_ADDR(gaw)	(((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT)
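/*
 * Example: with a 48-bit guest address width and VTD_PAGE_SHIFT == 12,
 * __DOMAIN_MAX_PFN(48) == (1ULL << 36) - 1 and __DOMAIN_MAX_ADDR(48) ==
 * (1ULL << 48) - 1. On a 64-bit kernel DOMAIN_MAX_PFN(48) is the same
 * value; on a 32-bit kernel it is clamped to ULONG_MAX.
 */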
/* IO virtual address start page frame number */
#define IOVA_START_PFN		(1)

#define IOVA_PFN(addr)		((addr) >> PAGE_SHIFT)

/* page table handling */
#define LEVEL_STRIDE		(9)
#define LEVEL_MASK		(((u64)1 << LEVEL_STRIDE) - 1)
/*
 * This bitmap is used to advertise the page sizes our hardware supports
 * to the IOMMU core, which will then use this information to split
 * physically contiguous memory regions it is mapping into page sizes
 * that we support.
 *
 * Traditionally the IOMMU core just handed us the mappings directly,
 * after making sure the size was a power-of-two multiple of 4KiB and
 * that the mapping had natural alignment.
 *
 * To retain this behavior, we currently advertise that we support
 * all page sizes that are a power-of-two multiple of 4KiB.
 *
 * If at some point we'd like to utilize the IOMMU core's new behavior,
 * we could change this to advertise the real page sizes we support.
 */
#define INTEL_IOMMU_PGSIZES	(~0xFFFUL)
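/*
 * ~0xFFFUL sets every bit from bit 12 upwards, i.e. it advertises 4KiB,
 * 8KiB, 16KiB, ... - every power-of-two size of at least 4KiB.
 */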
static inline int agaw_to_level(int agaw)
{
	return agaw + 2;
}

static inline int agaw_to_width(int agaw)
{
	return min_t(int, 30 + agaw * LEVEL_STRIDE, MAX_AGAW_WIDTH);
}

static inline int width_to_agaw(int width)
{
	return DIV_ROUND_UP(width - 30, LEVEL_STRIDE);
}

static inline unsigned int level_to_offset_bits(int level)
{
	return (level - 1) * LEVEL_STRIDE;
}

static inline int pfn_level_offset(u64 pfn, int level)
{
	return (pfn >> level_to_offset_bits(level)) & LEVEL_MASK;
}
static inline u64 level_mask(int level)
{
	return -1ULL << level_to_offset_bits(level);
}

static inline u64 level_size(int level)
{
	return 1ULL << level_to_offset_bits(level);
}

static inline u64 align_to_level(u64 pfn, int level)
{
	return (pfn + level_size(level) - 1) & level_mask(level);
}

static inline unsigned long lvl_to_nr_pages(unsigned int lvl)
{
	return 1UL << min_t(int, (lvl - 1) * LEVEL_STRIDE, MAX_AGAW_PFN_WIDTH);
}
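/*
 * Example: level 1 maps 4KiB per PTE (1 VT-d page), level 2 maps 2MiB
 * (512 pages) and level 3 maps 1GiB (512 * 512 pages): each level adds
 * LEVEL_STRIDE == 9 bits of offset.
 */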
/* VT-d pages must always be _smaller_ than MM pages. Otherwise things
   are never going to work. */
static inline unsigned long dma_to_mm_pfn(unsigned long dma_pfn)
{
	return dma_pfn >> (PAGE_SHIFT - VTD_PAGE_SHIFT);
}

static inline unsigned long mm_to_dma_pfn(unsigned long mm_pfn)
{
	return mm_pfn << (PAGE_SHIFT - VTD_PAGE_SHIFT);
}

static inline unsigned long page_to_dma_pfn(struct page *pg)
{
	return mm_to_dma_pfn(page_to_pfn(pg));
}

static inline unsigned long virt_to_dma_pfn(void *p)
{
	return page_to_dma_pfn(virt_to_page(p));
}
/* global iommu list, set NULL for ignored DMAR units */
static struct intel_iommu **g_iommus;

static void __init check_tylersburg_isoch(void);
static int rwbf_quirk;

/*
 * set to 1 to panic kernel if can't successfully enable VT-d
 * (used when kernel is launched w/ TXT)
 */
static int force_on = 0;
static int intel_iommu_tboot_noforce;
static int no_platform_optin;

#define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
/*
 * Take a root_entry and return the Lower Context Table Pointer (LCTP)
 * if marked present.
 */
static phys_addr_t root_entry_lctp(struct root_entry *re)
{
	if (!(re->lo & 1))
		return 0;

	return re->lo & VTD_PAGE_MASK;
}

/*
 * Take a root_entry and return the Upper Context Table Pointer (UCTP)
 * if marked present.
 */
static phys_addr_t root_entry_uctp(struct root_entry *re)
{
	if (!(re->hi & 1))
		return 0;

	return re->hi & VTD_PAGE_MASK;
}
static inline void context_clear_pasid_enable(struct context_entry *context)
{
	context->lo &= ~(1ULL << 11);
}

static inline bool context_pasid_enabled(struct context_entry *context)
{
	return !!(context->lo & (1ULL << 11));
}

static inline void context_set_copied(struct context_entry *context)
{
	context->hi |= (1ull << 3);
}

static inline bool context_copied(struct context_entry *context)
{
	return !!(context->hi & (1ULL << 3));
}

static inline bool __context_present(struct context_entry *context)
{
	return (context->lo & 1);
}

bool context_present(struct context_entry *context)
{
	return context_pasid_enabled(context) ?
	     __context_present(context) :
	     __context_present(context) && !context_copied(context);
}
static inline void context_set_present(struct context_entry *context)
{
	context->lo |= 1;
}

static inline void context_set_fault_enable(struct context_entry *context)
{
	context->lo &= (((u64)-1) << 2) | 1;
}

static inline void context_set_translation_type(struct context_entry *context,
						unsigned long value)
{
	context->lo &= (((u64)-1) << 4) | 3;
	context->lo |= (value & 3) << 2;
}

static inline void context_set_address_root(struct context_entry *context,
					    unsigned long value)
{
	context->lo &= ~VTD_PAGE_MASK;
	context->lo |= value & VTD_PAGE_MASK;
}

static inline void context_set_address_width(struct context_entry *context,
					     unsigned long value)
{
	context->hi |= value & 7;
}

static inline void context_set_domain_id(struct context_entry *context,
					 unsigned long value)
{
	context->hi |= (value & ((1 << 16) - 1)) << 8;
}

static inline int context_domain_id(struct context_entry *c)
{
	return (c->hi >> 8) & 0xffff;
}

static inline void context_clear_entry(struct context_entry *context)
{
	context->lo = 0;
	context->hi = 0;
}
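/*
 * Summary of the legacy context-entry layout as used by the helpers
 * above: lo bit 0 is Present, lo bit 1 is Fault Processing Disable,
 * lo bits 3:2 are the Translation Type and lo bits 63:12 hold the
 * address space root; hi bits 2:0 are the Address Width and hi bits
 * 23:8 the Domain ID.
 */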
/*
 * This domain is a statically identity mapping domain.
 *	1. This domain creates a static 1:1 mapping to all usable memory.
 *	2. It maps to each iommu if successful.
 *	3. Each iommu maps to this domain if successful.
 */
static struct dmar_domain *si_domain;
static int hw_pass_through = 1;
#define for_each_domain_iommu(idx, domain)			\
	for (idx = 0; idx < g_num_of_iommus; idx++)		\
		if (domain->iommu_refcnt[idx])
struct dmar_rmrr_unit {
	struct list_head list;		/* list of rmrr units	*/
	struct acpi_dmar_header *hdr;	/* ACPI header		*/
	u64	base_address;		/* reserved base address*/
	u64	end_address;		/* reserved end address */
	struct dmar_dev_scope *devices;	/* target devices */
	int	devices_cnt;		/* target device count */
};

struct dmar_atsr_unit {
	struct list_head list;		/* list of ATSR units */
	struct acpi_dmar_header *hdr;	/* ACPI header */
	struct dmar_dev_scope *devices;	/* target devices */
	int devices_cnt;		/* target device count */
	u8 include_all:1;		/* include all ports */
};

static LIST_HEAD(dmar_atsr_units);
static LIST_HEAD(dmar_rmrr_units);
#define for_each_rmrr_units(rmrr) \
	list_for_each_entry(rmrr, &dmar_rmrr_units, list)

/* bitmap for indexing intel_iommus */
static int g_num_of_iommus;
static void domain_exit(struct dmar_domain *domain);
static void domain_remove_dev_info(struct dmar_domain *domain);
static void dmar_remove_one_dev_info(struct device *dev);
static void __dmar_remove_one_dev_info(struct device_domain_info *info);
static int intel_iommu_attach_device(struct iommu_domain *domain,
				     struct device *dev);
static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
					    dma_addr_t iova);
#ifdef CONFIG_INTEL_IOMMU_DEFAULT_ON
int dmar_disabled = 0;
#else
int dmar_disabled = 1;
#endif /* CONFIG_INTEL_IOMMU_DEFAULT_ON */

#ifdef CONFIG_INTEL_IOMMU_SCALABLE_MODE_DEFAULT_ON
int intel_iommu_sm = 1;
#else
int intel_iommu_sm;
#endif /* CONFIG_INTEL_IOMMU_SCALABLE_MODE_DEFAULT_ON */

int intel_iommu_enabled = 0;
EXPORT_SYMBOL_GPL(intel_iommu_enabled);
static int dmar_map_gfx = 1;
static int dmar_forcedac;
static int intel_iommu_strict;
static int intel_iommu_superpage = 1;
static int iommu_identity_mapping;
static int iommu_skip_te_disable;

#define IDENTMAP_GFX		2
#define IDENTMAP_AZALIA		4

int intel_iommu_gfx_mapped;
EXPORT_SYMBOL_GPL(intel_iommu_gfx_mapped);
#define DEFER_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-2))
struct device_domain_info *get_domain_info(struct device *dev)
{
	struct device_domain_info *info;

	if (!dev)
		return NULL;

	info = dev_iommu_priv_get(dev);
	if (unlikely(info == DEFER_DEVICE_DOMAIN_INFO))
		return NULL;

	return info;
}
DEFINE_SPINLOCK(device_domain_lock);
static LIST_HEAD(device_domain_list);
/*
 * Iterate over elements in device_domain_list and call the specified
 * callback @fn against each element.
 */
int for_each_device_domain(int (*fn)(struct device_domain_info *info,
				     void *data), void *data)
{
	int ret = 0;
	unsigned long flags;
	struct device_domain_info *info;

	spin_lock_irqsave(&device_domain_lock, flags);
	list_for_each_entry(info, &device_domain_list, global) {
		ret = fn(info, data);
		if (ret) {
			spin_unlock_irqrestore(&device_domain_lock, flags);
			return ret;
		}
	}
	spin_unlock_irqrestore(&device_domain_lock, flags);

	return 0;
}
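/*
 * Minimal usage sketch for the iterator above. The callback name and
 * counting logic are hypothetical, shown only to illustrate the
 * early-stop contract: a non-zero return value ends the walk.
 */
#if 0
static int count_dev_info(struct device_domain_info *info, void *data)
{
	(*(int *)data)++;
	return 0;	/* returning non-zero would stop the iteration */
}

static int count_all_dev_info(void)
{
	int count = 0;

	for_each_device_domain(count_dev_info, &count);
	return count;
}
#endif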
const struct iommu_ops intel_iommu_ops;

static bool translation_pre_enabled(struct intel_iommu *iommu)
{
	return (iommu->flags & VTD_FLAG_TRANS_PRE_ENABLED);
}

static void clear_translation_pre_enabled(struct intel_iommu *iommu)
{
	iommu->flags &= ~VTD_FLAG_TRANS_PRE_ENABLED;
}

static void init_translation_status(struct intel_iommu *iommu)
{
	u32 gsts;

	gsts = readl(iommu->reg + DMAR_GSTS_REG);
	if (gsts & DMA_GSTS_TES)
		iommu->flags |= VTD_FLAG_TRANS_PRE_ENABLED;
}
static int __init intel_iommu_setup(char *str)
{
	if (!str)
		return -EINVAL;
	while (*str) {
		if (!strncmp(str, "on", 2)) {
			dmar_disabled = 0;
			pr_info("IOMMU enabled\n");
		} else if (!strncmp(str, "off", 3)) {
			dmar_disabled = 1;
			no_platform_optin = 1;
			pr_info("IOMMU disabled\n");
		} else if (!strncmp(str, "igfx_off", 8)) {
			dmar_map_gfx = 0;
			pr_info("Disable GFX device mapping\n");
		} else if (!strncmp(str, "forcedac", 8)) {
			pr_info("Forcing DAC for PCI devices\n");
			dmar_forcedac = 1;
		} else if (!strncmp(str, "strict", 6)) {
			pr_info("Disable batched IOTLB flush\n");
			intel_iommu_strict = 1;
		} else if (!strncmp(str, "sp_off", 6)) {
			pr_info("Disable supported super page\n");
			intel_iommu_superpage = 0;
		} else if (!strncmp(str, "sm_on", 5)) {
			pr_info("Intel-IOMMU: scalable mode supported\n");
			intel_iommu_sm = 1;
		} else if (!strncmp(str, "tboot_noforce", 13)) {
			pr_info("Intel-IOMMU: not forcing on after tboot. This could expose security risk for tboot\n");
			intel_iommu_tboot_noforce = 1;
		}

		str += strcspn(str, ",");
		while (*str == ',')
			str++;
	}
	return 0;
}
__setup("intel_iommu=", intel_iommu_setup);
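/*
 * Example kernel command lines accepted by the parser above:
 *   intel_iommu=on,sm_on    - enable the IOMMU and scalable mode
 *   intel_iommu=on,strict   - enable with synchronous IOTLB flushing
 *   intel_iommu=off         - disable and opt out of platform opt-in
 * Options are comma-separated and processed left to right.
 */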
static struct kmem_cache *iommu_domain_cache;
static struct kmem_cache *iommu_devinfo_cache;

static struct dmar_domain *get_iommu_domain(struct intel_iommu *iommu, u16 did)
{
	struct dmar_domain **domains;
	int idx = did >> 8;

	domains = iommu->domains[idx];
	if (!domains)
		return NULL;

	return domains[did & 0xff];
}

static void set_iommu_domain(struct intel_iommu *iommu, u16 did,
			     struct dmar_domain *domain)
{
	struct dmar_domain **domains;
	int idx = did >> 8;

	if (!iommu->domains[idx]) {
		size_t size = 256 * sizeof(struct dmar_domain *);
		iommu->domains[idx] = kzalloc(size, GFP_ATOMIC);
	}

	domains = iommu->domains[idx];
	if (WARN_ON(!domains))
		return;

	domains[did & 0xff] = domain;
}
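/*
 * Example: domain-id 0x1234 lives at iommu->domains[0x12][0x34]. The
 * first-level array is sized in iommu_init_domains() below; each
 * 256-entry second-level array is allocated lazily on first use.
 */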
void *alloc_pgtable_page(int node)
{
	struct page *page;
	void *vaddr = NULL;

	page = alloc_pages_node(node, GFP_ATOMIC | __GFP_ZERO, 0);
	if (page)
		vaddr = page_address(page);
	return vaddr;
}

void free_pgtable_page(void *vaddr)
{
	free_page((unsigned long)vaddr);
}

static inline void *alloc_domain_mem(void)
{
	return kmem_cache_alloc(iommu_domain_cache, GFP_ATOMIC);
}

static void free_domain_mem(void *vaddr)
{
	kmem_cache_free(iommu_domain_cache, vaddr);
}

static inline void *alloc_devinfo_mem(void)
{
	return kmem_cache_alloc(iommu_devinfo_cache, GFP_ATOMIC);
}

static inline void free_devinfo_mem(void *vaddr)
{
	kmem_cache_free(iommu_devinfo_cache, vaddr);
}
static inline int domain_type_is_si(struct dmar_domain *domain)
{
	return domain->flags & DOMAIN_FLAG_STATIC_IDENTITY;
}

static inline bool domain_use_first_level(struct dmar_domain *domain)
{
	return domain->flags & DOMAIN_FLAG_USE_FIRST_LEVEL;
}

static inline int domain_pfn_supported(struct dmar_domain *domain,
				       unsigned long pfn)
{
	int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;

	return !(addr_width < BITS_PER_LONG && pfn >> addr_width);
}

static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw)
{
	unsigned long sagaw;
	int agaw;

	sagaw = cap_sagaw(iommu->cap);
	for (agaw = width_to_agaw(max_gaw);
	     agaw >= 0; agaw--) {
		if (test_bit(agaw, &sagaw))
			break;
	}

	return agaw;
}
/*
 * Calculate max SAGAW for each iommu.
 */
int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
{
	return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH);
}

/*
 * Calculate agaw for each iommu.
 * "SAGAW" may be different across iommus; use a default agaw, and fall
 * back to a smaller supported agaw for iommus that don't support the
 * default.
 */
int iommu_calculate_agaw(struct intel_iommu *iommu)
{
	return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH);
}
/* This function only returns a single iommu in a domain */
struct intel_iommu *domain_get_iommu(struct dmar_domain *domain)
{
	int iommu_id;

	/* si_domain and vm domain should not get here. */
	if (WARN_ON(domain->domain.type != IOMMU_DOMAIN_DMA))
		return NULL;

	for_each_domain_iommu(iommu_id, domain)
		break;

	if (iommu_id < 0 || iommu_id >= g_num_of_iommus)
		return NULL;

	return g_iommus[iommu_id];
}
static inline bool iommu_paging_structure_coherency(struct intel_iommu *iommu)
{
	return sm_supported(iommu) ?
			ecap_smpwc(iommu->ecap) : ecap_coherent(iommu->ecap);
}

static void domain_update_iommu_coherency(struct dmar_domain *domain)
{
	struct dmar_drhd_unit *drhd;
	struct intel_iommu *iommu;
	bool found = false;
	int i;

	domain->iommu_coherency = 1;

	for_each_domain_iommu(i, domain) {
		found = true;
		if (!iommu_paging_structure_coherency(g_iommus[i])) {
			domain->iommu_coherency = 0;
			break;
		}
	}
	if (found)
		return;

	/* No hardware attached; use lowest common denominator */
	rcu_read_lock();
	for_each_active_iommu(iommu, drhd) {
		if (!iommu_paging_structure_coherency(iommu)) {
			domain->iommu_coherency = 0;
			break;
		}
	}
	rcu_read_unlock();
}

static int domain_update_iommu_snooping(struct intel_iommu *skip)
{
	struct dmar_drhd_unit *drhd;
	struct intel_iommu *iommu;
	int ret = 1;

	rcu_read_lock();
	for_each_active_iommu(iommu, drhd) {
		if (iommu != skip) {
			if (!ecap_sc_support(iommu->ecap)) {
				ret = 0;
				break;
			}
		}
	}
	rcu_read_unlock();

	return ret;
}

static int domain_update_iommu_superpage(struct dmar_domain *domain,
					 struct intel_iommu *skip)
{
	struct dmar_drhd_unit *drhd;
	struct intel_iommu *iommu;
	int mask = 0x3;

	if (!intel_iommu_superpage)
		return 0;

	/* set iommu_superpage to the smallest common denominator */
	rcu_read_lock();
	for_each_active_iommu(iommu, drhd) {
		if (iommu != skip) {
			if (domain && domain_use_first_level(domain)) {
				if (!cap_fl1gp_support(iommu->cap))
					mask = 0x1;
			} else {
				mask &= cap_super_page_val(iommu->cap);
			}

			if (!mask)
				break;
		}
	}
	rcu_read_unlock();

	return fls(mask);
}
static int domain_update_device_node(struct dmar_domain *domain)
{
	struct device_domain_info *info;
	int nid = NUMA_NO_NODE;

	assert_spin_locked(&device_domain_lock);

	if (list_empty(&domain->devices))
		return NUMA_NO_NODE;

	list_for_each_entry(info, &domain->devices, link) {
		if (!info->dev)
			continue;

		/*
		 * There could possibly be multiple device numa nodes, as
		 * devices within the same domain may sit behind different
		 * IOMMUs. There is no perfect answer in such a situation,
		 * so we go with first come, first served.
		 */
		nid = dev_to_node(info->dev);
		if (nid != NUMA_NO_NODE)
			break;
	}

	return nid;
}
static void domain_update_iotlb(struct dmar_domain *domain);

/* Some capabilities may be different across iommus */
static void domain_update_iommu_cap(struct dmar_domain *domain)
{
	domain_update_iommu_coherency(domain);
	domain->iommu_snooping = domain_update_iommu_snooping(NULL);
	domain->iommu_superpage = domain_update_iommu_superpage(domain, NULL);

	/*
	 * If RHSA is missing, we should default to the device numa domain
	 * as fall back.
	 */
	if (domain->nid == NUMA_NO_NODE)
		domain->nid = domain_update_device_node(domain);

	/*
	 * First-level translation restricts the input-address to a
	 * canonical address (i.e., address bits 63:N have the same
	 * value as address bit [N-1], where N is 48-bits with 4-level
	 * paging and 57-bits with 5-level paging). Hence, skip bit
	 * [N-1].
	 */
	if (domain_use_first_level(domain))
		domain->domain.geometry.aperture_end = __DOMAIN_MAX_ADDR(domain->gaw - 1);
	else
		domain->domain.geometry.aperture_end = __DOMAIN_MAX_ADDR(domain->gaw);

	domain_update_iotlb(domain);
}
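/*
 * Example: with gaw == 48, first-level translation caps the aperture at
 * __DOMAIN_MAX_ADDR(47) == (1ULL << 47) - 1 so that all input addresses
 * stay canonical; second-level translation uses the full 48-bit range.
 */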
struct context_entry *iommu_context_addr(struct intel_iommu *iommu, u8 bus,
					 u8 devfn, int alloc)
{
	struct root_entry *root = &iommu->root_entry[bus];
	struct context_entry *context;
	u64 *entry;

	entry = &root->lo;
	if (sm_supported(iommu)) {
		if (devfn >= 0x80) {
			devfn -= 0x80;
			entry = &root->hi;
		}
		devfn *= 2;
	}
	if (*entry & 1)
		context = phys_to_virt(*entry & VTD_PAGE_MASK);
	else {
		unsigned long phy_addr;

		if (!alloc)
			return NULL;

		context = alloc_pgtable_page(iommu->node);
		if (!context)
			return NULL;

		__iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
		phy_addr = virt_to_phys((void *)context);
		*entry = phy_addr | 1;
		__iommu_flush_cache(iommu, entry, sizeof(*entry));
	}
	return &context[devfn];
}
static bool attach_deferred(struct device *dev)
{
	return dev_iommu_priv_get(dev) == DEFER_DEVICE_DOMAIN_INFO;
}

/**
 * is_downstream_to_pci_bridge - test if a device belongs to the PCI
 *				 sub-hierarchy of a candidate PCI-PCI bridge
 * @dev: candidate PCI device belonging to @bridge PCI sub-hierarchy
 * @bridge: the candidate PCI-PCI bridge
 *
 * Return: true if @dev belongs to @bridge PCI sub-hierarchy, else false.
 */
static bool
is_downstream_to_pci_bridge(struct device *dev, struct device *bridge)
{
	struct pci_dev *pdev, *pbridge;

	if (!dev_is_pci(dev) || !dev_is_pci(bridge))
		return false;

	pdev = to_pci_dev(dev);
	pbridge = to_pci_dev(bridge);

	if (pbridge->subordinate &&
	    pbridge->subordinate->number <= pdev->bus->number &&
	    pbridge->subordinate->busn_res.end >= pdev->bus->number)
		return true;

	return false;
}
static bool quirk_ioat_snb_local_iommu(struct pci_dev *pdev)
{
	struct dmar_drhd_unit *drhd;
	u32 vtbar;
	int rc;

	/* We know that this device on this chipset has its own IOMMU.
	 * If we find it under a different IOMMU, then the BIOS is lying
	 * to us. Hope that the IOMMU for this device is actually
	 * disabled, and it needs no translation...
	 */
	rc = pci_bus_read_config_dword(pdev->bus, PCI_DEVFN(0, 0), 0xb0, &vtbar);
	if (rc) {
		/* "can't" happen */
		dev_info(&pdev->dev, "failed to run vt-d quirk\n");
		return false;
	}
	vtbar &= 0xffff0000;

	/* we know that this iommu should be at offset 0xa000 from vtbar */
	drhd = dmar_find_matched_drhd_unit(pdev);
	if (!drhd || drhd->reg_base_addr - vtbar != 0xa000) {
		pr_warn_once(FW_BUG "BIOS assigned incorrect VT-d unit for Intel(R) QuickData Technology device\n");
		add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
		return true;
	}

	return false;
}
static bool iommu_is_dummy(struct intel_iommu *iommu, struct device *dev)
{
	if (!iommu || iommu->drhd->ignored)
		return true;

	if (dev_is_pci(dev)) {
		struct pci_dev *pdev = to_pci_dev(dev);

		if (pdev->vendor == PCI_VENDOR_ID_INTEL &&
		    pdev->device == PCI_DEVICE_ID_INTEL_IOAT_SNB &&
		    quirk_ioat_snb_local_iommu(pdev))
			return true;
	}

	return false;
}
struct intel_iommu *device_to_iommu(struct device *dev, u8 *bus, u8 *devfn)
{
	struct dmar_drhd_unit *drhd = NULL;
	struct pci_dev *pdev = NULL;
	struct intel_iommu *iommu;
	struct device *tmp;
	u16 segment = 0;
	int i;

	if (!dev)
		return NULL;

	if (dev_is_pci(dev)) {
		struct pci_dev *pf_pdev;

		pdev = pci_real_dma_dev(to_pci_dev(dev));

		/* VFs aren't listed in scope tables; we need to look up
		 * the PF instead to find the IOMMU. */
		pf_pdev = pci_physfn(pdev);
		dev = &pf_pdev->dev;
		segment = pci_domain_nr(pdev->bus);
	} else if (has_acpi_companion(dev))
		dev = &ACPI_COMPANION(dev)->dev;

	rcu_read_lock();
	for_each_iommu(iommu, drhd) {
		if (pdev && segment != drhd->segment)
			continue;

		for_each_active_dev_scope(drhd->devices,
					  drhd->devices_cnt, i, tmp) {
			if (tmp == dev) {
				/* For a VF use its original BDF# not that of the PF
				 * which we used for the IOMMU lookup. Strictly speaking
				 * we could do this for all PCI devices; we only need to
				 * get the BDF# from the scope table for ACPI matches. */
				if (pdev && pdev->is_virtfn)
					goto got_pdev;

				if (bus && devfn) {
					*bus = drhd->devices[i].bus;
					*devfn = drhd->devices[i].devfn;
				}
				goto out;
			}

			if (is_downstream_to_pci_bridge(dev, tmp))
				goto out;
		}

		if (pdev && drhd->include_all) {
got_pdev:
			if (bus && devfn) {
				*bus = pdev->bus->number;
				*devfn = pdev->devfn;
			}
			goto out;
		}
	}
	iommu = NULL;
out:
	if (iommu_is_dummy(iommu, dev))
		iommu = NULL;

	rcu_read_unlock();

	return iommu;
}
static void domain_flush_cache(struct dmar_domain *domain,
			       void *addr, int size)
{
	if (!domain->iommu_coherency)
		clflush_cache_range(addr, size);
}

static int device_context_mapped(struct intel_iommu *iommu, u8 bus, u8 devfn)
{
	struct context_entry *context;
	int ret = 0;
	unsigned long flags;

	spin_lock_irqsave(&iommu->lock, flags);
	context = iommu_context_addr(iommu, bus, devfn, 0);
	if (context)
		ret = context_present(context);
	spin_unlock_irqrestore(&iommu->lock, flags);
	return ret;
}
static void free_context_table(struct intel_iommu *iommu)
{
	int i;
	unsigned long flags;
	struct context_entry *context;

	spin_lock_irqsave(&iommu->lock, flags);
	if (!iommu->root_entry)
		goto out;

	for (i = 0; i < ROOT_ENTRY_NR; i++) {
		context = iommu_context_addr(iommu, i, 0, 0);
		if (context)
			free_pgtable_page(context);

		if (!sm_supported(iommu))
			continue;

		context = iommu_context_addr(iommu, i, 0x80, 0);
		if (context)
			free_pgtable_page(context);
	}
	free_pgtable_page(iommu->root_entry);
	iommu->root_entry = NULL;
out:
	spin_unlock_irqrestore(&iommu->lock, flags);
}
static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
				      unsigned long pfn, int *target_level)
{
	struct dma_pte *parent, *pte;
	int level = agaw_to_level(domain->agaw);
	int offset;

	BUG_ON(!domain->pgd);

	if (!domain_pfn_supported(domain, pfn))
		/* Address beyond IOMMU's addressing capabilities. */
		return NULL;

	parent = domain->pgd;

	while (1) {
		void *tmp_page;

		offset = pfn_level_offset(pfn, level);
		pte = &parent[offset];
		if (!*target_level && (dma_pte_superpage(pte) || !dma_pte_present(pte)))
			break;
		if (level == *target_level)
			break;

		if (!dma_pte_present(pte)) {
			uint64_t pteval;

			tmp_page = alloc_pgtable_page(domain->nid);

			if (!tmp_page)
				return NULL;

			domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE);
			pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE;
			if (domain_use_first_level(domain))
				pteval |= DMA_FL_PTE_XD | DMA_FL_PTE_US;
			if (cmpxchg64(&pte->val, 0ULL, pteval))
				/* Someone else set it while we were thinking; use theirs. */
				free_pgtable_page(tmp_page);
			else
				domain_flush_cache(domain, pte, sizeof(*pte));
		}
		if (level == 1)
			break;

		parent = phys_to_virt(dma_pte_addr(pte));
		level--;
	}

	if (!*target_level)
		*target_level = level;

	return pte;
}
/* return address's pte at specific level */
static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain,
					 unsigned long pfn,
					 int level, int *large_page)
{
	struct dma_pte *parent, *pte;
	int total = agaw_to_level(domain->agaw);
	int offset;

	parent = domain->pgd;
	while (level <= total) {
		offset = pfn_level_offset(pfn, total);
		pte = &parent[offset];
		if (level == total)
			return pte;

		if (!dma_pte_present(pte)) {
			*large_page = total;
			break;
		}

		if (dma_pte_superpage(pte)) {
			*large_page = total;
			return pte;
		}

		parent = phys_to_virt(dma_pte_addr(pte));
		total--;
	}
	return NULL;
}
/* clear last level pte; a tlb flush should follow */
static void dma_pte_clear_range(struct dmar_domain *domain,
				unsigned long start_pfn,
				unsigned long last_pfn)
{
	unsigned int large_page;
	struct dma_pte *first_pte, *pte;

	BUG_ON(!domain_pfn_supported(domain, start_pfn));
	BUG_ON(!domain_pfn_supported(domain, last_pfn));
	BUG_ON(start_pfn > last_pfn);

	/* we don't need lock here; nobody else touches the iova range */
	do {
		large_page = 1;
		first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1, &large_page);
		if (!pte) {
			start_pfn = align_to_level(start_pfn + 1, large_page + 1);
			continue;
		}
		do {
			dma_clear_pte(pte);
			start_pfn += lvl_to_nr_pages(large_page);
			pte++;
		} while (start_pfn <= last_pfn && !first_pte_in_page(pte));

		domain_flush_cache(domain, first_pte,
				   (void *)pte - (void *)first_pte);

	} while (start_pfn && start_pfn <= last_pfn);
}
static void dma_pte_free_level(struct dmar_domain *domain, int level,
			       int retain_level, struct dma_pte *pte,
			       unsigned long pfn, unsigned long start_pfn,
			       unsigned long last_pfn)
{
	pfn = max(start_pfn, pfn);
	pte = &pte[pfn_level_offset(pfn, level)];

	do {
		unsigned long level_pfn;
		struct dma_pte *level_pte;

		if (!dma_pte_present(pte) || dma_pte_superpage(pte))
			goto next;

		level_pfn = pfn & level_mask(level);
		level_pte = phys_to_virt(dma_pte_addr(pte));

		if (level > 2) {
			dma_pte_free_level(domain, level - 1, retain_level,
					   level_pte, level_pfn, start_pfn,
					   last_pfn);
		}

		/*
		 * Free the page table if we're below the level we want to
		 * retain and the range covers the entire table.
		 */
		if (level < retain_level && !(start_pfn > level_pfn ||
		      last_pfn < level_pfn + level_size(level) - 1)) {
			dma_clear_pte(pte);
			domain_flush_cache(domain, pte, sizeof(*pte));
			free_pgtable_page(level_pte);
		}
next:
		pfn += level_size(level);
	} while (!first_pte_in_page(++pte) && pfn <= last_pfn);
}
/*
 * clear last level (leaf) ptes and free page table pages below the
 * level we wish to keep intact.
 */
static void dma_pte_free_pagetable(struct dmar_domain *domain,
				   unsigned long start_pfn,
				   unsigned long last_pfn,
				   int retain_level)
{
	BUG_ON(!domain_pfn_supported(domain, start_pfn));
	BUG_ON(!domain_pfn_supported(domain, last_pfn));
	BUG_ON(start_pfn > last_pfn);

	dma_pte_clear_range(domain, start_pfn, last_pfn);

	/* We don't need lock here; nobody else touches the iova range */
	dma_pte_free_level(domain, agaw_to_level(domain->agaw), retain_level,
			   domain->pgd, 0, start_pfn, last_pfn);

	/* free pgd */
	if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
		free_pgtable_page(domain->pgd);
		domain->pgd = NULL;
	}
}
/* When a page at a given level is being unlinked from its parent, we don't
   need to *modify* it at all. All we need to do is make a list of all the
   pages which can be freed just as soon as we've flushed the IOTLB and we
   know the hardware page-walk will no longer touch them.
   The 'pte' argument is the *parent* PTE, pointing to the page that is to
   be freed. */
static struct page *dma_pte_list_pagetables(struct dmar_domain *domain,
					    int level, struct dma_pte *pte,
					    struct page *freelist)
{
	struct page *pg;

	pg = pfn_to_page(dma_pte_addr(pte) >> PAGE_SHIFT);
	pg->freelist = freelist;
	freelist = pg;

	if (level == 1)
		return freelist;

	pte = page_address(pg);
	do {
		if (dma_pte_present(pte) && !dma_pte_superpage(pte))
			freelist = dma_pte_list_pagetables(domain, level - 1,
							   pte, freelist);
		pte++;
	} while (!first_pte_in_page(pte));

	return freelist;
}
static struct page *dma_pte_clear_level(struct dmar_domain *domain, int level,
					struct dma_pte *pte, unsigned long pfn,
					unsigned long start_pfn,
					unsigned long last_pfn,
					struct page *freelist)
{
	struct dma_pte *first_pte = NULL, *last_pte = NULL;

	pfn = max(start_pfn, pfn);
	pte = &pte[pfn_level_offset(pfn, level)];

	do {
		unsigned long level_pfn;

		if (!dma_pte_present(pte))
			goto next;

		level_pfn = pfn & level_mask(level);

		/* If range covers entire pagetable, free it */
		if (start_pfn <= level_pfn &&
		    last_pfn >= level_pfn + level_size(level) - 1) {
			/* These subordinate page tables are going away entirely. Don't
			   bother to clear them; we're just going to *free* them. */
			if (level > 1 && !dma_pte_superpage(pte))
				freelist = dma_pte_list_pagetables(domain, level - 1, pte, freelist);

			dma_clear_pte(pte);
			if (!first_pte)
				first_pte = pte;
			last_pte = pte;
		} else if (level > 1) {
			/* Recurse down into a level that isn't *entirely* obsolete */
			freelist = dma_pte_clear_level(domain, level - 1,
						       phys_to_virt(dma_pte_addr(pte)),
						       level_pfn, start_pfn, last_pfn,
						       freelist);
		}
next:
		pfn += level_size(level);
	} while (!first_pte_in_page(++pte) && pfn <= last_pfn);

	if (first_pte)
		domain_flush_cache(domain, first_pte,
				   (void *)++last_pte - (void *)first_pte);

	return freelist;
}
/* We can't just free the pages because the IOMMU may still be walking
   the page tables, and may have cached the intermediate levels. The
   pages can only be freed after the IOTLB flush has been done. */
static struct page *domain_unmap(struct dmar_domain *domain,
				 unsigned long start_pfn,
				 unsigned long last_pfn,
				 struct page *freelist)
{
	BUG_ON(!domain_pfn_supported(domain, start_pfn));
	BUG_ON(!domain_pfn_supported(domain, last_pfn));
	BUG_ON(start_pfn > last_pfn);

	/* we don't need lock here; nobody else touches the iova range */
	freelist = dma_pte_clear_level(domain, agaw_to_level(domain->agaw),
				       domain->pgd, 0, start_pfn, last_pfn,
				       freelist);

	/* free pgd */
	if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
		struct page *pgd_page = virt_to_page(domain->pgd);
		pgd_page->freelist = freelist;
		freelist = pgd_page;

		domain->pgd = NULL;
	}

	return freelist;
}
static void dma_free_pagelist(struct page *freelist)
{
	struct page *pg;

	while ((pg = freelist)) {
		freelist = pg->freelist;
		free_pgtable_page(page_address(pg));
	}
}
/* iommu handling */
static int iommu_alloc_root_entry(struct intel_iommu *iommu)
{
	struct root_entry *root;
	unsigned long flags;

	root = (struct root_entry *)alloc_pgtable_page(iommu->node);
	if (!root) {
		pr_err("Allocating root entry for %s failed\n",
			iommu->name);
		return -ENOMEM;
	}

	__iommu_flush_cache(iommu, root, ROOT_SIZE);

	spin_lock_irqsave(&iommu->lock, flags);
	iommu->root_entry = root;
	spin_unlock_irqrestore(&iommu->lock, flags);

	return 0;
}
static void iommu_set_root_entry(struct intel_iommu *iommu)
{
	u64 addr;
	u32 sts;
	unsigned long flag;

	addr = virt_to_phys(iommu->root_entry);
	if (sm_supported(iommu))
		addr |= DMA_RTADDR_SMT;

	raw_spin_lock_irqsave(&iommu->register_lock, flag);
	dmar_writeq(iommu->reg + DMAR_RTADDR_REG, addr);

	writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG);

	/* Make sure hardware completes it */
	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
		      readl, (sts & DMA_GSTS_RTPS), sts);

	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
}
void iommu_flush_write_buffer(struct intel_iommu *iommu)
{
	u32 val;
	unsigned long flag;

	if (!rwbf_quirk && !cap_rwbf(iommu->cap))
		return;

	raw_spin_lock_irqsave(&iommu->register_lock, flag);
	writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG);

	/* Make sure hardware completes it */
	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
		      readl, (!(val & DMA_GSTS_WBFS)), val);

	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
}
/* return value determines if we need a write buffer flush */
static void __iommu_flush_context(struct intel_iommu *iommu,
				  u16 did, u16 source_id, u8 function_mask,
				  u64 type)
{
	u64 val = 0;
	unsigned long flag;

	switch (type) {
	case DMA_CCMD_GLOBAL_INVL:
		val = DMA_CCMD_GLOBAL_INVL;
		break;
	case DMA_CCMD_DOMAIN_INVL:
		val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
		break;
	case DMA_CCMD_DEVICE_INVL:
		val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
			| DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
		break;
	default:
		BUG();
	}
	val |= DMA_CCMD_ICC;

	raw_spin_lock_irqsave(&iommu->register_lock, flag);
	dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);

	/* Make sure hardware completes it */
	IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
		      dmar_readq, (!(val & DMA_CCMD_ICC)), val);

	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
}
/* return value determines if we need a write buffer flush */
static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
				u64 addr, unsigned int size_order, u64 type)
{
	int tlb_offset = ecap_iotlb_offset(iommu->ecap);
	u64 val = 0, val_iva = 0;
	unsigned long flag;

	switch (type) {
	case DMA_TLB_GLOBAL_FLUSH:
		/* global flush doesn't need set IVA_REG */
		val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
		break;
	case DMA_TLB_DSI_FLUSH:
		val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
		break;
	case DMA_TLB_PSI_FLUSH:
		val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
		/* IH bit is passed in as part of address */
		val_iva = size_order | addr;
		break;
	default:
		BUG();
	}
	/* Note: set drain read/write */
#if 0
	/*
	 * This is probably to be super secure.. Looks like we can
	 * ignore it without any impact.
	 */
	if (cap_read_drain(iommu->cap))
		val |= DMA_TLB_READ_DRAIN;
#endif
	if (cap_write_drain(iommu->cap))
		val |= DMA_TLB_WRITE_DRAIN;

	raw_spin_lock_irqsave(&iommu->register_lock, flag);
	/* Note: Only uses first TLB reg currently */
	if (val_iva)
		dmar_writeq(iommu->reg + tlb_offset, val_iva);
	dmar_writeq(iommu->reg + tlb_offset + 8, val);

	/* Make sure hardware completes it */
	IOMMU_WAIT_OP(iommu, tlb_offset + 8,
		      dmar_readq, (!(val & DMA_TLB_IVT)), val);

	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);

	/* check IOTLB invalidation granularity */
	if (DMA_TLB_IAIG(val) == 0)
		pr_err("Flush IOTLB failed\n");
	if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
		pr_debug("TLB flush request %Lx, actual %Lx\n",
			(unsigned long long)DMA_TLB_IIRG(type),
			(unsigned long long)DMA_TLB_IAIG(val));
}
static struct device_domain_info *
iommu_support_dev_iotlb(struct dmar_domain *domain, struct intel_iommu *iommu,
			u8 bus, u8 devfn)
{
	struct device_domain_info *info;

	assert_spin_locked(&device_domain_lock);

	if (!iommu->qi)
		return NULL;

	list_for_each_entry(info, &domain->devices, link)
		if (info->iommu == iommu && info->bus == bus &&
		    info->devfn == devfn) {
			if (info->ats_supported && info->dev)
				return info;
			break;
		}

	return NULL;
}
static void domain_update_iotlb(struct dmar_domain *domain)
{
	struct device_domain_info *info;
	bool has_iotlb_device = false;

	assert_spin_locked(&device_domain_lock);

	list_for_each_entry(info, &domain->devices, link)
		if (info->ats_enabled) {
			has_iotlb_device = true;
			break;
		}

	if (!has_iotlb_device) {
		struct subdev_domain_info *sinfo;

		list_for_each_entry(sinfo, &domain->subdevices, link_domain) {
			info = get_domain_info(sinfo->pdev);
			if (info && info->ats_enabled) {
				has_iotlb_device = true;
				break;
			}
		}
	}

	domain->has_iotlb_device = has_iotlb_device;
}
static void iommu_enable_dev_iotlb(struct device_domain_info *info)
{
	struct pci_dev *pdev;

	assert_spin_locked(&device_domain_lock);

	if (!info || !dev_is_pci(info->dev))
		return;

	pdev = to_pci_dev(info->dev);
	/* For IOMMU that supports device IOTLB throttling (DIT), we assign
	 * PFSID to the invalidation desc of a VF such that IOMMU HW can gauge
	 * queue depth at PF level. If DIT is not set, PFSID will be treated as
	 * reserved, which should be set to 0.
	 */
	if (!ecap_dit(info->iommu->ecap))
		info->pfsid = pci_dev_id(pdev);
	else {
		struct pci_dev *pf_pdev;

		/* pdev will be returned if device is not a vf */
		pf_pdev = pci_physfn(pdev);
		info->pfsid = pci_dev_id(pf_pdev);
	}

#ifdef CONFIG_INTEL_IOMMU_SVM
	/* The PCIe spec, in its wisdom, declares that the behaviour of
	   the device if you enable PASID support after ATS support is
	   undefined. So always enable PASID support on devices which
	   have it, even if we can't yet know if we're ever going to
	   use it. */
	if (info->pasid_supported && !pci_enable_pasid(pdev, info->pasid_supported & ~1))
		info->pasid_enabled = 1;

	if (info->pri_supported &&
	    (info->pasid_enabled ? pci_prg_resp_pasid_required(pdev) : 1)  &&
	    !pci_reset_pri(pdev) && !pci_enable_pri(pdev, 32))
		info->pri_enabled = 1;
#endif
	if (info->ats_supported && pci_ats_page_aligned(pdev) &&
	    !pci_enable_ats(pdev, VTD_PAGE_SHIFT)) {
		info->ats_enabled = 1;
		domain_update_iotlb(info->domain);
		info->ats_qdep = pci_ats_queue_depth(pdev);
	}
}
static void iommu_disable_dev_iotlb(struct device_domain_info *info)
{
	struct pci_dev *pdev;

	assert_spin_locked(&device_domain_lock);

	if (!dev_is_pci(info->dev))
		return;

	pdev = to_pci_dev(info->dev);

	if (info->ats_enabled) {
		pci_disable_ats(pdev);
		info->ats_enabled = 0;
		domain_update_iotlb(info->domain);
	}
#ifdef CONFIG_INTEL_IOMMU_SVM
	if (info->pri_enabled) {
		pci_disable_pri(pdev);
		info->pri_enabled = 0;
	}
	if (info->pasid_enabled) {
		pci_disable_pasid(pdev);
		info->pasid_enabled = 0;
	}
#endif
}
static void __iommu_flush_dev_iotlb(struct device_domain_info *info,
				    u64 addr, unsigned int mask)
{
	u16 sid, qdep;

	if (!info || !info->ats_enabled)
		return;

	sid = info->bus << 8 | info->devfn;
	qdep = info->ats_qdep;
	qi_flush_dev_iotlb(info->iommu, sid, info->pfsid,
			   qdep, addr, mask);
}

static void iommu_flush_dev_iotlb(struct dmar_domain *domain,
				  u64 addr, unsigned mask)
{
	unsigned long flags;
	struct device_domain_info *info;
	struct subdev_domain_info *sinfo;

	if (!domain->has_iotlb_device)
		return;

	spin_lock_irqsave(&device_domain_lock, flags);
	list_for_each_entry(info, &domain->devices, link)
		__iommu_flush_dev_iotlb(info, addr, mask);

	list_for_each_entry(sinfo, &domain->subdevices, link_domain) {
		info = get_domain_info(sinfo->pdev);
		__iommu_flush_dev_iotlb(info, addr, mask);
	}
	spin_unlock_irqrestore(&device_domain_lock, flags);
}
static void domain_flush_piotlb(struct intel_iommu *iommu,
				struct dmar_domain *domain,
				u64 addr, unsigned long npages, bool ih)
{
	u16 did = domain->iommu_did[iommu->seq_id];

	if (domain->default_pasid)
		qi_flush_piotlb(iommu, did, domain->default_pasid,
				addr, npages, ih);

	if (!list_empty(&domain->devices))
		qi_flush_piotlb(iommu, did, PASID_RID2PASID, addr, npages, ih);
}
static void iommu_flush_iotlb_psi(struct intel_iommu *iommu,
				  struct dmar_domain *domain,
				  unsigned long pfn, unsigned int pages,
				  int ih, int map)
{
	unsigned int mask = ilog2(__roundup_pow_of_two(pages));
	uint64_t addr = (uint64_t)pfn << VTD_PAGE_SHIFT;
	u16 did = domain->iommu_did[iommu->seq_id];

	BUG_ON(pages == 0);

	if (ih)
		ih = 1 << 6;

	if (domain_use_first_level(domain)) {
		domain_flush_piotlb(iommu, domain, addr, pages, ih);
	} else {
		/*
		 * Fallback to domain selective flush if no PSI support or
		 * the size is too big. PSI requires page size to be 2 ^ x,
		 * and the base address is naturally aligned to the size.
		 */
		if (!cap_pgsel_inv(iommu->cap) ||
		    mask > cap_max_amask_val(iommu->cap))
			iommu->flush.flush_iotlb(iommu, did, 0, 0,
							DMA_TLB_DSI_FLUSH);
		else
			iommu->flush.flush_iotlb(iommu, did, addr | ih, mask,
							DMA_TLB_PSI_FLUSH);
	}

	/*
	 * In caching mode, changes of pages from non-present to present require
	 * flush. However, device IOTLB doesn't need to be flushed in this case.
	 */
	if (!cap_caching_mode(iommu->cap) || !map)
		iommu_flush_dev_iotlb(domain, addr, mask);
}
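/*
 * Example: pages == 3 rounds up to 4, so mask == 2 and the PSI flush
 * covers a naturally aligned 4-page (16KiB) region starting at addr;
 * hardware ignores the address bits below the mask.
 */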
/* Notification for newly created mappings */
static inline void __mapping_notify_one(struct intel_iommu *iommu,
					struct dmar_domain *domain,
					unsigned long pfn, unsigned int pages)
{
	/*
	 * It's a non-present to present mapping. Only flush if caching mode
	 * and second level.
	 */
	if (cap_caching_mode(iommu->cap) && !domain_use_first_level(domain))
		iommu_flush_iotlb_psi(iommu, domain, pfn, pages, 0, 1);
	else
		iommu_flush_write_buffer(iommu);
}
static void intel_flush_iotlb_all(struct iommu_domain *domain)
{
	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
	int idx;

	for_each_domain_iommu(idx, dmar_domain) {
		struct intel_iommu *iommu = g_iommus[idx];
		u16 did = dmar_domain->iommu_did[iommu->seq_id];

		if (domain_use_first_level(dmar_domain))
			domain_flush_piotlb(iommu, dmar_domain, 0, -1, 0);
		else
			iommu->flush.flush_iotlb(iommu, did, 0, 0,
						 DMA_TLB_DSI_FLUSH);

		if (!cap_caching_mode(iommu->cap))
			iommu_flush_dev_iotlb(get_iommu_domain(iommu, did),
					      0, MAX_AGAW_PFN_WIDTH);
	}
}
static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
{
	u32 pmen;
	unsigned long flags;

	if (!cap_plmr(iommu->cap) && !cap_phmr(iommu->cap))
		return;

	raw_spin_lock_irqsave(&iommu->register_lock, flags);
	pmen = readl(iommu->reg + DMAR_PMEN_REG);
	pmen &= ~DMA_PMEN_EPM;
	writel(pmen, iommu->reg + DMAR_PMEN_REG);

	/* wait for the protected region status bit to clear */
	IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
		readl, !(pmen & DMA_PMEN_PRS), pmen);

	raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
}
static void iommu_enable_translation(struct intel_iommu *iommu)
{
	u32 sts;
	unsigned long flags;

	raw_spin_lock_irqsave(&iommu->register_lock, flags);
	iommu->gcmd |= DMA_GCMD_TE;
	writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);

	/* Make sure hardware completes it */
	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
		      readl, (sts & DMA_GSTS_TES), sts);

	raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
}
static void iommu_disable_translation(struct intel_iommu *iommu)
{
	u32 sts;
	unsigned long flag;

	if (iommu_skip_te_disable && iommu->drhd->gfx_dedicated &&
	    (cap_read_drain(iommu->cap) || cap_write_drain(iommu->cap)))
		return;

	raw_spin_lock_irqsave(&iommu->register_lock, flag);
	iommu->gcmd &= ~DMA_GCMD_TE;
	writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);

	/* Make sure hardware completes it */
	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
		      readl, (!(sts & DMA_GSTS_TES)), sts);

	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
}
static int iommu_init_domains(struct intel_iommu *iommu)
{
	u32 ndomains, nlongs;
	size_t size;

	ndomains = cap_ndoms(iommu->cap);
	pr_debug("%s: Number of Domains supported <%d>\n",
		 iommu->name, ndomains);
	nlongs = BITS_TO_LONGS(ndomains);

	spin_lock_init(&iommu->lock);

	iommu->domain_ids = kcalloc(nlongs, sizeof(unsigned long), GFP_KERNEL);
	if (!iommu->domain_ids) {
		pr_err("%s: Allocating domain id array failed\n",
		       iommu->name);
		return -ENOMEM;
	}

	size = (ALIGN(ndomains, 256) >> 8) * sizeof(struct dmar_domain **);
	iommu->domains = kzalloc(size, GFP_KERNEL);

	if (iommu->domains) {
		size = 256 * sizeof(struct dmar_domain *);
		iommu->domains[0] = kzalloc(size, GFP_KERNEL);
	}

	if (!iommu->domains || !iommu->domains[0]) {
		pr_err("%s: Allocating domain array failed\n",
		       iommu->name);
		kfree(iommu->domain_ids);
		kfree(iommu->domains);
		iommu->domain_ids = NULL;
		iommu->domains    = NULL;
		return -ENOMEM;
	}

	/*
	 * If Caching mode is set, then invalid translations are tagged
	 * with domain-id 0, hence we need to pre-allocate it. We also
	 * use domain-id 0 as a marker for non-allocated domain-id, so
	 * make sure it is not used for a real domain.
	 */
	set_bit(0, iommu->domain_ids);

	/*
	 * Vt-d spec rev3.0 (section 6.2.3.1) requires that each pasid
	 * entry for first-level or pass-through translation modes should
	 * be programmed with a domain id different from those used for
	 * second-level or nested translation. We reserve a domain id for
	 * this purpose.
	 */
	if (sm_supported(iommu))
		set_bit(FLPT_DEFAULT_DID, iommu->domain_ids);

	return 0;
}
static void disable_dmar_iommu(struct intel_iommu *iommu)
{
	struct device_domain_info *info, *tmp;
	unsigned long flags;

	if (!iommu->domains || !iommu->domain_ids)
		return;

	spin_lock_irqsave(&device_domain_lock, flags);
	list_for_each_entry_safe(info, tmp, &device_domain_list, global) {
		if (info->iommu != iommu)
			continue;

		if (!info->dev || !info->domain)
			continue;

		__dmar_remove_one_dev_info(info);
	}
	spin_unlock_irqrestore(&device_domain_lock, flags);

	if (iommu->gcmd & DMA_GCMD_TE)
		iommu_disable_translation(iommu);
}
static void free_dmar_iommu(struct intel_iommu *iommu)
{
	if ((iommu->domains) && (iommu->domain_ids)) {
		int elems = ALIGN(cap_ndoms(iommu->cap), 256) >> 8;
		int i;

		for (i = 0; i < elems; i++)
			kfree(iommu->domains[i]);
		kfree(iommu->domains);
		kfree(iommu->domain_ids);
		iommu->domains = NULL;
		iommu->domain_ids = NULL;
	}

	g_iommus[iommu->seq_id] = NULL;

	/* free context mapping */
	free_context_table(iommu);

#ifdef CONFIG_INTEL_IOMMU_SVM
	if (pasid_supported(iommu)) {
		if (ecap_prs(iommu->ecap))
			intel_svm_finish_prq(iommu);
	}
	if (vccap_pasid(iommu->vccap))
		ioasid_unregister_allocator(&iommu->pasid_allocator);
#endif
}
/*
 * Check and return whether first level is used by default for
 * DMA translation.
 */
static bool first_level_by_default(void)
{
	struct dmar_drhd_unit *drhd;
	struct intel_iommu *iommu;
	static int first_level_support = -1;

	if (likely(first_level_support != -1))
		return first_level_support;

	first_level_support = 1;

	rcu_read_lock();
	for_each_active_iommu(iommu, drhd) {
		if (!sm_supported(iommu) || !ecap_flts(iommu->ecap)) {
			first_level_support = 0;
			break;
		}
	}
	rcu_read_unlock();

	return first_level_support;
}
static struct dmar_domain *alloc_domain(int flags)
{
	struct dmar_domain *domain;

	domain = alloc_domain_mem();
	if (!domain)
		return NULL;

	memset(domain, 0, sizeof(*domain));
	domain->nid = NUMA_NO_NODE;
	domain->flags = flags;
	if (first_level_by_default())
		domain->flags |= DOMAIN_FLAG_USE_FIRST_LEVEL;
	domain->has_iotlb_device = false;
	INIT_LIST_HEAD(&domain->devices);
	INIT_LIST_HEAD(&domain->subdevices);

	return domain;
}
/* Must be called with iommu->lock held */
static int domain_attach_iommu(struct dmar_domain *domain,
			       struct intel_iommu *iommu)
{
	unsigned long ndomains;
	int num;

	assert_spin_locked(&device_domain_lock);
	assert_spin_locked(&iommu->lock);

	domain->iommu_refcnt[iommu->seq_id] += 1;
	domain->iommu_count += 1;
	if (domain->iommu_refcnt[iommu->seq_id] == 1) {
		ndomains = cap_ndoms(iommu->cap);
		num      = find_first_zero_bit(iommu->domain_ids, ndomains);

		if (num >= ndomains) {
			pr_err("%s: No free domain ids\n", iommu->name);
			domain->iommu_refcnt[iommu->seq_id] -= 1;
			domain->iommu_count -= 1;
			return -ENOSPC;
		}

		set_bit(num, iommu->domain_ids);
		set_iommu_domain(iommu, num, domain);

		domain->iommu_did[iommu->seq_id] = num;
		domain->nid			 = iommu->node;

		domain_update_iommu_cap(domain);
	}

	return 0;
}
static int domain_detach_iommu(struct dmar_domain *domain,
			       struct intel_iommu *iommu)
{
	int num, count;

	assert_spin_locked(&device_domain_lock);
	assert_spin_locked(&iommu->lock);

	domain->iommu_refcnt[iommu->seq_id] -= 1;
	count = --domain->iommu_count;
	if (domain->iommu_refcnt[iommu->seq_id] == 0) {
		num = domain->iommu_did[iommu->seq_id];
		clear_bit(num, iommu->domain_ids);
		set_iommu_domain(iommu, num, NULL);

		domain_update_iommu_cap(domain);
		domain->iommu_did[iommu->seq_id] = 0;
	}

	return count;
}
static inline int guestwidth_to_adjustwidth(int gaw)
{
	int agaw;
	int r = (gaw - 12) % 9;

	if (r == 0)
		agaw = gaw;
	else
		agaw = gaw + 9 - r;
	if (agaw > 64)
		agaw = 64;
	return agaw;
}
static void domain_exit(struct dmar_domain *domain)
{
	/* Remove associated devices and clear attached or cached domains */
	domain_remove_dev_info(domain);

	/* destroy iovas */
	if (domain->domain.type == IOMMU_DOMAIN_DMA)
		iommu_put_dma_cookie(&domain->domain);

	if (domain->pgd) {
		struct page *freelist;

		freelist = domain_unmap(domain, 0,
					DOMAIN_MAX_PFN(domain->gaw), NULL);
		dma_free_pagelist(freelist);
	}

	free_domain_mem(domain);
}
/*
 * Get the PASID directory size for scalable mode context entry.
 * Value of X in the PDTS field of a scalable mode context entry
 * indicates PASID directory with 2^(X + 7) entries.
 */
static inline unsigned long context_get_sm_pds(struct pasid_table *table)
{
	unsigned long pds, max_pde;

	max_pde = table->max_pasid >> PASID_PDE_SHIFT;
	pds = find_first_bit((unsigned long *)&max_pde, MAX_NR_PASID_BITS);
	if (pds < 7)
		return 0;

	return pds - 7;
}
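/*
 * Example (assuming PASID_PDE_SHIFT == 6): a 20-bit PASID space gives
 * max_pde == 1 << 14, i.e. 2^14 directory entries, which the PDTS
 * encoding described above expresses as X == 7 (2^(7 + 7) == 2^14).
 */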
/*
 * Set the RID_PASID field of a scalable mode context entry. The
 * IOMMU hardware will use the PASID value set in this field for
 * DMA translations of DMA requests without PASID.
 */
static inline void
context_set_sm_rid2pasid(struct context_entry *context, unsigned long pasid)
{
	context->hi |= pasid & ((1 << 20) - 1);
}

/*
 * Set the DTE(Device-TLB Enable) field of a scalable mode context
 * entry.
 */
static inline void context_set_sm_dte(struct context_entry *context)
{
	context->lo |= (1 << 2);
}

/*
 * Set the PRE(Page Request Enable) field of a scalable mode context
 * entry.
 */
static inline void context_set_sm_pre(struct context_entry *context)
{
	context->lo |= (1 << 4);
}

/* Convert value to context PASID directory size field coding. */
#define context_pdts(pds)	(((pds) & 0x7) << 9)
static int domain_context_mapping_one(struct dmar_domain *domain,
				      struct intel_iommu *iommu,
				      struct pasid_table *table,
				      u8 bus, u8 devfn)
{
	u16 did = domain->iommu_did[iommu->seq_id];
	int translation = CONTEXT_TT_MULTI_LEVEL;
	struct device_domain_info *info = NULL;
	struct context_entry *context;
	unsigned long flags;
	int ret;

	WARN_ON(did == 0);

	if (hw_pass_through && domain_type_is_si(domain))
		translation = CONTEXT_TT_PASS_THROUGH;

	pr_debug("Set context mapping for %02x:%02x.%d\n",
		bus, PCI_SLOT(devfn), PCI_FUNC(devfn));

	BUG_ON(!domain->pgd);

	spin_lock_irqsave(&device_domain_lock, flags);
	spin_lock(&iommu->lock);

	ret = -ENOMEM;
	context = iommu_context_addr(iommu, bus, devfn, 1);
	if (!context)
		goto out_unlock;

	ret = 0;
	if (context_present(context))
		goto out_unlock;

	/*
	 * For kdump cases, old valid entries may be cached due to the
	 * in-flight DMA and copied pgtable, but there is no unmapping
	 * behaviour for them, thus we need an explicit cache flush for
	 * the newly-mapped device. For kdump, at this point, the device
	 * is supposed to finish reset at its driver probe stage, so no
	 * in-flight DMA will exist, and we don't need to worry anymore
	 * hereafter.
	 */
	if (context_copied(context)) {
		u16 did_old = context_domain_id(context);

		if (did_old < cap_ndoms(iommu->cap)) {
			iommu->flush.flush_context(iommu, did_old,
						   (((u16)bus) << 8) | devfn,
						   DMA_CCMD_MASK_NOBIT,
						   DMA_CCMD_DEVICE_INVL);
			iommu->flush.flush_iotlb(iommu, did_old, 0, 0,
						 DMA_TLB_DSI_FLUSH);
		}
	}

	context_clear_entry(context);

	if (sm_supported(iommu)) {
		unsigned long pds;

		WARN_ON(!table);

		/* Setup the PASID DIR pointer: */
		pds = context_get_sm_pds(table);
		context->lo = (u64)virt_to_phys(table->table) |
				context_pdts(pds);

		/* Setup the RID_PASID field: */
		context_set_sm_rid2pasid(context, PASID_RID2PASID);

		/*
		 * Setup the Device-TLB enable bit and Page request
		 * Enable bit:
		 */
		info = iommu_support_dev_iotlb(domain, iommu, bus, devfn);
		if (info && info->ats_supported)
			context_set_sm_dte(context);
		if (info && info->pri_supported)
			context_set_sm_pre(context);
	} else {
		struct dma_pte *pgd = domain->pgd;
		int agaw;

		context_set_domain_id(context, did);

		if (translation != CONTEXT_TT_PASS_THROUGH) {
			/*
			 * Skip top levels of page tables for iommu which has
			 * less agaw than default. Unnecessary for PT mode.
			 */
			for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) {
				ret = -ENOMEM;
				pgd = phys_to_virt(dma_pte_addr(pgd));
				if (!dma_pte_present(pgd))
					goto out_unlock;
			}

			info = iommu_support_dev_iotlb(domain, iommu, bus, devfn);
			if (info && info->ats_supported)
				translation = CONTEXT_TT_DEV_IOTLB;
			else
				translation = CONTEXT_TT_MULTI_LEVEL;

			context_set_address_root(context, virt_to_phys(pgd));
			context_set_address_width(context, agaw);
		} else {
			/*
			 * In pass through mode, AW must be programmed to
			 * indicate the largest AGAW value supported by
			 * hardware. And ASR is ignored by hardware.
			 */
			context_set_address_width(context, iommu->msagaw);
		}

		context_set_translation_type(context, translation);
	}

	context_set_fault_enable(context);
	context_set_present(context);
	if (!ecap_coherent(iommu->ecap))
		clflush_cache_range(context, sizeof(*context));

	/*
	 * It's a non-present to present mapping. If hardware doesn't cache
	 * non-present entries we only need to flush the write-buffer. If it
	 * _does_ cache non-present entries, then it does so in the special
	 * domain #0, which we have to flush:
	 */
	if (cap_caching_mode(iommu->cap)) {
		iommu->flush.flush_context(iommu, 0,
					   (((u16)bus) << 8) | devfn,
					   DMA_CCMD_MASK_NOBIT,
					   DMA_CCMD_DEVICE_INVL);
		iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH);
	} else {
		iommu_flush_write_buffer(iommu);
	}
	iommu_enable_dev_iotlb(info);

	ret = 0;

out_unlock:
	spin_unlock(&iommu->lock);
	spin_unlock_irqrestore(&device_domain_lock, flags);

	return ret;
}
struct domain_context_mapping_data {
	struct dmar_domain *domain;
	struct intel_iommu *iommu;
	struct pasid_table *table;
};

static int domain_context_mapping_cb(struct pci_dev *pdev,
				     u16 alias, void *opaque)
{
	struct domain_context_mapping_data *data = opaque;

	return domain_context_mapping_one(data->domain, data->iommu,
					  data->table, PCI_BUS_NUM(alias),
					  alias & 0xff);
}

static int
domain_context_mapping(struct dmar_domain *domain, struct device *dev)
{
	struct domain_context_mapping_data data;
	struct pasid_table *table;
	struct intel_iommu *iommu;
	u8 bus, devfn;

	iommu = device_to_iommu(dev, &bus, &devfn);
	if (!iommu)
		return -ENODEV;

	table = intel_pasid_get_table(dev);

	if (!dev_is_pci(dev))
		return domain_context_mapping_one(domain, iommu, table,
						  bus, devfn);

	data.domain = domain;
	data.iommu = iommu;
	data.table = table;

	return pci_for_each_dma_alias(to_pci_dev(dev),
				      &domain_context_mapping_cb, &data);
}
static int domain_context_mapped_cb(struct pci_dev *pdev,
				    u16 alias, void *opaque)
{
	struct intel_iommu *iommu = opaque;

	return !device_context_mapped(iommu, PCI_BUS_NUM(alias), alias & 0xff);
}

static int domain_context_mapped(struct device *dev)
{
	struct intel_iommu *iommu;
	u8 bus, devfn;

	iommu = device_to_iommu(dev, &bus, &devfn);
	if (!iommu)
		return -ENODEV;

	if (!dev_is_pci(dev))
		return device_context_mapped(iommu, bus, devfn);

	return !pci_for_each_dma_alias(to_pci_dev(dev),
				       domain_context_mapped_cb, iommu);
}
/* Returns a number of VTD pages, but aligned to MM page size */
static inline unsigned long aligned_nrpages(unsigned long host_addr,
					    size_t size)
{
	host_addr &= ~PAGE_MASK;
	return PAGE_ALIGN(host_addr + size) >> VTD_PAGE_SHIFT;
}
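/*
 * Example (4KiB MM pages): host_addr == 0x1800 and size == 0x1000 leave
 * an in-page offset of 0x800, so PAGE_ALIGN(0x1800) >> VTD_PAGE_SHIFT
 * == 2: the buffer straddles two VT-d pages even though it is only one
 * page long.
 */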
/* Return largest possible superpage level for a given mapping */
static inline int hardware_largepage_caps(struct dmar_domain *domain,
					  unsigned long iov_pfn,
					  unsigned long phy_pfn,
					  unsigned long pages)
{
	int support, level = 1;
	unsigned long pfnmerge;

	support = domain->iommu_superpage;

	/* To use a large page, the virtual *and* physical addresses
	   must be aligned to 2MiB/1GiB/etc. Lower bits set in either
	   of them will mean we have to use smaller pages. So just
	   merge them and check both at once. */
	pfnmerge = iov_pfn | phy_pfn;

	while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) {
		pages >>= VTD_STRIDE_SHIFT;
		if (!pages)
			break;
		pfnmerge >>= VTD_STRIDE_SHIFT;
		level++;
		support--;
	}
	return level;
}
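/*
 * Example: iov_pfn == 0x200, phy_pfn == 0x400 and pages == 0x200 with
 * iommu_superpage >= 1: both pfns have their low 9 bits clear and at
 * least 512 pages are being mapped, so level 2 is returned and a single
 * 2MiB superpage PTE can be used.
 */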
2298 __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2299 unsigned long phys_pfn, unsigned long nr_pages, int prot)
2301 struct dma_pte *first_pte = NULL, *pte = NULL;
2302 unsigned int largepage_lvl = 0;
2303 unsigned long lvl_pages = 0;
2307 BUG_ON(!domain_pfn_supported(domain, iov_pfn + nr_pages - 1));
2309 if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
2312 attr = prot & (DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP);
2313 if (domain_use_first_level(domain))
2314 attr |= DMA_FL_PTE_PRESENT | DMA_FL_PTE_XD | DMA_FL_PTE_US;
2316 pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | attr;
2318 while (nr_pages > 0) {
2322 largepage_lvl = hardware_largepage_caps(domain, iov_pfn,
2323 phys_pfn, nr_pages);
2325 first_pte = pte = pfn_to_dma_pte(domain, iov_pfn, &largepage_lvl);
2328 /* It is large page*/
2329 if (largepage_lvl > 1) {
2330 unsigned long nr_superpages, end_pfn;
2332 pteval |= DMA_PTE_LARGE_PAGE;
2333 lvl_pages = lvl_to_nr_pages(largepage_lvl);
2335 nr_superpages = nr_pages / lvl_pages;
2336 end_pfn = iov_pfn + nr_superpages * lvl_pages - 1;
2339 * Ensure that old small page tables are
2340 * removed to make room for superpage(s).
2341 * We're adding new large pages, so make sure
2342 * we don't remove their parent tables.
2344 dma_pte_free_pagetable(domain, iov_pfn, end_pfn,
2347 pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE;
2351 /* We don't need lock here, nobody else
2352 * touches the iova range
2354 tmp = cmpxchg64_local(&pte->val, 0ULL, pteval);
2356 static int dumps = 5;
2357 pr_crit("ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n",
2358 iov_pfn, tmp, (unsigned long long)pteval);
2361 debug_dma_dump_mappings(NULL);
2366 lvl_pages = lvl_to_nr_pages(largepage_lvl);
2368 BUG_ON(nr_pages < lvl_pages);
2370 nr_pages -= lvl_pages;
2371 iov_pfn += lvl_pages;
2372 phys_pfn += lvl_pages;
2373 pteval += lvl_pages * VTD_PAGE_SIZE;
2375 /* If the next PTE would be the first in a new page, then we
2376 * need to flush the cache on the entries we've just written.
2377 * And then we'll need to recalculate 'pte', so clear it and
2378 * let it get set again in the if (!pte) block above.
2380 * If we're done (!nr_pages) we need to flush the cache too.
2382 * Also if we've been setting superpages, we may need to
2383 * recalculate 'pte' and switch back to smaller pages for the
2384 * end of the mapping, if the trailing size is not enough to
2385 * use another superpage (i.e. nr_pages < lvl_pages).
2388 if (!nr_pages || first_pte_in_page(pte) ||
2389 (largepage_lvl > 1 && nr_pages < lvl_pages)) {
2390 domain_flush_cache(domain, first_pte,
2391 (void *)pte - (void *)first_pte);
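/*
 * Illustrative sketch, not part of the original source: the bookkeeping each
 * loop iteration above performs. One iteration consumes lvl_pages 4KiB
 * pages, where lvl_pages is 512^(level - 1) for a 9-bit stride: 1 page at
 * level 1, 512 (2MiB) at level 2, 262144 (1GiB) at level 3. The PTE value
 * advances by the same amount in bytes (assuming VTD_PAGE_SIZE == 4096).
 */
static void demo_mapping_advance(unsigned long *iov_pfn,
                                 unsigned long *phys_pfn,
                                 unsigned long *nr_pages,
                                 unsigned long long *pteval, int level)
{
        unsigned long lvl_pages = 1UL << (9 * (level - 1));

        *nr_pages -= lvl_pages;
        *iov_pfn  += lvl_pages;
        *phys_pfn += lvl_pages;
        *pteval   += (unsigned long long)lvl_pages * 4096;
}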
2400 domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2401 unsigned long phys_pfn, unsigned long nr_pages, int prot)
2404 struct intel_iommu *iommu;
2406 /* Do the real mapping first */
2407 ret = __domain_mapping(domain, iov_pfn, phys_pfn, nr_pages, prot);
2411 for_each_domain_iommu(iommu_id, domain) {
2412 iommu = g_iommus[iommu_id];
2413 __mapping_notify_one(iommu, domain, iov_pfn, nr_pages);
2419 static void domain_context_clear_one(struct intel_iommu *iommu, u8 bus, u8 devfn)
2421 unsigned long flags;
2422 struct context_entry *context;
2428 spin_lock_irqsave(&iommu->lock, flags);
2429 context = iommu_context_addr(iommu, bus, devfn, 0);
2431 spin_unlock_irqrestore(&iommu->lock, flags);
2434 did_old = context_domain_id(context);
2435 context_clear_entry(context);
2436 __iommu_flush_cache(iommu, context, sizeof(*context));
2437 spin_unlock_irqrestore(&iommu->lock, flags);
2438 iommu->flush.flush_context(iommu,
2440 (((u16)bus) << 8) | devfn,
2441 DMA_CCMD_MASK_NOBIT,
2442 DMA_CCMD_DEVICE_INVL);
2443 iommu->flush.flush_iotlb(iommu,
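/*
 * Illustrative sketch, not part of the original source: the source-id passed
 * to the context-cache flush above packs the bus number into the high byte
 * and devfn into the low byte of a 16-bit value.
 */
static unsigned short demo_source_id(unsigned char bus, unsigned char devfn)
{
        return ((unsigned short)bus << 8) | devfn; /* e.g. 3a:02.1 -> 0x3a11 */
}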
2450 static inline void unlink_domain_info(struct device_domain_info *info)
2452 assert_spin_locked(&device_domain_lock);
2453 list_del(&info->link);
2454 list_del(&info->global);
2456 dev_iommu_priv_set(info->dev, NULL);
2459 static void domain_remove_dev_info(struct dmar_domain *domain)
2461 struct device_domain_info *info, *tmp;
2462 unsigned long flags;
2464 spin_lock_irqsave(&device_domain_lock, flags);
2465 list_for_each_entry_safe(info, tmp, &domain->devices, link)
2466 __dmar_remove_one_dev_info(info);
2467 spin_unlock_irqrestore(&device_domain_lock, flags);
2470 struct dmar_domain *find_domain(struct device *dev)
2472 struct device_domain_info *info;
2474 if (unlikely(!dev || !dev->iommu))
2477 if (unlikely(attach_deferred(dev)))
2480 /* No lock here, assumes no domain exit in normal case */
2481 info = get_domain_info(dev);
2483 return info->domain;
2488 static inline struct device_domain_info *
2489 dmar_search_domain_by_dev_info(int segment, int bus, int devfn)
2491 struct device_domain_info *info;
2493 list_for_each_entry(info, &device_domain_list, global)
2494 if (info->segment == segment && info->bus == bus &&
2495 info->devfn == devfn)
2501 static int domain_setup_first_level(struct intel_iommu *iommu,
2502 struct dmar_domain *domain,
2506 int flags = PASID_FLAG_SUPERVISOR_MODE;
2507 struct dma_pte *pgd = domain->pgd;
2511 * Skip top levels of page tables for iommu which has
2512 * less agaw than default. Unnecessary for PT mode.
2514 for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) {
2515 pgd = phys_to_virt(dma_pte_addr(pgd));
2516 if (!dma_pte_present(pgd))
2520 level = agaw_to_level(agaw);
2521 if (level != 4 && level != 5)
2524 flags |= (level == 5) ? PASID_FLAG_FL5LP : 0;
2526 return intel_pasid_setup_first_level(iommu, dev, (pgd_t *)pgd, pasid,
2527 domain->iommu_did[iommu->seq_id],
2531 static bool dev_is_real_dma_subdevice(struct device *dev)
2533 return dev && dev_is_pci(dev) &&
2534 pci_real_dma_dev(to_pci_dev(dev)) != to_pci_dev(dev);
2537 static struct dmar_domain *dmar_insert_one_dev_info(struct intel_iommu *iommu,
2540 struct dmar_domain *domain)
2542 struct dmar_domain *found = NULL;
2543 struct device_domain_info *info;
2544 unsigned long flags;
2547 info = alloc_devinfo_mem();
2551 if (!dev_is_real_dma_subdevice(dev)) {
2553 info->devfn = devfn;
2554 info->segment = iommu->segment;
2556 struct pci_dev *pdev = to_pci_dev(dev);
2558 info->bus = pdev->bus->number;
2559 info->devfn = pdev->devfn;
2560 info->segment = pci_domain_nr(pdev->bus);
2563 info->ats_supported = info->pasid_supported = info->pri_supported = 0;
2564 info->ats_enabled = info->pasid_enabled = info->pri_enabled = 0;
2567 info->domain = domain;
2568 info->iommu = iommu;
2569 info->pasid_table = NULL;
2570 info->auxd_enabled = 0;
2571 INIT_LIST_HEAD(&info->subdevices);
2573 if (dev && dev_is_pci(dev)) {
2574 struct pci_dev *pdev = to_pci_dev(info->dev);
2576 if (ecap_dev_iotlb_support(iommu->ecap) &&
2577 pci_ats_supported(pdev) &&
2578 dmar_find_matched_atsr_unit(pdev))
2579 info->ats_supported = 1;
2581 if (sm_supported(iommu)) {
2582 if (pasid_supported(iommu)) {
2583 int features = pci_pasid_features(pdev);
2585 info->pasid_supported = features | 1;
2588 if (info->ats_supported && ecap_prs(iommu->ecap) &&
2589 pci_pri_supported(pdev))
2590 info->pri_supported = 1;
2594 spin_lock_irqsave(&device_domain_lock, flags);
2596 found = find_domain(dev);
2599 struct device_domain_info *info2;
2600 info2 = dmar_search_domain_by_dev_info(info->segment, info->bus,
2603 found = info2->domain;
2609 spin_unlock_irqrestore(&device_domain_lock, flags);
2610 free_devinfo_mem(info);
2611 /* Caller must free the original domain */
2615 spin_lock(&iommu->lock);
2616 ret = domain_attach_iommu(domain, iommu);
2617 spin_unlock(&iommu->lock);
2620 spin_unlock_irqrestore(&device_domain_lock, flags);
2621 free_devinfo_mem(info);
2625 list_add(&info->link, &domain->devices);
2626 list_add(&info->global, &device_domain_list);
2628 dev_iommu_priv_set(dev, info);
2629 spin_unlock_irqrestore(&device_domain_lock, flags);
2631 /* PASID table is mandatory for a PCI device in scalable mode. */
2632 if (dev && dev_is_pci(dev) && sm_supported(iommu)) {
2633 ret = intel_pasid_alloc_table(dev);
2635 dev_err(dev, "PASID table allocation failed\n");
2636 dmar_remove_one_dev_info(dev);
2640 /* Setup the PASID entry for requests without PASID: */
2641 spin_lock_irqsave(&iommu->lock, flags);
2642 if (hw_pass_through && domain_type_is_si(domain))
2643 ret = intel_pasid_setup_pass_through(iommu, domain,
2644 dev, PASID_RID2PASID);
2645 else if (domain_use_first_level(domain))
2646 ret = domain_setup_first_level(iommu, domain, dev,
2649 ret = intel_pasid_setup_second_level(iommu, domain,
2650 dev, PASID_RID2PASID);
2651 spin_unlock_irqrestore(&iommu->lock, flags);
2653 dev_err(dev, "Setup RID2PASID failed\n");
2654 dmar_remove_one_dev_info(dev);
2659 if (dev && domain_context_mapping(domain, dev)) {
2660 dev_err(dev, "Domain context map failed\n");
2661 dmar_remove_one_dev_info(dev);
2668 static int iommu_domain_identity_map(struct dmar_domain *domain,
2669 unsigned long first_vpfn,
2670 unsigned long last_vpfn)
2673 * RMRR range might overlap with the physical memory range; clear it first.
2676 dma_pte_clear_range(domain, first_vpfn, last_vpfn);
2678 return __domain_mapping(domain, first_vpfn,
2679 first_vpfn, last_vpfn - first_vpfn + 1,
2680 DMA_PTE_READ|DMA_PTE_WRITE);
2683 static int md_domain_init(struct dmar_domain *domain, int guest_width);
2685 static int __init si_domain_init(int hw)
2687 struct dmar_rmrr_unit *rmrr;
2691 si_domain = alloc_domain(DOMAIN_FLAG_STATIC_IDENTITY);
2695 if (md_domain_init(si_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
2696 domain_exit(si_domain);
2703 for_each_online_node(nid) {
2704 unsigned long start_pfn, end_pfn;
2707 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
2708 ret = iommu_domain_identity_map(si_domain,
2709 mm_to_dma_pfn(start_pfn),
2710 mm_to_dma_pfn(end_pfn));
2717 * Identity map the RMRRs so that devices with RMRRs could also use the si_domain.
2720 for_each_rmrr_units(rmrr) {
2721 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
2723 unsigned long long start = rmrr->base_address;
2724 unsigned long long end = rmrr->end_address;
2726 if (WARN_ON(end < start ||
2727 end >> agaw_to_width(si_domain->agaw)))
2730 ret = iommu_domain_identity_map(si_domain,
2731 mm_to_dma_pfn(start >> PAGE_SHIFT),
2732 mm_to_dma_pfn(end >> PAGE_SHIFT));
2741 static int domain_add_dev_info(struct dmar_domain *domain, struct device *dev)
2743 struct dmar_domain *ndomain;
2744 struct intel_iommu *iommu;
2747 iommu = device_to_iommu(dev, &bus, &devfn);
2751 ndomain = dmar_insert_one_dev_info(iommu, bus, devfn, dev, domain);
2752 if (ndomain != domain)
2758 static bool device_has_rmrr(struct device *dev)
2760 struct dmar_rmrr_unit *rmrr;
2765 for_each_rmrr_units(rmrr) {
2767 * Return TRUE if this RMRR contains the device that is passed in.
2770 for_each_active_dev_scope(rmrr->devices,
2771 rmrr->devices_cnt, i, tmp)
2773 is_downstream_to_pci_bridge(dev, tmp)) {
2783 * device_rmrr_is_relaxable - Test whether the RMRR of this device
2784 * is relaxable (i.e. may be left unenforced under some conditions)
2785 * @dev: device handle
2787 * We assume that PCI USB devices with RMRRs have them largely
2788 * for historical reasons and that the RMRR space is not actively used post
2789 * boot. This exclusion may change if vendors begin to abuse it.
2791 * The same exception is made for graphics devices, with the requirement that
2792 * any use of the RMRR regions will be torn down before assigning the device
2795 * Return: true if the RMRR is relaxable, false otherwise
2797 static bool device_rmrr_is_relaxable(struct device *dev)
2799 struct pci_dev *pdev;
2801 if (!dev_is_pci(dev))
2804 pdev = to_pci_dev(dev);
2805 if (IS_USB_DEVICE(pdev) || IS_GFX_DEVICE(pdev))
2812 * There are a couple cases where we need to restrict the functionality of
2813 * devices associated with RMRRs. The first is when evaluating a device for
2814 * identity mapping because problems exist when devices are moved in and out
2815 * of domains and their respective RMRR information is lost. This means that
2816 * a device with associated RMRRs will never be in a "passthrough" domain.
2817 * The second is use of the device through the IOMMU API. This interface
2818 * expects to have full control of the IOVA space for the device. We cannot
2819 * satisfy both the requirement that RMRR access is maintained and have an
2820 * unencumbered IOVA space. We also have no ability to quiesce the device's
2821 * use of the RMRR space or even inform the IOMMU API user of the restriction.
2822 * We therefore prevent devices associated with an RMRR from participating in
2823 * the IOMMU API, which eliminates them from device assignment.
2825 * In both cases, devices which have relaxable RMRRs are not concerned by this
2826 * restriction. See device_rmrr_is_relaxable comment.
2828 static bool device_is_rmrr_locked(struct device *dev)
2830 if (!device_has_rmrr(dev))
2833 if (device_rmrr_is_relaxable(dev))
2840 * Return the required default domain type for a specific device.
2842 * @dev: the device in question
2846 * - IOMMU_DOMAIN_DMA: device requires a dynamic mapping domain
2847 * - IOMMU_DOMAIN_IDENTITY: device requires an identity mapping domain
2848 * - 0: both identity and dynamic domains work for this device
2850 static int device_def_domain_type(struct device *dev)
2852 if (dev_is_pci(dev)) {
2853 struct pci_dev *pdev = to_pci_dev(dev);
2855 if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev))
2856 return IOMMU_DOMAIN_IDENTITY;
2858 if ((iommu_identity_mapping & IDENTMAP_GFX) && IS_GFX_DEVICE(pdev))
2859 return IOMMU_DOMAIN_IDENTITY;
2865 static void intel_iommu_init_qi(struct intel_iommu *iommu)
2868 * Start from a sane IOMMU hardware state.
2869 * If queued invalidation was already initialized by us
2870 * (for example, while enabling interrupt remapping), then
2871 * things are already rolling from a sane state.
2875 * Clear any previous faults.
2877 dmar_fault(-1, iommu);
2879 * Disable queued invalidation if supported and already enabled
2880 * before OS handover.
2882 dmar_disable_qi(iommu);
2885 if (dmar_enable_qi(iommu)) {
2887 * Queued invalidation is not enabled; use register-based invalidation
2889 iommu->flush.flush_context = __iommu_flush_context;
2890 iommu->flush.flush_iotlb = __iommu_flush_iotlb;
2891 pr_info("%s: Using Register based invalidation\n",
2894 iommu->flush.flush_context = qi_flush_context;
2895 iommu->flush.flush_iotlb = qi_flush_iotlb;
2896 pr_info("%s: Using Queued invalidation\n", iommu->name);
2900 static int copy_context_table(struct intel_iommu *iommu,
2901 struct root_entry *old_re,
2902 struct context_entry **tbl,
2905 int tbl_idx, pos = 0, idx, devfn, ret = 0, did;
2906 struct context_entry *new_ce = NULL, ce;
2907 struct context_entry *old_ce = NULL;
2908 struct root_entry re;
2909 phys_addr_t old_ce_phys;
2911 tbl_idx = ext ? bus * 2 : bus;
2912 memcpy(&re, old_re, sizeof(re));
2914 for (devfn = 0; devfn < 256; devfn++) {
2915 /* First calculate the correct index */
2916 idx = (ext ? devfn * 2 : devfn) % 256;
2919 /* First save what we may have and clean up */
2921 tbl[tbl_idx] = new_ce;
2922 __iommu_flush_cache(iommu, new_ce,
2932 old_ce_phys = root_entry_lctp(&re);
2934 old_ce_phys = root_entry_uctp(&re);
2937 if (ext && devfn == 0) {
2938 /* No LCTP, try UCTP */
2947 old_ce = memremap(old_ce_phys, PAGE_SIZE,
2952 new_ce = alloc_pgtable_page(iommu->node);
2959 /* Now copy the context entry */
2960 memcpy(&ce, old_ce + idx, sizeof(ce));
2962 if (!__context_present(&ce))
2965 did = context_domain_id(&ce);
2966 if (did >= 0 && did < cap_ndoms(iommu->cap))
2967 set_bit(did, iommu->domain_ids);
2970 * We need a marker for copied context entries. This
2971 * marker needs to work for the old format as well as
2972 * for extended context entries.
2974 * Bit 67 of the context entry is used. In the old
2975 * format this bit is available to software, in the
2976 * extended format it is the PGE bit, but PGE is ignored
2977 * by HW if PASIDs are disabled (and thus still available).
2980 * So disable PASIDs first and then mark the entry
2981 * copied. This means that we don't copy PASID
2982 * translations from the old kernel, but this is fine as
2983 * faults there are not fatal.
2985 context_clear_pasid_enable(&ce);
2986 context_set_copied(&ce);
2991 tbl[tbl_idx + pos] = new_ce;
2993 __iommu_flush_cache(iommu, new_ce, VTD_PAGE_SIZE);
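/*
 * Illustrative sketch, not part of the original source: the index math used
 * when copying context tables. In extended mode each bus owns two context
 * tables (devfn 0-127 in the lower one, 128-255 in the upper), so the table
 * index doubles and the per-table slot index wraps at 256 entries.
 */
static void demo_copy_index(int bus, int devfn, int ext,
                            int *tbl_idx, int *idx)
{
        *tbl_idx = ext ? bus * 2 : bus;         /* which context table  */
        *idx = (ext ? devfn * 2 : devfn) % 256; /* slot within a table  */
}
/* ext: devfn 128 wraps to idx 0 and lands in the upper table, bus * 2 + 1 */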
3002 static int copy_translation_tables(struct intel_iommu *iommu)
3004 struct context_entry **ctxt_tbls;
3005 struct root_entry *old_rt;
3006 phys_addr_t old_rt_phys;
3007 int ctxt_table_entries;
3008 unsigned long flags;
3013 rtaddr_reg = dmar_readq(iommu->reg + DMAR_RTADDR_REG);
3014 ext = !!(rtaddr_reg & DMA_RTADDR_RTT);
3015 new_ext = !!ecap_ecs(iommu->ecap);
3018 * The RTT bit can only be changed when translation is disabled,
3019 * but disabling translation means opening a window for data
3020 * corruption. So bail out and don't copy anything if we would
3021 * have to change the bit.
3026 old_rt_phys = rtaddr_reg & VTD_PAGE_MASK;
3030 old_rt = memremap(old_rt_phys, PAGE_SIZE, MEMREMAP_WB);
3034 /* This is too big for the stack - allocate it from slab */
3035 ctxt_table_entries = ext ? 512 : 256;
3037 ctxt_tbls = kcalloc(ctxt_table_entries, sizeof(void *), GFP_KERNEL);
3041 for (bus = 0; bus < 256; bus++) {
3042 ret = copy_context_table(iommu, &old_rt[bus],
3043 ctxt_tbls, bus, ext);
3045 pr_err("%s: Failed to copy context table for bus %d\n",
3051 spin_lock_irqsave(&iommu->lock, flags);
3053 /* Context tables are copied, now write them to the root_entry table */
3054 for (bus = 0; bus < 256; bus++) {
3055 int idx = ext ? bus * 2 : bus;
3058 if (ctxt_tbls[idx]) {
3059 val = virt_to_phys(ctxt_tbls[idx]) | 1;
3060 iommu->root_entry[bus].lo = val;
3063 if (!ext || !ctxt_tbls[idx + 1])
3066 val = virt_to_phys(ctxt_tbls[idx + 1]) | 1;
3067 iommu->root_entry[bus].hi = val;
3070 spin_unlock_irqrestore(&iommu->lock, flags);
3074 __iommu_flush_cache(iommu, iommu->root_entry, PAGE_SIZE);
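/*
 * Illustrative sketch, not part of the original source: how the loop above
 * forms a root-table entry. The low 12 bits of a 4KiB-aligned context-table
 * address are free, and bit 0 is the present bit; the extended format uses
 * the 'hi' half of the root entry for the bus's upper context table.
 */
static unsigned long long demo_root_entry_val(unsigned long long ctx_phys)
{
        return (ctx_phys & ~0xFFFULL) | 1ULL;   /* aligned address | present */
}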
3084 #ifdef CONFIG_INTEL_IOMMU_SVM
3085 static ioasid_t intel_vcmd_ioasid_alloc(ioasid_t min, ioasid_t max, void *data)
3087 struct intel_iommu *iommu = data;
3091 return INVALID_IOASID;
3093 * The VT-d virtual command interface always uses the full 20-bit
3094 * PASID range. The host can partition the guest PASID range based on
3095 * policies, but that is out of the guest's control.
3097 if (min < PASID_MIN || max > intel_pasid_max_id)
3098 return INVALID_IOASID;
3100 if (vcmd_alloc_pasid(iommu, &ioasid))
3101 return INVALID_IOASID;
3106 static void intel_vcmd_ioasid_free(ioasid_t ioasid, void *data)
3108 struct intel_iommu *iommu = data;
3113 * Sanity checking of the ioasid owner is done at the upper layer, e.g. VFIO.
3114 * We can only free the PASID when all the devices are unbound.
3116 if (ioasid_find(NULL, ioasid, NULL)) {
3117 pr_alert("Cannot free active IOASID %d\n", ioasid);
3120 vcmd_free_pasid(iommu, ioasid);
3123 static void register_pasid_allocator(struct intel_iommu *iommu)
3126 * If we are running in the host, there is no need for a custom
3127 * allocator, since PASIDs are allocated host system-wide.
3129 if (!cap_caching_mode(iommu->cap))
3132 if (!sm_supported(iommu)) {
3133 pr_warn("VT-d Scalable Mode not enabled, no PASID allocation\n");
3138 * Register a custom PASID allocator if we are running in a guest;
3139 * guest PASIDs must be obtained via the virtual command interface.
3140 * There can be multiple vIOMMUs in each guest but only one allocator
3141 * is active. All vIOMMU allocators eventually call the same host allocator.
3144 if (!vccap_pasid(iommu->vccap))
3147 pr_info("Register custom PASID allocator\n");
3148 iommu->pasid_allocator.alloc = intel_vcmd_ioasid_alloc;
3149 iommu->pasid_allocator.free = intel_vcmd_ioasid_free;
3150 iommu->pasid_allocator.pdata = (void *)iommu;
3151 if (ioasid_register_allocator(&iommu->pasid_allocator)) {
3152 pr_warn("Custom PASID allocator failed, scalable mode disabled\n");
3154 * Disable scalable mode on this IOMMU if there
3155 * is no custom allocator. Mixing SM-capable
3156 * and non-SM vIOMMUs is not supported.
3163 static int __init init_dmars(void)
3165 struct dmar_drhd_unit *drhd;
3166 struct intel_iommu *iommu;
3172 * initialize and program root entry to not present
3175 for_each_drhd_unit(drhd) {
3177 * lock not needed as this is only incremented in the single-
3178 * threaded kernel __init code path; all other accesses are read-only
3181 if (g_num_of_iommus < DMAR_UNITS_SUPPORTED) {
3185 pr_err_once("Exceeded %d IOMMUs\n", DMAR_UNITS_SUPPORTED);
3188 /* Preallocate enough resources for IOMMU hot-addition */
3189 if (g_num_of_iommus < DMAR_UNITS_SUPPORTED)
3190 g_num_of_iommus = DMAR_UNITS_SUPPORTED;
3192 g_iommus = kcalloc(g_num_of_iommus, sizeof(struct intel_iommu *),
3195 pr_err("Allocating global iommu array failed\n");
3200 for_each_iommu(iommu, drhd) {
3201 if (drhd->ignored) {
3202 iommu_disable_translation(iommu);
3207 * Find the max PASID size of all IOMMUs in the system.
3208 * We need to ensure the system pasid table is no bigger
3209 * than the smallest supported.
3211 if (pasid_supported(iommu)) {
3212 u32 temp = 2 << ecap_pss(iommu->ecap);
3214 intel_pasid_max_id = min_t(u32, temp,
3215 intel_pasid_max_id);
3218 g_iommus[iommu->seq_id] = iommu;
3220 intel_iommu_init_qi(iommu);
3222 ret = iommu_init_domains(iommu);
3226 init_translation_status(iommu);
3228 if (translation_pre_enabled(iommu) && !is_kdump_kernel()) {
3229 iommu_disable_translation(iommu);
3230 clear_translation_pre_enabled(iommu);
3231 pr_warn("Translation was enabled for %s but we are not in kdump mode\n",
3237 * we could share the same root & context tables
3238 * among all IOMMUs. Need to split it later.
3240 ret = iommu_alloc_root_entry(iommu);
3244 if (translation_pre_enabled(iommu)) {
3245 pr_info("Translation already enabled - trying to copy translation structures\n");
3247 ret = copy_translation_tables(iommu);
3250 * We found the IOMMU with translation
3251 * enabled - but failed to copy over the
3252 * old root-entry table. Try to proceed
3253 * by disabling translation now and
3254 * allocating a clean root-entry table.
3255 * This might cause DMAR faults, but
3256 * probably the dump will still succeed.
3258 pr_err("Failed to copy translation tables from previous kernel for %s\n",
3260 iommu_disable_translation(iommu);
3261 clear_translation_pre_enabled(iommu);
3263 pr_info("Copied translation tables from previous kernel for %s\n",
3268 if (!ecap_pass_through(iommu->ecap))
3269 hw_pass_through = 0;
3270 intel_svm_check(iommu);
3274 * Now that qi is enabled on all iommus, set the root entry and flush
3275 * caches. This is required on some Intel X58 chipsets, otherwise the
3276 * flush_context function will loop forever and the boot hangs.
3278 for_each_active_iommu(iommu, drhd) {
3279 iommu_flush_write_buffer(iommu);
3280 #ifdef CONFIG_INTEL_IOMMU_SVM
3281 register_pasid_allocator(iommu);
3283 iommu_set_root_entry(iommu);
3284 iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
3285 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
3288 #ifdef CONFIG_INTEL_IOMMU_BROKEN_GFX_WA
3293 iommu_identity_mapping |= IDENTMAP_GFX;
3295 check_tylersburg_isoch();
3297 ret = si_domain_init(hw_pass_through);
3304 * global invalidate context cache
3305 * global invalidate iotlb
3306 * enable translation
3308 for_each_iommu(iommu, drhd) {
3309 if (drhd->ignored) {
3311 * we always have to disable PMRs or DMA may fail on this device
3315 iommu_disable_protect_mem_regions(iommu);
3319 iommu_flush_write_buffer(iommu);
3321 #ifdef CONFIG_INTEL_IOMMU_SVM
3322 if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
3324 * Calling dmar_alloc_hwirq() with dmar_global_lock held
3325 * could cause a possible lock race condition.
3327 up_write(&dmar_global_lock);
3328 ret = intel_svm_enable_prq(iommu);
3329 down_write(&dmar_global_lock);
3334 ret = dmar_set_interrupt(iommu);
3342 for_each_active_iommu(iommu, drhd) {
3343 disable_dmar_iommu(iommu);
3344 free_dmar_iommu(iommu);
3353 static inline int iommu_domain_cache_init(void)
3357 iommu_domain_cache = kmem_cache_create("iommu_domain",
3358 sizeof(struct dmar_domain),
3363 if (!iommu_domain_cache) {
3364 pr_err("Couldn't create iommu_domain cache\n");
3371 static inline int iommu_devinfo_cache_init(void)
3375 iommu_devinfo_cache = kmem_cache_create("iommu_devinfo",
3376 sizeof(struct device_domain_info),
3380 if (!iommu_devinfo_cache) {
3381 pr_err("Couldn't create devinfo cache\n");
3388 static int __init iommu_init_mempool(void)
3391 ret = iova_cache_get();
3395 ret = iommu_domain_cache_init();
3399 ret = iommu_devinfo_cache_init();
3403 kmem_cache_destroy(iommu_domain_cache);
3410 static void __init iommu_exit_mempool(void)
3412 kmem_cache_destroy(iommu_devinfo_cache);
3413 kmem_cache_destroy(iommu_domain_cache);
3417 static void __init init_no_remapping_devices(void)
3419 struct dmar_drhd_unit *drhd;
3423 for_each_drhd_unit(drhd) {
3424 if (!drhd->include_all) {
3425 for_each_active_dev_scope(drhd->devices,
3426 drhd->devices_cnt, i, dev)
3428 /* ignore DMAR unit if no devices exist */
3429 if (i == drhd->devices_cnt)
3434 for_each_active_drhd_unit(drhd) {
3435 if (drhd->include_all)
3438 for_each_active_dev_scope(drhd->devices,
3439 drhd->devices_cnt, i, dev)
3440 if (!dev_is_pci(dev) || !IS_GFX_DEVICE(to_pci_dev(dev)))
3442 if (i < drhd->devices_cnt)
3445 /* This IOMMU has *only* gfx devices. Either bypass it or
3446 set the gfx_mapped flag, as appropriate */
3447 drhd->gfx_dedicated = 1;
3453 #ifdef CONFIG_SUSPEND
3454 static int init_iommu_hw(void)
3456 struct dmar_drhd_unit *drhd;
3457 struct intel_iommu *iommu = NULL;
3459 for_each_active_iommu(iommu, drhd)
3461 dmar_reenable_qi(iommu);
3463 for_each_iommu(iommu, drhd) {
3464 if (drhd->ignored) {
3466 * we always have to disable PMRs or DMA may fail on this device
3470 iommu_disable_protect_mem_regions(iommu);
3474 iommu_flush_write_buffer(iommu);
3476 iommu_set_root_entry(iommu);
3478 iommu->flush.flush_context(iommu, 0, 0, 0,
3479 DMA_CCMD_GLOBAL_INVL);
3480 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
3481 iommu_enable_translation(iommu);
3482 iommu_disable_protect_mem_regions(iommu);
3488 static void iommu_flush_all(void)
3490 struct dmar_drhd_unit *drhd;
3491 struct intel_iommu *iommu;
3493 for_each_active_iommu(iommu, drhd) {
3494 iommu->flush.flush_context(iommu, 0, 0, 0,
3495 DMA_CCMD_GLOBAL_INVL);
3496 iommu->flush.flush_iotlb(iommu, 0, 0, 0,
3497 DMA_TLB_GLOBAL_FLUSH);
3501 static int iommu_suspend(void)
3503 struct dmar_drhd_unit *drhd;
3504 struct intel_iommu *iommu = NULL;
3507 for_each_active_iommu(iommu, drhd) {
3508 iommu->iommu_state = kcalloc(MAX_SR_DMAR_REGS, sizeof(u32),
3510 if (!iommu->iommu_state)
3516 for_each_active_iommu(iommu, drhd) {
3517 iommu_disable_translation(iommu);
3519 raw_spin_lock_irqsave(&iommu->register_lock, flag);
3521 iommu->iommu_state[SR_DMAR_FECTL_REG] =
3522 readl(iommu->reg + DMAR_FECTL_REG);
3523 iommu->iommu_state[SR_DMAR_FEDATA_REG] =
3524 readl(iommu->reg + DMAR_FEDATA_REG);
3525 iommu->iommu_state[SR_DMAR_FEADDR_REG] =
3526 readl(iommu->reg + DMAR_FEADDR_REG);
3527 iommu->iommu_state[SR_DMAR_FEUADDR_REG] =
3528 readl(iommu->reg + DMAR_FEUADDR_REG);
3530 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
3535 for_each_active_iommu(iommu, drhd)
3536 kfree(iommu->iommu_state);
3541 static void iommu_resume(void)
3543 struct dmar_drhd_unit *drhd;
3544 struct intel_iommu *iommu = NULL;
3547 if (init_iommu_hw()) {
3549 panic("tboot: IOMMU setup failed, DMAR can not resume!\n");
3551 WARN(1, "IOMMU setup failed, DMAR can not resume!\n");
3555 for_each_active_iommu(iommu, drhd) {
3557 raw_spin_lock_irqsave(&iommu->register_lock, flag);
3559 writel(iommu->iommu_state[SR_DMAR_FECTL_REG],
3560 iommu->reg + DMAR_FECTL_REG);
3561 writel(iommu->iommu_state[SR_DMAR_FEDATA_REG],
3562 iommu->reg + DMAR_FEDATA_REG);
3563 writel(iommu->iommu_state[SR_DMAR_FEADDR_REG],
3564 iommu->reg + DMAR_FEADDR_REG);
3565 writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG],
3566 iommu->reg + DMAR_FEUADDR_REG);
3568 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
3571 for_each_active_iommu(iommu, drhd)
3572 kfree(iommu->iommu_state);
3575 static struct syscore_ops iommu_syscore_ops = {
3576 .resume = iommu_resume,
3577 .suspend = iommu_suspend,
3580 static void __init init_iommu_pm_ops(void)
3582 register_syscore_ops(&iommu_syscore_ops);
3586 static inline void init_iommu_pm_ops(void) {}
3587 #endif /* CONFIG_PM */
3589 static int rmrr_sanity_check(struct acpi_dmar_reserved_memory *rmrr)
3591 if (!IS_ALIGNED(rmrr->base_address, PAGE_SIZE) ||
3592 !IS_ALIGNED(rmrr->end_address + 1, PAGE_SIZE) ||
3593 rmrr->end_address <= rmrr->base_address ||
3594 arch_rmrr_sanity_check(rmrr))
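/*
 * Illustrative sketch, not part of the original source: the shape of the
 * checks above. An RMRR [base, end] (end inclusive) is sane when both
 * boundaries sit on 4KiB page boundaries and the range is non-empty;
 * arch_rmrr_sanity_check() adds platform-specific rules not modelled here.
 */
static int demo_rmrr_sane(unsigned long long base, unsigned long long end)
{
        return (base & 0xFFF) == 0 &&           /* base page-aligned        */
               ((end + 1) & 0xFFF) == 0 &&      /* inclusive end ends a page */
               end > base;                      /* non-empty range          */
}
/* demo_rmrr_sane(0x1000, 0x1FFF) == 1; demo_rmrr_sane(0x1000, 0x1800) == 0 */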
3600 int __init dmar_parse_one_rmrr(struct acpi_dmar_header *header, void *arg)
3602 struct acpi_dmar_reserved_memory *rmrr;
3603 struct dmar_rmrr_unit *rmrru;
3605 rmrr = (struct acpi_dmar_reserved_memory *)header;
3606 if (rmrr_sanity_check(rmrr)) {
3608 "Your BIOS is broken; bad RMRR [%#018Lx-%#018Lx]\n"
3609 "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
3610 rmrr->base_address, rmrr->end_address,
3611 dmi_get_system_info(DMI_BIOS_VENDOR),
3612 dmi_get_system_info(DMI_BIOS_VERSION),
3613 dmi_get_system_info(DMI_PRODUCT_VERSION));
3614 add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
3617 rmrru = kzalloc(sizeof(*rmrru), GFP_KERNEL);
3621 rmrru->hdr = header;
3623 rmrru->base_address = rmrr->base_address;
3624 rmrru->end_address = rmrr->end_address;
3626 rmrru->devices = dmar_alloc_dev_scope((void *)(rmrr + 1),
3627 ((void *)rmrr) + rmrr->header.length,
3628 &rmrru->devices_cnt);
3629 if (rmrru->devices_cnt && rmrru->devices == NULL)
3632 list_add(&rmrru->list, &dmar_rmrr_units);
3641 static struct dmar_atsr_unit *dmar_find_atsr(struct acpi_dmar_atsr *atsr)
3643 struct dmar_atsr_unit *atsru;
3644 struct acpi_dmar_atsr *tmp;
3646 list_for_each_entry_rcu(atsru, &dmar_atsr_units, list,
3648 tmp = (struct acpi_dmar_atsr *)atsru->hdr;
3649 if (atsr->segment != tmp->segment)
3651 if (atsr->header.length != tmp->header.length)
3653 if (memcmp(atsr, tmp, atsr->header.length) == 0)
3660 int dmar_parse_one_atsr(struct acpi_dmar_header *hdr, void *arg)
3662 struct acpi_dmar_atsr *atsr;
3663 struct dmar_atsr_unit *atsru;
3665 if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled)
3668 atsr = container_of(hdr, struct acpi_dmar_atsr, header);
3669 atsru = dmar_find_atsr(atsr);
3673 atsru = kzalloc(sizeof(*atsru) + hdr->length, GFP_KERNEL);
3678 * If memory is allocated from slab by ACPI _DSM method, we need to
3679 * copy the memory content because the memory buffer will be freed on exit.
3682 atsru->hdr = (void *)(atsru + 1);
3683 memcpy(atsru->hdr, hdr, hdr->length);
3684 atsru->include_all = atsr->flags & 0x1;
3685 if (!atsru->include_all) {
3686 atsru->devices = dmar_alloc_dev_scope((void *)(atsr + 1),
3687 (void *)atsr + atsr->header.length,
3688 &atsru->devices_cnt);
3689 if (atsru->devices_cnt && atsru->devices == NULL) {
3695 list_add_rcu(&atsru->list, &dmar_atsr_units);
3700 static void intel_iommu_free_atsr(struct dmar_atsr_unit *atsru)
3702 dmar_free_dev_scope(&atsru->devices, &atsru->devices_cnt);
3706 int dmar_release_one_atsr(struct acpi_dmar_header *hdr, void *arg)
3708 struct acpi_dmar_atsr *atsr;
3709 struct dmar_atsr_unit *atsru;
3711 atsr = container_of(hdr, struct acpi_dmar_atsr, header);
3712 atsru = dmar_find_atsr(atsr);
3714 list_del_rcu(&atsru->list);
3716 intel_iommu_free_atsr(atsru);
3722 int dmar_check_one_atsr(struct acpi_dmar_header *hdr, void *arg)
3726 struct acpi_dmar_atsr *atsr;
3727 struct dmar_atsr_unit *atsru;
3729 atsr = container_of(hdr, struct acpi_dmar_atsr, header);
3730 atsru = dmar_find_atsr(atsr);
3734 if (!atsru->include_all && atsru->devices && atsru->devices_cnt) {
3735 for_each_active_dev_scope(atsru->devices, atsru->devices_cnt,
3743 static int intel_iommu_add(struct dmar_drhd_unit *dmaru)
3746 struct intel_iommu *iommu = dmaru->iommu;
3748 if (g_iommus[iommu->seq_id])
3751 if (hw_pass_through && !ecap_pass_through(iommu->ecap)) {
3752 pr_warn("%s: Doesn't support hardware pass through.\n",
3756 if (!ecap_sc_support(iommu->ecap) &&
3757 domain_update_iommu_snooping(iommu)) {
3758 pr_warn("%s: Doesn't support snooping.\n",
3762 sp = domain_update_iommu_superpage(NULL, iommu) - 1;
3763 if (sp >= 0 && !(cap_super_page_val(iommu->cap) & (1 << sp))) {
3764 pr_warn("%s: Doesn't support large page.\n",
3770 * Disable translation if already enabled prior to OS handover.
3772 if (iommu->gcmd & DMA_GCMD_TE)
3773 iommu_disable_translation(iommu);
3775 g_iommus[iommu->seq_id] = iommu;
3776 ret = iommu_init_domains(iommu);
3778 ret = iommu_alloc_root_entry(iommu);
3782 intel_svm_check(iommu);
3784 if (dmaru->ignored) {
3786 * we always have to disable PMRs or DMA may fail on this device
3789 iommu_disable_protect_mem_regions(iommu);
3793 intel_iommu_init_qi(iommu);
3794 iommu_flush_write_buffer(iommu);
3796 #ifdef CONFIG_INTEL_IOMMU_SVM
3797 if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
3798 ret = intel_svm_enable_prq(iommu);
3803 ret = dmar_set_interrupt(iommu);
3807 iommu_set_root_entry(iommu);
3808 iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
3809 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
3810 iommu_enable_translation(iommu);
3812 iommu_disable_protect_mem_regions(iommu);
3816 disable_dmar_iommu(iommu);
3818 free_dmar_iommu(iommu);
3822 int dmar_iommu_hotplug(struct dmar_drhd_unit *dmaru, bool insert)
3825 struct intel_iommu *iommu = dmaru->iommu;
3827 if (!intel_iommu_enabled)
3833 ret = intel_iommu_add(dmaru);
3835 disable_dmar_iommu(iommu);
3836 free_dmar_iommu(iommu);
3842 static void intel_iommu_free_dmars(void)
3844 struct dmar_rmrr_unit *rmrru, *rmrr_n;
3845 struct dmar_atsr_unit *atsru, *atsr_n;
3847 list_for_each_entry_safe(rmrru, rmrr_n, &dmar_rmrr_units, list) {
3848 list_del(&rmrru->list);
3849 dmar_free_dev_scope(&rmrru->devices, &rmrru->devices_cnt);
3853 list_for_each_entry_safe(atsru, atsr_n, &dmar_atsr_units, list) {
3854 list_del(&atsru->list);
3855 intel_iommu_free_atsr(atsru);
3859 int dmar_find_matched_atsr_unit(struct pci_dev *dev)
3862 struct pci_bus *bus;
3863 struct pci_dev *bridge = NULL;
3865 struct acpi_dmar_atsr *atsr;
3866 struct dmar_atsr_unit *atsru;
3868 dev = pci_physfn(dev);
3869 for (bus = dev->bus; bus; bus = bus->parent) {
3871 /* If it's an integrated device, allow ATS */
3874 /* Connected via non-PCIe: no ATS */
3875 if (!pci_is_pcie(bridge) ||
3876 pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE)
3878 /* If we found the root port, look it up in the ATSR */
3879 if (pci_pcie_type(bridge) == PCI_EXP_TYPE_ROOT_PORT)
3884 list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) {
3885 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
3886 if (atsr->segment != pci_domain_nr(dev->bus))
3889 for_each_dev_scope(atsru->devices, atsru->devices_cnt, i, tmp)
3890 if (tmp == &bridge->dev)
3893 if (atsru->include_all)
3903 int dmar_iommu_notify_scope_dev(struct dmar_pci_notify_info *info)
3906 struct dmar_rmrr_unit *rmrru;
3907 struct dmar_atsr_unit *atsru;
3908 struct acpi_dmar_atsr *atsr;
3909 struct acpi_dmar_reserved_memory *rmrr;
3911 if (!intel_iommu_enabled && system_state >= SYSTEM_RUNNING)
3914 list_for_each_entry(rmrru, &dmar_rmrr_units, list) {
3915 rmrr = container_of(rmrru->hdr,
3916 struct acpi_dmar_reserved_memory, header);
3917 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
3918 ret = dmar_insert_dev_scope(info, (void *)(rmrr + 1),
3919 ((void *)rmrr) + rmrr->header.length,
3920 rmrr->segment, rmrru->devices,
3921 rmrru->devices_cnt);
3924 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
3925 dmar_remove_dev_scope(info, rmrr->segment,
3926 rmrru->devices, rmrru->devices_cnt);
3930 list_for_each_entry(atsru, &dmar_atsr_units, list) {
3931 if (atsru->include_all)
3934 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
3935 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
3936 ret = dmar_insert_dev_scope(info, (void *)(atsr + 1),
3937 (void *)atsr + atsr->header.length,
3938 atsr->segment, atsru->devices,
3939 atsru->devices_cnt);
3944 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
3945 if (dmar_remove_dev_scope(info, atsr->segment,
3946 atsru->devices, atsru->devices_cnt))
3954 static int intel_iommu_memory_notifier(struct notifier_block *nb,
3955 unsigned long val, void *v)
3957 struct memory_notify *mhp = v;
3958 unsigned long start_vpfn = mm_to_dma_pfn(mhp->start_pfn);
3959 unsigned long last_vpfn = mm_to_dma_pfn(mhp->start_pfn +
3963 case MEM_GOING_ONLINE:
3964 if (iommu_domain_identity_map(si_domain,
3965 start_vpfn, last_vpfn)) {
3966 pr_warn("Failed to build identity map for [%lx-%lx]\n",
3967 start_vpfn, last_vpfn);
3973 case MEM_CANCEL_ONLINE:
3975 struct dmar_drhd_unit *drhd;
3976 struct intel_iommu *iommu;
3977 struct page *freelist;
3979 freelist = domain_unmap(si_domain,
3980 start_vpfn, last_vpfn,
3984 for_each_active_iommu(iommu, drhd)
3985 iommu_flush_iotlb_psi(iommu, si_domain,
3986 start_vpfn, mhp->nr_pages,
3989 dma_free_pagelist(freelist);
3997 static struct notifier_block intel_iommu_memory_nb = {
3998 .notifier_call = intel_iommu_memory_notifier,
4002 static void free_all_cpu_cached_iovas(unsigned int cpu)
4006 for (i = 0; i < g_num_of_iommus; i++) {
4007 struct intel_iommu *iommu = g_iommus[i];
4008 struct dmar_domain *domain;
4014 for (did = 0; did < cap_ndoms(iommu->cap); did++) {
4015 domain = get_iommu_domain(iommu, (u16)did);
4017 if (!domain || domain->domain.type != IOMMU_DOMAIN_DMA)
4020 iommu_dma_free_cpu_cached_iovas(cpu, &domain->domain);
4025 static int intel_iommu_cpu_dead(unsigned int cpu)
4027 free_all_cpu_cached_iovas(cpu);
4031 static void intel_disable_iommus(void)
4033 struct intel_iommu *iommu = NULL;
4034 struct dmar_drhd_unit *drhd;
4036 for_each_iommu(iommu, drhd)
4037 iommu_disable_translation(iommu);
4040 void intel_iommu_shutdown(void)
4042 struct dmar_drhd_unit *drhd;
4043 struct intel_iommu *iommu = NULL;
4045 if (no_iommu || dmar_disabled)
4048 down_write(&dmar_global_lock);
4050 /* Disable PMRs explicitly here. */
4051 for_each_iommu(iommu, drhd)
4052 iommu_disable_protect_mem_regions(iommu);
4054 /* Make sure the IOMMUs are switched off */
4055 intel_disable_iommus();
4057 up_write(&dmar_global_lock);
4060 static inline struct intel_iommu *dev_to_intel_iommu(struct device *dev)
4062 struct iommu_device *iommu_dev = dev_to_iommu_device(dev);
4064 return container_of(iommu_dev, struct intel_iommu, iommu);
4067 static ssize_t intel_iommu_show_version(struct device *dev,
4068 struct device_attribute *attr,
4071 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4072 u32 ver = readl(iommu->reg + DMAR_VER_REG);
4073 return sprintf(buf, "%d:%d\n",
4074 DMAR_VER_MAJOR(ver), DMAR_VER_MINOR(ver));
4076 static DEVICE_ATTR(version, S_IRUGO, intel_iommu_show_version, NULL);
4078 static ssize_t intel_iommu_show_address(struct device *dev,
4079 struct device_attribute *attr,
4082 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4083 return sprintf(buf, "%llx\n", iommu->reg_phys);
4085 static DEVICE_ATTR(address, S_IRUGO, intel_iommu_show_address, NULL);
4087 static ssize_t intel_iommu_show_cap(struct device *dev,
4088 struct device_attribute *attr,
4091 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4092 return sprintf(buf, "%llx\n", iommu->cap);
4094 static DEVICE_ATTR(cap, S_IRUGO, intel_iommu_show_cap, NULL);
4096 static ssize_t intel_iommu_show_ecap(struct device *dev,
4097 struct device_attribute *attr,
4100 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4101 return sprintf(buf, "%llx\n", iommu->ecap);
4103 static DEVICE_ATTR(ecap, S_IRUGO, intel_iommu_show_ecap, NULL);
4105 static ssize_t intel_iommu_show_ndoms(struct device *dev,
4106 struct device_attribute *attr,
4109 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4110 return sprintf(buf, "%ld\n", cap_ndoms(iommu->cap));
4112 static DEVICE_ATTR(domains_supported, S_IRUGO, intel_iommu_show_ndoms, NULL);
4114 static ssize_t intel_iommu_show_ndoms_used(struct device *dev,
4115 struct device_attribute *attr,
4118 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4119 return sprintf(buf, "%d\n", bitmap_weight(iommu->domain_ids,
4120 cap_ndoms(iommu->cap)));
4122 static DEVICE_ATTR(domains_used, S_IRUGO, intel_iommu_show_ndoms_used, NULL);
4124 static struct attribute *intel_iommu_attrs[] = {
4125 &dev_attr_version.attr,
4126 &dev_attr_address.attr,
4128 &dev_attr_ecap.attr,
4129 &dev_attr_domains_supported.attr,
4130 &dev_attr_domains_used.attr,
4134 static struct attribute_group intel_iommu_group = {
4135 .name = "intel-iommu",
4136 .attrs = intel_iommu_attrs,
4139 const struct attribute_group *intel_iommu_groups[] = {
4144 static inline bool has_external_pci(void)
4146 struct pci_dev *pdev = NULL;
4148 for_each_pci_dev(pdev)
4149 if (pdev->external_facing)
4155 static int __init platform_optin_force_iommu(void)
4157 if (!dmar_platform_optin() || no_platform_optin || !has_external_pci())
4160 if (no_iommu || dmar_disabled)
4161 pr_info("Intel-IOMMU force enabled due to platform opt in\n");
4164 * If Intel-IOMMU is disabled by default, we will apply identity
4165 * map for all devices except those marked as being untrusted.
4168 iommu_set_default_passthrough(false);
4176 static int __init probe_acpi_namespace_devices(void)
4178 struct dmar_drhd_unit *drhd;
4179 /* To avoid a -Wunused-but-set-variable warning. */
4180 struct intel_iommu *iommu __maybe_unused;
4184 for_each_active_iommu(iommu, drhd) {
4185 for_each_active_dev_scope(drhd->devices,
4186 drhd->devices_cnt, i, dev) {
4187 struct acpi_device_physical_node *pn;
4188 struct iommu_group *group;
4189 struct acpi_device *adev;
4191 if (dev->bus != &acpi_bus_type)
4194 adev = to_acpi_device(dev);
4195 mutex_lock(&adev->physical_node_lock);
4196 list_for_each_entry(pn,
4197 &adev->physical_node_list, node) {
4198 group = iommu_group_get(pn->dev);
4200 iommu_group_put(group);
4204 pn->dev->bus->iommu_ops = &intel_iommu_ops;
4205 ret = iommu_probe_device(pn->dev);
4209 mutex_unlock(&adev->physical_node_lock);
4219 int __init intel_iommu_init(void)
4222 struct dmar_drhd_unit *drhd;
4223 struct intel_iommu *iommu;
4226 * Intel IOMMU is required for a TXT/tboot launch or platform
4227 * opt in, so enforce that.
4229 force_on = (!intel_iommu_tboot_noforce && tboot_force_iommu()) ||
4230 platform_optin_force_iommu();
4232 if (iommu_init_mempool()) {
4234 panic("tboot: Failed to initialize iommu memory\n");
4238 down_write(&dmar_global_lock);
4239 if (dmar_table_init()) {
4241 panic("tboot: Failed to initialize DMAR table\n");
4245 if (dmar_dev_scope_init() < 0) {
4247 panic("tboot: Failed to initialize DMAR device scope\n");
4251 up_write(&dmar_global_lock);
4254 * The bus notifier takes the dmar_global_lock, so lockdep will
4255 * complain later when we register it under the lock.
4257 dmar_register_bus_notifier();
4259 down_write(&dmar_global_lock);
4262 intel_iommu_debugfs_init();
4264 if (no_iommu || dmar_disabled) {
4266 * We exit the function here to ensure IOMMU's remapping and
4267 * mempool aren't set up, which means that the IOMMU's PMRs
4268 * won't be disabled via the call to init_dmars(). So disable
4269 * it explicitly here. The PMRs were setup by tboot prior to
4270 * calling SENTER, but the kernel is expected to reset/tear down the PMRs.
4273 if (intel_iommu_tboot_noforce) {
4274 for_each_iommu(iommu, drhd)
4275 iommu_disable_protect_mem_regions(iommu);
4279 * Make sure the IOMMUs are switched off, even when we
4280 * boot into a kexec kernel and the previous kernel left them enabled.
4283 intel_disable_iommus();
4287 if (list_empty(&dmar_rmrr_units))
4288 pr_info("No RMRR found\n");
4290 if (list_empty(&dmar_atsr_units))
4291 pr_info("No ATSR found\n");
4294 intel_iommu_gfx_mapped = 1;
4296 init_no_remapping_devices();
4301 panic("tboot: Failed to initialize DMARs\n");
4302 pr_err("Initialization failed\n");
4305 up_write(&dmar_global_lock);
4307 init_iommu_pm_ops();
4309 down_read(&dmar_global_lock);
4310 for_each_active_iommu(iommu, drhd) {
4311 iommu_device_sysfs_add(&iommu->iommu, NULL,
4314 iommu_device_set_ops(&iommu->iommu, &intel_iommu_ops);
4315 iommu_device_register(&iommu->iommu);
4317 up_read(&dmar_global_lock);
4319 bus_set_iommu(&pci_bus_type, &intel_iommu_ops);
4320 if (si_domain && !hw_pass_through)
4321 register_memory_notifier(&intel_iommu_memory_nb);
4322 cpuhp_setup_state(CPUHP_IOMMU_INTEL_DEAD, "iommu/intel:dead", NULL,
4323 intel_iommu_cpu_dead);
4325 down_read(&dmar_global_lock);
4326 if (probe_acpi_namespace_devices())
4327 pr_warn("ACPI name space devices didn't probe correctly\n");
4329 /* Finally, we enable the DMA remapping hardware. */
4330 for_each_iommu(iommu, drhd) {
4331 if (!drhd->ignored && !translation_pre_enabled(iommu))
4332 iommu_enable_translation(iommu);
4334 iommu_disable_protect_mem_regions(iommu);
4336 up_read(&dmar_global_lock);
4338 pr_info("Intel(R) Virtualization Technology for Directed I/O\n");
4340 intel_iommu_enabled = 1;
4345 intel_iommu_free_dmars();
4346 up_write(&dmar_global_lock);
4347 iommu_exit_mempool();
4351 static int domain_context_clear_one_cb(struct pci_dev *pdev, u16 alias, void *opaque)
4353 struct intel_iommu *iommu = opaque;
4355 domain_context_clear_one(iommu, PCI_BUS_NUM(alias), alias & 0xff);
4360 * NB - intel-iommu lacks any sort of reference counting for the users of
4361 * dependent devices. If multiple endpoints have intersecting dependent
4362 * devices, unbinding the driver from any one of them will possibly leave
4363 * the others unable to operate.
4365 static void domain_context_clear(struct intel_iommu *iommu, struct device *dev)
4367 if (!iommu || !dev || !dev_is_pci(dev))
4370 pci_for_each_dma_alias(to_pci_dev(dev), &domain_context_clear_one_cb, iommu);
4373 static void __dmar_remove_one_dev_info(struct device_domain_info *info)
4375 struct dmar_domain *domain;
4376 struct intel_iommu *iommu;
4377 unsigned long flags;
4379 assert_spin_locked(&device_domain_lock);
4384 iommu = info->iommu;
4385 domain = info->domain;
4388 if (dev_is_pci(info->dev) && sm_supported(iommu))
4389 intel_pasid_tear_down_entry(iommu, info->dev,
4390 PASID_RID2PASID, false);
4392 iommu_disable_dev_iotlb(info);
4393 if (!dev_is_real_dma_subdevice(info->dev))
4394 domain_context_clear(iommu, info->dev);
4395 intel_pasid_free_table(info->dev);
4398 unlink_domain_info(info);
4400 spin_lock_irqsave(&iommu->lock, flags);
4401 domain_detach_iommu(domain, iommu);
4402 spin_unlock_irqrestore(&iommu->lock, flags);
4404 free_devinfo_mem(info);
4407 static void dmar_remove_one_dev_info(struct device *dev)
4409 struct device_domain_info *info;
4410 unsigned long flags;
4412 spin_lock_irqsave(&device_domain_lock, flags);
4413 info = get_domain_info(dev);
4415 __dmar_remove_one_dev_info(info);
4416 spin_unlock_irqrestore(&device_domain_lock, flags);
4419 static int md_domain_init(struct dmar_domain *domain, int guest_width)
4423 /* calculate AGAW */
4424 domain->gaw = guest_width;
4425 adjust_width = guestwidth_to_adjustwidth(guest_width);
4426 domain->agaw = width_to_agaw(adjust_width);
4428 domain->iommu_coherency = 0;
4429 domain->iommu_snooping = 0;
4430 domain->iommu_superpage = 0;
4431 domain->max_addr = 0;
4433 /* always allocate the top pgd */
4434 domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
4437 domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
4441 static struct iommu_domain *intel_iommu_domain_alloc(unsigned type)
4443 struct dmar_domain *dmar_domain;
4444 struct iommu_domain *domain;
4447 case IOMMU_DOMAIN_DMA:
4448 case IOMMU_DOMAIN_UNMANAGED:
4449 dmar_domain = alloc_domain(0);
4451 pr_err("Can't allocate dmar_domain\n");
4454 if (md_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
4455 pr_err("Domain initialization failed\n");
4456 domain_exit(dmar_domain);
4460 if (type == IOMMU_DOMAIN_DMA &&
4461 iommu_get_dma_cookie(&dmar_domain->domain))
4464 domain = &dmar_domain->domain;
4465 domain->geometry.aperture_start = 0;
4466 domain->geometry.aperture_end =
4467 __DOMAIN_MAX_ADDR(dmar_domain->gaw);
4468 domain->geometry.force_aperture = true;
4471 case IOMMU_DOMAIN_IDENTITY:
4472 return &si_domain->domain;
4480 static void intel_iommu_domain_free(struct iommu_domain *domain)
4482 if (domain != &si_domain->domain)
4483 domain_exit(to_dmar_domain(domain));
4487 * Check whether a @domain could be attached to the @dev through the
4488 * aux-domain attach/detach APIs.
4491 is_aux_domain(struct device *dev, struct iommu_domain *domain)
4493 struct device_domain_info *info = get_domain_info(dev);
4495 return info && info->auxd_enabled &&
4496 domain->type == IOMMU_DOMAIN_UNMANAGED;
4499 static inline struct subdev_domain_info *
4500 lookup_subdev_info(struct dmar_domain *domain, struct device *dev)
4502 struct subdev_domain_info *sinfo;
4504 if (!list_empty(&domain->subdevices)) {
4505 list_for_each_entry(sinfo, &domain->subdevices, link_domain) {
4506 if (sinfo->pdev == dev)
4514 static int auxiliary_link_device(struct dmar_domain *domain,
4517 struct device_domain_info *info = get_domain_info(dev);
4518 struct subdev_domain_info *sinfo = lookup_subdev_info(domain, dev);
4520 assert_spin_locked(&device_domain_lock);
4525 sinfo = kzalloc(sizeof(*sinfo), GFP_ATOMIC);
4526 sinfo->domain = domain;
4528 list_add(&sinfo->link_phys, &info->subdevices);
4529 list_add(&sinfo->link_domain, &domain->subdevices);
4532 return ++sinfo->users;
4535 static int auxiliary_unlink_device(struct dmar_domain *domain,
4538 struct device_domain_info *info = get_domain_info(dev);
4539 struct subdev_domain_info *sinfo = lookup_subdev_info(domain, dev);
4542 assert_spin_locked(&device_domain_lock);
4543 if (WARN_ON(!info || !sinfo || sinfo->users <= 0))
4546 ret = --sinfo->users;
4548 list_del(&sinfo->link_phys);
4549 list_del(&sinfo->link_domain);
4556 static int aux_domain_add_dev(struct dmar_domain *domain,
4560 unsigned long flags;
4561 struct intel_iommu *iommu;
4563 iommu = device_to_iommu(dev, NULL, NULL);
4567 if (domain->default_pasid <= 0) {
4570 /* No private data needed for the default pasid */
4571 pasid = ioasid_alloc(NULL, PASID_MIN,
4572 pci_max_pasids(to_pci_dev(dev)) - 1,
4574 if (pasid == INVALID_IOASID) {
4575 pr_err("Can't allocate default pasid\n");
4578 domain->default_pasid = pasid;
4581 spin_lock_irqsave(&device_domain_lock, flags);
4582 ret = auxiliary_link_device(domain, dev);
4587 * Subdevices from the same physical device can be attached to the
4588 * same domain. For such cases, only the first subdevice attachment
4589 * needs to go through the full steps in this function. So if ret > 1, just goto out.
4596 * iommu->lock must be held to attach domain to iommu and setup the
4597 * pasid entry for second level translation.
4599 spin_lock(&iommu->lock);
4600 ret = domain_attach_iommu(domain, iommu);
4604 /* Setup the PASID entry for mediated devices: */
4605 if (domain_use_first_level(domain))
4606 ret = domain_setup_first_level(iommu, domain, dev,
4607 domain->default_pasid);
4609 ret = intel_pasid_setup_second_level(iommu, domain, dev,
4610 domain->default_pasid);
4614 spin_unlock(&iommu->lock);
4616 spin_unlock_irqrestore(&device_domain_lock, flags);
4621 domain_detach_iommu(domain, iommu);
4623 spin_unlock(&iommu->lock);
4624 auxiliary_unlink_device(domain, dev);
4626 spin_unlock_irqrestore(&device_domain_lock, flags);
4627 if (list_empty(&domain->subdevices) && domain->default_pasid > 0)
4628 ioasid_put(domain->default_pasid);
4633 static void aux_domain_remove_dev(struct dmar_domain *domain,
4636 struct device_domain_info *info;
4637 struct intel_iommu *iommu;
4638 unsigned long flags;
4640 if (!is_aux_domain(dev, &domain->domain))
4643 spin_lock_irqsave(&device_domain_lock, flags);
4644 info = get_domain_info(dev);
4645 iommu = info->iommu;
4647 if (!auxiliary_unlink_device(domain, dev)) {
4648 spin_lock(&iommu->lock);
4649 intel_pasid_tear_down_entry(iommu, dev,
4650 domain->default_pasid, false);
4651 domain_detach_iommu(domain, iommu);
4652 spin_unlock(&iommu->lock);
4655 spin_unlock_irqrestore(&device_domain_lock, flags);
4657 if (list_empty(&domain->subdevices) && domain->default_pasid > 0)
4658 ioasid_put(domain->default_pasid);
4661 static int prepare_domain_attach_device(struct iommu_domain *domain,
4664 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4665 struct intel_iommu *iommu;
4668 iommu = device_to_iommu(dev, NULL, NULL);
4672 /* check if this iommu agaw is sufficient for max mapped address */
4673 addr_width = agaw_to_width(iommu->agaw);
4674 if (addr_width > cap_mgaw(iommu->cap))
4675 addr_width = cap_mgaw(iommu->cap);
4677 if (dmar_domain->max_addr > (1LL << addr_width)) {
4678 dev_err(dev, "%s: iommu width (%d) is not "
4679 "sufficient for the mapped address (%llx)\n",
4680 __func__, addr_width, dmar_domain->max_addr);
4683 dmar_domain->gaw = addr_width;
4686 * Knock out extra levels of page tables if necessary
4688 while (iommu->agaw < dmar_domain->agaw) {
4689 struct dma_pte *pte;
4691 pte = dmar_domain->pgd;
4692 if (dma_pte_present(pte)) {
4693 dmar_domain->pgd = (struct dma_pte *)
4694 phys_to_virt(dma_pte_addr(pte));
4695 free_pgtable_page(pte);
4697 dmar_domain->agaw--;
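/*
 * Illustrative sketch, not part of the original source: the width check
 * earlier in this function. A domain that already holds a mapping at or
 * above 2^addr_width cannot be attached to an IOMMU whose usable address
 * width is only addr_width bits.
 */
static int demo_width_sufficient(unsigned long long domain_max_addr,
                                 int addr_width)
{
        return domain_max_addr <= (1ULL << addr_width);
}
/* demo_width_sufficient(1ULL << 32, 30) == 0: a 30-bit IOMMU is too narrow */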
4703 static int intel_iommu_attach_device(struct iommu_domain *domain,
4708 if (domain->type == IOMMU_DOMAIN_UNMANAGED &&
4709 device_is_rmrr_locked(dev)) {
4710 dev_warn(dev, "Device is ineligible for IOMMU domain attach due to platform RMRR requirement. Contact your platform vendor.\n");
4714 if (is_aux_domain(dev, domain))
4717 /* normally dev is not mapped */
4718 if (unlikely(domain_context_mapped(dev))) {
4719 struct dmar_domain *old_domain;
4721 old_domain = find_domain(dev);
4723 dmar_remove_one_dev_info(dev);
4726 ret = prepare_domain_attach_device(domain, dev);
4730 return domain_add_dev_info(to_dmar_domain(domain), dev);
4733 static int intel_iommu_aux_attach_device(struct iommu_domain *domain,
4738 if (!is_aux_domain(dev, domain))
4741 ret = prepare_domain_attach_device(domain, dev);
4745 return aux_domain_add_dev(to_dmar_domain(domain), dev);
4748 static void intel_iommu_detach_device(struct iommu_domain *domain,
4751 dmar_remove_one_dev_info(dev);
4754 static void intel_iommu_aux_detach_device(struct iommu_domain *domain,
4757 aux_domain_remove_dev(to_dmar_domain(domain), dev);
4760 #ifdef CONFIG_INTEL_IOMMU_SVM
4762 * 2D array for converting and sanitizing IOMMU generic TLB granularity to
4763 * VT-d granularity. Invalidation is typically included in the unmap operation
4764 * as a result of DMA or VFIO unmap. However, for assigned devices the guest
4765 * owns the first-level page tables. Invalidations of translation caches in the
4766 * guest are trapped and passed down to the host.
4768 * The vIOMMU in the guest will only expose first-level page tables, therefore
4769 * we do not support IOTLB granularity for requests without PASID (second level).
4771 * For example, to find the VT-d granularity encoding for IOTLB
4772 * type and page selective granularity within PASID:
4773 * X: indexed by iommu cache type
4774 * Y: indexed by enum iommu_inv_granularity
4775 * [IOMMU_CACHE_INV_TYPE_IOTLB][IOMMU_INV_GRANU_ADDR]
4779 inv_type_granu_table[IOMMU_CACHE_INV_TYPE_NR][IOMMU_INV_GRANU_NR] = {
4781 * PASID based IOTLB invalidation: PASID selective (per PASID),
4782 * page selective (address granularity)
4784 {-EINVAL, QI_GRAN_NONG_PASID, QI_GRAN_PSI_PASID},
4785 /* PASID based dev TLBs */
4786 {-EINVAL, -EINVAL, QI_DEV_IOTLB_GRAN_PASID_SEL},
4788 {-EINVAL, -EINVAL, -EINVAL}
4791 static inline int to_vtd_granularity(int type, int granu)
4793 return inv_type_granu_table[type][granu];
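/*
 * Illustrative usage, not part of the original source: looking up the
 * example given in the comment above the table. An IOTLB invalidation with
 * address granularity maps to QI_GRAN_PSI_PASID; combinations marked
 * -EINVAL in the table are rejected by the caller.
 */
static int demo_granu_lookup(void)
{
        return to_vtd_granularity(IOMMU_CACHE_INV_TYPE_IOTLB,
                                  IOMMU_INV_GRANU_ADDR); /* QI_GRAN_PSI_PASID */
}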
4796 static inline u64 to_vtd_size(u64 granu_size, u64 nr_granules)
4798 u64 nr_pages = (granu_size * nr_granules) >> VTD_PAGE_SHIFT;
4800 /* VT-d size is encoded as 2^size in 4KiB pages: 0 for 4KiB, 9 for 2MiB, etc.
4801 * The IOMMU cache invalidate API passes granu_size in bytes, and the number
4802 * of granules of that size in contiguous memory.
4804 return order_base_2(nr_pages);
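/*
 * Illustrative worked example, not part of the original source:
 * granu_size = 4096 and nr_granules = 512 cover 2MiB, i.e. 512 4KiB pages,
 * which encodes as order 9; a single 4KiB granule encodes as order 0. The
 * open-coded loop below mirrors order_base_2() rounding up.
 */
static unsigned long long demo_vtd_size(unsigned long long granu_size,
                                        unsigned long long nr_granules)
{
        unsigned long long nr_pages = (granu_size * nr_granules) >> 12;
        unsigned long long order = 0;

        while ((1ULL << order) < nr_pages)
                order++;
        return order;
}
/* demo_vtd_size(4096, 512) == 9; demo_vtd_size(4096, 1) == 0 */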
4808 intel_iommu_sva_invalidate(struct iommu_domain *domain, struct device *dev,
4809 struct iommu_cache_invalidate_info *inv_info)
4811 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4812 struct device_domain_info *info;
4813 struct intel_iommu *iommu;
4814 unsigned long flags;
4821 if (!inv_info || !dmar_domain)
4824 if (!dev || !dev_is_pci(dev))
4827 iommu = device_to_iommu(dev, &bus, &devfn);
4831 if (!(dmar_domain->flags & DOMAIN_FLAG_NESTING_MODE))
4834 spin_lock_irqsave(&device_domain_lock, flags);
4835 spin_lock(&iommu->lock);
4836 info = get_domain_info(dev);
4841 did = dmar_domain->iommu_did[iommu->seq_id];
4842 sid = PCI_DEVID(bus, devfn);
4844 /* Size is only valid in address selective invalidation */
4845 if (inv_info->granularity == IOMMU_INV_GRANU_ADDR)
4846 size = to_vtd_size(inv_info->granu.addr_info.granule_size,
4847 inv_info->granu.addr_info.nb_granules);
4849 for_each_set_bit(cache_type,
4850 (unsigned long *)&inv_info->cache,
4851 IOMMU_CACHE_INV_TYPE_NR) {
4856 granu = to_vtd_granularity(cache_type, inv_info->granularity);
4857 if (granu == -EINVAL) {
4858 pr_err_ratelimited("Invalid cache type and granu combination %d/%d\n",
4859 cache_type, inv_info->granularity);
4864 * PASID is stored in different locations based on the granularity.
4867 if (inv_info->granularity == IOMMU_INV_GRANU_PASID &&
4868 (inv_info->granu.pasid_info.flags & IOMMU_INV_PASID_FLAGS_PASID))
4869 pasid = inv_info->granu.pasid_info.pasid;
4870 else if (inv_info->granularity == IOMMU_INV_GRANU_ADDR &&
4871 (inv_info->granu.addr_info.flags & IOMMU_INV_ADDR_FLAGS_PASID))
4872 pasid = inv_info->granu.addr_info.pasid;
4874 switch (BIT(cache_type)) {
4875 case IOMMU_CACHE_INV_TYPE_IOTLB:
4876 /* HW will ignore LSB bits based on address mask */
4877 if (inv_info->granularity == IOMMU_INV_GRANU_ADDR &&
4879 (inv_info->granu.addr_info.addr & ((BIT(VTD_PAGE_SHIFT + size)) - 1))) {
4880 pr_err_ratelimited("User address not aligned, 0x%llx, size order %llu\n",
4881 inv_info->granu.addr_info.addr, size);
4885 * If granu is PASID-selective, address is ignored.
4886 * We use npages = -1 to indicate that.
4888 qi_flush_piotlb(iommu, did, pasid,
4889 mm_to_dma_pfn(inv_info->granu.addr_info.addr),
4890 (granu == QI_GRAN_NONG_PASID) ? -1 : 1 << size,
4891 inv_info->granu.addr_info.flags & IOMMU_INV_ADDR_FLAGS_LEAF);
4893 if (!info->ats_enabled)
4896 * Always flush device IOTLB if ATS is enabled. vIOMMU
4897 * in the guest may assume IOTLB flush is inclusive,
4898 * which is more efficient.
4901 case IOMMU_CACHE_INV_TYPE_DEV_IOTLB:
4903 * PASID based device TLB invalidation does not support
4904 * IOMMU_INV_GRANU_PASID granularity but only supports
4905 * IOMMU_INV_GRANU_ADDR.
4906 * The equivalent of that is to set the size to the
4907 * entire 64-bit range. The user only provides PASID info
4908 * without address info, so we set addr to 0.
4910 if (inv_info->granularity == IOMMU_INV_GRANU_PASID) {
4911 size = 64 - VTD_PAGE_SHIFT;
4913 } else if (inv_info->granularity == IOMMU_INV_GRANU_ADDR) {
4914 addr = inv_info->granu.addr_info.addr;
4917 if (info->ats_enabled)
4918 qi_flush_dev_iotlb_pasid(iommu, sid,
4920 info->ats_qdep, addr,
4923 pr_warn_ratelimited("Passdown device IOTLB flush w/o ATS!\n");
4926 dev_err_ratelimited(dev, "Unsupported IOMMU invalidation type %d\n",
4932 spin_unlock(&iommu->lock);
4933 spin_unlock_irqrestore(&device_domain_lock, flags);
static int intel_iommu_map(struct iommu_domain *domain,
			   unsigned long iova, phys_addr_t hpa,
			   size_t size, int iommu_prot, gfp_t gfp)
{
	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
	u64 max_addr;
	int prot = 0;
	int ret;

	if (iommu_prot & IOMMU_READ)
		prot |= DMA_PTE_READ;
	if (iommu_prot & IOMMU_WRITE)
		prot |= DMA_PTE_WRITE;
	if ((iommu_prot & IOMMU_CACHE) && dmar_domain->iommu_snooping)
		prot |= DMA_PTE_SNP;

	max_addr = iova + size;
	if (dmar_domain->max_addr < max_addr) {
		u64 end;

		/* check if minimum agaw is sufficient for mapped address */
		end = __DOMAIN_MAX_ADDR(dmar_domain->gaw) + 1;
		if (end < max_addr) {
			pr_err("%s: iommu width (%d) is not sufficient for the mapped address (%llx)\n",
			       __func__, dmar_domain->gaw, max_addr);
			return -EFAULT;
		}
		dmar_domain->max_addr = max_addr;
	}
	/* Round up size to next multiple of PAGE_SIZE, if it and
	   the low bits of hpa would take us onto the next page */
	size = aligned_nrpages(hpa, size);
	ret = domain_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT,
			     hpa >> VTD_PAGE_SHIFT, size, prot);
	return ret;
}
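/*
 * This callback is not called directly; a sketch of the core-API path
 * that reaches it (the IOVA value here is purely illustrative):
 *
 *	ret = iommu_map(domain, 0x100000, page_to_phys(page), SZ_4K,
 *			IOMMU_READ | IOMMU_WRITE);
 *
 * The IOMMU core splits the request according to pgsize_bitmap and ends
 * up in intel_iommu_map() for each chunk.
 */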
static size_t intel_iommu_unmap(struct iommu_domain *domain,
				unsigned long iova, size_t size,
				struct iommu_iotlb_gather *gather)
{
	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
	unsigned long start_pfn, last_pfn;
	int level = 0;

	/* Cope with horrid API which requires us to unmap more than the
	   size argument if it happens to be a large-page mapping. */
	BUG_ON(!pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level));

	if (size < VTD_PAGE_SIZE << level_to_offset_bits(level))
		size = VTD_PAGE_SIZE << level_to_offset_bits(level);

	start_pfn = iova >> VTD_PAGE_SHIFT;
	last_pfn = (iova + size - 1) >> VTD_PAGE_SHIFT;

	gather->freelist = domain_unmap(dmar_domain, start_pfn,
					last_pfn, gather->freelist);

	if (dmar_domain->max_addr == iova + size)
		dmar_domain->max_addr = iova;

	iommu_iotlb_gather_add_page(domain, gather, iova, size);

	return size;
}
static void intel_iommu_tlb_sync(struct iommu_domain *domain,
				 struct iommu_iotlb_gather *gather)
{
	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
	unsigned long iova_pfn = IOVA_PFN(gather->start);
	size_t size = gather->end - gather->start;
	unsigned long start_pfn;
	unsigned long nrpages;
	int iommu_id;

	nrpages = aligned_nrpages(gather->start, size);
	start_pfn = mm_to_dma_pfn(iova_pfn);

	for_each_domain_iommu(iommu_id, dmar_domain)
		iommu_flush_iotlb_psi(g_iommus[iommu_id], dmar_domain,
				      start_pfn, nrpages, !gather->freelist, 0);

	dma_free_pagelist(gather->freelist);
}
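/*
 * The unmap/sync split above lets callers batch invalidations. A typical
 * core-API sequence (caller-side sketch, not driver code):
 *
 *	struct iommu_iotlb_gather gather;
 *
 *	iommu_iotlb_gather_init(&gather);
 *	iommu_unmap_fast(domain, iova, size, &gather);	// -> intel_iommu_unmap()
 *	iommu_iotlb_sync(domain, &gather);		// -> intel_iommu_tlb_sync()
 *
 * Freed page-table pages ride along on gather->freelist and are only
 * returned by dma_free_pagelist() once the IOTLB flush has completed.
 */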
static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
					    dma_addr_t iova)
{
	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
	struct dma_pte *pte;
	int level = 0;
	u64 phys = 0;

	pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level);
	if (pte && dma_pte_present(pte))
		phys = dma_pte_addr(pte) +
			(iova & (BIT_MASK(level_to_offset_bits(level) +
						VTD_PAGE_SHIFT) - 1));

	return phys;
}
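/*
 * Example of the offset arithmetic above: a 2MiB superpage PTE sits at
 * level 2, so level_to_offset_bits(2) + VTD_PAGE_SHIFT = 9 + 12 = 21 and
 * the low 21 bits of the IOVA are added back onto dma_pte_addr(pte). For
 * a regular level-1 PTE the mask degenerates to the 4KiB page offset,
 * (1 << 12) - 1.
 */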
static inline bool scalable_mode_support(void)
{
	struct dmar_drhd_unit *drhd;
	struct intel_iommu *iommu;
	bool ret = true;

	rcu_read_lock();
	for_each_active_iommu(iommu, drhd) {
		if (!sm_supported(iommu)) {
			ret = false;
			break;
		}
	}
	rcu_read_unlock();

	return ret;
}
static inline bool iommu_pasid_support(void)
{
	struct dmar_drhd_unit *drhd;
	struct intel_iommu *iommu;
	bool ret = true;

	rcu_read_lock();
	for_each_active_iommu(iommu, drhd) {
		if (!pasid_supported(iommu)) {
			ret = false;
			break;
		}
	}
	rcu_read_unlock();

	return ret;
}
static inline bool nested_mode_support(void)
{
	struct dmar_drhd_unit *drhd;
	struct intel_iommu *iommu;
	bool ret = true;

	rcu_read_lock();
	for_each_active_iommu(iommu, drhd) {
		if (!sm_supported(iommu) || !ecap_nest(iommu->ecap)) {
			ret = false;
			break;
		}
	}
	rcu_read_unlock();

	return ret;
}
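/*
 * All three helpers above are conjunctive on purpose: a feature is only
 * reported when every active IOMMU supports it, since a domain may span
 * devices behind different DMAR units.
 */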
static bool intel_iommu_capable(enum iommu_cap cap)
{
	if (cap == IOMMU_CAP_CACHE_COHERENCY)
		return domain_update_iommu_snooping(NULL) == 1;
	if (cap == IOMMU_CAP_INTR_REMAP)
		return irq_remapping_enabled == 1;

	return false;
}
static struct iommu_device *intel_iommu_probe_device(struct device *dev)
{
	struct intel_iommu *iommu;

	iommu = device_to_iommu(dev, NULL, NULL);
	if (!iommu)
		return ERR_PTR(-ENODEV);

	if (translation_pre_enabled(iommu))
		dev_iommu_priv_set(dev, DEFER_DEVICE_DOMAIN_INFO);

	return &iommu->iommu;
}
static void intel_iommu_release_device(struct device *dev)
{
	struct intel_iommu *iommu;

	iommu = device_to_iommu(dev, NULL, NULL);
	if (!iommu)
		return;

	dmar_remove_one_dev_info(dev);

	set_dma_ops(dev, NULL);
}
static void intel_iommu_probe_finalize(struct device *dev)
{
	dma_addr_t base = IOVA_START_PFN << VTD_PAGE_SHIFT;
	struct iommu_domain *domain = iommu_get_domain_for_dev(dev);
	struct dmar_domain *dmar_domain = to_dmar_domain(domain);

	if (domain && domain->type == IOMMU_DOMAIN_DMA)
		iommu_setup_dma_ops(dev, base,
				    __DOMAIN_MAX_ADDR(dmar_domain->gaw) - base);
	else
		set_dma_ops(dev, NULL);
}
static void intel_iommu_get_resv_regions(struct device *device,
					 struct list_head *head)
{
	int prot = DMA_PTE_READ | DMA_PTE_WRITE;
	struct iommu_resv_region *reg;
	struct dmar_rmrr_unit *rmrr;
	struct device *i_dev;
	int i;

	down_read(&dmar_global_lock);
	for_each_rmrr_units(rmrr) {
		for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
					  i, i_dev) {
			struct iommu_resv_region *resv;
			enum iommu_resv_type type;
			size_t length;

			if (i_dev != device &&
			    !is_downstream_to_pci_bridge(device, i_dev))
				continue;

			length = rmrr->end_address - rmrr->base_address + 1;

			type = device_rmrr_is_relaxable(device) ?
				IOMMU_RESV_DIRECT_RELAXABLE : IOMMU_RESV_DIRECT;

			resv = iommu_alloc_resv_region(rmrr->base_address,
						       length, prot, type);
			if (!resv)
				break;

			list_add_tail(&resv->list, head);
		}
	}
	up_read(&dmar_global_lock);

#ifdef CONFIG_INTEL_IOMMU_FLOPPY_WA
	if (dev_is_pci(device)) {
		struct pci_dev *pdev = to_pci_dev(device);

		if ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA) {
			reg = iommu_alloc_resv_region(0, 1UL << 24, prot,
						      IOMMU_RESV_DIRECT_RELAXABLE);
			if (reg)
				list_add_tail(&reg->list, head);
		}
	}
#endif /* CONFIG_INTEL_IOMMU_FLOPPY_WA */

	reg = iommu_alloc_resv_region(IOAPIC_RANGE_START,
				      IOAPIC_RANGE_END - IOAPIC_RANGE_START + 1,
				      0, IOMMU_RESV_MSI);
	if (!reg)
		return;
	list_add_tail(&reg->list, head);
}
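/*
 * Consumers never call this hook directly; they go through the core
 * helpers (caller-side sketch):
 *
 *	LIST_HEAD(resv_regions);
 *
 *	iommu_get_resv_regions(dev, &resv_regions);
 *	// walk the list and carve the ranges out of the IOVA space
 *	iommu_put_resv_regions(dev, &resv_regions);
 */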
int intel_iommu_enable_pasid(struct intel_iommu *iommu, struct device *dev)
{
	struct device_domain_info *info;
	struct context_entry *context;
	struct dmar_domain *domain;
	unsigned long flags;
	u64 ctx_lo;
	int ret;

	domain = find_domain(dev);
	if (!domain)
		return -EINVAL;

	spin_lock_irqsave(&device_domain_lock, flags);
	spin_lock(&iommu->lock);

	ret = -EINVAL;
	info = get_domain_info(dev);
	if (!info || !info->pasid_supported)
		goto out;

	context = iommu_context_addr(iommu, info->bus, info->devfn, 0);
	if (WARN_ON(!context))
		goto out;

	ctx_lo = context[0].lo;

	if (!(ctx_lo & CONTEXT_PASIDE)) {
		ctx_lo |= CONTEXT_PASIDE;
		context[0].lo = ctx_lo;
		wmb();
		iommu->flush.flush_context(iommu,
					   domain->iommu_did[iommu->seq_id],
					   PCI_DEVID(info->bus, info->devfn),
					   DMA_CCMD_MASK_NOBIT,
					   DMA_CCMD_DEVICE_INVL);
	}

	/* Enable PASID support in the device, if it wasn't already */
	if (!info->pasid_enabled)
		iommu_enable_dev_iotlb(info);

	ret = 0;

out:
	spin_unlock(&iommu->lock);
	spin_unlock_irqrestore(&device_domain_lock, flags);

	return ret;
}
static struct iommu_group *intel_iommu_device_group(struct device *dev)
{
	if (dev_is_pci(dev))
		return pci_device_group(dev);
	return generic_device_group(dev);
}
static int intel_iommu_enable_auxd(struct device *dev)
{
	struct device_domain_info *info;
	struct intel_iommu *iommu;
	unsigned long flags;
	int ret;

	iommu = device_to_iommu(dev, NULL, NULL);
	if (!iommu || dmar_disabled)
		return -EINVAL;

	if (!sm_supported(iommu) || !pasid_supported(iommu))
		return -EINVAL;

	ret = intel_iommu_enable_pasid(iommu, dev);
	if (ret)
		return -ENODEV;

	spin_lock_irqsave(&device_domain_lock, flags);
	info = get_domain_info(dev);
	info->auxd_enabled = 1;
	spin_unlock_irqrestore(&device_domain_lock, flags);

	return 0;
}
static int intel_iommu_disable_auxd(struct device *dev)
{
	struct device_domain_info *info;
	unsigned long flags;

	spin_lock_irqsave(&device_domain_lock, flags);
	info = get_domain_info(dev);
	if (!WARN_ON(!info))
		info->auxd_enabled = 0;
	spin_unlock_irqrestore(&device_domain_lock, flags);

	return 0;
}
/*
 * A PCI Express designated vendor specific extended capability is defined
 * in section 3.7 of the Intel scalable I/O virtualization technical spec
 * for system software and tools to detect endpoint devices supporting the
 * Intel scalable IO virtualization without host driver dependency.
 *
 * Returns the address of the matching extended capability structure within
 * the device's PCI configuration space or 0 if the device does not support
 * it.
 */
static int siov_find_pci_dvsec(struct pci_dev *pdev)
{
	int pos;
	u16 vendor, id;

	pos = pci_find_next_ext_capability(pdev, 0, 0x23);
	while (pos) {
		pci_read_config_word(pdev, pos + 4, &vendor);
		pci_read_config_word(pdev, pos + 8, &id);
		if (vendor == PCI_VENDOR_ID_INTEL && id == 5)
			return pos;

		pos = pci_find_next_ext_capability(pdev, pos, 0x23);
	}

	return 0;
}
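/*
 * Layout being matched above, relative to the position returned by
 * pci_find_next_ext_capability() (extended capability ID 0x23, DVSEC):
 *
 *	+0x4: DVSEC vendor ID	-- must be PCI_VENDOR_ID_INTEL
 *	+0x8: DVSEC ID		-- 5 identifies Intel scalable IOV
 */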
static bool
intel_iommu_dev_has_feat(struct device *dev, enum iommu_dev_features feat)
{
	if (feat == IOMMU_DEV_FEAT_AUX) {
		int ret;

		if (!dev_is_pci(dev) || dmar_disabled ||
		    !scalable_mode_support() || !iommu_pasid_support())
			return false;

		ret = pci_pasid_features(to_pci_dev(dev));
		if (ret < 0)
			return false;

		return !!siov_find_pci_dvsec(to_pci_dev(dev));
	}

	if (feat == IOMMU_DEV_FEAT_SVA) {
		struct device_domain_info *info = get_domain_info(dev);

		return info && (info->iommu->flags & VTD_FLAG_SVM_CAPABLE) &&
			info->pasid_supported && info->pri_supported &&
			info->ats_supported;
	}

	return false;
}
static int
intel_iommu_dev_enable_feat(struct device *dev, enum iommu_dev_features feat)
{
	if (feat == IOMMU_DEV_FEAT_AUX)
		return intel_iommu_enable_auxd(dev);

	if (feat == IOMMU_DEV_FEAT_SVA) {
		struct device_domain_info *info = get_domain_info(dev);

		if (!info)
			return -EINVAL;

		if (info->iommu->flags & VTD_FLAG_SVM_CAPABLE)
			return 0;
	}

	return -ENODEV;
}
static int
intel_iommu_dev_disable_feat(struct device *dev, enum iommu_dev_features feat)
{
	if (feat == IOMMU_DEV_FEAT_AUX)
		return intel_iommu_disable_auxd(dev);

	return -ENODEV;
}
static bool
intel_iommu_dev_feat_enabled(struct device *dev, enum iommu_dev_features feat)
{
	struct device_domain_info *info = get_domain_info(dev);

	if (feat == IOMMU_DEV_FEAT_AUX)
		return scalable_mode_support() && info && info->auxd_enabled;

	return false;
}
static int
intel_iommu_aux_get_pasid(struct iommu_domain *domain, struct device *dev)
{
	struct dmar_domain *dmar_domain = to_dmar_domain(domain);

	return dmar_domain->default_pasid > 0 ?
			dmar_domain->default_pasid : -EINVAL;
}
static bool intel_iommu_is_attach_deferred(struct iommu_domain *domain,
					   struct device *dev)
{
	return attach_deferred(dev);
}
static int
intel_iommu_domain_set_attr(struct iommu_domain *domain,
			    enum iommu_attr attr, void *data)
{
	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
	unsigned long flags;
	int ret = 0;

	if (domain->type != IOMMU_DOMAIN_UNMANAGED)
		return -EINVAL;

	switch (attr) {
	case DOMAIN_ATTR_NESTING:
		spin_lock_irqsave(&device_domain_lock, flags);
		if (nested_mode_support() &&
		    list_empty(&dmar_domain->devices)) {
			dmar_domain->flags |= DOMAIN_FLAG_NESTING_MODE;
			dmar_domain->flags &= ~DOMAIN_FLAG_USE_FIRST_LEVEL;
		} else {
			ret = -ENODEV;
		}
		spin_unlock_irqrestore(&device_domain_lock, flags);
		break;
	default:
		ret = -EINVAL;
		break;
	}

	return ret;
}
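/*
 * A caller-side sketch of flipping a domain into nesting mode with the
 * generic attribute API; this must happen before any device is attached,
 * which is what the list_empty() check above enforces:
 *
 *	int nesting = 1;
 *
 *	domain = iommu_domain_alloc(&pci_bus_type);
 *	ret = iommu_domain_set_attr(domain, DOMAIN_ATTR_NESTING, &nesting);
 */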
static int
intel_iommu_domain_get_attr(struct iommu_domain *domain,
			    enum iommu_attr attr, void *data)
{
	switch (domain->type) {
	case IOMMU_DOMAIN_UNMANAGED:
		return -ENODEV;
	case IOMMU_DOMAIN_DMA:
		switch (attr) {
		case DOMAIN_ATTR_DMA_USE_FLUSH_QUEUE:
			*(int *)data = !intel_iommu_strict;
			return 0;
		default:
			return -ENODEV;
		}
		break;
	default:
		return -EINVAL;
	}
}
/*
 * Check that the device does not live on an external-facing PCI port that is
 * marked as untrusted. Such devices should not be able to apply quirks and
 * thus not be able to bypass the IOMMU restrictions.
 */
static bool risky_device(struct pci_dev *pdev)
{
	if (pdev->untrusted) {
		pci_info(pdev,
			 "Skipping IOMMU quirk for dev [%04X:%04X] on untrusted PCI link\n",
			 pdev->vendor, pdev->device);
		pci_info(pdev, "Please check with your BIOS/Platform vendor about this\n");
		return true;
	}
	return false;
}
const struct iommu_ops intel_iommu_ops = {
	.capable		= intel_iommu_capable,
	.domain_alloc		= intel_iommu_domain_alloc,
	.domain_free		= intel_iommu_domain_free,
	.domain_get_attr	= intel_iommu_domain_get_attr,
	.domain_set_attr	= intel_iommu_domain_set_attr,
	.attach_dev		= intel_iommu_attach_device,
	.detach_dev		= intel_iommu_detach_device,
	.aux_attach_dev		= intel_iommu_aux_attach_device,
	.aux_detach_dev		= intel_iommu_aux_detach_device,
	.aux_get_pasid		= intel_iommu_aux_get_pasid,
	.map			= intel_iommu_map,
	.unmap			= intel_iommu_unmap,
	.flush_iotlb_all	= intel_flush_iotlb_all,
	.iotlb_sync		= intel_iommu_tlb_sync,
	.iova_to_phys		= intel_iommu_iova_to_phys,
	.probe_device		= intel_iommu_probe_device,
	.probe_finalize		= intel_iommu_probe_finalize,
	.release_device		= intel_iommu_release_device,
	.get_resv_regions	= intel_iommu_get_resv_regions,
	.put_resv_regions	= generic_iommu_put_resv_regions,
	.device_group		= intel_iommu_device_group,
	.dev_has_feat		= intel_iommu_dev_has_feat,
	.dev_feat_enabled	= intel_iommu_dev_feat_enabled,
	.dev_enable_feat	= intel_iommu_dev_enable_feat,
	.dev_disable_feat	= intel_iommu_dev_disable_feat,
	.is_attach_deferred	= intel_iommu_is_attach_deferred,
	.def_domain_type	= device_def_domain_type,
	.pgsize_bitmap		= INTEL_IOMMU_PGSIZES,
#ifdef CONFIG_INTEL_IOMMU_SVM
	.cache_invalidate	= intel_iommu_sva_invalidate,
	.sva_bind_gpasid	= intel_svm_bind_gpasid,
	.sva_unbind_gpasid	= intel_svm_unbind_gpasid,
	.sva_bind		= intel_svm_bind,
	.sva_unbind		= intel_svm_unbind,
	.sva_get_pasid		= intel_svm_get_pasid,
	.page_response		= intel_svm_page_response,
#endif
};
static void quirk_iommu_igfx(struct pci_dev *dev)
{
	if (risky_device(dev))
		return;

	pci_info(dev, "Disabling IOMMU for graphics on this chipset\n");
	dmar_map_gfx = 0;
}
/* G4x/GM45 integrated gfx dmar support is totally busted. */
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_igfx);

/* Broadwell igfx malfunctions with dmar */
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1606, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160B, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160E, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1602, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160A, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160D, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1616, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161B, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161E, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1612, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161A, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161D, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1626, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162B, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162E, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1622, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162A, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162D, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1636, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163B, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163E, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1632, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163A, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163D, quirk_iommu_igfx);
static void quirk_iommu_rwbf(struct pci_dev *dev)
{
	if (risky_device(dev))
		return;

	/*
	 * Mobile 4 Series Chipset neglects to set RWBF capability,
	 * but needs it. Same seems to hold for the desktop versions.
	 */
	pci_info(dev, "Forcing write-buffer flush capability\n");
	rwbf_quirk = 1;
}
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_rwbf);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_rwbf);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_rwbf);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_rwbf);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_rwbf);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_rwbf);
#define GGC 0x52
#define GGC_MEMORY_SIZE_MASK	(0xf << 8)
#define GGC_MEMORY_SIZE_NONE	(0x0 << 8)
#define GGC_MEMORY_SIZE_1M	(0x1 << 8)
#define GGC_MEMORY_SIZE_2M	(0x3 << 8)
#define GGC_MEMORY_VT_ENABLED	(0x8 << 8)
#define GGC_MEMORY_SIZE_2M_VT	(0x9 << 8)
#define GGC_MEMORY_SIZE_3M_VT	(0xa << 8)
#define GGC_MEMORY_SIZE_4M_VT	(0xb << 8)
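/*
 * The quirk below reads GGC (to our understanding, the GMCH graphics
 * control register of these IGD chipsets) and keys off bits 8-11:
 * GGC_MEMORY_VT_ENABLED set means the BIOS reserved stolen memory for
 * the VT-d shadow GTT, e.g. GGC_MEMORY_SIZE_2M_VT (0x9 << 8) encodes
 * 2M with VT enabled.
 */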
static void quirk_calpella_no_shadow_gtt(struct pci_dev *dev)
{
	unsigned short ggc;

	if (risky_device(dev))
		return;

	if (pci_read_config_word(dev, GGC, &ggc))
		return;

	if (!(ggc & GGC_MEMORY_VT_ENABLED)) {
		pci_info(dev, "BIOS has allocated no shadow GTT; disabling IOMMU for graphics\n");
		dmar_map_gfx = 0;
	} else if (dmar_map_gfx) {
		/* we have to ensure the gfx device is idle before we flush */
		pci_info(dev, "Disabling batched IOTLB flush on Ironlake\n");
		intel_iommu_strict = 1;
	}
}
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0040, quirk_calpella_no_shadow_gtt);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0044, quirk_calpella_no_shadow_gtt);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0062, quirk_calpella_no_shadow_gtt);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x006a, quirk_calpella_no_shadow_gtt);
static void quirk_igfx_skip_te_disable(struct pci_dev *dev)
{
	u16 ver;

	if (!IS_GFX_DEVICE(dev))
		return;

	ver = (dev->device >> 8) & 0xff;
	if (ver != 0x45 && ver != 0x46 && ver != 0x4c &&
	    ver != 0x4e && ver != 0x8a && ver != 0x98 &&
	    ver != 0x9a)
		return;

	if (risky_device(dev))
		return;

	pci_info(dev, "Skip IOMMU disabling for graphics\n");
	iommu_skip_te_disable = 1;
}
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_ANY_ID, quirk_igfx_skip_te_disable);
/* On Tylersburg chipsets, some BIOSes have been known to enable the
   ISOCH DMAR unit for the Azalia sound device, but not give it any
   TLB entries, which causes it to deadlock. Check for that. We do
   this in a function called from init_dmars(), instead of in a PCI
   quirk, because we don't want to print the obnoxious "BIOS broken"
   message if VT-d is actually disabled.
*/
static void __init check_tylersburg_isoch(void)
{
	struct pci_dev *pdev;
	uint32_t vtisochctrl;

	/* If there's no Azalia in the system anyway, forget it. */
	pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3a3e, NULL);
	if (!pdev)
		return;
	if (risky_device(pdev)) {
		pci_dev_put(pdev);
		return;
	}
	pci_dev_put(pdev);

	/* System Management Registers. Might be hidden, in which case
	   we can't do the sanity check. But that's OK, because the
	   known-broken BIOSes _don't_ actually hide it, so far. */
	pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x342e, NULL);
	if (!pdev)
		return;
	if (risky_device(pdev)) {
		pci_dev_put(pdev);
		return;
	}
	if (pci_read_config_dword(pdev, 0x188, &vtisochctrl)) {
		pci_dev_put(pdev);
		return;
	}
	pci_dev_put(pdev);

	/* If Azalia DMA is routed to the non-isoch DMAR unit, fine. */
	if (vtisochctrl & 1)
		return;

	/* Drop all bits other than the number of TLB entries */
	vtisochctrl &= 0x1c;

	/* If we have the recommended number of TLB entries (16), fine. */
	if (vtisochctrl == 0x10)
		return;

	/* Zero TLB entries? You get to ride the short bus to school. */
	if (!vtisochctrl) {
		WARN(1, "Your BIOS is broken; DMA routed to ISOCH DMAR unit but no TLB space.\n"
		     "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
		     dmi_get_system_info(DMI_BIOS_VENDOR),
		     dmi_get_system_info(DMI_BIOS_VERSION),
		     dmi_get_system_info(DMI_PRODUCT_VERSION));
		iommu_identity_mapping |= IDENTMAP_AZALIA;
		return;
	}

	pr_warn("Recommended TLB entries for ISOCH unit is 16; your BIOS set %d\n",
		vtisochctrl);
}