1 // SPDX-License-Identifier: GPL-2.0-only
3 * Copyright © 2006-2014 Intel Corporation.
5 * Authors: David Woodhouse <dwmw2@infradead.org>,
6 * Ashok Raj <ashok.raj@intel.com>,
7 * Shaohua Li <shaohua.li@intel.com>,
8 * Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>,
9 * Fenghua Yu <fenghua.yu@intel.com>
10 * Joerg Roedel <jroedel@suse.de>
13 #define pr_fmt(fmt) "DMAR: " fmt
14 #define dev_fmt(fmt) pr_fmt(fmt)
16 #include <linux/init.h>
17 #include <linux/bitmap.h>
18 #include <linux/debugfs.h>
19 #include <linux/export.h>
20 #include <linux/slab.h>
21 #include <linux/irq.h>
22 #include <linux/interrupt.h>
23 #include <linux/spinlock.h>
24 #include <linux/pci.h>
25 #include <linux/dmar.h>
26 #include <linux/dma-map-ops.h>
27 #include <linux/mempool.h>
28 #include <linux/memory.h>
29 #include <linux/cpu.h>
30 #include <linux/timer.h>
32 #include <linux/iova.h>
33 #include <linux/iommu.h>
34 #include <linux/dma-iommu.h>
35 #include <linux/intel-iommu.h>
36 #include <linux/syscore_ops.h>
37 #include <linux/tboot.h>
38 #include <linux/dmi.h>
39 #include <linux/pci-ats.h>
40 #include <linux/memblock.h>
42 #include <linux/dma-direct.h>
43 #include <linux/crash_dump.h>
44 #include <linux/numa.h>
45 #include <asm/irq_remapping.h>
46 #include <asm/cacheflush.h>
47 #include <asm/iommu.h>
48 #include <trace/events/intel_iommu.h>
50 #include "../irq_remapping.h"
53 #define ROOT_SIZE VTD_PAGE_SIZE
54 #define CONTEXT_SIZE VTD_PAGE_SIZE
56 #define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
57 #define IS_USB_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_SERIAL_USB)
58 #define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
59 #define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e)
61 #define IOAPIC_RANGE_START (0xfee00000)
62 #define IOAPIC_RANGE_END (0xfeefffff)
63 #define IOVA_START_ADDR (0x1000)
65 #define DEFAULT_DOMAIN_ADDRESS_WIDTH 57
67 #define MAX_AGAW_WIDTH 64
68 #define MAX_AGAW_PFN_WIDTH (MAX_AGAW_WIDTH - VTD_PAGE_SHIFT)
70 #define __DOMAIN_MAX_PFN(gaw) ((((uint64_t)1) << ((gaw) - VTD_PAGE_SHIFT)) - 1)
71 #define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << (gaw)) - 1)
73 /* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR
74 to match. That way, we can use 'unsigned long' for PFNs with impunity. */
75 #define DOMAIN_MAX_PFN(gaw) ((unsigned long) min_t(uint64_t, \
76 __DOMAIN_MAX_PFN(gaw), (unsigned long)-1))
77 #define DOMAIN_MAX_ADDR(gaw) (((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT)
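/*
 * For example, with VTD_PAGE_SHIFT == 12 and a 48-bit guest address
 * width, __DOMAIN_MAX_PFN(48) = (1ULL << 36) - 1 (the last 4KiB page
 * frame) and __DOMAIN_MAX_ADDR(48) = (1ULL << 48) - 1; DOMAIN_MAX_PFN()
 * additionally clamps the PFN to ULONG_MAX on 32-bit kernels.
 */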
79 /* IO virtual address start page frame number */
80 #define IOVA_START_PFN (1)
82 #define IOVA_PFN(addr) ((addr) >> PAGE_SHIFT)
84 /* page table handling */
85 #define LEVEL_STRIDE (9)
86 #define LEVEL_MASK (((u64)1 << LEVEL_STRIDE) - 1)
89 * This bitmap is used to advertise the page sizes our hardware supports
90 * to the IOMMU core, which will then use this information to split
91 * physically contiguous memory regions it is mapping into page sizes that we support.
94 * Traditionally the IOMMU core just handed us the mappings directly,
95 * after making sure the size is a power-of-two number of 4KiB pages and that the
96 * mapping has natural alignment.
98 * To retain this behavior, we currently advertise that we support
99 * all page sizes that are a power-of-two multiple of 4KiB.
101 * If at some point we'd like to utilize the IOMMU core's new behavior,
102 * we could change this to advertise the real page sizes we support.
104 #define INTEL_IOMMU_PGSIZES (~0xFFFUL)
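/*
 * ~0xFFFUL leaves bits 12 and above set, which the IOMMU core reads as
 * support for every power-of-two page size from 4KiB upwards (4KiB,
 * 8KiB, 16KiB, ...); actual hardware superpage support is still checked
 * separately via cap_super_page_val()/cap_fl1gp_support().
 */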
106 static inline int agaw_to_level(int agaw)
111 static inline int agaw_to_width(int agaw)
113 return min_t(int, 30 + agaw * LEVEL_STRIDE, MAX_AGAW_WIDTH);
116 static inline int width_to_agaw(int width)
118 return DIV_ROUND_UP(width - 30, LEVEL_STRIDE);
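/*
 * AGAW encodes the page-table depth: agaw 1, 2 and 3 correspond to 3-,
 * 4- and 5-level tables covering 39, 48 and 57 bits respectively
 * (30 bits plus LEVEL_STRIDE bits per extra level). For example,
 * width_to_agaw(48) = DIV_ROUND_UP(18, 9) = 2, and agaw_to_width(2) =
 * 30 + 2 * 9 = 48.
 */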
121 static inline unsigned int level_to_offset_bits(int level)
123 return (level - 1) * LEVEL_STRIDE;
126 static inline int pfn_level_offset(u64 pfn, int level)
128 return (pfn >> level_to_offset_bits(level)) & LEVEL_MASK;
131 static inline u64 level_mask(int level)
133 return -1ULL << level_to_offset_bits(level);
136 static inline u64 level_size(int level)
138 return 1ULL << level_to_offset_bits(level);
141 static inline u64 align_to_level(u64 pfn, int level)
143 return (pfn + level_size(level) - 1) & level_mask(level);
146 static inline unsigned long lvl_to_nr_pages(unsigned int lvl)
148 return 1UL << min_t(int, (lvl - 1) * LEVEL_STRIDE, MAX_AGAW_PFN_WIDTH);
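/*
 * With a 9-bit stride, level 1 indexes pfn bits 8:0, level 2 bits 17:9,
 * and so on. For example, for dma pfn 0x12345 at level 2,
 * pfn_level_offset() returns (0x12345 >> 9) & 0x1ff = 0x91, and
 * level_size(2) = 512 pages, i.e. 2MiB with 4KiB VT-d pages.
 */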
151 /* VT-d pages must always be _smaller_ than MM pages. Otherwise things
152 are never going to work. */
153 static inline unsigned long dma_to_mm_pfn(unsigned long dma_pfn)
155 return dma_pfn >> (PAGE_SHIFT - VTD_PAGE_SHIFT);
158 static inline unsigned long mm_to_dma_pfn(unsigned long mm_pfn)
160 return mm_pfn << (PAGE_SHIFT - VTD_PAGE_SHIFT);
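/*
 * With 4KiB MM pages (PAGE_SHIFT == 12) both conversions are no-ops,
 * since PAGE_SHIFT - VTD_PAGE_SHIFT == 0; with larger MM pages one mm
 * pfn would map to several dma pfns (e.g. four for 16KiB MM pages).
 */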
162 static inline unsigned long page_to_dma_pfn(struct page *pg)
164 return mm_to_dma_pfn(page_to_pfn(pg));
166 static inline unsigned long virt_to_dma_pfn(void *p)
168 return page_to_dma_pfn(virt_to_page(p));
171 /* global iommu list, set NULL for ignored DMAR units */
172 static struct intel_iommu **g_iommus;
174 static void __init check_tylersburg_isoch(void);
175 static int rwbf_quirk;
178 * set to 1 to panic the kernel if we can't successfully enable VT-d
179 * (used when kernel is launched w/ TXT)
181 static int force_on = 0;
182 static int intel_iommu_tboot_noforce;
183 static int no_platform_optin;
185 #define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
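/*
 * A root entry is 16 bytes (a lo/hi pair of u64s), so the 4KiB root
 * table holds VTD_PAGE_SIZE / 16 = 256 entries, one per PCI bus number.
 */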
188 * Take a root_entry and return the Lower Context Table Pointer (LCTP)
191 static phys_addr_t root_entry_lctp(struct root_entry *re)
196 return re->lo & VTD_PAGE_MASK;
200 * Take a root_entry and return the Upper Context Table Pointer (UCTP)
203 static phys_addr_t root_entry_uctp(struct root_entry *re)
208 return re->hi & VTD_PAGE_MASK;
211 static inline void context_clear_pasid_enable(struct context_entry *context)
213 context->lo &= ~(1ULL << 11);
216 static inline bool context_pasid_enabled(struct context_entry *context)
218 return !!(context->lo & (1ULL << 11));
221 static inline void context_set_copied(struct context_entry *context)
223 context->hi |= (1ull << 3);
226 static inline bool context_copied(struct context_entry *context)
228 return !!(context->hi & (1ULL << 3));
231 static inline bool __context_present(struct context_entry *context)
233 return (context->lo & 1);
236 bool context_present(struct context_entry *context)
238 return context_pasid_enabled(context) ?
239 __context_present(context) :
240 __context_present(context) && !context_copied(context);
243 static inline void context_set_present(struct context_entry *context)
248 static inline void context_set_fault_enable(struct context_entry *context)
250 context->lo &= (((u64)-1) << 2) | 1;
253 static inline void context_set_translation_type(struct context_entry *context,
256 context->lo &= (((u64)-1) << 4) | 3;
257 context->lo |= (value & 3) << 2;
260 static inline void context_set_address_root(struct context_entry *context,
263 context->lo &= ~VTD_PAGE_MASK;
264 context->lo |= value & VTD_PAGE_MASK;
267 static inline void context_set_address_width(struct context_entry *context,
270 context->hi |= value & 7;
273 static inline void context_set_domain_id(struct context_entry *context,
276 context->hi |= (value & ((1 << 16) - 1)) << 8;
279 static inline int context_domain_id(struct context_entry *c)
281 return((c->hi >> 8) & 0xffff);
284 static inline void context_clear_entry(struct context_entry *context)
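/*
 * The context_set_*()/context_*() helpers above encode the legacy
 * (non-scalable-mode) context entry layout: in the low qword, bit 0 is
 * Present, bit 1 is Fault Processing Disable, bits 3:2 select the
 * translation type and bits 63:12 hold the second-level page-table
 * root; in the high qword, bits 2:0 encode the address width (AGAW)
 * and bits 23:8 the domain id.
 */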
291 * This domain is a static identity mapping domain.
292 * 1. This domain creates a static 1:1 mapping to all usable memory.
293 * 2. It maps to each iommu if successful.
294 * 3. Each iommu maps to this domain if successful.
296 static struct dmar_domain *si_domain;
297 static int hw_pass_through = 1;
299 #define for_each_domain_iommu(idx, domain) \
300 for (idx = 0; idx < g_num_of_iommus; idx++) \
301 if (domain->iommu_refcnt[idx])
303 struct dmar_rmrr_unit {
304 struct list_head list; /* list of rmrr units */
305 struct acpi_dmar_header *hdr; /* ACPI header */
306 u64 base_address; /* reserved base address*/
307 u64 end_address; /* reserved end address */
308 struct dmar_dev_scope *devices; /* target devices */
309 int devices_cnt; /* target device count */
312 struct dmar_atsr_unit {
313 struct list_head list; /* list of ATSR units */
314 struct acpi_dmar_header *hdr; /* ACPI header */
315 struct dmar_dev_scope *devices; /* target devices */
316 int devices_cnt; /* target device count */
317 u8 include_all:1; /* include all ports */
320 static LIST_HEAD(dmar_atsr_units);
321 static LIST_HEAD(dmar_rmrr_units);
323 #define for_each_rmrr_units(rmrr) \
324 list_for_each_entry(rmrr, &dmar_rmrr_units, list)
326 /* number of IOMMUs, used to size and index g_iommus */
327 static int g_num_of_iommus;
329 static void domain_exit(struct dmar_domain *domain);
330 static void domain_remove_dev_info(struct dmar_domain *domain);
331 static void dmar_remove_one_dev_info(struct device *dev);
332 static void __dmar_remove_one_dev_info(struct device_domain_info *info);
333 static int intel_iommu_attach_device(struct iommu_domain *domain,
335 static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
338 #ifdef CONFIG_INTEL_IOMMU_DEFAULT_ON
339 int dmar_disabled = 0;
341 int dmar_disabled = 1;
342 #endif /* CONFIG_INTEL_IOMMU_DEFAULT_ON */
344 #ifdef CONFIG_INTEL_IOMMU_SCALABLE_MODE_DEFAULT_ON
345 int intel_iommu_sm = 1;
348 #endif /* CONFIG_INTEL_IOMMU_SCALABLE_MODE_DEFAULT_ON */
350 int intel_iommu_enabled = 0;
351 EXPORT_SYMBOL_GPL(intel_iommu_enabled);
353 static int dmar_map_gfx = 1;
354 static int dmar_forcedac;
355 static int intel_iommu_strict;
356 static int intel_iommu_superpage = 1;
357 static int iommu_identity_mapping;
358 static int iommu_skip_te_disable;
360 #define IDENTMAP_GFX 2
361 #define IDENTMAP_AZALIA 4
363 int intel_iommu_gfx_mapped;
364 EXPORT_SYMBOL_GPL(intel_iommu_gfx_mapped);
366 #define DEFER_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-2))
367 struct device_domain_info *get_domain_info(struct device *dev)
369 struct device_domain_info *info;
374 info = dev_iommu_priv_get(dev);
375 if (unlikely(info == DEFER_DEVICE_DOMAIN_INFO))
381 DEFINE_SPINLOCK(device_domain_lock);
382 static LIST_HEAD(device_domain_list);
385 * Iterate over elements in device_domain_list and call the specified
386 * callback @fn against each element.
388 int for_each_device_domain(int (*fn)(struct device_domain_info *info,
389 void *data), void *data)
393 struct device_domain_info *info;
395 spin_lock_irqsave(&device_domain_lock, flags);
396 list_for_each_entry(info, &device_domain_list, global) {
397 ret = fn(info, data);
399 spin_unlock_irqrestore(&device_domain_lock, flags);
403 spin_unlock_irqrestore(&device_domain_lock, flags);
408 const struct iommu_ops intel_iommu_ops;
410 static bool translation_pre_enabled(struct intel_iommu *iommu)
412 return (iommu->flags & VTD_FLAG_TRANS_PRE_ENABLED);
415 static void clear_translation_pre_enabled(struct intel_iommu *iommu)
417 iommu->flags &= ~VTD_FLAG_TRANS_PRE_ENABLED;
420 static void init_translation_status(struct intel_iommu *iommu)
424 gsts = readl(iommu->reg + DMAR_GSTS_REG);
425 if (gsts & DMA_GSTS_TES)
426 iommu->flags |= VTD_FLAG_TRANS_PRE_ENABLED;
429 static int __init intel_iommu_setup(char *str)
434 if (!strncmp(str, "on", 2)) {
436 pr_info("IOMMU enabled\n");
437 } else if (!strncmp(str, "off", 3)) {
439 no_platform_optin = 1;
440 pr_info("IOMMU disabled\n");
441 } else if (!strncmp(str, "igfx_off", 8)) {
443 pr_info("Disable GFX device mapping\n");
444 } else if (!strncmp(str, "forcedac", 8)) {
445 pr_info("Forcing DAC for PCI devices\n");
447 } else if (!strncmp(str, "strict", 6)) {
448 pr_info("Disable batched IOTLB flush\n");
449 intel_iommu_strict = 1;
450 } else if (!strncmp(str, "sp_off", 6)) {
451 pr_info("Disable supported super page\n");
452 intel_iommu_superpage = 0;
453 } else if (!strncmp(str, "sm_on", 5)) {
454 pr_info("Intel-IOMMU: scalable mode supported\n");
456 } else if (!strncmp(str, "tboot_noforce", 13)) {
457 pr_info("Intel-IOMMU: not forcing on after tboot. This could expose security risk for tboot\n");
458 intel_iommu_tboot_noforce = 1;
461 str += strcspn(str, ",");
467 __setup("intel_iommu=", intel_iommu_setup);
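/*
 * The options above are comma separated on the kernel command line;
 * e.g. "intel_iommu=on,sm_on" enables the IOMMU in scalable mode,
 * while "intel_iommu=igfx_off" keeps it on but skips mapping of GFX
 * devices.
 */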
469 static struct kmem_cache *iommu_domain_cache;
470 static struct kmem_cache *iommu_devinfo_cache;
472 static struct dmar_domain* get_iommu_domain(struct intel_iommu *iommu, u16 did)
474 struct dmar_domain **domains;
477 domains = iommu->domains[idx];
481 return domains[did & 0xff];
484 static void set_iommu_domain(struct intel_iommu *iommu, u16 did,
485 struct dmar_domain *domain)
487 struct dmar_domain **domains;
490 if (!iommu->domains[idx]) {
491 size_t size = 256 * sizeof(struct dmar_domain *);
492 iommu->domains[idx] = kzalloc(size, GFP_ATOMIC);
495 domains = iommu->domains[idx];
496 if (WARN_ON(!domains))
499 domains[did & 0xff] = domain;
502 void *alloc_pgtable_page(int node)
507 page = alloc_pages_node(node, GFP_ATOMIC | __GFP_ZERO, 0);
509 vaddr = page_address(page);
513 void free_pgtable_page(void *vaddr)
515 free_page((unsigned long)vaddr);
518 static inline void *alloc_domain_mem(void)
520 return kmem_cache_alloc(iommu_domain_cache, GFP_ATOMIC);
523 static void free_domain_mem(void *vaddr)
525 kmem_cache_free(iommu_domain_cache, vaddr);
528 static inline void * alloc_devinfo_mem(void)
530 return kmem_cache_alloc(iommu_devinfo_cache, GFP_ATOMIC);
533 static inline void free_devinfo_mem(void *vaddr)
535 kmem_cache_free(iommu_devinfo_cache, vaddr);
538 static inline int domain_type_is_si(struct dmar_domain *domain)
540 return domain->flags & DOMAIN_FLAG_STATIC_IDENTITY;
543 static inline bool domain_use_first_level(struct dmar_domain *domain)
545 return domain->flags & DOMAIN_FLAG_USE_FIRST_LEVEL;
548 static inline int domain_pfn_supported(struct dmar_domain *domain,
551 int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
553 return !(addr_width < BITS_PER_LONG && pfn >> addr_width);
556 static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw)
561 sagaw = cap_sagaw(iommu->cap);
562 for (agaw = width_to_agaw(max_gaw);
564 if (test_bit(agaw, &sagaw))
572 * Calculate max SAGAW for each iommu.
574 int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
576 return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH);
580 * Calculate agaw for each iommu.
581 * "SAGAW" may be different across iommus; use a default agaw, and
582 * fall back to a smaller supported agaw for iommus that don't support the default.
584 int iommu_calculate_agaw(struct intel_iommu *iommu)
586 return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH);
589 /* This function only returns a single iommu in a domain */
590 struct intel_iommu *domain_get_iommu(struct dmar_domain *domain)
594 /* si_domain and vm domain should not get here. */
595 if (WARN_ON(domain->domain.type != IOMMU_DOMAIN_DMA))
598 for_each_domain_iommu(iommu_id, domain)
601 if (iommu_id < 0 || iommu_id >= g_num_of_iommus)
604 return g_iommus[iommu_id];
607 static inline bool iommu_paging_structure_coherency(struct intel_iommu *iommu)
609 return sm_supported(iommu) ?
610 ecap_smpwc(iommu->ecap) : ecap_coherent(iommu->ecap);
613 static void domain_update_iommu_coherency(struct dmar_domain *domain)
615 struct dmar_drhd_unit *drhd;
616 struct intel_iommu *iommu;
620 domain->iommu_coherency = 1;
622 for_each_domain_iommu(i, domain) {
624 if (!iommu_paging_structure_coherency(g_iommus[i])) {
625 domain->iommu_coherency = 0;
632 /* No hardware attached; use lowest common denominator */
634 for_each_active_iommu(iommu, drhd) {
635 if (!iommu_paging_structure_coherency(iommu)) {
636 domain->iommu_coherency = 0;
643 static int domain_update_iommu_snooping(struct intel_iommu *skip)
645 struct dmar_drhd_unit *drhd;
646 struct intel_iommu *iommu;
650 for_each_active_iommu(iommu, drhd) {
652 if (!ecap_sc_support(iommu->ecap)) {
663 static int domain_update_iommu_superpage(struct dmar_domain *domain,
664 struct intel_iommu *skip)
666 struct dmar_drhd_unit *drhd;
667 struct intel_iommu *iommu;
670 if (!intel_iommu_superpage) {
674 /* set iommu_superpage to the smallest common denominator */
676 for_each_active_iommu(iommu, drhd) {
678 if (domain && domain_use_first_level(domain)) {
679 if (!cap_fl1gp_support(iommu->cap))
682 mask &= cap_super_page_val(iommu->cap);
694 static int domain_update_device_node(struct dmar_domain *domain)
696 struct device_domain_info *info;
697 int nid = NUMA_NO_NODE;
699 assert_spin_locked(&device_domain_lock);
701 if (list_empty(&domain->devices))
704 list_for_each_entry(info, &domain->devices, link) {
709 * There could possibly be multiple device numa nodes as devices
710 * within the same domain may sit behind different IOMMUs. There
711 * isn't a perfect answer in such a situation, so we use a
712 * first-come, first-served policy.
714 nid = dev_to_node(info->dev);
715 if (nid != NUMA_NO_NODE)
722 /* Some capabilities may be different across iommus */
723 static void domain_update_iommu_cap(struct dmar_domain *domain)
725 domain_update_iommu_coherency(domain);
726 domain->iommu_snooping = domain_update_iommu_snooping(NULL);
727 domain->iommu_superpage = domain_update_iommu_superpage(domain, NULL);
730 * If RHSA is missing, we should default to the device numa domain as a fallback.
733 if (domain->nid == NUMA_NO_NODE)
734 domain->nid = domain_update_device_node(domain);
737 * First-level translation restricts the input-address to a
738 * canonical address (i.e., address bits 63:N have the same
739 * value as address bit [N-1], where N is 48 bits with 4-level
740 * paging and 57 bits with 5-level paging). Hence, skip bit [N-1].
743 if (domain_use_first_level(domain))
744 domain->domain.geometry.aperture_end = __DOMAIN_MAX_ADDR(domain->gaw - 1);
746 domain->domain.geometry.aperture_end = __DOMAIN_MAX_ADDR(domain->gaw);
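/*
 * For example, a first-level domain with a 48-bit gaw gets an aperture
 * of [0, 2^47 - 1], while a second-level domain with the same gaw can
 * use the full [0, 2^48 - 1] range.
 */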
749 struct context_entry *iommu_context_addr(struct intel_iommu *iommu, u8 bus,
752 struct root_entry *root = &iommu->root_entry[bus];
753 struct context_entry *context;
757 if (sm_supported(iommu)) {
765 context = phys_to_virt(*entry & VTD_PAGE_MASK);
767 unsigned long phy_addr;
771 context = alloc_pgtable_page(iommu->node);
775 __iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
776 phy_addr = virt_to_phys((void *)context);
777 *entry = phy_addr | 1;
778 __iommu_flush_cache(iommu, entry, sizeof(*entry));
780 return &context[devfn];
783 static bool attach_deferred(struct device *dev)
785 return dev_iommu_priv_get(dev) == DEFER_DEVICE_DOMAIN_INFO;
789 * is_downstream_to_pci_bridge - test if a device belongs to the PCI
790 * sub-hierarchy of a candidate PCI-PCI bridge
791 * @dev: candidate PCI device belonging to @bridge PCI sub-hierarchy
792 * @bridge: the candidate PCI-PCI bridge
794 * Return: true if @dev belongs to @bridge PCI sub-hierarchy, else false.
797 is_downstream_to_pci_bridge(struct device *dev, struct device *bridge)
799 struct pci_dev *pdev, *pbridge;
801 if (!dev_is_pci(dev) || !dev_is_pci(bridge))
804 pdev = to_pci_dev(dev);
805 pbridge = to_pci_dev(bridge);
807 if (pbridge->subordinate &&
808 pbridge->subordinate->number <= pdev->bus->number &&
809 pbridge->subordinate->busn_res.end >= pdev->bus->number)
815 static bool quirk_ioat_snb_local_iommu(struct pci_dev *pdev)
817 struct dmar_drhd_unit *drhd;
821 /* We know that this device on this chipset has its own IOMMU.
822 * If we find it under a different IOMMU, then the BIOS is lying
823 * to us. Hope that the IOMMU for this device is actually
824 * disabled, and it needs no translation...
826 rc = pci_bus_read_config_dword(pdev->bus, PCI_DEVFN(0, 0), 0xb0, &vtbar);
829 dev_info(&pdev->dev, "failed to run vt-d quirk\n");
834 /* we know that this iommu should be at offset 0xa000 from vtbar */
835 drhd = dmar_find_matched_drhd_unit(pdev);
836 if (!drhd || drhd->reg_base_addr - vtbar != 0xa000) {
837 pr_warn_once(FW_BUG "BIOS assigned incorrect VT-d unit for Intel(R) QuickData Technology device\n");
838 add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
845 static bool iommu_is_dummy(struct intel_iommu *iommu, struct device *dev)
847 if (!iommu || iommu->drhd->ignored)
850 if (dev_is_pci(dev)) {
851 struct pci_dev *pdev = to_pci_dev(dev);
853 if (pdev->vendor == PCI_VENDOR_ID_INTEL &&
854 pdev->device == PCI_DEVICE_ID_INTEL_IOAT_SNB &&
855 quirk_ioat_snb_local_iommu(pdev))
862 struct intel_iommu *device_to_iommu(struct device *dev, u8 *bus, u8 *devfn)
864 struct dmar_drhd_unit *drhd = NULL;
865 struct pci_dev *pdev = NULL;
866 struct intel_iommu *iommu;
874 if (dev_is_pci(dev)) {
875 struct pci_dev *pf_pdev;
877 pdev = pci_real_dma_dev(to_pci_dev(dev));
879 /* VFs aren't listed in scope tables; we need to look up
880 * the PF instead to find the IOMMU. */
881 pf_pdev = pci_physfn(pdev);
883 segment = pci_domain_nr(pdev->bus);
884 } else if (has_acpi_companion(dev))
885 dev = &ACPI_COMPANION(dev)->dev;
888 for_each_iommu(iommu, drhd) {
889 if (pdev && segment != drhd->segment)
892 for_each_active_dev_scope(drhd->devices,
893 drhd->devices_cnt, i, tmp) {
895 /* For a VF use its original BDF# not that of the PF
896 * which we used for the IOMMU lookup. Strictly speaking
897 * we could do this for all PCI devices; we only need to
898 * get the BDF# from the scope table for ACPI matches. */
899 if (pdev && pdev->is_virtfn)
903 *bus = drhd->devices[i].bus;
904 *devfn = drhd->devices[i].devfn;
909 if (is_downstream_to_pci_bridge(dev, tmp))
913 if (pdev && drhd->include_all) {
916 *bus = pdev->bus->number;
917 *devfn = pdev->devfn;
924 if (iommu_is_dummy(iommu, dev))
932 static void domain_flush_cache(struct dmar_domain *domain,
933 void *addr, int size)
935 if (!domain->iommu_coherency)
936 clflush_cache_range(addr, size);
939 static int device_context_mapped(struct intel_iommu *iommu, u8 bus, u8 devfn)
941 struct context_entry *context;
945 spin_lock_irqsave(&iommu->lock, flags);
946 context = iommu_context_addr(iommu, bus, devfn, 0);
948 ret = context_present(context);
949 spin_unlock_irqrestore(&iommu->lock, flags);
953 static void free_context_table(struct intel_iommu *iommu)
957 struct context_entry *context;
959 spin_lock_irqsave(&iommu->lock, flags);
960 if (!iommu->root_entry) {
963 for (i = 0; i < ROOT_ENTRY_NR; i++) {
964 context = iommu_context_addr(iommu, i, 0, 0);
966 free_pgtable_page(context);
968 if (!sm_supported(iommu))
971 context = iommu_context_addr(iommu, i, 0x80, 0);
973 free_pgtable_page(context);
976 free_pgtable_page(iommu->root_entry);
977 iommu->root_entry = NULL;
979 spin_unlock_irqrestore(&iommu->lock, flags);
982 static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
983 unsigned long pfn, int *target_level)
985 struct dma_pte *parent, *pte;
986 int level = agaw_to_level(domain->agaw);
989 BUG_ON(!domain->pgd);
991 if (!domain_pfn_supported(domain, pfn))
992 /* Address beyond IOMMU's addressing capabilities. */
995 parent = domain->pgd;
1000 offset = pfn_level_offset(pfn, level);
1001 pte = &parent[offset];
1002 if (!*target_level && (dma_pte_superpage(pte) || !dma_pte_present(pte)))
1004 if (level == *target_level)
1007 if (!dma_pte_present(pte)) {
1010 tmp_page = alloc_pgtable_page(domain->nid);
1015 domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE);
1016 pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE;
1017 if (domain_use_first_level(domain))
1018 pteval |= DMA_FL_PTE_XD | DMA_FL_PTE_US;
1019 if (cmpxchg64(&pte->val, 0ULL, pteval))
1020 /* Someone else set it while we were thinking; use theirs. */
1021 free_pgtable_page(tmp_page);
1023 domain_flush_cache(domain, pte, sizeof(*pte));
1028 parent = phys_to_virt(dma_pte_addr(pte));
1033 *target_level = level;
1038 /* return the pte for an address at the specified level */
1039 static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain,
1041 int level, int *large_page)
1043 struct dma_pte *parent, *pte;
1044 int total = agaw_to_level(domain->agaw);
1047 parent = domain->pgd;
1048 while (level <= total) {
1049 offset = pfn_level_offset(pfn, total);
1050 pte = &parent[offset];
1054 if (!dma_pte_present(pte)) {
1055 *large_page = total;
1059 if (dma_pte_superpage(pte)) {
1060 *large_page = total;
1064 parent = phys_to_virt(dma_pte_addr(pte));
1070 /* clear last level pte; a tlb flush should follow */
1071 static void dma_pte_clear_range(struct dmar_domain *domain,
1072 unsigned long start_pfn,
1073 unsigned long last_pfn)
1075 unsigned int large_page;
1076 struct dma_pte *first_pte, *pte;
1078 BUG_ON(!domain_pfn_supported(domain, start_pfn));
1079 BUG_ON(!domain_pfn_supported(domain, last_pfn));
1080 BUG_ON(start_pfn > last_pfn);
1082 /* we don't need lock here; nobody else touches the iova range */
1085 first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1, &large_page);
1087 start_pfn = align_to_level(start_pfn + 1, large_page + 1);
1092 start_pfn += lvl_to_nr_pages(large_page);
1094 } while (start_pfn <= last_pfn && !first_pte_in_page(pte));
1096 domain_flush_cache(domain, first_pte,
1097 (void *)pte - (void *)first_pte);
1099 } while (start_pfn && start_pfn <= last_pfn);
1102 static void dma_pte_free_level(struct dmar_domain *domain, int level,
1103 int retain_level, struct dma_pte *pte,
1104 unsigned long pfn, unsigned long start_pfn,
1105 unsigned long last_pfn)
1107 pfn = max(start_pfn, pfn);
1108 pte = &pte[pfn_level_offset(pfn, level)];
1111 unsigned long level_pfn;
1112 struct dma_pte *level_pte;
1114 if (!dma_pte_present(pte) || dma_pte_superpage(pte))
1117 level_pfn = pfn & level_mask(level);
1118 level_pte = phys_to_virt(dma_pte_addr(pte));
1121 dma_pte_free_level(domain, level - 1, retain_level,
1122 level_pte, level_pfn, start_pfn,
1127 * Free the page table if we're below the level we want to
1128 * retain and the range covers the entire table.
1130 if (level < retain_level && !(start_pfn > level_pfn ||
1131 last_pfn < level_pfn + level_size(level) - 1)) {
1133 domain_flush_cache(domain, pte, sizeof(*pte));
1134 free_pgtable_page(level_pte);
1137 pfn += level_size(level);
1138 } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1142 * clear last level (leaf) ptes and free page table pages below the
1143 * level we wish to keep intact.
1145 static void dma_pte_free_pagetable(struct dmar_domain *domain,
1146 unsigned long start_pfn,
1147 unsigned long last_pfn,
1150 BUG_ON(!domain_pfn_supported(domain, start_pfn));
1151 BUG_ON(!domain_pfn_supported(domain, last_pfn));
1152 BUG_ON(start_pfn > last_pfn);
1154 dma_pte_clear_range(domain, start_pfn, last_pfn);
1156 /* We don't need lock here; nobody else touches the iova range */
1157 dma_pte_free_level(domain, agaw_to_level(domain->agaw), retain_level,
1158 domain->pgd, 0, start_pfn, last_pfn);
1161 if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1162 free_pgtable_page(domain->pgd);
1167 /* When a page at a given level is being unlinked from its parent, we don't
1168 need to *modify* it at all. All we need to do is make a list of all the
1169 pages which can be freed just as soon as we've flushed the IOTLB and we
1170 know the hardware page-walk will no longer touch them.
1171 The 'pte' argument is the *parent* PTE, pointing to the page that is to be freed. */
1173 static struct page *dma_pte_list_pagetables(struct dmar_domain *domain,
1174 int level, struct dma_pte *pte,
1175 struct page *freelist)
1179 pg = pfn_to_page(dma_pte_addr(pte) >> PAGE_SHIFT);
1180 pg->freelist = freelist;
1186 pte = page_address(pg);
1188 if (dma_pte_present(pte) && !dma_pte_superpage(pte))
1189 freelist = dma_pte_list_pagetables(domain, level - 1,
1192 } while (!first_pte_in_page(pte));
1197 static struct page *dma_pte_clear_level(struct dmar_domain *domain, int level,
1198 struct dma_pte *pte, unsigned long pfn,
1199 unsigned long start_pfn,
1200 unsigned long last_pfn,
1201 struct page *freelist)
1203 struct dma_pte *first_pte = NULL, *last_pte = NULL;
1205 pfn = max(start_pfn, pfn);
1206 pte = &pte[pfn_level_offset(pfn, level)];
1209 unsigned long level_pfn;
1211 if (!dma_pte_present(pte))
1214 level_pfn = pfn & level_mask(level);
1216 /* If range covers entire pagetable, free it */
1217 if (start_pfn <= level_pfn &&
1218 last_pfn >= level_pfn + level_size(level) - 1) {
1219 /* These subordinate page tables are going away entirely. Don't
1220 bother to clear them; we're just going to *free* them. */
1221 if (level > 1 && !dma_pte_superpage(pte))
1222 freelist = dma_pte_list_pagetables(domain, level - 1, pte, freelist);
1228 } else if (level > 1) {
1229 /* Recurse down into a level that isn't *entirely* obsolete */
1230 freelist = dma_pte_clear_level(domain, level - 1,
1231 phys_to_virt(dma_pte_addr(pte)),
1232 level_pfn, start_pfn, last_pfn,
1236 pfn += level_size(level);
1237 } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1240 domain_flush_cache(domain, first_pte,
1241 (void *)++last_pte - (void *)first_pte);
1246 /* We can't just free the pages because the IOMMU may still be walking
1247 the page tables, and may have cached the intermediate levels. The
1248 pages can only be freed after the IOTLB flush has been done. */
1249 static struct page *domain_unmap(struct dmar_domain *domain,
1250 unsigned long start_pfn,
1251 unsigned long last_pfn,
1252 struct page *freelist)
1254 BUG_ON(!domain_pfn_supported(domain, start_pfn));
1255 BUG_ON(!domain_pfn_supported(domain, last_pfn));
1256 BUG_ON(start_pfn > last_pfn);
1258 /* we don't need lock here; nobody else touches the iova range */
1259 freelist = dma_pte_clear_level(domain, agaw_to_level(domain->agaw),
1260 domain->pgd, 0, start_pfn, last_pfn,
1264 if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1265 struct page *pgd_page = virt_to_page(domain->pgd);
1266 pgd_page->freelist = freelist;
1267 freelist = pgd_page;
1275 static void dma_free_pagelist(struct page *freelist)
1279 while ((pg = freelist)) {
1280 freelist = pg->freelist;
1281 free_pgtable_page(page_address(pg));
1285 /* iommu handling */
1286 static int iommu_alloc_root_entry(struct intel_iommu *iommu)
1288 struct root_entry *root;
1289 unsigned long flags;
1291 root = (struct root_entry *)alloc_pgtable_page(iommu->node);
1293 pr_err("Allocating root entry for %s failed\n",
1298 __iommu_flush_cache(iommu, root, ROOT_SIZE);
1300 spin_lock_irqsave(&iommu->lock, flags);
1301 iommu->root_entry = root;
1302 spin_unlock_irqrestore(&iommu->lock, flags);
1307 static void iommu_set_root_entry(struct intel_iommu *iommu)
1313 addr = virt_to_phys(iommu->root_entry);
1314 if (sm_supported(iommu))
1315 addr |= DMA_RTADDR_SMT;
1317 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1318 dmar_writeq(iommu->reg + DMAR_RTADDR_REG, addr);
1320 writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG);
1322 /* Make sure hardware completes it */
1323 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1324 readl, (sts & DMA_GSTS_RTPS), sts);
1326 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1329 void iommu_flush_write_buffer(struct intel_iommu *iommu)
1334 if (!rwbf_quirk && !cap_rwbf(iommu->cap))
1337 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1338 writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG);
1340 /* Make sure hardware completes it */
1341 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1342 readl, (!(val & DMA_GSTS_WBFS)), val);
1344 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1347 /* return value determines whether we need a write buffer flush */
1348 static void __iommu_flush_context(struct intel_iommu *iommu,
1349 u16 did, u16 source_id, u8 function_mask,
1356 case DMA_CCMD_GLOBAL_INVL:
1357 val = DMA_CCMD_GLOBAL_INVL;
1359 case DMA_CCMD_DOMAIN_INVL:
1360 val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
1362 case DMA_CCMD_DEVICE_INVL:
1363 val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
1364 | DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
1369 val |= DMA_CCMD_ICC;
1371 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1372 dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
1374 /* Make sure hardware completes it */
1375 IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
1376 dmar_readq, (!(val & DMA_CCMD_ICC)), val);
1378 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1381 /* return value determines whether we need a write buffer flush */
1382 static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
1383 u64 addr, unsigned int size_order, u64 type)
1385 int tlb_offset = ecap_iotlb_offset(iommu->ecap);
1386 u64 val = 0, val_iva = 0;
1390 case DMA_TLB_GLOBAL_FLUSH:
1391 /* global flush doesn't need to set IVA_REG */
1392 val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
1394 case DMA_TLB_DSI_FLUSH:
1395 val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1397 case DMA_TLB_PSI_FLUSH:
1398 val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1399 /* IH bit is passed in as part of address */
1400 val_iva = size_order | addr;
1405 /* Note: set drain read/write */
1408 * This is probably meant to be extra safe. It looks like we can
1409 * ignore it without any impact.
1411 if (cap_read_drain(iommu->cap))
1412 val |= DMA_TLB_READ_DRAIN;
1414 if (cap_write_drain(iommu->cap))
1415 val |= DMA_TLB_WRITE_DRAIN;
1417 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1418 /* Note: Only uses first TLB reg currently */
1420 dmar_writeq(iommu->reg + tlb_offset, val_iva);
1421 dmar_writeq(iommu->reg + tlb_offset + 8, val);
1423 /* Make sure hardware completes it */
1424 IOMMU_WAIT_OP(iommu, tlb_offset + 8,
1425 dmar_readq, (!(val & DMA_TLB_IVT)), val);
1427 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1429 /* check IOTLB invalidation granularity */
1430 if (DMA_TLB_IAIG(val) == 0)
1431 pr_err("Flush IOTLB failed\n");
1432 if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
1433 pr_debug("TLB flush request %Lx, actual %Lx\n",
1434 (unsigned long long)DMA_TLB_IIRG(type),
1435 (unsigned long long)DMA_TLB_IAIG(val));
1438 static struct device_domain_info *
1439 iommu_support_dev_iotlb(struct dmar_domain *domain, struct intel_iommu *iommu,
1442 struct device_domain_info *info;
1444 assert_spin_locked(&device_domain_lock);
1449 list_for_each_entry(info, &domain->devices, link)
1450 if (info->iommu == iommu && info->bus == bus &&
1451 info->devfn == devfn) {
1452 if (info->ats_supported && info->dev)
1460 static void domain_update_iotlb(struct dmar_domain *domain)
1462 struct device_domain_info *info;
1463 bool has_iotlb_device = false;
1465 assert_spin_locked(&device_domain_lock);
1467 list_for_each_entry(info, &domain->devices, link) {
1468 struct pci_dev *pdev;
1470 if (!info->dev || !dev_is_pci(info->dev))
1473 pdev = to_pci_dev(info->dev);
1474 if (pdev->ats_enabled) {
1475 has_iotlb_device = true;
1480 domain->has_iotlb_device = has_iotlb_device;
1483 static void iommu_enable_dev_iotlb(struct device_domain_info *info)
1485 struct pci_dev *pdev;
1487 assert_spin_locked(&device_domain_lock);
1489 if (!info || !dev_is_pci(info->dev))
1492 pdev = to_pci_dev(info->dev);
1493 /* For an IOMMU that supports device IOTLB throttling (DIT), we assign
1494 * PFSID to the invalidation desc of a VF such that IOMMU HW can gauge
1495 * queue depth at PF level. If DIT is not set, PFSID will be treated as
1496 * reserved, which should be set to 0.
1498 if (!ecap_dit(info->iommu->ecap))
1501 struct pci_dev *pf_pdev;
1503 /* pdev will be returned if the device is not a VF */
1504 pf_pdev = pci_physfn(pdev);
1505 info->pfsid = pci_dev_id(pf_pdev);
1508 #ifdef CONFIG_INTEL_IOMMU_SVM
1509 /* The PCIe spec, in its wisdom, declares that the behaviour of
1510 the device if you enable PASID support after ATS support is
1511 undefined. So always enable PASID support on devices which
1512 have it, even if we can't yet know if we're ever going to use it. */
1514 if (info->pasid_supported && !pci_enable_pasid(pdev, info->pasid_supported & ~1))
1515 info->pasid_enabled = 1;
1517 if (info->pri_supported &&
1518 (info->pasid_enabled ? pci_prg_resp_pasid_required(pdev) : 1) &&
1519 !pci_reset_pri(pdev) && !pci_enable_pri(pdev, 32))
1520 info->pri_enabled = 1;
1522 if (info->ats_supported && pci_ats_page_aligned(pdev) &&
1523 !pci_enable_ats(pdev, VTD_PAGE_SHIFT)) {
1524 info->ats_enabled = 1;
1525 domain_update_iotlb(info->domain);
1526 info->ats_qdep = pci_ats_queue_depth(pdev);
1530 static void iommu_disable_dev_iotlb(struct device_domain_info *info)
1532 struct pci_dev *pdev;
1534 assert_spin_locked(&device_domain_lock);
1536 if (!dev_is_pci(info->dev))
1539 pdev = to_pci_dev(info->dev);
1541 if (info->ats_enabled) {
1542 pci_disable_ats(pdev);
1543 info->ats_enabled = 0;
1544 domain_update_iotlb(info->domain);
1546 #ifdef CONFIG_INTEL_IOMMU_SVM
1547 if (info->pri_enabled) {
1548 pci_disable_pri(pdev);
1549 info->pri_enabled = 0;
1551 if (info->pasid_enabled) {
1552 pci_disable_pasid(pdev);
1553 info->pasid_enabled = 0;
1558 static void iommu_flush_dev_iotlb(struct dmar_domain *domain,
1559 u64 addr, unsigned mask)
1562 unsigned long flags;
1563 struct device_domain_info *info;
1565 if (!domain->has_iotlb_device)
1568 spin_lock_irqsave(&device_domain_lock, flags);
1569 list_for_each_entry(info, &domain->devices, link) {
1570 if (!info->ats_enabled)
1573 sid = info->bus << 8 | info->devfn;
1574 qdep = info->ats_qdep;
1575 qi_flush_dev_iotlb(info->iommu, sid, info->pfsid,
1578 spin_unlock_irqrestore(&device_domain_lock, flags);
1581 static void domain_flush_piotlb(struct intel_iommu *iommu,
1582 struct dmar_domain *domain,
1583 u64 addr, unsigned long npages, bool ih)
1585 u16 did = domain->iommu_did[iommu->seq_id];
1587 if (domain->default_pasid)
1588 qi_flush_piotlb(iommu, did, domain->default_pasid,
1591 if (!list_empty(&domain->devices))
1592 qi_flush_piotlb(iommu, did, PASID_RID2PASID, addr, npages, ih);
1595 static void iommu_flush_iotlb_psi(struct intel_iommu *iommu,
1596 struct dmar_domain *domain,
1597 unsigned long pfn, unsigned int pages,
1600 unsigned int mask = ilog2(__roundup_pow_of_two(pages));
1601 uint64_t addr = (uint64_t)pfn << VTD_PAGE_SHIFT;
1602 u16 did = domain->iommu_did[iommu->seq_id];
1609 if (domain_use_first_level(domain)) {
1610 domain_flush_piotlb(iommu, domain, addr, pages, ih);
1613 * Fall back to a domain-selective flush if there is no PSI support or
1614 * the size is too big. PSI requires the page size to be 2 ^ x,
1615 * and the base address to be naturally aligned to that size.
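* For example, flushing 3 pages rounds up to 4, so mask = 2 and the
* 4-page naturally aligned region containing addr is invalidated; a
* request whose mask exceeds cap_max_amask_val() falls back to the
* domain-selective flush below.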
1617 if (!cap_pgsel_inv(iommu->cap) ||
1618 mask > cap_max_amask_val(iommu->cap))
1619 iommu->flush.flush_iotlb(iommu, did, 0, 0,
1622 iommu->flush.flush_iotlb(iommu, did, addr | ih, mask,
1627 * In caching mode, changes of pages from non-present to present require
1628 * a flush. However, the device IOTLB doesn't need to be flushed in this case.
1630 if (!cap_caching_mode(iommu->cap) || !map)
1631 iommu_flush_dev_iotlb(domain, addr, mask);
1634 /* Notification for newly created mappings */
1635 static inline void __mapping_notify_one(struct intel_iommu *iommu,
1636 struct dmar_domain *domain,
1637 unsigned long pfn, unsigned int pages)
1640 * It's a non-present to present mapping. Only flush if caching mode is set and the domain uses second-level translation.
1643 if (cap_caching_mode(iommu->cap) && !domain_use_first_level(domain))
1644 iommu_flush_iotlb_psi(iommu, domain, pfn, pages, 0, 1);
1646 iommu_flush_write_buffer(iommu);
1649 static void intel_flush_iotlb_all(struct iommu_domain *domain)
1651 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
1654 for_each_domain_iommu(idx, dmar_domain) {
1655 struct intel_iommu *iommu = g_iommus[idx];
1656 u16 did = dmar_domain->iommu_did[iommu->seq_id];
1658 if (domain_use_first_level(dmar_domain))
1659 domain_flush_piotlb(iommu, dmar_domain, 0, -1, 0);
1661 iommu->flush.flush_iotlb(iommu, did, 0, 0,
1664 if (!cap_caching_mode(iommu->cap))
1665 iommu_flush_dev_iotlb(get_iommu_domain(iommu, did),
1666 0, MAX_AGAW_PFN_WIDTH);
1670 static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
1673 unsigned long flags;
1675 if (!cap_plmr(iommu->cap) && !cap_phmr(iommu->cap))
1678 raw_spin_lock_irqsave(&iommu->register_lock, flags);
1679 pmen = readl(iommu->reg + DMAR_PMEN_REG);
1680 pmen &= ~DMA_PMEN_EPM;
1681 writel(pmen, iommu->reg + DMAR_PMEN_REG);
1683 /* wait for the protected region status bit to clear */
1684 IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
1685 readl, !(pmen & DMA_PMEN_PRS), pmen);
1687 raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1690 static void iommu_enable_translation(struct intel_iommu *iommu)
1693 unsigned long flags;
1695 raw_spin_lock_irqsave(&iommu->register_lock, flags);
1696 iommu->gcmd |= DMA_GCMD_TE;
1697 writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1699 /* Make sure hardware completes it */
1700 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1701 readl, (sts & DMA_GSTS_TES), sts);
1703 raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1706 static void iommu_disable_translation(struct intel_iommu *iommu)
1711 if (iommu_skip_te_disable && iommu->drhd->gfx_dedicated &&
1712 (cap_read_drain(iommu->cap) || cap_write_drain(iommu->cap)))
1715 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1716 iommu->gcmd &= ~DMA_GCMD_TE;
1717 writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1719 /* Make sure hardware completes it */
1720 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1721 readl, (!(sts & DMA_GSTS_TES)), sts);
1723 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1726 static int iommu_init_domains(struct intel_iommu *iommu)
1728 u32 ndomains, nlongs;
1731 ndomains = cap_ndoms(iommu->cap);
1732 pr_debug("%s: Number of Domains supported <%d>\n",
1733 iommu->name, ndomains);
1734 nlongs = BITS_TO_LONGS(ndomains);
1736 spin_lock_init(&iommu->lock);
1738 iommu->domain_ids = kcalloc(nlongs, sizeof(unsigned long), GFP_KERNEL);
1739 if (!iommu->domain_ids) {
1740 pr_err("%s: Allocating domain id array failed\n",
1745 size = (ALIGN(ndomains, 256) >> 8) * sizeof(struct dmar_domain **);
1746 iommu->domains = kzalloc(size, GFP_KERNEL);
1748 if (iommu->domains) {
1749 size = 256 * sizeof(struct dmar_domain *);
1750 iommu->domains[0] = kzalloc(size, GFP_KERNEL);
1753 if (!iommu->domains || !iommu->domains[0]) {
1754 pr_err("%s: Allocating domain array failed\n",
1756 kfree(iommu->domain_ids);
1757 kfree(iommu->domains);
1758 iommu->domain_ids = NULL;
1759 iommu->domains = NULL;
1764 * If Caching mode is set, then invalid translations are tagged
1765 * with domain-id 0, hence we need to pre-allocate it. We also
1766 * use domain-id 0 as a marker for non-allocated domain-id, so
1767 * make sure it is not used for a real domain.
1769 set_bit(0, iommu->domain_ids);
1772 * VT-d spec rev 3.0 (section 6.2.3.1) requires that each pasid
1773 * entry for first-level or pass-through translation modes should
1774 * be programmed with a domain id different from those used for
1775 * second-level or nested translation. We reserve a domain id for this purpose.
1778 if (sm_supported(iommu))
1779 set_bit(FLPT_DEFAULT_DID, iommu->domain_ids);
1784 static void disable_dmar_iommu(struct intel_iommu *iommu)
1786 struct device_domain_info *info, *tmp;
1787 unsigned long flags;
1789 if (!iommu->domains || !iommu->domain_ids)
1792 spin_lock_irqsave(&device_domain_lock, flags);
1793 list_for_each_entry_safe(info, tmp, &device_domain_list, global) {
1794 if (info->iommu != iommu)
1797 if (!info->dev || !info->domain)
1800 __dmar_remove_one_dev_info(info);
1802 spin_unlock_irqrestore(&device_domain_lock, flags);
1804 if (iommu->gcmd & DMA_GCMD_TE)
1805 iommu_disable_translation(iommu);
1808 static void free_dmar_iommu(struct intel_iommu *iommu)
1810 if ((iommu->domains) && (iommu->domain_ids)) {
1811 int elems = ALIGN(cap_ndoms(iommu->cap), 256) >> 8;
1814 for (i = 0; i < elems; i++)
1815 kfree(iommu->domains[i]);
1816 kfree(iommu->domains);
1817 kfree(iommu->domain_ids);
1818 iommu->domains = NULL;
1819 iommu->domain_ids = NULL;
1822 g_iommus[iommu->seq_id] = NULL;
1824 /* free context mapping */
1825 free_context_table(iommu);
1827 #ifdef CONFIG_INTEL_IOMMU_SVM
1828 if (pasid_supported(iommu)) {
1829 if (ecap_prs(iommu->ecap))
1830 intel_svm_finish_prq(iommu);
1832 if (vccap_pasid(iommu->vccap))
1833 ioasid_unregister_allocator(&iommu->pasid_allocator);
1839 * Check and return whether first level is used by default for DMA translation.
1842 static bool first_level_by_default(void)
1844 struct dmar_drhd_unit *drhd;
1845 struct intel_iommu *iommu;
1846 static int first_level_support = -1;
1848 if (likely(first_level_support != -1))
1849 return first_level_support;
1851 first_level_support = 1;
1854 for_each_active_iommu(iommu, drhd) {
1855 if (!sm_supported(iommu) || !ecap_flts(iommu->ecap)) {
1856 first_level_support = 0;
1862 return first_level_support;
1865 static struct dmar_domain *alloc_domain(int flags)
1867 struct dmar_domain *domain;
1869 domain = alloc_domain_mem();
1873 memset(domain, 0, sizeof(*domain));
1874 domain->nid = NUMA_NO_NODE;
1875 domain->flags = flags;
1876 if (first_level_by_default())
1877 domain->flags |= DOMAIN_FLAG_USE_FIRST_LEVEL;
1878 domain->has_iotlb_device = false;
1879 INIT_LIST_HEAD(&domain->devices);
1884 /* Must be called with iommu->lock */
1885 static int domain_attach_iommu(struct dmar_domain *domain,
1886 struct intel_iommu *iommu)
1888 unsigned long ndomains;
1891 assert_spin_locked(&device_domain_lock);
1892 assert_spin_locked(&iommu->lock);
1894 domain->iommu_refcnt[iommu->seq_id] += 1;
1895 domain->iommu_count += 1;
1896 if (domain->iommu_refcnt[iommu->seq_id] == 1) {
1897 ndomains = cap_ndoms(iommu->cap);
1898 num = find_first_zero_bit(iommu->domain_ids, ndomains);
1900 if (num >= ndomains) {
1901 pr_err("%s: No free domain ids\n", iommu->name);
1902 domain->iommu_refcnt[iommu->seq_id] -= 1;
1903 domain->iommu_count -= 1;
1907 set_bit(num, iommu->domain_ids);
1908 set_iommu_domain(iommu, num, domain);
1910 domain->iommu_did[iommu->seq_id] = num;
1911 domain->nid = iommu->node;
1913 domain_update_iommu_cap(domain);
1919 static int domain_detach_iommu(struct dmar_domain *domain,
1920 struct intel_iommu *iommu)
1924 assert_spin_locked(&device_domain_lock);
1925 assert_spin_locked(&iommu->lock);
1927 domain->iommu_refcnt[iommu->seq_id] -= 1;
1928 count = --domain->iommu_count;
1929 if (domain->iommu_refcnt[iommu->seq_id] == 0) {
1930 num = domain->iommu_did[iommu->seq_id];
1931 clear_bit(num, iommu->domain_ids);
1932 set_iommu_domain(iommu, num, NULL);
1934 domain_update_iommu_cap(domain);
1935 domain->iommu_did[iommu->seq_id] = 0;
1941 static inline int guestwidth_to_adjustwidth(int gaw)
1944 int r = (gaw - 12) % 9;
1955 static void domain_exit(struct dmar_domain *domain)
1958 /* Remove associated devices and clear attached or cached domains */
1959 domain_remove_dev_info(domain);
1962 if (domain->domain.type == IOMMU_DOMAIN_DMA)
1963 iommu_put_dma_cookie(&domain->domain);
1966 struct page *freelist;
1968 freelist = domain_unmap(domain, 0,
1969 DOMAIN_MAX_PFN(domain->gaw), NULL);
1970 dma_free_pagelist(freelist);
1973 free_domain_mem(domain);
1977 * Get the PASID directory size for scalable mode context entry.
1978 * Value of X in the PDTS field of a scalable mode context entry
1979 * indicates PASID directory with 2^(X + 7) entries.
1981 static inline unsigned long context_get_sm_pds(struct pasid_table *table)
1985 max_pde = table->max_pasid >> PASID_PDE_SHIFT;
1986 pds = find_first_bit((unsigned long *)&max_pde, MAX_NR_PASID_BITS);
1994 * Set the RID_PASID field of a scalable mode context entry. The
1995 * IOMMU hardware will use the PASID value set in this field for
1996 * DMA translations of DMA requests without PASID.
1999 context_set_sm_rid2pasid(struct context_entry *context, unsigned long pasid)
2001 context->hi |= pasid & ((1 << 20) - 1);
2005 * Set the DTE (Device-TLB Enable) field of a scalable mode context entry.
2008 static inline void context_set_sm_dte(struct context_entry *context)
2010 context->lo |= (1 << 2);
2014 * Set the PRE (Page Request Enable) field of a scalable mode context entry.
2017 static inline void context_set_sm_pre(struct context_entry *context)
2019 context->lo |= (1 << 4);
2022 /* Convert value to context PASID directory size field coding. */
2023 #define context_pdts(pds) (((pds) & 0x7) << 9)
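/*
 * For example, assuming each PASID-directory entry covers 64 PASID-table
 * entries (PASID_PDE_SHIFT == 6), covering the full 20-bit PASID space
 * needs a 2^14-entry directory, so context_get_sm_pds() returns 7 and
 * context_pdts(7) encodes that value into bits 11:9 of the low qword.
 */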
2025 static int domain_context_mapping_one(struct dmar_domain *domain,
2026 struct intel_iommu *iommu,
2027 struct pasid_table *table,
2030 u16 did = domain->iommu_did[iommu->seq_id];
2031 int translation = CONTEXT_TT_MULTI_LEVEL;
2032 struct device_domain_info *info = NULL;
2033 struct context_entry *context;
2034 unsigned long flags;
2039 if (hw_pass_through && domain_type_is_si(domain))
2040 translation = CONTEXT_TT_PASS_THROUGH;
2042 pr_debug("Set context mapping for %02x:%02x.%d\n",
2043 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
2045 BUG_ON(!domain->pgd);
2047 spin_lock_irqsave(&device_domain_lock, flags);
2048 spin_lock(&iommu->lock);
2051 context = iommu_context_addr(iommu, bus, devfn, 1);
2056 if (context_present(context))
2060 * For kdump cases, old valid entries may be cached due to the
2061 * in-flight DMA and copied pgtable, but there is no unmapping
2062 * behaviour for them, thus we need an explicit cache flush for
2063 * the newly-mapped device. For kdump, at this point, the device
2064 * is supposed to have finished reset at its driver probe stage, so no
2065 * in-flight DMA will exist, and we don't need to worry about it any further.
2068 if (context_copied(context)) {
2069 u16 did_old = context_domain_id(context);
2071 if (did_old < cap_ndoms(iommu->cap)) {
2072 iommu->flush.flush_context(iommu, did_old,
2073 (((u16)bus) << 8) | devfn,
2074 DMA_CCMD_MASK_NOBIT,
2075 DMA_CCMD_DEVICE_INVL);
2076 iommu->flush.flush_iotlb(iommu, did_old, 0, 0,
2081 context_clear_entry(context);
2083 if (sm_supported(iommu)) {
2088 /* Setup the PASID DIR pointer: */
2089 pds = context_get_sm_pds(table);
2090 context->lo = (u64)virt_to_phys(table->table) |
2093 /* Setup the RID_PASID field: */
2094 context_set_sm_rid2pasid(context, PASID_RID2PASID);
2097 * Setup the Device-TLB enable bit and Page request enable bit:
2100 info = iommu_support_dev_iotlb(domain, iommu, bus, devfn);
2101 if (info && info->ats_supported)
2102 context_set_sm_dte(context);
2103 if (info && info->pri_supported)
2104 context_set_sm_pre(context);
2106 struct dma_pte *pgd = domain->pgd;
2109 context_set_domain_id(context, did);
2111 if (translation != CONTEXT_TT_PASS_THROUGH) {
2113 * Skip top levels of page tables for an iommu that has
2114 * a smaller agaw than the default. Unnecessary for PT mode.
2116 for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) {
2118 pgd = phys_to_virt(dma_pte_addr(pgd));
2119 if (!dma_pte_present(pgd))
2123 info = iommu_support_dev_iotlb(domain, iommu, bus, devfn);
2124 if (info && info->ats_supported)
2125 translation = CONTEXT_TT_DEV_IOTLB;
2127 translation = CONTEXT_TT_MULTI_LEVEL;
2129 context_set_address_root(context, virt_to_phys(pgd));
2130 context_set_address_width(context, agaw);
2133 * In pass through mode, AW must be programmed to
2134 * indicate the largest AGAW value supported by
2135 * hardware. And ASR is ignored by hardware.
2137 context_set_address_width(context, iommu->msagaw);
2140 context_set_translation_type(context, translation);
2143 context_set_fault_enable(context);
2144 context_set_present(context);
2145 if (!ecap_coherent(iommu->ecap))
2146 clflush_cache_range(context, sizeof(*context));
2149 * It's a non-present to present mapping. If hardware doesn't cache
2150 * non-present entries we only need to flush the write-buffer. If it
2151 * _does_ cache non-present entries, then it does so in the special
2152 * domain #0, which we have to flush:
2154 if (cap_caching_mode(iommu->cap)) {
2155 iommu->flush.flush_context(iommu, 0,
2156 (((u16)bus) << 8) | devfn,
2157 DMA_CCMD_MASK_NOBIT,
2158 DMA_CCMD_DEVICE_INVL);
2159 iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH);
2161 iommu_flush_write_buffer(iommu);
2163 iommu_enable_dev_iotlb(info);
2168 spin_unlock(&iommu->lock);
2169 spin_unlock_irqrestore(&device_domain_lock, flags);
2174 struct domain_context_mapping_data {
2175 struct dmar_domain *domain;
2176 struct intel_iommu *iommu;
2177 struct pasid_table *table;
2180 static int domain_context_mapping_cb(struct pci_dev *pdev,
2181 u16 alias, void *opaque)
2183 struct domain_context_mapping_data *data = opaque;
2185 return domain_context_mapping_one(data->domain, data->iommu,
2186 data->table, PCI_BUS_NUM(alias),
2191 domain_context_mapping(struct dmar_domain *domain, struct device *dev)
2193 struct domain_context_mapping_data data;
2194 struct pasid_table *table;
2195 struct intel_iommu *iommu;
2198 iommu = device_to_iommu(dev, &bus, &devfn);
2202 table = intel_pasid_get_table(dev);
2204 if (!dev_is_pci(dev))
2205 return domain_context_mapping_one(domain, iommu, table,
2208 data.domain = domain;
2212 return pci_for_each_dma_alias(to_pci_dev(dev),
2213 &domain_context_mapping_cb, &data);
2216 static int domain_context_mapped_cb(struct pci_dev *pdev,
2217 u16 alias, void *opaque)
2219 struct intel_iommu *iommu = opaque;
2221 return !device_context_mapped(iommu, PCI_BUS_NUM(alias), alias & 0xff);
2224 static int domain_context_mapped(struct device *dev)
2226 struct intel_iommu *iommu;
2229 iommu = device_to_iommu(dev, &bus, &devfn);
2233 if (!dev_is_pci(dev))
2234 return device_context_mapped(iommu, bus, devfn);
2236 return !pci_for_each_dma_alias(to_pci_dev(dev),
2237 domain_context_mapped_cb, iommu);
2240 /* Returns the number of VT-d pages, but aligned to the MM page size */
2241 static inline unsigned long aligned_nrpages(unsigned long host_addr,
2244 host_addr &= ~PAGE_MASK;
2245 return PAGE_ALIGN(host_addr + size) >> VTD_PAGE_SHIFT;
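/*
 * For example, a buffer starting at offset 0x800 within a 4KiB MM page
 * and 0x1000 bytes long yields PAGE_ALIGN(0x800 + 0x1000) >> VTD_PAGE_SHIFT
 * = 0x2000 >> 12 = 2 VT-d pages.
 */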
2248 /* Return largest possible superpage level for a given mapping */
2249 static inline int hardware_largepage_caps(struct dmar_domain *domain,
2250 unsigned long iov_pfn,
2251 unsigned long phy_pfn,
2252 unsigned long pages)
2254 int support, level = 1;
2255 unsigned long pfnmerge;
2257 support = domain->iommu_superpage;
2259 /* To use a large page, the virtual *and* physical addresses
2260 must be aligned to 2MiB/1GiB/etc. Lower bits set in either
2261 of them will mean we have to use smaller pages. So just
2262 merge them and check both at once. */
2263 pfnmerge = iov_pfn | phy_pfn;
2265 while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) {
2266 pages >>= VTD_STRIDE_SHIFT;
2269 pfnmerge >>= VTD_STRIDE_SHIFT;
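/*
 * Each VTD_STRIDE_SHIFT (9-bit) step above corresponds to one superpage
 * level: level 2 covers 2MiB, level 3 covers 1GiB (with 4KiB base pages).
 * For example, mapping 1024 pages with both iov_pfn and phy_pfn 2MiB
 * aligned (low 9 bits clear) yields level 2, provided
 * domain->iommu_superpage allows it.
 */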
2277 __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2278 unsigned long phys_pfn, unsigned long nr_pages, int prot)
2280 struct dma_pte *first_pte = NULL, *pte = NULL;
2281 unsigned int largepage_lvl = 0;
2282 unsigned long lvl_pages = 0;
2286 BUG_ON(!domain_pfn_supported(domain, iov_pfn + nr_pages - 1));
2288 if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
2291 attr = prot & (DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP);
2292 if (domain_use_first_level(domain))
2293 attr |= DMA_FL_PTE_PRESENT | DMA_FL_PTE_XD | DMA_FL_PTE_US;
2295 pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | attr;
2297 while (nr_pages > 0) {
2301 largepage_lvl = hardware_largepage_caps(domain, iov_pfn,
2302 phys_pfn, nr_pages);
2304 first_pte = pte = pfn_to_dma_pte(domain, iov_pfn, &largepage_lvl);
2307 /* It is a large page */
2308 if (largepage_lvl > 1) {
2309 unsigned long nr_superpages, end_pfn;
2311 pteval |= DMA_PTE_LARGE_PAGE;
2312 lvl_pages = lvl_to_nr_pages(largepage_lvl);
2314 nr_superpages = nr_pages / lvl_pages;
2315 end_pfn = iov_pfn + nr_superpages * lvl_pages - 1;
2318 * Ensure that old small page tables are
2319 * removed to make room for superpage(s).
2320 * We're adding new large pages, so make sure
2321 * we don't remove their parent tables.
2323 dma_pte_free_pagetable(domain, iov_pfn, end_pfn,
2326 pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE;
2330 /* We don't need lock here, nobody else
2331 * touches the iova range
2333 tmp = cmpxchg64_local(&pte->val, 0ULL, pteval);
2335 static int dumps = 5;
2336 pr_crit("ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n",
2337 iov_pfn, tmp, (unsigned long long)pteval);
2340 debug_dma_dump_mappings(NULL);
2345 lvl_pages = lvl_to_nr_pages(largepage_lvl);
2347 BUG_ON(nr_pages < lvl_pages);
2349 nr_pages -= lvl_pages;
2350 iov_pfn += lvl_pages;
2351 phys_pfn += lvl_pages;
2352 pteval += lvl_pages * VTD_PAGE_SIZE;
2354 /* If the next PTE would be the first in a new page, then we
2355 * need to flush the cache on the entries we've just written.
2356 * And then we'll need to recalculate 'pte', so clear it and
2357 * let it get set again in the if (!pte) block above.
2359 * If we're done (!nr_pages) we need to flush the cache too.
2361 * Also if we've been setting superpages, we may need to
2362 * recalculate 'pte' and switch back to smaller pages for the
2363 * end of the mapping, if the trailing size is not enough to
2364 * use another superpage (i.e. nr_pages < lvl_pages).
2367 if (!nr_pages || first_pte_in_page(pte) ||
2368 (largepage_lvl > 1 && nr_pages < lvl_pages)) {
2369 domain_flush_cache(domain, first_pte,
2370 (void *)pte - (void *)first_pte);
2379 domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2380 unsigned long phys_pfn, unsigned long nr_pages, int prot)
2383 struct intel_iommu *iommu;
2385 /* Do the real mapping first */
2386 ret = __domain_mapping(domain, iov_pfn, phys_pfn, nr_pages, prot);
2390 for_each_domain_iommu(iommu_id, domain) {
2391 iommu = g_iommus[iommu_id];
2392 __mapping_notify_one(iommu, domain, iov_pfn, nr_pages);
2398 static void domain_context_clear_one(struct intel_iommu *iommu, u8 bus, u8 devfn)
2400 unsigned long flags;
2401 struct context_entry *context;
2407 spin_lock_irqsave(&iommu->lock, flags);
2408 context = iommu_context_addr(iommu, bus, devfn, 0);
2410 spin_unlock_irqrestore(&iommu->lock, flags);
2413 did_old = context_domain_id(context);
2414 context_clear_entry(context);
2415 __iommu_flush_cache(iommu, context, sizeof(*context));
2416 spin_unlock_irqrestore(&iommu->lock, flags);
2417 iommu->flush.flush_context(iommu,
2419 (((u16)bus) << 8) | devfn,
2420 DMA_CCMD_MASK_NOBIT,
2421 DMA_CCMD_DEVICE_INVL);
2422 iommu->flush.flush_iotlb(iommu,
2429 static inline void unlink_domain_info(struct device_domain_info *info)
2431 assert_spin_locked(&device_domain_lock);
2432 list_del(&info->link);
2433 list_del(&info->global);
2435 dev_iommu_priv_set(info->dev, NULL);
2438 static void domain_remove_dev_info(struct dmar_domain *domain)
2440 struct device_domain_info *info, *tmp;
2441 unsigned long flags;
2443 spin_lock_irqsave(&device_domain_lock, flags);
2444 list_for_each_entry_safe(info, tmp, &domain->devices, link)
2445 __dmar_remove_one_dev_info(info);
2446 spin_unlock_irqrestore(&device_domain_lock, flags);
2449 struct dmar_domain *find_domain(struct device *dev)
2451 struct device_domain_info *info;
2453 if (unlikely(!dev || !dev->iommu))
2456 if (unlikely(attach_deferred(dev)))
2459 /* No lock here, assumes no domain exit in normal case */
2460 info = get_domain_info(dev);
2462 return info->domain;
2467 static inline struct device_domain_info *
2468 dmar_search_domain_by_dev_info(int segment, int bus, int devfn)
2470 struct device_domain_info *info;
2472 list_for_each_entry(info, &device_domain_list, global)
2473 if (info->segment == segment && info->bus == bus &&
2474 info->devfn == devfn)
2480 static int domain_setup_first_level(struct intel_iommu *iommu,
2481 struct dmar_domain *domain,
2485 int flags = PASID_FLAG_SUPERVISOR_MODE;
2486 struct dma_pte *pgd = domain->pgd;
2490 * Skip top levels of page tables for IOMMUs which have
2491 * less agaw than the default. Unnecessary for PT mode.
2493 for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) {
2494 pgd = phys_to_virt(dma_pte_addr(pgd));
2495 if (!dma_pte_present(pgd))
2499 level = agaw_to_level(agaw);
2500 if (level != 4 && level != 5)
2503 flags |= (level == 5) ? PASID_FLAG_FL5LP : 0;
2505 return intel_pasid_setup_first_level(iommu, dev, (pgd_t *)pgd, pasid,
2506 domain->iommu_did[iommu->seq_id],
2510 static bool dev_is_real_dma_subdevice(struct device *dev)
2512 return dev && dev_is_pci(dev) &&
2513 pci_real_dma_dev(to_pci_dev(dev)) != to_pci_dev(dev);
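/*
 * Editor's note (illustrative, not driver text): this is true for
 * sub-devices whose DMA is issued with another function's requester ID,
 * e.g. devices behind an Intel VMD controller, where pci_real_dma_dev()
 * returns the VMD endpoint rather than the child device itself.
 */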
2516 static struct dmar_domain *dmar_insert_one_dev_info(struct intel_iommu *iommu,
2519 struct dmar_domain *domain)
2521 struct dmar_domain *found = NULL;
2522 struct device_domain_info *info;
2523 unsigned long flags;
2526 info = alloc_devinfo_mem();
2530 if (!dev_is_real_dma_subdevice(dev)) {
2532 info->devfn = devfn;
2533 info->segment = iommu->segment;
2535 struct pci_dev *pdev = to_pci_dev(dev);
2537 info->bus = pdev->bus->number;
2538 info->devfn = pdev->devfn;
2539 info->segment = pci_domain_nr(pdev->bus);
2542 info->ats_supported = info->pasid_supported = info->pri_supported = 0;
2543 info->ats_enabled = info->pasid_enabled = info->pri_enabled = 0;
2546 info->domain = domain;
2547 info->iommu = iommu;
2548 info->pasid_table = NULL;
2549 info->auxd_enabled = 0;
2550 INIT_LIST_HEAD(&info->auxiliary_domains);
2552 if (dev && dev_is_pci(dev)) {
2553 struct pci_dev *pdev = to_pci_dev(info->dev);
2555 if (ecap_dev_iotlb_support(iommu->ecap) &&
2556 pci_ats_supported(pdev) &&
2557 dmar_find_matched_atsr_unit(pdev))
2558 info->ats_supported = 1;
2560 if (sm_supported(iommu)) {
2561 if (pasid_supported(iommu)) {
2562 int features = pci_pasid_features(pdev);
2564 info->pasid_supported = features | 1;
2567 if (info->ats_supported && ecap_prs(iommu->ecap) &&
2568 pci_pri_supported(pdev))
2569 info->pri_supported = 1;
2573 spin_lock_irqsave(&device_domain_lock, flags);
2575 found = find_domain(dev);
2578 struct device_domain_info *info2;
2579 info2 = dmar_search_domain_by_dev_info(info->segment, info->bus,
2582 found = info2->domain;
2588 spin_unlock_irqrestore(&device_domain_lock, flags);
2589 free_devinfo_mem(info);
2590 /* Caller must free the original domain */
2594 spin_lock(&iommu->lock);
2595 ret = domain_attach_iommu(domain, iommu);
2596 spin_unlock(&iommu->lock);
2599 spin_unlock_irqrestore(&device_domain_lock, flags);
2600 free_devinfo_mem(info);
2604 list_add(&info->link, &domain->devices);
2605 list_add(&info->global, &device_domain_list);
2607 dev_iommu_priv_set(dev, info);
2608 spin_unlock_irqrestore(&device_domain_lock, flags);
2610 /* PASID table is mandatory for a PCI device in scalable mode. */
2611 if (dev && dev_is_pci(dev) && sm_supported(iommu)) {
2612 ret = intel_pasid_alloc_table(dev);
2614 dev_err(dev, "PASID table allocation failed\n");
2615 dmar_remove_one_dev_info(dev);
2619 /* Setup the PASID entry for requests without PASID: */
2620 spin_lock_irqsave(&iommu->lock, flags);
2621 if (hw_pass_through && domain_type_is_si(domain))
2622 ret = intel_pasid_setup_pass_through(iommu, domain,
2623 dev, PASID_RID2PASID);
2624 else if (domain_use_first_level(domain))
2625 ret = domain_setup_first_level(iommu, domain, dev,
2628 ret = intel_pasid_setup_second_level(iommu, domain,
2629 dev, PASID_RID2PASID);
2630 spin_unlock_irqrestore(&iommu->lock, flags);
2632 dev_err(dev, "Setup RID2PASID failed\n");
2633 dmar_remove_one_dev_info(dev);
2638 if (dev && domain_context_mapping(domain, dev)) {
2639 dev_err(dev, "Domain context map failed\n");
2640 dmar_remove_one_dev_info(dev);
2647 static int iommu_domain_identity_map(struct dmar_domain *domain,
2648 unsigned long first_vpfn,
2649 unsigned long last_vpfn)
2652 * RMRR range might have overlap with physical memory range; clear it first.
2655 dma_pte_clear_range(domain, first_vpfn, last_vpfn);
2657 return __domain_mapping(domain, first_vpfn,
2658 first_vpfn, last_vpfn - first_vpfn + 1,
2659 DMA_PTE_READ|DMA_PTE_WRITE);
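/*
 * Editor's note (illustrative, not driver text): "identity map" here means
 * the I/O virtual PFN equals the physical PFN, so a range such as
 * [0x80000000, 0x80200000) is mapped so that DMA address 0x80000000
 * reaches physical address 0x80000000.
 */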
2662 static int md_domain_init(struct dmar_domain *domain, int guest_width);
2664 static int __init si_domain_init(int hw)
2666 struct dmar_rmrr_unit *rmrr;
2670 si_domain = alloc_domain(DOMAIN_FLAG_STATIC_IDENTITY);
2674 if (md_domain_init(si_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
2675 domain_exit(si_domain);
2682 for_each_online_node(nid) {
2683 unsigned long start_pfn, end_pfn;
2686 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
2687 ret = iommu_domain_identity_map(si_domain,
2688 mm_to_dma_pfn(start_pfn),
2689 mm_to_dma_pfn(end_pfn));
2696 * Identity map the RMRRs so that devices with RMRRs could also use the si_domain.
2699 for_each_rmrr_units(rmrr) {
2700 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
2702 unsigned long long start = rmrr->base_address;
2703 unsigned long long end = rmrr->end_address;
2705 if (WARN_ON(end < start ||
2706 end >> agaw_to_width(si_domain->agaw)))
2709 ret = iommu_domain_identity_map(si_domain,
2710 mm_to_dma_pfn(start >> PAGE_SHIFT),
2711 mm_to_dma_pfn(end >> PAGE_SHIFT));
2720 static int domain_add_dev_info(struct dmar_domain *domain, struct device *dev)
2722 struct dmar_domain *ndomain;
2723 struct intel_iommu *iommu;
2726 iommu = device_to_iommu(dev, &bus, &devfn);
2730 ndomain = dmar_insert_one_dev_info(iommu, bus, devfn, dev, domain);
2731 if (ndomain != domain)
2737 static bool device_has_rmrr(struct device *dev)
2739 struct dmar_rmrr_unit *rmrr;
2744 for_each_rmrr_units(rmrr) {
2746 * Return TRUE if this RMRR contains the device that is passed in.
2749 for_each_active_dev_scope(rmrr->devices,
2750 rmrr->devices_cnt, i, tmp)
2752 is_downstream_to_pci_bridge(dev, tmp)) {
2762 * device_rmrr_is_relaxable - Test whether the RMRR of this device
2763 * is relaxable (ie. is allowed to be not enforced under some conditions)
2764 * @dev: device handle
2766 * We assume that PCI USB devices with RMRRs have them largely
2767 * for historical reasons and that the RMRR space is not actively used post
2768 * boot. This exclusion may change if vendors begin to abuse it.
2770 * The same exception is made for graphics devices, with the requirement that
2771 * any use of the RMRR regions will be torn down before assigning the device to a guest.
2774 * Return: true if the RMRR is relaxable, false otherwise
2776 static bool device_rmrr_is_relaxable(struct device *dev)
2778 struct pci_dev *pdev;
2780 if (!dev_is_pci(dev))
2783 pdev = to_pci_dev(dev);
2784 if (IS_USB_DEVICE(pdev) || IS_GFX_DEVICE(pdev))
2791 * There are a couple cases where we need to restrict the functionality of
2792 * devices associated with RMRRs. The first is when evaluating a device for
2793 * identity mapping because problems exist when devices are moved in and out
2794 * of domains and their respective RMRR information is lost. This means that
2795 * a device with associated RMRRs will never be in a "passthrough" domain.
2796 * The second is use of the device through the IOMMU API. This interface
2797 * expects to have full control of the IOVA space for the device. We cannot
2798 * satisfy both the requirement that RMRR access is maintained and have an
2799 * unencumbered IOVA space. We also have no ability to quiesce the device's
2800 * use of the RMRR space or even inform the IOMMU API user of the restriction.
2801 * We therefore prevent devices associated with an RMRR from participating in
2802 * the IOMMU API, which eliminates them from device assignment.
2804 * In both cases, devices which have relaxable RMRRs are not concerned by this
2805 * restriction. See device_rmrr_is_relaxable comment.
2807 static bool device_is_rmrr_locked(struct device *dev)
2809 if (!device_has_rmrr(dev))
2812 if (device_rmrr_is_relaxable(dev))
2819 * Return the required default domain type for a specific device.
2821 * @dev: the device in question
2825 * - IOMMU_DOMAIN_DMA: device requires a dynamic mapping domain
2826 * - IOMMU_DOMAIN_IDENTITY: device requires an identity mapping domain
2827 * - 0: both identity and dynamic domains work for this device
2829 static int device_def_domain_type(struct device *dev)
2831 if (dev_is_pci(dev)) {
2832 struct pci_dev *pdev = to_pci_dev(dev);
2834 if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev))
2835 return IOMMU_DOMAIN_IDENTITY;
2837 if ((iommu_identity_mapping & IDENTMAP_GFX) && IS_GFX_DEVICE(pdev))
2838 return IOMMU_DOMAIN_IDENTITY;
2844 static void intel_iommu_init_qi(struct intel_iommu *iommu)
2847 * Start from a sane IOMMU hardware state.
2848 * If queued invalidation was already initialized by us
2849 * (for example, while enabling interrupt remapping), then
2850 * things are already rolling from a sane state.
2854 * Clear any previous faults.
2856 dmar_fault(-1, iommu);
2858 * Disable queued invalidation if supported and already enabled
2859 * before OS handover.
2861 dmar_disable_qi(iommu);
2864 if (dmar_enable_qi(iommu)) {
2866 * Queued invalidation is not enabled; use register-based invalidation
2868 iommu->flush.flush_context = __iommu_flush_context;
2869 iommu->flush.flush_iotlb = __iommu_flush_iotlb;
2870 pr_info("%s: Using Register based invalidation\n",
2873 iommu->flush.flush_context = qi_flush_context;
2874 iommu->flush.flush_iotlb = qi_flush_iotlb;
2875 pr_info("%s: Using Queued invalidation\n", iommu->name);
2879 static int copy_context_table(struct intel_iommu *iommu,
2880 struct root_entry *old_re,
2881 struct context_entry **tbl,
2884 int tbl_idx, pos = 0, idx, devfn, ret = 0, did;
2885 struct context_entry *new_ce = NULL, ce;
2886 struct context_entry *old_ce = NULL;
2887 struct root_entry re;
2888 phys_addr_t old_ce_phys;
2890 tbl_idx = ext ? bus * 2 : bus;
2891 memcpy(&re, old_re, sizeof(re));
2893 for (devfn = 0; devfn < 256; devfn++) {
2894 /* First calculate the correct index */
2895 idx = (ext ? devfn * 2 : devfn) % 256;
2898 /* First save what we may have and clean up */
2900 tbl[tbl_idx] = new_ce;
2901 __iommu_flush_cache(iommu, new_ce,
2911 old_ce_phys = root_entry_lctp(&re);
2913 old_ce_phys = root_entry_uctp(&re);
2916 if (ext && devfn == 0) {
2917 /* No LCTP, try UCTP */
2926 old_ce = memremap(old_ce_phys, PAGE_SIZE,
2931 new_ce = alloc_pgtable_page(iommu->node);
2938 /* Now copy the context entry */
2939 memcpy(&ce, old_ce + idx, sizeof(ce));
2941 if (!__context_present(&ce))
2944 did = context_domain_id(&ce);
2945 if (did >= 0 && did < cap_ndoms(iommu->cap))
2946 set_bit(did, iommu->domain_ids);
2949 * We need a marker for copied context entries. This
2950 * marker needs to work for the old format as well as
2951 * for extended context entries.
2953 * Bit 67 of the context entry is used. In the old
2954 * format this bit is available to software, in the
2955 * extended format it is the PGE bit, but PGE is ignored
2956 * by HW if PASIDs are disabled (and thus still available).
2959 * So disable PASIDs first and then mark the entry
2960 * copied. This means that we don't copy PASID
2961 * translations from the old kernel, but this is fine as
2962 * faults there are not fatal.
2964 context_clear_pasid_enable(&ce);
2965 context_set_copied(&ce);
2970 tbl[tbl_idx + pos] = new_ce;
2972 __iommu_flush_cache(iommu, new_ce, VTD_PAGE_SIZE);
2981 static int copy_translation_tables(struct intel_iommu *iommu)
2983 struct context_entry **ctxt_tbls;
2984 struct root_entry *old_rt;
2985 phys_addr_t old_rt_phys;
2986 int ctxt_table_entries;
2987 unsigned long flags;
2992 rtaddr_reg = dmar_readq(iommu->reg + DMAR_RTADDR_REG);
2993 ext = !!(rtaddr_reg & DMA_RTADDR_RTT);
2994 new_ext = !!ecap_ecs(iommu->ecap);
2997 * The RTT bit can only be changed when translation is disabled,
2998 * but disabling translation means opening a window for data
2999 * corruption. So bail out and don't copy anything if we would
3000 * have to change the bit.
3005 old_rt_phys = rtaddr_reg & VTD_PAGE_MASK;
3009 old_rt = memremap(old_rt_phys, PAGE_SIZE, MEMREMAP_WB);
3013 /* This is too big for the stack - allocate it from slab */
3014 ctxt_table_entries = ext ? 512 : 256;
3016 ctxt_tbls = kcalloc(ctxt_table_entries, sizeof(void *), GFP_KERNEL);
3020 for (bus = 0; bus < 256; bus++) {
3021 ret = copy_context_table(iommu, &old_rt[bus],
3022 ctxt_tbls, bus, ext);
3024 pr_err("%s: Failed to copy context table for bus %d\n",
3030 spin_lock_irqsave(&iommu->lock, flags);
3032 /* Context tables are copied, now write them to the root_entry table */
3033 for (bus = 0; bus < 256; bus++) {
3034 int idx = ext ? bus * 2 : bus;
3037 if (ctxt_tbls[idx]) {
3038 val = virt_to_phys(ctxt_tbls[idx]) | 1;
3039 iommu->root_entry[bus].lo = val;
3042 if (!ext || !ctxt_tbls[idx + 1])
3045 val = virt_to_phys(ctxt_tbls[idx + 1]) | 1;
3046 iommu->root_entry[bus].hi = val;
3049 spin_unlock_irqrestore(&iommu->lock, flags);
3053 __iommu_flush_cache(iommu, iommu->root_entry, PAGE_SIZE);
3063 #ifdef CONFIG_INTEL_IOMMU_SVM
3064 static ioasid_t intel_vcmd_ioasid_alloc(ioasid_t min, ioasid_t max, void *data)
3066 struct intel_iommu *iommu = data;
3070 return INVALID_IOASID;
3072 * The VT-d virtual command interface always uses the full 20-bit
3073 * PASID range. The host can partition the guest PASID range based
3074 * on policies, but that is out of the guest's control.
3076 if (min < PASID_MIN || max > intel_pasid_max_id)
3077 return INVALID_IOASID;
3079 if (vcmd_alloc_pasid(iommu, &ioasid))
3080 return INVALID_IOASID;
3085 static void intel_vcmd_ioasid_free(ioasid_t ioasid, void *data)
3087 struct intel_iommu *iommu = data;
3092 * Sanity checking of the ioasid owner is done at the upper layer, e.g. VFIO.
3093 * We can only free the PASID when all the devices are unbound.
3095 if (ioasid_find(NULL, ioasid, NULL)) {
3096 pr_alert("Cannot free active IOASID %d\n", ioasid);
3099 vcmd_free_pasid(iommu, ioasid);
3102 static void register_pasid_allocator(struct intel_iommu *iommu)
3105 * If we are running in the host, there is no need for a custom
3106 * allocator, since PASIDs are allocated system-wide by the host.
3108 if (!cap_caching_mode(iommu->cap))
3111 if (!sm_supported(iommu)) {
3112 pr_warn("VT-d Scalable Mode not enabled, no PASID allocation\n");
3117 * Register a custom PASID allocator if we are running in a guest;
3118 * guest PASIDs must be obtained via the virtual command interface.
3119 * There can be multiple vIOMMUs in each guest but only one allocator
3120 * is active. All vIOMMU allocators will eventually call the same host allocator.
3123 if (!vccap_pasid(iommu->vccap))
3126 pr_info("Register custom PASID allocator\n");
3127 iommu->pasid_allocator.alloc = intel_vcmd_ioasid_alloc;
3128 iommu->pasid_allocator.free = intel_vcmd_ioasid_free;
3129 iommu->pasid_allocator.pdata = (void *)iommu;
3130 if (ioasid_register_allocator(&iommu->pasid_allocator)) {
3131 pr_warn("Custom PASID allocator failed, scalable mode disabled\n");
3133 * Disable scalable mode on this IOMMU if there
3134 * is no custom allocator. Mixing SM-capable vIOMMUs
3135 * and non-SM vIOMMUs is not supported.
3142 static int __init init_dmars(void)
3144 struct dmar_drhd_unit *drhd;
3145 struct intel_iommu *iommu;
3151 * initialize and program root entry to not present
3154 for_each_drhd_unit(drhd) {
3156 * No lock needed as this is only incremented in the single-
3157 * threaded kernel __init code path; all other access is read-only.
3160 if (g_num_of_iommus < DMAR_UNITS_SUPPORTED) {
3164 pr_err_once("Exceeded %d IOMMUs\n", DMAR_UNITS_SUPPORTED);
3167 /* Preallocate enough resources for IOMMU hot-addition */
3168 if (g_num_of_iommus < DMAR_UNITS_SUPPORTED)
3169 g_num_of_iommus = DMAR_UNITS_SUPPORTED;
3171 g_iommus = kcalloc(g_num_of_iommus, sizeof(struct intel_iommu *),
3174 pr_err("Allocating global iommu array failed\n");
3179 for_each_iommu(iommu, drhd) {
3180 if (drhd->ignored) {
3181 iommu_disable_translation(iommu);
3186 * Find the max pasid size of all IOMMU's in the system.
3187 * We need to ensure the system pasid table is no bigger
3188 * than the smallest supported.
3190 if (pasid_supported(iommu)) {
3191 u32 temp = 2 << ecap_pss(iommu->ecap);
3193 intel_pasid_max_id = min_t(u32, temp,
3194 intel_pasid_max_id);
3197 g_iommus[iommu->seq_id] = iommu;
3199 intel_iommu_init_qi(iommu);
3201 ret = iommu_init_domains(iommu);
3205 init_translation_status(iommu);
3207 if (translation_pre_enabled(iommu) && !is_kdump_kernel()) {
3208 iommu_disable_translation(iommu);
3209 clear_translation_pre_enabled(iommu);
3210 pr_warn("Translation was enabled for %s but we are not in kdump mode\n",
3216 * we could share the same root & context tables
3217 * among all IOMMUs; need to split it later.
3219 ret = iommu_alloc_root_entry(iommu);
3223 if (translation_pre_enabled(iommu)) {
3224 pr_info("Translation already enabled - trying to copy translation structures\n");
3226 ret = copy_translation_tables(iommu);
3229 * We found the IOMMU with translation
3230 * enabled - but failed to copy over the
3231 * old root-entry table. Try to proceed
3232 * by disabling translation now and
3233 * allocating a clean root-entry table.
3234 * This might cause DMAR faults, but
3235 * probably the dump will still succeed.
3237 pr_err("Failed to copy translation tables from previous kernel for %s\n",
3239 iommu_disable_translation(iommu);
3240 clear_translation_pre_enabled(iommu);
3242 pr_info("Copied translation tables from previous kernel for %s\n",
3247 if (!ecap_pass_through(iommu->ecap))
3248 hw_pass_through = 0;
3249 intel_svm_check(iommu);
3253 * Now that qi is enabled on all iommus, set the root entry and flush
3254 * caches. This is required on some Intel X58 chipsets, otherwise the
3255 * flush_context function will loop forever and the boot hangs.
3257 for_each_active_iommu(iommu, drhd) {
3258 iommu_flush_write_buffer(iommu);
3259 #ifdef CONFIG_INTEL_IOMMU_SVM
3260 register_pasid_allocator(iommu);
3262 iommu_set_root_entry(iommu);
3263 iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
3264 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
3267 #ifdef CONFIG_INTEL_IOMMU_BROKEN_GFX_WA
3272 iommu_identity_mapping |= IDENTMAP_GFX;
3274 check_tylersburg_isoch();
3276 ret = si_domain_init(hw_pass_through);
3283 * global invalidate context cache
3284 * global invalidate iotlb
3285 * enable translation
3287 for_each_iommu(iommu, drhd) {
3288 if (drhd->ignored) {
3290 * we always have to disable PMRs or DMA may fail on this device.
3294 iommu_disable_protect_mem_regions(iommu);
3298 iommu_flush_write_buffer(iommu);
3300 #ifdef CONFIG_INTEL_IOMMU_SVM
3301 if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
3303 * Calling dmar_alloc_hwirq() with dmar_global_lock held
3304 * could cause a lock race condition, so drop the lock around the call.
3306 up_write(&dmar_global_lock);
3307 ret = intel_svm_enable_prq(iommu);
3308 down_write(&dmar_global_lock);
3313 ret = dmar_set_interrupt(iommu);
3321 for_each_active_iommu(iommu, drhd) {
3322 disable_dmar_iommu(iommu);
3323 free_dmar_iommu(iommu);
3332 static inline int iommu_domain_cache_init(void)
3336 iommu_domain_cache = kmem_cache_create("iommu_domain",
3337 sizeof(struct dmar_domain),
3342 if (!iommu_domain_cache) {
3343 pr_err("Couldn't create iommu_domain cache\n");
3350 static inline int iommu_devinfo_cache_init(void)
3354 iommu_devinfo_cache = kmem_cache_create("iommu_devinfo",
3355 sizeof(struct device_domain_info),
3359 if (!iommu_devinfo_cache) {
3360 pr_err("Couldn't create devinfo cache\n");
3367 static int __init iommu_init_mempool(void)
3370 ret = iova_cache_get();
3374 ret = iommu_domain_cache_init();
3378 ret = iommu_devinfo_cache_init();
3382 kmem_cache_destroy(iommu_domain_cache);
3389 static void __init iommu_exit_mempool(void)
3391 kmem_cache_destroy(iommu_devinfo_cache);
3392 kmem_cache_destroy(iommu_domain_cache);
3396 static void __init init_no_remapping_devices(void)
3398 struct dmar_drhd_unit *drhd;
3402 for_each_drhd_unit(drhd) {
3403 if (!drhd->include_all) {
3404 for_each_active_dev_scope(drhd->devices,
3405 drhd->devices_cnt, i, dev)
3407 /* ignore DMAR unit if no devices exist */
3408 if (i == drhd->devices_cnt)
3413 for_each_active_drhd_unit(drhd) {
3414 if (drhd->include_all)
3417 for_each_active_dev_scope(drhd->devices,
3418 drhd->devices_cnt, i, dev)
3419 if (!dev_is_pci(dev) || !IS_GFX_DEVICE(to_pci_dev(dev)))
3421 if (i < drhd->devices_cnt)
3424 /* This IOMMU has *only* gfx devices. Either bypass it or
3425 set the gfx_dedicated flag, as appropriate */
3426 drhd->gfx_dedicated = 1;
3432 #ifdef CONFIG_SUSPEND
3433 static int init_iommu_hw(void)
3435 struct dmar_drhd_unit *drhd;
3436 struct intel_iommu *iommu = NULL;
3438 for_each_active_iommu(iommu, drhd)
3440 dmar_reenable_qi(iommu);
3442 for_each_iommu(iommu, drhd) {
3443 if (drhd->ignored) {
3445 * we always have to disable PMRs or DMA may fail on this device.
3449 iommu_disable_protect_mem_regions(iommu);
3453 iommu_flush_write_buffer(iommu);
3455 iommu_set_root_entry(iommu);
3457 iommu->flush.flush_context(iommu, 0, 0, 0,
3458 DMA_CCMD_GLOBAL_INVL);
3459 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
3460 iommu_enable_translation(iommu);
3461 iommu_disable_protect_mem_regions(iommu);
3467 static void iommu_flush_all(void)
3469 struct dmar_drhd_unit *drhd;
3470 struct intel_iommu *iommu;
3472 for_each_active_iommu(iommu, drhd) {
3473 iommu->flush.flush_context(iommu, 0, 0, 0,
3474 DMA_CCMD_GLOBAL_INVL);
3475 iommu->flush.flush_iotlb(iommu, 0, 0, 0,
3476 DMA_TLB_GLOBAL_FLUSH);
3480 static int iommu_suspend(void)
3482 struct dmar_drhd_unit *drhd;
3483 struct intel_iommu *iommu = NULL;
3486 for_each_active_iommu(iommu, drhd) {
3487 iommu->iommu_state = kcalloc(MAX_SR_DMAR_REGS, sizeof(u32),
3489 if (!iommu->iommu_state)
3495 for_each_active_iommu(iommu, drhd) {
3496 iommu_disable_translation(iommu);
3498 raw_spin_lock_irqsave(&iommu->register_lock, flag);
3500 iommu->iommu_state[SR_DMAR_FECTL_REG] =
3501 readl(iommu->reg + DMAR_FECTL_REG);
3502 iommu->iommu_state[SR_DMAR_FEDATA_REG] =
3503 readl(iommu->reg + DMAR_FEDATA_REG);
3504 iommu->iommu_state[SR_DMAR_FEADDR_REG] =
3505 readl(iommu->reg + DMAR_FEADDR_REG);
3506 iommu->iommu_state[SR_DMAR_FEUADDR_REG] =
3507 readl(iommu->reg + DMAR_FEUADDR_REG);
3509 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
3514 for_each_active_iommu(iommu, drhd)
3515 kfree(iommu->iommu_state);
3520 static void iommu_resume(void)
3522 struct dmar_drhd_unit *drhd;
3523 struct intel_iommu *iommu = NULL;
3526 if (init_iommu_hw()) {
3528 panic("tboot: IOMMU setup failed, DMAR can not resume!\n");
3530 WARN(1, "IOMMU setup failed, DMAR can not resume!\n");
3534 for_each_active_iommu(iommu, drhd) {
3536 raw_spin_lock_irqsave(&iommu->register_lock, flag);
3538 writel(iommu->iommu_state[SR_DMAR_FECTL_REG],
3539 iommu->reg + DMAR_FECTL_REG);
3540 writel(iommu->iommu_state[SR_DMAR_FEDATA_REG],
3541 iommu->reg + DMAR_FEDATA_REG);
3542 writel(iommu->iommu_state[SR_DMAR_FEADDR_REG],
3543 iommu->reg + DMAR_FEADDR_REG);
3544 writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG],
3545 iommu->reg + DMAR_FEUADDR_REG);
3547 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
3550 for_each_active_iommu(iommu, drhd)
3551 kfree(iommu->iommu_state);
3554 static struct syscore_ops iommu_syscore_ops = {
3555 .resume = iommu_resume,
3556 .suspend = iommu_suspend,
3559 static void __init init_iommu_pm_ops(void)
3561 register_syscore_ops(&iommu_syscore_ops);
3565 static inline void init_iommu_pm_ops(void) {}
3566 #endif /* CONFIG_PM */
3568 static int rmrr_sanity_check(struct acpi_dmar_reserved_memory *rmrr)
3570 if (!IS_ALIGNED(rmrr->base_address, PAGE_SIZE) ||
3571 !IS_ALIGNED(rmrr->end_address + 1, PAGE_SIZE) ||
3572 rmrr->end_address <= rmrr->base_address ||
3573 arch_rmrr_sanity_check(rmrr))
3579 int __init dmar_parse_one_rmrr(struct acpi_dmar_header *header, void *arg)
3581 struct acpi_dmar_reserved_memory *rmrr;
3582 struct dmar_rmrr_unit *rmrru;
3584 rmrr = (struct acpi_dmar_reserved_memory *)header;
3585 if (rmrr_sanity_check(rmrr)) {
3587 "Your BIOS is broken; bad RMRR [%#018Lx-%#018Lx]\n"
3588 "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
3589 rmrr->base_address, rmrr->end_address,
3590 dmi_get_system_info(DMI_BIOS_VENDOR),
3591 dmi_get_system_info(DMI_BIOS_VERSION),
3592 dmi_get_system_info(DMI_PRODUCT_VERSION));
3593 add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
3596 rmrru = kzalloc(sizeof(*rmrru), GFP_KERNEL);
3600 rmrru->hdr = header;
3602 rmrru->base_address = rmrr->base_address;
3603 rmrru->end_address = rmrr->end_address;
3605 rmrru->devices = dmar_alloc_dev_scope((void *)(rmrr + 1),
3606 ((void *)rmrr) + rmrr->header.length,
3607 &rmrru->devices_cnt);
3608 if (rmrru->devices_cnt && rmrru->devices == NULL)
3611 list_add(&rmrru->list, &dmar_rmrr_units);
3620 static struct dmar_atsr_unit *dmar_find_atsr(struct acpi_dmar_atsr *atsr)
3622 struct dmar_atsr_unit *atsru;
3623 struct acpi_dmar_atsr *tmp;
3625 list_for_each_entry_rcu(atsru, &dmar_atsr_units, list,
3627 tmp = (struct acpi_dmar_atsr *)atsru->hdr;
3628 if (atsr->segment != tmp->segment)
3630 if (atsr->header.length != tmp->header.length)
3632 if (memcmp(atsr, tmp, atsr->header.length) == 0)
3639 int dmar_parse_one_atsr(struct acpi_dmar_header *hdr, void *arg)
3641 struct acpi_dmar_atsr *atsr;
3642 struct dmar_atsr_unit *atsru;
3644 if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled)
3647 atsr = container_of(hdr, struct acpi_dmar_atsr, header);
3648 atsru = dmar_find_atsr(atsr);
3652 atsru = kzalloc(sizeof(*atsru) + hdr->length, GFP_KERNEL);
3657 * If memory is allocated from slab by ACPI _DSM method, we need to
3658 * copy the memory content because the memory buffer will be freed on exit.
3661 atsru->hdr = (void *)(atsru + 1);
3662 memcpy(atsru->hdr, hdr, hdr->length);
3663 atsru->include_all = atsr->flags & 0x1;
3664 if (!atsru->include_all) {
3665 atsru->devices = dmar_alloc_dev_scope((void *)(atsr + 1),
3666 (void *)atsr + atsr->header.length,
3667 &atsru->devices_cnt);
3668 if (atsru->devices_cnt && atsru->devices == NULL) {
3674 list_add_rcu(&atsru->list, &dmar_atsr_units);
3679 static void intel_iommu_free_atsr(struct dmar_atsr_unit *atsru)
3681 dmar_free_dev_scope(&atsru->devices, &atsru->devices_cnt);
3685 int dmar_release_one_atsr(struct acpi_dmar_header *hdr, void *arg)
3687 struct acpi_dmar_atsr *atsr;
3688 struct dmar_atsr_unit *atsru;
3690 atsr = container_of(hdr, struct acpi_dmar_atsr, header);
3691 atsru = dmar_find_atsr(atsr);
3693 list_del_rcu(&atsru->list);
3695 intel_iommu_free_atsr(atsru);
3701 int dmar_check_one_atsr(struct acpi_dmar_header *hdr, void *arg)
3705 struct acpi_dmar_atsr *atsr;
3706 struct dmar_atsr_unit *atsru;
3708 atsr = container_of(hdr, struct acpi_dmar_atsr, header);
3709 atsru = dmar_find_atsr(atsr);
3713 if (!atsru->include_all && atsru->devices && atsru->devices_cnt) {
3714 for_each_active_dev_scope(atsru->devices, atsru->devices_cnt,
3722 static int intel_iommu_add(struct dmar_drhd_unit *dmaru)
3725 struct intel_iommu *iommu = dmaru->iommu;
3727 if (g_iommus[iommu->seq_id])
3730 if (hw_pass_through && !ecap_pass_through(iommu->ecap)) {
3731 pr_warn("%s: Doesn't support hardware pass through.\n",
3735 if (!ecap_sc_support(iommu->ecap) &&
3736 domain_update_iommu_snooping(iommu)) {
3737 pr_warn("%s: Doesn't support snooping.\n",
3741 sp = domain_update_iommu_superpage(NULL, iommu) - 1;
3742 if (sp >= 0 && !(cap_super_page_val(iommu->cap) & (1 << sp))) {
3743 pr_warn("%s: Doesn't support large page.\n",
3749 * Disable translation if already enabled prior to OS handover.
3751 if (iommu->gcmd & DMA_GCMD_TE)
3752 iommu_disable_translation(iommu);
3754 g_iommus[iommu->seq_id] = iommu;
3755 ret = iommu_init_domains(iommu);
3757 ret = iommu_alloc_root_entry(iommu);
3761 intel_svm_check(iommu);
3763 if (dmaru->ignored) {
3765 * we always have to disable PMRs or DMA may fail on this device
3768 iommu_disable_protect_mem_regions(iommu);
3772 intel_iommu_init_qi(iommu);
3773 iommu_flush_write_buffer(iommu);
3775 #ifdef CONFIG_INTEL_IOMMU_SVM
3776 if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
3777 ret = intel_svm_enable_prq(iommu);
3782 ret = dmar_set_interrupt(iommu);
3786 iommu_set_root_entry(iommu);
3787 iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
3788 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
3789 iommu_enable_translation(iommu);
3791 iommu_disable_protect_mem_regions(iommu);
3795 disable_dmar_iommu(iommu);
3797 free_dmar_iommu(iommu);
3801 int dmar_iommu_hotplug(struct dmar_drhd_unit *dmaru, bool insert)
3804 struct intel_iommu *iommu = dmaru->iommu;
3806 if (!intel_iommu_enabled)
3812 ret = intel_iommu_add(dmaru);
3814 disable_dmar_iommu(iommu);
3815 free_dmar_iommu(iommu);
3821 static void intel_iommu_free_dmars(void)
3823 struct dmar_rmrr_unit *rmrru, *rmrr_n;
3824 struct dmar_atsr_unit *atsru, *atsr_n;
3826 list_for_each_entry_safe(rmrru, rmrr_n, &dmar_rmrr_units, list) {
3827 list_del(&rmrru->list);
3828 dmar_free_dev_scope(&rmrru->devices, &rmrru->devices_cnt);
3832 list_for_each_entry_safe(atsru, atsr_n, &dmar_atsr_units, list) {
3833 list_del(&atsru->list);
3834 intel_iommu_free_atsr(atsru);
3838 int dmar_find_matched_atsr_unit(struct pci_dev *dev)
3841 struct pci_bus *bus;
3842 struct pci_dev *bridge = NULL;
3844 struct acpi_dmar_atsr *atsr;
3845 struct dmar_atsr_unit *atsru;
3847 dev = pci_physfn(dev);
3848 for (bus = dev->bus; bus; bus = bus->parent) {
3850 /* If it's an integrated device, allow ATS */
3853 /* Connected via non-PCIe: no ATS */
3854 if (!pci_is_pcie(bridge) ||
3855 pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE)
3857 /* If we found the root port, look it up in the ATSR */
3858 if (pci_pcie_type(bridge) == PCI_EXP_TYPE_ROOT_PORT)
3863 list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) {
3864 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
3865 if (atsr->segment != pci_domain_nr(dev->bus))
3868 for_each_dev_scope(atsru->devices, atsru->devices_cnt, i, tmp)
3869 if (tmp == &bridge->dev)
3872 if (atsru->include_all)
3882 int dmar_iommu_notify_scope_dev(struct dmar_pci_notify_info *info)
3885 struct dmar_rmrr_unit *rmrru;
3886 struct dmar_atsr_unit *atsru;
3887 struct acpi_dmar_atsr *atsr;
3888 struct acpi_dmar_reserved_memory *rmrr;
3890 if (!intel_iommu_enabled && system_state >= SYSTEM_RUNNING)
3893 list_for_each_entry(rmrru, &dmar_rmrr_units, list) {
3894 rmrr = container_of(rmrru->hdr,
3895 struct acpi_dmar_reserved_memory, header);
3896 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
3897 ret = dmar_insert_dev_scope(info, (void *)(rmrr + 1),
3898 ((void *)rmrr) + rmrr->header.length,
3899 rmrr->segment, rmrru->devices,
3900 rmrru->devices_cnt);
3903 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
3904 dmar_remove_dev_scope(info, rmrr->segment,
3905 rmrru->devices, rmrru->devices_cnt);
3909 list_for_each_entry(atsru, &dmar_atsr_units, list) {
3910 if (atsru->include_all)
3913 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
3914 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
3915 ret = dmar_insert_dev_scope(info, (void *)(atsr + 1),
3916 (void *)atsr + atsr->header.length,
3917 atsr->segment, atsru->devices,
3918 atsru->devices_cnt);
3923 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
3924 if (dmar_remove_dev_scope(info, atsr->segment,
3925 atsru->devices, atsru->devices_cnt))
3933 static int intel_iommu_memory_notifier(struct notifier_block *nb,
3934 unsigned long val, void *v)
3936 struct memory_notify *mhp = v;
3937 unsigned long start_vpfn = mm_to_dma_pfn(mhp->start_pfn);
3938 unsigned long last_vpfn = mm_to_dma_pfn(mhp->start_pfn +
3942 case MEM_GOING_ONLINE:
3943 if (iommu_domain_identity_map(si_domain,
3944 start_vpfn, last_vpfn)) {
3945 pr_warn("Failed to build identity map for [%lx-%lx]\n",
3946 start_vpfn, last_vpfn);
3952 case MEM_CANCEL_ONLINE:
3954 struct dmar_drhd_unit *drhd;
3955 struct intel_iommu *iommu;
3956 struct page *freelist;
3958 freelist = domain_unmap(si_domain,
3959 start_vpfn, last_vpfn,
3963 for_each_active_iommu(iommu, drhd)
3964 iommu_flush_iotlb_psi(iommu, si_domain,
3965 start_vpfn, mhp->nr_pages,
3968 dma_free_pagelist(freelist);
3976 static struct notifier_block intel_iommu_memory_nb = {
3977 .notifier_call = intel_iommu_memory_notifier,
3981 static void free_all_cpu_cached_iovas(unsigned int cpu)
3985 for (i = 0; i < g_num_of_iommus; i++) {
3986 struct intel_iommu *iommu = g_iommus[i];
3987 struct dmar_domain *domain;
3993 for (did = 0; did < cap_ndoms(iommu->cap); did++) {
3994 domain = get_iommu_domain(iommu, (u16)did);
3996 if (!domain || domain->domain.type != IOMMU_DOMAIN_DMA)
3999 iommu_dma_free_cpu_cached_iovas(cpu, &domain->domain);
4004 static int intel_iommu_cpu_dead(unsigned int cpu)
4006 free_all_cpu_cached_iovas(cpu);
4010 static void intel_disable_iommus(void)
4012 struct intel_iommu *iommu = NULL;
4013 struct dmar_drhd_unit *drhd;
4015 for_each_iommu(iommu, drhd)
4016 iommu_disable_translation(iommu);
4019 void intel_iommu_shutdown(void)
4021 struct dmar_drhd_unit *drhd;
4022 struct intel_iommu *iommu = NULL;
4024 if (no_iommu || dmar_disabled)
4027 down_write(&dmar_global_lock);
4029 /* Disable PMRs explicitly here. */
4030 for_each_iommu(iommu, drhd)
4031 iommu_disable_protect_mem_regions(iommu);
4033 /* Make sure the IOMMUs are switched off */
4034 intel_disable_iommus();
4036 up_write(&dmar_global_lock);
4039 static inline struct intel_iommu *dev_to_intel_iommu(struct device *dev)
4041 struct iommu_device *iommu_dev = dev_to_iommu_device(dev);
4043 return container_of(iommu_dev, struct intel_iommu, iommu);
4046 static ssize_t intel_iommu_show_version(struct device *dev,
4047 struct device_attribute *attr,
4050 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4051 u32 ver = readl(iommu->reg + DMAR_VER_REG);
4052 return sprintf(buf, "%d:%d\n",
4053 DMAR_VER_MAJOR(ver), DMAR_VER_MINOR(ver));
4055 static DEVICE_ATTR(version, S_IRUGO, intel_iommu_show_version, NULL);
4057 static ssize_t intel_iommu_show_address(struct device *dev,
4058 struct device_attribute *attr,
4061 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4062 return sprintf(buf, "%llx\n", iommu->reg_phys);
4064 static DEVICE_ATTR(address, S_IRUGO, intel_iommu_show_address, NULL);
4066 static ssize_t intel_iommu_show_cap(struct device *dev,
4067 struct device_attribute *attr,
4070 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4071 return sprintf(buf, "%llx\n", iommu->cap);
4073 static DEVICE_ATTR(cap, S_IRUGO, intel_iommu_show_cap, NULL);
4075 static ssize_t intel_iommu_show_ecap(struct device *dev,
4076 struct device_attribute *attr,
4079 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4080 return sprintf(buf, "%llx\n", iommu->ecap);
4082 static DEVICE_ATTR(ecap, S_IRUGO, intel_iommu_show_ecap, NULL);
4084 static ssize_t intel_iommu_show_ndoms(struct device *dev,
4085 struct device_attribute *attr,
4088 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4089 return sprintf(buf, "%ld\n", cap_ndoms(iommu->cap));
4091 static DEVICE_ATTR(domains_supported, S_IRUGO, intel_iommu_show_ndoms, NULL);
4093 static ssize_t intel_iommu_show_ndoms_used(struct device *dev,
4094 struct device_attribute *attr,
4097 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4098 return sprintf(buf, "%d\n", bitmap_weight(iommu->domain_ids,
4099 cap_ndoms(iommu->cap)));
4101 static DEVICE_ATTR(domains_used, S_IRUGO, intel_iommu_show_ndoms_used, NULL);
4103 static struct attribute *intel_iommu_attrs[] = {
4104 &dev_attr_version.attr,
4105 &dev_attr_address.attr,
4107 &dev_attr_ecap.attr,
4108 &dev_attr_domains_supported.attr,
4109 &dev_attr_domains_used.attr,
4113 static struct attribute_group intel_iommu_group = {
4114 .name = "intel-iommu",
4115 .attrs = intel_iommu_attrs,
4118 const struct attribute_group *intel_iommu_groups[] = {
4123 static inline bool has_external_pci(void)
4125 struct pci_dev *pdev = NULL;
4127 for_each_pci_dev(pdev)
4128 if (pdev->external_facing)
4134 static int __init platform_optin_force_iommu(void)
4136 if (!dmar_platform_optin() || no_platform_optin || !has_external_pci())
4139 if (no_iommu || dmar_disabled)
4140 pr_info("Intel-IOMMU force enabled due to platform opt in\n");
4143 * If Intel-IOMMU is disabled by default, we will apply identity
4144 * map for all devices except those marked as being untrusted.
4147 iommu_set_default_passthrough(false);
4155 static int __init probe_acpi_namespace_devices(void)
4157 struct dmar_drhd_unit *drhd;
4158 /* To avoid a -Wunused-but-set-variable warning. */
4159 struct intel_iommu *iommu __maybe_unused;
4163 for_each_active_iommu(iommu, drhd) {
4164 for_each_active_dev_scope(drhd->devices,
4165 drhd->devices_cnt, i, dev) {
4166 struct acpi_device_physical_node *pn;
4167 struct iommu_group *group;
4168 struct acpi_device *adev;
4170 if (dev->bus != &acpi_bus_type)
4173 adev = to_acpi_device(dev);
4174 mutex_lock(&adev->physical_node_lock);
4175 list_for_each_entry(pn,
4176 &adev->physical_node_list, node) {
4177 group = iommu_group_get(pn->dev);
4179 iommu_group_put(group);
4183 pn->dev->bus->iommu_ops = &intel_iommu_ops;
4184 ret = iommu_probe_device(pn->dev);
4188 mutex_unlock(&adev->physical_node_lock);
4198 int __init intel_iommu_init(void)
4201 struct dmar_drhd_unit *drhd;
4202 struct intel_iommu *iommu;
4205 * Intel IOMMU is required for a TXT/tboot launch or platform
4206 * opt in, so enforce that.
4208 force_on = (!intel_iommu_tboot_noforce && tboot_force_iommu()) ||
4209 platform_optin_force_iommu();
4211 if (iommu_init_mempool()) {
4213 panic("tboot: Failed to initialize iommu memory\n");
4217 down_write(&dmar_global_lock);
4218 if (dmar_table_init()) {
4220 panic("tboot: Failed to initialize DMAR table\n");
4224 if (dmar_dev_scope_init() < 0) {
4226 panic("tboot: Failed to initialize DMAR device scope\n");
4230 up_write(&dmar_global_lock);
4233 * The bus notifier takes the dmar_global_lock, so lockdep will
4234 * complain later when we register it under the lock.
4236 dmar_register_bus_notifier();
4238 down_write(&dmar_global_lock);
4241 intel_iommu_debugfs_init();
4243 if (no_iommu || dmar_disabled) {
4245 * We exit the function here to ensure IOMMU's remapping and
4246 * mempool aren't setup, which means that the IOMMU's PMRs
4247 * won't be disabled via the call to init_dmars(). So disable
4248 * it explicitly here. The PMRs were setup by tboot prior to
4249 * calling SENTER, but the kernel is expected to reset/tear down the PMRs.
4252 if (intel_iommu_tboot_noforce) {
4253 for_each_iommu(iommu, drhd)
4254 iommu_disable_protect_mem_regions(iommu);
4258 * Make sure the IOMMUs are switched off, even when we
4259 * boot into a kexec kernel and the previous kernel left
4262 intel_disable_iommus();
4266 if (list_empty(&dmar_rmrr_units))
4267 pr_info("No RMRR found\n");
4269 if (list_empty(&dmar_atsr_units))
4270 pr_info("No ATSR found\n");
4273 intel_iommu_gfx_mapped = 1;
4275 init_no_remapping_devices();
4280 panic("tboot: Failed to initialize DMARs\n");
4281 pr_err("Initialization failed\n");
4284 up_write(&dmar_global_lock);
4286 init_iommu_pm_ops();
4288 down_read(&dmar_global_lock);
4289 for_each_active_iommu(iommu, drhd) {
4290 iommu_device_sysfs_add(&iommu->iommu, NULL,
4293 iommu_device_set_ops(&iommu->iommu, &intel_iommu_ops);
4294 iommu_device_register(&iommu->iommu);
4296 up_read(&dmar_global_lock);
4298 bus_set_iommu(&pci_bus_type, &intel_iommu_ops);
4299 if (si_domain && !hw_pass_through)
4300 register_memory_notifier(&intel_iommu_memory_nb);
4301 cpuhp_setup_state(CPUHP_IOMMU_INTEL_DEAD, "iommu/intel:dead", NULL,
4302 intel_iommu_cpu_dead);
4304 down_read(&dmar_global_lock);
4305 if (probe_acpi_namespace_devices())
4306 pr_warn("ACPI name space devices didn't probe correctly\n");
4308 /* Finally, we enable the DMA remapping hardware. */
4309 for_each_iommu(iommu, drhd) {
4310 if (!drhd->ignored && !translation_pre_enabled(iommu))
4311 iommu_enable_translation(iommu);
4313 iommu_disable_protect_mem_regions(iommu);
4315 up_read(&dmar_global_lock);
4317 pr_info("Intel(R) Virtualization Technology for Directed I/O\n");
4319 intel_iommu_enabled = 1;
4324 intel_iommu_free_dmars();
4325 up_write(&dmar_global_lock);
4326 iommu_exit_mempool();
4330 static int domain_context_clear_one_cb(struct pci_dev *pdev, u16 alias, void *opaque)
4332 struct intel_iommu *iommu = opaque;
4334 domain_context_clear_one(iommu, PCI_BUS_NUM(alias), alias & 0xff);
4339 * NB - intel-iommu lacks any sort of reference counting for the users of
4340 * dependent devices. If multiple endpoints have intersecting dependent
4341 * devices, unbinding the driver from any one of them will possibly leave
4342 * the others unable to operate.
4344 static void domain_context_clear(struct intel_iommu *iommu, struct device *dev)
4346 if (!iommu || !dev || !dev_is_pci(dev))
4349 pci_for_each_dma_alias(to_pci_dev(dev), &domain_context_clear_one_cb, iommu);
4352 static void __dmar_remove_one_dev_info(struct device_domain_info *info)
4354 struct dmar_domain *domain;
4355 struct intel_iommu *iommu;
4356 unsigned long flags;
4358 assert_spin_locked(&device_domain_lock);
4363 iommu = info->iommu;
4364 domain = info->domain;
4367 if (dev_is_pci(info->dev) && sm_supported(iommu))
4368 intel_pasid_tear_down_entry(iommu, info->dev,
4369 PASID_RID2PASID, false);
4371 iommu_disable_dev_iotlb(info);
4372 if (!dev_is_real_dma_subdevice(info->dev))
4373 domain_context_clear(iommu, info->dev);
4374 intel_pasid_free_table(info->dev);
4377 unlink_domain_info(info);
4379 spin_lock_irqsave(&iommu->lock, flags);
4380 domain_detach_iommu(domain, iommu);
4381 spin_unlock_irqrestore(&iommu->lock, flags);
4383 free_devinfo_mem(info);
4386 static void dmar_remove_one_dev_info(struct device *dev)
4388 struct device_domain_info *info;
4389 unsigned long flags;
4391 spin_lock_irqsave(&device_domain_lock, flags);
4392 info = get_domain_info(dev);
4394 __dmar_remove_one_dev_info(info);
4395 spin_unlock_irqrestore(&device_domain_lock, flags);
4398 static int md_domain_init(struct dmar_domain *domain, int guest_width)
4402 /* calculate AGAW */
4403 domain->gaw = guest_width;
4404 adjust_width = guestwidth_to_adjustwidth(guest_width);
4405 domain->agaw = width_to_agaw(adjust_width);
4407 domain->iommu_coherency = 0;
4408 domain->iommu_snooping = 0;
4409 domain->iommu_superpage = 0;
4410 domain->max_addr = 0;
4412 /* always allocate the top pgd */
4413 domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
4416 domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
4420 static struct iommu_domain *intel_iommu_domain_alloc(unsigned type)
4422 struct dmar_domain *dmar_domain;
4423 struct iommu_domain *domain;
4426 case IOMMU_DOMAIN_DMA:
4427 case IOMMU_DOMAIN_UNMANAGED:
4428 dmar_domain = alloc_domain(0);
4430 pr_err("Can't allocate dmar_domain\n");
4433 if (md_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
4434 pr_err("Domain initialization failed\n");
4435 domain_exit(dmar_domain);
4439 if (type == IOMMU_DOMAIN_DMA &&
4440 iommu_get_dma_cookie(&dmar_domain->domain))
4443 domain = &dmar_domain->domain;
4444 domain->geometry.aperture_start = 0;
4445 domain->geometry.aperture_end =
4446 __DOMAIN_MAX_ADDR(dmar_domain->gaw);
4447 domain->geometry.force_aperture = true;
4450 case IOMMU_DOMAIN_IDENTITY:
4451 return &si_domain->domain;
4459 static void intel_iommu_domain_free(struct iommu_domain *domain)
4461 if (domain != &si_domain->domain)
4462 domain_exit(to_dmar_domain(domain));
4466 * Check whether a @domain could be attached to the @dev through the
4467 * aux-domain attach/detach APIs.
4470 is_aux_domain(struct device *dev, struct iommu_domain *domain)
4472 struct device_domain_info *info = get_domain_info(dev);
4474 return info && info->auxd_enabled &&
4475 domain->type == IOMMU_DOMAIN_UNMANAGED;
4478 static void auxiliary_link_device(struct dmar_domain *domain,
4481 struct device_domain_info *info = get_domain_info(dev);
4483 assert_spin_locked(&device_domain_lock);
4487 domain->auxd_refcnt++;
4488 list_add(&domain->auxd, &info->auxiliary_domains);
4491 static void auxiliary_unlink_device(struct dmar_domain *domain,
4494 struct device_domain_info *info = get_domain_info(dev);
4496 assert_spin_locked(&device_domain_lock);
4500 list_del(&domain->auxd);
4501 domain->auxd_refcnt--;
4503 if (!domain->auxd_refcnt && domain->default_pasid > 0)
4504 ioasid_put(domain->default_pasid);
4507 static int aux_domain_add_dev(struct dmar_domain *domain,
4511 unsigned long flags;
4512 struct intel_iommu *iommu;
4514 iommu = device_to_iommu(dev, NULL, NULL);
4518 if (domain->default_pasid <= 0) {
4521 /* No private data needed for the default pasid */
4522 pasid = ioasid_alloc(NULL, PASID_MIN,
4523 pci_max_pasids(to_pci_dev(dev)) - 1,
4525 if (pasid == INVALID_IOASID) {
4526 pr_err("Can't allocate default pasid\n");
4529 domain->default_pasid = pasid;
4532 spin_lock_irqsave(&device_domain_lock, flags);
4534 * iommu->lock must be held to attach domain to iommu and setup the
4535 * pasid entry for second level translation.
4537 spin_lock(&iommu->lock);
4538 ret = domain_attach_iommu(domain, iommu);
4542 /* Setup the PASID entry for mediated devices: */
4543 if (domain_use_first_level(domain))
4544 ret = domain_setup_first_level(iommu, domain, dev,
4545 domain->default_pasid);
4547 ret = intel_pasid_setup_second_level(iommu, domain, dev,
4548 domain->default_pasid);
4551 spin_unlock(&iommu->lock);
4553 auxiliary_link_device(domain, dev);
4555 spin_unlock_irqrestore(&device_domain_lock, flags);
4560 domain_detach_iommu(domain, iommu);
4562 spin_unlock(&iommu->lock);
4563 spin_unlock_irqrestore(&device_domain_lock, flags);
4564 if (!domain->auxd_refcnt && domain->default_pasid > 0)
4565 ioasid_put(domain->default_pasid);
4570 static void aux_domain_remove_dev(struct dmar_domain *domain,
4573 struct device_domain_info *info;
4574 struct intel_iommu *iommu;
4575 unsigned long flags;
4577 if (!is_aux_domain(dev, &domain->domain))
4580 spin_lock_irqsave(&device_domain_lock, flags);
4581 info = get_domain_info(dev);
4582 iommu = info->iommu;
4584 auxiliary_unlink_device(domain, dev);
4586 spin_lock(&iommu->lock);
4587 intel_pasid_tear_down_entry(iommu, dev, domain->default_pasid, false);
4588 domain_detach_iommu(domain, iommu);
4589 spin_unlock(&iommu->lock);
4591 spin_unlock_irqrestore(&device_domain_lock, flags);
4594 static int prepare_domain_attach_device(struct iommu_domain *domain,
4597 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4598 struct intel_iommu *iommu;
4601 iommu = device_to_iommu(dev, NULL, NULL);
4605 /* check if this iommu agaw is sufficient for max mapped address */
4606 addr_width = agaw_to_width(iommu->agaw);
4607 if (addr_width > cap_mgaw(iommu->cap))
4608 addr_width = cap_mgaw(iommu->cap);
4610 if (dmar_domain->max_addr > (1LL << addr_width)) {
4611 dev_err(dev, "%s: iommu width (%d) is not "
4612 "sufficient for the mapped address (%llx)\n",
4613 __func__, addr_width, dmar_domain->max_addr);
4616 dmar_domain->gaw = addr_width;
4619 * Knock out extra levels of page tables if necessary
4621 while (iommu->agaw < dmar_domain->agaw) {
4622 struct dma_pte *pte;
4624 pte = dmar_domain->pgd;
4625 if (dma_pte_present(pte)) {
4626 dmar_domain->pgd = (struct dma_pte *)
4627 phys_to_virt(dma_pte_addr(pte));
4628 free_pgtable_page(pte);
4630 dmar_domain->agaw--;
4636 static int intel_iommu_attach_device(struct iommu_domain *domain,
4641 if (domain->type == IOMMU_DOMAIN_UNMANAGED &&
4642 device_is_rmrr_locked(dev)) {
4643 dev_warn(dev, "Device is ineligible for IOMMU domain attach due to platform RMRR requirement. Contact your platform vendor.\n");
4647 if (is_aux_domain(dev, domain))
4650 /* normally dev is not mapped */
4651 if (unlikely(domain_context_mapped(dev))) {
4652 struct dmar_domain *old_domain;
4654 old_domain = find_domain(dev);
4656 dmar_remove_one_dev_info(dev);
4659 ret = prepare_domain_attach_device(domain, dev);
4663 return domain_add_dev_info(to_dmar_domain(domain), dev);
4666 static int intel_iommu_aux_attach_device(struct iommu_domain *domain,
4671 if (!is_aux_domain(dev, domain))
4674 ret = prepare_domain_attach_device(domain, dev);
4678 return aux_domain_add_dev(to_dmar_domain(domain), dev);
4681 static void intel_iommu_detach_device(struct iommu_domain *domain,
4684 dmar_remove_one_dev_info(dev);
4687 static void intel_iommu_aux_detach_device(struct iommu_domain *domain,
4690 aux_domain_remove_dev(to_dmar_domain(domain), dev);
4693 #ifdef CONFIG_INTEL_IOMMU_SVM
4695 * 2D array for converting and sanitizing IOMMU generic TLB granularity to
4696 * VT-d granularity. Invalidation is typically included in the unmap operation
4697 * as a result of DMA or VFIO unmap. However, for assigned devices the guest
4698 * owns the first level page tables. Invalidations of translation caches in the
4699 * guest are trapped and passed down to the host.
4701 * vIOMMU in the guest will only expose first level page tables, therefore
4702 * we do not support IOTLB granularity for requests without PASID (second level).
4704 * For example, to find the VT-d granularity encoding for IOTLB
4705 * type and page selective granularity within PASID:
4706 * X: indexed by iommu cache type
4707 * Y: indexed by enum iommu_inv_granularity
4708 * [IOMMU_CACHE_INV_TYPE_IOTLB][IOMMU_INV_GRANU_ADDR]
4712 inv_type_granu_table[IOMMU_CACHE_INV_TYPE_NR][IOMMU_INV_GRANU_NR] = {
4714 * PASID based IOTLB invalidation: PASID selective (per PASID),
4715 * page selective (address granularity)
4717 {-EINVAL, QI_GRAN_NONG_PASID, QI_GRAN_PSI_PASID},
4718 /* PASID based dev TLBs */
4719 {-EINVAL, -EINVAL, QI_DEV_IOTLB_GRAN_PASID_SEL},
4721 {-EINVAL, -EINVAL, -EINVAL}
4724 static inline int to_vtd_granularity(int type, int granu)
4726 return inv_type_granu_table[type][granu];
4729 static inline u64 to_vtd_size(u64 granu_size, u64 nr_granules)
4731 u64 nr_pages = (granu_size * nr_granules) >> VTD_PAGE_SHIFT;
4733 /* VT-d size is encoded as 2^size of 4KiB pages: 0 for 4KiB, 9 for 2MiB, etc.
4734 * The IOMMU cache invalidate API passes granu_size in bytes and the number
4735 * of granules of that size in contiguous memory.
4737 return order_base_2(nr_pages);
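/*
 * Editor's worked example (hypothetical helper, not part of the driver):
 * a page-selective IOTLB invalidation described as 512 granules of 4KiB
 * covers 2MiB, so to_vtd_size() returns order_base_2(512) == 9; a single
 * 4KiB granule gives order 0.  Likewise, the table above maps the
 * (IOTLB, address-selective) pair to QI_GRAN_PSI_PASID.
 */
static inline void example_vtd_inv_encoding(void)
{
	u64 size_order = to_vtd_size(4096ULL, 512);		/* 9: 2MiB of 4KiB pages */
	int granu = to_vtd_granularity(IOMMU_CACHE_INV_TYPE_IOTLB,
				       IOMMU_INV_GRANU_ADDR);	/* QI_GRAN_PSI_PASID */

	(void)size_order;
	(void)granu;
}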
4741 intel_iommu_sva_invalidate(struct iommu_domain *domain, struct device *dev,
4742 struct iommu_cache_invalidate_info *inv_info)
4744 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4745 struct device_domain_info *info;
4746 struct intel_iommu *iommu;
4747 unsigned long flags;
4754 if (!inv_info || !dmar_domain)
4757 if (!dev || !dev_is_pci(dev))
4760 iommu = device_to_iommu(dev, &bus, &devfn);
4764 if (!(dmar_domain->flags & DOMAIN_FLAG_NESTING_MODE))
4767 spin_lock_irqsave(&device_domain_lock, flags);
4768 spin_lock(&iommu->lock);
4769 info = get_domain_info(dev);
4774 did = dmar_domain->iommu_did[iommu->seq_id];
4775 sid = PCI_DEVID(bus, devfn);
4777 /* Size is only valid in address selective invalidation */
4778 if (inv_info->granularity == IOMMU_INV_GRANU_ADDR)
4779 size = to_vtd_size(inv_info->granu.addr_info.granule_size,
4780 inv_info->granu.addr_info.nb_granules);
4782 for_each_set_bit(cache_type,
4783 (unsigned long *)&inv_info->cache,
4784 IOMMU_CACHE_INV_TYPE_NR) {
4789 granu = to_vtd_granularity(cache_type, inv_info->granularity);
4790 if (granu == -EINVAL) {
4791 pr_err_ratelimited("Invalid cache type and granu combination %d/%d\n",
4792 cache_type, inv_info->granularity);
4797 * PASID is stored in different locations based on the granularity. */
4800 if (inv_info->granularity == IOMMU_INV_GRANU_PASID &&
4801 (inv_info->granu.pasid_info.flags & IOMMU_INV_PASID_FLAGS_PASID))
4802 pasid = inv_info->granu.pasid_info.pasid;
4803 else if (inv_info->granularity == IOMMU_INV_GRANU_ADDR &&
4804 (inv_info->granu.addr_info.flags & IOMMU_INV_ADDR_FLAGS_PASID))
4805 pasid = inv_info->granu.addr_info.pasid;
4807 switch (BIT(cache_type)) {
4808 case IOMMU_CACHE_INV_TYPE_IOTLB:
4809 /* HW will ignore LSB bits based on address mask */
4810 if (inv_info->granularity == IOMMU_INV_GRANU_ADDR &&
4812 (inv_info->granu.addr_info.addr & ((BIT(VTD_PAGE_SHIFT + size)) - 1))) {
4813 pr_err_ratelimited("User address not aligned, 0x%llx, size order %llu\n",
4814 inv_info->granu.addr_info.addr, size);
4818 * If granu is PASID-selective, address is ignored.
4819 * We use npages = -1 to indicate that.
4821 qi_flush_piotlb(iommu, did, pasid,
4822 mm_to_dma_pfn(inv_info->granu.addr_info.addr),
4823 (granu == QI_GRAN_NONG_PASID) ? -1 : 1 << size,
4824 inv_info->granu.addr_info.flags & IOMMU_INV_ADDR_FLAGS_LEAF);
4826 if (!info->ats_enabled)
4829 * Always flush device IOTLB if ATS is enabled. vIOMMU
4830 * in the guest may assume IOTLB flush is inclusive,
4831 * which is more efficient.
4834 case IOMMU_CACHE_INV_TYPE_DEV_IOTLB:
4836 * PASID based device TLB invalidation does not support
4837 * IOMMU_INV_GRANU_PASID granularity but only supports
4838 * IOMMU_INV_GRANU_ADDR.
4839 * The equivalent of that is to set the size to cover the
4840 * entire 64-bit address range. The user only provides PASID info
4841 * without address info, so we set addr to 0.
4843 if (inv_info->granularity == IOMMU_INV_GRANU_PASID) {
4844 size = 64 - VTD_PAGE_SHIFT;
4846 } else if (inv_info->granularity == IOMMU_INV_GRANU_ADDR) {
4847 addr = inv_info->granu.addr_info.addr;
4850 if (info->ats_enabled)
4851 qi_flush_dev_iotlb_pasid(iommu, sid,
4853 info->ats_qdep, addr,
4856 pr_warn_ratelimited("Passdown device IOTLB flush w/o ATS!\n");
4859 dev_err_ratelimited(dev, "Unsupported IOMMU invalidation type %d\n",
4865 spin_unlock(&iommu->lock);
4866 spin_unlock_irqrestore(&device_domain_lock, flags);
4872 static int intel_iommu_map(struct iommu_domain *domain,
4873 unsigned long iova, phys_addr_t hpa,
4874 size_t size, int iommu_prot, gfp_t gfp)
4876 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4881 if (iommu_prot & IOMMU_READ)
4882 prot |= DMA_PTE_READ;
4883 if (iommu_prot & IOMMU_WRITE)
4884 prot |= DMA_PTE_WRITE;
4885 if ((iommu_prot & IOMMU_CACHE) && dmar_domain->iommu_snooping)
4886 prot |= DMA_PTE_SNP;
4888 max_addr = iova + size;
4889 if (dmar_domain->max_addr < max_addr) {
4892 /* check if minimum agaw is sufficient for mapped address */
4893 end = __DOMAIN_MAX_ADDR(dmar_domain->gaw) + 1;
4894 if (end < max_addr) {
4895 pr_err("%s: iommu width (%d) is not "
4896 "sufficient for the mapped address (%llx)\n",
4897 __func__, dmar_domain->gaw, max_addr);
4900 dmar_domain->max_addr = max_addr;
4902 /* Round up size to next multiple of PAGE_SIZE, if it and
4903 the low bits of hpa would take us onto the next page */
4904 size = aligned_nrpages(hpa, size);
4905 ret = domain_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT,
4906 hpa >> VTD_PAGE_SHIFT, size, prot);
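/*
 * Editor's worked example (illustrative): a 0x20-byte mapping starting at
 * hpa 0x12345ff0 straddles a 4KiB boundary, so aligned_nrpages() above
 * rounds the request up to two VT-d pages before the PFN-based
 * domain_mapping() call is made.
 */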
4910 static size_t intel_iommu_unmap(struct iommu_domain *domain,
4911 unsigned long iova, size_t size,
4912 struct iommu_iotlb_gather *gather)
4914 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4915 unsigned long start_pfn, last_pfn;
4918 /* Cope with horrid API which requires us to unmap more than the
4919 size argument if it happens to be a large-page mapping. */
4920 BUG_ON(!pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level));
4922 if (size < VTD_PAGE_SIZE << level_to_offset_bits(level))
4923 size = VTD_PAGE_SIZE << level_to_offset_bits(level);
4925 start_pfn = iova >> VTD_PAGE_SHIFT;
4926 last_pfn = (iova + size - 1) >> VTD_PAGE_SHIFT;
4928 gather->freelist = domain_unmap(dmar_domain, start_pfn,
4929 last_pfn, gather->freelist);
4931 if (dmar_domain->max_addr == iova + size)
4932 dmar_domain->max_addr = iova;
4934 iommu_iotlb_gather_add_page(domain, gather, iova, size);
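/*
 * Flush the IOTLB on every IOMMU serving this domain for the range
 * accumulated in the gather structure, then free the page-table pages
 * queued by intel_iommu_unmap().
 */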
4939 static void intel_iommu_tlb_sync(struct iommu_domain *domain,
4940 struct iommu_iotlb_gather *gather)
4942 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4943 unsigned long iova_pfn = IOVA_PFN(gather->start);
4944 size_t size = gather->end - gather->start;
4945 unsigned long start_pfn;
4946 unsigned long nrpages;
4949 nrpages = aligned_nrpages(gather->start, size);
4950 start_pfn = mm_to_dma_pfn(iova_pfn);
4952 for_each_domain_iommu(iommu_id, dmar_domain)
4953 iommu_flush_iotlb_psi(g_iommus[iommu_id], dmar_domain,
4954 start_pfn, nrpages, !gather->freelist, 0);
4956 dma_free_pagelist(gather->freelist);
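/*
 * Walk the domain's page table and return the physical address backing the
 * given IOVA, taking the level of the final PTE into account so that
 * offsets inside large pages are preserved.
 */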
4959 static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
4962 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4963 struct dma_pte *pte;
4967 pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level);
4968 if (pte && dma_pte_present(pte))
4969 phys = dma_pte_addr(pte) +
4970 (iova & (BIT_MASK(level_to_offset_bits(level) +
4971 VTD_PAGE_SHIFT) - 1));
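/*
 * The three helpers below check whether every active IOMMU in the system
 * supports scalable mode, PASID, and nested translation respectively; a
 * single unit lacking the capability makes the check fail system-wide.
 */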
4976 static inline bool scalable_mode_support(void)
4978 struct dmar_drhd_unit *drhd;
4979 struct intel_iommu *iommu;
4983 for_each_active_iommu(iommu, drhd) {
4984 if (!sm_supported(iommu)) {
4994 static inline bool iommu_pasid_support(void)
4996 struct dmar_drhd_unit *drhd;
4997 struct intel_iommu *iommu;
5001 for_each_active_iommu(iommu, drhd) {
5002 if (!pasid_supported(iommu)) {
5012 static inline bool nested_mode_support(void)
5014 struct dmar_drhd_unit *drhd;
5015 struct intel_iommu *iommu;
5019 for_each_active_iommu(iommu, drhd) {
5020 if (!sm_supported(iommu) || !ecap_nest(iommu->ecap)) {
5030 static bool intel_iommu_capable(enum iommu_cap cap)
5032 if (cap == IOMMU_CAP_CACHE_COHERENCY)
5033 return domain_update_iommu_snooping(NULL) == 1;
5034 if (cap == IOMMU_CAP_INTR_REMAP)
5035 return irq_remapping_enabled == 1;
5040 static struct iommu_device *intel_iommu_probe_device(struct device *dev)
5042 struct intel_iommu *iommu;
5044 iommu = device_to_iommu(dev, NULL, NULL);
5046 return ERR_PTR(-ENODEV);
5048 if (translation_pre_enabled(iommu))
5049 dev_iommu_priv_set(dev, DEFER_DEVICE_DOMAIN_INFO);
5051 return &iommu->iommu;
5054 static void intel_iommu_release_device(struct device *dev)
5056 struct intel_iommu *iommu;
5058 iommu = device_to_iommu(dev, NULL, NULL);
5062 dmar_remove_one_dev_info(dev);
5064 set_dma_ops(dev, NULL);
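/*
 * Once the core has chosen a default domain for the device, wire up the
 * DMA API: DMA domains get the dma-iommu ops limited to the domain's
 * addressable range, everything else falls back to direct DMA.
 */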
5067 static void intel_iommu_probe_finalize(struct device *dev)
5069 dma_addr_t base = IOVA_START_PFN << VTD_PAGE_SHIFT;
5070 struct iommu_domain *domain = iommu_get_domain_for_dev(dev);
5071 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5073 if (domain && domain->type == IOMMU_DOMAIN_DMA)
5074 iommu_setup_dma_ops(dev, base,
5075 __DOMAIN_MAX_ADDR(dmar_domain->gaw) - base);
5077 set_dma_ops(dev, NULL);
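/*
 * Report reserved regions for a device: RMRR ranges that target it (direct
 * or relaxable depending on the device), an optional 0-16MiB window for
 * the legacy ISA/floppy workaround, and the IOAPIC MSI range.
 */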
5080 static void intel_iommu_get_resv_regions(struct device *device,
5081 struct list_head *head)
5083 int prot = DMA_PTE_READ | DMA_PTE_WRITE;
5084 struct iommu_resv_region *reg;
5085 struct dmar_rmrr_unit *rmrr;
5086 struct device *i_dev;
5089 down_read(&dmar_global_lock);
5090 for_each_rmrr_units(rmrr) {
5091 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
5093 struct iommu_resv_region *resv;
5094 enum iommu_resv_type type;
5097 if (i_dev != device &&
5098 !is_downstream_to_pci_bridge(device, i_dev))
5101 length = rmrr->end_address - rmrr->base_address + 1;
5103 type = device_rmrr_is_relaxable(device) ?
5104 IOMMU_RESV_DIRECT_RELAXABLE : IOMMU_RESV_DIRECT;
5106 resv = iommu_alloc_resv_region(rmrr->base_address,
5107 length, prot, type);
5111 list_add_tail(&resv->list, head);
5114 up_read(&dmar_global_lock);
5116 #ifdef CONFIG_INTEL_IOMMU_FLOPPY_WA
5117 if (dev_is_pci(device)) {
5118 struct pci_dev *pdev = to_pci_dev(device);
5120 if ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA) {
5121 reg = iommu_alloc_resv_region(0, 1UL << 24, prot,
5122 IOMMU_RESV_DIRECT_RELAXABLE);
5124 list_add_tail(&reg->list, head);
5127 #endif /* CONFIG_INTEL_IOMMU_FLOPPY_WA */
5129 reg = iommu_alloc_resv_region(IOAPIC_RANGE_START,
5130 IOAPIC_RANGE_END - IOAPIC_RANGE_START + 1,
5134 list_add_tail(&reg->list, head);
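/*
 * Enable PASID for a device: set the PASID-enable bit in its context entry
 * (flushing the context cache when the entry changes) and then enable the
 * PCIe PASID capability in the device itself if it is not already on.
 */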
5137 int intel_iommu_enable_pasid(struct intel_iommu *iommu, struct device *dev)
5139 struct device_domain_info *info;
5140 struct context_entry *context;
5141 struct dmar_domain *domain;
5142 unsigned long flags;
5146 domain = find_domain(dev);
5150 spin_lock_irqsave(&device_domain_lock, flags);
5151 spin_lock(&iommu->lock);
5154 info = get_domain_info(dev);
5155 if (!info || !info->pasid_supported)
5158 context = iommu_context_addr(iommu, info->bus, info->devfn, 0);
5159 if (WARN_ON(!context))
5162 ctx_lo = context[0].lo;
5164 if (!(ctx_lo & CONTEXT_PASIDE)) {
5165 ctx_lo |= CONTEXT_PASIDE;
5166 context[0].lo = ctx_lo;
5168 iommu->flush.flush_context(iommu,
5169 domain->iommu_did[iommu->seq_id],
5170 PCI_DEVID(info->bus, info->devfn),
5171 DMA_CCMD_MASK_NOBIT,
5172 DMA_CCMD_DEVICE_INVL);
5175 /* Enable PASID support in the device, if it wasn't already */
5176 if (!info->pasid_enabled)
5177 iommu_enable_dev_iotlb(info);
5182 spin_unlock(&iommu->lock);
5183 spin_unlock_irqrestore(&device_domain_lock, flags);
5188 static struct iommu_group *intel_iommu_device_group(struct device *dev)
5190 if (dev_is_pci(dev))
5191 return pci_device_group(dev);
5192 return generic_device_group(dev);
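/*
 * Auxiliary domain support: enabling requires an active scalable-mode IOMMU
 * with PASID support; the state is tracked per device in device_domain_info
 * so that dev_feat_enabled can report it later.
 */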
5195 static int intel_iommu_enable_auxd(struct device *dev)
5197 struct device_domain_info *info;
5198 struct intel_iommu *iommu;
5199 unsigned long flags;
5202 iommu = device_to_iommu(dev, NULL, NULL);
5203 if (!iommu || dmar_disabled)
5206 if (!sm_supported(iommu) || !pasid_supported(iommu))
5209 ret = intel_iommu_enable_pasid(iommu, dev);
5213 spin_lock_irqsave(&device_domain_lock, flags);
5214 info = get_domain_info(dev);
5215 info->auxd_enabled = 1;
5216 spin_unlock_irqrestore(&device_domain_lock, flags);
5221 static int intel_iommu_disable_auxd(struct device *dev)
5223 struct device_domain_info *info;
5224 unsigned long flags;
5226 spin_lock_irqsave(&device_domain_lock, flags);
5227 info = get_domain_info(dev);
5228 if (!WARN_ON(!info))
5229 info->auxd_enabled = 0;
5230 spin_unlock_irqrestore(&device_domain_lock, flags);
5236 * A PCI Express designated vendor specific extended capability is defined
5237 * in section 3.7 of the Intel scalable I/O virtualization technical spec
5238 * for system software and tools to detect endpoint devices supporting
5239 * Intel scalable I/O virtualization without a host driver dependency.
5241 * Returns the address of the matching extended capability structure within
5242 * the device's PCI configuration space or 0 if the device does not support it.
5245 static int siov_find_pci_dvsec(struct pci_dev *pdev)
5250 pos = pci_find_next_ext_capability(pdev, 0, 0x23);
5252 pci_read_config_word(pdev, pos + 4, &vendor);
5253 pci_read_config_word(pdev, pos + 8, &id);
5254 if (vendor == PCI_VENDOR_ID_INTEL && id == 5)
5257 pos = pci_find_next_ext_capability(pdev, pos, 0x23);
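/*
 * IOMMU_DEV_FEAT_AUX needs scalable mode and PASID support on all IOMMUs,
 * PASID support in the PCI device, and the SIOV DVSEC capability.
 * IOMMU_DEV_FEAT_SVA needs an SVM-capable IOMMU plus device-side PASID,
 * PRI and ATS support.
 */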
5264 intel_iommu_dev_has_feat(struct device *dev, enum iommu_dev_features feat)
5266 if (feat == IOMMU_DEV_FEAT_AUX) {
5269 if (!dev_is_pci(dev) || dmar_disabled ||
5270 !scalable_mode_support() || !iommu_pasid_support())
5273 ret = pci_pasid_features(to_pci_dev(dev));
5277 return !!siov_find_pci_dvsec(to_pci_dev(dev));
5280 if (feat == IOMMU_DEV_FEAT_SVA) {
5281 struct device_domain_info *info = get_domain_info(dev);
5283 return info && (info->iommu->flags & VTD_FLAG_SVM_CAPABLE) &&
5284 info->pasid_supported && info->pri_supported &&
5285 info->ats_supported;
5292 intel_iommu_dev_enable_feat(struct device *dev, enum iommu_dev_features feat)
5294 if (feat == IOMMU_DEV_FEAT_AUX)
5295 return intel_iommu_enable_auxd(dev);
5297 if (feat == IOMMU_DEV_FEAT_SVA) {
5298 struct device_domain_info *info = get_domain_info(dev);
5303 if (info->iommu->flags & VTD_FLAG_SVM_CAPABLE)
5311 intel_iommu_dev_disable_feat(struct device *dev, enum iommu_dev_features feat)
5313 if (feat == IOMMU_DEV_FEAT_AUX)
5314 return intel_iommu_disable_auxd(dev);
5320 intel_iommu_dev_feat_enabled(struct device *dev, enum iommu_dev_features feat)
5322 struct device_domain_info *info = get_domain_info(dev);
5324 if (feat == IOMMU_DEV_FEAT_AUX)
5325 return scalable_mode_support() && info && info->auxd_enabled;
5331 intel_iommu_aux_get_pasid(struct iommu_domain *domain, struct device *dev)
5333 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5335 return dmar_domain->default_pasid > 0 ?
5336 dmar_domain->default_pasid : -EINVAL;
5339 static bool intel_iommu_is_attach_deferred(struct iommu_domain *domain,
5342 return attach_deferred(dev);
5346 intel_iommu_domain_set_attr(struct iommu_domain *domain,
5347 enum iommu_attr attr, void *data)
5349 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5350 unsigned long flags;
5353 if (domain->type != IOMMU_DOMAIN_UNMANAGED)
5357 case DOMAIN_ATTR_NESTING:
5358 spin_lock_irqsave(&device_domain_lock, flags);
5359 if (nested_mode_support() &&
5360 list_empty(&dmar_domain->devices)) {
5361 dmar_domain->flags |= DOMAIN_FLAG_NESTING_MODE;
5362 dmar_domain->flags &= ~DOMAIN_FLAG_USE_FIRST_LEVEL;
5366 spin_unlock_irqrestore(&device_domain_lock, flags);
5377 intel_iommu_domain_get_attr(struct iommu_domain *domain,
5378 enum iommu_attr attr, void *data)
5380 switch (domain->type) {
5381 case IOMMU_DOMAIN_UNMANAGED:
5383 case IOMMU_DOMAIN_DMA:
5385 case DOMAIN_ATTR_DMA_USE_FLUSH_QUEUE:
5386 *(int *)data = !intel_iommu_strict;
5398 * Check that the device does not live on an external-facing PCI port that is
5399 * marked as untrusted. Such devices should not be allowed to apply quirks,
5400 * and thus must not be able to bypass the IOMMU restrictions.
5402 static bool risky_device(struct pci_dev *pdev)
5404 if (pdev->untrusted) {
5406 "Skipping IOMMU quirk for dev [%04X:%04X] on untrusted PCI link\n",
5407 pdev->vendor, pdev->device);
5408 pci_info(pdev, "Please check with your BIOS/Platform vendor about this\n");
5414 const struct iommu_ops intel_iommu_ops = {
5415 .capable = intel_iommu_capable,
5416 .domain_alloc = intel_iommu_domain_alloc,
5417 .domain_free = intel_iommu_domain_free,
5418 .domain_get_attr = intel_iommu_domain_get_attr,
5419 .domain_set_attr = intel_iommu_domain_set_attr,
5420 .attach_dev = intel_iommu_attach_device,
5421 .detach_dev = intel_iommu_detach_device,
5422 .aux_attach_dev = intel_iommu_aux_attach_device,
5423 .aux_detach_dev = intel_iommu_aux_detach_device,
5424 .aux_get_pasid = intel_iommu_aux_get_pasid,
5425 .map = intel_iommu_map,
5426 .unmap = intel_iommu_unmap,
5427 .flush_iotlb_all = intel_flush_iotlb_all,
5428 .iotlb_sync = intel_iommu_tlb_sync,
5429 .iova_to_phys = intel_iommu_iova_to_phys,
5430 .probe_device = intel_iommu_probe_device,
5431 .probe_finalize = intel_iommu_probe_finalize,
5432 .release_device = intel_iommu_release_device,
5433 .get_resv_regions = intel_iommu_get_resv_regions,
5434 .put_resv_regions = generic_iommu_put_resv_regions,
5435 .device_group = intel_iommu_device_group,
5436 .dev_has_feat = intel_iommu_dev_has_feat,
5437 .dev_feat_enabled = intel_iommu_dev_feat_enabled,
5438 .dev_enable_feat = intel_iommu_dev_enable_feat,
5439 .dev_disable_feat = intel_iommu_dev_disable_feat,
5440 .is_attach_deferred = intel_iommu_is_attach_deferred,
5441 .def_domain_type = device_def_domain_type,
5442 .pgsize_bitmap = INTEL_IOMMU_PGSIZES,
5443 #ifdef CONFIG_INTEL_IOMMU_SVM
5444 .cache_invalidate = intel_iommu_sva_invalidate,
5445 .sva_bind_gpasid = intel_svm_bind_gpasid,
5446 .sva_unbind_gpasid = intel_svm_unbind_gpasid,
5447 .sva_bind = intel_svm_bind,
5448 .sva_unbind = intel_svm_unbind,
5449 .sva_get_pasid = intel_svm_get_pasid,
5450 .page_response = intel_svm_page_response,
5454 static void quirk_iommu_igfx(struct pci_dev *dev)
5456 if (risky_device(dev))
5459 pci_info(dev, "Disabling IOMMU for graphics on this chipset\n");
5463 /* G4x/GM45 integrated gfx dmar support is totally busted. */
5464 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_igfx);
5465 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_igfx);
5466 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_igfx);
5467 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_igfx);
5468 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_igfx);
5469 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_igfx);
5470 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_igfx);
5472 /* Broadwell igfx malfunctions with dmar */
5473 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1606, quirk_iommu_igfx);
5474 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160B, quirk_iommu_igfx);
5475 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160E, quirk_iommu_igfx);
5476 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1602, quirk_iommu_igfx);
5477 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160A, quirk_iommu_igfx);
5478 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160D, quirk_iommu_igfx);
5479 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1616, quirk_iommu_igfx);
5480 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161B, quirk_iommu_igfx);
5481 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161E, quirk_iommu_igfx);
5482 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1612, quirk_iommu_igfx);
5483 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161A, quirk_iommu_igfx);
5484 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161D, quirk_iommu_igfx);
5485 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1626, quirk_iommu_igfx);
5486 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162B, quirk_iommu_igfx);
5487 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162E, quirk_iommu_igfx);
5488 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1622, quirk_iommu_igfx);
5489 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162A, quirk_iommu_igfx);
5490 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162D, quirk_iommu_igfx);
5491 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1636, quirk_iommu_igfx);
5492 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163B, quirk_iommu_igfx);
5493 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163E, quirk_iommu_igfx);
5494 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1632, quirk_iommu_igfx);
5495 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163A, quirk_iommu_igfx);
5496 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163D, quirk_iommu_igfx);
5498 static void quirk_iommu_rwbf(struct pci_dev *dev)
5500 if (risky_device(dev))
5504 * Mobile 4 Series Chipset neglects to set RWBF capability,
5505 * but needs it. Same seems to hold for the desktop versions.
5507 pci_info(dev, "Forcing write-buffer flush capability\n");
5511 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf);
5512 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_rwbf);
5513 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_rwbf);
5514 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_rwbf);
5515 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_rwbf);
5516 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_rwbf);
5517 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_rwbf);
5520 #define GGC_MEMORY_SIZE_MASK (0xf << 8)
5521 #define GGC_MEMORY_SIZE_NONE (0x0 << 8)
5522 #define GGC_MEMORY_SIZE_1M (0x1 << 8)
5523 #define GGC_MEMORY_SIZE_2M (0x3 << 8)
5524 #define GGC_MEMORY_VT_ENABLED (0x8 << 8)
5525 #define GGC_MEMORY_SIZE_2M_VT (0x9 << 8)
5526 #define GGC_MEMORY_SIZE_3M_VT (0xa << 8)
5527 #define GGC_MEMORY_SIZE_4M_VT (0xb << 8)
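/*
 * Calpella/Ironlake: if the BIOS left no stolen memory for a shadow GTT
 * (GGC reports VT disabled), graphics DMA cannot be remapped, so the IOMMU
 * is disabled for graphics; otherwise force strict IOTLB flushing so the
 * gfx device is idle before mappings are torn down.
 */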
5529 static void quirk_calpella_no_shadow_gtt(struct pci_dev *dev)
5533 if (risky_device(dev))
5536 if (pci_read_config_word(dev, GGC, &ggc))
5539 if (!(ggc & GGC_MEMORY_VT_ENABLED)) {
5540 pci_info(dev, "BIOS has allocated no shadow GTT; disabling IOMMU for graphics\n");
5542 } else if (dmar_map_gfx) {
5543 /* we have to ensure the gfx device is idle before we flush */
5544 pci_info(dev, "Disabling batched IOTLB flush on Ironlake\n");
5545 intel_iommu_strict = 1;
5548 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0040, quirk_calpella_no_shadow_gtt);
5549 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0044, quirk_calpella_no_shadow_gtt);
5550 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0062, quirk_calpella_no_shadow_gtt);
5551 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x006a, quirk_calpella_no_shadow_gtt);
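/*
 * For the integrated graphics generations matched below, keep translation
 * enabled when the IOMMU would otherwise be disabled: the quirk matches on
 * the generation byte of the device ID and sets iommu_skip_te_disable.
 */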
5553 static void quirk_igfx_skip_te_disable(struct pci_dev *dev)
5557 if (!IS_GFX_DEVICE(dev))
5560 ver = (dev->device >> 8) & 0xff;
5561 if (ver != 0x45 && ver != 0x46 && ver != 0x4c &&
5562 ver != 0x4e && ver != 0x8a && ver != 0x98 &&
5566 if (risky_device(dev))
5569 pci_info(dev, "Skip IOMMU disabling for graphics\n");
5570 iommu_skip_te_disable = 1;
5572 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_ANY_ID, quirk_igfx_skip_te_disable);
5574 /* On Tylersburg chipsets, some BIOSes have been known to enable the
5575 ISOCH DMAR unit for the Azalia sound device, but not give it any
5576 TLB entries, which causes it to deadlock. Check for that. We do
5577 this in a function called from init_dmars(), instead of in a PCI
5578 quirk, because we don't want to print the obnoxious "BIOS broken"
5579 message if VT-d is actually disabled.
5581 static void __init check_tylersburg_isoch(void)
5583 struct pci_dev *pdev;
5584 uint32_t vtisochctrl;
5586 /* If there's no Azalia in the system anyway, forget it. */
5587 pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3a3e, NULL);
5591 if (risky_device(pdev)) {
5598 /* System Management Registers. Might be hidden, in which case
5599 we can't do the sanity check. But that's OK, because the
5600 known-broken BIOSes _don't_ actually hide it, so far. */
5601 pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x342e, NULL);
5605 if (risky_device(pdev)) {
5610 if (pci_read_config_dword(pdev, 0x188, &vtisochctrl)) {
5617 /* If Azalia DMA is routed to the non-isoch DMAR unit, fine. */
5618 if (vtisochctrl & 1)
5621 /* Drop all bits other than the number of TLB entries */
5622 vtisochctrl &= 0x1c;
5624 /* If we have the recommended number of TLB entries (16), fine. */
5625 if (vtisochctrl == 0x10)
5628 /* Zero TLB entries? You get to ride the short bus to school. */
5630 WARN(1, "Your BIOS is broken; DMA routed to ISOCH DMAR unit but no TLB space.\n"
5631 "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
5632 dmi_get_system_info(DMI_BIOS_VENDOR),
5633 dmi_get_system_info(DMI_BIOS_VERSION),
5634 dmi_get_system_info(DMI_PRODUCT_VERSION));
5635 iommu_identity_mapping |= IDENTMAP_AZALIA;
5639 pr_warn("Recommended TLB entries for ISOCH unit is 16; your BIOS set %d\n",