/*
 * Copyright © 2006-2014 Intel Corporation.
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms and conditions of the GNU General Public License,
 * version 2, as published by the Free Software Foundation.
 *
 * This program is distributed in the hope it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
 * more details.
 *
 * Authors: David Woodhouse <dwmw2@infradead.org>,
 *          Ashok Raj <ashok.raj@intel.com>,
 *          Shaohua Li <shaohua.li@intel.com>,
 *          Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>,
 *          Fenghua Yu <fenghua.yu@intel.com>
 *          Joerg Roedel <jroedel@suse.de>
 */
#define pr_fmt(fmt)	"DMAR: " fmt
#define dev_fmt(fmt)	pr_fmt(fmt)

#include <linux/init.h>
#include <linux/bitmap.h>
#include <linux/debugfs.h>
#include <linux/export.h>
#include <linux/slab.h>
#include <linux/irq.h>
#include <linux/interrupt.h>
#include <linux/spinlock.h>
#include <linux/pci.h>
#include <linux/dmar.h>
#include <linux/dma-mapping.h>
#include <linux/mempool.h>
#include <linux/memory.h>
#include <linux/cpu.h>
#include <linux/timer.h>
#include <linux/iova.h>
#include <linux/iommu.h>
#include <linux/intel-iommu.h>
#include <linux/syscore_ops.h>
#include <linux/tboot.h>
#include <linux/dmi.h>
#include <linux/pci-ats.h>
#include <linux/memblock.h>
#include <linux/dma-contiguous.h>
#include <linux/dma-direct.h>
#include <linux/crash_dump.h>
#include <linux/numa.h>
#include <asm/irq_remapping.h>
#include <asm/cacheflush.h>
#include <asm/iommu.h>

#include "irq_remapping.h"
#include "intel-pasid.h"
#define ROOT_SIZE		VTD_PAGE_SIZE
#define CONTEXT_SIZE		VTD_PAGE_SIZE

#define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
#define IS_USB_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_SERIAL_USB)
#define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
#define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e)

#define IOAPIC_RANGE_START	(0xfee00000)
#define IOAPIC_RANGE_END	(0xfeefffff)
#define IOVA_START_ADDR		(0x1000)

#define DEFAULT_DOMAIN_ADDRESS_WIDTH 57

#define MAX_AGAW_WIDTH 64
#define MAX_AGAW_PFN_WIDTH	(MAX_AGAW_WIDTH - VTD_PAGE_SHIFT)

#define __DOMAIN_MAX_PFN(gaw)  ((((uint64_t)1) << (gaw-VTD_PAGE_SHIFT)) - 1)
#define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << gaw) - 1)

/* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR
   to match. That way, we can use 'unsigned long' for PFNs with impunity. */
#define DOMAIN_MAX_PFN(gaw)	((unsigned long) min_t(uint64_t, \
				__DOMAIN_MAX_PFN(gaw), (unsigned long)-1))
#define DOMAIN_MAX_ADDR(gaw)	(((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT)

/* IO virtual address start page frame number */
#define IOVA_START_PFN		(1)

#define IOVA_PFN(addr)		((addr) >> PAGE_SHIFT)

/* page table handling */
#define LEVEL_STRIDE		(9)
#define LEVEL_MASK		(((u64)1 << LEVEL_STRIDE) - 1)
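/*
 * Illustrative note: each page-table level decodes LEVEL_STRIDE (9) bits of
 * the DMA PFN, i.e. 512 entries per level, matching a 4KiB table of 8-byte
 * PTEs.
 */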
/*
 * This bitmap is used to advertise the page sizes our hardware supports
 * to the IOMMU core, which will then use this information to split
 * physically contiguous memory regions it is mapping into page sizes
 * that we support.
 *
 * Traditionally the IOMMU core just handed us the mappings directly,
 * after making sure the size is an order of a 4KiB page and that the
 * mapping has natural alignment.
 *
 * To retain this behavior, we currently advertise that we support
 * all page sizes that are an order of 4KiB.
 *
 * If at some point we'd like to utilize the IOMMU core's new behavior,
 * we could change this to advertise the real page sizes we support.
 */
#define INTEL_IOMMU_PGSIZES	(~0xFFFUL)
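/*
 * ~0xFFFUL sets every bit at or above bit 12, so every power-of-two size
 * of at least 4KiB is advertised to the IOMMU core.
 */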
static inline int agaw_to_level(int agaw)
{
	return agaw + 2;
}

static inline int agaw_to_width(int agaw)
{
	return min_t(int, 30 + agaw * LEVEL_STRIDE, MAX_AGAW_WIDTH);
}

static inline int width_to_agaw(int width)
{
	return DIV_ROUND_UP(width - 30, LEVEL_STRIDE);
}

static inline unsigned int level_to_offset_bits(int level)
{
	return (level - 1) * LEVEL_STRIDE;
}

static inline int pfn_level_offset(unsigned long pfn, int level)
{
	return (pfn >> level_to_offset_bits(level)) & LEVEL_MASK;
}

static inline unsigned long level_mask(int level)
{
	return -1UL << level_to_offset_bits(level);
}

static inline unsigned long level_size(int level)
{
	return 1UL << level_to_offset_bits(level);
}

static inline unsigned long align_to_level(unsigned long pfn, int level)
{
	return (pfn + level_size(level) - 1) & level_mask(level);
}
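/*
 * Worked example: a 48-bit address width corresponds to 4-level paging,
 * since width_to_agaw(48) = DIV_ROUND_UP(48 - 30, 9) = 2,
 * agaw_to_level(2) = 4, and agaw_to_width(2) = min(30 + 2 * 9, 64) = 48.
 */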
static inline unsigned long lvl_to_nr_pages(unsigned int lvl)
{
	return 1 << min_t(int, (lvl - 1) * LEVEL_STRIDE, MAX_AGAW_PFN_WIDTH);
}

/* VT-d pages must always be _smaller_ than MM pages. Otherwise things
   are never going to work. */
static inline unsigned long dma_to_mm_pfn(unsigned long dma_pfn)
{
	return dma_pfn >> (PAGE_SHIFT - VTD_PAGE_SHIFT);
}

static inline unsigned long mm_to_dma_pfn(unsigned long mm_pfn)
{
	return mm_pfn << (PAGE_SHIFT - VTD_PAGE_SHIFT);
}

static inline unsigned long page_to_dma_pfn(struct page *pg)
{
	return mm_to_dma_pfn(page_to_pfn(pg));
}

static inline unsigned long virt_to_dma_pfn(void *p)
{
	return page_to_dma_pfn(virt_to_page(p));
}
/* global iommu list, set NULL for ignored DMAR units */
static struct intel_iommu **g_iommus;

static void __init check_tylersburg_isoch(void);
static int rwbf_quirk;

/*
 * set to 1 to panic kernel if can't successfully enable VT-d
 * (used when kernel is launched w/ TXT)
 */
static int force_on = 0;
int intel_iommu_tboot_noforce;
static int no_platform_optin;
#define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))

/*
 * Take a root_entry and return the Lower Context Table Pointer (LCTP)
 * if marked present.
 */
static phys_addr_t root_entry_lctp(struct root_entry *re)
{
	if (!(re->lo & 1))
		return 0;

	return re->lo & VTD_PAGE_MASK;
}

/*
 * Take a root_entry and return the Upper Context Table Pointer (UCTP)
 * if marked present.
 */
static phys_addr_t root_entry_uctp(struct root_entry *re)
{
	if (!(re->hi & 1))
		return 0;

	return re->hi & VTD_PAGE_MASK;
}
static inline void context_clear_pasid_enable(struct context_entry *context)
{
	context->lo &= ~(1ULL << 11);
}

static inline bool context_pasid_enabled(struct context_entry *context)
{
	return !!(context->lo & (1ULL << 11));
}

static inline void context_set_copied(struct context_entry *context)
{
	context->hi |= (1ull << 3);
}

static inline bool context_copied(struct context_entry *context)
{
	return !!(context->hi & (1ULL << 3));
}

static inline bool __context_present(struct context_entry *context)
{
	return (context->lo & 1);
}

bool context_present(struct context_entry *context)
{
	return context_pasid_enabled(context) ?
	     __context_present(context) :
	     __context_present(context) && !context_copied(context);
}

static inline void context_set_present(struct context_entry *context)
{
	context->lo |= 1;
}

static inline void context_set_fault_enable(struct context_entry *context)
{
	context->lo &= (((u64)-1) << 2) | 1;
}

static inline void context_set_translation_type(struct context_entry *context,
						unsigned long value)
{
	context->lo &= (((u64)-1) << 4) | 3;
	context->lo |= (value & 3) << 2;
}

static inline void context_set_address_root(struct context_entry *context,
					    unsigned long value)
{
	context->lo &= ~VTD_PAGE_MASK;
	context->lo |= value & VTD_PAGE_MASK;
}

static inline void context_set_address_width(struct context_entry *context,
					     unsigned long value)
{
	context->hi |= value & 7;
}

static inline void context_set_domain_id(struct context_entry *context,
					 unsigned long value)
{
	context->hi |= (value & ((1 << 16) - 1)) << 8;
}

static inline int context_domain_id(struct context_entry *c)
{
	return((c->hi >> 8) & 0xffff);
}

static inline void context_clear_entry(struct context_entry *context)
{
	context->lo = 0;
	context->hi = 0;
}
/*
 * This domain is a statically identity mapping domain.
 *	1. This domain creates a static 1:1 mapping to all usable memory.
 *	2. It maps to each iommu if successful.
 *	3. Each iommu maps to this domain if successful.
 */
static struct dmar_domain *si_domain;
static int hw_pass_through = 1;

/* si_domain contains multiple devices */
#define DOMAIN_FLAG_STATIC_IDENTITY	BIT(0)
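/*
 * Iterate over the seq_ids of all IOMMUs this domain is currently attached
 * to, i.e. those with a non-zero per-domain reference count.
 */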
#define for_each_domain_iommu(idx, domain)			\
	for (idx = 0; idx < g_num_of_iommus; idx++)		\
		if (domain->iommu_refcnt[idx])
312 struct dmar_rmrr_unit {
313 struct list_head list; /* list of rmrr units */
314 struct acpi_dmar_header *hdr; /* ACPI header */
315 u64 base_address; /* reserved base address*/
316 u64 end_address; /* reserved end address */
317 struct dmar_dev_scope *devices; /* target devices */
318 int devices_cnt; /* target device count */
319 struct iommu_resv_region *resv; /* reserved region handle */
322 struct dmar_atsr_unit {
323 struct list_head list; /* list of ATSR units */
324 struct acpi_dmar_header *hdr; /* ACPI header */
325 struct dmar_dev_scope *devices; /* target devices */
326 int devices_cnt; /* target device count */
327 u8 include_all:1; /* include all ports */
330 static LIST_HEAD(dmar_atsr_units);
331 static LIST_HEAD(dmar_rmrr_units);
333 #define for_each_rmrr_units(rmrr) \
334 list_for_each_entry(rmrr, &dmar_rmrr_units, list)
336 /* bitmap for indexing intel_iommus */
337 static int g_num_of_iommus;
339 static void domain_exit(struct dmar_domain *domain);
340 static void domain_remove_dev_info(struct dmar_domain *domain);
341 static void dmar_remove_one_dev_info(struct device *dev);
342 static void __dmar_remove_one_dev_info(struct device_domain_info *info);
343 static void domain_context_clear(struct intel_iommu *iommu,
345 static int domain_detach_iommu(struct dmar_domain *domain,
346 struct intel_iommu *iommu);
347 static bool device_is_rmrr_locked(struct device *dev);
349 #ifdef CONFIG_INTEL_IOMMU_DEFAULT_ON
350 int dmar_disabled = 0;
352 int dmar_disabled = 1;
353 #endif /*CONFIG_INTEL_IOMMU_DEFAULT_ON*/
356 int intel_iommu_enabled = 0;
357 EXPORT_SYMBOL_GPL(intel_iommu_enabled);
359 static int dmar_map_gfx = 1;
360 static int dmar_forcedac;
361 static int intel_iommu_strict;
362 static int intel_iommu_superpage = 1;
363 static int iommu_identity_mapping;
365 #define IDENTMAP_ALL 1
366 #define IDENTMAP_GFX 2
367 #define IDENTMAP_AZALIA 4
369 int intel_iommu_gfx_mapped;
370 EXPORT_SYMBOL_GPL(intel_iommu_gfx_mapped);
372 #define DUMMY_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-1))
373 static DEFINE_SPINLOCK(device_domain_lock);
374 static LIST_HEAD(device_domain_list);
377 * Iterate over elements in device_domain_list and call the specified
378 * callback @fn against each element.
380 int for_each_device_domain(int (*fn)(struct device_domain_info *info,
381 void *data), void *data)
385 struct device_domain_info *info;
387 spin_lock_irqsave(&device_domain_lock, flags);
388 list_for_each_entry(info, &device_domain_list, global) {
389 ret = fn(info, data);
391 spin_unlock_irqrestore(&device_domain_lock, flags);
395 spin_unlock_irqrestore(&device_domain_lock, flags);
400 const struct iommu_ops intel_iommu_ops;
402 static bool translation_pre_enabled(struct intel_iommu *iommu)
404 return (iommu->flags & VTD_FLAG_TRANS_PRE_ENABLED);
407 static void clear_translation_pre_enabled(struct intel_iommu *iommu)
409 iommu->flags &= ~VTD_FLAG_TRANS_PRE_ENABLED;
412 static void init_translation_status(struct intel_iommu *iommu)
416 gsts = readl(iommu->reg + DMAR_GSTS_REG);
417 if (gsts & DMA_GSTS_TES)
418 iommu->flags |= VTD_FLAG_TRANS_PRE_ENABLED;
/* Convert generic 'struct iommu_domain' to private struct dmar_domain */
422 static struct dmar_domain *to_dmar_domain(struct iommu_domain *dom)
424 return container_of(dom, struct dmar_domain, domain);
427 static int __init intel_iommu_setup(char *str)
432 if (!strncmp(str, "on", 2)) {
434 pr_info("IOMMU enabled\n");
435 } else if (!strncmp(str, "off", 3)) {
437 no_platform_optin = 1;
438 pr_info("IOMMU disabled\n");
439 } else if (!strncmp(str, "igfx_off", 8)) {
441 pr_info("Disable GFX device mapping\n");
442 } else if (!strncmp(str, "forcedac", 8)) {
443 pr_info("Forcing DAC for PCI devices\n");
445 } else if (!strncmp(str, "strict", 6)) {
446 pr_info("Disable batched IOTLB flush\n");
447 intel_iommu_strict = 1;
448 } else if (!strncmp(str, "sp_off", 6)) {
449 pr_info("Disable supported super page\n");
450 intel_iommu_superpage = 0;
451 } else if (!strncmp(str, "sm_on", 5)) {
452 pr_info("Intel-IOMMU: scalable mode supported\n");
454 } else if (!strncmp(str, "tboot_noforce", 13)) {
456 "Intel-IOMMU: not forcing on after tboot. This could expose security risk for tboot\n");
457 intel_iommu_tboot_noforce = 1;
460 str += strcspn(str, ",");
466 __setup("intel_iommu=", intel_iommu_setup);
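/*
 * Example (illustrative): booting with "intel_iommu=on,strict,sp_off"
 * enables the IOMMU, disables batched IOTLB flushing and disables superpage
 * support; options are comma-separated, as parsed above.
 */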
468 static struct kmem_cache *iommu_domain_cache;
469 static struct kmem_cache *iommu_devinfo_cache;
471 static struct dmar_domain* get_iommu_domain(struct intel_iommu *iommu, u16 did)
473 struct dmar_domain **domains;
476 domains = iommu->domains[idx];
480 return domains[did & 0xff];
483 static void set_iommu_domain(struct intel_iommu *iommu, u16 did,
484 struct dmar_domain *domain)
486 struct dmar_domain **domains;
489 if (!iommu->domains[idx]) {
490 size_t size = 256 * sizeof(struct dmar_domain *);
491 iommu->domains[idx] = kzalloc(size, GFP_ATOMIC);
494 domains = iommu->domains[idx];
495 if (WARN_ON(!domains))
498 domains[did & 0xff] = domain;
501 void *alloc_pgtable_page(int node)
506 page = alloc_pages_node(node, GFP_ATOMIC | __GFP_ZERO, 0);
508 vaddr = page_address(page);
512 void free_pgtable_page(void *vaddr)
514 free_page((unsigned long)vaddr);
517 static inline void *alloc_domain_mem(void)
519 return kmem_cache_alloc(iommu_domain_cache, GFP_ATOMIC);
522 static void free_domain_mem(void *vaddr)
524 kmem_cache_free(iommu_domain_cache, vaddr);
527 static inline void * alloc_devinfo_mem(void)
529 return kmem_cache_alloc(iommu_devinfo_cache, GFP_ATOMIC);
532 static inline void free_devinfo_mem(void *vaddr)
534 kmem_cache_free(iommu_devinfo_cache, vaddr);
537 static inline int domain_type_is_si(struct dmar_domain *domain)
539 return domain->flags & DOMAIN_FLAG_STATIC_IDENTITY;
542 static inline int domain_pfn_supported(struct dmar_domain *domain,
545 int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
547 return !(addr_width < BITS_PER_LONG && pfn >> addr_width);
550 static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw)
555 sagaw = cap_sagaw(iommu->cap);
556 for (agaw = width_to_agaw(max_gaw);
558 if (test_bit(agaw, &sagaw))
/*
 * Calculate max SAGAW for each iommu.
 */
568 int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
570 return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH);
/*
 * Calculate agaw for each iommu.
 * "SAGAW" may be different across iommus; use a default agaw, and
 * fall back to a smaller supported agaw for iommus that don't support
 * the default agaw.
 */
578 int iommu_calculate_agaw(struct intel_iommu *iommu)
580 return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH);
/* This function only returns a single iommu in a domain */
584 struct intel_iommu *domain_get_iommu(struct dmar_domain *domain)
588 /* si_domain and vm domain should not get here. */
589 if (WARN_ON(domain->domain.type != IOMMU_DOMAIN_DMA))
592 for_each_domain_iommu(iommu_id, domain)
595 if (iommu_id < 0 || iommu_id >= g_num_of_iommus)
598 return g_iommus[iommu_id];
601 static void domain_update_iommu_coherency(struct dmar_domain *domain)
603 struct dmar_drhd_unit *drhd;
604 struct intel_iommu *iommu;
608 domain->iommu_coherency = 1;
610 for_each_domain_iommu(i, domain) {
612 if (!ecap_coherent(g_iommus[i]->ecap)) {
613 domain->iommu_coherency = 0;
620 /* No hardware attached; use lowest common denominator */
622 for_each_active_iommu(iommu, drhd) {
623 if (!ecap_coherent(iommu->ecap)) {
624 domain->iommu_coherency = 0;
631 static int domain_update_iommu_snooping(struct intel_iommu *skip)
633 struct dmar_drhd_unit *drhd;
634 struct intel_iommu *iommu;
638 for_each_active_iommu(iommu, drhd) {
640 if (!ecap_sc_support(iommu->ecap)) {
651 static int domain_update_iommu_superpage(struct intel_iommu *skip)
653 struct dmar_drhd_unit *drhd;
654 struct intel_iommu *iommu;
657 if (!intel_iommu_superpage) {
661 /* set iommu_superpage to the smallest common denominator */
663 for_each_active_iommu(iommu, drhd) {
665 mask &= cap_super_page_val(iommu->cap);
675 /* Some capabilities may be different across iommus */
676 static void domain_update_iommu_cap(struct dmar_domain *domain)
678 domain_update_iommu_coherency(domain);
679 domain->iommu_snooping = domain_update_iommu_snooping(NULL);
680 domain->iommu_superpage = domain_update_iommu_superpage(NULL);
683 struct context_entry *iommu_context_addr(struct intel_iommu *iommu, u8 bus,
686 struct root_entry *root = &iommu->root_entry[bus];
687 struct context_entry *context;
691 if (sm_supported(iommu)) {
699 context = phys_to_virt(*entry & VTD_PAGE_MASK);
701 unsigned long phy_addr;
705 context = alloc_pgtable_page(iommu->node);
709 __iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
710 phy_addr = virt_to_phys((void *)context);
711 *entry = phy_addr | 1;
712 __iommu_flush_cache(iommu, entry, sizeof(*entry));
714 return &context[devfn];
717 static int iommu_dummy(struct device *dev)
719 return dev->archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO;
722 static struct intel_iommu *device_to_iommu(struct device *dev, u8 *bus, u8 *devfn)
724 struct dmar_drhd_unit *drhd = NULL;
725 struct intel_iommu *iommu;
727 struct pci_dev *ptmp, *pdev = NULL;
731 if (iommu_dummy(dev))
734 if (dev_is_pci(dev)) {
735 struct pci_dev *pf_pdev;
737 pdev = to_pci_dev(dev);
740 /* VMD child devices currently cannot be handled individually */
741 if (is_vmd(pdev->bus))
745 /* VFs aren't listed in scope tables; we need to look up
746 * the PF instead to find the IOMMU. */
747 pf_pdev = pci_physfn(pdev);
749 segment = pci_domain_nr(pdev->bus);
750 } else if (has_acpi_companion(dev))
751 dev = &ACPI_COMPANION(dev)->dev;
754 for_each_active_iommu(iommu, drhd) {
755 if (pdev && segment != drhd->segment)
758 for_each_active_dev_scope(drhd->devices,
759 drhd->devices_cnt, i, tmp) {
761 /* For a VF use its original BDF# not that of the PF
762 * which we used for the IOMMU lookup. Strictly speaking
763 * we could do this for all PCI devices; we only need to
764 * get the BDF# from the scope table for ACPI matches. */
765 if (pdev && pdev->is_virtfn)
768 *bus = drhd->devices[i].bus;
769 *devfn = drhd->devices[i].devfn;
773 if (!pdev || !dev_is_pci(tmp))
776 ptmp = to_pci_dev(tmp);
777 if (ptmp->subordinate &&
778 ptmp->subordinate->number <= pdev->bus->number &&
779 ptmp->subordinate->busn_res.end >= pdev->bus->number)
783 if (pdev && drhd->include_all) {
785 *bus = pdev->bus->number;
786 *devfn = pdev->devfn;
797 static void domain_flush_cache(struct dmar_domain *domain,
798 void *addr, int size)
800 if (!domain->iommu_coherency)
801 clflush_cache_range(addr, size);
804 static int device_context_mapped(struct intel_iommu *iommu, u8 bus, u8 devfn)
806 struct context_entry *context;
810 spin_lock_irqsave(&iommu->lock, flags);
811 context = iommu_context_addr(iommu, bus, devfn, 0);
813 ret = context_present(context);
814 spin_unlock_irqrestore(&iommu->lock, flags);
818 static void free_context_table(struct intel_iommu *iommu)
822 struct context_entry *context;
824 spin_lock_irqsave(&iommu->lock, flags);
825 if (!iommu->root_entry) {
828 for (i = 0; i < ROOT_ENTRY_NR; i++) {
829 context = iommu_context_addr(iommu, i, 0, 0);
831 free_pgtable_page(context);
833 if (!sm_supported(iommu))
836 context = iommu_context_addr(iommu, i, 0x80, 0);
838 free_pgtable_page(context);
841 free_pgtable_page(iommu->root_entry);
842 iommu->root_entry = NULL;
844 spin_unlock_irqrestore(&iommu->lock, flags);
847 static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
848 unsigned long pfn, int *target_level)
850 struct dma_pte *parent, *pte;
851 int level = agaw_to_level(domain->agaw);
854 BUG_ON(!domain->pgd);
856 if (!domain_pfn_supported(domain, pfn))
857 /* Address beyond IOMMU's addressing capabilities. */
860 parent = domain->pgd;
865 offset = pfn_level_offset(pfn, level);
866 pte = &parent[offset];
867 if (!*target_level && (dma_pte_superpage(pte) || !dma_pte_present(pte)))
869 if (level == *target_level)
872 if (!dma_pte_present(pte)) {
875 tmp_page = alloc_pgtable_page(domain->nid);
880 domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE);
881 pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE;
882 if (cmpxchg64(&pte->val, 0ULL, pteval))
883 /* Someone else set it while we were thinking; use theirs. */
884 free_pgtable_page(tmp_page);
886 domain_flush_cache(domain, pte, sizeof(*pte));
891 parent = phys_to_virt(dma_pte_addr(pte));
896 *target_level = level;
902 /* return address's pte at specific level */
903 static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain,
905 int level, int *large_page)
907 struct dma_pte *parent, *pte;
908 int total = agaw_to_level(domain->agaw);
911 parent = domain->pgd;
912 while (level <= total) {
913 offset = pfn_level_offset(pfn, total);
914 pte = &parent[offset];
918 if (!dma_pte_present(pte)) {
923 if (dma_pte_superpage(pte)) {
928 parent = phys_to_virt(dma_pte_addr(pte));
/* clear last level pte; a tlb flush should follow */
935 static void dma_pte_clear_range(struct dmar_domain *domain,
936 unsigned long start_pfn,
937 unsigned long last_pfn)
939 unsigned int large_page;
940 struct dma_pte *first_pte, *pte;
942 BUG_ON(!domain_pfn_supported(domain, start_pfn));
943 BUG_ON(!domain_pfn_supported(domain, last_pfn));
944 BUG_ON(start_pfn > last_pfn);
946 /* we don't need lock here; nobody else touches the iova range */
949 first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1, &large_page);
951 start_pfn = align_to_level(start_pfn + 1, large_page + 1);
956 start_pfn += lvl_to_nr_pages(large_page);
958 } while (start_pfn <= last_pfn && !first_pte_in_page(pte));
960 domain_flush_cache(domain, first_pte,
961 (void *)pte - (void *)first_pte);
963 } while (start_pfn && start_pfn <= last_pfn);
966 static void dma_pte_free_level(struct dmar_domain *domain, int level,
967 int retain_level, struct dma_pte *pte,
968 unsigned long pfn, unsigned long start_pfn,
969 unsigned long last_pfn)
971 pfn = max(start_pfn, pfn);
972 pte = &pte[pfn_level_offset(pfn, level)];
975 unsigned long level_pfn;
976 struct dma_pte *level_pte;
978 if (!dma_pte_present(pte) || dma_pte_superpage(pte))
981 level_pfn = pfn & level_mask(level);
982 level_pte = phys_to_virt(dma_pte_addr(pte));
985 dma_pte_free_level(domain, level - 1, retain_level,
986 level_pte, level_pfn, start_pfn,
991 * Free the page table if we're below the level we want to
992 * retain and the range covers the entire table.
994 if (level < retain_level && !(start_pfn > level_pfn ||
995 last_pfn < level_pfn + level_size(level) - 1)) {
997 domain_flush_cache(domain, pte, sizeof(*pte));
998 free_pgtable_page(level_pte);
1001 pfn += level_size(level);
1002 } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1006 * clear last level (leaf) ptes and free page table pages below the
1007 * level we wish to keep intact.
1009 static void dma_pte_free_pagetable(struct dmar_domain *domain,
1010 unsigned long start_pfn,
1011 unsigned long last_pfn,
1014 BUG_ON(!domain_pfn_supported(domain, start_pfn));
1015 BUG_ON(!domain_pfn_supported(domain, last_pfn));
1016 BUG_ON(start_pfn > last_pfn);
1018 dma_pte_clear_range(domain, start_pfn, last_pfn);
1020 /* We don't need lock here; nobody else touches the iova range */
1021 dma_pte_free_level(domain, agaw_to_level(domain->agaw), retain_level,
1022 domain->pgd, 0, start_pfn, last_pfn);
1025 if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1026 free_pgtable_page(domain->pgd);
/* When a page at a given level is being unlinked from its parent, we don't
   need to *modify* it at all. All we need to do is make a list of all the
   pages which can be freed just as soon as we've flushed the IOTLB and we
   know the hardware page-walk will no longer touch them.
   The 'pte' argument is the *parent* PTE, pointing to the page that is to
   be freed. */
1037 static struct page *dma_pte_list_pagetables(struct dmar_domain *domain,
1038 int level, struct dma_pte *pte,
1039 struct page *freelist)
1043 pg = pfn_to_page(dma_pte_addr(pte) >> PAGE_SHIFT);
1044 pg->freelist = freelist;
1050 pte = page_address(pg);
1052 if (dma_pte_present(pte) && !dma_pte_superpage(pte))
1053 freelist = dma_pte_list_pagetables(domain, level - 1,
1056 } while (!first_pte_in_page(pte));
1061 static struct page *dma_pte_clear_level(struct dmar_domain *domain, int level,
1062 struct dma_pte *pte, unsigned long pfn,
1063 unsigned long start_pfn,
1064 unsigned long last_pfn,
1065 struct page *freelist)
1067 struct dma_pte *first_pte = NULL, *last_pte = NULL;
1069 pfn = max(start_pfn, pfn);
1070 pte = &pte[pfn_level_offset(pfn, level)];
1073 unsigned long level_pfn;
1075 if (!dma_pte_present(pte))
1078 level_pfn = pfn & level_mask(level);
1080 /* If range covers entire pagetable, free it */
1081 if (start_pfn <= level_pfn &&
1082 last_pfn >= level_pfn + level_size(level) - 1) {
/* These subordinate page tables are going away entirely. Don't
   bother to clear them; we're just going to *free* them. */
1085 if (level > 1 && !dma_pte_superpage(pte))
1086 freelist = dma_pte_list_pagetables(domain, level - 1, pte, freelist);
1092 } else if (level > 1) {
1093 /* Recurse down into a level that isn't *entirely* obsolete */
1094 freelist = dma_pte_clear_level(domain, level - 1,
1095 phys_to_virt(dma_pte_addr(pte)),
1096 level_pfn, start_pfn, last_pfn,
1100 pfn += level_size(level);
1101 } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1104 domain_flush_cache(domain, first_pte,
1105 (void *)++last_pte - (void *)first_pte);
1110 /* We can't just free the pages because the IOMMU may still be walking
1111 the page tables, and may have cached the intermediate levels. The
1112 pages can only be freed after the IOTLB flush has been done. */
1113 static struct page *domain_unmap(struct dmar_domain *domain,
1114 unsigned long start_pfn,
1115 unsigned long last_pfn)
1117 struct page *freelist;
1119 BUG_ON(!domain_pfn_supported(domain, start_pfn));
1120 BUG_ON(!domain_pfn_supported(domain, last_pfn));
1121 BUG_ON(start_pfn > last_pfn);
1123 /* we don't need lock here; nobody else touches the iova range */
1124 freelist = dma_pte_clear_level(domain, agaw_to_level(domain->agaw),
1125 domain->pgd, 0, start_pfn, last_pfn, NULL);
1128 if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1129 struct page *pgd_page = virt_to_page(domain->pgd);
1130 pgd_page->freelist = freelist;
1131 freelist = pgd_page;
1139 static void dma_free_pagelist(struct page *freelist)
1143 while ((pg = freelist)) {
1144 freelist = pg->freelist;
1145 free_pgtable_page(page_address(pg));
1149 static void iova_entry_free(unsigned long data)
1151 struct page *freelist = (struct page *)data;
1153 dma_free_pagelist(freelist);
1156 /* iommu handling */
1157 static int iommu_alloc_root_entry(struct intel_iommu *iommu)
1159 struct root_entry *root;
1160 unsigned long flags;
1162 root = (struct root_entry *)alloc_pgtable_page(iommu->node);
1164 pr_err("Allocating root entry for %s failed\n",
1169 __iommu_flush_cache(iommu, root, ROOT_SIZE);
1171 spin_lock_irqsave(&iommu->lock, flags);
1172 iommu->root_entry = root;
1173 spin_unlock_irqrestore(&iommu->lock, flags);
1178 static void iommu_set_root_entry(struct intel_iommu *iommu)
1184 addr = virt_to_phys(iommu->root_entry);
1185 if (sm_supported(iommu))
1186 addr |= DMA_RTADDR_SMT;
1188 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1189 dmar_writeq(iommu->reg + DMAR_RTADDR_REG, addr);
1191 writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG);
1193 /* Make sure hardware complete it */
1194 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1195 readl, (sts & DMA_GSTS_RTPS), sts);
1197 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1200 void iommu_flush_write_buffer(struct intel_iommu *iommu)
1205 if (!rwbf_quirk && !cap_rwbf(iommu->cap))
1208 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1209 writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG);
1211 /* Make sure hardware complete it */
1212 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1213 readl, (!(val & DMA_GSTS_WBFS)), val);
1215 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
/* return value determines if we need a write buffer flush */
1219 static void __iommu_flush_context(struct intel_iommu *iommu,
1220 u16 did, u16 source_id, u8 function_mask,
1227 case DMA_CCMD_GLOBAL_INVL:
1228 val = DMA_CCMD_GLOBAL_INVL;
1230 case DMA_CCMD_DOMAIN_INVL:
1231 val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
1233 case DMA_CCMD_DEVICE_INVL:
1234 val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
1235 | DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
1240 val |= DMA_CCMD_ICC;
1242 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1243 dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
1245 /* Make sure hardware complete it */
1246 IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
1247 dmar_readq, (!(val & DMA_CCMD_ICC)), val);
1249 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
/* return value determines if we need a write buffer flush */
1253 static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
1254 u64 addr, unsigned int size_order, u64 type)
1256 int tlb_offset = ecap_iotlb_offset(iommu->ecap);
1257 u64 val = 0, val_iva = 0;
1261 case DMA_TLB_GLOBAL_FLUSH:
1262 /* global flush doesn't need set IVA_REG */
1263 val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
1265 case DMA_TLB_DSI_FLUSH:
1266 val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1268 case DMA_TLB_PSI_FLUSH:
1269 val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1270 /* IH bit is passed in as part of address */
1271 val_iva = size_order | addr;
/* Note: set drain read/write */
/*
 * This is probably to be super secure.. Looks like we can
 * ignore it without any impact.
 */
1282 if (cap_read_drain(iommu->cap))
1283 val |= DMA_TLB_READ_DRAIN;
1285 if (cap_write_drain(iommu->cap))
1286 val |= DMA_TLB_WRITE_DRAIN;
1288 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1289 /* Note: Only uses first TLB reg currently */
1291 dmar_writeq(iommu->reg + tlb_offset, val_iva);
1292 dmar_writeq(iommu->reg + tlb_offset + 8, val);
1294 /* Make sure hardware complete it */
1295 IOMMU_WAIT_OP(iommu, tlb_offset + 8,
1296 dmar_readq, (!(val & DMA_TLB_IVT)), val);
1298 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1300 /* check IOTLB invalidation granularity */
1301 if (DMA_TLB_IAIG(val) == 0)
1302 pr_err("Flush IOTLB failed\n");
1303 if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
1304 pr_debug("TLB flush request %Lx, actual %Lx\n",
1305 (unsigned long long)DMA_TLB_IIRG(type),
1306 (unsigned long long)DMA_TLB_IAIG(val));
1309 static struct device_domain_info *
1310 iommu_support_dev_iotlb (struct dmar_domain *domain, struct intel_iommu *iommu,
1313 struct device_domain_info *info;
1315 assert_spin_locked(&device_domain_lock);
1320 list_for_each_entry(info, &domain->devices, link)
1321 if (info->iommu == iommu && info->bus == bus &&
1322 info->devfn == devfn) {
1323 if (info->ats_supported && info->dev)
1331 static void domain_update_iotlb(struct dmar_domain *domain)
1333 struct device_domain_info *info;
1334 bool has_iotlb_device = false;
1336 assert_spin_locked(&device_domain_lock);
1338 list_for_each_entry(info, &domain->devices, link) {
1339 struct pci_dev *pdev;
1341 if (!info->dev || !dev_is_pci(info->dev))
1344 pdev = to_pci_dev(info->dev);
1345 if (pdev->ats_enabled) {
1346 has_iotlb_device = true;
1351 domain->has_iotlb_device = has_iotlb_device;
1354 static void iommu_enable_dev_iotlb(struct device_domain_info *info)
1356 struct pci_dev *pdev;
1358 assert_spin_locked(&device_domain_lock);
1360 if (!info || !dev_is_pci(info->dev))
1363 pdev = to_pci_dev(info->dev);
/* For IOMMU that supports device IOTLB throttling (DIT), we assign
 * PFSID to the invalidation desc of a VF such that IOMMU HW can gauge
 * queue depth at PF level. If DIT is not set, PFSID will be treated as
 * reserved, which should be set to 0.
 */
1369 if (!ecap_dit(info->iommu->ecap))
1372 struct pci_dev *pf_pdev;
1374 /* pdev will be returned if device is not a vf */
1375 pf_pdev = pci_physfn(pdev);
1376 info->pfsid = pci_dev_id(pf_pdev);
1379 #ifdef CONFIG_INTEL_IOMMU_SVM
/* The PCIe spec, in its wisdom, declares that the behaviour of
   the device if you enable PASID support after ATS support is
   undefined. So always enable PASID support on devices which
   have it, even if we can't yet know if we're ever going to
   use it. */
1385 if (info->pasid_supported && !pci_enable_pasid(pdev, info->pasid_supported & ~1))
1386 info->pasid_enabled = 1;
1388 if (info->pri_supported &&
1389 (info->pasid_enabled ? pci_prg_resp_pasid_required(pdev) : 1) &&
1390 !pci_reset_pri(pdev) && !pci_enable_pri(pdev, 32))
1391 info->pri_enabled = 1;
1393 if (!pdev->untrusted && info->ats_supported &&
1394 pci_ats_page_aligned(pdev) &&
1395 !pci_enable_ats(pdev, VTD_PAGE_SHIFT)) {
1396 info->ats_enabled = 1;
1397 domain_update_iotlb(info->domain);
1398 info->ats_qdep = pci_ats_queue_depth(pdev);
1402 static void iommu_disable_dev_iotlb(struct device_domain_info *info)
1404 struct pci_dev *pdev;
1406 assert_spin_locked(&device_domain_lock);
1408 if (!dev_is_pci(info->dev))
1411 pdev = to_pci_dev(info->dev);
1413 if (info->ats_enabled) {
1414 pci_disable_ats(pdev);
1415 info->ats_enabled = 0;
1416 domain_update_iotlb(info->domain);
1418 #ifdef CONFIG_INTEL_IOMMU_SVM
1419 if (info->pri_enabled) {
1420 pci_disable_pri(pdev);
1421 info->pri_enabled = 0;
1423 if (info->pasid_enabled) {
1424 pci_disable_pasid(pdev);
1425 info->pasid_enabled = 0;
1430 static void iommu_flush_dev_iotlb(struct dmar_domain *domain,
1431 u64 addr, unsigned mask)
1434 unsigned long flags;
1435 struct device_domain_info *info;
1437 if (!domain->has_iotlb_device)
1440 spin_lock_irqsave(&device_domain_lock, flags);
1441 list_for_each_entry(info, &domain->devices, link) {
1442 if (!info->ats_enabled)
1445 sid = info->bus << 8 | info->devfn;
1446 qdep = info->ats_qdep;
1447 qi_flush_dev_iotlb(info->iommu, sid, info->pfsid,
1450 spin_unlock_irqrestore(&device_domain_lock, flags);
1453 static void iommu_flush_iotlb_psi(struct intel_iommu *iommu,
1454 struct dmar_domain *domain,
1455 unsigned long pfn, unsigned int pages,
1458 unsigned int mask = ilog2(__roundup_pow_of_two(pages));
1459 uint64_t addr = (uint64_t)pfn << VTD_PAGE_SHIFT;
1460 u16 did = domain->iommu_did[iommu->seq_id];
/*
 * Fallback to domain selective flush if no PSI support or the size is
 * too big.
 * PSI requires page size to be 2 ^ x, and the base address is naturally
 * aligned to the size.
 */
1472 if (!cap_pgsel_inv(iommu->cap) || mask > cap_max_amask_val(iommu->cap))
1473 iommu->flush.flush_iotlb(iommu, did, 0, 0,
1476 iommu->flush.flush_iotlb(iommu, did, addr | ih, mask,
/*
 * In caching mode, changes of pages from non-present to present require
 * flush. However, device IOTLB doesn't need to be flushed in this case.
 */
1483 if (!cap_caching_mode(iommu->cap) || !map)
1484 iommu_flush_dev_iotlb(domain, addr, mask);
1487 /* Notification for newly created mappings */
1488 static inline void __mapping_notify_one(struct intel_iommu *iommu,
1489 struct dmar_domain *domain,
1490 unsigned long pfn, unsigned int pages)
1492 /* It's a non-present to present mapping. Only flush if caching mode */
1493 if (cap_caching_mode(iommu->cap))
1494 iommu_flush_iotlb_psi(iommu, domain, pfn, pages, 0, 1);
1496 iommu_flush_write_buffer(iommu);
1499 static void iommu_flush_iova(struct iova_domain *iovad)
1501 struct dmar_domain *domain;
1504 domain = container_of(iovad, struct dmar_domain, iovad);
1506 for_each_domain_iommu(idx, domain) {
1507 struct intel_iommu *iommu = g_iommus[idx];
1508 u16 did = domain->iommu_did[iommu->seq_id];
1510 iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH);
1512 if (!cap_caching_mode(iommu->cap))
1513 iommu_flush_dev_iotlb(get_iommu_domain(iommu, did),
1514 0, MAX_AGAW_PFN_WIDTH);
1518 static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
1521 unsigned long flags;
1523 if (!cap_plmr(iommu->cap) && !cap_phmr(iommu->cap))
1526 raw_spin_lock_irqsave(&iommu->register_lock, flags);
1527 pmen = readl(iommu->reg + DMAR_PMEN_REG);
1528 pmen &= ~DMA_PMEN_EPM;
1529 writel(pmen, iommu->reg + DMAR_PMEN_REG);
1531 /* wait for the protected region status bit to clear */
1532 IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
1533 readl, !(pmen & DMA_PMEN_PRS), pmen);
1535 raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1538 static void iommu_enable_translation(struct intel_iommu *iommu)
1541 unsigned long flags;
1543 raw_spin_lock_irqsave(&iommu->register_lock, flags);
1544 iommu->gcmd |= DMA_GCMD_TE;
1545 writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1547 /* Make sure hardware complete it */
1548 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1549 readl, (sts & DMA_GSTS_TES), sts);
1551 raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1554 static void iommu_disable_translation(struct intel_iommu *iommu)
1559 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1560 iommu->gcmd &= ~DMA_GCMD_TE;
1561 writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1563 /* Make sure hardware complete it */
1564 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1565 readl, (!(sts & DMA_GSTS_TES)), sts);
1567 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1571 static int iommu_init_domains(struct intel_iommu *iommu)
1573 u32 ndomains, nlongs;
1576 ndomains = cap_ndoms(iommu->cap);
1577 pr_debug("%s: Number of Domains supported <%d>\n",
1578 iommu->name, ndomains);
1579 nlongs = BITS_TO_LONGS(ndomains);
1581 spin_lock_init(&iommu->lock);
1583 iommu->domain_ids = kcalloc(nlongs, sizeof(unsigned long), GFP_KERNEL);
1584 if (!iommu->domain_ids) {
1585 pr_err("%s: Allocating domain id array failed\n",
1590 size = (ALIGN(ndomains, 256) >> 8) * sizeof(struct dmar_domain **);
1591 iommu->domains = kzalloc(size, GFP_KERNEL);
1593 if (iommu->domains) {
1594 size = 256 * sizeof(struct dmar_domain *);
1595 iommu->domains[0] = kzalloc(size, GFP_KERNEL);
1598 if (!iommu->domains || !iommu->domains[0]) {
1599 pr_err("%s: Allocating domain array failed\n",
1601 kfree(iommu->domain_ids);
1602 kfree(iommu->domains);
1603 iommu->domain_ids = NULL;
1604 iommu->domains = NULL;
/*
 * If Caching mode is set, then invalid translations are tagged
 * with domain-id 0, hence we need to pre-allocate it. We also
 * use domain-id 0 as a marker for non-allocated domain-id, so
 * make sure it is not used for a real domain.
 */
1616 set_bit(0, iommu->domain_ids);
/*
 * Vt-d spec rev3.0 (section 6.2.3.1) requires that each pasid
 * entry for first-level or pass-through translation modes should
 * be programmed with a domain id different from those used for
 * second-level or nested translation. We reserve a domain id for
 * this purpose.
 */
1625 if (sm_supported(iommu))
1626 set_bit(FLPT_DEFAULT_DID, iommu->domain_ids);
1631 static void disable_dmar_iommu(struct intel_iommu *iommu)
1633 struct device_domain_info *info, *tmp;
1634 unsigned long flags;
1636 if (!iommu->domains || !iommu->domain_ids)
1639 spin_lock_irqsave(&device_domain_lock, flags);
1640 list_for_each_entry_safe(info, tmp, &device_domain_list, global) {
1641 struct dmar_domain *domain;
1643 if (info->iommu != iommu)
1646 if (!info->dev || !info->domain)
1649 domain = info->domain;
1651 __dmar_remove_one_dev_info(info);
1653 spin_unlock_irqrestore(&device_domain_lock, flags);
1655 if (iommu->gcmd & DMA_GCMD_TE)
1656 iommu_disable_translation(iommu);
1659 static void free_dmar_iommu(struct intel_iommu *iommu)
1661 if ((iommu->domains) && (iommu->domain_ids)) {
1662 int elems = ALIGN(cap_ndoms(iommu->cap), 256) >> 8;
1665 for (i = 0; i < elems; i++)
1666 kfree(iommu->domains[i]);
1667 kfree(iommu->domains);
1668 kfree(iommu->domain_ids);
1669 iommu->domains = NULL;
1670 iommu->domain_ids = NULL;
1673 g_iommus[iommu->seq_id] = NULL;
1675 /* free context mapping */
1676 free_context_table(iommu);
1678 #ifdef CONFIG_INTEL_IOMMU_SVM
1679 if (pasid_supported(iommu)) {
1680 if (ecap_prs(iommu->ecap))
1681 intel_svm_finish_prq(iommu);
1686 static struct dmar_domain *alloc_domain(int flags)
1688 struct dmar_domain *domain;
1690 domain = alloc_domain_mem();
1694 memset(domain, 0, sizeof(*domain));
1695 domain->nid = NUMA_NO_NODE;
1696 domain->flags = flags;
1697 domain->has_iotlb_device = false;
1698 INIT_LIST_HEAD(&domain->devices);
1703 /* Must be called with iommu->lock */
1704 static int domain_attach_iommu(struct dmar_domain *domain,
1705 struct intel_iommu *iommu)
1707 unsigned long ndomains;
1710 assert_spin_locked(&device_domain_lock);
1711 assert_spin_locked(&iommu->lock);
1713 domain->iommu_refcnt[iommu->seq_id] += 1;
1714 domain->iommu_count += 1;
1715 if (domain->iommu_refcnt[iommu->seq_id] == 1) {
1716 ndomains = cap_ndoms(iommu->cap);
1717 num = find_first_zero_bit(iommu->domain_ids, ndomains);
1719 if (num >= ndomains) {
1720 pr_err("%s: No free domain ids\n", iommu->name);
1721 domain->iommu_refcnt[iommu->seq_id] -= 1;
1722 domain->iommu_count -= 1;
1726 set_bit(num, iommu->domain_ids);
1727 set_iommu_domain(iommu, num, domain);
1729 domain->iommu_did[iommu->seq_id] = num;
1730 domain->nid = iommu->node;
1732 domain_update_iommu_cap(domain);
1738 static int domain_detach_iommu(struct dmar_domain *domain,
1739 struct intel_iommu *iommu)
1743 assert_spin_locked(&device_domain_lock);
1744 assert_spin_locked(&iommu->lock);
1746 domain->iommu_refcnt[iommu->seq_id] -= 1;
1747 count = --domain->iommu_count;
1748 if (domain->iommu_refcnt[iommu->seq_id] == 0) {
1749 num = domain->iommu_did[iommu->seq_id];
1750 clear_bit(num, iommu->domain_ids);
1751 set_iommu_domain(iommu, num, NULL);
1753 domain_update_iommu_cap(domain);
1754 domain->iommu_did[iommu->seq_id] = 0;
1760 static struct iova_domain reserved_iova_list;
1761 static struct lock_class_key reserved_rbtree_key;
1763 static int dmar_init_reserved_ranges(void)
1765 struct pci_dev *pdev = NULL;
1769 init_iova_domain(&reserved_iova_list, VTD_PAGE_SIZE, IOVA_START_PFN);
1771 lockdep_set_class(&reserved_iova_list.iova_rbtree_lock,
1772 &reserved_rbtree_key);
1774 /* IOAPIC ranges shouldn't be accessed by DMA */
1775 iova = reserve_iova(&reserved_iova_list, IOVA_PFN(IOAPIC_RANGE_START),
1776 IOVA_PFN(IOAPIC_RANGE_END));
1778 pr_err("Reserve IOAPIC range failed\n");
1782 /* Reserve all PCI MMIO to avoid peer-to-peer access */
1783 for_each_pci_dev(pdev) {
1786 for (i = 0; i < PCI_NUM_RESOURCES; i++) {
1787 r = &pdev->resource[i];
1788 if (!r->flags || !(r->flags & IORESOURCE_MEM))
1790 iova = reserve_iova(&reserved_iova_list,
1794 pci_err(pdev, "Reserve iova for %pR failed\n", r);
1802 static void domain_reserve_special_ranges(struct dmar_domain *domain)
1804 copy_reserved_iova(&reserved_iova_list, &domain->iovad);
1807 static inline int guestwidth_to_adjustwidth(int gaw)
1810 int r = (gaw - 12) % 9;
1821 static int domain_init(struct dmar_domain *domain, struct intel_iommu *iommu,
1824 int adjust_width, agaw;
1825 unsigned long sagaw;
1828 init_iova_domain(&domain->iovad, VTD_PAGE_SIZE, IOVA_START_PFN);
1830 err = init_iova_flush_queue(&domain->iovad,
1831 iommu_flush_iova, iova_entry_free);
1835 domain_reserve_special_ranges(domain);
1837 /* calculate AGAW */
1838 if (guest_width > cap_mgaw(iommu->cap))
1839 guest_width = cap_mgaw(iommu->cap);
1840 domain->gaw = guest_width;
1841 adjust_width = guestwidth_to_adjustwidth(guest_width);
1842 agaw = width_to_agaw(adjust_width);
1843 sagaw = cap_sagaw(iommu->cap);
1844 if (!test_bit(agaw, &sagaw)) {
1845 /* hardware doesn't support it, choose a bigger one */
1846 pr_debug("Hardware doesn't support agaw %d\n", agaw);
1847 agaw = find_next_bit(&sagaw, 5, agaw);
1851 domain->agaw = agaw;
1853 if (ecap_coherent(iommu->ecap))
1854 domain->iommu_coherency = 1;
1856 domain->iommu_coherency = 0;
1858 if (ecap_sc_support(iommu->ecap))
1859 domain->iommu_snooping = 1;
1861 domain->iommu_snooping = 0;
1863 if (intel_iommu_superpage)
1864 domain->iommu_superpage = fls(cap_super_page_val(iommu->cap));
1866 domain->iommu_superpage = 0;
1868 domain->nid = iommu->node;
1870 /* always allocate the top pgd */
1871 domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
1874 __iommu_flush_cache(iommu, domain->pgd, PAGE_SIZE);
1878 static void domain_exit(struct dmar_domain *domain)
1880 struct page *freelist;
1882 /* Remove associated devices and clear attached or cached domains */
1883 domain_remove_dev_info(domain);
1886 put_iova_domain(&domain->iovad);
1888 freelist = domain_unmap(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
1890 dma_free_pagelist(freelist);
1892 free_domain_mem(domain);
1896 * Get the PASID directory size for scalable mode context entry.
1897 * Value of X in the PDTS field of a scalable mode context entry
1898 * indicates PASID directory with 2^(X + 7) entries.
1900 static inline unsigned long context_get_sm_pds(struct pasid_table *table)
1904 max_pde = table->max_pasid >> PASID_PDE_SHIFT;
1905 pds = find_first_bit((unsigned long *)&max_pde, MAX_NR_PASID_BITS);
1913 * Set the RID_PASID field of a scalable mode context entry. The
1914 * IOMMU hardware will use the PASID value set in this field for
1915 * DMA translations of DMA requests without PASID.
1918 context_set_sm_rid2pasid(struct context_entry *context, unsigned long pasid)
1920 context->hi |= pasid & ((1 << 20) - 1);
1921 context->hi |= (1 << 20);
1925 * Set the DTE(Device-TLB Enable) field of a scalable mode context
1928 static inline void context_set_sm_dte(struct context_entry *context)
1930 context->lo |= (1 << 2);
1934 * Set the PRE(Page Request Enable) field of a scalable mode context
1937 static inline void context_set_sm_pre(struct context_entry *context)
1939 context->lo |= (1 << 4);
1942 /* Convert value to context PASID directory size field coding. */
1943 #define context_pdts(pds) (((pds) & 0x7) << 9)
1945 static int domain_context_mapping_one(struct dmar_domain *domain,
1946 struct intel_iommu *iommu,
1947 struct pasid_table *table,
1950 u16 did = domain->iommu_did[iommu->seq_id];
1951 int translation = CONTEXT_TT_MULTI_LEVEL;
1952 struct device_domain_info *info = NULL;
1953 struct context_entry *context;
1954 unsigned long flags;
1959 if (hw_pass_through && domain_type_is_si(domain))
1960 translation = CONTEXT_TT_PASS_THROUGH;
1962 pr_debug("Set context mapping for %02x:%02x.%d\n",
1963 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1965 BUG_ON(!domain->pgd);
1967 spin_lock_irqsave(&device_domain_lock, flags);
1968 spin_lock(&iommu->lock);
1971 context = iommu_context_addr(iommu, bus, devfn, 1);
1976 if (context_present(context))
1980 * For kdump cases, old valid entries may be cached due to the
1981 * in-flight DMA and copied pgtable, but there is no unmapping
1982 * behaviour for them, thus we need an explicit cache flush for
1983 * the newly-mapped device. For kdump, at this point, the device
1984 * is supposed to finish reset at its driver probe stage, so no
1985 * in-flight DMA will exist, and we don't need to worry anymore
1988 if (context_copied(context)) {
1989 u16 did_old = context_domain_id(context);
1991 if (did_old < cap_ndoms(iommu->cap)) {
1992 iommu->flush.flush_context(iommu, did_old,
1993 (((u16)bus) << 8) | devfn,
1994 DMA_CCMD_MASK_NOBIT,
1995 DMA_CCMD_DEVICE_INVL);
1996 iommu->flush.flush_iotlb(iommu, did_old, 0, 0,
2001 context_clear_entry(context);
2003 if (sm_supported(iommu)) {
2008 /* Setup the PASID DIR pointer: */
2009 pds = context_get_sm_pds(table);
2010 context->lo = (u64)virt_to_phys(table->table) |
2013 /* Setup the RID_PASID field: */
2014 context_set_sm_rid2pasid(context, PASID_RID2PASID);
2017 * Setup the Device-TLB enable bit and Page request
2020 info = iommu_support_dev_iotlb(domain, iommu, bus, devfn);
2021 if (info && info->ats_supported)
2022 context_set_sm_dte(context);
2023 if (info && info->pri_supported)
2024 context_set_sm_pre(context);
2026 struct dma_pte *pgd = domain->pgd;
2029 context_set_domain_id(context, did);
2031 if (translation != CONTEXT_TT_PASS_THROUGH) {
2033 * Skip top levels of page tables for iommu which has
2034 * less agaw than default. Unnecessary for PT mode.
2036 for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) {
2038 pgd = phys_to_virt(dma_pte_addr(pgd));
2039 if (!dma_pte_present(pgd))
2043 info = iommu_support_dev_iotlb(domain, iommu, bus, devfn);
2044 if (info && info->ats_supported)
2045 translation = CONTEXT_TT_DEV_IOTLB;
2047 translation = CONTEXT_TT_MULTI_LEVEL;
2049 context_set_address_root(context, virt_to_phys(pgd));
2050 context_set_address_width(context, agaw);
2053 * In pass through mode, AW must be programmed to
2054 * indicate the largest AGAW value supported by
2055 * hardware. And ASR is ignored by hardware.
2057 context_set_address_width(context, iommu->msagaw);
2060 context_set_translation_type(context, translation);
2063 context_set_fault_enable(context);
2064 context_set_present(context);
2065 domain_flush_cache(domain, context, sizeof(*context));
/*
 * It's a non-present to present mapping. If hardware doesn't cache
 * non-present entries we only need to flush the write-buffer. If it
 * _does_ cache non-present entries, then it does so in the special
 * domain #0, which we have to flush:
 */
2073 if (cap_caching_mode(iommu->cap)) {
2074 iommu->flush.flush_context(iommu, 0,
2075 (((u16)bus) << 8) | devfn,
2076 DMA_CCMD_MASK_NOBIT,
2077 DMA_CCMD_DEVICE_INVL);
2078 iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH);
2080 iommu_flush_write_buffer(iommu);
2082 iommu_enable_dev_iotlb(info);
2087 spin_unlock(&iommu->lock);
2088 spin_unlock_irqrestore(&device_domain_lock, flags);
2093 struct domain_context_mapping_data {
2094 struct dmar_domain *domain;
2095 struct intel_iommu *iommu;
2096 struct pasid_table *table;
2099 static int domain_context_mapping_cb(struct pci_dev *pdev,
2100 u16 alias, void *opaque)
2102 struct domain_context_mapping_data *data = opaque;
2104 return domain_context_mapping_one(data->domain, data->iommu,
2105 data->table, PCI_BUS_NUM(alias),
2110 domain_context_mapping(struct dmar_domain *domain, struct device *dev)
2112 struct domain_context_mapping_data data;
2113 struct pasid_table *table;
2114 struct intel_iommu *iommu;
2117 iommu = device_to_iommu(dev, &bus, &devfn);
2121 table = intel_pasid_get_table(dev);
2123 if (!dev_is_pci(dev))
2124 return domain_context_mapping_one(domain, iommu, table,
2127 data.domain = domain;
2131 return pci_for_each_dma_alias(to_pci_dev(dev),
2132 &domain_context_mapping_cb, &data);
2135 static int domain_context_mapped_cb(struct pci_dev *pdev,
2136 u16 alias, void *opaque)
2138 struct intel_iommu *iommu = opaque;
2140 return !device_context_mapped(iommu, PCI_BUS_NUM(alias), alias & 0xff);
2143 static int domain_context_mapped(struct device *dev)
2145 struct intel_iommu *iommu;
2148 iommu = device_to_iommu(dev, &bus, &devfn);
2152 if (!dev_is_pci(dev))
2153 return device_context_mapped(iommu, bus, devfn);
2155 return !pci_for_each_dma_alias(to_pci_dev(dev),
2156 domain_context_mapped_cb, iommu);
/* Returns a number of VTD pages, but aligned to MM page size */
static inline unsigned long aligned_nrpages(unsigned long host_addr,
					    size_t size)
{
	host_addr &= ~PAGE_MASK;
	return PAGE_ALIGN(host_addr + size) >> VTD_PAGE_SHIFT;
}
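/*
 * Worked example (assuming 4KiB MM pages): a buffer starting at page offset
 * 0x800 with size 0x2000 gives PAGE_ALIGN(0x2800) = 0x3000, i.e. 3 VT-d
 * pages.
 */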
2167 /* Return largest possible superpage level for a given mapping */
2168 static inline int hardware_largepage_caps(struct dmar_domain *domain,
2169 unsigned long iov_pfn,
2170 unsigned long phy_pfn,
2171 unsigned long pages)
2173 int support, level = 1;
2174 unsigned long pfnmerge;
2176 support = domain->iommu_superpage;
2178 /* To use a large page, the virtual *and* physical addresses
2179 must be aligned to 2MiB/1GiB/etc. Lower bits set in either
2180 of them will mean we have to use smaller pages. So just
2181 merge them and check both at once. */
2182 pfnmerge = iov_pfn | phy_pfn;
2184 while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) {
2185 pages >>= VTD_STRIDE_SHIFT;
2188 pfnmerge >>= VTD_STRIDE_SHIFT;
2195 static int __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2196 struct scatterlist *sg, unsigned long phys_pfn,
2197 unsigned long nr_pages, int prot)
2199 struct dma_pte *first_pte = NULL, *pte = NULL;
2200 phys_addr_t uninitialized_var(pteval);
2201 unsigned long sg_res = 0;
2202 unsigned int largepage_lvl = 0;
2203 unsigned long lvl_pages = 0;
2205 BUG_ON(!domain_pfn_supported(domain, iov_pfn + nr_pages - 1));
2207 if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
2210 prot &= DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP;
2214 pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | prot;
2217 while (nr_pages > 0) {
2221 unsigned int pgoff = sg->offset & ~PAGE_MASK;
2223 sg_res = aligned_nrpages(sg->offset, sg->length);
2224 sg->dma_address = ((dma_addr_t)iov_pfn << VTD_PAGE_SHIFT) + pgoff;
2225 sg->dma_length = sg->length;
2226 pteval = (sg_phys(sg) - pgoff) | prot;
2227 phys_pfn = pteval >> VTD_PAGE_SHIFT;
2231 largepage_lvl = hardware_largepage_caps(domain, iov_pfn, phys_pfn, sg_res);
2233 first_pte = pte = pfn_to_dma_pte(domain, iov_pfn, &largepage_lvl);
/* It is a large page */
2237 if (largepage_lvl > 1) {
2238 unsigned long nr_superpages, end_pfn;
2240 pteval |= DMA_PTE_LARGE_PAGE;
2241 lvl_pages = lvl_to_nr_pages(largepage_lvl);
2243 nr_superpages = sg_res / lvl_pages;
2244 end_pfn = iov_pfn + nr_superpages * lvl_pages - 1;
2247 * Ensure that old small page tables are
2248 * removed to make room for superpage(s).
2249 * We're adding new large pages, so make sure
2250 * we don't remove their parent tables.
2252 dma_pte_free_pagetable(domain, iov_pfn, end_pfn,
2255 pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE;
2259 /* We don't need lock here, nobody else
2260 * touches the iova range
2262 tmp = cmpxchg64_local(&pte->val, 0ULL, pteval);
2264 static int dumps = 5;
2265 pr_crit("ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n",
2266 iov_pfn, tmp, (unsigned long long)pteval);
2269 debug_dma_dump_mappings(NULL);
2274 lvl_pages = lvl_to_nr_pages(largepage_lvl);
2276 BUG_ON(nr_pages < lvl_pages);
2277 BUG_ON(sg_res < lvl_pages);
2279 nr_pages -= lvl_pages;
2280 iov_pfn += lvl_pages;
2281 phys_pfn += lvl_pages;
2282 pteval += lvl_pages * VTD_PAGE_SIZE;
2283 sg_res -= lvl_pages;
2285 /* If the next PTE would be the first in a new page, then we
2286 need to flush the cache on the entries we've just written.
2287 And then we'll need to recalculate 'pte', so clear it and
2288 let it get set again in the if (!pte) block above.
2290 If we're done (!nr_pages) we need to flush the cache too.
2292 Also if we've been setting superpages, we may need to
2293 recalculate 'pte' and switch back to smaller pages for the
2294 end of the mapping, if the trailing size is not enough to
2295 use another superpage (i.e. sg_res < lvl_pages). */
2297 if (!nr_pages || first_pte_in_page(pte) ||
2298 (largepage_lvl > 1 && sg_res < lvl_pages)) {
2299 domain_flush_cache(domain, first_pte,
2300 (void *)pte - (void *)first_pte);
2304 if (!sg_res && nr_pages)
2310 static int domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2311 struct scatterlist *sg, unsigned long phys_pfn,
2312 unsigned long nr_pages, int prot)
2315 struct intel_iommu *iommu;
2317 /* Do the real mapping first */
2318 ret = __domain_mapping(domain, iov_pfn, sg, phys_pfn, nr_pages, prot);
2322 for_each_domain_iommu(iommu_id, domain) {
2323 iommu = g_iommus[iommu_id];
2324 __mapping_notify_one(iommu, domain, iov_pfn, nr_pages);
2330 static inline int domain_sg_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2331 struct scatterlist *sg, unsigned long nr_pages,
2334 return domain_mapping(domain, iov_pfn, sg, 0, nr_pages, prot);
2337 static inline int domain_pfn_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2338 unsigned long phys_pfn, unsigned long nr_pages,
2341 return domain_mapping(domain, iov_pfn, NULL, phys_pfn, nr_pages, prot);
2344 static void domain_context_clear_one(struct intel_iommu *iommu, u8 bus, u8 devfn)
2346 unsigned long flags;
2347 struct context_entry *context;
2353 spin_lock_irqsave(&iommu->lock, flags);
2354 context = iommu_context_addr(iommu, bus, devfn, 0);
2356 spin_unlock_irqrestore(&iommu->lock, flags);
2359 did_old = context_domain_id(context);
2360 context_clear_entry(context);
2361 __iommu_flush_cache(iommu, context, sizeof(*context));
2362 spin_unlock_irqrestore(&iommu->lock, flags);
2363 iommu->flush.flush_context(iommu,
2365 (((u16)bus) << 8) | devfn,
2366 DMA_CCMD_MASK_NOBIT,
2367 DMA_CCMD_DEVICE_INVL);
2368 iommu->flush.flush_iotlb(iommu,
2375 static inline void unlink_domain_info(struct device_domain_info *info)
2377 assert_spin_locked(&device_domain_lock);
2378 list_del(&info->link);
2379 list_del(&info->global);
2381 info->dev->archdata.iommu = NULL;
2384 static void domain_remove_dev_info(struct dmar_domain *domain)
2386 struct device_domain_info *info, *tmp;
2387 unsigned long flags;
2389 spin_lock_irqsave(&device_domain_lock, flags);
2390 list_for_each_entry_safe(info, tmp, &domain->devices, link)
2391 __dmar_remove_one_dev_info(info);
2392 spin_unlock_irqrestore(&device_domain_lock, flags);
2397 * Note: we use struct device->archdata.iommu to store the per-device domain info
2399 static struct dmar_domain *find_domain(struct device *dev)
2401 struct device_domain_info *info;
2403 /* No lock here, assumes no domain exit in normal case */
2404 info = dev->archdata.iommu;
2406 return info->domain;
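/*
 * Look up a device_domain_info by segment/bus/devfn on the global
 * device_domain_list. Callers are expected to hold device_domain_lock
 * while the list is walked.
 */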
2410 static inline struct device_domain_info *
2411 dmar_search_domain_by_dev_info(int segment, int bus, int devfn)
2413 struct device_domain_info *info;
2415 list_for_each_entry(info, &device_domain_list, global)
2416 if (info->iommu->segment == segment && info->bus == bus &&
2417 info->devfn == devfn)
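/*
 * dmar_insert_one_dev_info() - allocate and register the device_domain_info
 * for a device: probe ATS/PASID/PRI support, attach the domain to the IOMMU,
 * set up the RID2PASID entry when scalable mode is enabled, and finally
 * install the context entry. If another thread won the race, the
 * already-installed domain is returned and the caller must free the domain
 * it passed in.
 */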
2423 static struct dmar_domain *dmar_insert_one_dev_info(struct intel_iommu *iommu,
2426 struct dmar_domain *domain)
2428 struct dmar_domain *found = NULL;
2429 struct device_domain_info *info;
2430 unsigned long flags;
2433 info = alloc_devinfo_mem();
2438 info->devfn = devfn;
2439 info->ats_supported = info->pasid_supported = info->pri_supported = 0;
2440 info->ats_enabled = info->pasid_enabled = info->pri_enabled = 0;
2443 info->domain = domain;
2444 info->iommu = iommu;
2445 info->pasid_table = NULL;
2446 info->auxd_enabled = 0;
2447 INIT_LIST_HEAD(&info->auxiliary_domains);
2449 if (dev && dev_is_pci(dev)) {
2450 struct pci_dev *pdev = to_pci_dev(info->dev);
2452 if (!pdev->untrusted &&
2453 !pci_ats_disabled() &&
2454 ecap_dev_iotlb_support(iommu->ecap) &&
2455 pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_ATS) &&
2456 dmar_find_matched_atsr_unit(pdev))
2457 info->ats_supported = 1;
2459 if (sm_supported(iommu)) {
2460 if (pasid_supported(iommu)) {
2461 int features = pci_pasid_features(pdev);
2463 info->pasid_supported = features | 1;
2466 if (info->ats_supported && ecap_prs(iommu->ecap) &&
2467 pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_PRI))
2468 info->pri_supported = 1;
2472 spin_lock_irqsave(&device_domain_lock, flags);
2474 found = find_domain(dev);
2477 struct device_domain_info *info2;
2478 info2 = dmar_search_domain_by_dev_info(iommu->segment, bus, devfn);
2480 found = info2->domain;
2486 spin_unlock_irqrestore(&device_domain_lock, flags);
2487 free_devinfo_mem(info);
2488 /* Caller must free the original domain */
2492 spin_lock(&iommu->lock);
2493 ret = domain_attach_iommu(domain, iommu);
2494 spin_unlock(&iommu->lock);
2497 spin_unlock_irqrestore(&device_domain_lock, flags);
2498 free_devinfo_mem(info);
2502 list_add(&info->link, &domain->devices);
2503 list_add(&info->global, &device_domain_list);
2505 dev->archdata.iommu = info;
2506 spin_unlock_irqrestore(&device_domain_lock, flags);
2508 /* PASID table is mandatory for a PCI device in scalable mode. */
2509 if (dev && dev_is_pci(dev) && sm_supported(iommu)) {
2510 ret = intel_pasid_alloc_table(dev);
2512 dev_err(dev, "PASID table allocation failed\n");
2513 dmar_remove_one_dev_info(dev);
2517 /* Setup the PASID entry for requests without PASID: */
2518 spin_lock(&iommu->lock);
2519 if (hw_pass_through && domain_type_is_si(domain))
2520 ret = intel_pasid_setup_pass_through(iommu, domain,
2521 dev, PASID_RID2PASID);
2523 ret = intel_pasid_setup_second_level(iommu, domain,
2524 dev, PASID_RID2PASID);
2525 spin_unlock(&iommu->lock);
2527 dev_err(dev, "Setup RID2PASID failed\n");
2528 dmar_remove_one_dev_info(dev);
2533 if (dev && domain_context_mapping(domain, dev)) {
2534 dev_err(dev, "Domain context map failed\n");
2535 dmar_remove_one_dev_info(dev);
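/* pci_for_each_dma_alias() callback: remember the last (topmost) alias seen */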
2542 static int get_last_alias(struct pci_dev *pdev, u16 alias, void *opaque)
2544 *(u16 *)opaque = alias;
2548 static struct dmar_domain *find_or_alloc_domain(struct device *dev, int gaw)
2550 struct device_domain_info *info;
2551 struct dmar_domain *domain = NULL;
2552 struct intel_iommu *iommu;
2554 unsigned long flags;
2557 iommu = device_to_iommu(dev, &bus, &devfn);
2561 if (dev_is_pci(dev)) {
2562 struct pci_dev *pdev = to_pci_dev(dev);
2564 pci_for_each_dma_alias(pdev, get_last_alias, &dma_alias);
2566 spin_lock_irqsave(&device_domain_lock, flags);
2567 info = dmar_search_domain_by_dev_info(pci_domain_nr(pdev->bus),
2568 PCI_BUS_NUM(dma_alias),
2571 iommu = info->iommu;
2572 domain = info->domain;
2574 spin_unlock_irqrestore(&device_domain_lock, flags);
2576 /* DMA alias already has a domain, use it */
2581 /* Allocate and initialize new domain for the device */
2582 domain = alloc_domain(0);
2585 if (domain_init(domain, iommu, gaw)) {
2586 domain_exit(domain);
2595 static struct dmar_domain *set_domain_for_dev(struct device *dev,
2596 struct dmar_domain *domain)
2598 struct intel_iommu *iommu;
2599 struct dmar_domain *tmp;
2600 u16 req_id, dma_alias;
2603 iommu = device_to_iommu(dev, &bus, &devfn);
2607 req_id = ((u16)bus << 8) | devfn;
2609 if (dev_is_pci(dev)) {
2610 struct pci_dev *pdev = to_pci_dev(dev);
2612 pci_for_each_dma_alias(pdev, get_last_alias, &dma_alias);
2614 /* register PCI DMA alias device */
2615 if (req_id != dma_alias) {
2616 tmp = dmar_insert_one_dev_info(iommu, PCI_BUS_NUM(dma_alias),
2617 dma_alias & 0xff, NULL, domain);
2619 if (!tmp || tmp != domain)
2624 tmp = dmar_insert_one_dev_info(iommu, bus, devfn, dev, domain);
2625 if (!tmp || tmp != domain)
2631 static struct dmar_domain *get_domain_for_dev(struct device *dev, int gaw)
2633 struct dmar_domain *domain, *tmp;
2635 domain = find_domain(dev);
2639 domain = find_or_alloc_domain(dev, gaw);
2643 tmp = set_domain_for_dev(dev, domain);
2644 if (!tmp || domain != tmp) {
2645 domain_exit(domain);
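/*
 * Identity-map [start, end] into @domain: reserve the matching IOVA range,
 * clear any existing PTEs that overlap it, and install a 1:1 read/write
 * mapping via __domain_mapping().
 */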
2654 static int iommu_domain_identity_map(struct dmar_domain *domain,
2655 unsigned long long start,
2656 unsigned long long end)
2658 unsigned long first_vpfn = start >> VTD_PAGE_SHIFT;
2659 unsigned long last_vpfn = end >> VTD_PAGE_SHIFT;
2661 if (!reserve_iova(&domain->iovad, dma_to_mm_pfn(first_vpfn),
2662 dma_to_mm_pfn(last_vpfn))) {
2663 pr_err("Reserving iova failed\n");
2667 pr_debug("Mapping reserved region %llx-%llx\n", start, end);
2669 * RMRR range might overlap with a physical memory range; clear it first
2672 dma_pte_clear_range(domain, first_vpfn, last_vpfn);
2674 return __domain_mapping(domain, first_vpfn, NULL,
2675 first_vpfn, last_vpfn - first_vpfn + 1,
2676 DMA_PTE_READ|DMA_PTE_WRITE);
2679 static int domain_prepare_identity_map(struct device *dev,
2680 struct dmar_domain *domain,
2681 unsigned long long start,
2682 unsigned long long end)
2684 /* For _hardware_ passthrough, don't bother. But for software
2685 passthrough, we do it anyway -- it may indicate a memory
2686 range which is reserved in E820, and so didn't get set
2687 up to start with in si_domain */
2688 if (domain == si_domain && hw_pass_through) {
2689 dev_warn(dev, "Ignoring identity map for HW passthrough [0x%Lx - 0x%Lx]\n",
2694 dev_info(dev, "Setting identity map [0x%Lx - 0x%Lx]\n", start, end);
2697 WARN(1, "Your BIOS is broken; RMRR ends before it starts!\n"
2698 "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
2699 dmi_get_system_info(DMI_BIOS_VENDOR),
2700 dmi_get_system_info(DMI_BIOS_VERSION),
2701 dmi_get_system_info(DMI_PRODUCT_VERSION));
2705 if (end >> agaw_to_width(domain->agaw)) {
2706 WARN(1, "Your BIOS is broken; RMRR exceeds permitted address width (%d bits)\n"
2707 "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
2708 agaw_to_width(domain->agaw),
2709 dmi_get_system_info(DMI_BIOS_VENDOR),
2710 dmi_get_system_info(DMI_BIOS_VERSION),
2711 dmi_get_system_info(DMI_PRODUCT_VERSION));
2715 return iommu_domain_identity_map(domain, start, end);
2718 static int iommu_prepare_identity_map(struct device *dev,
2719 unsigned long long start,
2720 unsigned long long end)
2722 struct dmar_domain *domain;
2725 domain = get_domain_for_dev(dev, DEFAULT_DOMAIN_ADDRESS_WIDTH);
2729 ret = domain_prepare_identity_map(dev, domain, start, end);
2731 domain_exit(domain);
2736 static inline int iommu_prepare_rmrr_dev(struct dmar_rmrr_unit *rmrr,
2739 if (dev->archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
2741 return iommu_prepare_identity_map(dev, rmrr->base_address,
2745 #ifdef CONFIG_INTEL_IOMMU_FLOPPY_WA
2746 static inline void iommu_prepare_isa(void)
2748 struct pci_dev *pdev;
2751 pdev = pci_get_class(PCI_CLASS_BRIDGE_ISA << 8, NULL);
2755 pr_info("Prepare 0-16MiB unity mapping for LPC\n");
2756 ret = iommu_prepare_identity_map(&pdev->dev, 0, 16*1024*1024 - 1);
2759 pr_err("Failed to create 0-16MiB identity map - floppy might not work\n");
2764 static inline void iommu_prepare_isa(void)
2768 #endif /* !CONFIG_INTEL_IOMMU_FLOPPY_WA */
2770 static int md_domain_init(struct dmar_domain *domain, int guest_width);
2772 static int __init si_domain_init(int hw)
2774 struct dmar_rmrr_unit *rmrr;
2778 si_domain = alloc_domain(DOMAIN_FLAG_STATIC_IDENTITY);
2782 if (md_domain_init(si_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
2783 domain_exit(si_domain);
2790 for_each_online_node(nid) {
2791 unsigned long start_pfn, end_pfn;
2794 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
2795 ret = iommu_domain_identity_map(si_domain,
2796 PFN_PHYS(start_pfn), PFN_PHYS(end_pfn));
2803 * Normally we use DMA domains for devices which have RMRRs. But we
2804 * lose this requirement for graphics and USB devices. Identity map
2805 * the RMRRs for graphics and USB devices so that they can use the si_domain.
2808 for_each_rmrr_units(rmrr) {
2809 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
2811 unsigned long long start = rmrr->base_address;
2812 unsigned long long end = rmrr->end_address;
2814 if (device_is_rmrr_locked(dev))
2817 if (WARN_ON(end < start ||
2818 end >> agaw_to_width(si_domain->agaw)))
2821 ret = iommu_domain_identity_map(si_domain, start, end);
2830 static int identity_mapping(struct device *dev)
2832 struct device_domain_info *info;
2834 info = dev->archdata.iommu;
2835 if (info && info != DUMMY_DEVICE_DOMAIN_INFO)
2836 return (info->domain == si_domain);
2841 static int domain_add_dev_info(struct dmar_domain *domain, struct device *dev)
2843 struct dmar_domain *ndomain;
2844 struct intel_iommu *iommu;
2847 iommu = device_to_iommu(dev, &bus, &devfn);
2851 ndomain = dmar_insert_one_dev_info(iommu, bus, devfn, dev, domain);
2852 if (ndomain != domain)
2858 static bool device_has_rmrr(struct device *dev)
2860 struct dmar_rmrr_unit *rmrr;
2865 for_each_rmrr_units(rmrr) {
2867 * Return TRUE if this RMRR contains the device we are asking about.
2870 for_each_active_dev_scope(rmrr->devices,
2871 rmrr->devices_cnt, i, tmp)
2882 * There are a couple cases where we need to restrict the functionality of
2883 * devices associated with RMRRs. The first is when evaluating a device for
2884 * identity mapping because problems exist when devices are moved in and out
2885 * of domains and their respective RMRR information is lost. This means that
2886 * a device with associated RMRRs will never be in a "passthrough" domain.
2887 * The second is use of the device through the IOMMU API. This interface
2888 * expects to have full control of the IOVA space for the device. We cannot
2889 * satisfy both the requirement that RMRR access is maintained and have an
2890 * unencumbered IOVA space. We also have no ability to quiesce the device's
2891 * use of the RMRR space or even inform the IOMMU API user of the restriction.
2892 * We therefore prevent devices associated with an RMRR from participating in
2893 * the IOMMU API, which eliminates them from device assignment.
2895 * In both cases we assume that PCI USB devices with RMRRs have them largely
2896 * for historical reasons and that the RMRR space is not actively used post
2897 * boot. This exclusion may change if vendors begin to abuse it.
2899 * The same exception is made for graphics devices, with the requirement that
2900 * any use of the RMRR regions will be torn down before assigning the device to a guest.
2903 static bool device_is_rmrr_locked(struct device *dev)
2905 if (!device_has_rmrr(dev))
2908 if (dev_is_pci(dev)) {
2909 struct pci_dev *pdev = to_pci_dev(dev);
2911 if (IS_USB_DEVICE(pdev) || IS_GFX_DEVICE(pdev))
2919 * Return the required default domain type for a specific device.
2921 * @dev: the device in question
2922 * @startup: true if this is during early boot
2925 * - IOMMU_DOMAIN_DMA: device requires a dynamic mapping domain
2926 * - IOMMU_DOMAIN_IDENTITY: device requires an identity mapping domain
2927 * - 0: both identity and dynamic domains work for this device
2929 static int device_def_domain_type(struct device *dev, int startup)
2931 if (dev_is_pci(dev)) {
2932 struct pci_dev *pdev = to_pci_dev(dev);
2934 if (device_is_rmrr_locked(dev))
2935 return IOMMU_DOMAIN_DMA;
2938 * Prevent any device marked as untrusted from getting
2939 * placed into the statically identity mapping domain.
2941 if (pdev->untrusted)
2942 return IOMMU_DOMAIN_DMA;
2944 if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev))
2945 return IOMMU_DOMAIN_IDENTITY;
2947 if ((iommu_identity_mapping & IDENTMAP_GFX) && IS_GFX_DEVICE(pdev))
2948 return IOMMU_DOMAIN_IDENTITY;
2951 * We want to start off with all devices in the 1:1 domain, and
2952 * take them out later if we find they can't access all of memory.
2954 * However, we can't do this for PCI devices behind bridges,
2955 * because all PCI devices behind the same bridge will end up
2956 * with the same source-id on their transactions.
2958 * Practically speaking, we can't change things around for these
2959 * devices at run-time, because we can't be sure there'll be no
2960 * DMA transactions in flight for any of their siblings.
2962 * So PCI devices (unless they're on the root bus) as well as
2963 * their parent PCI-PCI or PCIe-PCI bridges must be left _out_ of
2964 * the 1:1 domain, just in _case_ one of their siblings turns out
2965 * not to be able to map all of memory.
2967 if (!pci_is_pcie(pdev)) {
2968 if (!pci_is_root_bus(pdev->bus))
2969 return IOMMU_DOMAIN_DMA;
2970 if (pdev->class >> 8 == PCI_CLASS_BRIDGE_PCI)
2971 return IOMMU_DOMAIN_DMA;
2972 } else if (pci_pcie_type(pdev) == PCI_EXP_TYPE_PCI_BRIDGE)
2973 return IOMMU_DOMAIN_DMA;
2975 if (device_has_rmrr(dev))
2976 return IOMMU_DOMAIN_DMA;
2980 * At boot time, we don't yet know if devices will be 64-bit capable.
2981 * Assume that they will — if they turn out not to be, then we can
2982 * take them out of the 1:1 domain later.
2986 * If the device's dma_mask is less than the system's memory
2987 * size then this is not a candidate for identity mapping.
2989 u64 dma_mask = *dev->dma_mask;
2991 if (dev->coherent_dma_mask &&
2992 dev->coherent_dma_mask < dma_mask)
2993 dma_mask = dev->coherent_dma_mask;
2995 return dma_mask >= dma_get_required_mask(dev);
2998 return (iommu_identity_mapping & IDENTMAP_ALL) ?
2999 IOMMU_DOMAIN_IDENTITY : 0;
3002 static inline int iommu_should_identity_map(struct device *dev, int startup)
3004 return device_def_domain_type(dev, startup) == IOMMU_DOMAIN_IDENTITY;
3007 static int __init dev_prepare_static_identity_mapping(struct device *dev, int hw)
3011 if (!iommu_should_identity_map(dev, 1))
3014 ret = domain_add_dev_info(si_domain, dev);
3016 dev_info(dev, "%s identity mapping\n",
3017 hw ? "Hardware" : "Software");
3018 else if (ret == -ENODEV)
3019 /* device not associated with an iommu */
3026 static int __init iommu_prepare_static_identity_mapping(int hw)
3028 struct pci_dev *pdev = NULL;
3029 struct dmar_drhd_unit *drhd;
3030 struct intel_iommu *iommu;
3035 for_each_pci_dev(pdev) {
3036 ret = dev_prepare_static_identity_mapping(&pdev->dev, hw);
3041 for_each_active_iommu(iommu, drhd)
3042 for_each_active_dev_scope(drhd->devices, drhd->devices_cnt, i, dev) {
3043 struct acpi_device_physical_node *pn;
3044 struct acpi_device *adev;
3046 if (dev->bus != &acpi_bus_type)
3049 adev = to_acpi_device(dev);
3050 mutex_lock(&adev->physical_node_lock);
3051 list_for_each_entry(pn, &adev->physical_node_list, node) {
3052 ret = dev_prepare_static_identity_mapping(pn->dev, hw);
3056 mutex_unlock(&adev->physical_node_lock);
3064 static void intel_iommu_init_qi(struct intel_iommu *iommu)
3067 * Start from the sane iommu hardware state.
3068 * If the queued invalidation is already initialized by us
3069 * (for example, while enabling interrupt-remapping) then
3070 * things are already rolling from a sane state.
3074 * Clear any previous faults.
3076 dmar_fault(-1, iommu);
3078 * Disable queued invalidation if supported and already enabled
3079 * before OS handover.
3081 dmar_disable_qi(iommu);
3084 if (dmar_enable_qi(iommu)) {
3086 * Queued invalidation not enabled; use register-based invalidation
3088 iommu->flush.flush_context = __iommu_flush_context;
3089 iommu->flush.flush_iotlb = __iommu_flush_iotlb;
3090 pr_info("%s: Using Register based invalidation\n",
3093 iommu->flush.flush_context = qi_flush_context;
3094 iommu->flush.flush_iotlb = qi_flush_iotlb;
3095 pr_info("%s: Using Queued invalidation\n", iommu->name);
3099 static int copy_context_table(struct intel_iommu *iommu,
3100 struct root_entry *old_re,
3101 struct context_entry **tbl,
3104 int tbl_idx, pos = 0, idx, devfn, ret = 0, did;
3105 struct context_entry *new_ce = NULL, ce;
3106 struct context_entry *old_ce = NULL;
3107 struct root_entry re;
3108 phys_addr_t old_ce_phys;
3110 tbl_idx = ext ? bus * 2 : bus;
3111 memcpy(&re, old_re, sizeof(re));
3113 for (devfn = 0; devfn < 256; devfn++) {
3114 /* First calculate the correct index */
3115 idx = (ext ? devfn * 2 : devfn) % 256;
3118 /* First save what we may have and clean up */
3120 tbl[tbl_idx] = new_ce;
3121 __iommu_flush_cache(iommu, new_ce,
3131 old_ce_phys = root_entry_lctp(&re);
3133 old_ce_phys = root_entry_uctp(&re);
3136 if (ext && devfn == 0) {
3137 /* No LCTP, try UCTP */
3146 old_ce = memremap(old_ce_phys, PAGE_SIZE,
3151 new_ce = alloc_pgtable_page(iommu->node);
3158 /* Now copy the context entry */
3159 memcpy(&ce, old_ce + idx, sizeof(ce));
3161 if (!__context_present(&ce))
3164 did = context_domain_id(&ce);
3165 if (did >= 0 && did < cap_ndoms(iommu->cap))
3166 set_bit(did, iommu->domain_ids);
3169 * We need a marker for copied context entries. This
3170 * marker needs to work for the old format as well as
3171 * for extended context entries.
3173 * Bit 67 of the context entry is used. In the old
3174 * format this bit is available to software, in the
3175 * extended format it is the PGE bit, but PGE is ignored
3176 * by HW if PASIDs are disabled (and thus still available).
3179 * So disable PASIDs first and then mark the entry
3180 * copied. This means that we don't copy PASID
3181 * translations from the old kernel, but this is fine as
3182 * faults there are not fatal.
3184 context_clear_pasid_enable(&ce);
3185 context_set_copied(&ce);
3190 tbl[tbl_idx + pos] = new_ce;
3192 __iommu_flush_cache(iommu, new_ce, VTD_PAGE_SIZE);
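/*
 * copy_translation_tables() - in the kdump case, take over the root and
 * context tables that the previous kernel left behind so that in-flight DMA
 * keeps working while the dump kernel boots. Bails out if the RTT bit would
 * have to change, because that can only be done with translation disabled.
 */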
3201 static int copy_translation_tables(struct intel_iommu *iommu)
3203 struct context_entry **ctxt_tbls;
3204 struct root_entry *old_rt;
3205 phys_addr_t old_rt_phys;
3206 int ctxt_table_entries;
3207 unsigned long flags;
3212 rtaddr_reg = dmar_readq(iommu->reg + DMAR_RTADDR_REG);
3213 ext = !!(rtaddr_reg & DMA_RTADDR_RTT);
3214 new_ext = !!ecap_ecs(iommu->ecap);
3217 * The RTT bit can only be changed when translation is disabled,
3218 * but disabling translation means to open a window for data
3219 * corruption. So bail out and don't copy anything if we would
3220 * have to change the bit.
3225 old_rt_phys = rtaddr_reg & VTD_PAGE_MASK;
3229 old_rt = memremap(old_rt_phys, PAGE_SIZE, MEMREMAP_WB);
3233 /* This is too big for the stack - allocate it from slab */
3234 ctxt_table_entries = ext ? 512 : 256;
3236 ctxt_tbls = kcalloc(ctxt_table_entries, sizeof(void *), GFP_KERNEL);
3240 for (bus = 0; bus < 256; bus++) {
3241 ret = copy_context_table(iommu, &old_rt[bus],
3242 ctxt_tbls, bus, ext);
3244 pr_err("%s: Failed to copy context table for bus %d\n",
3250 spin_lock_irqsave(&iommu->lock, flags);
3252 /* Context tables are copied, now write them to the root_entry table */
3253 for (bus = 0; bus < 256; bus++) {
3254 int idx = ext ? bus * 2 : bus;
3257 if (ctxt_tbls[idx]) {
3258 val = virt_to_phys(ctxt_tbls[idx]) | 1;
3259 iommu->root_entry[bus].lo = val;
3262 if (!ext || !ctxt_tbls[idx + 1])
3265 val = virt_to_phys(ctxt_tbls[idx + 1]) | 1;
3266 iommu->root_entry[bus].hi = val;
3269 spin_unlock_irqrestore(&iommu->lock, flags);
3273 __iommu_flush_cache(iommu, iommu->root_entry, PAGE_SIZE);
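/*
 * init_dmars() - bring up every DMAR unit found in the ACPI tables:
 * allocate the global iommu array, initialize queued invalidation, domain
 * IDs and root entries, copy translation tables from a previous kernel when
 * translation is pre-enabled, set up the static identity (si) domain and the
 * RMRR/ISA identity maps, and finally enable translation.
 */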
3283 static int __init init_dmars(void)
3285 struct dmar_drhd_unit *drhd;
3286 struct dmar_rmrr_unit *rmrr;
3287 bool copied_tables = false;
3289 struct intel_iommu *iommu;
3295 * initialize and program root entry to not present
3298 for_each_drhd_unit(drhd) {
3300 * lock not needed as this is only incremented in the single
3301 * threaded kernel __init code path all other access are read
3304 if (g_num_of_iommus < DMAR_UNITS_SUPPORTED) {
3308 pr_err_once("Exceeded %d IOMMUs\n", DMAR_UNITS_SUPPORTED);
3311 /* Preallocate enough resources for IOMMU hot-addition */
3312 if (g_num_of_iommus < DMAR_UNITS_SUPPORTED)
3313 g_num_of_iommus = DMAR_UNITS_SUPPORTED;
3315 g_iommus = kcalloc(g_num_of_iommus, sizeof(struct intel_iommu *),
3318 pr_err("Allocating global iommu array failed\n");
3323 for_each_active_iommu(iommu, drhd) {
3325 * Find the max pasid size of all IOMMU's in the system.
3326 * We need to ensure the system pasid table is no bigger
3327 * than the smallest supported.
3329 if (pasid_supported(iommu)) {
3330 u32 temp = 2 << ecap_pss(iommu->ecap);
3332 intel_pasid_max_id = min_t(u32, temp,
3333 intel_pasid_max_id);
3336 g_iommus[iommu->seq_id] = iommu;
3338 intel_iommu_init_qi(iommu);
3340 ret = iommu_init_domains(iommu);
3344 init_translation_status(iommu);
3346 if (translation_pre_enabled(iommu) && !is_kdump_kernel()) {
3347 iommu_disable_translation(iommu);
3348 clear_translation_pre_enabled(iommu);
3349 pr_warn("Translation was enabled for %s but we are not in kdump mode\n",
3355 * we could share the same root & context tables
3356 * among all IOMMUs; need to split it later.
3358 ret = iommu_alloc_root_entry(iommu);
3362 if (translation_pre_enabled(iommu)) {
3363 pr_info("Translation already enabled - trying to copy translation structures\n");
3365 ret = copy_translation_tables(iommu);
3368 * We found the IOMMU with translation
3369 * enabled - but failed to copy over the
3370 * old root-entry table. Try to proceed
3371 * by disabling translation now and
3372 * allocating a clean root-entry table.
3373 * This might cause DMAR faults, but
3374 * probably the dump will still succeed.
3376 pr_err("Failed to copy translation tables from previous kernel for %s\n",
3378 iommu_disable_translation(iommu);
3379 clear_translation_pre_enabled(iommu);
3381 pr_info("Copied translation tables from previous kernel for %s\n",
3383 copied_tables = true;
3387 if (!ecap_pass_through(iommu->ecap))
3388 hw_pass_through = 0;
3389 #ifdef CONFIG_INTEL_IOMMU_SVM
3390 if (pasid_supported(iommu))
3391 intel_svm_init(iommu);
3396 * Now that qi is enabled on all iommus, set the root entry and flush
3397 * caches. This is required on some Intel X58 chipsets, otherwise the
3398 * flush_context function will loop forever and the boot hangs.
3400 for_each_active_iommu(iommu, drhd) {
3401 iommu_flush_write_buffer(iommu);
3402 iommu_set_root_entry(iommu);
3403 iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
3404 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
3407 if (iommu_pass_through)
3408 iommu_identity_mapping |= IDENTMAP_ALL;
3410 #ifdef CONFIG_INTEL_IOMMU_BROKEN_GFX_WA
3415 iommu_identity_mapping |= IDENTMAP_GFX;
3417 check_tylersburg_isoch();
3419 ret = si_domain_init(hw_pass_through);
3425 * If we copied translations from a previous kernel in the kdump
3426 * case, we can not assign the devices to domains now, as that
3427 * would eliminate the old mappings. So skip this part and defer
3428 * the assignment to device driver initialization time.
3434 * If pass through is not set or not enabled, setup context entries for
3435 * identity mappings for rmrr, gfx, and isa and may fall back to static
3436 * identity mapping if iommu_identity_mapping is set.
3438 if (iommu_identity_mapping) {
3439 ret = iommu_prepare_static_identity_mapping(hw_pass_through);
3441 pr_crit("Failed to setup IOMMU pass-through\n");
3447 * for each dev attached to rmrr
3449 * locate drhd for dev, alloc domain for dev
3450 * allocate free domain
3451 * allocate page table entries for rmrr
3452 * if context not allocated for bus
3453 * allocate and init context
3454 * set present in root table for this bus
3455 * init context with domain, translation etc
3459 pr_info("Setting RMRR:\n");
3460 for_each_rmrr_units(rmrr) {
3461 /* some BIOSes list non-existent devices in the DMAR table. */
3462 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
3464 ret = iommu_prepare_rmrr_dev(rmrr, dev);
3466 pr_err("Mapping reserved region failed\n");
3470 iommu_prepare_isa();
3477 * global invalidate context cache
3478 * global invalidate iotlb
3479 * enable translation
3481 for_each_iommu(iommu, drhd) {
3482 if (drhd->ignored) {
3484 * we always have to disable PMRs or DMA may fail on this device
3488 iommu_disable_protect_mem_regions(iommu);
3492 iommu_flush_write_buffer(iommu);
3494 #ifdef CONFIG_INTEL_IOMMU_SVM
3495 if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
3497 * Calling dmar_alloc_hwirq() with dmar_global_lock held
3498 * could cause a lock race condition.
3500 up_write(&dmar_global_lock);
3501 ret = intel_svm_enable_prq(iommu);
3502 down_write(&dmar_global_lock);
3507 ret = dmar_set_interrupt(iommu);
3515 for_each_active_iommu(iommu, drhd) {
3516 disable_dmar_iommu(iommu);
3517 free_dmar_iommu(iommu);
3526 /* This takes a number of _MM_ pages, not VTD pages */
3527 static unsigned long intel_alloc_iova(struct device *dev,
3528 struct dmar_domain *domain,
3529 unsigned long nrpages, uint64_t dma_mask)
3531 unsigned long iova_pfn;
3533 /* Restrict dma_mask to the width that the iommu can handle */
3534 dma_mask = min_t(uint64_t, DOMAIN_MAX_ADDR(domain->gaw), dma_mask);
3535 /* Ensure we reserve the whole size-aligned region */
3536 nrpages = __roundup_pow_of_two(nrpages);
3538 if (!dmar_forcedac && dma_mask > DMA_BIT_MASK(32)) {
3540 * First try to allocate an io virtual address in
3541 * DMA_BIT_MASK(32); if that fails, allocate from the full dma_mask range
3544 iova_pfn = alloc_iova_fast(&domain->iovad, nrpages,
3545 IOVA_PFN(DMA_BIT_MASK(32)), false);
3549 iova_pfn = alloc_iova_fast(&domain->iovad, nrpages,
3550 IOVA_PFN(dma_mask), true);
3551 if (unlikely(!iova_pfn)) {
3552 dev_err(dev, "Allocating %ld-page iova failed", nrpages);
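/*
 * An illustrative sketch (not an exhaustive description) of the streaming
 * DMA map path: intel_map_page() -> __intel_map_single() ->
 * intel_alloc_iova() to carve out IOVA space, then domain_pfn_mapping() to
 * install the PTEs. Unmap reverses it via intel_unmap(), which either
 * flushes the IOTLB synchronously (strict mode or untrusted devices) or
 * defers the IOVA release to the flush queue.
 */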
3559 struct dmar_domain *get_valid_domain_for_dev(struct device *dev)
3561 struct dmar_domain *domain, *tmp;
3562 struct dmar_rmrr_unit *rmrr;
3563 struct device *i_dev;
3566 domain = find_domain(dev);
3570 domain = find_or_alloc_domain(dev, DEFAULT_DOMAIN_ADDRESS_WIDTH);
3574 /* We have a new domain - setup possible RMRRs for the device */
3576 for_each_rmrr_units(rmrr) {
3577 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
3582 ret = domain_prepare_identity_map(dev, domain,
3586 dev_err(dev, "Mapping reserved region failed\n");
3591 tmp = set_domain_for_dev(dev, domain);
3592 if (!tmp || domain != tmp) {
3593 domain_exit(domain);
3600 dev_err(dev, "Allocating domain failed\n");
3606 /* Check if the dev needs to go through the non-identity map and unmap process. */
3607 static bool iommu_need_mapping(struct device *dev)
3611 if (iommu_dummy(dev))
3614 found = identity_mapping(dev);
3616 if (iommu_should_identity_map(dev, 0))
3620 * The 32 bit DMA device is removed from si_domain and falls back to
3621 * non-identity mapping.
3623 dmar_remove_one_dev_info(dev);
3624 dev_info(dev, "32bit DMA uses non-identity mapping\n");
3627 * When a 64 bit DMA device is detached from a VM, the device
3628 * is put back into si_domain for identity mapping.
3630 if (iommu_should_identity_map(dev, 0) &&
3631 !domain_add_dev_info(si_domain, dev)) {
3632 dev_info(dev, "64bit DMA uses identity mapping\n");
3640 static dma_addr_t __intel_map_single(struct device *dev, phys_addr_t paddr,
3641 size_t size, int dir, u64 dma_mask)
3643 struct dmar_domain *domain;
3644 phys_addr_t start_paddr;
3645 unsigned long iova_pfn;
3648 struct intel_iommu *iommu;
3649 unsigned long paddr_pfn = paddr >> PAGE_SHIFT;
3651 BUG_ON(dir == DMA_NONE);
3653 domain = get_valid_domain_for_dev(dev);
3655 return DMA_MAPPING_ERROR;
3657 iommu = domain_get_iommu(domain);
3658 size = aligned_nrpages(paddr, size);
3660 iova_pfn = intel_alloc_iova(dev, domain, dma_to_mm_pfn(size), dma_mask);
3665 * Check if DMAR supports zero-length reads on write-only mappings
3668 if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
3669 !cap_zlr(iommu->cap))
3670 prot |= DMA_PTE_READ;
3671 if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
3672 prot |= DMA_PTE_WRITE;
3674 * paddr .. paddr + size might cover a partial page, so we should map the whole
3675 * page. Note: if two parts of one page are mapped separately, we
3676 * might have two guest addresses mapping to the same host paddr, but this
3677 * is not a big problem
3679 ret = domain_pfn_mapping(domain, mm_to_dma_pfn(iova_pfn),
3680 mm_to_dma_pfn(paddr_pfn), size, prot);
3684 start_paddr = (phys_addr_t)iova_pfn << PAGE_SHIFT;
3685 start_paddr += paddr & ~PAGE_MASK;
3690 free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(size));
3691 dev_err(dev, "Device request: %zx@%llx dir %d --- failed\n",
3692 size, (unsigned long long)paddr, dir);
3693 return DMA_MAPPING_ERROR;
3696 static dma_addr_t intel_map_page(struct device *dev, struct page *page,
3697 unsigned long offset, size_t size,
3698 enum dma_data_direction dir,
3699 unsigned long attrs)
3701 if (iommu_need_mapping(dev))
3702 return __intel_map_single(dev, page_to_phys(page) + offset,
3703 size, dir, *dev->dma_mask);
3704 return dma_direct_map_page(dev, page, offset, size, dir, attrs);
3707 static dma_addr_t intel_map_resource(struct device *dev, phys_addr_t phys_addr,
3708 size_t size, enum dma_data_direction dir,
3709 unsigned long attrs)
3711 if (iommu_need_mapping(dev))
3712 return __intel_map_single(dev, phys_addr, size, dir,
3714 return dma_direct_map_resource(dev, phys_addr, size, dir, attrs);
3717 static void intel_unmap(struct device *dev, dma_addr_t dev_addr, size_t size)
3719 struct dmar_domain *domain;
3720 unsigned long start_pfn, last_pfn;
3721 unsigned long nrpages;
3722 unsigned long iova_pfn;
3723 struct intel_iommu *iommu;
3724 struct page *freelist;
3725 struct pci_dev *pdev = NULL;
3727 domain = find_domain(dev);
3730 iommu = domain_get_iommu(domain);
3732 iova_pfn = IOVA_PFN(dev_addr);
3734 nrpages = aligned_nrpages(dev_addr, size);
3735 start_pfn = mm_to_dma_pfn(iova_pfn);
3736 last_pfn = start_pfn + nrpages - 1;
3738 if (dev_is_pci(dev))
3739 pdev = to_pci_dev(dev);
3741 dev_dbg(dev, "Device unmapping: pfn %lx-%lx\n", start_pfn, last_pfn);
3743 freelist = domain_unmap(domain, start_pfn, last_pfn);
3745 if (intel_iommu_strict || (pdev && pdev->untrusted)) {
3746 iommu_flush_iotlb_psi(iommu, domain, start_pfn,
3747 nrpages, !freelist, 0);
3749 free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(nrpages));
3750 dma_free_pagelist(freelist);
3752 queue_iova(&domain->iovad, iova_pfn, nrpages,
3753 (unsigned long)freelist);
3755 * queue up the release of the unmap to save the 1/6th of the
3756 * cpu used up by the iotlb flush operation...
3761 static void intel_unmap_page(struct device *dev, dma_addr_t dev_addr,
3762 size_t size, enum dma_data_direction dir,
3763 unsigned long attrs)
3765 if (iommu_need_mapping(dev))
3766 intel_unmap(dev, dev_addr, size);
3768 dma_direct_unmap_page(dev, dev_addr, size, dir, attrs);
3771 static void intel_unmap_resource(struct device *dev, dma_addr_t dev_addr,
3772 size_t size, enum dma_data_direction dir, unsigned long attrs)
3774 if (iommu_need_mapping(dev))
3775 intel_unmap(dev, dev_addr, size);
3778 static void *intel_alloc_coherent(struct device *dev, size_t size,
3779 dma_addr_t *dma_handle, gfp_t flags,
3780 unsigned long attrs)
3782 struct page *page = NULL;
3785 if (!iommu_need_mapping(dev))
3786 return dma_direct_alloc(dev, size, dma_handle, flags, attrs);
3788 size = PAGE_ALIGN(size);
3789 order = get_order(size);
3791 if (gfpflags_allow_blocking(flags)) {
3792 unsigned int count = size >> PAGE_SHIFT;
3794 page = dma_alloc_from_contiguous(dev, count, order,
3795 flags & __GFP_NOWARN);
3799 page = alloc_pages(flags, order);
3802 memset(page_address(page), 0, size);
3804 *dma_handle = __intel_map_single(dev, page_to_phys(page), size,
3806 dev->coherent_dma_mask);
3807 if (*dma_handle != DMA_MAPPING_ERROR)
3808 return page_address(page);
3809 if (!dma_release_from_contiguous(dev, page, size >> PAGE_SHIFT))
3810 __free_pages(page, order);
3815 static void intel_free_coherent(struct device *dev, size_t size, void *vaddr,
3816 dma_addr_t dma_handle, unsigned long attrs)
3819 struct page *page = virt_to_page(vaddr);
3821 if (!iommu_need_mapping(dev))
3822 return dma_direct_free(dev, size, vaddr, dma_handle, attrs);
3824 size = PAGE_ALIGN(size);
3825 order = get_order(size);
3827 intel_unmap(dev, dma_handle, size);
3828 if (!dma_release_from_contiguous(dev, page, size >> PAGE_SHIFT))
3829 __free_pages(page, order);
3832 static void intel_unmap_sg(struct device *dev, struct scatterlist *sglist,
3833 int nelems, enum dma_data_direction dir,
3834 unsigned long attrs)
3836 dma_addr_t startaddr = sg_dma_address(sglist) & PAGE_MASK;
3837 unsigned long nrpages = 0;
3838 struct scatterlist *sg;
3841 if (!iommu_need_mapping(dev))
3842 return dma_direct_unmap_sg(dev, sglist, nelems, dir, attrs);
3844 for_each_sg(sglist, sg, nelems, i) {
3845 nrpages += aligned_nrpages(sg_dma_address(sg), sg_dma_len(sg));
3848 intel_unmap(dev, startaddr, nrpages << VTD_PAGE_SHIFT);
3851 static int intel_map_sg(struct device *dev, struct scatterlist *sglist, int nelems,
3852 enum dma_data_direction dir, unsigned long attrs)
3855 struct dmar_domain *domain;
3858 unsigned long iova_pfn;
3860 struct scatterlist *sg;
3861 unsigned long start_vpfn;
3862 struct intel_iommu *iommu;
3864 BUG_ON(dir == DMA_NONE);
3865 if (!iommu_need_mapping(dev))
3866 return dma_direct_map_sg(dev, sglist, nelems, dir, attrs);
3868 domain = get_valid_domain_for_dev(dev);
3872 iommu = domain_get_iommu(domain);
3874 for_each_sg(sglist, sg, nelems, i)
3875 size += aligned_nrpages(sg->offset, sg->length);
3877 iova_pfn = intel_alloc_iova(dev, domain, dma_to_mm_pfn(size),
3880 sglist->dma_length = 0;
3885 * Check if DMAR supports zero-length reads on write-only mappings
3888 if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
3889 !cap_zlr(iommu->cap))
3890 prot |= DMA_PTE_READ;
3891 if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
3892 prot |= DMA_PTE_WRITE;
3894 start_vpfn = mm_to_dma_pfn(iova_pfn);
3896 ret = domain_sg_mapping(domain, start_vpfn, sglist, size, prot);
3897 if (unlikely(ret)) {
3898 dma_pte_free_pagetable(domain, start_vpfn,
3899 start_vpfn + size - 1,
3900 agaw_to_level(domain->agaw) + 1);
3901 free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(size));
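/*
 * The dma_map_ops below plug the functions above into the generic DMA API.
 * Devices that do not need IOMMU-backed mappings (identity-mapped or dummy
 * devices) fall through to the dma_direct_* helpers instead.
 */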
3908 static const struct dma_map_ops intel_dma_ops = {
3909 .alloc = intel_alloc_coherent,
3910 .free = intel_free_coherent,
3911 .map_sg = intel_map_sg,
3912 .unmap_sg = intel_unmap_sg,
3913 .map_page = intel_map_page,
3914 .unmap_page = intel_unmap_page,
3915 .map_resource = intel_map_resource,
3916 .unmap_resource = intel_unmap_resource,
3917 .dma_supported = dma_direct_supported,
3920 static inline int iommu_domain_cache_init(void)
3924 iommu_domain_cache = kmem_cache_create("iommu_domain",
3925 sizeof(struct dmar_domain),
3930 if (!iommu_domain_cache) {
3931 pr_err("Couldn't create iommu_domain cache\n");
3938 static inline int iommu_devinfo_cache_init(void)
3942 iommu_devinfo_cache = kmem_cache_create("iommu_devinfo",
3943 sizeof(struct device_domain_info),
3947 if (!iommu_devinfo_cache) {
3948 pr_err("Couldn't create devinfo cache\n");
3955 static int __init iommu_init_mempool(void)
3958 ret = iova_cache_get();
3962 ret = iommu_domain_cache_init();
3966 ret = iommu_devinfo_cache_init();
3970 kmem_cache_destroy(iommu_domain_cache);
3977 static void __init iommu_exit_mempool(void)
3979 kmem_cache_destroy(iommu_devinfo_cache);
3980 kmem_cache_destroy(iommu_domain_cache);
3984 static void quirk_ioat_snb_local_iommu(struct pci_dev *pdev)
3986 struct dmar_drhd_unit *drhd;
3990 /* We know that this device on this chipset has its own IOMMU.
3991 * If we find it under a different IOMMU, then the BIOS is lying
3992 * to us. Hope that the IOMMU for this device is actually
3993 * disabled, and it needs no translation...
3995 rc = pci_bus_read_config_dword(pdev->bus, PCI_DEVFN(0, 0), 0xb0, &vtbar);
3997 /* "can't" happen */
3998 dev_info(&pdev->dev, "failed to run vt-d quirk\n");
4001 vtbar &= 0xffff0000;
4003 /* we know that this iommu should be at offset 0xa000 from vtbar */
4004 drhd = dmar_find_matched_drhd_unit(pdev);
4005 if (WARN_TAINT_ONCE(!drhd || drhd->reg_base_addr - vtbar != 0xa000,
4006 TAINT_FIRMWARE_WORKAROUND,
4007 "BIOS assigned incorrect VT-d unit for Intel(R) QuickData Technology device\n"))
4008 pdev->dev.archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
4010 DECLARE_PCI_FIXUP_ENABLE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IOAT_SNB, quirk_ioat_snb_local_iommu);
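/*
 * init_no_remapping_devices() - mark DMAR units that cover no devices at
 * all, or only graphics devices when dmar_map_gfx is disabled, so that the
 * devices behind them are treated as DUMMY_DEVICE_DOMAIN_INFO and bypass
 * translation.
 */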
4012 static void __init init_no_remapping_devices(void)
4014 struct dmar_drhd_unit *drhd;
4018 for_each_drhd_unit(drhd) {
4019 if (!drhd->include_all) {
4020 for_each_active_dev_scope(drhd->devices,
4021 drhd->devices_cnt, i, dev)
4023 /* ignore DMAR unit if no devices exist */
4024 if (i == drhd->devices_cnt)
4029 for_each_active_drhd_unit(drhd) {
4030 if (drhd->include_all)
4033 for_each_active_dev_scope(drhd->devices,
4034 drhd->devices_cnt, i, dev)
4035 if (!dev_is_pci(dev) || !IS_GFX_DEVICE(to_pci_dev(dev)))
4037 if (i < drhd->devices_cnt)
4040 /* This IOMMU has *only* gfx devices. Either bypass it or
4041 set the gfx_mapped flag, as appropriate */
4042 if (!dmar_map_gfx) {
4044 for_each_active_dev_scope(drhd->devices,
4045 drhd->devices_cnt, i, dev)
4046 dev->archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
4051 #ifdef CONFIG_SUSPEND
4052 static int init_iommu_hw(void)
4054 struct dmar_drhd_unit *drhd;
4055 struct intel_iommu *iommu = NULL;
4057 for_each_active_iommu(iommu, drhd)
4059 dmar_reenable_qi(iommu);
4061 for_each_iommu(iommu, drhd) {
4062 if (drhd->ignored) {
4064 * we always have to disable PMRs or DMA may fail on this device
4068 iommu_disable_protect_mem_regions(iommu);
4072 iommu_flush_write_buffer(iommu);
4074 iommu_set_root_entry(iommu);
4076 iommu->flush.flush_context(iommu, 0, 0, 0,
4077 DMA_CCMD_GLOBAL_INVL);
4078 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
4079 iommu_enable_translation(iommu);
4080 iommu_disable_protect_mem_regions(iommu);
4086 static void iommu_flush_all(void)
4088 struct dmar_drhd_unit *drhd;
4089 struct intel_iommu *iommu;
4091 for_each_active_iommu(iommu, drhd) {
4092 iommu->flush.flush_context(iommu, 0, 0, 0,
4093 DMA_CCMD_GLOBAL_INVL);
4094 iommu->flush.flush_iotlb(iommu, 0, 0, 0,
4095 DMA_TLB_GLOBAL_FLUSH);
4099 static int iommu_suspend(void)
4101 struct dmar_drhd_unit *drhd;
4102 struct intel_iommu *iommu = NULL;
4105 for_each_active_iommu(iommu, drhd) {
4106 iommu->iommu_state = kcalloc(MAX_SR_DMAR_REGS, sizeof(u32),
4108 if (!iommu->iommu_state)
4114 for_each_active_iommu(iommu, drhd) {
4115 iommu_disable_translation(iommu);
4117 raw_spin_lock_irqsave(&iommu->register_lock, flag);
4119 iommu->iommu_state[SR_DMAR_FECTL_REG] =
4120 readl(iommu->reg + DMAR_FECTL_REG);
4121 iommu->iommu_state[SR_DMAR_FEDATA_REG] =
4122 readl(iommu->reg + DMAR_FEDATA_REG);
4123 iommu->iommu_state[SR_DMAR_FEADDR_REG] =
4124 readl(iommu->reg + DMAR_FEADDR_REG);
4125 iommu->iommu_state[SR_DMAR_FEUADDR_REG] =
4126 readl(iommu->reg + DMAR_FEUADDR_REG);
4128 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
4133 for_each_active_iommu(iommu, drhd)
4134 kfree(iommu->iommu_state);
4139 static void iommu_resume(void)
4141 struct dmar_drhd_unit *drhd;
4142 struct intel_iommu *iommu = NULL;
4145 if (init_iommu_hw()) {
4147 panic("tboot: IOMMU setup failed, DMAR can not resume!\n");
4149 WARN(1, "IOMMU setup failed, DMAR can not resume!\n");
4153 for_each_active_iommu(iommu, drhd) {
4155 raw_spin_lock_irqsave(&iommu->register_lock, flag);
4157 writel(iommu->iommu_state[SR_DMAR_FECTL_REG],
4158 iommu->reg + DMAR_FECTL_REG);
4159 writel(iommu->iommu_state[SR_DMAR_FEDATA_REG],
4160 iommu->reg + DMAR_FEDATA_REG);
4161 writel(iommu->iommu_state[SR_DMAR_FEADDR_REG],
4162 iommu->reg + DMAR_FEADDR_REG);
4163 writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG],
4164 iommu->reg + DMAR_FEUADDR_REG);
4166 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
4169 for_each_active_iommu(iommu, drhd)
4170 kfree(iommu->iommu_state);
4173 static struct syscore_ops iommu_syscore_ops = {
4174 .resume = iommu_resume,
4175 .suspend = iommu_suspend,
4178 static void __init init_iommu_pm_ops(void)
4180 register_syscore_ops(&iommu_syscore_ops);
4184 static inline void init_iommu_pm_ops(void) {}
4185 #endif /* CONFIG_PM */
4188 int __init dmar_parse_one_rmrr(struct acpi_dmar_header *header, void *arg)
4190 struct acpi_dmar_reserved_memory *rmrr;
4191 int prot = DMA_PTE_READ|DMA_PTE_WRITE;
4192 struct dmar_rmrr_unit *rmrru;
4195 rmrru = kzalloc(sizeof(*rmrru), GFP_KERNEL);
4199 rmrru->hdr = header;
4200 rmrr = (struct acpi_dmar_reserved_memory *)header;
4201 rmrru->base_address = rmrr->base_address;
4202 rmrru->end_address = rmrr->end_address;
4204 length = rmrr->end_address - rmrr->base_address + 1;
4205 rmrru->resv = iommu_alloc_resv_region(rmrr->base_address, length, prot,
4210 rmrru->devices = dmar_alloc_dev_scope((void *)(rmrr + 1),
4211 ((void *)rmrr) + rmrr->header.length,
4212 &rmrru->devices_cnt);
4213 if (rmrru->devices_cnt && rmrru->devices == NULL)
4216 list_add(&rmrru->list, &dmar_rmrr_units);
4227 static struct dmar_atsr_unit *dmar_find_atsr(struct acpi_dmar_atsr *atsr)
4229 struct dmar_atsr_unit *atsru;
4230 struct acpi_dmar_atsr *tmp;
4232 list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) {
4233 tmp = (struct acpi_dmar_atsr *)atsru->hdr;
4234 if (atsr->segment != tmp->segment)
4236 if (atsr->header.length != tmp->header.length)
4238 if (memcmp(atsr, tmp, atsr->header.length) == 0)
4245 int dmar_parse_one_atsr(struct acpi_dmar_header *hdr, void *arg)
4247 struct acpi_dmar_atsr *atsr;
4248 struct dmar_atsr_unit *atsru;
4250 if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled)
4253 atsr = container_of(hdr, struct acpi_dmar_atsr, header);
4254 atsru = dmar_find_atsr(atsr);
4258 atsru = kzalloc(sizeof(*atsru) + hdr->length, GFP_KERNEL);
4263 * If memory is allocated from slab by ACPI _DSM method, we need to
4264 * copy the memory content because the memory buffer will be freed on exit.
4267 atsru->hdr = (void *)(atsru + 1);
4268 memcpy(atsru->hdr, hdr, hdr->length);
4269 atsru->include_all = atsr->flags & 0x1;
4270 if (!atsru->include_all) {
4271 atsru->devices = dmar_alloc_dev_scope((void *)(atsr + 1),
4272 (void *)atsr + atsr->header.length,
4273 &atsru->devices_cnt);
4274 if (atsru->devices_cnt && atsru->devices == NULL) {
4280 list_add_rcu(&atsru->list, &dmar_atsr_units);
4285 static void intel_iommu_free_atsr(struct dmar_atsr_unit *atsru)
4287 dmar_free_dev_scope(&atsru->devices, &atsru->devices_cnt);
4291 int dmar_release_one_atsr(struct acpi_dmar_header *hdr, void *arg)
4293 struct acpi_dmar_atsr *atsr;
4294 struct dmar_atsr_unit *atsru;
4296 atsr = container_of(hdr, struct acpi_dmar_atsr, header);
4297 atsru = dmar_find_atsr(atsr);
4299 list_del_rcu(&atsru->list);
4301 intel_iommu_free_atsr(atsru);
4307 int dmar_check_one_atsr(struct acpi_dmar_header *hdr, void *arg)
4311 struct acpi_dmar_atsr *atsr;
4312 struct dmar_atsr_unit *atsru;
4314 atsr = container_of(hdr, struct acpi_dmar_atsr, header);
4315 atsru = dmar_find_atsr(atsr);
4319 if (!atsru->include_all && atsru->devices && atsru->devices_cnt) {
4320 for_each_active_dev_scope(atsru->devices, atsru->devices_cnt,
4328 static int intel_iommu_add(struct dmar_drhd_unit *dmaru)
4331 struct intel_iommu *iommu = dmaru->iommu;
4333 if (g_iommus[iommu->seq_id])
4336 if (hw_pass_through && !ecap_pass_through(iommu->ecap)) {
4337 pr_warn("%s: Doesn't support hardware pass through.\n",
4341 if (!ecap_sc_support(iommu->ecap) &&
4342 domain_update_iommu_snooping(iommu)) {
4343 pr_warn("%s: Doesn't support snooping.\n",
4347 sp = domain_update_iommu_superpage(iommu) - 1;
4348 if (sp >= 0 && !(cap_super_page_val(iommu->cap) & (1 << sp))) {
4349 pr_warn("%s: Doesn't support large page.\n",
4355 * Disable translation if already enabled prior to OS handover.
4357 if (iommu->gcmd & DMA_GCMD_TE)
4358 iommu_disable_translation(iommu);
4360 g_iommus[iommu->seq_id] = iommu;
4361 ret = iommu_init_domains(iommu);
4363 ret = iommu_alloc_root_entry(iommu);
4367 #ifdef CONFIG_INTEL_IOMMU_SVM
4368 if (pasid_supported(iommu))
4369 intel_svm_init(iommu);
4372 if (dmaru->ignored) {
4374 * we always have to disable PMRs or DMA may fail on this device
4377 iommu_disable_protect_mem_regions(iommu);
4381 intel_iommu_init_qi(iommu);
4382 iommu_flush_write_buffer(iommu);
4384 #ifdef CONFIG_INTEL_IOMMU_SVM
4385 if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
4386 ret = intel_svm_enable_prq(iommu);
4391 ret = dmar_set_interrupt(iommu);
4395 iommu_set_root_entry(iommu);
4396 iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
4397 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
4398 iommu_enable_translation(iommu);
4400 iommu_disable_protect_mem_regions(iommu);
4404 disable_dmar_iommu(iommu);
4406 free_dmar_iommu(iommu);
4410 int dmar_iommu_hotplug(struct dmar_drhd_unit *dmaru, bool insert)
4413 struct intel_iommu *iommu = dmaru->iommu;
4415 if (!intel_iommu_enabled)
4421 ret = intel_iommu_add(dmaru);
4423 disable_dmar_iommu(iommu);
4424 free_dmar_iommu(iommu);
4430 static void intel_iommu_free_dmars(void)
4432 struct dmar_rmrr_unit *rmrru, *rmrr_n;
4433 struct dmar_atsr_unit *atsru, *atsr_n;
4435 list_for_each_entry_safe(rmrru, rmrr_n, &dmar_rmrr_units, list) {
4436 list_del(&rmrru->list);
4437 dmar_free_dev_scope(&rmrru->devices, &rmrru->devices_cnt);
4442 list_for_each_entry_safe(atsru, atsr_n, &dmar_atsr_units, list) {
4443 list_del(&atsru->list);
4444 intel_iommu_free_atsr(atsru);
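/*
 * dmar_find_matched_atsr_unit() - walk up from the device to its root port
 * and check whether that port (or an include_all ATSR) is listed in an ATSR,
 * i.e. whether ATS may be used for the device. Integrated devices on the
 * root bus are always allowed.
 */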
4448 int dmar_find_matched_atsr_unit(struct pci_dev *dev)
4451 struct pci_bus *bus;
4452 struct pci_dev *bridge = NULL;
4454 struct acpi_dmar_atsr *atsr;
4455 struct dmar_atsr_unit *atsru;
4457 dev = pci_physfn(dev);
4458 for (bus = dev->bus; bus; bus = bus->parent) {
4460 /* If it's an integrated device, allow ATS */
4463 /* Connected via non-PCIe: no ATS */
4464 if (!pci_is_pcie(bridge) ||
4465 pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE)
4467 /* If we found the root port, look it up in the ATSR */
4468 if (pci_pcie_type(bridge) == PCI_EXP_TYPE_ROOT_PORT)
4473 list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) {
4474 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
4475 if (atsr->segment != pci_domain_nr(dev->bus))
4478 for_each_dev_scope(atsru->devices, atsru->devices_cnt, i, tmp)
4479 if (tmp == &bridge->dev)
4482 if (atsru->include_all)
4492 int dmar_iommu_notify_scope_dev(struct dmar_pci_notify_info *info)
4495 struct dmar_rmrr_unit *rmrru;
4496 struct dmar_atsr_unit *atsru;
4497 struct acpi_dmar_atsr *atsr;
4498 struct acpi_dmar_reserved_memory *rmrr;
4500 if (!intel_iommu_enabled && system_state >= SYSTEM_RUNNING)
4503 list_for_each_entry(rmrru, &dmar_rmrr_units, list) {
4504 rmrr = container_of(rmrru->hdr,
4505 struct acpi_dmar_reserved_memory, header);
4506 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
4507 ret = dmar_insert_dev_scope(info, (void *)(rmrr + 1),
4508 ((void *)rmrr) + rmrr->header.length,
4509 rmrr->segment, rmrru->devices,
4510 rmrru->devices_cnt);
4513 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
4514 dmar_remove_dev_scope(info, rmrr->segment,
4515 rmrru->devices, rmrru->devices_cnt);
4519 list_for_each_entry(atsru, &dmar_atsr_units, list) {
4520 if (atsru->include_all)
4523 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
4524 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
4525 ret = dmar_insert_dev_scope(info, (void *)(atsr + 1),
4526 (void *)atsr + atsr->header.length,
4527 atsr->segment, atsru->devices,
4528 atsru->devices_cnt);
4533 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
4534 if (dmar_remove_dev_scope(info, atsr->segment,
4535 atsru->devices, atsru->devices_cnt))
4544 * Here we only respond to the action of a device being unbound from its driver.
4546 * A newly added device is not attached to its DMAR domain here yet; that will happen
4547 * when the device is mapped to an iova.
4549 static int device_notifier(struct notifier_block *nb,
4550 unsigned long action, void *data)
4552 struct device *dev = data;
4553 struct dmar_domain *domain;
4555 if (iommu_dummy(dev))
4558 if (action == BUS_NOTIFY_REMOVED_DEVICE) {
4559 domain = find_domain(dev);
4563 dmar_remove_one_dev_info(dev);
4564 } else if (action == BUS_NOTIFY_ADD_DEVICE) {
4565 if (iommu_should_identity_map(dev, 1))
4566 domain_add_dev_info(si_domain, dev);
4572 static struct notifier_block device_nb = {
4573 .notifier_call = device_notifier,
4576 static int intel_iommu_memory_notifier(struct notifier_block *nb,
4577 unsigned long val, void *v)
4579 struct memory_notify *mhp = v;
4580 unsigned long long start, end;
4581 unsigned long start_vpfn, last_vpfn;
4584 case MEM_GOING_ONLINE:
4585 start = mhp->start_pfn << PAGE_SHIFT;
4586 end = ((mhp->start_pfn + mhp->nr_pages) << PAGE_SHIFT) - 1;
4587 if (iommu_domain_identity_map(si_domain, start, end)) {
4588 pr_warn("Failed to build identity map for [%llx-%llx]\n",
4595 case MEM_CANCEL_ONLINE:
4596 start_vpfn = mm_to_dma_pfn(mhp->start_pfn);
4597 last_vpfn = mm_to_dma_pfn(mhp->start_pfn + mhp->nr_pages - 1);
4598 while (start_vpfn <= last_vpfn) {
4600 struct dmar_drhd_unit *drhd;
4601 struct intel_iommu *iommu;
4602 struct page *freelist;
4604 iova = find_iova(&si_domain->iovad, start_vpfn);
4606 pr_debug("Failed get IOVA for PFN %lx\n",
4611 iova = split_and_remove_iova(&si_domain->iovad, iova,
4612 start_vpfn, last_vpfn);
4614 pr_warn("Failed to split IOVA PFN [%lx-%lx]\n",
4615 start_vpfn, last_vpfn);
4619 freelist = domain_unmap(si_domain, iova->pfn_lo,
4623 for_each_active_iommu(iommu, drhd)
4624 iommu_flush_iotlb_psi(iommu, si_domain,
4625 iova->pfn_lo, iova_size(iova),
4628 dma_free_pagelist(freelist);
4630 start_vpfn = iova->pfn_hi + 1;
4631 free_iova_mem(iova);
4639 static struct notifier_block intel_iommu_memory_nb = {
4640 .notifier_call = intel_iommu_memory_notifier,
4644 static void free_all_cpu_cached_iovas(unsigned int cpu)
4648 for (i = 0; i < g_num_of_iommus; i++) {
4649 struct intel_iommu *iommu = g_iommus[i];
4650 struct dmar_domain *domain;
4656 for (did = 0; did < cap_ndoms(iommu->cap); did++) {
4657 domain = get_iommu_domain(iommu, (u16)did);
4661 free_cpu_cached_iovas(cpu, &domain->iovad);
4666 static int intel_iommu_cpu_dead(unsigned int cpu)
4668 free_all_cpu_cached_iovas(cpu);
4672 static void intel_disable_iommus(void)
4674 struct intel_iommu *iommu = NULL;
4675 struct dmar_drhd_unit *drhd;
4677 for_each_iommu(iommu, drhd)
4678 iommu_disable_translation(iommu);
4681 static inline struct intel_iommu *dev_to_intel_iommu(struct device *dev)
4683 struct iommu_device *iommu_dev = dev_to_iommu_device(dev);
4685 return container_of(iommu_dev, struct intel_iommu, iommu);
4688 static ssize_t intel_iommu_show_version(struct device *dev,
4689 struct device_attribute *attr,
4692 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4693 u32 ver = readl(iommu->reg + DMAR_VER_REG);
4694 return sprintf(buf, "%d:%d\n",
4695 DMAR_VER_MAJOR(ver), DMAR_VER_MINOR(ver));
4697 static DEVICE_ATTR(version, S_IRUGO, intel_iommu_show_version, NULL);
4699 static ssize_t intel_iommu_show_address(struct device *dev,
4700 struct device_attribute *attr,
4703 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4704 return sprintf(buf, "%llx\n", iommu->reg_phys);
4706 static DEVICE_ATTR(address, S_IRUGO, intel_iommu_show_address, NULL);
4708 static ssize_t intel_iommu_show_cap(struct device *dev,
4709 struct device_attribute *attr,
4712 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4713 return sprintf(buf, "%llx\n", iommu->cap);
4715 static DEVICE_ATTR(cap, S_IRUGO, intel_iommu_show_cap, NULL);
4717 static ssize_t intel_iommu_show_ecap(struct device *dev,
4718 struct device_attribute *attr,
4721 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4722 return sprintf(buf, "%llx\n", iommu->ecap);
4724 static DEVICE_ATTR(ecap, S_IRUGO, intel_iommu_show_ecap, NULL);
4726 static ssize_t intel_iommu_show_ndoms(struct device *dev,
4727 struct device_attribute *attr,
4730 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4731 return sprintf(buf, "%ld\n", cap_ndoms(iommu->cap));
4733 static DEVICE_ATTR(domains_supported, S_IRUGO, intel_iommu_show_ndoms, NULL);
4735 static ssize_t intel_iommu_show_ndoms_used(struct device *dev,
4736 struct device_attribute *attr,
4739 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4740 return sprintf(buf, "%d\n", bitmap_weight(iommu->domain_ids,
4741 cap_ndoms(iommu->cap)));
4743 static DEVICE_ATTR(domains_used, S_IRUGO, intel_iommu_show_ndoms_used, NULL);
4745 static struct attribute *intel_iommu_attrs[] = {
4746 &dev_attr_version.attr,
4747 &dev_attr_address.attr,
4749 &dev_attr_ecap.attr,
4750 &dev_attr_domains_supported.attr,
4751 &dev_attr_domains_used.attr,
4755 static struct attribute_group intel_iommu_group = {
4756 .name = "intel-iommu",
4757 .attrs = intel_iommu_attrs,
4760 const struct attribute_group *intel_iommu_groups[] = {
4765 static int __init platform_optin_force_iommu(void)
4767 struct pci_dev *pdev = NULL;
4768 bool has_untrusted_dev = false;
4770 if (!dmar_platform_optin() || no_platform_optin)
4773 for_each_pci_dev(pdev) {
4774 if (pdev->untrusted) {
4775 has_untrusted_dev = true;
4780 if (!has_untrusted_dev)
4783 if (no_iommu || dmar_disabled)
4784 pr_info("Intel-IOMMU force enabled due to platform opt in\n");
4787 * If Intel-IOMMU is disabled by default, we will apply identity
4788 * map for all devices except those marked as being untrusted.
4791 iommu_identity_mapping |= IDENTMAP_ALL;
4794 #if defined(CONFIG_X86) && defined(CONFIG_SWIOTLB)
4802 int __init intel_iommu_init(void)
4805 struct dmar_drhd_unit *drhd;
4806 struct intel_iommu *iommu;
4809 * Intel IOMMU is required for a TXT/tboot launch or platform
4810 * opt in, so enforce that.
4812 force_on = tboot_force_iommu() || platform_optin_force_iommu();
4814 if (iommu_init_mempool()) {
4816 panic("tboot: Failed to initialize iommu memory\n");
4820 down_write(&dmar_global_lock);
4821 if (dmar_table_init()) {
4823 panic("tboot: Failed to initialize DMAR table\n");
4827 if (dmar_dev_scope_init() < 0) {
4829 panic("tboot: Failed to initialize DMAR device scope\n");
4833 up_write(&dmar_global_lock);
4836 * The bus notifier takes the dmar_global_lock, so lockdep will
4837 * complain later when we register it under the lock.
4839 dmar_register_bus_notifier();
4841 down_write(&dmar_global_lock);
4843 if (no_iommu || dmar_disabled) {
4845 * We exit the function here to ensure IOMMU's remapping and
4846 * mempool aren't setup, which means that the IOMMU's PMRs
4847 * won't be disabled via the call to init_dmars(). So disable
4848 * it explicitly here. The PMRs were setup by tboot prior to
4849 * calling SENTER, but the kernel is expected to reset/tear down the PMRs.
4852 if (intel_iommu_tboot_noforce) {
4853 for_each_iommu(iommu, drhd)
4854 iommu_disable_protect_mem_regions(iommu);
4858 * Make sure the IOMMUs are switched off, even when we
4859 * boot into a kexec kernel and the previous kernel left them enabled.
4862 intel_disable_iommus();
4866 if (list_empty(&dmar_rmrr_units))
4867 pr_info("No RMRR found\n");
4869 if (list_empty(&dmar_atsr_units))
4870 pr_info("No ATSR found\n");
4872 if (dmar_init_reserved_ranges()) {
4874 panic("tboot: Failed to reserve iommu ranges\n");
4875 goto out_free_reserved_range;
4879 intel_iommu_gfx_mapped = 1;
4881 init_no_remapping_devices();
4886 panic("tboot: Failed to initialize DMARs\n");
4887 pr_err("Initialization failed\n");
4888 goto out_free_reserved_range;
4890 up_write(&dmar_global_lock);
4892 #if defined(CONFIG_X86) && defined(CONFIG_SWIOTLB)
4895 dma_ops = &intel_dma_ops;
4897 init_iommu_pm_ops();
4899 for_each_active_iommu(iommu, drhd) {
4900 iommu_device_sysfs_add(&iommu->iommu, NULL,
4903 iommu_device_set_ops(&iommu->iommu, &intel_iommu_ops);
4904 iommu_device_register(&iommu->iommu);
4907 bus_set_iommu(&pci_bus_type, &intel_iommu_ops);
4908 bus_register_notifier(&pci_bus_type, &device_nb);
4909 if (si_domain && !hw_pass_through)
4910 register_memory_notifier(&intel_iommu_memory_nb);
4911 cpuhp_setup_state(CPUHP_IOMMU_INTEL_DEAD, "iommu/intel:dead", NULL,
4912 intel_iommu_cpu_dead);
4914 /* Finally, we enable the DMA remapping hardware. */
4915 for_each_iommu(iommu, drhd) {
4916 if (!translation_pre_enabled(iommu))
4917 iommu_enable_translation(iommu);
4919 iommu_disable_protect_mem_regions(iommu);
4921 pr_info("Intel(R) Virtualization Technology for Directed I/O\n");
4923 intel_iommu_enabled = 1;
4924 intel_iommu_debugfs_init();
4928 out_free_reserved_range:
4929 put_iova_domain(&reserved_iova_list);
4931 intel_iommu_free_dmars();
4932 up_write(&dmar_global_lock);
4933 iommu_exit_mempool();
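/*
 * pci_for_each_dma_alias() callback: clears the context entry for one
 * bus/devfn alias of the device being torn down.
 */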
4937 static int domain_context_clear_one_cb(struct pci_dev *pdev, u16 alias, void *opaque)
4939 struct intel_iommu *iommu = opaque;
4941 domain_context_clear_one(iommu, PCI_BUS_NUM(alias), alias & 0xff);
4946 * NB - intel-iommu lacks any sort of reference counting for the users of
4947 * dependent devices. If multiple endpoints have intersecting dependent
4948 * devices, unbinding the driver from any one of them will possibly leave
4949 * the others unable to operate.
4951 static void domain_context_clear(struct intel_iommu *iommu, struct device *dev)
4953 if (!iommu || !dev || !dev_is_pci(dev))
4956 pci_for_each_dma_alias(to_pci_dev(dev), &domain_context_clear_one_cb, iommu);
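/*
 * Illustration of the caveat above: two endpoints behind the same
 * PCIe-to-PCI bridge are seen by the IOMMU under the bridge's requester
 * ID, so clearing the context for one of them also removes the
 * translation the other one is still using.
 */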
4959 static void __dmar_remove_one_dev_info(struct device_domain_info *info)
4961 struct intel_iommu *iommu;
4962 unsigned long flags;
4964 assert_spin_locked(&device_domain_lock);
4969 iommu = info->iommu;
4972 if (dev_is_pci(info->dev) && sm_supported(iommu))
4973 intel_pasid_tear_down_entry(iommu, info->dev,
4976 iommu_disable_dev_iotlb(info);
4977 domain_context_clear(iommu, info->dev);
4978 intel_pasid_free_table(info->dev);
4981 unlink_domain_info(info);
4983 spin_lock_irqsave(&iommu->lock, flags);
4984 domain_detach_iommu(info->domain, iommu);
4985 spin_unlock_irqrestore(&iommu->lock, flags);
4987 free_devinfo_mem(info);
4990 static void dmar_remove_one_dev_info(struct device *dev)
4992 struct device_domain_info *info;
4993 unsigned long flags;
4995 spin_lock_irqsave(&device_domain_lock, flags);
4996 info = dev->archdata.iommu;
4997 __dmar_remove_one_dev_info(info);
4998 spin_unlock_irqrestore(&device_domain_lock, flags);
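/*
 * Minimal domain setup for domains created through the IOMMU API: seed
 * the IOVA allocator, reserve the special ranges, derive the AGAW from
 * the requested guest address width and allocate the top-level page
 * directory.
 */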
5001 static int md_domain_init(struct dmar_domain *domain, int guest_width)
5005 init_iova_domain(&domain->iovad, VTD_PAGE_SIZE, IOVA_START_PFN);
5006 domain_reserve_special_ranges(domain);
5008 /* calculate AGAW */
5009 domain->gaw = guest_width;
5010 adjust_width = guestwidth_to_adjustwidth(guest_width);
5011 domain->agaw = width_to_agaw(adjust_width);
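/*
 * For illustration (assuming the standard VT-d AGAW encoding): a 48-bit
 * guest width corresponds to a 4-level page table (AGAW 2), while the
 * 57-bit DEFAULT_DOMAIN_ADDRESS_WIDTH used by intel_iommu_domain_alloc()
 * corresponds to a 5-level table (AGAW 3).
 */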
5013 domain->iommu_coherency = 0;
5014 domain->iommu_snooping = 0;
5015 domain->iommu_superpage = 0;
5016 domain->max_addr = 0;
5018 /* always allocate the top pgd */
5019 domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
5022 domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
5026 static struct iommu_domain *intel_iommu_domain_alloc(unsigned type)
5028 struct dmar_domain *dmar_domain;
5029 struct iommu_domain *domain;
5032 case IOMMU_DOMAIN_DMA:
5034 case IOMMU_DOMAIN_UNMANAGED:
5035 dmar_domain = alloc_domain(0);
5037 pr_err("Can't allocate dmar_domain\n");
5040 if (md_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
5041 pr_err("Domain initialization failed\n");
5042 domain_exit(dmar_domain);
5046 if (type == IOMMU_DOMAIN_DMA &&
5047 init_iova_flush_queue(&dmar_domain->iovad,
5048 iommu_flush_iova, iova_entry_free)) {
5049 pr_warn("iova flush queue initialization failed\n");
5050 intel_iommu_strict = 1;
5053 domain_update_iommu_cap(dmar_domain);
5055 domain = &dmar_domain->domain;
5056 domain->geometry.aperture_start = 0;
5057 domain->geometry.aperture_end =
5058 __DOMAIN_MAX_ADDR(dmar_domain->gaw);
5059 domain->geometry.force_aperture = true;
5062 case IOMMU_DOMAIN_IDENTITY:
5063 return &si_domain->domain;
5071 static void intel_iommu_domain_free(struct iommu_domain *domain)
5073 if (domain != &si_domain->domain)
5074 domain_exit(to_dmar_domain(domain));
5078 * Check whether a @domain could be attached to the @dev through the
5079 * aux-domain attach/detach APIs.
5082 is_aux_domain(struct device *dev, struct iommu_domain *domain)
5084 struct device_domain_info *info = dev->archdata.iommu;
5086 return info && info->auxd_enabled &&
5087 domain->type == IOMMU_DOMAIN_UNMANAGED;
5090 static void auxiliary_link_device(struct dmar_domain *domain,
5093 struct device_domain_info *info = dev->archdata.iommu;
5095 assert_spin_locked(&device_domain_lock);
5099 domain->auxd_refcnt++;
5100 list_add(&domain->auxd, &info->auxiliary_domains);
5103 static void auxiliary_unlink_device(struct dmar_domain *domain,
5106 struct device_domain_info *info = dev->archdata.iommu;
5108 assert_spin_locked(&device_domain_lock);
5112 list_del(&domain->auxd);
5113 domain->auxd_refcnt--;
5115 if (!domain->auxd_refcnt && domain->default_pasid > 0)
5116 intel_pasid_free_id(domain->default_pasid);
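/*
 * Attach @domain to @dev as an auxiliary domain: allocate a default PASID
 * for the domain if it does not have one yet, install a second-level
 * PASID table entry on the device's IOMMU and link the domain into the
 * device's list of auxiliary domains.
 */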
5119 static int aux_domain_add_dev(struct dmar_domain *domain,
5124 unsigned long flags;
5125 struct intel_iommu *iommu;
5127 iommu = device_to_iommu(dev, &bus, &devfn);
5131 if (domain->default_pasid <= 0) {
5134 pasid = intel_pasid_alloc_id(domain, PASID_MIN,
5135 pci_max_pasids(to_pci_dev(dev)),
5138 pr_err("Can't allocate default pasid\n");
5141 domain->default_pasid = pasid;
5144 spin_lock_irqsave(&device_domain_lock, flags);
5146 * iommu->lock must be held to attach domain to iommu and setup the
5147 * pasid entry for second level translation.
5149 spin_lock(&iommu->lock);
5150 ret = domain_attach_iommu(domain, iommu);
5154 /* Set up the PASID entry for mediated devices: */
5155 ret = intel_pasid_setup_second_level(iommu, domain, dev,
5156 domain->default_pasid);
5159 spin_unlock(&iommu->lock);
5161 auxiliary_link_device(domain, dev);
5163 spin_unlock_irqrestore(&device_domain_lock, flags);
5168 domain_detach_iommu(domain, iommu);
5170 spin_unlock(&iommu->lock);
5171 spin_unlock_irqrestore(&device_domain_lock, flags);
5172 if (!domain->auxd_refcnt && domain->default_pasid > 0)
5173 intel_pasid_free_id(domain->default_pasid);
5178 static void aux_domain_remove_dev(struct dmar_domain *domain,
5181 struct device_domain_info *info;
5182 struct intel_iommu *iommu;
5183 unsigned long flags;
5185 if (!is_aux_domain(dev, &domain->domain))
5188 spin_lock_irqsave(&device_domain_lock, flags);
5189 info = dev->archdata.iommu;
5190 iommu = info->iommu;
5192 auxiliary_unlink_device(domain, dev);
5194 spin_lock(&iommu->lock);
5195 intel_pasid_tear_down_entry(iommu, dev, domain->default_pasid);
5196 domain_detach_iommu(domain, iommu);
5197 spin_unlock(&iommu->lock);
5199 spin_unlock_irqrestore(&device_domain_lock, flags);
5202 static int prepare_domain_attach_device(struct iommu_domain *domain,
5205 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5206 struct intel_iommu *iommu;
5210 iommu = device_to_iommu(dev, &bus, &devfn);
5214 /* check if this iommu agaw is sufficient for max mapped address */
5215 addr_width = agaw_to_width(iommu->agaw);
5216 if (addr_width > cap_mgaw(iommu->cap))
5217 addr_width = cap_mgaw(iommu->cap);
5219 if (dmar_domain->max_addr > (1LL << addr_width)) {
5220 dev_err(dev, "%s: iommu width (%d) is not "
5221 "sufficient for the mapped address (%llx)\n",
5222 __func__, addr_width, dmar_domain->max_addr);
5225 dmar_domain->gaw = addr_width;
5228 * Knock out extra levels of page tables if necessary
5230 while (iommu->agaw < dmar_domain->agaw) {
5231 struct dma_pte *pte;
5233 pte = dmar_domain->pgd;
5234 if (dma_pte_present(pte)) {
5235 dmar_domain->pgd = (struct dma_pte *)
5236 phys_to_virt(dma_pte_addr(pte));
5237 free_pgtable_page(pte);
5239 dmar_domain->agaw--;
5245 static int intel_iommu_attach_device(struct iommu_domain *domain,
5250 if (device_is_rmrr_locked(dev)) {
5251 dev_warn(dev, "Device is ineligible for IOMMU domain attach due to platform RMRR requirement. Contact your platform vendor.\n");
5255 if (is_aux_domain(dev, domain))
5258 /* normally dev is not mapped */
5259 if (unlikely(domain_context_mapped(dev))) {
5260 struct dmar_domain *old_domain;
5262 old_domain = find_domain(dev);
5264 dmar_remove_one_dev_info(dev);
5267 ret = prepare_domain_attach_device(domain, dev);
5271 return domain_add_dev_info(to_dmar_domain(domain), dev);
5274 static int intel_iommu_aux_attach_device(struct iommu_domain *domain,
5279 if (!is_aux_domain(dev, domain))
5282 ret = prepare_domain_attach_device(domain, dev);
5286 return aux_domain_add_dev(to_dmar_domain(domain), dev);
5289 static void intel_iommu_detach_device(struct iommu_domain *domain,
5292 dmar_remove_one_dev_info(dev);
5295 static void intel_iommu_aux_detach_device(struct iommu_domain *domain,
5298 aux_domain_remove_dev(to_dmar_domain(domain), dev);
5301 static int intel_iommu_map(struct iommu_domain *domain,
5302 unsigned long iova, phys_addr_t hpa,
5303 size_t size, int iommu_prot)
5305 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5310 if (iommu_prot & IOMMU_READ)
5311 prot |= DMA_PTE_READ;
5312 if (iommu_prot & IOMMU_WRITE)
5313 prot |= DMA_PTE_WRITE;
5314 if ((iommu_prot & IOMMU_CACHE) && dmar_domain->iommu_snooping)
5315 prot |= DMA_PTE_SNP;
5317 max_addr = iova + size;
5318 if (dmar_domain->max_addr < max_addr) {
5321 /* check if minimum agaw is sufficient for mapped address */
5322 end = __DOMAIN_MAX_ADDR(dmar_domain->gaw) + 1;
5323 if (end < max_addr) {
5324 pr_err("%s: iommu width (%d) is not "
5325 "sufficient for the mapped address (%llx)\n",
5326 __func__, dmar_domain->gaw, max_addr);
5329 dmar_domain->max_addr = max_addr;
5331 /* Round up size to next multiple of PAGE_SIZE, if it and
5332 the low bits of hpa would take us onto the next page */
5333 size = aligned_nrpages(hpa, size);
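/*
 * Illustrative example: hpa = 0x1ff0 with size = 0x20 crosses a 4KiB page
 * boundary, so aligned_nrpages() yields two pages even though the size
 * alone would fit in one.
 */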
5334 ret = domain_pfn_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT,
5335 hpa >> VTD_PAGE_SHIFT, size, prot);
5339 static size_t intel_iommu_unmap(struct iommu_domain *domain,
5340 unsigned long iova, size_t size)
5342 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5343 struct page *freelist = NULL;
5344 unsigned long start_pfn, last_pfn;
5345 unsigned int npages;
5346 int iommu_id, level = 0;
5348 /* Cope with horrid API which requires us to unmap more than the
5349 size argument if it happens to be a large-page mapping. */
5350 BUG_ON(!pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level));
5352 if (size < VTD_PAGE_SIZE << level_to_offset_bits(level))
5353 size = VTD_PAGE_SIZE << level_to_offset_bits(level);
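/*
 * E.g. an IOVA covered by a 2MiB superpage PTE (level 2) expands the
 * unmap to the full 2MiB so that the whole PTE is cleared.
 */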
5355 start_pfn = iova >> VTD_PAGE_SHIFT;
5356 last_pfn = (iova + size - 1) >> VTD_PAGE_SHIFT;
5358 freelist = domain_unmap(dmar_domain, start_pfn, last_pfn);
5360 npages = last_pfn - start_pfn + 1;
5362 for_each_domain_iommu(iommu_id, dmar_domain)
5363 iommu_flush_iotlb_psi(g_iommus[iommu_id], dmar_domain,
5364 start_pfn, npages, !freelist, 0);
5366 dma_free_pagelist(freelist);
5368 if (dmar_domain->max_addr == iova + size)
5369 dmar_domain->max_addr = iova;
5374 static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
5377 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5378 struct dma_pte *pte;
5382 pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level);
5384 phys = dma_pte_addr(pte);
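/*
 * True only if every active DMAR unit advertises scalable mode; a single
 * legacy-only unit disables scalable mode for the whole system.
 */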
5389 static inline bool scalable_mode_support(void)
5391 struct dmar_drhd_unit *drhd;
5392 struct intel_iommu *iommu;
5396 for_each_active_iommu(iommu, drhd) {
5397 if (!sm_supported(iommu)) {
5407 static inline bool iommu_pasid_support(void)
5409 struct dmar_drhd_unit *drhd;
5410 struct intel_iommu *iommu;
5414 for_each_active_iommu(iommu, drhd) {
5415 if (!pasid_supported(iommu)) {
5425 static bool intel_iommu_capable(enum iommu_cap cap)
5427 if (cap == IOMMU_CAP_CACHE_COHERENCY)
5428 return domain_update_iommu_snooping(NULL) == 1;
5429 if (cap == IOMMU_CAP_INTR_REMAP)
5430 return irq_remapping_enabled == 1;
5435 static int intel_iommu_add_device(struct device *dev)
5437 struct intel_iommu *iommu;
5438 struct iommu_group *group;
5441 iommu = device_to_iommu(dev, &bus, &devfn);
5445 iommu_device_link(&iommu->iommu, dev);
5447 group = iommu_group_get_for_dev(dev);
5450 return PTR_ERR(group);
5452 iommu_group_put(group);
5456 static void intel_iommu_remove_device(struct device *dev)
5458 struct intel_iommu *iommu;
5461 iommu = device_to_iommu(dev, &bus, &devfn);
5465 iommu_group_remove_device(dev);
5467 iommu_device_unlink(&iommu->iommu, dev);
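/*
 * Build the list of reserved regions for @device: any RMRR that names the
 * device, a 0-16MiB direct-map window for ISA bridges when the floppy
 * workaround is enabled, and the 0xfee00000 MSI/IOAPIC range.
 */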
5470 static void intel_iommu_get_resv_regions(struct device *device,
5471 struct list_head *head)
5473 struct iommu_resv_region *reg;
5474 struct dmar_rmrr_unit *rmrr;
5475 struct device *i_dev;
5479 for_each_rmrr_units(rmrr) {
5480 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
5482 if (i_dev != device)
5485 list_add_tail(&rmrr->resv->list, head);
5490 #ifdef CONFIG_INTEL_IOMMU_FLOPPY_WA
5491 if (dev_is_pci(device)) {
5492 struct pci_dev *pdev = to_pci_dev(device);
5494 if ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA) {
5495 reg = iommu_alloc_resv_region(0, 1UL << 24, 0,
5498 list_add_tail(&reg->list, head);
5501 #endif /* CONFIG_INTEL_IOMMU_FLOPPY_WA */
5503 reg = iommu_alloc_resv_region(IOAPIC_RANGE_START,
5504 IOAPIC_RANGE_END - IOAPIC_RANGE_START + 1,
5508 list_add_tail(&reg->list, head);
5511 static void intel_iommu_put_resv_regions(struct device *dev,
5512 struct list_head *head)
5514 struct iommu_resv_region *entry, *next;
5516 list_for_each_entry_safe(entry, next, head, list) {
5517 if (entry->type == IOMMU_RESV_MSI)
5522 int intel_iommu_enable_pasid(struct intel_iommu *iommu, struct device *dev)
5524 struct device_domain_info *info;
5525 struct context_entry *context;
5526 struct dmar_domain *domain;
5527 unsigned long flags;
5531 domain = get_valid_domain_for_dev(dev);
5535 spin_lock_irqsave(&device_domain_lock, flags);
5536 spin_lock(&iommu->lock);
5539 info = dev->archdata.iommu;
5540 if (!info || !info->pasid_supported)
5543 context = iommu_context_addr(iommu, info->bus, info->devfn, 0);
5544 if (WARN_ON(!context))
5547 ctx_lo = context[0].lo;
5549 if (!(ctx_lo & CONTEXT_PASIDE)) {
5550 ctx_lo |= CONTEXT_PASIDE;
5551 context[0].lo = ctx_lo;
5553 iommu->flush.flush_context(iommu,
5554 domain->iommu_did[iommu->seq_id],
5555 PCI_DEVID(info->bus, info->devfn),
5556 DMA_CCMD_MASK_NOBIT,
5557 DMA_CCMD_DEVICE_INVL);
5560 /* Enable PASID support in the device, if it wasn't already */
5561 if (!info->pasid_enabled)
5562 iommu_enable_dev_iotlb(info);
5567 spin_unlock(&iommu->lock);
5568 spin_unlock_irqrestore(&device_domain_lock, flags);
5573 static void intel_iommu_apply_resv_region(struct device *dev,
5574 struct iommu_domain *domain,
5575 struct iommu_resv_region *region)
5577 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5578 unsigned long start, end;
5580 start = IOVA_PFN(region->start);
5581 end = IOVA_PFN(region->start + region->length - 1);
5583 WARN_ON_ONCE(!reserve_iova(&dmar_domain->iovad, start, end));
5586 #ifdef CONFIG_INTEL_IOMMU_SVM
5587 struct intel_iommu *intel_svm_device_to_iommu(struct device *dev)
5589 struct intel_iommu *iommu;
5592 if (iommu_dummy(dev)) {
5594 "No IOMMU translation for device; cannot enable SVM\n");
5598 iommu = device_to_iommu(dev, &bus, &devfn);
5600 dev_err(dev, "No IOMMU for device; cannot enable SVM\n");
5606 #endif /* CONFIG_INTEL_IOMMU_SVM */
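/*
 * Enable auxiliary domain (PASID-based) support for @dev. This requires a
 * scalable-mode IOMMU with PASID support; on success the per-device
 * auxd_enabled flag is set under device_domain_lock.
 */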
5608 static int intel_iommu_enable_auxd(struct device *dev)
5610 struct device_domain_info *info;
5611 struct intel_iommu *iommu;
5612 unsigned long flags;
5616 iommu = device_to_iommu(dev, &bus, &devfn);
5617 if (!iommu || dmar_disabled)
5620 if (!sm_supported(iommu) || !pasid_supported(iommu))
5623 ret = intel_iommu_enable_pasid(iommu, dev);
5627 spin_lock_irqsave(&device_domain_lock, flags);
5628 info = dev->archdata.iommu;
5629 info->auxd_enabled = 1;
5630 spin_unlock_irqrestore(&device_domain_lock, flags);
5635 static int intel_iommu_disable_auxd(struct device *dev)
5637 struct device_domain_info *info;
5638 unsigned long flags;
5640 spin_lock_irqsave(&device_domain_lock, flags);
5641 info = dev->archdata.iommu;
5642 if (!WARN_ON(!info))
5643 info->auxd_enabled = 0;
5644 spin_unlock_irqrestore(&device_domain_lock, flags);
5650 * A PCI Express designated vendor-specific extended capability (DVSEC) is defined
5651 * in section 3.7 of the Intel Scalable I/O Virtualization technical spec
5652 * for system software and tools to detect endpoint devices supporting
5653 * Intel Scalable I/O Virtualization without a host driver dependency.
5655 * Returns the address of the matching extended capability structure within
5656 * the device's PCI configuration space or 0 if the device does not support
5657 * it.
5659 static int siov_find_pci_dvsec(struct pci_dev *pdev)
5664 pos = pci_find_next_ext_capability(pdev, 0, 0x23);
5666 pci_read_config_word(pdev, pos + 4, &vendor);
5667 pci_read_config_word(pdev, pos + 8, &id);
5668 if (vendor == PCI_VENDOR_ID_INTEL && id == 5)
5671 pos = pci_find_next_ext_capability(pdev, pos, 0x23);
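/*
 * intel_iommu_dev_has_feat() below reports IOMMU_DEV_FEAT_AUX only for PCI
 * devices with working PASID support behind scalable-mode IOMMUs that also
 * expose the Scalable IOV DVSEC located by siov_find_pci_dvsec() above.
 */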
5678 intel_iommu_dev_has_feat(struct device *dev, enum iommu_dev_features feat)
5680 if (feat == IOMMU_DEV_FEAT_AUX) {
5683 if (!dev_is_pci(dev) || dmar_disabled ||
5684 !scalable_mode_support() || !iommu_pasid_support())
5687 ret = pci_pasid_features(to_pci_dev(dev));
5691 return !!siov_find_pci_dvsec(to_pci_dev(dev));
5698 intel_iommu_dev_enable_feat(struct device *dev, enum iommu_dev_features feat)
5700 if (feat == IOMMU_DEV_FEAT_AUX)
5701 return intel_iommu_enable_auxd(dev);
5707 intel_iommu_dev_disable_feat(struct device *dev, enum iommu_dev_features feat)
5709 if (feat == IOMMU_DEV_FEAT_AUX)
5710 return intel_iommu_disable_auxd(dev);
5716 intel_iommu_dev_feat_enabled(struct device *dev, enum iommu_dev_features feat)
5718 struct device_domain_info *info = dev->archdata.iommu;
5720 if (feat == IOMMU_DEV_FEAT_AUX)
5721 return scalable_mode_support() && info && info->auxd_enabled;
5727 intel_iommu_aux_get_pasid(struct iommu_domain *domain, struct device *dev)
5729 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5731 return dmar_domain->default_pasid > 0 ?
5732 dmar_domain->default_pasid : -EINVAL;
5735 const struct iommu_ops intel_iommu_ops = {
5736 .capable = intel_iommu_capable,
5737 .domain_alloc = intel_iommu_domain_alloc,
5738 .domain_free = intel_iommu_domain_free,
5739 .attach_dev = intel_iommu_attach_device,
5740 .detach_dev = intel_iommu_detach_device,
5741 .aux_attach_dev = intel_iommu_aux_attach_device,
5742 .aux_detach_dev = intel_iommu_aux_detach_device,
5743 .aux_get_pasid = intel_iommu_aux_get_pasid,
5744 .map = intel_iommu_map,
5745 .unmap = intel_iommu_unmap,
5746 .iova_to_phys = intel_iommu_iova_to_phys,
5747 .add_device = intel_iommu_add_device,
5748 .remove_device = intel_iommu_remove_device,
5749 .get_resv_regions = intel_iommu_get_resv_regions,
5750 .put_resv_regions = intel_iommu_put_resv_regions,
5751 .apply_resv_region = intel_iommu_apply_resv_region,
5752 .device_group = pci_device_group,
5753 .dev_has_feat = intel_iommu_dev_has_feat,
5754 .dev_feat_enabled = intel_iommu_dev_feat_enabled,
5755 .dev_enable_feat = intel_iommu_dev_enable_feat,
5756 .dev_disable_feat = intel_iommu_dev_disable_feat,
5757 .pgsize_bitmap = INTEL_IOMMU_PGSIZES,
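/*
 * These callbacks are invoked by the generic IOMMU core; they take effect
 * once the ops are registered for the PCI bus via bus_set_iommu() in
 * intel_iommu_init() above.
 */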
5760 static void quirk_iommu_g4x_gfx(struct pci_dev *dev)
5762 /* G4x/GM45 integrated gfx dmar support is totally busted. */
5763 pci_info(dev, "Disabling IOMMU for graphics on this chipset\n");
5767 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_g4x_gfx);
5768 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_g4x_gfx);
5769 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_g4x_gfx);
5770 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_g4x_gfx);
5771 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_g4x_gfx);
5772 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_g4x_gfx);
5773 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_g4x_gfx);
5775 static void quirk_iommu_rwbf(struct pci_dev *dev)
5778 * Mobile 4 Series Chipset neglects to set RWBF capability,
5779 * but needs it. Same seems to hold for the desktop versions.
5781 pci_info(dev, "Forcing write-buffer flush capability\n");
5785 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf);
5786 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_rwbf);
5787 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_rwbf);
5788 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_rwbf);
5789 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_rwbf);
5790 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_rwbf);
5791 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_rwbf);
5794 #define GGC_MEMORY_SIZE_MASK (0xf << 8)
5795 #define GGC_MEMORY_SIZE_NONE (0x0 << 8)
5796 #define GGC_MEMORY_SIZE_1M (0x1 << 8)
5797 #define GGC_MEMORY_SIZE_2M (0x3 << 8)
5798 #define GGC_MEMORY_VT_ENABLED (0x8 << 8)
5799 #define GGC_MEMORY_SIZE_2M_VT (0x9 << 8)
5800 #define GGC_MEMORY_SIZE_3M_VT (0xa << 8)
5801 #define GGC_MEMORY_SIZE_4M_VT (0xb << 8)
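/*
 * The GGC macros above decode the GMCH graphics control register: bits
 * 11:8 report how much stolen memory the BIOS assigned for the graphics
 * translation table and whether a VT-enabled allocation was made, which
 * is what the Calpella quirk below checks for.
 */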
5803 static void quirk_calpella_no_shadow_gtt(struct pci_dev *dev)
5807 if (pci_read_config_word(dev, GGC, &ggc))
5810 if (!(ggc & GGC_MEMORY_VT_ENABLED)) {
5811 pci_info(dev, "BIOS has allocated no shadow GTT; disabling IOMMU for graphics\n");
5813 } else if (dmar_map_gfx) {
5814 /* we have to ensure the gfx device is idle before we flush */
5815 pci_info(dev, "Disabling batched IOTLB flush on Ironlake\n");
5816 intel_iommu_strict = 1;
5819 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0040, quirk_calpella_no_shadow_gtt);
5820 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0044, quirk_calpella_no_shadow_gtt);
5821 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0062, quirk_calpella_no_shadow_gtt);
5822 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x006a, quirk_calpella_no_shadow_gtt);
5824 /* On Tylersburg chipsets, some BIOSes have been known to enable the
5825 ISOCH DMAR unit for the Azalia sound device, but not give it any
5826 TLB entries, which causes it to deadlock. Check for that. We do
5827 this in a function called from init_dmars(), instead of in a PCI
5828 quirk, because we don't want to print the obnoxious "BIOS broken"
5829 message if VT-d is actually disabled.
5831 static void __init check_tylersburg_isoch(void)
5833 struct pci_dev *pdev;
5834 uint32_t vtisochctrl;
5836 /* If there's no Azalia in the system anyway, forget it. */
5837 pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3a3e, NULL);
5842 /* System Management Registers. Might be hidden, in which case
5843 we can't do the sanity check. But that's OK, because the
5844 known-broken BIOSes _don't_ actually hide it, so far. */
5845 pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x342e, NULL);
5849 if (pci_read_config_dword(pdev, 0x188, &vtisochctrl)) {
5856 /* If Azalia DMA is routed to the non-isoch DMAR unit, fine. */
5857 if (vtisochctrl & 1)
5860 /* Drop all bits other than the number of TLB entries */
5861 vtisochctrl &= 0x1c;
5863 /* If we have the recommended number of TLB entries (16), fine. */
5864 if (vtisochctrl == 0x10)
5867 /* Zero TLB entries? You get to ride the short bus to school. */
5869 WARN(1, "Your BIOS is broken; DMA routed to ISOCH DMAR unit but no TLB space.\n"
5870 "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
5871 dmi_get_system_info(DMI_BIOS_VENDOR),
5872 dmi_get_system_info(DMI_BIOS_VERSION),
5873 dmi_get_system_info(DMI_PRODUCT_VERSION));
5874 iommu_identity_mapping |= IDENTMAP_AZALIA;
5878 pr_warn("Recommended TLB entries for ISOCH unit is 16; your BIOS set %d\n",