2 * Copyright © 2006-2014 Intel Corporation.
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms and conditions of the GNU General Public License,
6 * version 2, as published by the Free Software Foundation.
8 * This program is distributed in the hope it will be useful, but WITHOUT
9 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
10 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
13 * Authors: David Woodhouse <dwmw2@infradead.org>,
14 * Ashok Raj <ashok.raj@intel.com>,
15 * Shaohua Li <shaohua.li@intel.com>,
16 * Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>,
17 * Fenghua Yu <fenghua.yu@intel.com>
18 * Joerg Roedel <jroedel@suse.de>
21 #define pr_fmt(fmt) "DMAR: " fmt
22 #define dev_fmt(fmt) pr_fmt(fmt)
24 #include <linux/init.h>
25 #include <linux/bitmap.h>
26 #include <linux/debugfs.h>
27 #include <linux/export.h>
28 #include <linux/slab.h>
29 #include <linux/irq.h>
30 #include <linux/interrupt.h>
31 #include <linux/spinlock.h>
32 #include <linux/pci.h>
33 #include <linux/dmar.h>
34 #include <linux/dma-mapping.h>
35 #include <linux/mempool.h>
36 #include <linux/memory.h>
37 #include <linux/cpu.h>
38 #include <linux/timer.h>
40 #include <linux/iova.h>
41 #include <linux/iommu.h>
42 #include <linux/intel-iommu.h>
43 #include <linux/syscore_ops.h>
44 #include <linux/tboot.h>
45 #include <linux/dmi.h>
46 #include <linux/pci-ats.h>
47 #include <linux/memblock.h>
48 #include <linux/dma-contiguous.h>
49 #include <linux/dma-direct.h>
50 #include <linux/crash_dump.h>
51 #include <linux/numa.h>
52 #include <asm/irq_remapping.h>
53 #include <asm/cacheflush.h>
54 #include <asm/iommu.h>
56 #include "irq_remapping.h"
57 #include "intel-pasid.h"
59 #define ROOT_SIZE VTD_PAGE_SIZE
60 #define CONTEXT_SIZE VTD_PAGE_SIZE
62 #define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
63 #define IS_USB_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_SERIAL_USB)
64 #define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
65 #define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e)
67 #define IOAPIC_RANGE_START (0xfee00000)
68 #define IOAPIC_RANGE_END (0xfeefffff)
69 #define IOVA_START_ADDR (0x1000)
71 #define DEFAULT_DOMAIN_ADDRESS_WIDTH 57
73 #define MAX_AGAW_WIDTH 64
74 #define MAX_AGAW_PFN_WIDTH (MAX_AGAW_WIDTH - VTD_PAGE_SHIFT)
76 #define __DOMAIN_MAX_PFN(gaw) ((((uint64_t)1) << (gaw-VTD_PAGE_SHIFT)) - 1)
77 #define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << gaw) - 1)
79 /* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR
80 to match. That way, we can use 'unsigned long' for PFNs with impunity. */
81 #define DOMAIN_MAX_PFN(gaw) ((unsigned long) min_t(uint64_t, \
82 __DOMAIN_MAX_PFN(gaw), (unsigned long)-1))
83 #define DOMAIN_MAX_ADDR(gaw) (((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT)
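/*
 * Worked example for the limits above (illustrative, not from the
 * original source): with a 48-bit guest address width,
 * __DOMAIN_MAX_PFN(48) = 2^(48 - 12) - 1 = 0xFFFFFFFFF, i.e. the last
 * addressable 4KiB page frame, and DOMAIN_MAX_ADDR(48) = 0xFFFFFFFFF000.
 */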
85 /* IO virtual address start page frame number */
86 #define IOVA_START_PFN (1)
88 #define IOVA_PFN(addr) ((addr) >> PAGE_SHIFT)
90 /* page table handling */
91 #define LEVEL_STRIDE (9)
92 #define LEVEL_MASK (((u64)1 << LEVEL_STRIDE) - 1)
95 * This bitmap is used to advertise the page sizes our hardware supports
96 * to the IOMMU core, which will then use this information to split
97 * physically contiguous memory regions it is mapping into page sizes that we support.
100 * Traditionally the IOMMU core just handed us the mappings directly,
101 * after making sure the size is an order of a 4KiB page and that the
102 * mapping has natural alignment.
104 * To retain this behavior, we currently advertise that we support
105 * all page sizes that are an order of 4KiB.
107 * If at some point we'd like to utilize the IOMMU core's new behavior,
108 * we could change this to advertise the real page sizes we support.
110 #define INTEL_IOMMU_PGSIZES (~0xFFFUL)
112 static inline int agaw_to_level(int agaw)
117 static inline int agaw_to_width(int agaw)
119 return min_t(int, 30 + agaw * LEVEL_STRIDE, MAX_AGAW_WIDTH);
122 static inline int width_to_agaw(int width)
124 return DIV_ROUND_UP(width - 30, LEVEL_STRIDE);
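/*
 * Quick reference for the AGAW helpers above: each AGAW step adds one
 * page-table level resolving LEVEL_STRIDE (9) more address bits on top
 * of the 30 bits of the smallest layout (12-bit page offset plus two
 * 9-bit levels), so agaw 1 corresponds to a 39-bit width, agaw 2 to
 * 48 bits and agaw 3 to 57 bits.
 */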
127 static inline unsigned int level_to_offset_bits(int level)
129 return (level - 1) * LEVEL_STRIDE;
132 static inline int pfn_level_offset(unsigned long pfn, int level)
134 return (pfn >> level_to_offset_bits(level)) & LEVEL_MASK;
137 static inline unsigned long level_mask(int level)
139 return -1UL << level_to_offset_bits(level);
142 static inline unsigned long level_size(int level)
144 return 1UL << level_to_offset_bits(level);
147 static inline unsigned long align_to_level(unsigned long pfn, int level)
149 return (pfn + level_size(level) - 1) & level_mask(level);
152 static inline unsigned long lvl_to_nr_pages(unsigned int lvl)
154 return 1 << min_t(int, (lvl - 1) * LEVEL_STRIDE, MAX_AGAW_PFN_WIDTH);
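/*
 * For reference, lvl_to_nr_pages() above gives the span of one entry at
 * each level: level 1 entries map single 4KiB pages, level 2 entries
 * map 512 pages (2MiB superpages) and level 3 entries map 2^18 pages
 * (1GiB superpages).
 */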
157 /* VT-d pages must always be _smaller_ than MM pages. Otherwise things
158 are never going to work. */
159 static inline unsigned long dma_to_mm_pfn(unsigned long dma_pfn)
161 return dma_pfn >> (PAGE_SHIFT - VTD_PAGE_SHIFT);
164 static inline unsigned long mm_to_dma_pfn(unsigned long mm_pfn)
166 return mm_pfn << (PAGE_SHIFT - VTD_PAGE_SHIFT);
168 static inline unsigned long page_to_dma_pfn(struct page *pg)
170 return mm_to_dma_pfn(page_to_pfn(pg));
172 static inline unsigned long virt_to_dma_pfn(void *p)
174 return page_to_dma_pfn(virt_to_page(p));
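/*
 * The pfn conversions above are only non-trivial when PAGE_SHIFT is
 * larger than VTD_PAGE_SHIFT, i.e. when CPU pages are bigger than the
 * 4KiB VT-d page; on x86 with 4KiB pages both shifts are 12 and MM and
 * DMA pfns are identical.
 */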
177 /* global iommu list, set NULL for ignored DMAR units */
178 static struct intel_iommu **g_iommus;
180 static void __init check_tylersburg_isoch(void);
181 static int rwbf_quirk;
184 * set to 1 to panic the kernel if VT-d can't be successfully enabled
185 * (used when kernel is launched w/ TXT)
187 static int force_on = 0;
188 int intel_iommu_tboot_noforce;
189 static int no_platform_optin;
191 #define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
194 * Take a root_entry and return the Lower Context Table Pointer (LCTP)
197 static phys_addr_t root_entry_lctp(struct root_entry *re)
202 return re->lo & VTD_PAGE_MASK;
206 * Take a root_entry and return the Upper Context Table Pointer (UCTP)
209 static phys_addr_t root_entry_uctp(struct root_entry *re)
214 return re->hi & VTD_PAGE_MASK;
217 static inline void context_clear_pasid_enable(struct context_entry *context)
219 context->lo &= ~(1ULL << 11);
222 static inline bool context_pasid_enabled(struct context_entry *context)
224 return !!(context->lo & (1ULL << 11));
227 static inline void context_set_copied(struct context_entry *context)
229 context->hi |= (1ull << 3);
232 static inline bool context_copied(struct context_entry *context)
234 return !!(context->hi & (1ULL << 3));
237 static inline bool __context_present(struct context_entry *context)
239 return (context->lo & 1);
242 bool context_present(struct context_entry *context)
244 return context_pasid_enabled(context) ?
245 __context_present(context) :
246 __context_present(context) && !context_copied(context);
249 static inline void context_set_present(struct context_entry *context)
254 static inline void context_set_fault_enable(struct context_entry *context)
256 context->lo &= (((u64)-1) << 2) | 1;
259 static inline void context_set_translation_type(struct context_entry *context,
262 context->lo &= (((u64)-1) << 4) | 3;
263 context->lo |= (value & 3) << 2;
266 static inline void context_set_address_root(struct context_entry *context,
269 context->lo &= ~VTD_PAGE_MASK;
270 context->lo |= value & VTD_PAGE_MASK;
273 static inline void context_set_address_width(struct context_entry *context,
276 context->hi |= value & 7;
279 static inline void context_set_domain_id(struct context_entry *context,
282 context->hi |= (value & ((1 << 16) - 1)) << 8;
285 static inline int context_domain_id(struct context_entry *c)
287 return((c->hi >> 8) & 0xffff);
290 static inline void context_clear_entry(struct context_entry *context)
297 * This domain is a statically identity mapping domain.
298 * 1. This domain creates a static 1:1 mapping to all usable memory.
299 * 2. It maps to each iommu if successful.
300 * 3. Each iommu maps to this domain if successful.
302 static struct dmar_domain *si_domain;
303 static int hw_pass_through = 1;
306 * Domain represents a virtual machine; more than one device
307 * across iommus may be owned by one domain, e.g. a kvm guest.
309 #define DOMAIN_FLAG_VIRTUAL_MACHINE (1 << 0)
311 /* si_domain contains multiple devices */
312 #define DOMAIN_FLAG_STATIC_IDENTITY (1 << 1)
314 #define for_each_domain_iommu(idx, domain) \
315 for (idx = 0; idx < g_num_of_iommus; idx++) \
316 if (domain->iommu_refcnt[idx])
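/*
 * Illustrative use of the iterator above (a sketch mirroring later
 * callers in this file; do_something() is a placeholder, not a real
 * function):
 *
 *	for_each_domain_iommu(idx, domain)
 *		do_something(g_iommus[idx], domain->iommu_did[idx]);
 *
 * i.e. visit every IOMMU that currently holds a reference on @domain.
 */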
318 struct dmar_rmrr_unit {
319 struct list_head list; /* list of rmrr units */
320 struct acpi_dmar_header *hdr; /* ACPI header */
321 u64 base_address; /* reserved base address*/
322 u64 end_address; /* reserved end address */
323 struct dmar_dev_scope *devices; /* target devices */
324 int devices_cnt; /* target device count */
325 struct iommu_resv_region *resv; /* reserved region handle */
328 struct dmar_atsr_unit {
329 struct list_head list; /* list of ATSR units */
330 struct acpi_dmar_header *hdr; /* ACPI header */
331 struct dmar_dev_scope *devices; /* target devices */
332 int devices_cnt; /* target device count */
333 u8 include_all:1; /* include all ports */
336 static LIST_HEAD(dmar_atsr_units);
337 static LIST_HEAD(dmar_rmrr_units);
339 #define for_each_rmrr_units(rmrr) \
340 list_for_each_entry(rmrr, &dmar_rmrr_units, list)
342 /* bitmap for indexing intel_iommus */
343 static int g_num_of_iommus;
345 static void domain_exit(struct dmar_domain *domain);
346 static void domain_remove_dev_info(struct dmar_domain *domain);
347 static void dmar_remove_one_dev_info(struct device *dev);
348 static void __dmar_remove_one_dev_info(struct device_domain_info *info);
349 static void domain_context_clear(struct intel_iommu *iommu,
351 static int domain_detach_iommu(struct dmar_domain *domain,
352 struct intel_iommu *iommu);
354 #ifdef CONFIG_INTEL_IOMMU_DEFAULT_ON
355 int dmar_disabled = 0;
357 int dmar_disabled = 1;
358 #endif /*CONFIG_INTEL_IOMMU_DEFAULT_ON*/
360 int intel_iommu_enabled = 0;
361 EXPORT_SYMBOL_GPL(intel_iommu_enabled);
363 static int dmar_map_gfx = 1;
364 static int dmar_forcedac;
365 static int intel_iommu_strict;
366 static int intel_iommu_superpage = 1;
367 static int intel_iommu_sm;
368 static int iommu_identity_mapping;
370 #define IDENTMAP_ALL 1
371 #define IDENTMAP_GFX 2
372 #define IDENTMAP_AZALIA 4
374 #define sm_supported(iommu) (intel_iommu_sm && ecap_smts((iommu)->ecap))
375 #define pasid_supported(iommu) (sm_supported(iommu) && \
376 ecap_pasid((iommu)->ecap))
378 int intel_iommu_gfx_mapped;
379 EXPORT_SYMBOL_GPL(intel_iommu_gfx_mapped);
381 #define DUMMY_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-1))
382 static DEFINE_SPINLOCK(device_domain_lock);
383 static LIST_HEAD(device_domain_list);
386 * Iterate over elements in device_domain_list and call the specified
387 * callback @fn against each element.
389 int for_each_device_domain(int (*fn)(struct device_domain_info *info,
390 void *data), void *data)
394 struct device_domain_info *info;
396 spin_lock_irqsave(&device_domain_lock, flags);
397 list_for_each_entry(info, &device_domain_list, global) {
398 ret = fn(info, data);
400 spin_unlock_irqrestore(&device_domain_lock, flags);
404 spin_unlock_irqrestore(&device_domain_lock, flags);
409 const struct iommu_ops intel_iommu_ops;
411 static bool translation_pre_enabled(struct intel_iommu *iommu)
413 return (iommu->flags & VTD_FLAG_TRANS_PRE_ENABLED);
416 static void clear_translation_pre_enabled(struct intel_iommu *iommu)
418 iommu->flags &= ~VTD_FLAG_TRANS_PRE_ENABLED;
421 static void init_translation_status(struct intel_iommu *iommu)
425 gsts = readl(iommu->reg + DMAR_GSTS_REG);
426 if (gsts & DMA_GSTS_TES)
427 iommu->flags |= VTD_FLAG_TRANS_PRE_ENABLED;
430 /* Convert generic 'struct iommu_domain' to private 'struct dmar_domain' */
431 static struct dmar_domain *to_dmar_domain(struct iommu_domain *dom)
433 return container_of(dom, struct dmar_domain, domain);
436 static int __init intel_iommu_setup(char *str)
441 if (!strncmp(str, "on", 2)) {
443 pr_info("IOMMU enabled\n");
444 } else if (!strncmp(str, "off", 3)) {
446 no_platform_optin = 1;
447 pr_info("IOMMU disabled\n");
448 } else if (!strncmp(str, "igfx_off", 8)) {
450 pr_info("Disable GFX device mapping\n");
451 } else if (!strncmp(str, "forcedac", 8)) {
452 pr_info("Forcing DAC for PCI devices\n");
454 } else if (!strncmp(str, "strict", 6)) {
455 pr_info("Disable batched IOTLB flush\n");
456 intel_iommu_strict = 1;
457 } else if (!strncmp(str, "sp_off", 6)) {
458 pr_info("Disable supported super page\n");
459 intel_iommu_superpage = 0;
460 } else if (!strncmp(str, "sm_on", 5)) {
461 pr_info("Intel-IOMMU: scalable mode supported\n");
463 } else if (!strncmp(str, "tboot_noforce", 13)) {
465 "Intel-IOMMU: not forcing on after tboot. This could expose security risk for tboot\n");
466 intel_iommu_tboot_noforce = 1;
469 str += strcspn(str, ",");
475 __setup("intel_iommu=", intel_iommu_setup);
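/*
 * Example command line for the parser above (options are comma
 * separated, as handled by the strcspn() loop):
 *
 *	intel_iommu=on,sm_on,strict
 *
 * enables DMA remapping, turns on scalable mode and disables batched
 * IOTLB flushing.
 */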
477 static struct kmem_cache *iommu_domain_cache;
478 static struct kmem_cache *iommu_devinfo_cache;
480 static struct dmar_domain* get_iommu_domain(struct intel_iommu *iommu, u16 did)
482 struct dmar_domain **domains;
485 domains = iommu->domains[idx];
489 return domains[did & 0xff];
492 static void set_iommu_domain(struct intel_iommu *iommu, u16 did,
493 struct dmar_domain *domain)
495 struct dmar_domain **domains;
498 if (!iommu->domains[idx]) {
499 size_t size = 256 * sizeof(struct dmar_domain *);
500 iommu->domains[idx] = kzalloc(size, GFP_ATOMIC);
503 domains = iommu->domains[idx];
504 if (WARN_ON(!domains))
507 domains[did & 0xff] = domain;
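/*
 * get_iommu_domain()/set_iommu_domain() treat iommu->domains as a
 * two-level table: the high bits of the domain id select a lazily
 * allocated chunk of 256 pointers and (did & 0xff) indexes within that
 * chunk, so memory is only spent on chunks that actually contain live
 * domain ids.
 */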
510 void *alloc_pgtable_page(int node)
515 page = alloc_pages_node(node, GFP_ATOMIC | __GFP_ZERO, 0);
517 vaddr = page_address(page);
521 void free_pgtable_page(void *vaddr)
523 free_page((unsigned long)vaddr);
526 static inline void *alloc_domain_mem(void)
528 return kmem_cache_alloc(iommu_domain_cache, GFP_ATOMIC);
531 static void free_domain_mem(void *vaddr)
533 kmem_cache_free(iommu_domain_cache, vaddr);
536 static inline void * alloc_devinfo_mem(void)
538 return kmem_cache_alloc(iommu_devinfo_cache, GFP_ATOMIC);
541 static inline void free_devinfo_mem(void *vaddr)
543 kmem_cache_free(iommu_devinfo_cache, vaddr);
546 static inline int domain_type_is_vm(struct dmar_domain *domain)
548 return domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE;
551 static inline int domain_type_is_si(struct dmar_domain *domain)
553 return domain->flags & DOMAIN_FLAG_STATIC_IDENTITY;
556 static inline int domain_type_is_vm_or_si(struct dmar_domain *domain)
558 return domain->flags & (DOMAIN_FLAG_VIRTUAL_MACHINE |
559 DOMAIN_FLAG_STATIC_IDENTITY);
562 static inline int domain_pfn_supported(struct dmar_domain *domain,
565 int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
567 return !(addr_width < BITS_PER_LONG && pfn >> addr_width);
570 static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw)
575 sagaw = cap_sagaw(iommu->cap);
576 for (agaw = width_to_agaw(max_gaw);
578 if (test_bit(agaw, &sagaw))
586 * Calculate max SAGAW for each iommu.
588 int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
590 return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH);
594 * calculate agaw for each iommu.
595 * "SAGAW" may be different across iommus; use a default agaw, and
596 * fall back to a smaller supported agaw for iommus that don't support the default agaw.
598 int iommu_calculate_agaw(struct intel_iommu *iommu)
600 return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH);
603 /* This function only returns a single iommu in a domain */
604 struct intel_iommu *domain_get_iommu(struct dmar_domain *domain)
608 /* si_domain and vm domain should not get here. */
609 BUG_ON(domain_type_is_vm_or_si(domain));
610 for_each_domain_iommu(iommu_id, domain)
613 if (iommu_id < 0 || iommu_id >= g_num_of_iommus)
616 return g_iommus[iommu_id];
619 static void domain_update_iommu_coherency(struct dmar_domain *domain)
621 struct dmar_drhd_unit *drhd;
622 struct intel_iommu *iommu;
626 domain->iommu_coherency = 1;
628 for_each_domain_iommu(i, domain) {
630 if (!ecap_coherent(g_iommus[i]->ecap)) {
631 domain->iommu_coherency = 0;
638 /* No hardware attached; use lowest common denominator */
640 for_each_active_iommu(iommu, drhd) {
641 if (!ecap_coherent(iommu->ecap)) {
642 domain->iommu_coherency = 0;
649 static int domain_update_iommu_snooping(struct intel_iommu *skip)
651 struct dmar_drhd_unit *drhd;
652 struct intel_iommu *iommu;
656 for_each_active_iommu(iommu, drhd) {
658 if (!ecap_sc_support(iommu->ecap)) {
669 static int domain_update_iommu_superpage(struct intel_iommu *skip)
671 struct dmar_drhd_unit *drhd;
672 struct intel_iommu *iommu;
675 if (!intel_iommu_superpage) {
679 /* set iommu_superpage to the smallest common denominator */
681 for_each_active_iommu(iommu, drhd) {
683 mask &= cap_super_page_val(iommu->cap);
693 /* Some capabilities may be different across iommus */
694 static void domain_update_iommu_cap(struct dmar_domain *domain)
696 domain_update_iommu_coherency(domain);
697 domain->iommu_snooping = domain_update_iommu_snooping(NULL);
698 domain->iommu_superpage = domain_update_iommu_superpage(NULL);
701 struct context_entry *iommu_context_addr(struct intel_iommu *iommu, u8 bus,
704 struct root_entry *root = &iommu->root_entry[bus];
705 struct context_entry *context;
709 if (sm_supported(iommu)) {
717 context = phys_to_virt(*entry & VTD_PAGE_MASK);
719 unsigned long phy_addr;
723 context = alloc_pgtable_page(iommu->node);
727 __iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
728 phy_addr = virt_to_phys((void *)context);
729 *entry = phy_addr | 1;
730 __iommu_flush_cache(iommu, entry, sizeof(*entry));
732 return &context[devfn];
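/*
 * Table layout recap for iommu_context_addr(): the root table holds one
 * root_entry per bus number and each root entry points to a context
 * table indexed by devfn.  In scalable mode a root entry carries two
 * pointers (root_entry_lctp()/root_entry_uctp()), one context table for
 * devfn 0-127 and one for devfn 128-255.  Context tables are allocated
 * on demand when the last argument is non-zero, and are flushed to
 * memory before the root entry is updated to point at them.
 */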
735 static int iommu_dummy(struct device *dev)
737 return dev->archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO;
740 static struct intel_iommu *device_to_iommu(struct device *dev, u8 *bus, u8 *devfn)
742 struct dmar_drhd_unit *drhd = NULL;
743 struct intel_iommu *iommu;
745 struct pci_dev *ptmp, *pdev = NULL;
749 if (iommu_dummy(dev))
752 if (dev_is_pci(dev)) {
753 struct pci_dev *pf_pdev;
755 pdev = to_pci_dev(dev);
758 /* VMD child devices currently cannot be handled individually */
759 if (is_vmd(pdev->bus))
763 /* VFs aren't listed in scope tables; we need to look up
764 * the PF instead to find the IOMMU. */
765 pf_pdev = pci_physfn(pdev);
767 segment = pci_domain_nr(pdev->bus);
768 } else if (has_acpi_companion(dev))
769 dev = &ACPI_COMPANION(dev)->dev;
772 for_each_active_iommu(iommu, drhd) {
773 if (pdev && segment != drhd->segment)
776 for_each_active_dev_scope(drhd->devices,
777 drhd->devices_cnt, i, tmp) {
779 /* For a VF use its original BDF# not that of the PF
780 * which we used for the IOMMU lookup. Strictly speaking
781 * we could do this for all PCI devices; we only need to
782 * get the BDF# from the scope table for ACPI matches. */
783 if (pdev && pdev->is_virtfn)
786 *bus = drhd->devices[i].bus;
787 *devfn = drhd->devices[i].devfn;
791 if (!pdev || !dev_is_pci(tmp))
794 ptmp = to_pci_dev(tmp);
795 if (ptmp->subordinate &&
796 ptmp->subordinate->number <= pdev->bus->number &&
797 ptmp->subordinate->busn_res.end >= pdev->bus->number)
801 if (pdev && drhd->include_all) {
803 *bus = pdev->bus->number;
804 *devfn = pdev->devfn;
815 static void domain_flush_cache(struct dmar_domain *domain,
816 void *addr, int size)
818 if (!domain->iommu_coherency)
819 clflush_cache_range(addr, size);
822 static int device_context_mapped(struct intel_iommu *iommu, u8 bus, u8 devfn)
824 struct context_entry *context;
828 spin_lock_irqsave(&iommu->lock, flags);
829 context = iommu_context_addr(iommu, bus, devfn, 0);
831 ret = context_present(context);
832 spin_unlock_irqrestore(&iommu->lock, flags);
836 static void free_context_table(struct intel_iommu *iommu)
840 struct context_entry *context;
842 spin_lock_irqsave(&iommu->lock, flags);
843 if (!iommu->root_entry) {
846 for (i = 0; i < ROOT_ENTRY_NR; i++) {
847 context = iommu_context_addr(iommu, i, 0, 0);
849 free_pgtable_page(context);
851 if (!sm_supported(iommu))
854 context = iommu_context_addr(iommu, i, 0x80, 0);
856 free_pgtable_page(context);
859 free_pgtable_page(iommu->root_entry);
860 iommu->root_entry = NULL;
862 spin_unlock_irqrestore(&iommu->lock, flags);
865 static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
866 unsigned long pfn, int *target_level)
868 struct dma_pte *parent, *pte;
869 int level = agaw_to_level(domain->agaw);
872 BUG_ON(!domain->pgd);
874 if (!domain_pfn_supported(domain, pfn))
875 /* Address beyond IOMMU's addressing capabilities. */
878 parent = domain->pgd;
883 offset = pfn_level_offset(pfn, level);
884 pte = &parent[offset];
885 if (!*target_level && (dma_pte_superpage(pte) || !dma_pte_present(pte)))
887 if (level == *target_level)
890 if (!dma_pte_present(pte)) {
893 tmp_page = alloc_pgtable_page(domain->nid);
898 domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE);
899 pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE;
900 if (cmpxchg64(&pte->val, 0ULL, pteval))
901 /* Someone else set it while we were thinking; use theirs. */
902 free_pgtable_page(tmp_page);
904 domain_flush_cache(domain, pte, sizeof(*pte));
909 parent = phys_to_virt(dma_pte_addr(pte));
914 *target_level = level;
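/*
 * pfn_to_dma_pte() above walks, and where necessary allocates, the
 * page-table path for a DMA pfn.  With *target_level == 0 it stops at
 * whatever leaf already exists (including superpages); with a non-zero
 * *target_level it descends to exactly that level.  New intermediate
 * tables are installed with cmpxchg64() so a concurrent walker that
 * races to the same slot simply frees its duplicate page.
 */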
920 /* return address's pte at specific level */
921 static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain,
923 int level, int *large_page)
925 struct dma_pte *parent, *pte;
926 int total = agaw_to_level(domain->agaw);
929 parent = domain->pgd;
930 while (level <= total) {
931 offset = pfn_level_offset(pfn, total);
932 pte = &parent[offset];
936 if (!dma_pte_present(pte)) {
941 if (dma_pte_superpage(pte)) {
946 parent = phys_to_virt(dma_pte_addr(pte));
952 /* clear last level pte; a tlb flush should follow */
953 static void dma_pte_clear_range(struct dmar_domain *domain,
954 unsigned long start_pfn,
955 unsigned long last_pfn)
957 unsigned int large_page;
958 struct dma_pte *first_pte, *pte;
960 BUG_ON(!domain_pfn_supported(domain, start_pfn));
961 BUG_ON(!domain_pfn_supported(domain, last_pfn));
962 BUG_ON(start_pfn > last_pfn);
964 /* we don't need lock here; nobody else touches the iova range */
967 first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1, &large_page);
969 start_pfn = align_to_level(start_pfn + 1, large_page + 1);
974 start_pfn += lvl_to_nr_pages(large_page);
976 } while (start_pfn <= last_pfn && !first_pte_in_page(pte));
978 domain_flush_cache(domain, first_pte,
979 (void *)pte - (void *)first_pte);
981 } while (start_pfn && start_pfn <= last_pfn);
984 static void dma_pte_free_level(struct dmar_domain *domain, int level,
985 int retain_level, struct dma_pte *pte,
986 unsigned long pfn, unsigned long start_pfn,
987 unsigned long last_pfn)
989 pfn = max(start_pfn, pfn);
990 pte = &pte[pfn_level_offset(pfn, level)];
993 unsigned long level_pfn;
994 struct dma_pte *level_pte;
996 if (!dma_pte_present(pte) || dma_pte_superpage(pte))
999 level_pfn = pfn & level_mask(level);
1000 level_pte = phys_to_virt(dma_pte_addr(pte));
1003 dma_pte_free_level(domain, level - 1, retain_level,
1004 level_pte, level_pfn, start_pfn,
1009 * Free the page table if we're below the level we want to
1010 * retain and the range covers the entire table.
1012 if (level < retain_level && !(start_pfn > level_pfn ||
1013 last_pfn < level_pfn + level_size(level) - 1)) {
1015 domain_flush_cache(domain, pte, sizeof(*pte));
1016 free_pgtable_page(level_pte);
1019 pfn += level_size(level);
1020 } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1024 * clear last level (leaf) ptes and free page table pages below the
1025 * level we wish to keep intact.
1027 static void dma_pte_free_pagetable(struct dmar_domain *domain,
1028 unsigned long start_pfn,
1029 unsigned long last_pfn,
1032 BUG_ON(!domain_pfn_supported(domain, start_pfn));
1033 BUG_ON(!domain_pfn_supported(domain, last_pfn));
1034 BUG_ON(start_pfn > last_pfn);
1036 dma_pte_clear_range(domain, start_pfn, last_pfn);
1038 /* We don't need lock here; nobody else touches the iova range */
1039 dma_pte_free_level(domain, agaw_to_level(domain->agaw), retain_level,
1040 domain->pgd, 0, start_pfn, last_pfn);
1043 if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1044 free_pgtable_page(domain->pgd);
1049 /* When a page at a given level is being unlinked from its parent, we don't
1050 need to *modify* it at all. All we need to do is make a list of all the
1051 pages which can be freed just as soon as we've flushed the IOTLB and we
1052 know the hardware page-walk will no longer touch them.
1053 The 'pte' argument is the *parent* PTE, pointing to the page that is to be freed. */
1055 static struct page *dma_pte_list_pagetables(struct dmar_domain *domain,
1056 int level, struct dma_pte *pte,
1057 struct page *freelist)
1061 pg = pfn_to_page(dma_pte_addr(pte) >> PAGE_SHIFT);
1062 pg->freelist = freelist;
1068 pte = page_address(pg);
1070 if (dma_pte_present(pte) && !dma_pte_superpage(pte))
1071 freelist = dma_pte_list_pagetables(domain, level - 1,
1074 } while (!first_pte_in_page(pte));
1079 static struct page *dma_pte_clear_level(struct dmar_domain *domain, int level,
1080 struct dma_pte *pte, unsigned long pfn,
1081 unsigned long start_pfn,
1082 unsigned long last_pfn,
1083 struct page *freelist)
1085 struct dma_pte *first_pte = NULL, *last_pte = NULL;
1087 pfn = max(start_pfn, pfn);
1088 pte = &pte[pfn_level_offset(pfn, level)];
1091 unsigned long level_pfn;
1093 if (!dma_pte_present(pte))
1096 level_pfn = pfn & level_mask(level);
1098 /* If range covers entire pagetable, free it */
1099 if (start_pfn <= level_pfn &&
1100 last_pfn >= level_pfn + level_size(level) - 1) {
1101 /* These subordinate page tables are going away entirely. Don't
1102 bother to clear them; we're just going to *free* them. */
1103 if (level > 1 && !dma_pte_superpage(pte))
1104 freelist = dma_pte_list_pagetables(domain, level - 1, pte, freelist);
1110 } else if (level > 1) {
1111 /* Recurse down into a level that isn't *entirely* obsolete */
1112 freelist = dma_pte_clear_level(domain, level - 1,
1113 phys_to_virt(dma_pte_addr(pte)),
1114 level_pfn, start_pfn, last_pfn,
1118 pfn += level_size(level);
1119 } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1122 domain_flush_cache(domain, first_pte,
1123 (void *)++last_pte - (void *)first_pte);
1128 /* We can't just free the pages because the IOMMU may still be walking
1129 the page tables, and may have cached the intermediate levels. The
1130 pages can only be freed after the IOTLB flush has been done. */
1131 static struct page *domain_unmap(struct dmar_domain *domain,
1132 unsigned long start_pfn,
1133 unsigned long last_pfn)
1135 struct page *freelist;
1137 BUG_ON(!domain_pfn_supported(domain, start_pfn));
1138 BUG_ON(!domain_pfn_supported(domain, last_pfn));
1139 BUG_ON(start_pfn > last_pfn);
1141 /* we don't need lock here; nobody else touches the iova range */
1142 freelist = dma_pte_clear_level(domain, agaw_to_level(domain->agaw),
1143 domain->pgd, 0, start_pfn, last_pfn, NULL);
1146 if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1147 struct page *pgd_page = virt_to_page(domain->pgd);
1148 pgd_page->freelist = freelist;
1149 freelist = pgd_page;
1157 static void dma_free_pagelist(struct page *freelist)
1161 while ((pg = freelist)) {
1162 freelist = pg->freelist;
1163 free_pgtable_page(page_address(pg));
1167 static void iova_entry_free(unsigned long data)
1169 struct page *freelist = (struct page *)data;
1171 dma_free_pagelist(freelist);
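/*
 * The "freelist" produced by domain_unmap() is a singly linked list
 * threaded through page->freelist of the page-table pages themselves,
 * so no extra allocation is needed to remember which pages to release;
 * dma_free_pagelist() walks that list once the IOTLB flush has made the
 * pages unreachable to hardware.
 */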
1174 /* iommu handling */
1175 static int iommu_alloc_root_entry(struct intel_iommu *iommu)
1177 struct root_entry *root;
1178 unsigned long flags;
1180 root = (struct root_entry *)alloc_pgtable_page(iommu->node);
1182 pr_err("Allocating root entry for %s failed\n",
1187 __iommu_flush_cache(iommu, root, ROOT_SIZE);
1189 spin_lock_irqsave(&iommu->lock, flags);
1190 iommu->root_entry = root;
1191 spin_unlock_irqrestore(&iommu->lock, flags);
1196 static void iommu_set_root_entry(struct intel_iommu *iommu)
1202 addr = virt_to_phys(iommu->root_entry);
1203 if (sm_supported(iommu))
1204 addr |= DMA_RTADDR_SMT;
1206 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1207 dmar_writeq(iommu->reg + DMAR_RTADDR_REG, addr);
1209 writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG);
1211 /* Make sure the hardware completes it */
1212 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1213 readl, (sts & DMA_GSTS_RTPS), sts);
1215 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1218 void iommu_flush_write_buffer(struct intel_iommu *iommu)
1223 if (!rwbf_quirk && !cap_rwbf(iommu->cap))
1226 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1227 writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG);
1229 /* Make sure the hardware completes it */
1230 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1231 readl, (!(val & DMA_GSTS_WBFS)), val);
1233 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1236 /* return value determines if we need a write buffer flush */
1237 static void __iommu_flush_context(struct intel_iommu *iommu,
1238 u16 did, u16 source_id, u8 function_mask,
1245 case DMA_CCMD_GLOBAL_INVL:
1246 val = DMA_CCMD_GLOBAL_INVL;
1248 case DMA_CCMD_DOMAIN_INVL:
1249 val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
1251 case DMA_CCMD_DEVICE_INVL:
1252 val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
1253 | DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
1258 val |= DMA_CCMD_ICC;
1260 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1261 dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
1263 /* Make sure the hardware completes it */
1264 IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
1265 dmar_readq, (!(val & DMA_CCMD_ICC)), val);
1267 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1270 /* return value determines if we need a write buffer flush */
1271 static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
1272 u64 addr, unsigned int size_order, u64 type)
1274 int tlb_offset = ecap_iotlb_offset(iommu->ecap);
1275 u64 val = 0, val_iva = 0;
1279 case DMA_TLB_GLOBAL_FLUSH:
1280 /* global flush doesn't need to set IVA_REG */
1281 val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
1283 case DMA_TLB_DSI_FLUSH:
1284 val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1286 case DMA_TLB_PSI_FLUSH:
1287 val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1288 /* IH bit is passed in as part of address */
1289 val_iva = size_order | addr;
1294 /* Note: set drain read/write */
1297 * This is probably to be super secure. Looks like we can
1298 * ignore it without any impact.
1300 if (cap_read_drain(iommu->cap))
1301 val |= DMA_TLB_READ_DRAIN;
1303 if (cap_write_drain(iommu->cap))
1304 val |= DMA_TLB_WRITE_DRAIN;
1306 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1307 /* Note: Only uses first TLB reg currently */
1309 dmar_writeq(iommu->reg + tlb_offset, val_iva);
1310 dmar_writeq(iommu->reg + tlb_offset + 8, val);
1312 /* Make sure the hardware completes it */
1313 IOMMU_WAIT_OP(iommu, tlb_offset + 8,
1314 dmar_readq, (!(val & DMA_TLB_IVT)), val);
1316 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1318 /* check IOTLB invalidation granularity */
1319 if (DMA_TLB_IAIG(val) == 0)
1320 pr_err("Flush IOTLB failed\n");
1321 if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
1322 pr_debug("TLB flush request %Lx, actual %Lx\n",
1323 (unsigned long long)DMA_TLB_IIRG(type),
1324 (unsigned long long)DMA_TLB_IAIG(val));
1327 static struct device_domain_info *
1328 iommu_support_dev_iotlb (struct dmar_domain *domain, struct intel_iommu *iommu,
1331 struct device_domain_info *info;
1333 assert_spin_locked(&device_domain_lock);
1338 list_for_each_entry(info, &domain->devices, link)
1339 if (info->iommu == iommu && info->bus == bus &&
1340 info->devfn == devfn) {
1341 if (info->ats_supported && info->dev)
1349 static void domain_update_iotlb(struct dmar_domain *domain)
1351 struct device_domain_info *info;
1352 bool has_iotlb_device = false;
1354 assert_spin_locked(&device_domain_lock);
1356 list_for_each_entry(info, &domain->devices, link) {
1357 struct pci_dev *pdev;
1359 if (!info->dev || !dev_is_pci(info->dev))
1362 pdev = to_pci_dev(info->dev);
1363 if (pdev->ats_enabled) {
1364 has_iotlb_device = true;
1369 domain->has_iotlb_device = has_iotlb_device;
1372 static void iommu_enable_dev_iotlb(struct device_domain_info *info)
1374 struct pci_dev *pdev;
1376 assert_spin_locked(&device_domain_lock);
1378 if (!info || !dev_is_pci(info->dev))
1381 pdev = to_pci_dev(info->dev);
1382 /* For IOMMU that supports device IOTLB throttling (DIT), we assign
1383 * PFSID to the invalidation desc of a VF such that IOMMU HW can gauge
1384 * queue depth at PF level. If DIT is not set, PFSID will be treated as
1385 * reserved, which should be set to 0.
1387 if (!ecap_dit(info->iommu->ecap))
1390 struct pci_dev *pf_pdev;
1392 /* pdev will be returned if device is not a vf */
1393 pf_pdev = pci_physfn(pdev);
1394 info->pfsid = PCI_DEVID(pf_pdev->bus->number, pf_pdev->devfn);
1397 #ifdef CONFIG_INTEL_IOMMU_SVM
1398 /* The PCIe spec, in its wisdom, declares that the behaviour of
1399 the device if you enable PASID support after ATS support is
1400 undefined. So always enable PASID support on devices which
1401 have it, even if we can't yet know if we're ever going to use it. */
1403 if (info->pasid_supported && !pci_enable_pasid(pdev, info->pasid_supported & ~1))
1404 info->pasid_enabled = 1;
1406 if (info->pri_supported &&
1407 (info->pasid_enabled ? pci_prg_resp_pasid_required(pdev) : 1) &&
1408 !pci_reset_pri(pdev) && !pci_enable_pri(pdev, 32))
1409 info->pri_enabled = 1;
1411 if (!pdev->untrusted && info->ats_supported &&
1412 pci_ats_page_aligned(pdev) &&
1413 !pci_enable_ats(pdev, VTD_PAGE_SHIFT)) {
1414 info->ats_enabled = 1;
1415 domain_update_iotlb(info->domain);
1416 info->ats_qdep = pci_ats_queue_depth(pdev);
1420 static void iommu_disable_dev_iotlb(struct device_domain_info *info)
1422 struct pci_dev *pdev;
1424 assert_spin_locked(&device_domain_lock);
1426 if (!dev_is_pci(info->dev))
1429 pdev = to_pci_dev(info->dev);
1431 if (info->ats_enabled) {
1432 pci_disable_ats(pdev);
1433 info->ats_enabled = 0;
1434 domain_update_iotlb(info->domain);
1436 #ifdef CONFIG_INTEL_IOMMU_SVM
1437 if (info->pri_enabled) {
1438 pci_disable_pri(pdev);
1439 info->pri_enabled = 0;
1441 if (info->pasid_enabled) {
1442 pci_disable_pasid(pdev);
1443 info->pasid_enabled = 0;
1448 static void iommu_flush_dev_iotlb(struct dmar_domain *domain,
1449 u64 addr, unsigned mask)
1452 unsigned long flags;
1453 struct device_domain_info *info;
1455 if (!domain->has_iotlb_device)
1458 spin_lock_irqsave(&device_domain_lock, flags);
1459 list_for_each_entry(info, &domain->devices, link) {
1460 if (!info->ats_enabled)
1463 sid = info->bus << 8 | info->devfn;
1464 qdep = info->ats_qdep;
1465 qi_flush_dev_iotlb(info->iommu, sid, info->pfsid,
1468 spin_unlock_irqrestore(&device_domain_lock, flags);
1471 static void iommu_flush_iotlb_psi(struct intel_iommu *iommu,
1472 struct dmar_domain *domain,
1473 unsigned long pfn, unsigned int pages,
1476 unsigned int mask = ilog2(__roundup_pow_of_two(pages));
1477 uint64_t addr = (uint64_t)pfn << VTD_PAGE_SHIFT;
1478 u16 did = domain->iommu_did[iommu->seq_id];
1485 * Fallback to domain selective flush if no PSI support or the size is too big.
1487 * PSI requires page size to be 2 ^ x, and the base address is naturally
1488 * aligned to the size
1490 if (!cap_pgsel_inv(iommu->cap) || mask > cap_max_amask_val(iommu->cap))
1491 iommu->flush.flush_iotlb(iommu, did, 0, 0,
1494 iommu->flush.flush_iotlb(iommu, did, addr | ih, mask,
1498 * In caching mode, changes of pages from non-present to present require
1499 * flush. However, device IOTLB doesn't need to be flushed in this case.
1501 if (!cap_caching_mode(iommu->cap) || !map)
1502 iommu_flush_dev_iotlb(domain, addr, mask);
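/*
 * Example of the invalidation mask math above (illustrative): flushing
 * pages == 5 gives mask = ilog2(__roundup_pow_of_two(5)) = 3, so the
 * request covers an aligned block of 2^3 = 8 VT-d pages, since
 * page-selective invalidation only accepts naturally aligned
 * power-of-two regions.
 */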
1505 /* Notification for newly created mappings */
1506 static inline void __mapping_notify_one(struct intel_iommu *iommu,
1507 struct dmar_domain *domain,
1508 unsigned long pfn, unsigned int pages)
1510 /* It's a non-present to present mapping. Only flush if caching mode */
1511 if (cap_caching_mode(iommu->cap))
1512 iommu_flush_iotlb_psi(iommu, domain, pfn, pages, 0, 1);
1514 iommu_flush_write_buffer(iommu);
1517 static void iommu_flush_iova(struct iova_domain *iovad)
1519 struct dmar_domain *domain;
1522 domain = container_of(iovad, struct dmar_domain, iovad);
1524 for_each_domain_iommu(idx, domain) {
1525 struct intel_iommu *iommu = g_iommus[idx];
1526 u16 did = domain->iommu_did[iommu->seq_id];
1528 iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH);
1530 if (!cap_caching_mode(iommu->cap))
1531 iommu_flush_dev_iotlb(get_iommu_domain(iommu, did),
1532 0, MAX_AGAW_PFN_WIDTH);
1536 static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
1539 unsigned long flags;
1541 raw_spin_lock_irqsave(&iommu->register_lock, flags);
1542 pmen = readl(iommu->reg + DMAR_PMEN_REG);
1543 pmen &= ~DMA_PMEN_EPM;
1544 writel(pmen, iommu->reg + DMAR_PMEN_REG);
1546 /* wait for the protected region status bit to clear */
1547 IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
1548 readl, !(pmen & DMA_PMEN_PRS), pmen);
1550 raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1553 static void iommu_enable_translation(struct intel_iommu *iommu)
1556 unsigned long flags;
1558 raw_spin_lock_irqsave(&iommu->register_lock, flags);
1559 iommu->gcmd |= DMA_GCMD_TE;
1560 writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1562 /* Make sure the hardware completes it */
1563 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1564 readl, (sts & DMA_GSTS_TES), sts);
1566 raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1569 static void iommu_disable_translation(struct intel_iommu *iommu)
1574 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1575 iommu->gcmd &= ~DMA_GCMD_TE;
1576 writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1578 /* Make sure the hardware completes it */
1579 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1580 readl, (!(sts & DMA_GSTS_TES)), sts);
1582 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1586 static int iommu_init_domains(struct intel_iommu *iommu)
1588 u32 ndomains, nlongs;
1591 ndomains = cap_ndoms(iommu->cap);
1592 pr_debug("%s: Number of Domains supported <%d>\n",
1593 iommu->name, ndomains);
1594 nlongs = BITS_TO_LONGS(ndomains);
1596 spin_lock_init(&iommu->lock);
1598 iommu->domain_ids = kcalloc(nlongs, sizeof(unsigned long), GFP_KERNEL);
1599 if (!iommu->domain_ids) {
1600 pr_err("%s: Allocating domain id array failed\n",
1605 size = (ALIGN(ndomains, 256) >> 8) * sizeof(struct dmar_domain **);
1606 iommu->domains = kzalloc(size, GFP_KERNEL);
1608 if (iommu->domains) {
1609 size = 256 * sizeof(struct dmar_domain *);
1610 iommu->domains[0] = kzalloc(size, GFP_KERNEL);
1613 if (!iommu->domains || !iommu->domains[0]) {
1614 pr_err("%s: Allocating domain array failed\n",
1616 kfree(iommu->domain_ids);
1617 kfree(iommu->domains);
1618 iommu->domain_ids = NULL;
1619 iommu->domains = NULL;
1626 * If Caching mode is set, then invalid translations are tagged
1627 * with domain-id 0, hence we need to pre-allocate it. We also
1628 * use domain-id 0 as a marker for non-allocated domain-id, so
1629 * make sure it is not used for a real domain.
1631 set_bit(0, iommu->domain_ids);
1634 * Vt-d spec rev3.0 (section 6.2.3.1) requires that each pasid
1635 * entry for first-level or pass-through translation modes should
1636 * be programmed with a domain id different from those used for
1637 * second-level or nested translation. We reserve a domain id for this purpose.
1640 if (sm_supported(iommu))
1641 set_bit(FLPT_DEFAULT_DID, iommu->domain_ids);
1646 static void disable_dmar_iommu(struct intel_iommu *iommu)
1648 struct device_domain_info *info, *tmp;
1649 unsigned long flags;
1651 if (!iommu->domains || !iommu->domain_ids)
1655 spin_lock_irqsave(&device_domain_lock, flags);
1656 list_for_each_entry_safe(info, tmp, &device_domain_list, global) {
1657 struct dmar_domain *domain;
1659 if (info->iommu != iommu)
1662 if (!info->dev || !info->domain)
1665 domain = info->domain;
1667 __dmar_remove_one_dev_info(info);
1669 if (!domain_type_is_vm_or_si(domain)) {
1671 * The domain_exit() function can't be called under
1672 * device_domain_lock, as it takes this lock itself.
1673 * So release the lock here and re-run the loop afterwards.
1676 spin_unlock_irqrestore(&device_domain_lock, flags);
1677 domain_exit(domain);
1681 spin_unlock_irqrestore(&device_domain_lock, flags);
1683 if (iommu->gcmd & DMA_GCMD_TE)
1684 iommu_disable_translation(iommu);
1687 static void free_dmar_iommu(struct intel_iommu *iommu)
1689 if ((iommu->domains) && (iommu->domain_ids)) {
1690 int elems = ALIGN(cap_ndoms(iommu->cap), 256) >> 8;
1693 for (i = 0; i < elems; i++)
1694 kfree(iommu->domains[i]);
1695 kfree(iommu->domains);
1696 kfree(iommu->domain_ids);
1697 iommu->domains = NULL;
1698 iommu->domain_ids = NULL;
1701 g_iommus[iommu->seq_id] = NULL;
1703 /* free context mapping */
1704 free_context_table(iommu);
1706 #ifdef CONFIG_INTEL_IOMMU_SVM
1707 if (pasid_supported(iommu)) {
1708 if (ecap_prs(iommu->ecap))
1709 intel_svm_finish_prq(iommu);
1714 static struct dmar_domain *alloc_domain(int flags)
1716 struct dmar_domain *domain;
1718 domain = alloc_domain_mem();
1722 memset(domain, 0, sizeof(*domain));
1723 domain->nid = NUMA_NO_NODE;
1724 domain->flags = flags;
1725 domain->has_iotlb_device = false;
1726 INIT_LIST_HEAD(&domain->devices);
1731 /* Must be called with iommu->lock */
1732 static int domain_attach_iommu(struct dmar_domain *domain,
1733 struct intel_iommu *iommu)
1735 unsigned long ndomains;
1738 assert_spin_locked(&device_domain_lock);
1739 assert_spin_locked(&iommu->lock);
1741 domain->iommu_refcnt[iommu->seq_id] += 1;
1742 domain->iommu_count += 1;
1743 if (domain->iommu_refcnt[iommu->seq_id] == 1) {
1744 ndomains = cap_ndoms(iommu->cap);
1745 num = find_first_zero_bit(iommu->domain_ids, ndomains);
1747 if (num >= ndomains) {
1748 pr_err("%s: No free domain ids\n", iommu->name);
1749 domain->iommu_refcnt[iommu->seq_id] -= 1;
1750 domain->iommu_count -= 1;
1754 set_bit(num, iommu->domain_ids);
1755 set_iommu_domain(iommu, num, domain);
1757 domain->iommu_did[iommu->seq_id] = num;
1758 domain->nid = iommu->node;
1760 domain_update_iommu_cap(domain);
1766 static int domain_detach_iommu(struct dmar_domain *domain,
1767 struct intel_iommu *iommu)
1771 assert_spin_locked(&device_domain_lock);
1772 assert_spin_locked(&iommu->lock);
1774 domain->iommu_refcnt[iommu->seq_id] -= 1;
1775 count = --domain->iommu_count;
1776 if (domain->iommu_refcnt[iommu->seq_id] == 0) {
1777 num = domain->iommu_did[iommu->seq_id];
1778 clear_bit(num, iommu->domain_ids);
1779 set_iommu_domain(iommu, num, NULL);
1781 domain_update_iommu_cap(domain);
1782 domain->iommu_did[iommu->seq_id] = 0;
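/*
 * domain_attach_iommu()/domain_detach_iommu() keep a per-IOMMU
 * reference count of attachments to the domain: the domain id on a
 * given IOMMU is allocated on the 0 -> 1 transition and released on the
 * 1 -> 0 transition, both while holding iommu->lock.
 */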
1788 static struct iova_domain reserved_iova_list;
1789 static struct lock_class_key reserved_rbtree_key;
1791 static int dmar_init_reserved_ranges(void)
1793 struct pci_dev *pdev = NULL;
1797 init_iova_domain(&reserved_iova_list, VTD_PAGE_SIZE, IOVA_START_PFN);
1799 lockdep_set_class(&reserved_iova_list.iova_rbtree_lock,
1800 &reserved_rbtree_key);
1802 /* IOAPIC ranges shouldn't be accessed by DMA */
1803 iova = reserve_iova(&reserved_iova_list, IOVA_PFN(IOAPIC_RANGE_START),
1804 IOVA_PFN(IOAPIC_RANGE_END));
1806 pr_err("Reserve IOAPIC range failed\n");
1810 /* Reserve all PCI MMIO to avoid peer-to-peer access */
1811 for_each_pci_dev(pdev) {
1814 for (i = 0; i < PCI_NUM_RESOURCES; i++) {
1815 r = &pdev->resource[i];
1816 if (!r->flags || !(r->flags & IORESOURCE_MEM))
1818 iova = reserve_iova(&reserved_iova_list,
1822 pci_err(pdev, "Reserve iova for %pR failed\n", r);
1830 static void domain_reserve_special_ranges(struct dmar_domain *domain)
1832 copy_reserved_iova(&reserved_iova_list, &domain->iovad);
1835 static inline int guestwidth_to_adjustwidth(int gaw)
1838 int r = (gaw - 12) % 9;
1849 static int domain_init(struct dmar_domain *domain, struct intel_iommu *iommu,
1852 int adjust_width, agaw;
1853 unsigned long sagaw;
1856 init_iova_domain(&domain->iovad, VTD_PAGE_SIZE, IOVA_START_PFN);
1858 err = init_iova_flush_queue(&domain->iovad,
1859 iommu_flush_iova, iova_entry_free);
1863 domain_reserve_special_ranges(domain);
1865 /* calculate AGAW */
1866 if (guest_width > cap_mgaw(iommu->cap))
1867 guest_width = cap_mgaw(iommu->cap);
1868 domain->gaw = guest_width;
1869 adjust_width = guestwidth_to_adjustwidth(guest_width);
1870 agaw = width_to_agaw(adjust_width);
1871 sagaw = cap_sagaw(iommu->cap);
1872 if (!test_bit(agaw, &sagaw)) {
1873 /* hardware doesn't support it, choose a bigger one */
1874 pr_debug("Hardware doesn't support agaw %d\n", agaw);
1875 agaw = find_next_bit(&sagaw, 5, agaw);
1879 domain->agaw = agaw;
1881 if (ecap_coherent(iommu->ecap))
1882 domain->iommu_coherency = 1;
1884 domain->iommu_coherency = 0;
1886 if (ecap_sc_support(iommu->ecap))
1887 domain->iommu_snooping = 1;
1889 domain->iommu_snooping = 0;
1891 if (intel_iommu_superpage)
1892 domain->iommu_superpage = fls(cap_super_page_val(iommu->cap));
1894 domain->iommu_superpage = 0;
1896 domain->nid = iommu->node;
1898 /* always allocate the top pgd */
1899 domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
1902 __iommu_flush_cache(iommu, domain->pgd, PAGE_SIZE);
1906 static void domain_exit(struct dmar_domain *domain)
1908 struct page *freelist;
1910 /* Remove associated devices and clear attached or cached domains */
1912 domain_remove_dev_info(domain);
1916 put_iova_domain(&domain->iovad);
1918 freelist = domain_unmap(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
1920 dma_free_pagelist(freelist);
1922 free_domain_mem(domain);
1926 * Get the PASID directory size for scalable mode context entry.
1927 * Value of X in the PDTS field of a scalable mode context entry
1928 * indicates PASID directory with 2^(X + 7) entries.
1930 static inline unsigned long context_get_sm_pds(struct pasid_table *table)
1934 max_pde = table->max_pasid >> PASID_PDE_SHIFT;
1935 pds = find_first_bit((unsigned long *)&max_pde, MAX_NR_PASID_BITS);
1943 * Set the RID_PASID field of a scalable mode context entry. The
1944 * IOMMU hardware will use the PASID value set in this field for
1945 * DMA translations of DMA requests without PASID.
1948 context_set_sm_rid2pasid(struct context_entry *context, unsigned long pasid)
1950 context->hi |= pasid & ((1 << 20) - 1);
1951 context->hi |= (1 << 20);
1955 * Set the DTE(Device-TLB Enable) field of a scalable mode context entry.
1958 static inline void context_set_sm_dte(struct context_entry *context)
1960 context->lo |= (1 << 2);
1964 * Set the PRE(Page Request Enable) field of a scalable mode context
1967 static inline void context_set_sm_pre(struct context_entry *context)
1969 context->lo |= (1 << 4);
1972 /* Convert value to context PASID directory size field coding. */
1973 #define context_pdts(pds) (((pds) & 0x7) << 9)
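/*
 * Worked example for the PDTS coding above: a PASID directory sized for
 * 2^12 entries corresponds to X = 5 (2^(5 + 7) = 2^12), and
 * context_pdts(5) shifts that value into bits 11:9 of the low qword of
 * the context entry.
 */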
1975 static int domain_context_mapping_one(struct dmar_domain *domain,
1976 struct intel_iommu *iommu,
1977 struct pasid_table *table,
1980 u16 did = domain->iommu_did[iommu->seq_id];
1981 int translation = CONTEXT_TT_MULTI_LEVEL;
1982 struct device_domain_info *info = NULL;
1983 struct context_entry *context;
1984 unsigned long flags;
1989 if (hw_pass_through && domain_type_is_si(domain))
1990 translation = CONTEXT_TT_PASS_THROUGH;
1992 pr_debug("Set context mapping for %02x:%02x.%d\n",
1993 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1995 BUG_ON(!domain->pgd);
1997 spin_lock_irqsave(&device_domain_lock, flags);
1998 spin_lock(&iommu->lock);
2001 context = iommu_context_addr(iommu, bus, devfn, 1);
2006 if (context_present(context))
2010 * For kdump cases, old valid entries may be cached due to the
2011 * in-flight DMA and copied pgtable, but there is no unmapping
2012 * behaviour for them, thus we need an explicit cache flush for
2013 * the newly-mapped device. For kdump, at this point, the device
2014 * is supposed to finish reset at its driver probe stage, so no
2016 * in-flight DMA will exist, and we don't need to worry anymore hereafter.
2018 if (context_copied(context)) {
2019 u16 did_old = context_domain_id(context);
2021 if (did_old < cap_ndoms(iommu->cap)) {
2022 iommu->flush.flush_context(iommu, did_old,
2023 (((u16)bus) << 8) | devfn,
2024 DMA_CCMD_MASK_NOBIT,
2025 DMA_CCMD_DEVICE_INVL);
2026 iommu->flush.flush_iotlb(iommu, did_old, 0, 0,
2031 context_clear_entry(context);
2033 if (sm_supported(iommu)) {
2038 /* Setup the PASID DIR pointer: */
2039 pds = context_get_sm_pds(table);
2040 context->lo = (u64)virt_to_phys(table->table) |
2043 /* Setup the RID_PASID field: */
2044 context_set_sm_rid2pasid(context, PASID_RID2PASID);
2047 * Setup the Device-TLB enable bit and Page request
2050 info = iommu_support_dev_iotlb(domain, iommu, bus, devfn);
2051 if (info && info->ats_supported)
2052 context_set_sm_dte(context);
2053 if (info && info->pri_supported)
2054 context_set_sm_pre(context);
2056 struct dma_pte *pgd = domain->pgd;
2059 context_set_domain_id(context, did);
2061 if (translation != CONTEXT_TT_PASS_THROUGH) {
2063 * Skip top levels of page tables for iommu which has
2064 * less agaw than default. Unnecessary for PT mode.
2066 for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) {
2068 pgd = phys_to_virt(dma_pte_addr(pgd));
2069 if (!dma_pte_present(pgd))
2073 info = iommu_support_dev_iotlb(domain, iommu, bus, devfn);
2074 if (info && info->ats_supported)
2075 translation = CONTEXT_TT_DEV_IOTLB;
2077 translation = CONTEXT_TT_MULTI_LEVEL;
2079 context_set_address_root(context, virt_to_phys(pgd));
2080 context_set_address_width(context, agaw);
2083 * In pass through mode, AW must be programmed to
2084 * indicate the largest AGAW value supported by
2085 * hardware. And ASR is ignored by hardware.
2087 context_set_address_width(context, iommu->msagaw);
2090 context_set_translation_type(context, translation);
2093 context_set_fault_enable(context);
2094 context_set_present(context);
2095 domain_flush_cache(domain, context, sizeof(*context));
2098 * It's a non-present to present mapping. If hardware doesn't cache
2099 * non-present entries we only need to flush the write-buffer. If the
2100 * hardware _does_ cache non-present entries, then it does so in the special
2101 * domain #0, which we have to flush:
2103 if (cap_caching_mode(iommu->cap)) {
2104 iommu->flush.flush_context(iommu, 0,
2105 (((u16)bus) << 8) | devfn,
2106 DMA_CCMD_MASK_NOBIT,
2107 DMA_CCMD_DEVICE_INVL);
2108 iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH);
2110 iommu_flush_write_buffer(iommu);
2112 iommu_enable_dev_iotlb(info);
2117 spin_unlock(&iommu->lock);
2118 spin_unlock_irqrestore(&device_domain_lock, flags);
2123 struct domain_context_mapping_data {
2124 struct dmar_domain *domain;
2125 struct intel_iommu *iommu;
2126 struct pasid_table *table;
2129 static int domain_context_mapping_cb(struct pci_dev *pdev,
2130 u16 alias, void *opaque)
2132 struct domain_context_mapping_data *data = opaque;
2134 return domain_context_mapping_one(data->domain, data->iommu,
2135 data->table, PCI_BUS_NUM(alias),
2140 domain_context_mapping(struct dmar_domain *domain, struct device *dev)
2142 struct domain_context_mapping_data data;
2143 struct pasid_table *table;
2144 struct intel_iommu *iommu;
2147 iommu = device_to_iommu(dev, &bus, &devfn);
2151 table = intel_pasid_get_table(dev);
2153 if (!dev_is_pci(dev))
2154 return domain_context_mapping_one(domain, iommu, table,
2157 data.domain = domain;
2161 return pci_for_each_dma_alias(to_pci_dev(dev),
2162 &domain_context_mapping_cb, &data);
2165 static int domain_context_mapped_cb(struct pci_dev *pdev,
2166 u16 alias, void *opaque)
2168 struct intel_iommu *iommu = opaque;
2170 return !device_context_mapped(iommu, PCI_BUS_NUM(alias), alias & 0xff);
2173 static int domain_context_mapped(struct device *dev)
2175 struct intel_iommu *iommu;
2178 iommu = device_to_iommu(dev, &bus, &devfn);
2182 if (!dev_is_pci(dev))
2183 return device_context_mapped(iommu, bus, devfn);
2185 return !pci_for_each_dma_alias(to_pci_dev(dev),
2186 domain_context_mapped_cb, iommu);
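/*
 * Note on the two helpers above: pci_for_each_dma_alias() stops as soon
 * as the callback returns non-zero, and domain_context_mapped_cb()
 * returns the negated presence check.  A non-zero walk result therefore
 * means "some DMA alias is not mapped yet", which
 * domain_context_mapped() negates back into a boolean-style answer.
 */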
2189 /* Returns a number of VTD pages, but aligned to MM page size */
2190 static inline unsigned long aligned_nrpages(unsigned long host_addr,
2193 host_addr &= ~PAGE_MASK;
2194 return PAGE_ALIGN(host_addr + size) >> VTD_PAGE_SHIFT;
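/*
 * Example for aligned_nrpages() (illustrative): a 6KiB buffer starting
 * at offset 0x800 of an MM page ends at offset 0x2000, so with 4KiB MM
 * pages it rounds up to 2 MM pages, i.e. 2 VT-d pages.
 */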
2197 /* Return largest possible superpage level for a given mapping */
2198 static inline int hardware_largepage_caps(struct dmar_domain *domain,
2199 unsigned long iov_pfn,
2200 unsigned long phy_pfn,
2201 unsigned long pages)
2203 int support, level = 1;
2204 unsigned long pfnmerge;
2206 support = domain->iommu_superpage;
2208 /* To use a large page, the virtual *and* physical addresses
2209 must be aligned to 2MiB/1GiB/etc. Lower bits set in either
2210 of them will mean we have to use smaller pages. So just
2211 merge them and check both at once. */
2212 pfnmerge = iov_pfn | phy_pfn;
2214 while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) {
2215 pages >>= VTD_STRIDE_SHIFT;
2218 pfnmerge >>= VTD_STRIDE_SHIFT;
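/*
 * Example for the superpage check above (illustrative): an IOVA and a
 * physical address that are both 2MiB aligned (low 9 pfn bits clear),
 * with at least 512 contiguous pages to map, allow the loop to advance
 * to level 2 (a 2MiB superpage) when the hardware reports superpage
 * support; any low bit set in iov_pfn | phy_pfn keeps the mapping at
 * 4KiB pages.
 */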
2225 static int __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2226 struct scatterlist *sg, unsigned long phys_pfn,
2227 unsigned long nr_pages, int prot)
2229 struct dma_pte *first_pte = NULL, *pte = NULL;
2230 phys_addr_t uninitialized_var(pteval);
2231 unsigned long sg_res = 0;
2232 unsigned int largepage_lvl = 0;
2233 unsigned long lvl_pages = 0;
2235 BUG_ON(!domain_pfn_supported(domain, iov_pfn + nr_pages - 1));
2237 if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
2240 prot &= DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP;
2244 pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | prot;
2247 while (nr_pages > 0) {
2251 unsigned int pgoff = sg->offset & ~PAGE_MASK;
2253 sg_res = aligned_nrpages(sg->offset, sg->length);
2254 sg->dma_address = ((dma_addr_t)iov_pfn << VTD_PAGE_SHIFT) + pgoff;
2255 sg->dma_length = sg->length;
2256 pteval = (sg_phys(sg) - pgoff) | prot;
2257 phys_pfn = pteval >> VTD_PAGE_SHIFT;
2261 largepage_lvl = hardware_largepage_caps(domain, iov_pfn, phys_pfn, sg_res);
2263 first_pte = pte = pfn_to_dma_pte(domain, iov_pfn, &largepage_lvl);
2266 /* It is a large page */
2267 if (largepage_lvl > 1) {
2268 unsigned long nr_superpages, end_pfn;
2270 pteval |= DMA_PTE_LARGE_PAGE;
2271 lvl_pages = lvl_to_nr_pages(largepage_lvl);
2273 nr_superpages = sg_res / lvl_pages;
2274 end_pfn = iov_pfn + nr_superpages * lvl_pages - 1;
2277 * Ensure that old small page tables are
2278 * removed to make room for superpage(s).
2279 * We're adding new large pages, so make sure
2280 * we don't remove their parent tables.
2282 dma_pte_free_pagetable(domain, iov_pfn, end_pfn,
2285 pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE;
2289 /* We don't need lock here, nobody else
2290 * touches the iova range
2292 tmp = cmpxchg64_local(&pte->val, 0ULL, pteval);
2294 static int dumps = 5;
2295 pr_crit("ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n",
2296 iov_pfn, tmp, (unsigned long long)pteval);
2299 debug_dma_dump_mappings(NULL);
2304 lvl_pages = lvl_to_nr_pages(largepage_lvl);
2306 BUG_ON(nr_pages < lvl_pages);
2307 BUG_ON(sg_res < lvl_pages);
2309 nr_pages -= lvl_pages;
2310 iov_pfn += lvl_pages;
2311 phys_pfn += lvl_pages;
2312 pteval += lvl_pages * VTD_PAGE_SIZE;
2313 sg_res -= lvl_pages;
2315 /* If the next PTE would be the first in a new page, then we
2316 need to flush the cache on the entries we've just written.
2317 And then we'll need to recalculate 'pte', so clear it and
2318 let it get set again in the if (!pte) block above.
2320 If we're done (!nr_pages) we need to flush the cache too.
2322 Also if we've been setting superpages, we may need to
2323 recalculate 'pte' and switch back to smaller pages for the
2324 end of the mapping, if the trailing size is not enough to
2325 use another superpage (i.e. sg_res < lvl_pages). */
2327 if (!nr_pages || first_pte_in_page(pte) ||
2328 (largepage_lvl > 1 && sg_res < lvl_pages)) {
2329 domain_flush_cache(domain, first_pte,
2330 (void *)pte - (void *)first_pte);
2334 if (!sg_res && nr_pages)
2340 static int domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2341 struct scatterlist *sg, unsigned long phys_pfn,
2342 unsigned long nr_pages, int prot)
2345 struct intel_iommu *iommu;
2347 /* Do the real mapping first */
2348 ret = __domain_mapping(domain, iov_pfn, sg, phys_pfn, nr_pages, prot);
2352 /* Notify about the new mapping */
2353 if (domain_type_is_vm(domain)) {
2354 /* VM typed domains can have more than one IOMMU */
2356 for_each_domain_iommu(iommu_id, domain) {
2357 iommu = g_iommus[iommu_id];
2358 __mapping_notify_one(iommu, domain, iov_pfn, nr_pages);
2361 /* General domains only have one IOMMU */
2362 iommu = domain_get_iommu(domain);
2363 __mapping_notify_one(iommu, domain, iov_pfn, nr_pages);
2369 static inline int domain_sg_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2370 struct scatterlist *sg, unsigned long nr_pages,
2373 return domain_mapping(domain, iov_pfn, sg, 0, nr_pages, prot);
2376 static inline int domain_pfn_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2377 unsigned long phys_pfn, unsigned long nr_pages,
2380 return domain_mapping(domain, iov_pfn, NULL, phys_pfn, nr_pages, prot);
2383 static void domain_context_clear_one(struct intel_iommu *iommu, u8 bus, u8 devfn)
2385 unsigned long flags;
2386 struct context_entry *context;
2392 spin_lock_irqsave(&iommu->lock, flags);
2393 context = iommu_context_addr(iommu, bus, devfn, 0);
2395 spin_unlock_irqrestore(&iommu->lock, flags);
2398 did_old = context_domain_id(context);
2399 context_clear_entry(context);
2400 __iommu_flush_cache(iommu, context, sizeof(*context));
2401 spin_unlock_irqrestore(&iommu->lock, flags);
2402 iommu->flush.flush_context(iommu,
2404 (((u16)bus) << 8) | devfn,
2405 DMA_CCMD_MASK_NOBIT,
2406 DMA_CCMD_DEVICE_INVL);
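/*
 * The source-id above is (bus << 8) | devfn, and DMA_CCMD_MASK_NOBIT
 * applies no function-mask bits, so only this exact device's
 * context-cache entry is invalidated.
 */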
2407 iommu->flush.flush_iotlb(iommu,
2414 static inline void unlink_domain_info(struct device_domain_info *info)
2416 assert_spin_locked(&device_domain_lock);
2417 list_del(&info->link);
2418 list_del(&info->global);
2420 info->dev->archdata.iommu = NULL;
2423 static void domain_remove_dev_info(struct dmar_domain *domain)
2425 struct device_domain_info *info, *tmp;
2426 unsigned long flags;
2428 spin_lock_irqsave(&device_domain_lock, flags);
2429 list_for_each_entry_safe(info, tmp, &domain->devices, link)
2430 __dmar_remove_one_dev_info(info);
2431 spin_unlock_irqrestore(&device_domain_lock, flags);
2436 * Note: we use struct device->archdata.iommu to store the info
2438 static struct dmar_domain *find_domain(struct device *dev)
2440 struct device_domain_info *info;
2442 /* No lock here, assumes no domain exit in normal case */
2443 info = dev->archdata.iommu;
2445 return info->domain;
2449 static inline struct device_domain_info *
2450 dmar_search_domain_by_dev_info(int segment, int bus, int devfn)
2452 struct device_domain_info *info;
2454 list_for_each_entry(info, &device_domain_list, global)
2455 if (info->iommu->segment == segment && info->bus == bus &&
2456 info->devfn == devfn)
2462 static struct dmar_domain *dmar_insert_one_dev_info(struct intel_iommu *iommu,
2465 struct dmar_domain *domain)
2467 struct dmar_domain *found = NULL;
2468 struct device_domain_info *info;
2469 unsigned long flags;
2472 info = alloc_devinfo_mem();
2477 info->devfn = devfn;
2478 info->ats_supported = info->pasid_supported = info->pri_supported = 0;
2479 info->ats_enabled = info->pasid_enabled = info->pri_enabled = 0;
2482 info->domain = domain;
2483 info->iommu = iommu;
2484 info->pasid_table = NULL;
2486 if (dev && dev_is_pci(dev)) {
2487 struct pci_dev *pdev = to_pci_dev(info->dev);
2489 if (!pdev->untrusted &&
2490 !pci_ats_disabled() &&
2491 ecap_dev_iotlb_support(iommu->ecap) &&
2492 pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_ATS) &&
2493 dmar_find_matched_atsr_unit(pdev))
2494 info->ats_supported = 1;
2496 if (sm_supported(iommu)) {
2497 if (pasid_supported(iommu)) {
2498 int features = pci_pasid_features(pdev);
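/*
 * pci_pasid_features() returns the PASID capability bits
 * (execute/privileged-mode requests); bit 0 is OR'ed in below as a
 * plain "PASID supported" flag.
 */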
2500 info->pasid_supported = features | 1;
2503 if (info->ats_supported && ecap_prs(iommu->ecap) &&
2504 pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_PRI))
2505 info->pri_supported = 1;
2509 spin_lock_irqsave(&device_domain_lock, flags);
2511 found = find_domain(dev);
2514 struct device_domain_info *info2;
2515 info2 = dmar_search_domain_by_dev_info(iommu->segment, bus, devfn);
2517 found = info2->domain;
2523 spin_unlock_irqrestore(&device_domain_lock, flags);
2524 free_devinfo_mem(info);
2525 /* Caller must free the original domain */
2529 spin_lock(&iommu->lock);
2530 ret = domain_attach_iommu(domain, iommu);
2531 spin_unlock(&iommu->lock);
2534 spin_unlock_irqrestore(&device_domain_lock, flags);
2535 free_devinfo_mem(info);
2539 list_add(&info->link, &domain->devices);
2540 list_add(&info->global, &device_domain_list);
2542 dev->archdata.iommu = info;
2543 spin_unlock_irqrestore(&device_domain_lock, flags);
2545 /* PASID table is mandatory for a PCI device in scalable mode. */
2546 if (dev && dev_is_pci(dev) && sm_supported(iommu)) {
2547 ret = intel_pasid_alloc_table(dev);
2549 dev_err(dev, "PASID table allocation failed\n");
2550 dmar_remove_one_dev_info(dev);
2554 /* Setup the PASID entry for requests without PASID: */
2555 spin_lock(&iommu->lock);
2556 if (hw_pass_through && domain_type_is_si(domain))
2557 ret = intel_pasid_setup_pass_through(iommu, domain,
2558 dev, PASID_RID2PASID);
2560 ret = intel_pasid_setup_second_level(iommu, domain,
2561 dev, PASID_RID2PASID);
2562 spin_unlock(&iommu->lock);
2564 dev_err(dev, "Setup RID2PASID failed\n");
2565 dmar_remove_one_dev_info(dev);
2570 if (dev && domain_context_mapping(domain, dev)) {
2571 dev_err(dev, "Domain context map failed\n");
2572 dmar_remove_one_dev_info(dev);
2579 static int get_last_alias(struct pci_dev *pdev, u16 alias, void *opaque)
2581 *(u16 *)opaque = alias;
2585 static struct dmar_domain *find_or_alloc_domain(struct device *dev, int gaw)
2587 struct device_domain_info *info;
2588 struct dmar_domain *domain = NULL;
2589 struct intel_iommu *iommu;
2591 unsigned long flags;
2594 iommu = device_to_iommu(dev, &bus, &devfn);
2598 if (dev_is_pci(dev)) {
2599 struct pci_dev *pdev = to_pci_dev(dev);
2601 pci_for_each_dma_alias(pdev, get_last_alias, &dma_alias);
2603 spin_lock_irqsave(&device_domain_lock, flags);
2604 info = dmar_search_domain_by_dev_info(pci_domain_nr(pdev->bus),
2605 PCI_BUS_NUM(dma_alias),
2608 iommu = info->iommu;
2609 domain = info->domain;
2611 spin_unlock_irqrestore(&device_domain_lock, flags);
2613 /* DMA alias already has a domain, use it */
2618 /* Allocate and initialize new domain for the device */
2619 domain = alloc_domain(0);
2622 if (domain_init(domain, iommu, gaw)) {
2623 domain_exit(domain);
2632 static struct dmar_domain *set_domain_for_dev(struct device *dev,
2633 struct dmar_domain *domain)
2635 struct intel_iommu *iommu;
2636 struct dmar_domain *tmp;
2637 u16 req_id, dma_alias;
2640 iommu = device_to_iommu(dev, &bus, &devfn);
2644 req_id = ((u16)bus << 8) | devfn;
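/*
 * req_id is the 16-bit PCI requester id; e.g. a device at bus 0x03,
 * devfn 0x01 (slot 0, function 1) yields req_id 0x0301.
 */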
2646 if (dev_is_pci(dev)) {
2647 struct pci_dev *pdev = to_pci_dev(dev);
2649 pci_for_each_dma_alias(pdev, get_last_alias, &dma_alias);
2651 /* register PCI DMA alias device */
2652 if (req_id != dma_alias) {
2653 tmp = dmar_insert_one_dev_info(iommu, PCI_BUS_NUM(dma_alias),
2654 dma_alias & 0xff, NULL, domain);
2656 if (!tmp || tmp != domain)
2661 tmp = dmar_insert_one_dev_info(iommu, bus, devfn, dev, domain);
2662 if (!tmp || tmp != domain)
2668 static struct dmar_domain *get_domain_for_dev(struct device *dev, int gaw)
2670 struct dmar_domain *domain, *tmp;
2672 domain = find_domain(dev);
2676 domain = find_or_alloc_domain(dev, gaw);
2680 tmp = set_domain_for_dev(dev, domain);
2681 if (!tmp || domain != tmp) {
2682 domain_exit(domain);
2691 static int iommu_domain_identity_map(struct dmar_domain *domain,
2692 unsigned long long start,
2693 unsigned long long end)
2695 unsigned long first_vpfn = start >> VTD_PAGE_SHIFT;
2696 unsigned long last_vpfn = end >> VTD_PAGE_SHIFT;
2698 if (!reserve_iova(&domain->iovad, dma_to_mm_pfn(first_vpfn),
2699 dma_to_mm_pfn(last_vpfn))) {
2700 pr_err("Reserving iova failed\n");
2704 pr_debug("Mapping reserved region %llx-%llx\n", start, end);
2706 * The RMRR range might overlap with a physical memory range,
2709 dma_pte_clear_range(domain, first_vpfn, last_vpfn);
2711 return __domain_mapping(domain, first_vpfn, NULL,
2712 first_vpfn, last_vpfn - first_vpfn + 1,
2713 DMA_PTE_READ|DMA_PTE_WRITE);
2716 static int domain_prepare_identity_map(struct device *dev,
2717 struct dmar_domain *domain,
2718 unsigned long long start,
2719 unsigned long long end)
2721 /* For _hardware_ passthrough, don't bother. But for software
2722 passthrough, we do it anyway -- it may indicate a memory
2723 range which is reserved in E820, and so didn't get set
2724 up to start with in si_domain */
2725 if (domain == si_domain && hw_pass_through) {
2726 dev_warn(dev, "Ignoring identity map for HW passthrough [0x%Lx - 0x%Lx]\n",
2731 dev_info(dev, "Setting identity map [0x%Lx - 0x%Lx]\n", start, end);
2734 WARN(1, "Your BIOS is broken; RMRR ends before it starts!\n"
2735 "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
2736 dmi_get_system_info(DMI_BIOS_VENDOR),
2737 dmi_get_system_info(DMI_BIOS_VERSION),
2738 dmi_get_system_info(DMI_PRODUCT_VERSION));
2742 if (end >> agaw_to_width(domain->agaw)) {
2743 WARN(1, "Your BIOS is broken; RMRR exceeds permitted address width (%d bits)\n"
2744 "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
2745 agaw_to_width(domain->agaw),
2746 dmi_get_system_info(DMI_BIOS_VENDOR),
2747 dmi_get_system_info(DMI_BIOS_VERSION),
2748 dmi_get_system_info(DMI_PRODUCT_VERSION));
2752 return iommu_domain_identity_map(domain, start, end);
2755 static int iommu_prepare_identity_map(struct device *dev,
2756 unsigned long long start,
2757 unsigned long long end)
2759 struct dmar_domain *domain;
2762 domain = get_domain_for_dev(dev, DEFAULT_DOMAIN_ADDRESS_WIDTH);
2766 ret = domain_prepare_identity_map(dev, domain, start, end);
2768 domain_exit(domain);
2773 static inline int iommu_prepare_rmrr_dev(struct dmar_rmrr_unit *rmrr,
2776 if (dev->archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
2778 return iommu_prepare_identity_map(dev, rmrr->base_address,
2782 #ifdef CONFIG_INTEL_IOMMU_FLOPPY_WA
2783 static inline void iommu_prepare_isa(void)
2785 struct pci_dev *pdev;
2788 pdev = pci_get_class(PCI_CLASS_BRIDGE_ISA << 8, NULL);
2792 pr_info("Prepare 0-16MiB unity mapping for LPC\n");
2793 ret = iommu_prepare_identity_map(&pdev->dev, 0, 16*1024*1024 - 1);
2796 pr_err("Failed to create 0-16MiB identity map - floppy might not work\n");
2801 static inline void iommu_prepare_isa(void)
2805 #endif /* !CONFIG_INTEL_IOMMU_FLOPPY_WA */
2807 static int md_domain_init(struct dmar_domain *domain, int guest_width);
2809 static int __init si_domain_init(int hw)
2813 si_domain = alloc_domain(DOMAIN_FLAG_STATIC_IDENTITY);
2817 if (md_domain_init(si_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
2818 domain_exit(si_domain);
2822 pr_debug("Identity mapping domain allocated\n");
2827 for_each_online_node(nid) {
2828 unsigned long start_pfn, end_pfn;
2831 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
2832 ret = iommu_domain_identity_map(si_domain,
2833 PFN_PHYS(start_pfn), PFN_PHYS(end_pfn));
2842 static int identity_mapping(struct device *dev)
2844 struct device_domain_info *info;
2846 if (likely(!iommu_identity_mapping))
2849 info = dev->archdata.iommu;
2850 if (info && info != DUMMY_DEVICE_DOMAIN_INFO)
2851 return (info->domain == si_domain);
2856 static int domain_add_dev_info(struct dmar_domain *domain, struct device *dev)
2858 struct dmar_domain *ndomain;
2859 struct intel_iommu *iommu;
2862 iommu = device_to_iommu(dev, &bus, &devfn);
2866 ndomain = dmar_insert_one_dev_info(iommu, bus, devfn, dev, domain);
2867 if (ndomain != domain)
2873 static bool device_has_rmrr(struct device *dev)
2875 struct dmar_rmrr_unit *rmrr;
2880 for_each_rmrr_units(rmrr) {
2882 * Return TRUE if this RMRR contains the device that
2885 for_each_active_dev_scope(rmrr->devices,
2886 rmrr->devices_cnt, i, tmp)
2897 * There are a couple of cases where we need to restrict the functionality of
2898 * devices associated with RMRRs. The first is when evaluating a device for
2899 * identity mapping because problems exist when devices are moved in and out
2900 * of domains and their respective RMRR information is lost. This means that
2901 * a device with associated RMRRs will never be in a "passthrough" domain.
2902 * The second is use of the device through the IOMMU API. This interface
2903 * expects to have full control of the IOVA space for the device. We cannot
2904 * satisfy both the requirement that RMRR access is maintained and have an
2905 * unencumbered IOVA space. We also have no ability to quiesce the device's
2906 * use of the RMRR space or even inform the IOMMU API user of the restriction.
2907 * We therefore prevent devices associated with an RMRR from participating in
2908 * the IOMMU API, which eliminates them from device assignment.
2910 * In both cases we assume that PCI USB devices with RMRRs have them largely
2911 * for historical reasons and that the RMRR space is not actively used post
2912 * boot. This exclusion may change if vendors begin to abuse it.
2914 * The same exception is made for graphics devices, with the requirement that
2915 * any use of the RMRR regions will be torn down before assigning the device
2918 static bool device_is_rmrr_locked(struct device *dev)
2920 if (!device_has_rmrr(dev))
2923 if (dev_is_pci(dev)) {
2924 struct pci_dev *pdev = to_pci_dev(dev);
2926 if (IS_USB_DEVICE(pdev) || IS_GFX_DEVICE(pdev))
2933 static int iommu_should_identity_map(struct device *dev, int startup)
2935 if (dev_is_pci(dev)) {
2936 struct pci_dev *pdev = to_pci_dev(dev);
2938 if (device_is_rmrr_locked(dev))
2942 * Prevent any device marked as untrusted from getting
2943 * placed into the statically identity mapping domain.
2945 if (pdev->untrusted)
2948 if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev))
2951 if ((iommu_identity_mapping & IDENTMAP_GFX) && IS_GFX_DEVICE(pdev))
2954 if (!(iommu_identity_mapping & IDENTMAP_ALL))
2958 * We want to start off with all devices in the 1:1 domain, and
2959 * take them out later if we find they can't access all of memory.
2961 * However, we can't do this for PCI devices behind bridges,
2962 * because all PCI devices behind the same bridge will end up
2963 * with the same source-id on their transactions.
2965 * Practically speaking, we can't change things around for these
2966 * devices at run-time, because we can't be sure there'll be no
2967 * DMA transactions in flight for any of their siblings.
2969 * So PCI devices (unless they're on the root bus) as well as
2970 * their parent PCI-PCI or PCIe-PCI bridges must be left _out_ of
2971 * the 1:1 domain, just in _case_ one of their siblings turns out
2972 * not to be able to map all of memory.
2974 if (!pci_is_pcie(pdev)) {
2975 if (!pci_is_root_bus(pdev->bus))
2977 if (pdev->class >> 8 == PCI_CLASS_BRIDGE_PCI)
2979 } else if (pci_pcie_type(pdev) == PCI_EXP_TYPE_PCI_BRIDGE)
2982 if (device_has_rmrr(dev))
2987 * At boot time, we don't yet know if devices will be 64-bit capable.
2988 * Assume that they will -- if they turn out not to be, then we can
2989 * take them out of the 1:1 domain later.
2993 * If the device's dma_mask is less than the system's memory
2994 * size then this is not a candidate for identity mapping.
2996 u64 dma_mask = *dev->dma_mask;
2998 if (dev->coherent_dma_mask &&
2999 dev->coherent_dma_mask < dma_mask)
3000 dma_mask = dev->coherent_dma_mask;
3002 return dma_mask >= dma_get_required_mask(dev);
3008 static int __init dev_prepare_static_identity_mapping(struct device *dev, int hw)
3012 if (!iommu_should_identity_map(dev, 1))
3015 ret = domain_add_dev_info(si_domain, dev);
3017 dev_info(dev, "%s identity mapping\n",
3018 hw ? "Hardware" : "Software");
3019 else if (ret == -ENODEV)
3020 /* device not associated with an iommu */
3027 static int __init iommu_prepare_static_identity_mapping(int hw)
3029 struct pci_dev *pdev = NULL;
3030 struct dmar_drhd_unit *drhd;
3031 struct intel_iommu *iommu;
3036 for_each_pci_dev(pdev) {
3037 ret = dev_prepare_static_identity_mapping(&pdev->dev, hw);
3042 for_each_active_iommu(iommu, drhd)
3043 for_each_active_dev_scope(drhd->devices, drhd->devices_cnt, i, dev) {
3044 struct acpi_device_physical_node *pn;
3045 struct acpi_device *adev;
3047 if (dev->bus != &acpi_bus_type)
3050 adev = to_acpi_device(dev);
3051 mutex_lock(&adev->physical_node_lock);
3052 list_for_each_entry(pn, &adev->physical_node_list, node) {
3053 ret = dev_prepare_static_identity_mapping(pn->dev, hw);
3057 mutex_unlock(&adev->physical_node_lock);
3065 static void intel_iommu_init_qi(struct intel_iommu *iommu)
3068 * Start from a sane iommu hardware state.
3069 * If queued invalidation was already initialized by us
3070 * (for example, while enabling interrupt-remapping) then
3071 * things are already rolling from a sane state.
3075 * Clear any previous faults.
3077 dmar_fault(-1, iommu);
3079 * Disable queued invalidation if supported and already enabled
3080 * before OS handover.
3082 dmar_disable_qi(iommu);
3085 if (dmar_enable_qi(iommu)) {
3087 * Queued Invalidate not enabled, use Register Based Invalidate
3089 iommu->flush.flush_context = __iommu_flush_context;
3090 iommu->flush.flush_iotlb = __iommu_flush_iotlb;
3091 pr_info("%s: Using Register based invalidation\n",
3094 iommu->flush.flush_context = qi_flush_context;
3095 iommu->flush.flush_iotlb = qi_flush_iotlb;
3096 pr_info("%s: Using Queued invalidation\n", iommu->name);
3100 static int copy_context_table(struct intel_iommu *iommu,
3101 struct root_entry *old_re,
3102 struct context_entry **tbl,
3105 int tbl_idx, pos = 0, idx, devfn, ret = 0, did;
3106 struct context_entry *new_ce = NULL, ce;
3107 struct context_entry *old_ce = NULL;
3108 struct root_entry re;
3109 phys_addr_t old_ce_phys;
3111 tbl_idx = ext ? bus * 2 : bus;
3112 memcpy(&re, old_re, sizeof(re));
3114 for (devfn = 0; devfn < 256; devfn++) {
3115 /* First calculate the correct index */
3116 idx = (ext ? devfn * 2 : devfn) % 256;
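/*
 * With extended (ext) context tables each entry spans two 16-byte
 * context_entry slots, so devfns 0-127 fill the first 4KiB table and
 * devfns 128-255 wrap (via the % 256) into the following table.
 */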
3119 /* First save what we may have and clean up */
3121 tbl[tbl_idx] = new_ce;
3122 __iommu_flush_cache(iommu, new_ce,
3132 old_ce_phys = root_entry_lctp(&re);
3134 old_ce_phys = root_entry_uctp(&re);
3137 if (ext && devfn == 0) {
3138 /* No LCTP, try UCTP */
3147 old_ce = memremap(old_ce_phys, PAGE_SIZE,
3152 new_ce = alloc_pgtable_page(iommu->node);
3159 /* Now copy the context entry */
3160 memcpy(&ce, old_ce + idx, sizeof(ce));
3162 if (!__context_present(&ce))
3165 did = context_domain_id(&ce);
3166 if (did >= 0 && did < cap_ndoms(iommu->cap))
3167 set_bit(did, iommu->domain_ids);
3170 * We need a marker for copied context entries. This
3171 * marker needs to work for the old format as well as
3172 * for extended context entries.
3174 * Bit 67 of the context entry is used. In the old
3175 * format this bit is available to software, in the
3176 * extended format it is the PGE bit, but PGE is ignored
3177 * by HW if PASIDs are disabled (and thus still
3180 * So disable PASIDs first and then mark the entry
3181 * copied. This means that we don't copy PASID
3182 * translations from the old kernel, but this is fine as
3183 * faults there are not fatal.
3185 context_clear_pasid_enable(&ce);
3186 context_set_copied(&ce);
3191 tbl[tbl_idx + pos] = new_ce;
3193 __iommu_flush_cache(iommu, new_ce, VTD_PAGE_SIZE);
3202 static int copy_translation_tables(struct intel_iommu *iommu)
3204 struct context_entry **ctxt_tbls;
3205 struct root_entry *old_rt;
3206 phys_addr_t old_rt_phys;
3207 int ctxt_table_entries;
3208 unsigned long flags;
3213 rtaddr_reg = dmar_readq(iommu->reg + DMAR_RTADDR_REG);
3214 ext = !!(rtaddr_reg & DMA_RTADDR_RTT);
3215 new_ext = !!ecap_ecs(iommu->ecap);
3218 * The RTT bit can only be changed when translation is disabled,
3219 * but disabling translation means to open a window for data
3220 * corruption. So bail out and don't copy anything if we would
3221 * have to change the bit.
3226 old_rt_phys = rtaddr_reg & VTD_PAGE_MASK;
3230 old_rt = memremap(old_rt_phys, PAGE_SIZE, MEMREMAP_WB);
3234 /* This is too big for the stack - allocate it from slab */
3235 ctxt_table_entries = ext ? 512 : 256;
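/*
 * 256 buses; with extended context entries each bus needs two 4KiB
 * context tables, hence up to 512 table pointers.
 */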
3237 ctxt_tbls = kcalloc(ctxt_table_entries, sizeof(void *), GFP_KERNEL);
3241 for (bus = 0; bus < 256; bus++) {
3242 ret = copy_context_table(iommu, &old_rt[bus],
3243 ctxt_tbls, bus, ext);
3245 pr_err("%s: Failed to copy context table for bus %d\n",
3251 spin_lock_irqsave(&iommu->lock, flags);
3253 /* Context tables are copied, now write them to the root_entry table */
3254 for (bus = 0; bus < 256; bus++) {
3255 int idx = ext ? bus * 2 : bus;
3258 if (ctxt_tbls[idx]) {
3259 val = virt_to_phys(ctxt_tbls[idx]) | 1;
3260 iommu->root_entry[bus].lo = val;
3263 if (!ext || !ctxt_tbls[idx + 1])
3266 val = virt_to_phys(ctxt_tbls[idx + 1]) | 1;
3267 iommu->root_entry[bus].hi = val;
3270 spin_unlock_irqrestore(&iommu->lock, flags);
3274 __iommu_flush_cache(iommu, iommu->root_entry, PAGE_SIZE);
3284 static int __init init_dmars(void)
3286 struct dmar_drhd_unit *drhd;
3287 struct dmar_rmrr_unit *rmrr;
3288 bool copied_tables = false;
3290 struct intel_iommu *iommu;
3296 * initialize and program root entry to not present
3299 for_each_drhd_unit(drhd) {
3301 * lock not needed as this is only incremented in the single
3302 * threaded kernel __init code path; all other accesses are read-only
3305 if (g_num_of_iommus < DMAR_UNITS_SUPPORTED) {
3309 pr_err_once("Exceeded %d IOMMUs\n", DMAR_UNITS_SUPPORTED);
3312 /* Preallocate enough resources for IOMMU hot-addition */
3313 if (g_num_of_iommus < DMAR_UNITS_SUPPORTED)
3314 g_num_of_iommus = DMAR_UNITS_SUPPORTED;
3316 g_iommus = kcalloc(g_num_of_iommus, sizeof(struct intel_iommu *),
3319 pr_err("Allocating global iommu array failed\n");
3324 for_each_active_iommu(iommu, drhd) {
3326 * Find the max pasid size of all IOMMUs in the system.
3327 * We need to ensure the system pasid table is no bigger
3328 * than the smallest supported.
3330 if (pasid_supported(iommu)) {
3331 u32 temp = 2 << ecap_pss(iommu->ecap);
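/*
 * ecap_pss() encodes the supported PASID width minus one, so e.g. a
 * PSS value of 19 allows 2 << 19 == 2^20 PASIDs.
 */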
3333 intel_pasid_max_id = min_t(u32, temp,
3334 intel_pasid_max_id);
3337 g_iommus[iommu->seq_id] = iommu;
3339 intel_iommu_init_qi(iommu);
3341 ret = iommu_init_domains(iommu);
3345 init_translation_status(iommu);
3347 if (translation_pre_enabled(iommu) && !is_kdump_kernel()) {
3348 iommu_disable_translation(iommu);
3349 clear_translation_pre_enabled(iommu);
3350 pr_warn("Translation was enabled for %s but we are not in kdump mode\n",
3356 * we could share the same root & context tables
3357 * among all IOMMUs. Need to split this later.
3359 ret = iommu_alloc_root_entry(iommu);
3363 if (translation_pre_enabled(iommu)) {
3364 pr_info("Translation already enabled - trying to copy translation structures\n");
3366 ret = copy_translation_tables(iommu);
3369 * We found the IOMMU with translation
3370 * enabled - but failed to copy over the
3371 * old root-entry table. Try to proceed
3372 * by disabling translation now and
3373 * allocating a clean root-entry table.
3374 * This might cause DMAR faults, but
3375 * probably the dump will still succeed.
3377 pr_err("Failed to copy translation tables from previous kernel for %s\n",
3379 iommu_disable_translation(iommu);
3380 clear_translation_pre_enabled(iommu);
3382 pr_info("Copied translation tables from previous kernel for %s\n",
3384 copied_tables = true;
3388 if (!ecap_pass_through(iommu->ecap))
3389 hw_pass_through = 0;
3390 #ifdef CONFIG_INTEL_IOMMU_SVM
3391 if (pasid_supported(iommu))
3392 intel_svm_init(iommu);
3397 * Now that qi is enabled on all iommus, set the root entry and flush
3398 * caches. This is required on some Intel X58 chipsets, otherwise the
3399 * flush_context function will loop forever and the boot hangs.
3401 for_each_active_iommu(iommu, drhd) {
3402 iommu_flush_write_buffer(iommu);
3403 iommu_set_root_entry(iommu);
3404 iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
3405 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
3408 if (iommu_pass_through)
3409 iommu_identity_mapping |= IDENTMAP_ALL;
3411 #ifdef CONFIG_INTEL_IOMMU_BROKEN_GFX_WA
3412 iommu_identity_mapping |= IDENTMAP_GFX;
3415 check_tylersburg_isoch();
3417 if (iommu_identity_mapping) {
3418 ret = si_domain_init(hw_pass_through);
3425 * If we copied translations from a previous kernel in the kdump
3426 * case, we cannot assign the devices to domains now, as that
3427 * would eliminate the old mappings. So skip this part and defer
3428 * the assignment to device driver initialization time.
3434 * If pass through is not set or not enabled, setup context entries for
3435 * identity mappings for rmrr, gfx, and isa and may fall back to static
3436 * identity mapping if iommu_identity_mapping is set.
3438 if (iommu_identity_mapping) {
3439 ret = iommu_prepare_static_identity_mapping(hw_pass_through);
3441 pr_crit("Failed to setup IOMMU pass-through\n");
3447 * for each dev attached to rmrr
3449 * locate drhd for dev, alloc domain for dev
3450 * allocate free domain
3451 * allocate page table entries for rmrr
3452 * if context not allocated for bus
3453 * allocate and init context
3454 * set present in root table for this bus
3455 * init context with domain, translation etc
3459 pr_info("Setting RMRR:\n");
3460 for_each_rmrr_units(rmrr) {
3461 /* some BIOSes list non-existent devices in the DMAR table. */
3462 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
3464 ret = iommu_prepare_rmrr_dev(rmrr, dev);
3466 pr_err("Mapping reserved region failed\n");
3470 iommu_prepare_isa();
3477 * global invalidate context cache
3478 * global invalidate iotlb
3479 * enable translation
3481 for_each_iommu(iommu, drhd) {
3482 if (drhd->ignored) {
3484 * we always have to disable PMRs or DMA may fail on
3488 iommu_disable_protect_mem_regions(iommu);
3492 iommu_flush_write_buffer(iommu);
3494 #ifdef CONFIG_INTEL_IOMMU_SVM
3495 if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
3496 ret = intel_svm_enable_prq(iommu);
3501 ret = dmar_set_interrupt(iommu);
3505 if (!translation_pre_enabled(iommu))
3506 iommu_enable_translation(iommu);
3508 iommu_disable_protect_mem_regions(iommu);
3514 for_each_active_iommu(iommu, drhd) {
3515 disable_dmar_iommu(iommu);
3516 free_dmar_iommu(iommu);
3525 /* This takes a number of _MM_ pages, not VTD pages */
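/* On x86, PAGE_SHIFT equals VTD_PAGE_SHIFT, so the dma_to_mm_pfn() /
   mm_to_dma_pfn() conversions used by the callers are identity; they would
   only scale if the CPU page size were larger than the 4KiB VT-d page. */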
3526 static unsigned long intel_alloc_iova(struct device *dev,
3527 struct dmar_domain *domain,
3528 unsigned long nrpages, uint64_t dma_mask)
3530 unsigned long iova_pfn;
3532 /* Restrict dma_mask to the width that the iommu can handle */
3533 dma_mask = min_t(uint64_t, DOMAIN_MAX_ADDR(domain->gaw), dma_mask);
3534 /* Ensure we reserve the whole size-aligned region */
3535 nrpages = __roundup_pow_of_two(nrpages);
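/*
 * e.g. a 3-page request is rounded up to 4 pages so that
 * alloc_iova_fast() returns a size-aligned region.
 */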
3537 if (!dmar_forcedac && dma_mask > DMA_BIT_MASK(32)) {
3539 * First try to allocate an io virtual address in
3540 * DMA_BIT_MASK(32) and if that fails then try allocating
3543 iova_pfn = alloc_iova_fast(&domain->iovad, nrpages,
3544 IOVA_PFN(DMA_BIT_MASK(32)), false);
3548 iova_pfn = alloc_iova_fast(&domain->iovad, nrpages,
3549 IOVA_PFN(dma_mask), true);
3550 if (unlikely(!iova_pfn)) {
3551 dev_err(dev, "Allocating %ld-page iova failed", nrpages);
3558 struct dmar_domain *get_valid_domain_for_dev(struct device *dev)
3560 struct dmar_domain *domain, *tmp;
3561 struct dmar_rmrr_unit *rmrr;
3562 struct device *i_dev;
3565 domain = find_domain(dev);
3569 domain = find_or_alloc_domain(dev, DEFAULT_DOMAIN_ADDRESS_WIDTH);
3573 /* We have a new domain - setup possible RMRRs for the device */
3575 for_each_rmrr_units(rmrr) {
3576 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
3581 ret = domain_prepare_identity_map(dev, domain,
3585 dev_err(dev, "Mapping reserved region failed\n");
3590 tmp = set_domain_for_dev(dev, domain);
3591 if (!tmp || domain != tmp) {
3592 domain_exit(domain);
3599 dev_err(dev, "Allocating domain failed\n");
3605 /* Check if the dev needs to go through non-identity map and unmap process.*/
3606 static int iommu_no_mapping(struct device *dev)
3610 if (iommu_dummy(dev))
3613 if (!iommu_identity_mapping)
3616 found = identity_mapping(dev);
3618 if (iommu_should_identity_map(dev, 0))
3622 * A 32 bit DMA device is removed from si_domain and falls back
3623 * to non-identity mapping.
3625 dmar_remove_one_dev_info(dev);
3626 dev_info(dev, "32bit DMA uses non-identity mapping\n");
3631 * If a 64 bit DMA device was detached from a VM, the device
3632 * is put into si_domain for identity mapping.
3634 if (iommu_should_identity_map(dev, 0)) {
3636 ret = domain_add_dev_info(si_domain, dev);
3638 dev_info(dev, "64bit DMA uses identity mapping\n");
3647 static dma_addr_t __intel_map_single(struct device *dev, phys_addr_t paddr,
3648 size_t size, int dir, u64 dma_mask)
3650 struct dmar_domain *domain;
3651 phys_addr_t start_paddr;
3652 unsigned long iova_pfn;
3655 struct intel_iommu *iommu;
3656 unsigned long paddr_pfn = paddr >> PAGE_SHIFT;
3658 BUG_ON(dir == DMA_NONE);
3660 if (iommu_no_mapping(dev))
3663 domain = get_valid_domain_for_dev(dev);
3665 return DMA_MAPPING_ERROR;
3667 iommu = domain_get_iommu(domain);
3668 size = aligned_nrpages(paddr, size);
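/*
 * aligned_nrpages() counts VT-d pages including the sub-page offset;
 * e.g. 0x2000 bytes starting at page offset 0x234 spans 3 pages.
 */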
3670 iova_pfn = intel_alloc_iova(dev, domain, dma_to_mm_pfn(size), dma_mask);
3675 * Check if DMAR supports zero-length reads on write only
3678 if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL ||
3679 !cap_zlr(iommu->cap))
3680 prot |= DMA_PTE_READ;
3681 if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
3682 prot |= DMA_PTE_WRITE;
3684 * paddr - (paddr + size) might span a partial page, so we should map the whole
3685 * page. Note: if two parts of one page are separately mapped, we
3686 * might have two guest_addr mappings to the same host paddr, but this
3687 * is not a big problem
3689 ret = domain_pfn_mapping(domain, mm_to_dma_pfn(iova_pfn),
3690 mm_to_dma_pfn(paddr_pfn), size, prot);
3694 start_paddr = (phys_addr_t)iova_pfn << PAGE_SHIFT;
3695 start_paddr += paddr & ~PAGE_MASK;
3700 free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(size));
3701 dev_err(dev, "Device request: %zx@%llx dir %d --- failed\n",
3702 size, (unsigned long long)paddr, dir);
3703 return DMA_MAPPING_ERROR;
3706 static dma_addr_t intel_map_page(struct device *dev, struct page *page,
3707 unsigned long offset, size_t size,
3708 enum dma_data_direction dir,
3709 unsigned long attrs)
3711 return __intel_map_single(dev, page_to_phys(page) + offset, size,
3712 dir, *dev->dma_mask);
3715 static dma_addr_t intel_map_resource(struct device *dev, phys_addr_t phys_addr,
3716 size_t size, enum dma_data_direction dir,
3717 unsigned long attrs)
3719 return __intel_map_single(dev, phys_addr, size, dir, *dev->dma_mask);
3722 static void intel_unmap(struct device *dev, dma_addr_t dev_addr, size_t size)
3724 struct dmar_domain *domain;
3725 unsigned long start_pfn, last_pfn;
3726 unsigned long nrpages;
3727 unsigned long iova_pfn;
3728 struct intel_iommu *iommu;
3729 struct page *freelist;
3731 if (iommu_no_mapping(dev))
3734 domain = find_domain(dev);
3737 iommu = domain_get_iommu(domain);
3739 iova_pfn = IOVA_PFN(dev_addr);
3741 nrpages = aligned_nrpages(dev_addr, size);
3742 start_pfn = mm_to_dma_pfn(iova_pfn);
3743 last_pfn = start_pfn + nrpages - 1;
3745 dev_dbg(dev, "Device unmapping: pfn %lx-%lx\n", start_pfn, last_pfn);
3747 freelist = domain_unmap(domain, start_pfn, last_pfn);
3749 if (intel_iommu_strict) {
3750 iommu_flush_iotlb_psi(iommu, domain, start_pfn,
3751 nrpages, !freelist, 0);
3753 free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(nrpages));
3754 dma_free_pagelist(freelist);
3756 queue_iova(&domain->iovad, iova_pfn, nrpages,
3757 (unsigned long)freelist);
3759 * queue up the release of the unmap to save roughly 1/6th of the
3760 * cpu time used up by the iotlb flush operation...
3765 static void intel_unmap_page(struct device *dev, dma_addr_t dev_addr,
3766 size_t size, enum dma_data_direction dir,
3767 unsigned long attrs)
3769 intel_unmap(dev, dev_addr, size);
3772 static void *intel_alloc_coherent(struct device *dev, size_t size,
3773 dma_addr_t *dma_handle, gfp_t flags,
3774 unsigned long attrs)
3776 struct page *page = NULL;
3779 size = PAGE_ALIGN(size);
3780 order = get_order(size);
3782 if (!iommu_no_mapping(dev))
3783 flags &= ~(GFP_DMA | GFP_DMA32);
3784 else if (dev->coherent_dma_mask < dma_get_required_mask(dev)) {
3785 if (dev->coherent_dma_mask < DMA_BIT_MASK(32))
3791 if (gfpflags_allow_blocking(flags)) {
3792 unsigned int count = size >> PAGE_SHIFT;
3794 page = dma_alloc_from_contiguous(dev, count, order,
3795 flags & __GFP_NOWARN);
3796 if (page && iommu_no_mapping(dev) &&
3797 page_to_phys(page) + size > dev->coherent_dma_mask) {
3798 dma_release_from_contiguous(dev, page, count);
3804 page = alloc_pages(flags, order);
3807 memset(page_address(page), 0, size);
3809 *dma_handle = __intel_map_single(dev, page_to_phys(page), size,
3811 dev->coherent_dma_mask);
3812 if (*dma_handle != DMA_MAPPING_ERROR)
3813 return page_address(page);
3814 if (!dma_release_from_contiguous(dev, page, size >> PAGE_SHIFT))
3815 __free_pages(page, order);
3820 static void intel_free_coherent(struct device *dev, size_t size, void *vaddr,
3821 dma_addr_t dma_handle, unsigned long attrs)
3824 struct page *page = virt_to_page(vaddr);
3826 size = PAGE_ALIGN(size);
3827 order = get_order(size);
3829 intel_unmap(dev, dma_handle, size);
3830 if (!dma_release_from_contiguous(dev, page, size >> PAGE_SHIFT))
3831 __free_pages(page, order);
3834 static void intel_unmap_sg(struct device *dev, struct scatterlist *sglist,
3835 int nelems, enum dma_data_direction dir,
3836 unsigned long attrs)
3838 dma_addr_t startaddr = sg_dma_address(sglist) & PAGE_MASK;
3839 unsigned long nrpages = 0;
3840 struct scatterlist *sg;
3843 for_each_sg(sglist, sg, nelems, i) {
3844 nrpages += aligned_nrpages(sg_dma_address(sg), sg_dma_len(sg));
3847 intel_unmap(dev, startaddr, nrpages << VTD_PAGE_SHIFT);
3850 static int intel_nontranslate_map_sg(struct device *hddev,
3851 struct scatterlist *sglist, int nelems, int dir)
3854 struct scatterlist *sg;
3856 for_each_sg(sglist, sg, nelems, i) {
3857 BUG_ON(!sg_page(sg));
3858 sg->dma_address = sg_phys(sg);
3859 sg->dma_length = sg->length;
3864 static int intel_map_sg(struct device *dev, struct scatterlist *sglist, int nelems,
3865 enum dma_data_direction dir, unsigned long attrs)
3868 struct dmar_domain *domain;
3871 unsigned long iova_pfn;
3873 struct scatterlist *sg;
3874 unsigned long start_vpfn;
3875 struct intel_iommu *iommu;
3877 BUG_ON(dir == DMA_NONE);
3878 if (iommu_no_mapping(dev))
3879 return intel_nontranslate_map_sg(dev, sglist, nelems, dir);
3881 domain = get_valid_domain_for_dev(dev);
3885 iommu = domain_get_iommu(domain);
3887 for_each_sg(sglist, sg, nelems, i)
3888 size += aligned_nrpages(sg->offset, sg->length);
3890 iova_pfn = intel_alloc_iova(dev, domain, dma_to_mm_pfn(size),
3893 sglist->dma_length = 0;
3898 * Check if DMAR supports zero-length reads on write only
3901 if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL ||
3902 !cap_zlr(iommu->cap))
3903 prot |= DMA_PTE_READ;
3904 if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
3905 prot |= DMA_PTE_WRITE;
3907 start_vpfn = mm_to_dma_pfn(iova_pfn);
3909 ret = domain_sg_mapping(domain, start_vpfn, sglist, size, prot);
3910 if (unlikely(ret)) {
3911 dma_pte_free_pagetable(domain, start_vpfn,
3912 start_vpfn + size - 1,
3913 agaw_to_level(domain->agaw) + 1);
3914 free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(size));
3921 static const struct dma_map_ops intel_dma_ops = {
3922 .alloc = intel_alloc_coherent,
3923 .free = intel_free_coherent,
3924 .map_sg = intel_map_sg,
3925 .unmap_sg = intel_unmap_sg,
3926 .map_page = intel_map_page,
3927 .unmap_page = intel_unmap_page,
3928 .map_resource = intel_map_resource,
3929 .unmap_resource = intel_unmap_page,
3930 .dma_supported = dma_direct_supported,
3933 static inline int iommu_domain_cache_init(void)
3937 iommu_domain_cache = kmem_cache_create("iommu_domain",
3938 sizeof(struct dmar_domain),
3943 if (!iommu_domain_cache) {
3944 pr_err("Couldn't create iommu_domain cache\n");
3951 static inline int iommu_devinfo_cache_init(void)
3955 iommu_devinfo_cache = kmem_cache_create("iommu_devinfo",
3956 sizeof(struct device_domain_info),
3960 if (!iommu_devinfo_cache) {
3961 pr_err("Couldn't create devinfo cache\n");
3968 static int __init iommu_init_mempool(void)
3971 ret = iova_cache_get();
3975 ret = iommu_domain_cache_init();
3979 ret = iommu_devinfo_cache_init();
3983 kmem_cache_destroy(iommu_domain_cache);
3990 static void __init iommu_exit_mempool(void)
3992 kmem_cache_destroy(iommu_devinfo_cache);
3993 kmem_cache_destroy(iommu_domain_cache);
3997 static void quirk_ioat_snb_local_iommu(struct pci_dev *pdev)
3999 struct dmar_drhd_unit *drhd;
4003 /* We know that this device on this chipset has its own IOMMU.
4004 * If we find it under a different IOMMU, then the BIOS is lying
4005 * to us. Hope that the IOMMU for this device is actually
4006 * disabled, and it needs no translation...
4008 rc = pci_bus_read_config_dword(pdev->bus, PCI_DEVFN(0, 0), 0xb0, &vtbar);
4010 /* "can't" happen */
4011 dev_info(&pdev->dev, "failed to run vt-d quirk\n");
4014 vtbar &= 0xffff0000;
4016 /* we know that this iommu should be at offset 0xa000 from vtbar */
4017 drhd = dmar_find_matched_drhd_unit(pdev);
4018 if (WARN_TAINT_ONCE(!drhd || drhd->reg_base_addr - vtbar != 0xa000,
4019 TAINT_FIRMWARE_WORKAROUND,
4020 "BIOS assigned incorrect VT-d unit for Intel(R) QuickData Technology device\n"))
4021 pdev->dev.archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
4023 DECLARE_PCI_FIXUP_ENABLE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IOAT_SNB, quirk_ioat_snb_local_iommu);
4025 static void __init init_no_remapping_devices(void)
4027 struct dmar_drhd_unit *drhd;
4031 for_each_drhd_unit(drhd) {
4032 if (!drhd->include_all) {
4033 for_each_active_dev_scope(drhd->devices,
4034 drhd->devices_cnt, i, dev)
4036 /* ignore DMAR unit if no devices exist */
4037 if (i == drhd->devices_cnt)
4042 for_each_active_drhd_unit(drhd) {
4043 if (drhd->include_all)
4046 for_each_active_dev_scope(drhd->devices,
4047 drhd->devices_cnt, i, dev)
4048 if (!dev_is_pci(dev) || !IS_GFX_DEVICE(to_pci_dev(dev)))
4050 if (i < drhd->devices_cnt)
4053 /* This IOMMU has *only* gfx devices. Either bypass it or
4054 set the gfx_mapped flag, as appropriate */
4056 intel_iommu_gfx_mapped = 1;
4059 for_each_active_dev_scope(drhd->devices,
4060 drhd->devices_cnt, i, dev)
4061 dev->archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
4066 #ifdef CONFIG_SUSPEND
4067 static int init_iommu_hw(void)
4069 struct dmar_drhd_unit *drhd;
4070 struct intel_iommu *iommu = NULL;
4072 for_each_active_iommu(iommu, drhd)
4074 dmar_reenable_qi(iommu);
4076 for_each_iommu(iommu, drhd) {
4077 if (drhd->ignored) {
4079 * we always have to disable PMRs or DMA may fail on
4083 iommu_disable_protect_mem_regions(iommu);
4087 iommu_flush_write_buffer(iommu);
4089 iommu_set_root_entry(iommu);
4091 iommu->flush.flush_context(iommu, 0, 0, 0,
4092 DMA_CCMD_GLOBAL_INVL);
4093 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
4094 iommu_enable_translation(iommu);
4095 iommu_disable_protect_mem_regions(iommu);
4101 static void iommu_flush_all(void)
4103 struct dmar_drhd_unit *drhd;
4104 struct intel_iommu *iommu;
4106 for_each_active_iommu(iommu, drhd) {
4107 iommu->flush.flush_context(iommu, 0, 0, 0,
4108 DMA_CCMD_GLOBAL_INVL);
4109 iommu->flush.flush_iotlb(iommu, 0, 0, 0,
4110 DMA_TLB_GLOBAL_FLUSH);
4114 static int iommu_suspend(void)
4116 struct dmar_drhd_unit *drhd;
4117 struct intel_iommu *iommu = NULL;
4120 for_each_active_iommu(iommu, drhd) {
4121 iommu->iommu_state = kcalloc(MAX_SR_DMAR_REGS, sizeof(u32),
4123 if (!iommu->iommu_state)
4129 for_each_active_iommu(iommu, drhd) {
4130 iommu_disable_translation(iommu);
4132 raw_spin_lock_irqsave(&iommu->register_lock, flag);
4134 iommu->iommu_state[SR_DMAR_FECTL_REG] =
4135 readl(iommu->reg + DMAR_FECTL_REG);
4136 iommu->iommu_state[SR_DMAR_FEDATA_REG] =
4137 readl(iommu->reg + DMAR_FEDATA_REG);
4138 iommu->iommu_state[SR_DMAR_FEADDR_REG] =
4139 readl(iommu->reg + DMAR_FEADDR_REG);
4140 iommu->iommu_state[SR_DMAR_FEUADDR_REG] =
4141 readl(iommu->reg + DMAR_FEUADDR_REG);
4143 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
4148 for_each_active_iommu(iommu, drhd)
4149 kfree(iommu->iommu_state);
4154 static void iommu_resume(void)
4156 struct dmar_drhd_unit *drhd;
4157 struct intel_iommu *iommu = NULL;
4160 if (init_iommu_hw()) {
4162 panic("tboot: IOMMU setup failed, DMAR can not resume!\n");
4164 WARN(1, "IOMMU setup failed, DMAR can not resume!\n");
4168 for_each_active_iommu(iommu, drhd) {
4170 raw_spin_lock_irqsave(&iommu->register_lock, flag);
4172 writel(iommu->iommu_state[SR_DMAR_FECTL_REG],
4173 iommu->reg + DMAR_FECTL_REG);
4174 writel(iommu->iommu_state[SR_DMAR_FEDATA_REG],
4175 iommu->reg + DMAR_FEDATA_REG);
4176 writel(iommu->iommu_state[SR_DMAR_FEADDR_REG],
4177 iommu->reg + DMAR_FEADDR_REG);
4178 writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG],
4179 iommu->reg + DMAR_FEUADDR_REG);
4181 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
4184 for_each_active_iommu(iommu, drhd)
4185 kfree(iommu->iommu_state);
4188 static struct syscore_ops iommu_syscore_ops = {
4189 .resume = iommu_resume,
4190 .suspend = iommu_suspend,
4193 static void __init init_iommu_pm_ops(void)
4195 register_syscore_ops(&iommu_syscore_ops);
4199 static inline void init_iommu_pm_ops(void) {}
4200 #endif /* CONFIG_SUSPEND */
4203 int __init dmar_parse_one_rmrr(struct acpi_dmar_header *header, void *arg)
4205 struct acpi_dmar_reserved_memory *rmrr;
4206 int prot = DMA_PTE_READ|DMA_PTE_WRITE;
4207 struct dmar_rmrr_unit *rmrru;
4210 rmrru = kzalloc(sizeof(*rmrru), GFP_KERNEL);
4214 rmrru->hdr = header;
4215 rmrr = (struct acpi_dmar_reserved_memory *)header;
4216 rmrru->base_address = rmrr->base_address;
4217 rmrru->end_address = rmrr->end_address;
4219 length = rmrr->end_address - rmrr->base_address + 1;
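/* end_address is inclusive, hence the +1 when computing the length. */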
4220 rmrru->resv = iommu_alloc_resv_region(rmrr->base_address, length, prot,
4225 rmrru->devices = dmar_alloc_dev_scope((void *)(rmrr + 1),
4226 ((void *)rmrr) + rmrr->header.length,
4227 &rmrru->devices_cnt);
4228 if (rmrru->devices_cnt && rmrru->devices == NULL)
4231 list_add(&rmrru->list, &dmar_rmrr_units);
4242 static struct dmar_atsr_unit *dmar_find_atsr(struct acpi_dmar_atsr *atsr)
4244 struct dmar_atsr_unit *atsru;
4245 struct acpi_dmar_atsr *tmp;
4247 list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) {
4248 tmp = (struct acpi_dmar_atsr *)atsru->hdr;
4249 if (atsr->segment != tmp->segment)
4251 if (atsr->header.length != tmp->header.length)
4253 if (memcmp(atsr, tmp, atsr->header.length) == 0)
4260 int dmar_parse_one_atsr(struct acpi_dmar_header *hdr, void *arg)
4262 struct acpi_dmar_atsr *atsr;
4263 struct dmar_atsr_unit *atsru;
4265 if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled)
4268 atsr = container_of(hdr, struct acpi_dmar_atsr, header);
4269 atsru = dmar_find_atsr(atsr);
4273 atsru = kzalloc(sizeof(*atsru) + hdr->length, GFP_KERNEL);
4278 * If memory is allocated from slab by ACPI _DSM method, we need to
4279 * copy the memory content because the memory buffer will be freed
4282 atsru->hdr = (void *)(atsru + 1);
4283 memcpy(atsru->hdr, hdr, hdr->length);
4284 atsru->include_all = atsr->flags & 0x1;
4285 if (!atsru->include_all) {
4286 atsru->devices = dmar_alloc_dev_scope((void *)(atsr + 1),
4287 (void *)atsr + atsr->header.length,
4288 &atsru->devices_cnt);
4289 if (atsru->devices_cnt && atsru->devices == NULL) {
4295 list_add_rcu(&atsru->list, &dmar_atsr_units);
4300 static void intel_iommu_free_atsr(struct dmar_atsr_unit *atsru)
4302 dmar_free_dev_scope(&atsru->devices, &atsru->devices_cnt);
4306 int dmar_release_one_atsr(struct acpi_dmar_header *hdr, void *arg)
4308 struct acpi_dmar_atsr *atsr;
4309 struct dmar_atsr_unit *atsru;
4311 atsr = container_of(hdr, struct acpi_dmar_atsr, header);
4312 atsru = dmar_find_atsr(atsr);
4314 list_del_rcu(&atsru->list);
4316 intel_iommu_free_atsr(atsru);
4322 int dmar_check_one_atsr(struct acpi_dmar_header *hdr, void *arg)
4326 struct acpi_dmar_atsr *atsr;
4327 struct dmar_atsr_unit *atsru;
4329 atsr = container_of(hdr, struct acpi_dmar_atsr, header);
4330 atsru = dmar_find_atsr(atsr);
4334 if (!atsru->include_all && atsru->devices && atsru->devices_cnt) {
4335 for_each_active_dev_scope(atsru->devices, atsru->devices_cnt,
4343 static int intel_iommu_add(struct dmar_drhd_unit *dmaru)
4346 struct intel_iommu *iommu = dmaru->iommu;
4348 if (g_iommus[iommu->seq_id])
4351 if (hw_pass_through && !ecap_pass_through(iommu->ecap)) {
4352 pr_warn("%s: Doesn't support hardware pass through.\n",
4356 if (!ecap_sc_support(iommu->ecap) &&
4357 domain_update_iommu_snooping(iommu)) {
4358 pr_warn("%s: Doesn't support snooping.\n",
4362 sp = domain_update_iommu_superpage(iommu) - 1;
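/*
 * sp indexes the SLLPS capability bits: sp == 0 requires 2MiB and
 * sp == 1 requires 1GiB superpage support from this IOMMU.
 */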
4363 if (sp >= 0 && !(cap_super_page_val(iommu->cap) & (1 << sp))) {
4364 pr_warn("%s: Doesn't support large page.\n",
4370 * Disable translation if already enabled prior to OS handover.
4372 if (iommu->gcmd & DMA_GCMD_TE)
4373 iommu_disable_translation(iommu);
4375 g_iommus[iommu->seq_id] = iommu;
4376 ret = iommu_init_domains(iommu);
4378 ret = iommu_alloc_root_entry(iommu);
4382 #ifdef CONFIG_INTEL_IOMMU_SVM
4383 if (pasid_supported(iommu))
4384 intel_svm_init(iommu);
4387 if (dmaru->ignored) {
4389 * we always have to disable PMRs or DMA may fail on this device
4392 iommu_disable_protect_mem_regions(iommu);
4396 intel_iommu_init_qi(iommu);
4397 iommu_flush_write_buffer(iommu);
4399 #ifdef CONFIG_INTEL_IOMMU_SVM
4400 if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
4401 ret = intel_svm_enable_prq(iommu);
4406 ret = dmar_set_interrupt(iommu);
4410 iommu_set_root_entry(iommu);
4411 iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
4412 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
4413 iommu_enable_translation(iommu);
4415 iommu_disable_protect_mem_regions(iommu);
4419 disable_dmar_iommu(iommu);
4421 free_dmar_iommu(iommu);
4425 int dmar_iommu_hotplug(struct dmar_drhd_unit *dmaru, bool insert)
4428 struct intel_iommu *iommu = dmaru->iommu;
4430 if (!intel_iommu_enabled)
4436 ret = intel_iommu_add(dmaru);
4438 disable_dmar_iommu(iommu);
4439 free_dmar_iommu(iommu);
4445 static void intel_iommu_free_dmars(void)
4447 struct dmar_rmrr_unit *rmrru, *rmrr_n;
4448 struct dmar_atsr_unit *atsru, *atsr_n;
4450 list_for_each_entry_safe(rmrru, rmrr_n, &dmar_rmrr_units, list) {
4451 list_del(&rmrru->list);
4452 dmar_free_dev_scope(&rmrru->devices, &rmrru->devices_cnt);
4457 list_for_each_entry_safe(atsru, atsr_n, &dmar_atsr_units, list) {
4458 list_del(&atsru->list);
4459 intel_iommu_free_atsr(atsru);
4463 int dmar_find_matched_atsr_unit(struct pci_dev *dev)
4466 struct pci_bus *bus;
4467 struct pci_dev *bridge = NULL;
4469 struct acpi_dmar_atsr *atsr;
4470 struct dmar_atsr_unit *atsru;
4472 dev = pci_physfn(dev);
4473 for (bus = dev->bus; bus; bus = bus->parent) {
4475 /* If it's an integrated device, allow ATS */
4478 /* Connected via non-PCIe: no ATS */
4479 if (!pci_is_pcie(bridge) ||
4480 pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE)
4482 /* If we found the root port, look it up in the ATSR */
4483 if (pci_pcie_type(bridge) == PCI_EXP_TYPE_ROOT_PORT)
4488 list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) {
4489 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
4490 if (atsr->segment != pci_domain_nr(dev->bus))
4493 for_each_dev_scope(atsru->devices, atsru->devices_cnt, i, tmp)
4494 if (tmp == &bridge->dev)
4497 if (atsru->include_all)
4507 int dmar_iommu_notify_scope_dev(struct dmar_pci_notify_info *info)
4510 struct dmar_rmrr_unit *rmrru;
4511 struct dmar_atsr_unit *atsru;
4512 struct acpi_dmar_atsr *atsr;
4513 struct acpi_dmar_reserved_memory *rmrr;
4515 if (!intel_iommu_enabled && system_state >= SYSTEM_RUNNING)
4518 list_for_each_entry(rmrru, &dmar_rmrr_units, list) {
4519 rmrr = container_of(rmrru->hdr,
4520 struct acpi_dmar_reserved_memory, header);
4521 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
4522 ret = dmar_insert_dev_scope(info, (void *)(rmrr + 1),
4523 ((void *)rmrr) + rmrr->header.length,
4524 rmrr->segment, rmrru->devices,
4525 rmrru->devices_cnt);
4528 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
4529 dmar_remove_dev_scope(info, rmrr->segment,
4530 rmrru->devices, rmrru->devices_cnt);
4534 list_for_each_entry(atsru, &dmar_atsr_units, list) {
4535 if (atsru->include_all)
4538 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
4539 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
4540 ret = dmar_insert_dev_scope(info, (void *)(atsr + 1),
4541 (void *)atsr + atsr->header.length,
4542 atsr->segment, atsru->devices,
4543 atsru->devices_cnt);
4548 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
4549 if (dmar_remove_dev_scope(info, atsr->segment,
4550 atsru->devices, atsru->devices_cnt))
4559 * Here we only respond to the action of unbinding a device from its driver.
4561 * A newly added device is not attached to its DMAR domain here yet. That will
4562 * happen when mapping the device to an iova.
4564 static int device_notifier(struct notifier_block *nb,
4565 unsigned long action, void *data)
4567 struct device *dev = data;
4568 struct dmar_domain *domain;
4570 if (iommu_dummy(dev))
4573 if (action == BUS_NOTIFY_REMOVED_DEVICE) {
4574 domain = find_domain(dev);
4578 dmar_remove_one_dev_info(dev);
4579 if (!domain_type_is_vm_or_si(domain) &&
4580 list_empty(&domain->devices))
4581 domain_exit(domain);
4582 } else if (action == BUS_NOTIFY_ADD_DEVICE) {
4583 if (iommu_should_identity_map(dev, 1))
4584 domain_add_dev_info(si_domain, dev);
4590 static struct notifier_block device_nb = {
4591 .notifier_call = device_notifier,
4594 static int intel_iommu_memory_notifier(struct notifier_block *nb,
4595 unsigned long val, void *v)
4597 struct memory_notify *mhp = v;
4598 unsigned long long start, end;
4599 unsigned long start_vpfn, last_vpfn;
4602 case MEM_GOING_ONLINE:
4603 start = mhp->start_pfn << PAGE_SHIFT;
4604 end = ((mhp->start_pfn + mhp->nr_pages) << PAGE_SHIFT) - 1;
4605 if (iommu_domain_identity_map(si_domain, start, end)) {
4606 pr_warn("Failed to build identity map for [%llx-%llx]\n",
4613 case MEM_CANCEL_ONLINE:
4614 start_vpfn = mm_to_dma_pfn(mhp->start_pfn);
4615 last_vpfn = mm_to_dma_pfn(mhp->start_pfn + mhp->nr_pages - 1);
4616 while (start_vpfn <= last_vpfn) {
4618 struct dmar_drhd_unit *drhd;
4619 struct intel_iommu *iommu;
4620 struct page *freelist;
4622 iova = find_iova(&si_domain->iovad, start_vpfn);
4624 pr_debug("Failed get IOVA for PFN %lx\n",
4629 iova = split_and_remove_iova(&si_domain->iovad, iova,
4630 start_vpfn, last_vpfn);
4632 pr_warn("Failed to split IOVA PFN [%lx-%lx]\n",
4633 start_vpfn, last_vpfn);
4637 freelist = domain_unmap(si_domain, iova->pfn_lo,
4641 for_each_active_iommu(iommu, drhd)
4642 iommu_flush_iotlb_psi(iommu, si_domain,
4643 iova->pfn_lo, iova_size(iova),
4646 dma_free_pagelist(freelist);
4648 start_vpfn = iova->pfn_hi + 1;
4649 free_iova_mem(iova);
4657 static struct notifier_block intel_iommu_memory_nb = {
4658 .notifier_call = intel_iommu_memory_notifier,
4662 static void free_all_cpu_cached_iovas(unsigned int cpu)
4666 for (i = 0; i < g_num_of_iommus; i++) {
4667 struct intel_iommu *iommu = g_iommus[i];
4668 struct dmar_domain *domain;
4674 for (did = 0; did < cap_ndoms(iommu->cap); did++) {
4675 domain = get_iommu_domain(iommu, (u16)did);
4679 free_cpu_cached_iovas(cpu, &domain->iovad);
4684 static int intel_iommu_cpu_dead(unsigned int cpu)
4686 free_all_cpu_cached_iovas(cpu);
4690 static void intel_disable_iommus(void)
4692 struct intel_iommu *iommu = NULL;
4693 struct dmar_drhd_unit *drhd;
4695 for_each_iommu(iommu, drhd)
4696 iommu_disable_translation(iommu);
4699 static inline struct intel_iommu *dev_to_intel_iommu(struct device *dev)
4701 struct iommu_device *iommu_dev = dev_to_iommu_device(dev);
4703 return container_of(iommu_dev, struct intel_iommu, iommu);
4706 static ssize_t intel_iommu_show_version(struct device *dev,
4707 struct device_attribute *attr,
4710 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4711 u32 ver = readl(iommu->reg + DMAR_VER_REG);
4712 return sprintf(buf, "%d:%d\n",
4713 DMAR_VER_MAJOR(ver), DMAR_VER_MINOR(ver));
4715 static DEVICE_ATTR(version, S_IRUGO, intel_iommu_show_version, NULL);
4717 static ssize_t intel_iommu_show_address(struct device *dev,
4718 struct device_attribute *attr,
4721 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4722 return sprintf(buf, "%llx\n", iommu->reg_phys);
4724 static DEVICE_ATTR(address, S_IRUGO, intel_iommu_show_address, NULL);
4726 static ssize_t intel_iommu_show_cap(struct device *dev,
4727 struct device_attribute *attr,
4730 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4731 return sprintf(buf, "%llx\n", iommu->cap);
4733 static DEVICE_ATTR(cap, S_IRUGO, intel_iommu_show_cap, NULL);
4735 static ssize_t intel_iommu_show_ecap(struct device *dev,
4736 struct device_attribute *attr,
4739 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4740 return sprintf(buf, "%llx\n", iommu->ecap);
4742 static DEVICE_ATTR(ecap, S_IRUGO, intel_iommu_show_ecap, NULL);
4744 static ssize_t intel_iommu_show_ndoms(struct device *dev,
4745 struct device_attribute *attr,
4748 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4749 return sprintf(buf, "%ld\n", cap_ndoms(iommu->cap));
4751 static DEVICE_ATTR(domains_supported, S_IRUGO, intel_iommu_show_ndoms, NULL);
4753 static ssize_t intel_iommu_show_ndoms_used(struct device *dev,
4754 struct device_attribute *attr,
4757 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4758 return sprintf(buf, "%d\n", bitmap_weight(iommu->domain_ids,
4759 cap_ndoms(iommu->cap)));
4761 static DEVICE_ATTR(domains_used, S_IRUGO, intel_iommu_show_ndoms_used, NULL);
4763 static struct attribute *intel_iommu_attrs[] = {
4764 &dev_attr_version.attr,
4765 &dev_attr_address.attr,
4767 &dev_attr_ecap.attr,
4768 &dev_attr_domains_supported.attr,
4769 &dev_attr_domains_used.attr,
4773 static struct attribute_group intel_iommu_group = {
4774 .name = "intel-iommu",
4775 .attrs = intel_iommu_attrs,
4778 const struct attribute_group *intel_iommu_groups[] = {
4783 static int __init platform_optin_force_iommu(void)
4785 struct pci_dev *pdev = NULL;
4786 bool has_untrusted_dev = false;
4788 if (!dmar_platform_optin() || no_platform_optin)
4791 for_each_pci_dev(pdev) {
4792 if (pdev->untrusted) {
4793 has_untrusted_dev = true;
4798 if (!has_untrusted_dev)
4801 if (no_iommu || dmar_disabled)
4802 pr_info("Intel-IOMMU force enabled due to platform opt in\n");
4805 * If Intel-IOMMU is disabled by default, we will apply identity
4806 * map for all devices except those marked as being untrusted.
4809 iommu_identity_mapping |= IDENTMAP_ALL;
4812 #if defined(CONFIG_X86) && defined(CONFIG_SWIOTLB)
4820 int __init intel_iommu_init(void)
4823 struct dmar_drhd_unit *drhd;
4824 struct intel_iommu *iommu;
4827 * Intel IOMMU is required for a TXT/tboot launch or platform
4828 * opt in, so enforce that.
4830 force_on = tboot_force_iommu() || platform_optin_force_iommu();
4832 if (iommu_init_mempool()) {
4834 panic("tboot: Failed to initialize iommu memory\n");
4838 down_write(&dmar_global_lock);
4839 if (dmar_table_init()) {
4841 panic("tboot: Failed to initialize DMAR table\n");
4845 if (dmar_dev_scope_init() < 0) {
4847 panic("tboot: Failed to initialize DMAR device scope\n");
4851 up_write(&dmar_global_lock);
4854 * The bus notifier takes the dmar_global_lock, so lockdep will
4855 * complain later when we register it under the lock.
4857 dmar_register_bus_notifier();
4859 down_write(&dmar_global_lock);
4861 if (no_iommu || dmar_disabled) {
4863 * We exit the function here to ensure IOMMU's remapping and
4864 * mempool aren't setup, which means that the IOMMU's PMRs
4865 * won't be disabled via the call to init_dmars(). So disable
4866 * it explicitly here. The PMRs were setup by tboot prior to
4867 * calling SENTER, but the kernel is expected to reset/tear
4870 if (intel_iommu_tboot_noforce) {
4871 for_each_iommu(iommu, drhd)
4872 iommu_disable_protect_mem_regions(iommu);
4876 * Make sure the IOMMUs are switched off, even when we
4877 * boot into a kexec kernel and the previous kernel left
4880 intel_disable_iommus();
4884 if (list_empty(&dmar_rmrr_units))
4885 pr_info("No RMRR found\n");
4887 if (list_empty(&dmar_atsr_units))
4888 pr_info("No ATSR found\n");
4890 if (dmar_init_reserved_ranges()) {
4892 panic("tboot: Failed to reserve iommu ranges\n");
4893 goto out_free_reserved_range;
4896 init_no_remapping_devices();
4901 panic("tboot: Failed to initialize DMARs\n");
4902 pr_err("Initialization failed\n");
4903 goto out_free_reserved_range;
4905 up_write(&dmar_global_lock);
4906 pr_info("Intel(R) Virtualization Technology for Directed I/O\n");
4908 #if defined(CONFIG_X86) && defined(CONFIG_SWIOTLB)
4911 dma_ops = &intel_dma_ops;
4913 init_iommu_pm_ops();
4915 for_each_active_iommu(iommu, drhd) {
4916 iommu_device_sysfs_add(&iommu->iommu, NULL,
4919 iommu_device_set_ops(&iommu->iommu, &intel_iommu_ops);
4920 iommu_device_register(&iommu->iommu);
4923 bus_set_iommu(&pci_bus_type, &intel_iommu_ops);
4924 bus_register_notifier(&pci_bus_type, &device_nb);
4925 if (si_domain && !hw_pass_through)
4926 register_memory_notifier(&intel_iommu_memory_nb);
4927 cpuhp_setup_state(CPUHP_IOMMU_INTEL_DEAD, "iommu/intel:dead", NULL,
4928 intel_iommu_cpu_dead);
4929 intel_iommu_enabled = 1;
4930 intel_iommu_debugfs_init();
4934 out_free_reserved_range:
4935 put_iova_domain(&reserved_iova_list);
4937 intel_iommu_free_dmars();
4938 up_write(&dmar_global_lock);
4939 iommu_exit_mempool();
4943 static int domain_context_clear_one_cb(struct pci_dev *pdev, u16 alias, void *opaque)
4945 struct intel_iommu *iommu = opaque;
4947 domain_context_clear_one(iommu, PCI_BUS_NUM(alias), alias & 0xff);
4952 * NB - intel-iommu lacks any sort of reference counting for the users of
4953 * dependent devices. If multiple endpoints have intersecting dependent
4954 * devices, unbinding the driver from any one of them will possibly leave
4955 * the others unable to operate.
4957 static void domain_context_clear(struct intel_iommu *iommu, struct device *dev)
4959 if (!iommu || !dev || !dev_is_pci(dev))
4962 pci_for_each_dma_alias(to_pci_dev(dev), &domain_context_clear_one_cb, iommu);
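/*
 * Tear down all translation state for one device: its PASID entries (on
 * scalable-mode hardware), device-IOTLB, context entries and PASID
 * table, then detach it from its domain. Caller must hold
 * device_domain_lock.
 */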
4965 static void __dmar_remove_one_dev_info(struct device_domain_info *info)
4967 struct intel_iommu *iommu;
4968 unsigned long flags;
4970 assert_spin_locked(&device_domain_lock);
4975 iommu = info->iommu;
4978 if (dev_is_pci(info->dev) && sm_supported(iommu))
4979 intel_pasid_tear_down_entry(iommu, info->dev,
4982 iommu_disable_dev_iotlb(info);
4983 domain_context_clear(iommu, info->dev);
4984 intel_pasid_free_table(info->dev);
4987 unlink_domain_info(info);
4989 spin_lock_irqsave(&iommu->lock, flags);
4990 domain_detach_iommu(info->domain, iommu);
4991 spin_unlock_irqrestore(&iommu->lock, flags);
4993 free_devinfo_mem(info);
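/* Locked wrapper around __dmar_remove_one_dev_info(). */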
4996 static void dmar_remove_one_dev_info(struct device *dev)
4998 struct device_domain_info *info;
4999 unsigned long flags;
5001 spin_lock_irqsave(&device_domain_lock, flags);
5002 info = dev->archdata.iommu;
5003 __dmar_remove_one_dev_info(info);
5004 spin_unlock_irqrestore(&device_domain_lock, flags);
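/*
 * Set up a domain allocated through the IOMMU API: initialise its IOVA
 * allocator and reserved ranges, derive the adjusted address width
 * (AGAW) from the requested guest width, and allocate the top-level
 * page table.
 */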
5007 static int md_domain_init(struct dmar_domain *domain, int guest_width)
5011 init_iova_domain(&domain->iovad, VTD_PAGE_SIZE, IOVA_START_PFN);
5012 domain_reserve_special_ranges(domain);
5014 /* calculate AGAW */
5015 domain->gaw = guest_width;
5016 adjust_width = guestwidth_to_adjustwidth(guest_width);
5017 domain->agaw = width_to_agaw(adjust_width);
5019 domain->iommu_coherency = 0;
5020 domain->iommu_snooping = 0;
5021 domain->iommu_superpage = 0;
5022 domain->max_addr = 0;
5024 /* always allocate the top pgd */
5025 domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
5028 domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
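/*
 * IOMMU API: allocate an unmanaged domain. Other domain types are not
 * supported; the reported aperture is bounded by the domain's guest
 * address width.
 */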
5032 static struct iommu_domain *intel_iommu_domain_alloc(unsigned type)
5034 struct dmar_domain *dmar_domain;
5035 struct iommu_domain *domain;
5037 if (type != IOMMU_DOMAIN_UNMANAGED)
5040 dmar_domain = alloc_domain(DOMAIN_FLAG_VIRTUAL_MACHINE);
5042 pr_err("Can't allocate dmar_domain\n");
5045 if (md_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
5046 pr_err("Domain initialization failed\n");
5047 domain_exit(dmar_domain);
5050 domain_update_iommu_cap(dmar_domain);
5052 domain = &dmar_domain->domain;
5053 domain->geometry.aperture_start = 0;
5054 domain->geometry.aperture_end = __DOMAIN_MAX_ADDR(dmar_domain->gaw);
5055 domain->geometry.force_aperture = true;
5060 static void intel_iommu_domain_free(struct iommu_domain *domain)
5062 domain_exit(to_dmar_domain(domain));
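/*
 * IOMMU API: attach a device to a domain. RMRR-locked devices are
 * refused, any existing attachment is torn down first, and the domain's
 * address width is clamped to what this IOMMU can address (dropping
 * page-table levels if necessary) before the device is added.
 */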
5065 static int intel_iommu_attach_device(struct iommu_domain *domain,
5068 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5069 struct intel_iommu *iommu;
5073 if (device_is_rmrr_locked(dev)) {
5074 dev_warn(dev, "Device is ineligible for IOMMU domain attach due to platform RMRR requirement. Contact your platform vendor.\n");
5078 /* normally dev is not mapped */
5079 if (unlikely(domain_context_mapped(dev))) {
5080 struct dmar_domain *old_domain;
5082 old_domain = find_domain(dev);
5085 dmar_remove_one_dev_info(dev);
5088 if (!domain_type_is_vm_or_si(old_domain) &&
5089 list_empty(&old_domain->devices))
5090 domain_exit(old_domain);
5094 iommu = device_to_iommu(dev, &bus, &devfn);
5098 /* check if this iommu agaw is sufficient for max mapped address */
5099 addr_width = agaw_to_width(iommu->agaw);
5100 if (addr_width > cap_mgaw(iommu->cap))
5101 addr_width = cap_mgaw(iommu->cap);
5103 if (dmar_domain->max_addr > (1LL << addr_width)) {
5104 dev_err(dev, "%s: iommu width (%d) is not sufficient for the mapped address (%llx)\n",
5106 __func__, addr_width, dmar_domain->max_addr);
5109 dmar_domain->gaw = addr_width;
5112 * Knock out extra levels of page tables if necessary
5114 while (iommu->agaw < dmar_domain->agaw) {
5115 struct dma_pte *pte;
5117 pte = dmar_domain->pgd;
5118 if (dma_pte_present(pte)) {
5119 dmar_domain->pgd = (struct dma_pte *)
5120 phys_to_virt(dma_pte_addr(pte));
5121 free_pgtable_page(pte);
5123 dmar_domain->agaw--;
5126 return domain_add_dev_info(dmar_domain, dev);
5129 static void intel_iommu_detach_device(struct iommu_domain *domain,
5132 dmar_remove_one_dev_info(dev);
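/*
 * IOMMU API: map a physically contiguous range at @iova. The range must
 * fit within the domain's address width; read/write/snoop permissions
 * are translated into VT-d PTE bits.
 */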
5135 static int intel_iommu_map(struct iommu_domain *domain,
5136 unsigned long iova, phys_addr_t hpa,
5137 size_t size, int iommu_prot)
5139 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5144 if (iommu_prot & IOMMU_READ)
5145 prot |= DMA_PTE_READ;
5146 if (iommu_prot & IOMMU_WRITE)
5147 prot |= DMA_PTE_WRITE;
5148 if ((iommu_prot & IOMMU_CACHE) && dmar_domain->iommu_snooping)
5149 prot |= DMA_PTE_SNP;
5151 max_addr = iova + size;
5152 if (dmar_domain->max_addr < max_addr) {
5155 /* check if minimum agaw is sufficient for mapped address */
5156 end = __DOMAIN_MAX_ADDR(dmar_domain->gaw) + 1;
5157 if (end < max_addr) {
5158 pr_err("%s: iommu width (%d) is not sufficient for the mapped address (%llx)\n",
5160 __func__, dmar_domain->gaw, max_addr);
5163 dmar_domain->max_addr = max_addr;
5165 /* Round up size to next multiple of PAGE_SIZE, if it and
5166 the low bits of hpa would take us onto the next page */
5167 size = aligned_nrpages(hpa, size);
5168 ret = domain_pfn_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT,
5169 hpa >> VTD_PAGE_SHIFT, size, prot);
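/*
 * IOMMU API: unmap a range. If @iova lies inside a superpage the whole
 * large-page mapping is removed, so the size actually unmapped (and
 * returned) may exceed what was asked for; the IOTLB is then flushed on
 * every IOMMU the domain is attached to.
 */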
5173 static size_t intel_iommu_unmap(struct iommu_domain *domain,
5174 unsigned long iova, size_t size)
5176 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5177 struct page *freelist = NULL;
5178 unsigned long start_pfn, last_pfn;
5179 unsigned int npages;
5180 int iommu_id, level = 0;
5182 /* Cope with horrid API which requires us to unmap more than the
5183 size argument if it happens to be a large-page mapping. */
5184 BUG_ON(!pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level));
5186 if (size < VTD_PAGE_SIZE << level_to_offset_bits(level))
5187 size = VTD_PAGE_SIZE << level_to_offset_bits(level);
5189 start_pfn = iova >> VTD_PAGE_SHIFT;
5190 last_pfn = (iova + size - 1) >> VTD_PAGE_SHIFT;
5192 freelist = domain_unmap(dmar_domain, start_pfn, last_pfn);
5194 npages = last_pfn - start_pfn + 1;
5196 for_each_domain_iommu(iommu_id, dmar_domain)
5197 iommu_flush_iotlb_psi(g_iommus[iommu_id], dmar_domain,
5198 start_pfn, npages, !freelist, 0);
5200 dma_free_pagelist(freelist);
5202 if (dmar_domain->max_addr == iova + size)
5203 dmar_domain->max_addr = iova;
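/*
 * IOMMU API: walk the domain's page tables to translate @iova back into
 * a physical address (0 if nothing is mapped there).
 */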
5208 static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
5211 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5212 struct dma_pte *pte;
5216 pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level);
5218 phys = dma_pte_addr(pte);
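/*
 * IOMMU API: report global capabilities. Cache coherency is only
 * advertised when snoop control is supported by every IOMMU in the
 * system.
 */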
5223 static bool intel_iommu_capable(enum iommu_cap cap)
5225 if (cap == IOMMU_CAP_CACHE_COHERENCY)
5226 return domain_update_iommu_snooping(NULL) == 1;
5227 if (cap == IOMMU_CAP_INTR_REMAP)
5228 return irq_remapping_enabled == 1;
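/*
 * IOMMU API: hook a newly added device up to its IOMMU's sysfs entry
 * and place it in an IOMMU group.
 */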
5233 static int intel_iommu_add_device(struct device *dev)
5235 struct intel_iommu *iommu;
5236 struct iommu_group *group;
5239 iommu = device_to_iommu(dev, &bus, &devfn);
5243 iommu_device_link(&iommu->iommu, dev);
5245 group = iommu_group_get_for_dev(dev);
5248 return PTR_ERR(group);
5250 iommu_group_put(group);
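/* Undo intel_iommu_add_device(): remove the device from its group and drop the sysfs link. */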
5254 static void intel_iommu_remove_device(struct device *dev)
5256 struct intel_iommu *iommu;
5259 iommu = device_to_iommu(dev, &bus, &devfn);
5263 iommu_group_remove_device(dev);
5265 iommu_device_unlink(&iommu->iommu, dev);
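/*
 * IOMMU API: report a device's reserved regions, namely the RMRR ranges
 * that target it plus the IOAPIC/MSI window, which must never be handed
 * out as ordinary IOVA space.
 */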
5268 static void intel_iommu_get_resv_regions(struct device *device,
5269 struct list_head *head)
5271 struct iommu_resv_region *reg;
5272 struct dmar_rmrr_unit *rmrr;
5273 struct device *i_dev;
5277 for_each_rmrr_units(rmrr) {
5278 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
5280 if (i_dev != device)
5283 list_add_tail(&rmrr->resv->list, head);
5288 reg = iommu_alloc_resv_region(IOAPIC_RANGE_START,
5289 IOAPIC_RANGE_END - IOAPIC_RANGE_START + 1,
5293 list_add_tail(&reg->list, head);
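/*
 * Free only the MSI window allocated by intel_iommu_get_resv_regions();
 * the RMRR regions are owned by their dmar_rmrr_unit and persist.
 */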
5296 static void intel_iommu_put_resv_regions(struct device *dev,
5297 struct list_head *head)
5299 struct iommu_resv_region *entry, *next;
5301 list_for_each_entry_safe(entry, next, head, list) {
5302 if (entry->type == IOMMU_RESV_MSI)
5307 #ifdef CONFIG_INTEL_IOMMU_SVM
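/*
 * Prepare a device for Shared Virtual Memory: ensure it has a valid
 * domain and a context entry with PASID support enabled, record the
 * domain and source IDs in @sdev, and enable ATS/PASID on the device if
 * it is not already on.
 */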
5308 int intel_iommu_enable_pasid(struct intel_iommu *iommu, struct intel_svm_dev *sdev)
5310 struct device_domain_info *info;
5311 struct context_entry *context;
5312 struct dmar_domain *domain;
5313 unsigned long flags;
5317 domain = get_valid_domain_for_dev(sdev->dev);
5321 spin_lock_irqsave(&device_domain_lock, flags);
5322 spin_lock(&iommu->lock);
5325 info = sdev->dev->archdata.iommu;
5326 if (!info || !info->pasid_supported)
5329 context = iommu_context_addr(iommu, info->bus, info->devfn, 0);
5330 if (WARN_ON(!context))
5333 ctx_lo = context[0].lo;
5335 sdev->did = domain->iommu_did[iommu->seq_id];
5336 sdev->sid = PCI_DEVID(info->bus, info->devfn);
5338 if (!(ctx_lo & CONTEXT_PASIDE)) {
5339 ctx_lo |= CONTEXT_PASIDE;
5340 context[0].lo = ctx_lo;
5342 iommu->flush.flush_context(iommu, sdev->did, sdev->sid,
5343 DMA_CCMD_MASK_NOBIT,
5344 DMA_CCMD_DEVICE_INVL);
5347 /* Enable PASID support in the device, if it wasn't already */
5348 if (!info->pasid_enabled)
5349 iommu_enable_dev_iotlb(info);
5351 if (info->ats_enabled) {
5352 sdev->dev_iotlb = 1;
5353 sdev->qdep = info->ats_qdep;
5354 if (sdev->qdep >= QI_DEV_EIOTLB_MAX_INVS)
5360 spin_unlock(&iommu->lock);
5361 spin_unlock_irqrestore(&device_domain_lock, flags);
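/*
 * Resolve the IOMMU that translates @dev for SVM use, warning when the
 * device has no usable translation unit.
 */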
5366 struct intel_iommu *intel_svm_device_to_iommu(struct device *dev)
5368 struct intel_iommu *iommu;
5371 if (iommu_dummy(dev)) {
5373 "No IOMMU translation for device; cannot enable SVM\n");
5377 iommu = device_to_iommu(dev, &bus, &devfn);
5379 dev_err(dev, "No IOMMU for device; cannot enable SVM\n");
5385 #endif /* CONFIG_INTEL_IOMMU_SVM */
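/*
 * Callbacks exported to the generic IOMMU layer. A consumer such as
 * VFIO typically ends up exercising them roughly as in the sketch below
 * (illustrative only, error handling omitted):
 *
 *	struct iommu_domain *dom = iommu_domain_alloc(&pci_bus_type);
 *
 *	iommu_attach_device(dom, dev);
 *	iommu_map(dom, iova, phys, size, IOMMU_READ | IOMMU_WRITE);
 *	...
 *	iommu_unmap(dom, iova, size);
 *	iommu_detach_device(dom, dev);
 *	iommu_domain_free(dom);
 */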
5387 const struct iommu_ops intel_iommu_ops = {
5388 .capable = intel_iommu_capable,
5389 .domain_alloc = intel_iommu_domain_alloc,
5390 .domain_free = intel_iommu_domain_free,
5391 .attach_dev = intel_iommu_attach_device,
5392 .detach_dev = intel_iommu_detach_device,
5393 .map = intel_iommu_map,
5394 .unmap = intel_iommu_unmap,
5395 .iova_to_phys = intel_iommu_iova_to_phys,
5396 .add_device = intel_iommu_add_device,
5397 .remove_device = intel_iommu_remove_device,
5398 .get_resv_regions = intel_iommu_get_resv_regions,
5399 .put_resv_regions = intel_iommu_put_resv_regions,
5400 .device_group = pci_device_group,
5401 .pgsize_bitmap = INTEL_IOMMU_PGSIZES,
5404 static void quirk_iommu_g4x_gfx(struct pci_dev *dev)
5406 /* G4x/GM45 integrated gfx dmar support is totally busted. */
5407 pci_info(dev, "Disabling IOMMU for graphics on this chipset\n");
5411 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_g4x_gfx);
5412 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_g4x_gfx);
5413 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_g4x_gfx);
5414 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_g4x_gfx);
5415 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_g4x_gfx);
5416 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_g4x_gfx);
5417 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_g4x_gfx);
5419 static void quirk_iommu_rwbf(struct pci_dev *dev)
5422 * Mobile 4 Series Chipset neglects to set RWBF capability,
5423 * but needs it. Same seems to hold for the desktop versions.
5425 pci_info(dev, "Forcing write-buffer flush capability\n");
5429 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf);
5430 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_rwbf);
5431 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_rwbf);
5432 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_rwbf);
5433 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_rwbf);
5434 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_rwbf);
5435 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_rwbf);
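/*
 * Fields of the host bridge GGC (graphics control) register checked by
 * the Calpella quirk below: how much stolen memory the BIOS assigned to
 * the GTT, and whether a VT-d visible (shadow) copy of it was enabled.
 */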
5438 #define GGC_MEMORY_SIZE_MASK (0xf << 8)
5439 #define GGC_MEMORY_SIZE_NONE (0x0 << 8)
5440 #define GGC_MEMORY_SIZE_1M (0x1 << 8)
5441 #define GGC_MEMORY_SIZE_2M (0x3 << 8)
5442 #define GGC_MEMORY_VT_ENABLED (0x8 << 8)
5443 #define GGC_MEMORY_SIZE_2M_VT (0x9 << 8)
5444 #define GGC_MEMORY_SIZE_3M_VT (0xa << 8)
5445 #define GGC_MEMORY_SIZE_4M_VT (0xb << 8)
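/*
 * Ironlake/Calpella integrated graphics relies on a BIOS-allocated
 * shadow GTT for VT-d to translate its accesses. If the BIOS allocated
 * none, graphics remapping cannot work and is disabled; otherwise
 * batched (lazy) IOTLB flushing is turned off, since the device must be
 * idle when its mappings are flushed.
 */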
5447 static void quirk_calpella_no_shadow_gtt(struct pci_dev *dev)
5451 if (pci_read_config_word(dev, GGC, &ggc))
5454 if (!(ggc & GGC_MEMORY_VT_ENABLED)) {
5455 pci_info(dev, "BIOS has allocated no shadow GTT; disabling IOMMU for graphics\n");
5457 } else if (dmar_map_gfx) {
5458 /* we have to ensure the gfx device is idle before we flush */
5459 pci_info(dev, "Disabling batched IOTLB flush on Ironlake\n");
5460 intel_iommu_strict = 1;
5463 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0040, quirk_calpella_no_shadow_gtt);
5464 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0044, quirk_calpella_no_shadow_gtt);
5465 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0062, quirk_calpella_no_shadow_gtt);
5466 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x006a, quirk_calpella_no_shadow_gtt);
5468 /* On Tylersburg chipsets, some BIOSes have been known to enable the
5469 ISOCH DMAR unit for the Azalia sound device, but not give it any
5470 TLB entries, which causes it to deadlock. Check for that. We do
5471 this in a function called from init_dmars(), instead of in a PCI
5472 quirk, because we don't want to print the obnoxious "BIOS broken"
5473 message if VT-d is actually disabled.
5475 static void __init check_tylersburg_isoch(void)
5477 struct pci_dev *pdev;
5478 uint32_t vtisochctrl;
5480 /* If there's no Azalia in the system anyway, forget it. */
5481 pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3a3e, NULL);
5486 /* System Management Registers. Might be hidden, in which case
5487 we can't do the sanity check. But that's OK, because the
5488 known-broken BIOSes _don't_ actually hide it, so far. */
5489 pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x342e, NULL);
5493 if (pci_read_config_dword(pdev, 0x188, &vtisochctrl)) {
5500 /* If Azalia DMA is routed to the non-isoch DMAR unit, fine. */
5501 if (vtisochctrl & 1)
5504 /* Drop all bits other than the number of TLB entries */
5505 vtisochctrl &= 0x1c;
5507 /* If we have the recommended number of TLB entries (16), fine. */
5508 if (vtisochctrl == 0x10)
5511 /* Zero TLB entries? The BIOS configuration is broken; warn and fall back to identity mapping for Azalia. */
5513 WARN(1, "Your BIOS is broken; DMA routed to ISOCH DMAR unit but no TLB space.\n"
5514 "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
5515 dmi_get_system_info(DMI_BIOS_VENDOR),
5516 dmi_get_system_info(DMI_BIOS_VERSION),
5517 dmi_get_system_info(DMI_PRODUCT_VERSION));
5518 iommu_identity_mapping |= IDENTMAP_AZALIA;
5522 pr_warn("Recommended TLB entries for ISOCH unit is 16; your BIOS set %d\n",