2 * Copyright © 2006-2014 Intel Corporation.
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms and conditions of the GNU General Public License,
6 * version 2, as published by the Free Software Foundation.
8 * This program is distributed in the hope it will be useful, but WITHOUT
9 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
10 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
13 * Authors: David Woodhouse <dwmw2@infradead.org>,
14 * Ashok Raj <ashok.raj@intel.com>,
15 * Shaohua Li <shaohua.li@intel.com>,
16 * Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>,
17 * Fenghua Yu <fenghua.yu@intel.com>
18 * Joerg Roedel <jroedel@suse.de>
21 #define pr_fmt(fmt) "DMAR: " fmt
23 #include <linux/init.h>
24 #include <linux/bitmap.h>
25 #include <linux/debugfs.h>
26 #include <linux/export.h>
27 #include <linux/slab.h>
28 #include <linux/irq.h>
29 #include <linux/interrupt.h>
30 #include <linux/spinlock.h>
31 #include <linux/pci.h>
32 #include <linux/dmar.h>
33 #include <linux/dma-mapping.h>
34 #include <linux/mempool.h>
35 #include <linux/memory.h>
36 #include <linux/cpu.h>
37 #include <linux/timer.h>
39 #include <linux/iova.h>
40 #include <linux/iommu.h>
41 #include <linux/intel-iommu.h>
42 #include <linux/syscore_ops.h>
43 #include <linux/tboot.h>
44 #include <linux/dmi.h>
45 #include <linux/pci-ats.h>
46 #include <linux/memblock.h>
47 #include <linux/dma-contiguous.h>
48 #include <linux/dma-direct.h>
49 #include <linux/crash_dump.h>
50 #include <asm/irq_remapping.h>
51 #include <asm/cacheflush.h>
52 #include <asm/iommu.h>
54 #include "irq_remapping.h"
55 #include "intel-pasid.h"
57 #define ROOT_SIZE VTD_PAGE_SIZE
58 #define CONTEXT_SIZE VTD_PAGE_SIZE
60 #define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
61 #define IS_USB_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_SERIAL_USB)
62 #define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
63 #define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e)
65 #define IOAPIC_RANGE_START (0xfee00000)
66 #define IOAPIC_RANGE_END (0xfeefffff)
67 #define IOVA_START_ADDR (0x1000)
69 #define DEFAULT_DOMAIN_ADDRESS_WIDTH 57
71 #define MAX_AGAW_WIDTH 64
72 #define MAX_AGAW_PFN_WIDTH (MAX_AGAW_WIDTH - VTD_PAGE_SHIFT)
74 #define __DOMAIN_MAX_PFN(gaw) ((((uint64_t)1) << (gaw-VTD_PAGE_SHIFT)) - 1)
75 #define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << gaw) - 1)
77 /* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR
78 to match. That way, we can use 'unsigned long' for PFNs with impunity. */
79 #define DOMAIN_MAX_PFN(gaw) ((unsigned long) min_t(uint64_t, \
80 __DOMAIN_MAX_PFN(gaw), (unsigned long)-1))
81 #define DOMAIN_MAX_ADDR(gaw) (((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT)
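/*
 * Example, assuming VTD_PAGE_SHIFT is 12: for a 48-bit guest address
 * width, __DOMAIN_MAX_PFN(48) is (1ULL << 36) - 1 and DOMAIN_MAX_ADDR(48)
 * is ((1ULL << 36) - 1) << 12, i.e. the base of the last 4KiB page below
 * the 1 << 48 boundary.
 */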
83 /* IO virtual address start page frame number */
84 #define IOVA_START_PFN (1)
86 #define IOVA_PFN(addr) ((addr) >> PAGE_SHIFT)
88 /* page table handling */
89 #define LEVEL_STRIDE (9)
90 #define LEVEL_MASK (((u64)1 << LEVEL_STRIDE) - 1)
93 * This bitmap is used to advertise the page sizes our hardware supports
94 * to the IOMMU core, which will then use this information to split
95 * physically contiguous memory regions it is mapping into page sizes
98 * Traditionally the IOMMU core just handed us the mappings directly,
99 * after making sure the size is an order of a 4KiB page and that the
100 * mapping has natural alignment.
102 * To retain this behavior, we currently advertise that we support
103 * all page sizes that are an order of 4KiB.
105 * If at some point we'd like to utilize the IOMMU core's new behavior,
106 * we could change this to advertise the real page sizes we support.
108 #define INTEL_IOMMU_PGSIZES (~0xFFFUL)
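/*
 * ~0xFFFUL leaves every bit from bit 12 upwards set, so (with 4KiB VT-d
 * pages) every power-of-two size of 4KiB or more is reported as
 * supported and the IOMMU core never splits a mapping on our behalf.
 */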
110 static inline int agaw_to_level(int agaw)
115 static inline int agaw_to_width(int agaw)
117 return min_t(int, 30 + agaw * LEVEL_STRIDE, MAX_AGAW_WIDTH);
120 static inline int width_to_agaw(int width)
122 return DIV_ROUND_UP(width - 30, LEVEL_STRIDE);
125 static inline unsigned int level_to_offset_bits(int level)
127 return (level - 1) * LEVEL_STRIDE;
130 static inline int pfn_level_offset(unsigned long pfn, int level)
132 return (pfn >> level_to_offset_bits(level)) & LEVEL_MASK;
135 static inline unsigned long level_mask(int level)
137 return -1UL << level_to_offset_bits(level);
140 static inline unsigned long level_size(int level)
142 return 1UL << level_to_offset_bits(level);
145 static inline unsigned long align_to_level(unsigned long pfn, int level)
147 return (pfn + level_size(level) - 1) & level_mask(level);
150 static inline unsigned long lvl_to_nr_pages(unsigned int lvl)
152 return 1 << min_t(int, (lvl - 1) * LEVEL_STRIDE, MAX_AGAW_PFN_WIDTH);
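/*
 * Rough examples of the helpers above, with LEVEL_STRIDE == 9 and 4KiB
 * VT-d pages: a level-1 PTE covers a single page, a level-2 PTE covers
 * 1 << 9 = 512 pages (2MiB), a level-3 PTE covers 1 << 18 pages (1GiB);
 * agaw_to_width(2) = 30 + 2 * 9 = 48 bits and width_to_agaw(48) = 2.
 */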
155 /* VT-d pages must always be _smaller_ than MM pages. Otherwise things
156 are never going to work. */
157 static inline unsigned long dma_to_mm_pfn(unsigned long dma_pfn)
159 return dma_pfn >> (PAGE_SHIFT - VTD_PAGE_SHIFT);
162 static inline unsigned long mm_to_dma_pfn(unsigned long mm_pfn)
164 return mm_pfn << (PAGE_SHIFT - VTD_PAGE_SHIFT);
166 static inline unsigned long page_to_dma_pfn(struct page *pg)
168 return mm_to_dma_pfn(page_to_pfn(pg));
170 static inline unsigned long virt_to_dma_pfn(void *p)
172 return page_to_dma_pfn(virt_to_page(p));
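/*
 * On x86, where PAGE_SHIFT and VTD_PAGE_SHIFT are both 12, the shift in
 * the two conversions above is zero and MM and DMA PFNs are identical;
 * with larger MM pages each MM pfn expands to several VT-d pfns, which
 * is why VT-d pages must be the smaller of the two.
 */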
175 /* global iommu list, set NULL for ignored DMAR units */
176 static struct intel_iommu **g_iommus;
178 static void __init check_tylersburg_isoch(void);
179 static int rwbf_quirk;
182 * set to 1 to panic the kernel if VT-d can't be enabled successfully
183 * (used when the kernel is launched with TXT)
185 static int force_on = 0;
186 int intel_iommu_tboot_noforce;
187 static int no_platform_optin;
189 #define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
192 * Take a root_entry and return the Lower Context Table Pointer (LCTP)
195 static phys_addr_t root_entry_lctp(struct root_entry *re)
200 return re->lo & VTD_PAGE_MASK;
204 * Take a root_entry and return the Upper Context Table Pointer (UCTP)
207 static phys_addr_t root_entry_uctp(struct root_entry *re)
212 return re->hi & VTD_PAGE_MASK;
215 static inline void context_clear_pasid_enable(struct context_entry *context)
217 context->lo &= ~(1ULL << 11);
220 static inline bool context_pasid_enabled(struct context_entry *context)
222 return !!(context->lo & (1ULL << 11));
225 static inline void context_set_copied(struct context_entry *context)
227 context->hi |= (1ull << 3);
230 static inline bool context_copied(struct context_entry *context)
232 return !!(context->hi & (1ULL << 3));
235 static inline bool __context_present(struct context_entry *context)
237 return (context->lo & 1);
240 bool context_present(struct context_entry *context)
242 return context_pasid_enabled(context) ?
243 __context_present(context) :
244 __context_present(context) && !context_copied(context);
247 static inline void context_set_present(struct context_entry *context)
252 static inline void context_set_fault_enable(struct context_entry *context)
254 context->lo &= (((u64)-1) << 2) | 1;
257 static inline void context_set_translation_type(struct context_entry *context,
260 context->lo &= (((u64)-1) << 4) | 3;
261 context->lo |= (value & 3) << 2;
264 static inline void context_set_address_root(struct context_entry *context,
267 context->lo &= ~VTD_PAGE_MASK;
268 context->lo |= value & VTD_PAGE_MASK;
271 static inline void context_set_address_width(struct context_entry *context,
274 context->hi |= value & 7;
277 static inline void context_set_domain_id(struct context_entry *context,
280 context->hi |= (value & ((1 << 16) - 1)) << 8;
283 static inline int context_domain_id(struct context_entry *c)
285 return((c->hi >> 8) & 0xffff);
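/*
 * Context-entry layout, as implied by the accessors above: the low u64
 * holds the present bit (0), the fault-processing disable bit (1,
 * cleared by context_set_fault_enable()), the translation type (2:3),
 * the PASID-enable bit (11) and the address root (12:63); the high u64
 * holds the address width (0:2), the software "copied" flag (3) and the
 * domain id (8:23).
 */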
288 static inline void context_clear_entry(struct context_entry *context)
301 * 12-63: Host physical address
307 static inline void dma_clear_pte(struct dma_pte *pte)
312 static inline u64 dma_pte_addr(struct dma_pte *pte)
315 return pte->val & VTD_PAGE_MASK;
317 /* Must have a full atomic 64-bit read */
318 return __cmpxchg64(&pte->val, 0ULL, 0ULL) & VTD_PAGE_MASK;
322 static inline bool dma_pte_present(struct dma_pte *pte)
324 return (pte->val & 3) != 0;
327 static inline bool dma_pte_superpage(struct dma_pte *pte)
329 return (pte->val & DMA_PTE_LARGE_PAGE);
332 static inline int first_pte_in_page(struct dma_pte *pte)
334 return !((unsigned long)pte & ~VTD_PAGE_MASK);
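/*
 * As the helpers above imply, a dma_pte packs everything into one u64:
 * read/write permission in bits 0:1, the large-page flag at
 * DMA_PTE_LARGE_PAGE, and the host physical address of the next level
 * (or of the target page) in bits 12:63.
 */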
338 * This domain is a statically identity mapping domain.
339 * 1. This domain creates a static 1:1 mapping to all usable memory.
340 * 2. It maps to each iommu if successful.
341 * 3. Each iommu maps to this domain if successful.
343 static struct dmar_domain *si_domain;
344 static int hw_pass_through = 1;
347 * Domain represents a virtual machine; more than one device
348 * across iommus may be owned by one domain, e.g. a kvm guest.
350 #define DOMAIN_FLAG_VIRTUAL_MACHINE (1 << 0)
352 /* si_domain contains multiple devices */
353 #define DOMAIN_FLAG_STATIC_IDENTITY (1 << 1)
355 #define for_each_domain_iommu(idx, domain) \
356 for (idx = 0; idx < g_num_of_iommus; idx++) \
357 if (domain->iommu_refcnt[idx])
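/*
 * for_each_domain_iommu() iterates over iommu sequence ids, not iommu
 * structures: idx runs over [0, g_num_of_iommus) and the body only runs
 * where the domain holds a reference on that iommu, e.g. (as in
 * domain_mapping() further down):
 *
 *	for_each_domain_iommu(idx, domain)
 *		__mapping_notify_one(g_iommus[idx], domain, pfn, nr_pages);
 */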
359 struct dmar_rmrr_unit {
360 struct list_head list; /* list of rmrr units */
361 struct acpi_dmar_header *hdr; /* ACPI header */
362 u64 base_address; /* reserved base address*/
363 u64 end_address; /* reserved end address */
364 struct dmar_dev_scope *devices; /* target devices */
365 int devices_cnt; /* target device count */
366 struct iommu_resv_region *resv; /* reserved region handle */
369 struct dmar_atsr_unit {
370 struct list_head list; /* list of ATSR units */
371 struct acpi_dmar_header *hdr; /* ACPI header */
372 struct dmar_dev_scope *devices; /* target devices */
373 int devices_cnt; /* target device count */
374 u8 include_all:1; /* include all ports */
377 static LIST_HEAD(dmar_atsr_units);
378 static LIST_HEAD(dmar_rmrr_units);
380 #define for_each_rmrr_units(rmrr) \
381 list_for_each_entry(rmrr, &dmar_rmrr_units, list)
383 /* number of registered intel_iommus; used to index the g_iommus array */
384 static int g_num_of_iommus;
386 static void domain_exit(struct dmar_domain *domain);
387 static void domain_remove_dev_info(struct dmar_domain *domain);
388 static void dmar_remove_one_dev_info(struct dmar_domain *domain,
390 static void __dmar_remove_one_dev_info(struct device_domain_info *info);
391 static void domain_context_clear(struct intel_iommu *iommu,
393 static int domain_detach_iommu(struct dmar_domain *domain,
394 struct intel_iommu *iommu);
396 #ifdef CONFIG_INTEL_IOMMU_DEFAULT_ON
397 int dmar_disabled = 0;
399 int dmar_disabled = 1;
400 #endif /*CONFIG_INTEL_IOMMU_DEFAULT_ON*/
402 int intel_iommu_enabled = 0;
403 EXPORT_SYMBOL_GPL(intel_iommu_enabled);
405 static int dmar_map_gfx = 1;
406 static int dmar_forcedac;
407 static int intel_iommu_strict;
408 static int intel_iommu_superpage = 1;
409 static int intel_iommu_ecs = 1;
410 static int intel_iommu_pasid28;
411 static int iommu_identity_mapping;
413 #define IDENTMAP_ALL 1
414 #define IDENTMAP_GFX 2
415 #define IDENTMAP_AZALIA 4
417 /* Broadwell and Skylake have broken ECS support — normal so-called "second
418 * level" translation of DMA requests-without-PASID doesn't actually happen
419 * unless you also set the NESTE bit in an extended context-entry. Which of
420 * course means that SVM doesn't work because it's trying to do nested
421 * translation of the physical addresses it finds in the process page tables,
422 * through the IOVA->phys mapping found in the "second level" page tables.
424 * The VT-d specification was retroactively amended to change the definition
425 * of the capability bits and pretend that Broadwell/Skylake never happened...
426 * but unfortunately the wrong bit was changed. It's ECS which is broken, but
427 * for some reason it was the PASID capability bit which was redefined (from
428 * bit 28 on BDW/SKL to bit 40 in future).
430 * So our test for ECS needs to eschew those implementations which set the old
431 * PASID capability bit 28, since those are the ones on which ECS is broken.
432 * Unless we are working around the 'pasid28' limitations, that is, by putting
433 * the device into passthrough mode for normal DMA and thus masking the bug.
435 #define ecs_enabled(iommu) (intel_iommu_ecs && ecap_ecs(iommu->ecap) && \
436 (intel_iommu_pasid28 || !ecap_broken_pasid(iommu->ecap)))
437 /* PASID support is thus enabled if ECS is enabled and *either* of the old
438 * or new capability bits are set. */
439 #define pasid_enabled(iommu) (ecs_enabled(iommu) && \
440 (ecap_pasid(iommu->ecap) || ecap_broken_pasid(iommu->ecap)))
442 int intel_iommu_gfx_mapped;
443 EXPORT_SYMBOL_GPL(intel_iommu_gfx_mapped);
445 #define DUMMY_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-1))
446 static DEFINE_SPINLOCK(device_domain_lock);
447 static LIST_HEAD(device_domain_list);
450 * Iterate over elements in device_domain_list and call the specified
451 * callback @fn against each element. This helper should only be used
452 * in the context where the device_domain_lock has already been held.
454 int for_each_device_domain(int (*fn)(struct device_domain_info *info,
455 void *data), void *data)
458 struct device_domain_info *info;
460 assert_spin_locked(&device_domain_lock);
461 list_for_each_entry(info, &device_domain_list, global) {
462 ret = fn(info, data);
470 const struct iommu_ops intel_iommu_ops;
472 static bool translation_pre_enabled(struct intel_iommu *iommu)
474 return (iommu->flags & VTD_FLAG_TRANS_PRE_ENABLED);
477 static void clear_translation_pre_enabled(struct intel_iommu *iommu)
479 iommu->flags &= ~VTD_FLAG_TRANS_PRE_ENABLED;
482 static void init_translation_status(struct intel_iommu *iommu)
486 gsts = readl(iommu->reg + DMAR_GSTS_REG);
487 if (gsts & DMA_GSTS_TES)
488 iommu->flags |= VTD_FLAG_TRANS_PRE_ENABLED;
491 /* Convert a generic 'struct iommu_domain' to the private struct dmar_domain */
492 static struct dmar_domain *to_dmar_domain(struct iommu_domain *dom)
494 return container_of(dom, struct dmar_domain, domain);
497 static int __init intel_iommu_setup(char *str)
502 if (!strncmp(str, "on", 2)) {
504 pr_info("IOMMU enabled\n");
505 } else if (!strncmp(str, "off", 3)) {
507 no_platform_optin = 1;
508 pr_info("IOMMU disabled\n");
509 } else if (!strncmp(str, "igfx_off", 8)) {
511 pr_info("Disable GFX device mapping\n");
512 } else if (!strncmp(str, "forcedac", 8)) {
513 pr_info("Forcing DAC for PCI devices\n");
515 } else if (!strncmp(str, "strict", 6)) {
516 pr_info("Disable batched IOTLB flush\n");
517 intel_iommu_strict = 1;
518 } else if (!strncmp(str, "sp_off", 6)) {
519 pr_info("Disable supported super page\n");
520 intel_iommu_superpage = 0;
521 } else if (!strncmp(str, "ecs_off", 7)) {
523 "Intel-IOMMU: disable extended context table support\n");
525 } else if (!strncmp(str, "pasid28", 7)) {
527 "Intel-IOMMU: enable pre-production PASID support\n");
528 intel_iommu_pasid28 = 1;
529 iommu_identity_mapping |= IDENTMAP_GFX;
530 } else if (!strncmp(str, "tboot_noforce", 13)) {
532 "Intel-IOMMU: not forcing on after tboot. This could expose security risk for tboot\n");
533 intel_iommu_tboot_noforce = 1;
536 str += strcspn(str, ",");
542 __setup("intel_iommu=", intel_iommu_setup);
544 static struct kmem_cache *iommu_domain_cache;
545 static struct kmem_cache *iommu_devinfo_cache;
547 static struct dmar_domain* get_iommu_domain(struct intel_iommu *iommu, u16 did)
549 struct dmar_domain **domains;
552 domains = iommu->domains[idx];
556 return domains[did & 0xff];
559 static void set_iommu_domain(struct intel_iommu *iommu, u16 did,
560 struct dmar_domain *domain)
562 struct dmar_domain **domains;
565 if (!iommu->domains[idx]) {
566 size_t size = 256 * sizeof(struct dmar_domain *);
567 iommu->domains[idx] = kzalloc(size, GFP_ATOMIC);
570 domains = iommu->domains[idx];
571 if (WARN_ON(!domains))
574 domains[did & 0xff] = domain;
577 void *alloc_pgtable_page(int node)
582 page = alloc_pages_node(node, GFP_ATOMIC | __GFP_ZERO, 0);
584 vaddr = page_address(page);
588 void free_pgtable_page(void *vaddr)
590 free_page((unsigned long)vaddr);
593 static inline void *alloc_domain_mem(void)
595 return kmem_cache_alloc(iommu_domain_cache, GFP_ATOMIC);
598 static void free_domain_mem(void *vaddr)
600 kmem_cache_free(iommu_domain_cache, vaddr);
603 static inline void * alloc_devinfo_mem(void)
605 return kmem_cache_alloc(iommu_devinfo_cache, GFP_ATOMIC);
608 static inline void free_devinfo_mem(void *vaddr)
610 kmem_cache_free(iommu_devinfo_cache, vaddr);
613 static inline int domain_type_is_vm(struct dmar_domain *domain)
615 return domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE;
618 static inline int domain_type_is_si(struct dmar_domain *domain)
620 return domain->flags & DOMAIN_FLAG_STATIC_IDENTITY;
623 static inline int domain_type_is_vm_or_si(struct dmar_domain *domain)
625 return domain->flags & (DOMAIN_FLAG_VIRTUAL_MACHINE |
626 DOMAIN_FLAG_STATIC_IDENTITY);
629 static inline int domain_pfn_supported(struct dmar_domain *domain,
632 int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
634 return !(addr_width < BITS_PER_LONG && pfn >> addr_width);
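/*
 * Example: for a domain with a 48-bit address width, addr_width above is
 * 48 - 12 = 36, so any pfn with bits set at position 36 or higher lies
 * outside the domain's IOVA space and is reported as unsupported.
 */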
637 static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw)
642 sagaw = cap_sagaw(iommu->cap);
643 for (agaw = width_to_agaw(max_gaw);
645 if (test_bit(agaw, &sagaw))
653 * Calculate max SAGAW for each iommu.
655 int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
657 return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH);
661 * calculate agaw for each iommu.
662 * "SAGAW" may be different across iommus, use a default agaw, and
663 * get a supported less agaw for iommus that don't support the default agaw.
665 int iommu_calculate_agaw(struct intel_iommu *iommu)
667 return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH);
670 /* This function only returns a single iommu for a domain */
671 struct intel_iommu *domain_get_iommu(struct dmar_domain *domain)
675 /* si_domain and vm domain should not get here. */
676 BUG_ON(domain_type_is_vm_or_si(domain));
677 for_each_domain_iommu(iommu_id, domain)
680 if (iommu_id < 0 || iommu_id >= g_num_of_iommus)
683 return g_iommus[iommu_id];
686 static void domain_update_iommu_coherency(struct dmar_domain *domain)
688 struct dmar_drhd_unit *drhd;
689 struct intel_iommu *iommu;
693 domain->iommu_coherency = 1;
695 for_each_domain_iommu(i, domain) {
697 if (!ecap_coherent(g_iommus[i]->ecap)) {
698 domain->iommu_coherency = 0;
705 /* No hardware attached; use lowest common denominator */
707 for_each_active_iommu(iommu, drhd) {
708 if (!ecap_coherent(iommu->ecap)) {
709 domain->iommu_coherency = 0;
716 static int domain_update_iommu_snooping(struct intel_iommu *skip)
718 struct dmar_drhd_unit *drhd;
719 struct intel_iommu *iommu;
723 for_each_active_iommu(iommu, drhd) {
725 if (!ecap_sc_support(iommu->ecap)) {
736 static int domain_update_iommu_superpage(struct intel_iommu *skip)
738 struct dmar_drhd_unit *drhd;
739 struct intel_iommu *iommu;
742 if (!intel_iommu_superpage) {
746 /* set iommu_superpage to the smallest common denominator */
748 for_each_active_iommu(iommu, drhd) {
750 mask &= cap_super_page_val(iommu->cap);
760 /* Some capabilities may be different across iommus */
761 static void domain_update_iommu_cap(struct dmar_domain *domain)
763 domain_update_iommu_coherency(domain);
764 domain->iommu_snooping = domain_update_iommu_snooping(NULL);
765 domain->iommu_superpage = domain_update_iommu_superpage(NULL);
768 struct context_entry *iommu_context_addr(struct intel_iommu *iommu, u8 bus,
771 struct root_entry *root = &iommu->root_entry[bus];
772 struct context_entry *context;
776 if (ecs_enabled(iommu)) {
784 context = phys_to_virt(*entry & VTD_PAGE_MASK);
786 unsigned long phy_addr;
790 context = alloc_pgtable_page(iommu->node);
794 __iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
795 phy_addr = virt_to_phys((void *)context);
796 *entry = phy_addr | 1;
797 __iommu_flush_cache(iommu, entry, sizeof(*entry));
799 return &context[devfn];
802 static int iommu_dummy(struct device *dev)
804 return dev->archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO;
807 static struct intel_iommu *device_to_iommu(struct device *dev, u8 *bus, u8 *devfn)
809 struct dmar_drhd_unit *drhd = NULL;
810 struct intel_iommu *iommu;
812 struct pci_dev *ptmp, *pdev = NULL;
816 if (iommu_dummy(dev))
819 if (dev_is_pci(dev)) {
820 struct pci_dev *pf_pdev;
822 pdev = to_pci_dev(dev);
825 /* VMD child devices currently cannot be handled individually */
826 if (is_vmd(pdev->bus))
830 /* VFs aren't listed in scope tables; we need to look up
831 * the PF instead to find the IOMMU. */
832 pf_pdev = pci_physfn(pdev);
834 segment = pci_domain_nr(pdev->bus);
835 } else if (has_acpi_companion(dev))
836 dev = &ACPI_COMPANION(dev)->dev;
839 for_each_active_iommu(iommu, drhd) {
840 if (pdev && segment != drhd->segment)
843 for_each_active_dev_scope(drhd->devices,
844 drhd->devices_cnt, i, tmp) {
846 /* For a VF use its original BDF# not that of the PF
847 * which we used for the IOMMU lookup. Strictly speaking
848 * we could do this for all PCI devices; we only need to
849 * get the BDF# from the scope table for ACPI matches. */
850 if (pdev && pdev->is_virtfn)
853 *bus = drhd->devices[i].bus;
854 *devfn = drhd->devices[i].devfn;
858 if (!pdev || !dev_is_pci(tmp))
861 ptmp = to_pci_dev(tmp);
862 if (ptmp->subordinate &&
863 ptmp->subordinate->number <= pdev->bus->number &&
864 ptmp->subordinate->busn_res.end >= pdev->bus->number)
868 if (pdev && drhd->include_all) {
870 *bus = pdev->bus->number;
871 *devfn = pdev->devfn;
882 static void domain_flush_cache(struct dmar_domain *domain,
883 void *addr, int size)
885 if (!domain->iommu_coherency)
886 clflush_cache_range(addr, size);
889 static int device_context_mapped(struct intel_iommu *iommu, u8 bus, u8 devfn)
891 struct context_entry *context;
895 spin_lock_irqsave(&iommu->lock, flags);
896 context = iommu_context_addr(iommu, bus, devfn, 0);
898 ret = context_present(context);
899 spin_unlock_irqrestore(&iommu->lock, flags);
903 static void free_context_table(struct intel_iommu *iommu)
907 struct context_entry *context;
909 spin_lock_irqsave(&iommu->lock, flags);
910 if (!iommu->root_entry) {
913 for (i = 0; i < ROOT_ENTRY_NR; i++) {
914 context = iommu_context_addr(iommu, i, 0, 0);
916 free_pgtable_page(context);
918 if (!ecs_enabled(iommu))
921 context = iommu_context_addr(iommu, i, 0x80, 0);
923 free_pgtable_page(context);
926 free_pgtable_page(iommu->root_entry);
927 iommu->root_entry = NULL;
929 spin_unlock_irqrestore(&iommu->lock, flags);
932 static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
933 unsigned long pfn, int *target_level)
935 struct dma_pte *parent, *pte = NULL;
936 int level = agaw_to_level(domain->agaw);
939 BUG_ON(!domain->pgd);
941 if (!domain_pfn_supported(domain, pfn))
942 /* Address beyond IOMMU's addressing capabilities. */
945 parent = domain->pgd;
950 offset = pfn_level_offset(pfn, level);
951 pte = &parent[offset];
952 if (!*target_level && (dma_pte_superpage(pte) || !dma_pte_present(pte)))
954 if (level == *target_level)
957 if (!dma_pte_present(pte)) {
960 tmp_page = alloc_pgtable_page(domain->nid);
965 domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE);
966 pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE;
967 if (cmpxchg64(&pte->val, 0ULL, pteval))
968 /* Someone else set it while we were thinking; use theirs. */
969 free_pgtable_page(tmp_page);
971 domain_flush_cache(domain, pte, sizeof(*pte));
976 parent = phys_to_virt(dma_pte_addr(pte));
981 *target_level = level;
987 /* return the pte of an address at a specific level */
988 static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain,
990 int level, int *large_page)
992 struct dma_pte *parent, *pte = NULL;
993 int total = agaw_to_level(domain->agaw);
996 parent = domain->pgd;
997 while (level <= total) {
998 offset = pfn_level_offset(pfn, total);
999 pte = &parent[offset];
1003 if (!dma_pte_present(pte)) {
1004 *large_page = total;
1008 if (dma_pte_superpage(pte)) {
1009 *large_page = total;
1013 parent = phys_to_virt(dma_pte_addr(pte));
1019 /* clear last level pte; a tlb flush should follow */
1020 static void dma_pte_clear_range(struct dmar_domain *domain,
1021 unsigned long start_pfn,
1022 unsigned long last_pfn)
1024 unsigned int large_page = 1;
1025 struct dma_pte *first_pte, *pte;
1027 BUG_ON(!domain_pfn_supported(domain, start_pfn));
1028 BUG_ON(!domain_pfn_supported(domain, last_pfn));
1029 BUG_ON(start_pfn > last_pfn);
1031 /* we don't need the lock here; nobody else touches the iova range */
1034 first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1, &large_page);
1036 start_pfn = align_to_level(start_pfn + 1, large_page + 1);
1041 start_pfn += lvl_to_nr_pages(large_page);
1043 } while (start_pfn <= last_pfn && !first_pte_in_page(pte));
1045 domain_flush_cache(domain, first_pte,
1046 (void *)pte - (void *)first_pte);
1048 } while (start_pfn && start_pfn <= last_pfn);
1051 static void dma_pte_free_level(struct dmar_domain *domain, int level,
1052 int retain_level, struct dma_pte *pte,
1053 unsigned long pfn, unsigned long start_pfn,
1054 unsigned long last_pfn)
1056 pfn = max(start_pfn, pfn);
1057 pte = &pte[pfn_level_offset(pfn, level)];
1060 unsigned long level_pfn;
1061 struct dma_pte *level_pte;
1063 if (!dma_pte_present(pte) || dma_pte_superpage(pte))
1066 level_pfn = pfn & level_mask(level);
1067 level_pte = phys_to_virt(dma_pte_addr(pte));
1070 dma_pte_free_level(domain, level - 1, retain_level,
1071 level_pte, level_pfn, start_pfn,
1076 * Free the page table if we're below the level we want to
1077 * retain and the range covers the entire table.
1079 if (level < retain_level && !(start_pfn > level_pfn ||
1080 last_pfn < level_pfn + level_size(level) - 1)) {
1082 domain_flush_cache(domain, pte, sizeof(*pte));
1083 free_pgtable_page(level_pte);
1086 pfn += level_size(level);
1087 } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1091 * clear last level (leaf) ptes and free page table pages below the
1092 * level we wish to keep intact.
1094 static void dma_pte_free_pagetable(struct dmar_domain *domain,
1095 unsigned long start_pfn,
1096 unsigned long last_pfn,
1099 BUG_ON(!domain_pfn_supported(domain, start_pfn));
1100 BUG_ON(!domain_pfn_supported(domain, last_pfn));
1101 BUG_ON(start_pfn > last_pfn);
1103 dma_pte_clear_range(domain, start_pfn, last_pfn);
1105 /* We don't need the lock here; nobody else touches the iova range */
1106 dma_pte_free_level(domain, agaw_to_level(domain->agaw), retain_level,
1107 domain->pgd, 0, start_pfn, last_pfn);
1110 if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1111 free_pgtable_page(domain->pgd);
1116 /* When a page at a given level is being unlinked from its parent, we don't
1117 need to *modify* it at all. All we need to do is make a list of all the
1118 pages which can be freed just as soon as we've flushed the IOTLB and we
1119 know the hardware page-walk will no longer touch them.
1120 The 'pte' argument is the *parent* PTE, pointing to the page that is to
1122 static struct page *dma_pte_list_pagetables(struct dmar_domain *domain,
1123 int level, struct dma_pte *pte,
1124 struct page *freelist)
1128 pg = pfn_to_page(dma_pte_addr(pte) >> PAGE_SHIFT);
1129 pg->freelist = freelist;
1135 pte = page_address(pg);
1137 if (dma_pte_present(pte) && !dma_pte_superpage(pte))
1138 freelist = dma_pte_list_pagetables(domain, level - 1,
1141 } while (!first_pte_in_page(pte));
1146 static struct page *dma_pte_clear_level(struct dmar_domain *domain, int level,
1147 struct dma_pte *pte, unsigned long pfn,
1148 unsigned long start_pfn,
1149 unsigned long last_pfn,
1150 struct page *freelist)
1152 struct dma_pte *first_pte = NULL, *last_pte = NULL;
1154 pfn = max(start_pfn, pfn);
1155 pte = &pte[pfn_level_offset(pfn, level)];
1158 unsigned long level_pfn;
1160 if (!dma_pte_present(pte))
1163 level_pfn = pfn & level_mask(level);
1165 /* If range covers entire pagetable, free it */
1166 if (start_pfn <= level_pfn &&
1167 last_pfn >= level_pfn + level_size(level) - 1) {
1168 /* These subordinate page tables are going away entirely. Don't
1169 bother to clear them; we're just going to *free* them. */
1170 if (level > 1 && !dma_pte_superpage(pte))
1171 freelist = dma_pte_list_pagetables(domain, level - 1, pte, freelist);
1177 } else if (level > 1) {
1178 /* Recurse down into a level that isn't *entirely* obsolete */
1179 freelist = dma_pte_clear_level(domain, level - 1,
1180 phys_to_virt(dma_pte_addr(pte)),
1181 level_pfn, start_pfn, last_pfn,
1185 pfn += level_size(level);
1186 } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1189 domain_flush_cache(domain, first_pte,
1190 (void *)++last_pte - (void *)first_pte);
1195 /* We can't just free the pages because the IOMMU may still be walking
1196 the page tables, and may have cached the intermediate levels. The
1197 pages can only be freed after the IOTLB flush has been done. */
1198 static struct page *domain_unmap(struct dmar_domain *domain,
1199 unsigned long start_pfn,
1200 unsigned long last_pfn)
1202 struct page *freelist = NULL;
1204 BUG_ON(!domain_pfn_supported(domain, start_pfn));
1205 BUG_ON(!domain_pfn_supported(domain, last_pfn));
1206 BUG_ON(start_pfn > last_pfn);
1208 /* we don't need the lock here; nobody else touches the iova range */
1209 freelist = dma_pte_clear_level(domain, agaw_to_level(domain->agaw),
1210 domain->pgd, 0, start_pfn, last_pfn, NULL);
1213 if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1214 struct page *pgd_page = virt_to_page(domain->pgd);
1215 pgd_page->freelist = freelist;
1216 freelist = pgd_page;
1224 static void dma_free_pagelist(struct page *freelist)
1228 while ((pg = freelist)) {
1229 freelist = pg->freelist;
1230 free_pgtable_page(page_address(pg));
1234 static void iova_entry_free(unsigned long data)
1236 struct page *freelist = (struct page *)data;
1238 dma_free_pagelist(freelist);
1241 /* iommu handling */
1242 static int iommu_alloc_root_entry(struct intel_iommu *iommu)
1244 struct root_entry *root;
1245 unsigned long flags;
1247 root = (struct root_entry *)alloc_pgtable_page(iommu->node);
1249 pr_err("Allocating root entry for %s failed\n",
1254 __iommu_flush_cache(iommu, root, ROOT_SIZE);
1256 spin_lock_irqsave(&iommu->lock, flags);
1257 iommu->root_entry = root;
1258 spin_unlock_irqrestore(&iommu->lock, flags);
1263 static void iommu_set_root_entry(struct intel_iommu *iommu)
1269 addr = virt_to_phys(iommu->root_entry);
1270 if (ecs_enabled(iommu))
1271 addr |= DMA_RTADDR_RTT;
1273 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1274 dmar_writeq(iommu->reg + DMAR_RTADDR_REG, addr);
1276 writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG);
1278 /* Make sure hardware completes it */
1279 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1280 readl, (sts & DMA_GSTS_RTPS), sts);
1282 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1285 static void iommu_flush_write_buffer(struct intel_iommu *iommu)
1290 if (!rwbf_quirk && !cap_rwbf(iommu->cap))
1293 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1294 writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG);
1296 /* Make sure hardware completes it */
1297 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1298 readl, (!(val & DMA_GSTS_WBFS)), val);
1300 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1303 /* return value determines whether we need a write buffer flush */
1304 static void __iommu_flush_context(struct intel_iommu *iommu,
1305 u16 did, u16 source_id, u8 function_mask,
1312 case DMA_CCMD_GLOBAL_INVL:
1313 val = DMA_CCMD_GLOBAL_INVL;
1315 case DMA_CCMD_DOMAIN_INVL:
1316 val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
1318 case DMA_CCMD_DEVICE_INVL:
1319 val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
1320 | DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
1325 val |= DMA_CCMD_ICC;
1327 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1328 dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
1330 /* Make sure hardware completes it */
1331 IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
1332 dmar_readq, (!(val & DMA_CCMD_ICC)), val);
1334 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1337 /* return value determines whether we need a write buffer flush */
1338 static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
1339 u64 addr, unsigned int size_order, u64 type)
1341 int tlb_offset = ecap_iotlb_offset(iommu->ecap);
1342 u64 val = 0, val_iva = 0;
1346 case DMA_TLB_GLOBAL_FLUSH:
1347 /* global flush doesn't need to set IVA_REG */
1348 val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
1350 case DMA_TLB_DSI_FLUSH:
1351 val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1353 case DMA_TLB_PSI_FLUSH:
1354 val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1355 /* IH bit is passed in as part of address */
1356 val_iva = size_order | addr;
1361 /* Note: set drain read/write */
1364 * This is probably done to be extra safe; it looks like we can
1365 * ignore it without any impact.
1367 if (cap_read_drain(iommu->cap))
1368 val |= DMA_TLB_READ_DRAIN;
1370 if (cap_write_drain(iommu->cap))
1371 val |= DMA_TLB_WRITE_DRAIN;
1373 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1374 /* Note: Only uses first TLB reg currently */
1376 dmar_writeq(iommu->reg + tlb_offset, val_iva);
1377 dmar_writeq(iommu->reg + tlb_offset + 8, val);
1379 /* Make sure hardware completes it */
1380 IOMMU_WAIT_OP(iommu, tlb_offset + 8,
1381 dmar_readq, (!(val & DMA_TLB_IVT)), val);
1383 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1385 /* check IOTLB invalidation granularity */
1386 if (DMA_TLB_IAIG(val) == 0)
1387 pr_err("Flush IOTLB failed\n");
1388 if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
1389 pr_debug("TLB flush request %Lx, actual %Lx\n",
1390 (unsigned long long)DMA_TLB_IIRG(type),
1391 (unsigned long long)DMA_TLB_IAIG(val));
1394 static struct device_domain_info *
1395 iommu_support_dev_iotlb (struct dmar_domain *domain, struct intel_iommu *iommu,
1398 struct device_domain_info *info;
1400 assert_spin_locked(&device_domain_lock);
1405 list_for_each_entry(info, &domain->devices, link)
1406 if (info->iommu == iommu && info->bus == bus &&
1407 info->devfn == devfn) {
1408 if (info->ats_supported && info->dev)
1416 static void domain_update_iotlb(struct dmar_domain *domain)
1418 struct device_domain_info *info;
1419 bool has_iotlb_device = false;
1421 assert_spin_locked(&device_domain_lock);
1423 list_for_each_entry(info, &domain->devices, link) {
1424 struct pci_dev *pdev;
1426 if (!info->dev || !dev_is_pci(info->dev))
1429 pdev = to_pci_dev(info->dev);
1430 if (pdev->ats_enabled) {
1431 has_iotlb_device = true;
1436 domain->has_iotlb_device = has_iotlb_device;
1439 static void iommu_enable_dev_iotlb(struct device_domain_info *info)
1441 struct pci_dev *pdev;
1443 assert_spin_locked(&device_domain_lock);
1445 if (!info || !dev_is_pci(info->dev))
1448 pdev = to_pci_dev(info->dev);
1449 /* For an IOMMU that supports device IOTLB throttling (DIT), we assign
1450 * PFSID to the invalidation desc of a VF such that IOMMU HW can gauge
1451 * queue depth at PF level. If DIT is not set, PFSID will be treated as
1452 * reserved, which should be set to 0.
1454 if (!ecap_dit(info->iommu->ecap))
1457 struct pci_dev *pf_pdev;
1459 /* pci_physfn() returns pdev itself if the device is not a VF */
1460 pf_pdev = pci_physfn(pdev);
1461 info->pfsid = PCI_DEVID(pf_pdev->bus->number, pf_pdev->devfn);
1464 #ifdef CONFIG_INTEL_IOMMU_SVM
1465 /* The PCIe spec, in its wisdom, declares that the behaviour of
1466 the device if you enable PASID support after ATS support is
1467 undefined. So always enable PASID support on devices which
1468 have it, even if we can't yet know if we're ever going to
1470 if (info->pasid_supported && !pci_enable_pasid(pdev, info->pasid_supported & ~1))
1471 info->pasid_enabled = 1;
1473 if (info->pri_supported && !pci_reset_pri(pdev) && !pci_enable_pri(pdev, 32))
1474 info->pri_enabled = 1;
1476 if (!pdev->untrusted && info->ats_supported &&
1477 !pci_enable_ats(pdev, VTD_PAGE_SHIFT)) {
1478 info->ats_enabled = 1;
1479 domain_update_iotlb(info->domain);
1480 info->ats_qdep = pci_ats_queue_depth(pdev);
1484 static void iommu_disable_dev_iotlb(struct device_domain_info *info)
1486 struct pci_dev *pdev;
1488 assert_spin_locked(&device_domain_lock);
1490 if (!dev_is_pci(info->dev))
1493 pdev = to_pci_dev(info->dev);
1495 if (info->ats_enabled) {
1496 pci_disable_ats(pdev);
1497 info->ats_enabled = 0;
1498 domain_update_iotlb(info->domain);
1500 #ifdef CONFIG_INTEL_IOMMU_SVM
1501 if (info->pri_enabled) {
1502 pci_disable_pri(pdev);
1503 info->pri_enabled = 0;
1505 if (info->pasid_enabled) {
1506 pci_disable_pasid(pdev);
1507 info->pasid_enabled = 0;
1512 static void iommu_flush_dev_iotlb(struct dmar_domain *domain,
1513 u64 addr, unsigned mask)
1516 unsigned long flags;
1517 struct device_domain_info *info;
1519 if (!domain->has_iotlb_device)
1522 spin_lock_irqsave(&device_domain_lock, flags);
1523 list_for_each_entry(info, &domain->devices, link) {
1524 if (!info->ats_enabled)
1527 sid = info->bus << 8 | info->devfn;
1528 qdep = info->ats_qdep;
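/*
 * sid is the PCI requester ID (bus number in bits 15:8, devfn in bits
 * 7:0); qdep is the invalidation queue depth recorded when ATS was
 * enabled for this device.
 */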
1529 qi_flush_dev_iotlb(info->iommu, sid, info->pfsid,
1532 spin_unlock_irqrestore(&device_domain_lock, flags);
1535 static void iommu_flush_iotlb_psi(struct intel_iommu *iommu,
1536 struct dmar_domain *domain,
1537 unsigned long pfn, unsigned int pages,
1540 unsigned int mask = ilog2(__roundup_pow_of_two(pages));
1541 uint64_t addr = (uint64_t)pfn << VTD_PAGE_SHIFT;
1542 u16 did = domain->iommu_did[iommu->seq_id];
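/*
 * mask picks a power-of-two invalidation size: e.g. pages == 5 rounds
 * up to 8, giving mask == 3 and a PSI that covers 2^3 contiguous VT-d
 * pages.
 */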
1549 * Fallback to domain selective flush if no PSI support or the size is
1551 * PSI requires page size to be 2 ^ x, and the base address is naturally
1552 * aligned to the size
1554 if (!cap_pgsel_inv(iommu->cap) || mask > cap_max_amask_val(iommu->cap))
1555 iommu->flush.flush_iotlb(iommu, did, 0, 0,
1558 iommu->flush.flush_iotlb(iommu, did, addr | ih, mask,
1562 * In caching mode, changes of pages from non-present to present require
1563 * a flush. However, the device IOTLB doesn't need to be flushed in this case.
1565 if (!cap_caching_mode(iommu->cap) || !map)
1566 iommu_flush_dev_iotlb(domain, addr, mask);
1569 /* Notification for newly created mappings */
1570 static inline void __mapping_notify_one(struct intel_iommu *iommu,
1571 struct dmar_domain *domain,
1572 unsigned long pfn, unsigned int pages)
1574 /* It's a non-present to present mapping. Only flush if caching mode */
1575 if (cap_caching_mode(iommu->cap))
1576 iommu_flush_iotlb_psi(iommu, domain, pfn, pages, 0, 1);
1578 iommu_flush_write_buffer(iommu);
1581 static void iommu_flush_iova(struct iova_domain *iovad)
1583 struct dmar_domain *domain;
1586 domain = container_of(iovad, struct dmar_domain, iovad);
1588 for_each_domain_iommu(idx, domain) {
1589 struct intel_iommu *iommu = g_iommus[idx];
1590 u16 did = domain->iommu_did[iommu->seq_id];
1592 iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH);
1594 if (!cap_caching_mode(iommu->cap))
1595 iommu_flush_dev_iotlb(get_iommu_domain(iommu, did),
1596 0, MAX_AGAW_PFN_WIDTH);
1600 static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
1603 unsigned long flags;
1605 raw_spin_lock_irqsave(&iommu->register_lock, flags);
1606 pmen = readl(iommu->reg + DMAR_PMEN_REG);
1607 pmen &= ~DMA_PMEN_EPM;
1608 writel(pmen, iommu->reg + DMAR_PMEN_REG);
1610 /* wait for the protected region status bit to clear */
1611 IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
1612 readl, !(pmen & DMA_PMEN_PRS), pmen);
1614 raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1617 static void iommu_enable_translation(struct intel_iommu *iommu)
1620 unsigned long flags;
1622 raw_spin_lock_irqsave(&iommu->register_lock, flags);
1623 iommu->gcmd |= DMA_GCMD_TE;
1624 writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1626 /* Make sure hardware completes it */
1627 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1628 readl, (sts & DMA_GSTS_TES), sts);
1630 raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1633 static void iommu_disable_translation(struct intel_iommu *iommu)
1638 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1639 iommu->gcmd &= ~DMA_GCMD_TE;
1640 writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1642 /* Make sure hardware completes it */
1643 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1644 readl, (!(sts & DMA_GSTS_TES)), sts);
1646 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1650 static int iommu_init_domains(struct intel_iommu *iommu)
1652 u32 ndomains, nlongs;
1655 ndomains = cap_ndoms(iommu->cap);
1656 pr_debug("%s: Number of Domains supported <%d>\n",
1657 iommu->name, ndomains);
1658 nlongs = BITS_TO_LONGS(ndomains);
1660 spin_lock_init(&iommu->lock);
1662 iommu->domain_ids = kcalloc(nlongs, sizeof(unsigned long), GFP_KERNEL);
1663 if (!iommu->domain_ids) {
1664 pr_err("%s: Allocating domain id array failed\n",
1669 size = (ALIGN(ndomains, 256) >> 8) * sizeof(struct dmar_domain **);
1670 iommu->domains = kzalloc(size, GFP_KERNEL);
1672 if (iommu->domains) {
1673 size = 256 * sizeof(struct dmar_domain *);
1674 iommu->domains[0] = kzalloc(size, GFP_KERNEL);
1677 if (!iommu->domains || !iommu->domains[0]) {
1678 pr_err("%s: Allocating domain array failed\n",
1680 kfree(iommu->domain_ids);
1681 kfree(iommu->domains);
1682 iommu->domain_ids = NULL;
1683 iommu->domains = NULL;
1690 * If Caching mode is set, then invalid translations are tagged
1691 * with domain-id 0, hence we need to pre-allocate it. We also
1692 * use domain-id 0 as a marker for non-allocated domain-id, so
1693 * make sure it is not used for a real domain.
1695 set_bit(0, iommu->domain_ids);
1700 static void disable_dmar_iommu(struct intel_iommu *iommu)
1702 struct device_domain_info *info, *tmp;
1703 unsigned long flags;
1705 if (!iommu->domains || !iommu->domain_ids)
1709 spin_lock_irqsave(&device_domain_lock, flags);
1710 list_for_each_entry_safe(info, tmp, &device_domain_list, global) {
1711 struct dmar_domain *domain;
1713 if (info->iommu != iommu)
1716 if (!info->dev || !info->domain)
1719 domain = info->domain;
1721 __dmar_remove_one_dev_info(info);
1723 if (!domain_type_is_vm_or_si(domain)) {
1725 * The domain_exit() function can't be called under
1726 * device_domain_lock, as it takes this lock itself.
1727 * So release the lock here and re-run the loop
1730 spin_unlock_irqrestore(&device_domain_lock, flags);
1731 domain_exit(domain);
1735 spin_unlock_irqrestore(&device_domain_lock, flags);
1737 if (iommu->gcmd & DMA_GCMD_TE)
1738 iommu_disable_translation(iommu);
1741 static void free_dmar_iommu(struct intel_iommu *iommu)
1743 if ((iommu->domains) && (iommu->domain_ids)) {
1744 int elems = ALIGN(cap_ndoms(iommu->cap), 256) >> 8;
1747 for (i = 0; i < elems; i++)
1748 kfree(iommu->domains[i]);
1749 kfree(iommu->domains);
1750 kfree(iommu->domain_ids);
1751 iommu->domains = NULL;
1752 iommu->domain_ids = NULL;
1755 g_iommus[iommu->seq_id] = NULL;
1757 /* free context mapping */
1758 free_context_table(iommu);
1760 #ifdef CONFIG_INTEL_IOMMU_SVM
1761 if (pasid_enabled(iommu)) {
1762 if (ecap_prs(iommu->ecap))
1763 intel_svm_finish_prq(iommu);
1764 intel_svm_exit(iommu);
1769 static struct dmar_domain *alloc_domain(int flags)
1771 struct dmar_domain *domain;
1773 domain = alloc_domain_mem();
1777 memset(domain, 0, sizeof(*domain));
1779 domain->flags = flags;
1780 domain->has_iotlb_device = false;
1781 INIT_LIST_HEAD(&domain->devices);
1786 /* Must be called with iommu->lock held */
1787 static int domain_attach_iommu(struct dmar_domain *domain,
1788 struct intel_iommu *iommu)
1790 unsigned long ndomains;
1793 assert_spin_locked(&device_domain_lock);
1794 assert_spin_locked(&iommu->lock);
1796 domain->iommu_refcnt[iommu->seq_id] += 1;
1797 domain->iommu_count += 1;
1798 if (domain->iommu_refcnt[iommu->seq_id] == 1) {
1799 ndomains = cap_ndoms(iommu->cap);
1800 num = find_first_zero_bit(iommu->domain_ids, ndomains);
1802 if (num >= ndomains) {
1803 pr_err("%s: No free domain ids\n", iommu->name);
1804 domain->iommu_refcnt[iommu->seq_id] -= 1;
1805 domain->iommu_count -= 1;
1809 set_bit(num, iommu->domain_ids);
1810 set_iommu_domain(iommu, num, domain);
1812 domain->iommu_did[iommu->seq_id] = num;
1813 domain->nid = iommu->node;
1815 domain_update_iommu_cap(domain);
1821 static int domain_detach_iommu(struct dmar_domain *domain,
1822 struct intel_iommu *iommu)
1824 int num, count = INT_MAX;
1826 assert_spin_locked(&device_domain_lock);
1827 assert_spin_locked(&iommu->lock);
1829 domain->iommu_refcnt[iommu->seq_id] -= 1;
1830 count = --domain->iommu_count;
1831 if (domain->iommu_refcnt[iommu->seq_id] == 0) {
1832 num = domain->iommu_did[iommu->seq_id];
1833 clear_bit(num, iommu->domain_ids);
1834 set_iommu_domain(iommu, num, NULL);
1836 domain_update_iommu_cap(domain);
1837 domain->iommu_did[iommu->seq_id] = 0;
1843 static struct iova_domain reserved_iova_list;
1844 static struct lock_class_key reserved_rbtree_key;
1846 static int dmar_init_reserved_ranges(void)
1848 struct pci_dev *pdev = NULL;
1852 init_iova_domain(&reserved_iova_list, VTD_PAGE_SIZE, IOVA_START_PFN);
1854 lockdep_set_class(&reserved_iova_list.iova_rbtree_lock,
1855 &reserved_rbtree_key);
1857 /* IOAPIC ranges shouldn't be accessed by DMA */
1858 iova = reserve_iova(&reserved_iova_list, IOVA_PFN(IOAPIC_RANGE_START),
1859 IOVA_PFN(IOAPIC_RANGE_END));
1861 pr_err("Reserve IOAPIC range failed\n");
1865 /* Reserve all PCI MMIO to avoid peer-to-peer access */
1866 for_each_pci_dev(pdev) {
1869 for (i = 0; i < PCI_NUM_RESOURCES; i++) {
1870 r = &pdev->resource[i];
1871 if (!r->flags || !(r->flags & IORESOURCE_MEM))
1873 iova = reserve_iova(&reserved_iova_list,
1877 pr_err("Reserve iova failed\n");
1885 static void domain_reserve_special_ranges(struct dmar_domain *domain)
1887 copy_reserved_iova(&reserved_iova_list, &domain->iovad);
1890 static inline int guestwidth_to_adjustwidth(int gaw)
1893 int r = (gaw - 12) % 9;
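/*
 * The modulo above suggests the adjusted width is gaw rounded up so that
 * the bits above the 12-bit page offset split into whole 9-bit levels:
 * 39, 48 and 57 would pass through unchanged, while e.g. a 36-bit guest
 * width would presumably be rounded up to 39.
 */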
1904 static int domain_init(struct dmar_domain *domain, struct intel_iommu *iommu,
1907 int adjust_width, agaw;
1908 unsigned long sagaw;
1911 init_iova_domain(&domain->iovad, VTD_PAGE_SIZE, IOVA_START_PFN);
1913 err = init_iova_flush_queue(&domain->iovad,
1914 iommu_flush_iova, iova_entry_free);
1918 domain_reserve_special_ranges(domain);
1920 /* calculate AGAW */
1921 if (guest_width > cap_mgaw(iommu->cap))
1922 guest_width = cap_mgaw(iommu->cap);
1923 domain->gaw = guest_width;
1924 adjust_width = guestwidth_to_adjustwidth(guest_width);
1925 agaw = width_to_agaw(adjust_width);
1926 sagaw = cap_sagaw(iommu->cap);
1927 if (!test_bit(agaw, &sagaw)) {
1928 /* hardware doesn't support it, choose a bigger one */
1929 pr_debug("Hardware doesn't support agaw %d\n", agaw);
1930 agaw = find_next_bit(&sagaw, 5, agaw);
1934 domain->agaw = agaw;
1936 if (ecap_coherent(iommu->ecap))
1937 domain->iommu_coherency = 1;
1939 domain->iommu_coherency = 0;
1941 if (ecap_sc_support(iommu->ecap))
1942 domain->iommu_snooping = 1;
1944 domain->iommu_snooping = 0;
1946 if (intel_iommu_superpage)
1947 domain->iommu_superpage = fls(cap_super_page_val(iommu->cap));
1949 domain->iommu_superpage = 0;
1951 domain->nid = iommu->node;
1953 /* always allocate the top pgd */
1954 domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
1957 __iommu_flush_cache(iommu, domain->pgd, PAGE_SIZE);
1961 static void domain_exit(struct dmar_domain *domain)
1963 struct page *freelist = NULL;
1965 /* Domain 0 is reserved, so don't process it */
1969 /* Remove associated devices and clear attached or cached domains */
1971 domain_remove_dev_info(domain);
1975 put_iova_domain(&domain->iovad);
1977 freelist = domain_unmap(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
1979 dma_free_pagelist(freelist);
1981 free_domain_mem(domain);
1984 static int domain_context_mapping_one(struct dmar_domain *domain,
1985 struct intel_iommu *iommu,
1988 u16 did = domain->iommu_did[iommu->seq_id];
1989 int translation = CONTEXT_TT_MULTI_LEVEL;
1990 struct device_domain_info *info = NULL;
1991 struct context_entry *context;
1992 unsigned long flags;
1993 struct dma_pte *pgd;
1998 if (hw_pass_through && domain_type_is_si(domain))
1999 translation = CONTEXT_TT_PASS_THROUGH;
2001 pr_debug("Set context mapping for %02x:%02x.%d\n",
2002 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
2004 BUG_ON(!domain->pgd);
2006 spin_lock_irqsave(&device_domain_lock, flags);
2007 spin_lock(&iommu->lock);
2010 context = iommu_context_addr(iommu, bus, devfn, 1);
2015 if (context_present(context))
2019 * For kdump cases, old valid entries may be cached due to the
2020 * in-flight DMA and copied pgtable, but there is no unmapping
2021 * behaviour for them, thus we need an explicit cache flush for
2022 * the newly-mapped device. For kdump, at this point, the device
2023 * is supposed to finish reset at its driver probe stage, so no
2024 * in-flight DMA will exist, and we don't need to worry anymore
2027 if (context_copied(context)) {
2028 u16 did_old = context_domain_id(context);
2030 if (did_old < cap_ndoms(iommu->cap)) {
2031 iommu->flush.flush_context(iommu, did_old,
2032 (((u16)bus) << 8) | devfn,
2033 DMA_CCMD_MASK_NOBIT,
2034 DMA_CCMD_DEVICE_INVL);
2035 iommu->flush.flush_iotlb(iommu, did_old, 0, 0,
2042 context_clear_entry(context);
2043 context_set_domain_id(context, did);
2046 * Skip top levels of page tables for an iommu whose agaw is smaller
2047 * than the default. Unnecessary for PT mode.
2049 if (translation != CONTEXT_TT_PASS_THROUGH) {
2050 for (agaw = domain->agaw; agaw != iommu->agaw; agaw--) {
2052 pgd = phys_to_virt(dma_pte_addr(pgd));
2053 if (!dma_pte_present(pgd))
2057 info = iommu_support_dev_iotlb(domain, iommu, bus, devfn);
2058 if (info && info->ats_supported)
2059 translation = CONTEXT_TT_DEV_IOTLB;
2061 translation = CONTEXT_TT_MULTI_LEVEL;
2063 context_set_address_root(context, virt_to_phys(pgd));
2064 context_set_address_width(context, iommu->agaw);
2067 * In pass through mode, AW must be programmed to
2068 * indicate the largest AGAW value supported by
2069 * hardware. And ASR is ignored by hardware.
2071 context_set_address_width(context, iommu->msagaw);
2074 context_set_translation_type(context, translation);
2075 context_set_fault_enable(context);
2076 context_set_present(context);
2077 domain_flush_cache(domain, context, sizeof(*context));
2080 * It's a non-present to present mapping. If hardware doesn't cache
2081 * non-present entries we only need to flush the write-buffer. If it
2082 * _does_ cache non-present entries, then it does so in the special
2083 * domain #0, which we have to flush:
2085 if (cap_caching_mode(iommu->cap)) {
2086 iommu->flush.flush_context(iommu, 0,
2087 (((u16)bus) << 8) | devfn,
2088 DMA_CCMD_MASK_NOBIT,
2089 DMA_CCMD_DEVICE_INVL);
2090 iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH);
2092 iommu_flush_write_buffer(iommu);
2094 iommu_enable_dev_iotlb(info);
2099 spin_unlock(&iommu->lock);
2100 spin_unlock_irqrestore(&device_domain_lock, flags);
2105 struct domain_context_mapping_data {
2106 struct dmar_domain *domain;
2107 struct intel_iommu *iommu;
2110 static int domain_context_mapping_cb(struct pci_dev *pdev,
2111 u16 alias, void *opaque)
2113 struct domain_context_mapping_data *data = opaque;
2115 return domain_context_mapping_one(data->domain, data->iommu,
2116 PCI_BUS_NUM(alias), alias & 0xff);
2120 domain_context_mapping(struct dmar_domain *domain, struct device *dev)
2122 struct intel_iommu *iommu;
2124 struct domain_context_mapping_data data;
2126 iommu = device_to_iommu(dev, &bus, &devfn);
2130 if (!dev_is_pci(dev))
2131 return domain_context_mapping_one(domain, iommu, bus, devfn);
2133 data.domain = domain;
2136 return pci_for_each_dma_alias(to_pci_dev(dev),
2137 &domain_context_mapping_cb, &data);
2140 static int domain_context_mapped_cb(struct pci_dev *pdev,
2141 u16 alias, void *opaque)
2143 struct intel_iommu *iommu = opaque;
2145 return !device_context_mapped(iommu, PCI_BUS_NUM(alias), alias & 0xff);
2148 static int domain_context_mapped(struct device *dev)
2150 struct intel_iommu *iommu;
2153 iommu = device_to_iommu(dev, &bus, &devfn);
2157 if (!dev_is_pci(dev))
2158 return device_context_mapped(iommu, bus, devfn);
2160 return !pci_for_each_dma_alias(to_pci_dev(dev),
2161 domain_context_mapped_cb, iommu);
2164 /* Returns the number of VT-d pages, but aligned to the MM page size */
2165 static inline unsigned long aligned_nrpages(unsigned long host_addr,
2168 host_addr &= ~PAGE_MASK;
2169 return PAGE_ALIGN(host_addr + size) >> VTD_PAGE_SHIFT;
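/*
 * Example: host_addr == 0x1001 and size == 0x1000 straddle two 4KiB
 * pages, so the one-byte offset into the page is added back before
 * rounding and the helper returns 2 VT-d pages rather than 1.
 */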
2172 /* Return largest possible superpage level for a given mapping */
2173 static inline int hardware_largepage_caps(struct dmar_domain *domain,
2174 unsigned long iov_pfn,
2175 unsigned long phy_pfn,
2176 unsigned long pages)
2178 int support, level = 1;
2179 unsigned long pfnmerge;
2181 support = domain->iommu_superpage;
2183 /* To use a large page, the virtual *and* physical addresses
2184 must be aligned to 2MiB/1GiB/etc. Lower bits set in either
2185 of them will mean we have to use smaller pages. So just
2186 merge them and check both at once. */
2187 pfnmerge = iov_pfn | phy_pfn;
2189 while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) {
2190 pages >>= VTD_STRIDE_SHIFT;
2193 pfnmerge >>= VTD_STRIDE_SHIFT;
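/*
 * Example, assuming VTD_STRIDE_SHIFT is 9: with superpage support, a
 * request for 1024 pages where both iov_pfn and phy_pfn are multiples of
 * 512 should be able to use a level-2 (2MiB) superpage; if either pfn
 * has any of its low 9 bits set, the merged value fails the alignment
 * check and the mapping stays at the 4KiB level.
 */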
2200 static int __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2201 struct scatterlist *sg, unsigned long phys_pfn,
2202 unsigned long nr_pages, int prot)
2204 struct dma_pte *first_pte = NULL, *pte = NULL;
2205 phys_addr_t uninitialized_var(pteval);
2206 unsigned long sg_res = 0;
2207 unsigned int largepage_lvl = 0;
2208 unsigned long lvl_pages = 0;
2210 BUG_ON(!domain_pfn_supported(domain, iov_pfn + nr_pages - 1));
2212 if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
2215 prot &= DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP;
2219 pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | prot;
2222 while (nr_pages > 0) {
2226 unsigned int pgoff = sg->offset & ~PAGE_MASK;
2228 sg_res = aligned_nrpages(sg->offset, sg->length);
2229 sg->dma_address = ((dma_addr_t)iov_pfn << VTD_PAGE_SHIFT) + pgoff;
2230 sg->dma_length = sg->length;
2231 pteval = (sg_phys(sg) - pgoff) | prot;
2232 phys_pfn = pteval >> VTD_PAGE_SHIFT;
2236 largepage_lvl = hardware_largepage_caps(domain, iov_pfn, phys_pfn, sg_res);
2238 first_pte = pte = pfn_to_dma_pte(domain, iov_pfn, &largepage_lvl);
2241 /* It is a large page */
2242 if (largepage_lvl > 1) {
2243 unsigned long nr_superpages, end_pfn;
2245 pteval |= DMA_PTE_LARGE_PAGE;
2246 lvl_pages = lvl_to_nr_pages(largepage_lvl);
2248 nr_superpages = sg_res / lvl_pages;
2249 end_pfn = iov_pfn + nr_superpages * lvl_pages - 1;
2252 * Ensure that old small page tables are
2253 * removed to make room for superpage(s).
2254 * We're adding new large pages, so make sure
2255 * we don't remove their parent tables.
2257 dma_pte_free_pagetable(domain, iov_pfn, end_pfn,
2260 pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE;
2264 /* We don't need the lock here; nobody else
2265 * touches the iova range
2267 tmp = cmpxchg64_local(&pte->val, 0ULL, pteval);
2269 static int dumps = 5;
2270 pr_crit("ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n",
2271 iov_pfn, tmp, (unsigned long long)pteval);
2274 debug_dma_dump_mappings(NULL);
2279 lvl_pages = lvl_to_nr_pages(largepage_lvl);
2281 BUG_ON(nr_pages < lvl_pages);
2282 BUG_ON(sg_res < lvl_pages);
2284 nr_pages -= lvl_pages;
2285 iov_pfn += lvl_pages;
2286 phys_pfn += lvl_pages;
2287 pteval += lvl_pages * VTD_PAGE_SIZE;
2288 sg_res -= lvl_pages;
2290 /* If the next PTE would be the first in a new page, then we
2291 need to flush the cache on the entries we've just written.
2292 And then we'll need to recalculate 'pte', so clear it and
2293 let it get set again in the if (!pte) block above.
2295 If we're done (!nr_pages) we need to flush the cache too.
2297 Also if we've been setting superpages, we may need to
2298 recalculate 'pte' and switch back to smaller pages for the
2299 end of the mapping, if the trailing size is not enough to
2300 use another superpage (i.e. sg_res < lvl_pages). */
2302 if (!nr_pages || first_pte_in_page(pte) ||
2303 (largepage_lvl > 1 && sg_res < lvl_pages)) {
2304 domain_flush_cache(domain, first_pte,
2305 (void *)pte - (void *)first_pte);
2309 if (!sg_res && nr_pages)
2315 static int domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2316 struct scatterlist *sg, unsigned long phys_pfn,
2317 unsigned long nr_pages, int prot)
2320 struct intel_iommu *iommu;
2322 /* Do the real mapping first */
2323 ret = __domain_mapping(domain, iov_pfn, sg, phys_pfn, nr_pages, prot);
2327 /* Notify about the new mapping */
2328 if (domain_type_is_vm(domain)) {
2329 /* VM typed domains can have more than one IOMMU */
2331 for_each_domain_iommu(iommu_id, domain) {
2332 iommu = g_iommus[iommu_id];
2333 __mapping_notify_one(iommu, domain, iov_pfn, nr_pages);
2336 /* General domains only have one IOMMU */
2337 iommu = domain_get_iommu(domain);
2338 __mapping_notify_one(iommu, domain, iov_pfn, nr_pages);
2344 static inline int domain_sg_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2345 struct scatterlist *sg, unsigned long nr_pages,
2348 return domain_mapping(domain, iov_pfn, sg, 0, nr_pages, prot);
2351 static inline int domain_pfn_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2352 unsigned long phys_pfn, unsigned long nr_pages,
2355 return domain_mapping(domain, iov_pfn, NULL, phys_pfn, nr_pages, prot);
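/*
 * Illustrative call flow (a sketch, not invoked anywhere): the DMA API
 * entry points later in this file funnel into the two wrappers above.
 * For a single physically contiguous buffer the path is roughly
 *
 *	iova_pfn = intel_alloc_iova(dev, domain, dma_to_mm_pfn(nrpages), mask);
 *	domain_pfn_mapping(domain, mm_to_dma_pfn(iova_pfn),
 *			   mm_to_dma_pfn(phys_pfn), nrpages, prot);
 *
 * (mirroring __intel_map_page() below), while scatterlists go through
 * domain_sg_mapping(), which hands the sg list to __domain_mapping()
 * instead of a starting phys_pfn.
 */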
2358 static void domain_context_clear_one(struct intel_iommu *iommu, u8 bus, u8 devfn)
2360 unsigned long flags;
2361 struct context_entry *context;
2367 spin_lock_irqsave(&iommu->lock, flags);
2368 context = iommu_context_addr(iommu, bus, devfn, 0);
2370 spin_unlock_irqrestore(&iommu->lock, flags);
2373 did_old = context_domain_id(context);
2374 context_clear_entry(context);
2375 __iommu_flush_cache(iommu, context, sizeof(*context));
2376 spin_unlock_irqrestore(&iommu->lock, flags);
2377 iommu->flush.flush_context(iommu,
2379 (((u16)bus) << 8) | devfn,
2380 DMA_CCMD_MASK_NOBIT,
2381 DMA_CCMD_DEVICE_INVL);
2382 iommu->flush.flush_iotlb(iommu,
2389 static inline void unlink_domain_info(struct device_domain_info *info)
2391 assert_spin_locked(&device_domain_lock);
2392 list_del(&info->link);
2393 list_del(&info->global);
2395 info->dev->archdata.iommu = NULL;
2398 static void domain_remove_dev_info(struct dmar_domain *domain)
2400 struct device_domain_info *info, *tmp;
2401 unsigned long flags;
2403 spin_lock_irqsave(&device_domain_lock, flags);
2404 list_for_each_entry_safe(info, tmp, &domain->devices, link)
2405 __dmar_remove_one_dev_info(info);
2406 spin_unlock_irqrestore(&device_domain_lock, flags);
2411 * Note: we use struct device->archdata.iommu to store the info
2413 static struct dmar_domain *find_domain(struct device *dev)
2415 struct device_domain_info *info;
2417 /* No lock here, assumes no domain exit in normal case */
2418 info = dev->archdata.iommu;
2420 return info->domain;
2424 static inline struct device_domain_info *
2425 dmar_search_domain_by_dev_info(int segment, int bus, int devfn)
2427 struct device_domain_info *info;
2429 list_for_each_entry(info, &device_domain_list, global)
2430 if (info->iommu->segment == segment && info->bus == bus &&
2431 info->devfn == devfn)
2437 static struct dmar_domain *dmar_insert_one_dev_info(struct intel_iommu *iommu,
2440 struct dmar_domain *domain)
2442 struct dmar_domain *found = NULL;
2443 struct device_domain_info *info;
2444 unsigned long flags;
2447 info = alloc_devinfo_mem();
2452 info->devfn = devfn;
2453 info->ats_supported = info->pasid_supported = info->pri_supported = 0;
2454 info->ats_enabled = info->pasid_enabled = info->pri_enabled = 0;
2457 info->domain = domain;
2458 info->iommu = iommu;
2459 info->pasid_table = NULL;
2461 if (dev && dev_is_pci(dev)) {
2462 struct pci_dev *pdev = to_pci_dev(info->dev);
2464 if (!pci_ats_disabled() &&
2465 ecap_dev_iotlb_support(iommu->ecap) &&
2466 pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_ATS) &&
2467 dmar_find_matched_atsr_unit(pdev))
2468 info->ats_supported = 1;
2470 if (ecs_enabled(iommu)) {
2471 if (pasid_enabled(iommu)) {
2472 int features = pci_pasid_features(pdev);
2474 info->pasid_supported = features | 1;
2477 if (info->ats_supported && ecap_prs(iommu->ecap) &&
2478 pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_PRI))
2479 info->pri_supported = 1;
2483 spin_lock_irqsave(&device_domain_lock, flags);
2485 found = find_domain(dev);
2488 struct device_domain_info *info2;
2489 info2 = dmar_search_domain_by_dev_info(iommu->segment, bus, devfn);
2491 found = info2->domain;
2497 spin_unlock_irqrestore(&device_domain_lock, flags);
2498 free_devinfo_mem(info);
2499 /* Caller must free the original domain */
2503 spin_lock(&iommu->lock);
2504 ret = domain_attach_iommu(domain, iommu);
2505 spin_unlock(&iommu->lock);
2508 spin_unlock_irqrestore(&device_domain_lock, flags);
2509 free_devinfo_mem(info);
2513 list_add(&info->link, &domain->devices);
2514 list_add(&info->global, &device_domain_list);
2516 dev->archdata.iommu = info;
2518 if (dev && dev_is_pci(dev) && info->pasid_supported) {
2519 ret = intel_pasid_alloc_table(dev);
2521 pr_warn("No pasid table for %s, pasid disabled\n",
2523 info->pasid_supported = 0;
2526 spin_unlock_irqrestore(&device_domain_lock, flags);
2528 if (dev && domain_context_mapping(domain, dev)) {
2529 pr_err("Domain context map for %s failed\n", dev_name(dev));
2530 dmar_remove_one_dev_info(domain, dev);
2537 static int get_last_alias(struct pci_dev *pdev, u16 alias, void *opaque)
2539 *(u16 *)opaque = alias;
2543 static struct dmar_domain *find_or_alloc_domain(struct device *dev, int gaw)
2545 struct device_domain_info *info = NULL;
2546 struct dmar_domain *domain = NULL;
2547 struct intel_iommu *iommu;
2549 unsigned long flags;
2552 iommu = device_to_iommu(dev, &bus, &devfn);
2556 if (dev_is_pci(dev)) {
2557 struct pci_dev *pdev = to_pci_dev(dev);
2559 pci_for_each_dma_alias(pdev, get_last_alias, &dma_alias);
2561 spin_lock_irqsave(&device_domain_lock, flags);
2562 info = dmar_search_domain_by_dev_info(pci_domain_nr(pdev->bus),
2563 PCI_BUS_NUM(dma_alias),
2566 iommu = info->iommu;
2567 domain = info->domain;
2569 spin_unlock_irqrestore(&device_domain_lock, flags);
2571 /* DMA alias already has a domain, use it */
2576 /* Allocate and initialize new domain for the device */
2577 domain = alloc_domain(0);
2580 if (domain_init(domain, iommu, gaw)) {
2581 domain_exit(domain);
2590 static struct dmar_domain *set_domain_for_dev(struct device *dev,
2591 struct dmar_domain *domain)
2593 struct intel_iommu *iommu;
2594 struct dmar_domain *tmp;
2595 u16 req_id, dma_alias;
2598 iommu = device_to_iommu(dev, &bus, &devfn);
2602 req_id = ((u16)bus << 8) | devfn;
2604 if (dev_is_pci(dev)) {
2605 struct pci_dev *pdev = to_pci_dev(dev);
2607 pci_for_each_dma_alias(pdev, get_last_alias, &dma_alias);
2609 /* register PCI DMA alias device */
2610 if (req_id != dma_alias) {
2611 tmp = dmar_insert_one_dev_info(iommu, PCI_BUS_NUM(dma_alias),
2612 dma_alias & 0xff, NULL, domain);
2614 if (!tmp || tmp != domain)
2619 tmp = dmar_insert_one_dev_info(iommu, bus, devfn, dev, domain);
2620 if (!tmp || tmp != domain)
2626 static struct dmar_domain *get_domain_for_dev(struct device *dev, int gaw)
2628 struct dmar_domain *domain, *tmp;
2630 domain = find_domain(dev);
2634 domain = find_or_alloc_domain(dev, gaw);
2638 tmp = set_domain_for_dev(dev, domain);
2639 if (!tmp || domain != tmp) {
2640 domain_exit(domain);
2649 static int iommu_domain_identity_map(struct dmar_domain *domain,
2650 unsigned long long start,
2651 unsigned long long end)
2653 unsigned long first_vpfn = start >> VTD_PAGE_SHIFT;
2654 unsigned long last_vpfn = end >> VTD_PAGE_SHIFT;
2656 if (!reserve_iova(&domain->iovad, dma_to_mm_pfn(first_vpfn),
2657 dma_to_mm_pfn(last_vpfn))) {
2658 pr_err("Reserving iova failed\n");
2662 pr_debug("Mapping reserved region %llx-%llx\n", start, end);
2664 * RMRR range might have overlap with physical memory range,
2667 dma_pte_clear_range(domain, first_vpfn, last_vpfn);
2669 return __domain_mapping(domain, first_vpfn, NULL,
2670 first_vpfn, last_vpfn - first_vpfn + 1,
2671 DMA_PTE_READ|DMA_PTE_WRITE);
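/*
 * Usage sketch (mirrors si_domain_init() below): an identity map simply
 * makes the IOVA equal to the physical address for the range, so for every
 * usable memory block the caller does something like
 *
 *	iommu_domain_identity_map(si_domain,
 *				  PFN_PHYS(start_pfn), PFN_PHYS(end_pfn));
 *
 * after which devices attached to that domain see DMA addresses that
 * match physical addresses for those regions.
 */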
2674 static int domain_prepare_identity_map(struct device *dev,
2675 struct dmar_domain *domain,
2676 unsigned long long start,
2677 unsigned long long end)
2679 /* For _hardware_ passthrough, don't bother. But for software
2680 passthrough, we do it anyway -- it may indicate a memory
2681 range which is reserved in E820, and so didn't get set
2682 up to start with in si_domain */
2683 if (domain == si_domain && hw_pass_through) {
2684 pr_warn("Ignoring identity map for HW passthrough device %s [0x%Lx - 0x%Lx]\n",
2685 dev_name(dev), start, end);
2689 pr_info("Setting identity map for device %s [0x%Lx - 0x%Lx]\n",
2690 dev_name(dev), start, end);
2693 WARN(1, "Your BIOS is broken; RMRR ends before it starts!\n"
2694 "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
2695 dmi_get_system_info(DMI_BIOS_VENDOR),
2696 dmi_get_system_info(DMI_BIOS_VERSION),
2697 dmi_get_system_info(DMI_PRODUCT_VERSION));
2701 if (end >> agaw_to_width(domain->agaw)) {
2702 WARN(1, "Your BIOS is broken; RMRR exceeds permitted address width (%d bits)\n"
2703 "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
2704 agaw_to_width(domain->agaw),
2705 dmi_get_system_info(DMI_BIOS_VENDOR),
2706 dmi_get_system_info(DMI_BIOS_VERSION),
2707 dmi_get_system_info(DMI_PRODUCT_VERSION));
2711 return iommu_domain_identity_map(domain, start, end);
2714 static int iommu_prepare_identity_map(struct device *dev,
2715 unsigned long long start,
2716 unsigned long long end)
2718 struct dmar_domain *domain;
2721 domain = get_domain_for_dev(dev, DEFAULT_DOMAIN_ADDRESS_WIDTH);
2725 ret = domain_prepare_identity_map(dev, domain, start, end);
2727 domain_exit(domain);
2732 static inline int iommu_prepare_rmrr_dev(struct dmar_rmrr_unit *rmrr,
2735 if (dev->archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
2737 return iommu_prepare_identity_map(dev, rmrr->base_address,
2741 #ifdef CONFIG_INTEL_IOMMU_FLOPPY_WA
2742 static inline void iommu_prepare_isa(void)
2744 struct pci_dev *pdev;
2747 pdev = pci_get_class(PCI_CLASS_BRIDGE_ISA << 8, NULL);
2751 pr_info("Prepare 0-16MiB unity mapping for LPC\n");
2752 ret = iommu_prepare_identity_map(&pdev->dev, 0, 16*1024*1024 - 1);
2755 pr_err("Failed to create 0-16MiB identity map - floppy might not work\n");
2760 static inline void iommu_prepare_isa(void)
2764 #endif /* !CONFIG_INTEL_IOMMU_FLOPPY_WA */
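/*
 * Background for the workaround above (informational note): legacy floppy
 * controllers DMA through the ISA/LPC bridge using fixed physical
 * addresses in the first 16MiB, with no knowledge of the IOMMU, so
 * without this 0-16MiB unity mapping those transfers would fault once
 * translation is enabled.
 */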
2766 static int md_domain_init(struct dmar_domain *domain, int guest_width);
2768 static int __init si_domain_init(int hw)
2772 si_domain = alloc_domain(DOMAIN_FLAG_STATIC_IDENTITY);
2776 if (md_domain_init(si_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
2777 domain_exit(si_domain);
2781 pr_debug("Identity mapping domain allocated\n");
2786 for_each_online_node(nid) {
2787 unsigned long start_pfn, end_pfn;
2790 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
2791 ret = iommu_domain_identity_map(si_domain,
2792 PFN_PHYS(start_pfn), PFN_PHYS(end_pfn));
2801 static int identity_mapping(struct device *dev)
2803 struct device_domain_info *info;
2805 if (likely(!iommu_identity_mapping))
2808 info = dev->archdata.iommu;
2809 if (info && info != DUMMY_DEVICE_DOMAIN_INFO)
2810 return (info->domain == si_domain);
2815 static int domain_add_dev_info(struct dmar_domain *domain, struct device *dev)
2817 struct dmar_domain *ndomain;
2818 struct intel_iommu *iommu;
2821 iommu = device_to_iommu(dev, &bus, &devfn);
2825 ndomain = dmar_insert_one_dev_info(iommu, bus, devfn, dev, domain);
2826 if (ndomain != domain)
2832 static bool device_has_rmrr(struct device *dev)
2834 struct dmar_rmrr_unit *rmrr;
2839 for_each_rmrr_units(rmrr) {
2841 * Return TRUE if this RMRR contains the device that is passed in.
2844 for_each_active_dev_scope(rmrr->devices,
2845 rmrr->devices_cnt, i, tmp)
2856 * There are a couple of cases where we need to restrict the functionality of
2857 * devices associated with RMRRs. The first is when evaluating a device for
2858 * identity mapping because problems exist when devices are moved in and out
2859 * of domains and their respective RMRR information is lost. This means that
2860 * a device with associated RMRRs will never be in a "passthrough" domain.
2861 * The second is use of the device through the IOMMU API. This interface
2862 * expects to have full control of the IOVA space for the device. We cannot
2863 * satisfy both the requirement that RMRR access is maintained and have an
2864 * unencumbered IOVA space. We also have no ability to quiesce the device's
2865 * use of the RMRR space or even inform the IOMMU API user of the restriction.
2866 * We therefore prevent devices associated with an RMRR from participating in
2867 * the IOMMU API, which eliminates them from device assignment.
2869 * In both cases we assume that PCI USB devices with RMRRs have them largely
2870 * for historical reasons and that the RMRR space is not actively used post
2871 * boot. This exclusion may change if vendors begin to abuse it.
2873 * The same exception is made for graphics devices, with the requirement that
2874 * any use of the RMRR regions will be torn down before assigning the device
2877 static bool device_is_rmrr_locked(struct device *dev)
2879 if (!device_has_rmrr(dev))
2882 if (dev_is_pci(dev)) {
2883 struct pci_dev *pdev = to_pci_dev(dev);
2885 if (IS_USB_DEVICE(pdev) || IS_GFX_DEVICE(pdev))
2892 static int iommu_should_identity_map(struct device *dev, int startup)
2895 if (dev_is_pci(dev)) {
2896 struct pci_dev *pdev = to_pci_dev(dev);
2898 if (device_is_rmrr_locked(dev))
2902 * Prevent any device marked as untrusted from getting
2903 * placed into the statically identity mapping domain.
2905 if (pdev->untrusted)
2908 if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev))
2911 if ((iommu_identity_mapping & IDENTMAP_GFX) && IS_GFX_DEVICE(pdev))
2914 if (!(iommu_identity_mapping & IDENTMAP_ALL))
2918 * We want to start off with all devices in the 1:1 domain, and
2919 * take them out later if we find they can't access all of memory.
2921 * However, we can't do this for PCI devices behind bridges,
2922 * because all PCI devices behind the same bridge will end up
2923 * with the same source-id on their transactions.
2925 * Practically speaking, we can't change things around for these
2926 * devices at run-time, because we can't be sure there'll be no
2927 * DMA transactions in flight for any of their siblings.
2929 * So PCI devices (unless they're on the root bus) as well as
2930 * their parent PCI-PCI or PCIe-PCI bridges must be left _out_ of
2931 * the 1:1 domain, just in _case_ one of their siblings turns out
2932 * not to be able to map all of memory.
2934 if (!pci_is_pcie(pdev)) {
2935 if (!pci_is_root_bus(pdev->bus))
2937 if (pdev->class >> 8 == PCI_CLASS_BRIDGE_PCI)
2939 } else if (pci_pcie_type(pdev) == PCI_EXP_TYPE_PCI_BRIDGE)
2942 if (device_has_rmrr(dev))
2947 * At boot time, we don't yet know if devices will be 64-bit capable.
2948 * Assume that they will — if they turn out not to be, then we can
2949 * take them out of the 1:1 domain later.
2953 * If the device's dma_mask is less than the system's memory
2954 * size then this is not a candidate for identity mapping.
2956 u64 dma_mask = *dev->dma_mask;
2958 if (dev->coherent_dma_mask &&
2959 dev->coherent_dma_mask < dma_mask)
2960 dma_mask = dev->coherent_dma_mask;
2962 return dma_mask >= dma_get_required_mask(dev);
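/*
 * Worked example for the check above (illustrative numbers): on a host
 * with 8GiB of RAM, dma_get_required_mask() needs more than 32 bits, so
 * a device whose dma_mask is DMA_BIT_MASK(32) fails the comparison and
 * is kept out of the 1:1 domain, while a 64-bit capable device passes
 * and stays in si_domain until proven otherwise.
 */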
2968 static int __init dev_prepare_static_identity_mapping(struct device *dev, int hw)
2972 if (!iommu_should_identity_map(dev, 1))
2975 ret = domain_add_dev_info(si_domain, dev);
2977 pr_info("%s identity mapping for device %s\n",
2978 hw ? "Hardware" : "Software", dev_name(dev));
2979 else if (ret == -ENODEV)
2980 /* device not associated with an iommu */
2987 static int __init iommu_prepare_static_identity_mapping(int hw)
2989 struct pci_dev *pdev = NULL;
2990 struct dmar_drhd_unit *drhd;
2991 struct intel_iommu *iommu;
2996 for_each_pci_dev(pdev) {
2997 ret = dev_prepare_static_identity_mapping(&pdev->dev, hw);
3002 for_each_active_iommu(iommu, drhd)
3003 for_each_active_dev_scope(drhd->devices, drhd->devices_cnt, i, dev) {
3004 struct acpi_device_physical_node *pn;
3005 struct acpi_device *adev;
3007 if (dev->bus != &acpi_bus_type)
3010 adev = to_acpi_device(dev);
3011 mutex_lock(&adev->physical_node_lock);
3012 list_for_each_entry(pn, &adev->physical_node_list, node) {
3013 ret = dev_prepare_static_identity_mapping(pn->dev, hw);
3017 mutex_unlock(&adev->physical_node_lock);
3025 static void intel_iommu_init_qi(struct intel_iommu *iommu)
3028 * Start from the sane iommu hardware state.
3029 * If the queued invalidation is already initialized by us
3030 * (for example, while enabling interrupt-remapping) then
3031 * things are already rolling from a sane state.
3035 * Clear any previous faults.
3037 dmar_fault(-1, iommu);
3039 * Disable queued invalidation if supported and already enabled
3040 * before OS handover.
3042 dmar_disable_qi(iommu);
3045 if (dmar_enable_qi(iommu)) {
3047 * Queued Invalidate not enabled, use Register Based Invalidate
3049 iommu->flush.flush_context = __iommu_flush_context;
3050 iommu->flush.flush_iotlb = __iommu_flush_iotlb;
3051 pr_info("%s: Using Register based invalidation\n",
3054 iommu->flush.flush_context = qi_flush_context;
3055 iommu->flush.flush_iotlb = qi_flush_iotlb;
3056 pr_info("%s: Using Queued invalidation\n", iommu->name);
3060 static int copy_context_table(struct intel_iommu *iommu,
3061 struct root_entry *old_re,
3062 struct context_entry **tbl,
3065 int tbl_idx, pos = 0, idx, devfn, ret = 0, did;
3066 struct context_entry *new_ce = NULL, ce;
3067 struct context_entry *old_ce = NULL;
3068 struct root_entry re;
3069 phys_addr_t old_ce_phys;
3071 tbl_idx = ext ? bus * 2 : bus;
3072 memcpy(&re, old_re, sizeof(re));
3074 for (devfn = 0; devfn < 256; devfn++) {
3075 /* First calculate the correct index */
3076 idx = (ext ? devfn * 2 : devfn) % 256;
3079 /* First save what we may have and clean up */
3081 tbl[tbl_idx] = new_ce;
3082 __iommu_flush_cache(iommu, new_ce,
3092 old_ce_phys = root_entry_lctp(&re);
3094 old_ce_phys = root_entry_uctp(&re);
3097 if (ext && devfn == 0) {
3098 /* No LCTP, try UCTP */
3107 old_ce = memremap(old_ce_phys, PAGE_SIZE,
3112 new_ce = alloc_pgtable_page(iommu->node);
3119 /* Now copy the context entry */
3120 memcpy(&ce, old_ce + idx, sizeof(ce));
3122 if (!__context_present(&ce))
3125 did = context_domain_id(&ce);
3126 if (did >= 0 && did < cap_ndoms(iommu->cap))
3127 set_bit(did, iommu->domain_ids);
3130 * We need a marker for copied context entries. This
3131 * marker needs to work for the old format as well as
3132 * for extended context entries.
3134 * Bit 67 of the context entry is used. In the old
3135 * format this bit is available to software, in the
3136 * extended format it is the PGE bit, but PGE is ignored
3137 * by HW if PASIDs are disabled (and thus still available).
3140 * So disable PASIDs first and then mark the entry
3141 * copied. This means that we don't copy PASID
3142 * translations from the old kernel, but this is fine as
3143 * faults there are not fatal.
3145 context_clear_pasid_enable(&ce);
3146 context_set_copied(&ce);
3151 tbl[tbl_idx + pos] = new_ce;
3153 __iommu_flush_cache(iommu, new_ce, VTD_PAGE_SIZE);
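/*
 * How the "copied" marker is consumed later (sketch): when a device is
 * set up after a kdump boot, domain_context_mapping_one() notices
 * context_copied() on the inherited entry, flushes the context cache and
 * IOTLB for the old domain-id and only then installs the new translation,
 * so the previous kernel's mapping stays usable until the device is
 * actually taken over.
 */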
3162 static int copy_translation_tables(struct intel_iommu *iommu)
3164 struct context_entry **ctxt_tbls;
3165 struct root_entry *old_rt;
3166 phys_addr_t old_rt_phys;
3167 int ctxt_table_entries;
3168 unsigned long flags;
3173 rtaddr_reg = dmar_readq(iommu->reg + DMAR_RTADDR_REG);
3174 ext = !!(rtaddr_reg & DMA_RTADDR_RTT);
3175 new_ext = !!ecap_ecs(iommu->ecap);
3178 * The RTT bit can only be changed when translation is disabled,
3179 * but disabling translation would open a window for data
3180 * corruption. So bail out and don't copy anything if we would
3181 * have to change the bit.
3186 old_rt_phys = rtaddr_reg & VTD_PAGE_MASK;
3190 old_rt = memremap(old_rt_phys, PAGE_SIZE, MEMREMAP_WB);
3194 /* This is too big for the stack - allocate it from slab */
3195 ctxt_table_entries = ext ? 512 : 256;
3197 ctxt_tbls = kcalloc(ctxt_table_entries, sizeof(void *), GFP_KERNEL);
3201 for (bus = 0; bus < 256; bus++) {
3202 ret = copy_context_table(iommu, &old_rt[bus],
3203 ctxt_tbls, bus, ext);
3205 pr_err("%s: Failed to copy context table for bus %d\n",
3211 spin_lock_irqsave(&iommu->lock, flags);
3213 /* Context tables are copied, now write them to the root_entry table */
3214 for (bus = 0; bus < 256; bus++) {
3215 int idx = ext ? bus * 2 : bus;
3218 if (ctxt_tbls[idx]) {
3219 val = virt_to_phys(ctxt_tbls[idx]) | 1;
3220 iommu->root_entry[bus].lo = val;
3223 if (!ext || !ctxt_tbls[idx + 1])
3226 val = virt_to_phys(ctxt_tbls[idx + 1]) | 1;
3227 iommu->root_entry[bus].hi = val;
3230 spin_unlock_irqrestore(&iommu->lock, flags);
3234 __iommu_flush_cache(iommu, iommu->root_entry, PAGE_SIZE);
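/*
 * Root table layout assumed by the loop above (per the VT-d spec): each
 * of the 256 root entries covers one bus. In legacy mode only the lo
 * word points to a 256-entry context table; in extended mode (ext) two
 * context tables serve each bus, lo for devfn 0-127 and hi for devfn
 * 128-255, which is why the copy populates ctxt_tbls[bus * 2] and
 * ctxt_tbls[bus * 2 + 1].
 */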
3244 static int __init init_dmars(void)
3246 struct dmar_drhd_unit *drhd;
3247 struct dmar_rmrr_unit *rmrr;
3248 bool copied_tables = false;
3250 struct intel_iommu *iommu;
3256 * initialize and program root entry to not present
3259 for_each_drhd_unit(drhd) {
3261 * lock not needed as this is only incremented in the single
3262 * threaded kernel __init code path; all other accesses are read-only
3265 if (g_num_of_iommus < DMAR_UNITS_SUPPORTED) {
3269 pr_err_once("Exceeded %d IOMMUs\n", DMAR_UNITS_SUPPORTED);
3272 /* Preallocate enough resources for IOMMU hot-addition */
3273 if (g_num_of_iommus < DMAR_UNITS_SUPPORTED)
3274 g_num_of_iommus = DMAR_UNITS_SUPPORTED;
3276 g_iommus = kcalloc(g_num_of_iommus, sizeof(struct intel_iommu *),
3279 pr_err("Allocating global iommu array failed\n");
3284 for_each_active_iommu(iommu, drhd) {
3286 * Find the max pasid size of each IOMMU in the system.
3287 * We need to ensure the system pasid table is no bigger
3288 * than the smallest supported.
3290 if (pasid_enabled(iommu)) {
3291 u32 temp = 2 << ecap_pss(iommu->ecap);
3293 intel_pasid_max_id = min_t(u32, temp,
3294 intel_pasid_max_id);
3297 g_iommus[iommu->seq_id] = iommu;
3299 intel_iommu_init_qi(iommu);
3301 ret = iommu_init_domains(iommu);
3305 init_translation_status(iommu);
3307 if (translation_pre_enabled(iommu) && !is_kdump_kernel()) {
3308 iommu_disable_translation(iommu);
3309 clear_translation_pre_enabled(iommu);
3310 pr_warn("Translation was enabled for %s but we are not in kdump mode\n",
3316 * we could share the same root & context tables
3317 * among all IOMMUs. Need to split it later.
3319 ret = iommu_alloc_root_entry(iommu);
3323 if (translation_pre_enabled(iommu)) {
3324 pr_info("Translation already enabled - trying to copy translation structures\n");
3326 ret = copy_translation_tables(iommu);
3329 * We found the IOMMU with translation
3330 * enabled - but failed to copy over the
3331 * old root-entry table. Try to proceed
3332 * by disabling translation now and
3333 * allocating a clean root-entry table.
3334 * This might cause DMAR faults, but
3335 * probably the dump will still succeed.
3337 pr_err("Failed to copy translation tables from previous kernel for %s\n",
3339 iommu_disable_translation(iommu);
3340 clear_translation_pre_enabled(iommu);
3342 pr_info("Copied translation tables from previous kernel for %s\n",
3344 copied_tables = true;
3348 if (!ecap_pass_through(iommu->ecap))
3349 hw_pass_through = 0;
3350 #ifdef CONFIG_INTEL_IOMMU_SVM
3351 if (pasid_enabled(iommu))
3352 intel_svm_init(iommu);
3357 * Now that qi is enabled on all iommus, set the root entry and flush
3358 * caches. This is required on some Intel X58 chipsets, otherwise the
3359 * flush_context function will loop forever and the boot hangs.
3361 for_each_active_iommu(iommu, drhd) {
3362 iommu_flush_write_buffer(iommu);
3363 iommu_set_root_entry(iommu);
3364 iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
3365 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
3368 if (iommu_pass_through)
3369 iommu_identity_mapping |= IDENTMAP_ALL;
3371 #ifdef CONFIG_INTEL_IOMMU_BROKEN_GFX_WA
3372 iommu_identity_mapping |= IDENTMAP_GFX;
3375 check_tylersburg_isoch();
3377 if (iommu_identity_mapping) {
3378 ret = si_domain_init(hw_pass_through);
3385 * If we copied translations from a previous kernel in the kdump
3386 * case, we can not assign the devices to domains now, as that
3387 * would eliminate the old mappings. So skip this part and defer
3388 * the assignment to device driver initialization time.
3394 * If pass through is not set or not enabled, set up context entries for
3395 * identity mappings for rmrr, gfx, and isa and may fall back to static
3396 * identity mapping if iommu_identity_mapping is set.
3398 if (iommu_identity_mapping) {
3399 ret = iommu_prepare_static_identity_mapping(hw_pass_through);
3401 pr_crit("Failed to setup IOMMU pass-through\n");
3407 * for each dev attached to rmrr
3409 * locate drhd for dev, alloc domain for dev
3410 * allocate free domain
3411 * allocate page table entries for rmrr
3412 * if context not allocated for bus
3413 * allocate and init context
3414 * set present in root table for this bus
3415 * init context with domain, translation etc
3419 pr_info("Setting RMRR:\n");
3420 for_each_rmrr_units(rmrr) {
3421 /* some BIOSes list non-existent devices in the DMAR table. */
3422 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
3424 ret = iommu_prepare_rmrr_dev(rmrr, dev);
3426 pr_err("Mapping reserved region failed\n");
3430 iommu_prepare_isa();
3437 * global invalidate context cache
3438 * global invalidate iotlb
3439 * enable translation
3441 for_each_iommu(iommu, drhd) {
3442 if (drhd->ignored) {
3444 * we always have to disable PMRs or DMA may fail on this device
3448 iommu_disable_protect_mem_regions(iommu);
3452 iommu_flush_write_buffer(iommu);
3454 #ifdef CONFIG_INTEL_IOMMU_SVM
3455 if (pasid_enabled(iommu) && ecap_prs(iommu->ecap)) {
3456 ret = intel_svm_enable_prq(iommu);
3461 ret = dmar_set_interrupt(iommu);
3465 if (!translation_pre_enabled(iommu))
3466 iommu_enable_translation(iommu);
3468 iommu_disable_protect_mem_regions(iommu);
3474 for_each_active_iommu(iommu, drhd) {
3475 disable_dmar_iommu(iommu);
3476 free_dmar_iommu(iommu);
3485 /* This takes a number of _MM_ pages, not VTD pages */
3486 static unsigned long intel_alloc_iova(struct device *dev,
3487 struct dmar_domain *domain,
3488 unsigned long nrpages, uint64_t dma_mask)
3490 unsigned long iova_pfn = 0;
3492 /* Restrict dma_mask to the width that the iommu can handle */
3493 dma_mask = min_t(uint64_t, DOMAIN_MAX_ADDR(domain->gaw), dma_mask);
3494 /* Ensure we reserve the whole size-aligned region */
3495 nrpages = __roundup_pow_of_two(nrpages);
3497 if (!dmar_forcedac && dma_mask > DMA_BIT_MASK(32)) {
3499 * First try to allocate an io virtual address in
3500 * DMA_BIT_MASK(32) and if that fails then try allocating from a higher range.
3503 iova_pfn = alloc_iova_fast(&domain->iovad, nrpages,
3504 IOVA_PFN(DMA_BIT_MASK(32)), false);
3508 iova_pfn = alloc_iova_fast(&domain->iovad, nrpages,
3509 IOVA_PFN(dma_mask), true);
3510 if (unlikely(!iova_pfn)) {
3511 pr_err("Allocating %ld-page iova for %s failed",
3512 nrpages, dev_name(dev));
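/*
 * Notes on the allocation policy above (informational): nrpages is
 * rounded up to a power of two so the IOVA region is size-aligned, and
 * for devices with a >32-bit dma_mask a sub-4GiB address is tried first
 * because some devices and bridges handle dual-address-cycle (64-bit)
 * addresses poorly; booting with intel_iommu=forcedac skips that first
 * pass and allocates from the full range directly.
 */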
3519 struct dmar_domain *get_valid_domain_for_dev(struct device *dev)
3521 struct dmar_domain *domain, *tmp;
3522 struct dmar_rmrr_unit *rmrr;
3523 struct device *i_dev;
3526 domain = find_domain(dev);
3530 domain = find_or_alloc_domain(dev, DEFAULT_DOMAIN_ADDRESS_WIDTH);
3534 /* We have a new domain - setup possible RMRRs for the device */
3536 for_each_rmrr_units(rmrr) {
3537 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
3542 ret = domain_prepare_identity_map(dev, domain,
3546 dev_err(dev, "Mapping reserved region failed\n");
3551 tmp = set_domain_for_dev(dev, domain);
3552 if (!tmp || domain != tmp) {
3553 domain_exit(domain);
3560 pr_err("Allocating domain for %s failed\n", dev_name(dev));
3566 /* Check if the dev needs to go through the non-identity map and unmap process. */
3567 static int iommu_no_mapping(struct device *dev)
3571 if (iommu_dummy(dev))
3574 if (!iommu_identity_mapping)
3577 found = identity_mapping(dev);
3579 if (iommu_should_identity_map(dev, 0))
3583 * A 32-bit DMA device is removed from si_domain and falls back
3584 * to non-identity mapping.
3586 dmar_remove_one_dev_info(si_domain, dev);
3587 pr_info("32bit %s uses non-identity mapping\n",
3593 * If a 64-bit DMA device is detached from a VM, the device
3594 * is put into si_domain for identity mapping.
3596 if (iommu_should_identity_map(dev, 0)) {
3598 ret = domain_add_dev_info(si_domain, dev);
3600 pr_info("64bit %s uses identity mapping\n",
3610 static dma_addr_t __intel_map_page(struct device *dev, struct page *page,
3611 unsigned long offset, size_t size, int dir,
3614 phys_addr_t paddr = page_to_phys(page) + offset;
3615 struct dmar_domain *domain;
3616 phys_addr_t start_paddr;
3617 unsigned long iova_pfn;
3620 struct intel_iommu *iommu;
3621 unsigned long paddr_pfn = paddr >> PAGE_SHIFT;
3623 BUG_ON(dir == DMA_NONE);
3625 if (iommu_no_mapping(dev))
3628 domain = get_valid_domain_for_dev(dev);
3630 return DMA_MAPPING_ERROR;
3632 iommu = domain_get_iommu(domain);
3633 size = aligned_nrpages(paddr, size);
3635 iova_pfn = intel_alloc_iova(dev, domain, dma_to_mm_pfn(size), dma_mask);
3640 * Check if DMAR supports zero-length reads on write-only mappings
3643 if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
3644 !cap_zlr(iommu->cap))
3645 prot |= DMA_PTE_READ;
3646 if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
3647 prot |= DMA_PTE_WRITE;
3649 * paddr .. paddr + size might span a partial page, so we should map the whole
3650 * page. Note: if two parts of one page are separately mapped, we
3651 * might have two guest_addr mappings to the same host paddr, but this
3652 * is not a big problem
3654 ret = domain_pfn_mapping(domain, mm_to_dma_pfn(iova_pfn),
3655 mm_to_dma_pfn(paddr_pfn), size, prot);
3659 start_paddr = (phys_addr_t)iova_pfn << PAGE_SHIFT;
3660 start_paddr += paddr & ~PAGE_MASK;
3665 free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(size));
3666 pr_err("Device %s request: %zx@%llx dir %d --- failed\n",
3667 dev_name(dev), size, (unsigned long long)paddr, dir);
3668 return DMA_MAPPING_ERROR;
3671 static dma_addr_t intel_map_page(struct device *dev, struct page *page,
3672 unsigned long offset, size_t size,
3673 enum dma_data_direction dir,
3674 unsigned long attrs)
3676 return __intel_map_page(dev, page, offset, size, dir, *dev->dma_mask);
3679 static void intel_unmap(struct device *dev, dma_addr_t dev_addr, size_t size)
3681 struct dmar_domain *domain;
3682 unsigned long start_pfn, last_pfn;
3683 unsigned long nrpages;
3684 unsigned long iova_pfn;
3685 struct intel_iommu *iommu;
3686 struct page *freelist;
3688 if (iommu_no_mapping(dev))
3691 domain = find_domain(dev);
3694 iommu = domain_get_iommu(domain);
3696 iova_pfn = IOVA_PFN(dev_addr);
3698 nrpages = aligned_nrpages(dev_addr, size);
3699 start_pfn = mm_to_dma_pfn(iova_pfn);
3700 last_pfn = start_pfn + nrpages - 1;
3702 pr_debug("Device %s unmapping: pfn %lx-%lx\n",
3703 dev_name(dev), start_pfn, last_pfn);
3705 freelist = domain_unmap(domain, start_pfn, last_pfn);
3707 if (intel_iommu_strict) {
3708 iommu_flush_iotlb_psi(iommu, domain, start_pfn,
3709 nrpages, !freelist, 0);
3711 free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(nrpages));
3712 dma_free_pagelist(freelist);
3714 queue_iova(&domain->iovad, iova_pfn, nrpages,
3715 (unsigned long)freelist);
3717 * queue up the release of the unmap to save the 1/6th of the
3718 * cpu used up by the iotlb flush operation...
3723 static void intel_unmap_page(struct device *dev, dma_addr_t dev_addr,
3724 size_t size, enum dma_data_direction dir,
3725 unsigned long attrs)
3727 intel_unmap(dev, dev_addr, size);
3730 static void *intel_alloc_coherent(struct device *dev, size_t size,
3731 dma_addr_t *dma_handle, gfp_t flags,
3732 unsigned long attrs)
3734 struct page *page = NULL;
3737 size = PAGE_ALIGN(size);
3738 order = get_order(size);
3740 if (!iommu_no_mapping(dev))
3741 flags &= ~(GFP_DMA | GFP_DMA32);
3742 else if (dev->coherent_dma_mask < dma_get_required_mask(dev)) {
3743 if (dev->coherent_dma_mask < DMA_BIT_MASK(32))
3749 if (gfpflags_allow_blocking(flags)) {
3750 unsigned int count = size >> PAGE_SHIFT;
3752 page = dma_alloc_from_contiguous(dev, count, order,
3753 flags & __GFP_NOWARN);
3754 if (page && iommu_no_mapping(dev) &&
3755 page_to_phys(page) + size > dev->coherent_dma_mask) {
3756 dma_release_from_contiguous(dev, page, count);
3762 page = alloc_pages(flags, order);
3765 memset(page_address(page), 0, size);
3767 *dma_handle = __intel_map_page(dev, page, 0, size, DMA_BIDIRECTIONAL,
3768 dev->coherent_dma_mask);
3769 if (*dma_handle != DMA_MAPPING_ERROR)
3770 return page_address(page);
3771 if (!dma_release_from_contiguous(dev, page, size >> PAGE_SHIFT))
3772 __free_pages(page, order);
3777 static void intel_free_coherent(struct device *dev, size_t size, void *vaddr,
3778 dma_addr_t dma_handle, unsigned long attrs)
3781 struct page *page = virt_to_page(vaddr);
3783 size = PAGE_ALIGN(size);
3784 order = get_order(size);
3786 intel_unmap(dev, dma_handle, size);
3787 if (!dma_release_from_contiguous(dev, page, size >> PAGE_SHIFT))
3788 __free_pages(page, order);
3791 static void intel_unmap_sg(struct device *dev, struct scatterlist *sglist,
3792 int nelems, enum dma_data_direction dir,
3793 unsigned long attrs)
3795 dma_addr_t startaddr = sg_dma_address(sglist) & PAGE_MASK;
3796 unsigned long nrpages = 0;
3797 struct scatterlist *sg;
3800 for_each_sg(sglist, sg, nelems, i) {
3801 nrpages += aligned_nrpages(sg_dma_address(sg), sg_dma_len(sg));
3804 intel_unmap(dev, startaddr, nrpages << VTD_PAGE_SHIFT);
3807 static int intel_nontranslate_map_sg(struct device *hddev,
3808 struct scatterlist *sglist, int nelems, int dir)
3811 struct scatterlist *sg;
3813 for_each_sg(sglist, sg, nelems, i) {
3814 BUG_ON(!sg_page(sg));
3815 sg->dma_address = sg_phys(sg);
3816 sg->dma_length = sg->length;
3821 static int intel_map_sg(struct device *dev, struct scatterlist *sglist, int nelems,
3822 enum dma_data_direction dir, unsigned long attrs)
3825 struct dmar_domain *domain;
3828 unsigned long iova_pfn;
3830 struct scatterlist *sg;
3831 unsigned long start_vpfn;
3832 struct intel_iommu *iommu;
3834 BUG_ON(dir == DMA_NONE);
3835 if (iommu_no_mapping(dev))
3836 return intel_nontranslate_map_sg(dev, sglist, nelems, dir);
3838 domain = get_valid_domain_for_dev(dev);
3842 iommu = domain_get_iommu(domain);
3844 for_each_sg(sglist, sg, nelems, i)
3845 size += aligned_nrpages(sg->offset, sg->length);
3847 iova_pfn = intel_alloc_iova(dev, domain, dma_to_mm_pfn(size),
3850 sglist->dma_length = 0;
3855 * Check if DMAR supports zero-length reads on write-only mappings
3858 if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
3859 !cap_zlr(iommu->cap))
3860 prot |= DMA_PTE_READ;
3861 if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
3862 prot |= DMA_PTE_WRITE;
3864 start_vpfn = mm_to_dma_pfn(iova_pfn);
3866 ret = domain_sg_mapping(domain, start_vpfn, sglist, size, prot);
3867 if (unlikely(ret)) {
3868 dma_pte_free_pagetable(domain, start_vpfn,
3869 start_vpfn + size - 1,
3870 agaw_to_level(domain->agaw) + 1);
3871 free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(size));
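/*
 * Usage note (illustrative, not part of the driver): consumers never call
 * the map/unmap functions above directly. Once intel_iommu_init() installs
 * &intel_dma_ops as the global dma_ops, an ordinary driver sequence such as
 *
 *	dma_addr_t dma = dma_map_page(dev, page, 0, len, DMA_TO_DEVICE);
 *	if (dma_mapping_error(dev, dma))
 *		return -ENOMEM;
 *	...
 *	dma_unmap_page(dev, dma, len, DMA_TO_DEVICE);
 *
 * is dispatched to intel_map_page()/intel_unmap_page() through the ops
 * table below.
 */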
3878 static const struct dma_map_ops intel_dma_ops = {
3879 .alloc = intel_alloc_coherent,
3880 .free = intel_free_coherent,
3881 .map_sg = intel_map_sg,
3882 .unmap_sg = intel_unmap_sg,
3883 .map_page = intel_map_page,
3884 .unmap_page = intel_unmap_page,
3885 .dma_supported = dma_direct_supported,
3888 static inline int iommu_domain_cache_init(void)
3892 iommu_domain_cache = kmem_cache_create("iommu_domain",
3893 sizeof(struct dmar_domain),
3898 if (!iommu_domain_cache) {
3899 pr_err("Couldn't create iommu_domain cache\n");
3906 static inline int iommu_devinfo_cache_init(void)
3910 iommu_devinfo_cache = kmem_cache_create("iommu_devinfo",
3911 sizeof(struct device_domain_info),
3915 if (!iommu_devinfo_cache) {
3916 pr_err("Couldn't create devinfo cache\n");
3923 static int __init iommu_init_mempool(void)
3926 ret = iova_cache_get();
3930 ret = iommu_domain_cache_init();
3934 ret = iommu_devinfo_cache_init();
3938 kmem_cache_destroy(iommu_domain_cache);
3945 static void __init iommu_exit_mempool(void)
3947 kmem_cache_destroy(iommu_devinfo_cache);
3948 kmem_cache_destroy(iommu_domain_cache);
3952 static void quirk_ioat_snb_local_iommu(struct pci_dev *pdev)
3954 struct dmar_drhd_unit *drhd;
3958 /* We know that this device on this chipset has its own IOMMU.
3959 * If we find it under a different IOMMU, then the BIOS is lying
3960 * to us. Hope that the IOMMU for this device is actually
3961 * disabled, and it needs no translation...
3963 rc = pci_bus_read_config_dword(pdev->bus, PCI_DEVFN(0, 0), 0xb0, &vtbar);
3965 /* "can't" happen */
3966 dev_info(&pdev->dev, "failed to run vt-d quirk\n");
3969 vtbar &= 0xffff0000;
3971 /* we know that this iommu should be at offset 0xa000 from vtbar */
3972 drhd = dmar_find_matched_drhd_unit(pdev);
3973 if (WARN_TAINT_ONCE(!drhd || drhd->reg_base_addr - vtbar != 0xa000,
3974 TAINT_FIRMWARE_WORKAROUND,
3975 "BIOS assigned incorrect VT-d unit for Intel(R) QuickData Technology device\n"))
3976 pdev->dev.archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
3978 DECLARE_PCI_FIXUP_ENABLE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IOAT_SNB, quirk_ioat_snb_local_iommu);
3980 static void __init init_no_remapping_devices(void)
3982 struct dmar_drhd_unit *drhd;
3986 for_each_drhd_unit(drhd) {
3987 if (!drhd->include_all) {
3988 for_each_active_dev_scope(drhd->devices,
3989 drhd->devices_cnt, i, dev)
3991 /* ignore DMAR unit if no devices exist */
3992 if (i == drhd->devices_cnt)
3997 for_each_active_drhd_unit(drhd) {
3998 if (drhd->include_all)
4001 for_each_active_dev_scope(drhd->devices,
4002 drhd->devices_cnt, i, dev)
4003 if (!dev_is_pci(dev) || !IS_GFX_DEVICE(to_pci_dev(dev)))
4005 if (i < drhd->devices_cnt)
4008 /* This IOMMU has *only* gfx devices. Either bypass it or
4009 set the gfx_mapped flag, as appropriate */
4011 intel_iommu_gfx_mapped = 1;
4014 for_each_active_dev_scope(drhd->devices,
4015 drhd->devices_cnt, i, dev)
4016 dev->archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
4021 #ifdef CONFIG_SUSPEND
4022 static int init_iommu_hw(void)
4024 struct dmar_drhd_unit *drhd;
4025 struct intel_iommu *iommu = NULL;
4027 for_each_active_iommu(iommu, drhd)
4029 dmar_reenable_qi(iommu);
4031 for_each_iommu(iommu, drhd) {
4032 if (drhd->ignored) {
4034 * we always have to disable PMRs or DMA may fail on this device
4038 iommu_disable_protect_mem_regions(iommu);
4042 iommu_flush_write_buffer(iommu);
4044 iommu_set_root_entry(iommu);
4046 iommu->flush.flush_context(iommu, 0, 0, 0,
4047 DMA_CCMD_GLOBAL_INVL);
4048 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
4049 iommu_enable_translation(iommu);
4050 iommu_disable_protect_mem_regions(iommu);
4056 static void iommu_flush_all(void)
4058 struct dmar_drhd_unit *drhd;
4059 struct intel_iommu *iommu;
4061 for_each_active_iommu(iommu, drhd) {
4062 iommu->flush.flush_context(iommu, 0, 0, 0,
4063 DMA_CCMD_GLOBAL_INVL);
4064 iommu->flush.flush_iotlb(iommu, 0, 0, 0,
4065 DMA_TLB_GLOBAL_FLUSH);
4069 static int iommu_suspend(void)
4071 struct dmar_drhd_unit *drhd;
4072 struct intel_iommu *iommu = NULL;
4075 for_each_active_iommu(iommu, drhd) {
4076 iommu->iommu_state = kcalloc(MAX_SR_DMAR_REGS, sizeof(u32),
4078 if (!iommu->iommu_state)
4084 for_each_active_iommu(iommu, drhd) {
4085 iommu_disable_translation(iommu);
4087 raw_spin_lock_irqsave(&iommu->register_lock, flag);
4089 iommu->iommu_state[SR_DMAR_FECTL_REG] =
4090 readl(iommu->reg + DMAR_FECTL_REG);
4091 iommu->iommu_state[SR_DMAR_FEDATA_REG] =
4092 readl(iommu->reg + DMAR_FEDATA_REG);
4093 iommu->iommu_state[SR_DMAR_FEADDR_REG] =
4094 readl(iommu->reg + DMAR_FEADDR_REG);
4095 iommu->iommu_state[SR_DMAR_FEUADDR_REG] =
4096 readl(iommu->reg + DMAR_FEUADDR_REG);
4098 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
4103 for_each_active_iommu(iommu, drhd)
4104 kfree(iommu->iommu_state);
4109 static void iommu_resume(void)
4111 struct dmar_drhd_unit *drhd;
4112 struct intel_iommu *iommu = NULL;
4115 if (init_iommu_hw()) {
4117 panic("tboot: IOMMU setup failed, DMAR can not resume!\n");
4119 WARN(1, "IOMMU setup failed, DMAR can not resume!\n");
4123 for_each_active_iommu(iommu, drhd) {
4125 raw_spin_lock_irqsave(&iommu->register_lock, flag);
4127 writel(iommu->iommu_state[SR_DMAR_FECTL_REG],
4128 iommu->reg + DMAR_FECTL_REG);
4129 writel(iommu->iommu_state[SR_DMAR_FEDATA_REG],
4130 iommu->reg + DMAR_FEDATA_REG);
4131 writel(iommu->iommu_state[SR_DMAR_FEADDR_REG],
4132 iommu->reg + DMAR_FEADDR_REG);
4133 writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG],
4134 iommu->reg + DMAR_FEUADDR_REG);
4136 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
4139 for_each_active_iommu(iommu, drhd)
4140 kfree(iommu->iommu_state);
4143 static struct syscore_ops iommu_syscore_ops = {
4144 .resume = iommu_resume,
4145 .suspend = iommu_suspend,
4148 static void __init init_iommu_pm_ops(void)
4150 register_syscore_ops(&iommu_syscore_ops);
4154 static inline void init_iommu_pm_ops(void) {}
4155 #endif /* CONFIG_PM */
4158 int __init dmar_parse_one_rmrr(struct acpi_dmar_header *header, void *arg)
4160 struct acpi_dmar_reserved_memory *rmrr;
4161 int prot = DMA_PTE_READ|DMA_PTE_WRITE;
4162 struct dmar_rmrr_unit *rmrru;
4165 rmrru = kzalloc(sizeof(*rmrru), GFP_KERNEL);
4169 rmrru->hdr = header;
4170 rmrr = (struct acpi_dmar_reserved_memory *)header;
4171 rmrru->base_address = rmrr->base_address;
4172 rmrru->end_address = rmrr->end_address;
4174 length = rmrr->end_address - rmrr->base_address + 1;
4175 rmrru->resv = iommu_alloc_resv_region(rmrr->base_address, length, prot,
4180 rmrru->devices = dmar_alloc_dev_scope((void *)(rmrr + 1),
4181 ((void *)rmrr) + rmrr->header.length,
4182 &rmrru->devices_cnt);
4183 if (rmrru->devices_cnt && rmrru->devices == NULL)
4186 list_add(&rmrru->list, &dmar_rmrr_units);
4197 static struct dmar_atsr_unit *dmar_find_atsr(struct acpi_dmar_atsr *atsr)
4199 struct dmar_atsr_unit *atsru;
4200 struct acpi_dmar_atsr *tmp;
4202 list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) {
4203 tmp = (struct acpi_dmar_atsr *)atsru->hdr;
4204 if (atsr->segment != tmp->segment)
4206 if (atsr->header.length != tmp->header.length)
4208 if (memcmp(atsr, tmp, atsr->header.length) == 0)
4215 int dmar_parse_one_atsr(struct acpi_dmar_header *hdr, void *arg)
4217 struct acpi_dmar_atsr *atsr;
4218 struct dmar_atsr_unit *atsru;
4220 if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled)
4223 atsr = container_of(hdr, struct acpi_dmar_atsr, header);
4224 atsru = dmar_find_atsr(atsr);
4228 atsru = kzalloc(sizeof(*atsru) + hdr->length, GFP_KERNEL);
4233 * If memory is allocated from slab by ACPI _DSM method, we need to
4234 * copy the memory content because the memory buffer will be freed on exit.
4237 atsru->hdr = (void *)(atsru + 1);
4238 memcpy(atsru->hdr, hdr, hdr->length);
4239 atsru->include_all = atsr->flags & 0x1;
4240 if (!atsru->include_all) {
4241 atsru->devices = dmar_alloc_dev_scope((void *)(atsr + 1),
4242 (void *)atsr + atsr->header.length,
4243 &atsru->devices_cnt);
4244 if (atsru->devices_cnt && atsru->devices == NULL) {
4250 list_add_rcu(&atsru->list, &dmar_atsr_units);
4255 static void intel_iommu_free_atsr(struct dmar_atsr_unit *atsru)
4257 dmar_free_dev_scope(&atsru->devices, &atsru->devices_cnt);
4261 int dmar_release_one_atsr(struct acpi_dmar_header *hdr, void *arg)
4263 struct acpi_dmar_atsr *atsr;
4264 struct dmar_atsr_unit *atsru;
4266 atsr = container_of(hdr, struct acpi_dmar_atsr, header);
4267 atsru = dmar_find_atsr(atsr);
4269 list_del_rcu(&atsru->list);
4271 intel_iommu_free_atsr(atsru);
4277 int dmar_check_one_atsr(struct acpi_dmar_header *hdr, void *arg)
4281 struct acpi_dmar_atsr *atsr;
4282 struct dmar_atsr_unit *atsru;
4284 atsr = container_of(hdr, struct acpi_dmar_atsr, header);
4285 atsru = dmar_find_atsr(atsr);
4289 if (!atsru->include_all && atsru->devices && atsru->devices_cnt) {
4290 for_each_active_dev_scope(atsru->devices, atsru->devices_cnt,
4298 static int intel_iommu_add(struct dmar_drhd_unit *dmaru)
4301 struct intel_iommu *iommu = dmaru->iommu;
4303 if (g_iommus[iommu->seq_id])
4306 if (hw_pass_through && !ecap_pass_through(iommu->ecap)) {
4307 pr_warn("%s: Doesn't support hardware pass through.\n",
4311 if (!ecap_sc_support(iommu->ecap) &&
4312 domain_update_iommu_snooping(iommu)) {
4313 pr_warn("%s: Doesn't support snooping.\n",
4317 sp = domain_update_iommu_superpage(iommu) - 1;
4318 if (sp >= 0 && !(cap_super_page_val(iommu->cap) & (1 << sp))) {
4319 pr_warn("%s: Doesn't support large page.\n",
4325 * Disable translation if already enabled prior to OS handover.
4327 if (iommu->gcmd & DMA_GCMD_TE)
4328 iommu_disable_translation(iommu);
4330 g_iommus[iommu->seq_id] = iommu;
4331 ret = iommu_init_domains(iommu);
4333 ret = iommu_alloc_root_entry(iommu);
4337 #ifdef CONFIG_INTEL_IOMMU_SVM
4338 if (pasid_enabled(iommu))
4339 intel_svm_init(iommu);
4342 if (dmaru->ignored) {
4344 * we always have to disable PMRs or DMA may fail on this device
4347 iommu_disable_protect_mem_regions(iommu);
4351 intel_iommu_init_qi(iommu);
4352 iommu_flush_write_buffer(iommu);
4354 #ifdef CONFIG_INTEL_IOMMU_SVM
4355 if (pasid_enabled(iommu) && ecap_prs(iommu->ecap)) {
4356 ret = intel_svm_enable_prq(iommu);
4361 ret = dmar_set_interrupt(iommu);
4365 iommu_set_root_entry(iommu);
4366 iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
4367 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
4368 iommu_enable_translation(iommu);
4370 iommu_disable_protect_mem_regions(iommu);
4374 disable_dmar_iommu(iommu);
4376 free_dmar_iommu(iommu);
4380 int dmar_iommu_hotplug(struct dmar_drhd_unit *dmaru, bool insert)
4383 struct intel_iommu *iommu = dmaru->iommu;
4385 if (!intel_iommu_enabled)
4391 ret = intel_iommu_add(dmaru);
4393 disable_dmar_iommu(iommu);
4394 free_dmar_iommu(iommu);
4400 static void intel_iommu_free_dmars(void)
4402 struct dmar_rmrr_unit *rmrru, *rmrr_n;
4403 struct dmar_atsr_unit *atsru, *atsr_n;
4405 list_for_each_entry_safe(rmrru, rmrr_n, &dmar_rmrr_units, list) {
4406 list_del(&rmrru->list);
4407 dmar_free_dev_scope(&rmrru->devices, &rmrru->devices_cnt);
4412 list_for_each_entry_safe(atsru, atsr_n, &dmar_atsr_units, list) {
4413 list_del(&atsru->list);
4414 intel_iommu_free_atsr(atsru);
4418 int dmar_find_matched_atsr_unit(struct pci_dev *dev)
4421 struct pci_bus *bus;
4422 struct pci_dev *bridge = NULL;
4424 struct acpi_dmar_atsr *atsr;
4425 struct dmar_atsr_unit *atsru;
4427 dev = pci_physfn(dev);
4428 for (bus = dev->bus; bus; bus = bus->parent) {
4430 /* If it's an integrated device, allow ATS */
4433 /* Connected via non-PCIe: no ATS */
4434 if (!pci_is_pcie(bridge) ||
4435 pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE)
4437 /* If we found the root port, look it up in the ATSR */
4438 if (pci_pcie_type(bridge) == PCI_EXP_TYPE_ROOT_PORT)
4443 list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) {
4444 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
4445 if (atsr->segment != pci_domain_nr(dev->bus))
4448 for_each_dev_scope(atsru->devices, atsru->devices_cnt, i, tmp)
4449 if (tmp == &bridge->dev)
4452 if (atsru->include_all)
4462 int dmar_iommu_notify_scope_dev(struct dmar_pci_notify_info *info)
4465 struct dmar_rmrr_unit *rmrru;
4466 struct dmar_atsr_unit *atsru;
4467 struct acpi_dmar_atsr *atsr;
4468 struct acpi_dmar_reserved_memory *rmrr;
4470 if (!intel_iommu_enabled && system_state >= SYSTEM_RUNNING)
4473 list_for_each_entry(rmrru, &dmar_rmrr_units, list) {
4474 rmrr = container_of(rmrru->hdr,
4475 struct acpi_dmar_reserved_memory, header);
4476 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
4477 ret = dmar_insert_dev_scope(info, (void *)(rmrr + 1),
4478 ((void *)rmrr) + rmrr->header.length,
4479 rmrr->segment, rmrru->devices,
4480 rmrru->devices_cnt);
4483 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
4484 dmar_remove_dev_scope(info, rmrr->segment,
4485 rmrru->devices, rmrru->devices_cnt);
4489 list_for_each_entry(atsru, &dmar_atsr_units, list) {
4490 if (atsru->include_all)
4493 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
4494 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
4495 ret = dmar_insert_dev_scope(info, (void *)(atsr + 1),
4496 (void *)atsr + atsr->header.length,
4497 atsr->segment, atsru->devices,
4498 atsru->devices_cnt);
4503 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
4504 if (dmar_remove_dev_scope(info, atsr->segment,
4505 atsru->devices, atsru->devices_cnt))
4514 * Here we only respond to a device being unbound from its driver.
4516 * A newly added device is not attached to its DMAR domain here yet; that will happen
4517 * when the device is first mapped to an iova.
4519 static int device_notifier(struct notifier_block *nb,
4520 unsigned long action, void *data)
4522 struct device *dev = data;
4523 struct dmar_domain *domain;
4525 if (iommu_dummy(dev))
4528 if (action != BUS_NOTIFY_REMOVED_DEVICE)
4531 domain = find_domain(dev);
4535 dmar_remove_one_dev_info(domain, dev);
4536 if (!domain_type_is_vm_or_si(domain) && list_empty(&domain->devices))
4537 domain_exit(domain);
4542 static struct notifier_block device_nb = {
4543 .notifier_call = device_notifier,
4546 static int intel_iommu_memory_notifier(struct notifier_block *nb,
4547 unsigned long val, void *v)
4549 struct memory_notify *mhp = v;
4550 unsigned long long start, end;
4551 unsigned long start_vpfn, last_vpfn;
4554 case MEM_GOING_ONLINE:
4555 start = mhp->start_pfn << PAGE_SHIFT;
4556 end = ((mhp->start_pfn + mhp->nr_pages) << PAGE_SHIFT) - 1;
4557 if (iommu_domain_identity_map(si_domain, start, end)) {
4558 pr_warn("Failed to build identity map for [%llx-%llx]\n",
4565 case MEM_CANCEL_ONLINE:
4566 start_vpfn = mm_to_dma_pfn(mhp->start_pfn);
4567 last_vpfn = mm_to_dma_pfn(mhp->start_pfn + mhp->nr_pages - 1);
4568 while (start_vpfn <= last_vpfn) {
4570 struct dmar_drhd_unit *drhd;
4571 struct intel_iommu *iommu;
4572 struct page *freelist;
4574 iova = find_iova(&si_domain->iovad, start_vpfn);
4576 pr_debug("Failed get IOVA for PFN %lx\n",
4581 iova = split_and_remove_iova(&si_domain->iovad, iova,
4582 start_vpfn, last_vpfn);
4584 pr_warn("Failed to split IOVA PFN [%lx-%lx]\n",
4585 start_vpfn, last_vpfn);
4589 freelist = domain_unmap(si_domain, iova->pfn_lo,
4593 for_each_active_iommu(iommu, drhd)
4594 iommu_flush_iotlb_psi(iommu, si_domain,
4595 iova->pfn_lo, iova_size(iova),
4598 dma_free_pagelist(freelist);
4600 start_vpfn = iova->pfn_hi + 1;
4601 free_iova_mem(iova);
4609 static struct notifier_block intel_iommu_memory_nb = {
4610 .notifier_call = intel_iommu_memory_notifier,
4614 static void free_all_cpu_cached_iovas(unsigned int cpu)
4618 for (i = 0; i < g_num_of_iommus; i++) {
4619 struct intel_iommu *iommu = g_iommus[i];
4620 struct dmar_domain *domain;
4626 for (did = 0; did < cap_ndoms(iommu->cap); did++) {
4627 domain = get_iommu_domain(iommu, (u16)did);
4631 free_cpu_cached_iovas(cpu, &domain->iovad);
4636 static int intel_iommu_cpu_dead(unsigned int cpu)
4638 free_all_cpu_cached_iovas(cpu);
4642 static void intel_disable_iommus(void)
4644 struct intel_iommu *iommu = NULL;
4645 struct dmar_drhd_unit *drhd;
4647 for_each_iommu(iommu, drhd)
4648 iommu_disable_translation(iommu);
4651 static inline struct intel_iommu *dev_to_intel_iommu(struct device *dev)
4653 struct iommu_device *iommu_dev = dev_to_iommu_device(dev);
4655 return container_of(iommu_dev, struct intel_iommu, iommu);
4658 static ssize_t intel_iommu_show_version(struct device *dev,
4659 struct device_attribute *attr,
4662 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4663 u32 ver = readl(iommu->reg + DMAR_VER_REG);
4664 return sprintf(buf, "%d:%d\n",
4665 DMAR_VER_MAJOR(ver), DMAR_VER_MINOR(ver));
4667 static DEVICE_ATTR(version, S_IRUGO, intel_iommu_show_version, NULL);
4669 static ssize_t intel_iommu_show_address(struct device *dev,
4670 struct device_attribute *attr,
4673 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4674 return sprintf(buf, "%llx\n", iommu->reg_phys);
4676 static DEVICE_ATTR(address, S_IRUGO, intel_iommu_show_address, NULL);
4678 static ssize_t intel_iommu_show_cap(struct device *dev,
4679 struct device_attribute *attr,
4682 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4683 return sprintf(buf, "%llx\n", iommu->cap);
4685 static DEVICE_ATTR(cap, S_IRUGO, intel_iommu_show_cap, NULL);
4687 static ssize_t intel_iommu_show_ecap(struct device *dev,
4688 struct device_attribute *attr,
4691 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4692 return sprintf(buf, "%llx\n", iommu->ecap);
4694 static DEVICE_ATTR(ecap, S_IRUGO, intel_iommu_show_ecap, NULL);
4696 static ssize_t intel_iommu_show_ndoms(struct device *dev,
4697 struct device_attribute *attr,
4700 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4701 return sprintf(buf, "%ld\n", cap_ndoms(iommu->cap));
4703 static DEVICE_ATTR(domains_supported, S_IRUGO, intel_iommu_show_ndoms, NULL);
4705 static ssize_t intel_iommu_show_ndoms_used(struct device *dev,
4706 struct device_attribute *attr,
4709 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4710 return sprintf(buf, "%d\n", bitmap_weight(iommu->domain_ids,
4711 cap_ndoms(iommu->cap)));
4713 static DEVICE_ATTR(domains_used, S_IRUGO, intel_iommu_show_ndoms_used, NULL);
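/*
 * Where these attributes appear (assumed path, derived from the sysfs
 * registration in intel_iommu_init() and the group name below):
 * /sys/class/iommu/dmar<N>/intel-iommu/{version,address,cap,ecap,
 * domains_supported,domains_used}, all read-only.
 */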
4715 static struct attribute *intel_iommu_attrs[] = {
4716 &dev_attr_version.attr,
4717 &dev_attr_address.attr,
4719 &dev_attr_ecap.attr,
4720 &dev_attr_domains_supported.attr,
4721 &dev_attr_domains_used.attr,
4725 static struct attribute_group intel_iommu_group = {
4726 .name = "intel-iommu",
4727 .attrs = intel_iommu_attrs,
4730 const struct attribute_group *intel_iommu_groups[] = {
4735 static int __init platform_optin_force_iommu(void)
4737 struct pci_dev *pdev = NULL;
4738 bool has_untrusted_dev = false;
4740 if (!dmar_platform_optin() || no_platform_optin)
4743 for_each_pci_dev(pdev) {
4744 if (pdev->untrusted) {
4745 has_untrusted_dev = true;
4750 if (!has_untrusted_dev)
4753 if (no_iommu || dmar_disabled)
4754 pr_info("Intel-IOMMU force enabled due to platform opt in\n");
4757 * If Intel-IOMMU is disabled by default, we will apply identity
4758 * map for all devices except those marked as being untrusted.
4761 iommu_identity_mapping |= IDENTMAP_ALL;
4764 #if defined(CONFIG_X86) && defined(CONFIG_SWIOTLB)
4772 int __init intel_iommu_init(void)
4775 struct dmar_drhd_unit *drhd;
4776 struct intel_iommu *iommu;
4779 * Intel IOMMU is required for a TXT/tboot launch or platform
4780 * opt in, so enforce that.
4782 force_on = tboot_force_iommu() || platform_optin_force_iommu();
4784 if (iommu_init_mempool()) {
4786 panic("tboot: Failed to initialize iommu memory\n");
4790 down_write(&dmar_global_lock);
4791 if (dmar_table_init()) {
4793 panic("tboot: Failed to initialize DMAR table\n");
4797 if (dmar_dev_scope_init() < 0) {
4799 panic("tboot: Failed to initialize DMAR device scope\n");
4803 up_write(&dmar_global_lock);
4806 * The bus notifier takes the dmar_global_lock, so lockdep will
4807 * complain later when we register it under the lock.
4809 dmar_register_bus_notifier();
4811 down_write(&dmar_global_lock);
4813 if (no_iommu || dmar_disabled) {
4815 * We exit the function here to ensure IOMMU's remapping and
4816 * mempool aren't setup, which means that the IOMMU's PMRs
4817 * won't be disabled via the call to init_dmars(). So disable
4818 * it explicitly here. The PMRs were setup by tboot prior to
4819 * calling SENTER, but the kernel is expected to reset/tear
4822 if (intel_iommu_tboot_noforce) {
4823 for_each_iommu(iommu, drhd)
4824 iommu_disable_protect_mem_regions(iommu);
4828 * Make sure the IOMMUs are switched off, even when we
4829 * boot into a kexec kernel and the previous kernel left them enabled.
4832 intel_disable_iommus();
4836 if (list_empty(&dmar_rmrr_units))
4837 pr_info("No RMRR found\n");
4839 if (list_empty(&dmar_atsr_units))
4840 pr_info("No ATSR found\n");
4842 if (dmar_init_reserved_ranges()) {
4844 panic("tboot: Failed to reserve iommu ranges\n");
4845 goto out_free_reserved_range;
4848 init_no_remapping_devices();
4853 panic("tboot: Failed to initialize DMARs\n");
4854 pr_err("Initialization failed\n");
4855 goto out_free_reserved_range;
4857 up_write(&dmar_global_lock);
4858 pr_info("Intel(R) Virtualization Technology for Directed I/O\n");
4860 #if defined(CONFIG_X86) && defined(CONFIG_SWIOTLB)
4863 dma_ops = &intel_dma_ops;
4865 init_iommu_pm_ops();
4867 for_each_active_iommu(iommu, drhd) {
4868 iommu_device_sysfs_add(&iommu->iommu, NULL,
4871 iommu_device_set_ops(&iommu->iommu, &intel_iommu_ops);
4872 iommu_device_register(&iommu->iommu);
4875 bus_set_iommu(&pci_bus_type, &intel_iommu_ops);
4876 bus_register_notifier(&pci_bus_type, &device_nb);
4877 if (si_domain && !hw_pass_through)
4878 register_memory_notifier(&intel_iommu_memory_nb);
4879 cpuhp_setup_state(CPUHP_IOMMU_INTEL_DEAD, "iommu/intel:dead", NULL,
4880 intel_iommu_cpu_dead);
4881 intel_iommu_enabled = 1;
4882 intel_iommu_debugfs_init();
4886 out_free_reserved_range:
4887 put_iova_domain(&reserved_iova_list);
4889 intel_iommu_free_dmars();
4890 up_write(&dmar_global_lock);
4891 iommu_exit_mempool();
4895 static int domain_context_clear_one_cb(struct pci_dev *pdev, u16 alias, void *opaque)
4897 struct intel_iommu *iommu = opaque;
4899 domain_context_clear_one(iommu, PCI_BUS_NUM(alias), alias & 0xff);
4904 * NB - intel-iommu lacks any sort of reference counting for the users of
4905 * dependent devices. If multiple endpoints have intersecting dependent
4906 * devices, unbinding the driver from any one of them will possibly leave
4907 * the others unable to operate.
4909 static void domain_context_clear(struct intel_iommu *iommu, struct device *dev)
4911 if (!iommu || !dev || !dev_is_pci(dev))
4914 pci_for_each_dma_alias(to_pci_dev(dev), &domain_context_clear_one_cb, iommu);
4917 static void __dmar_remove_one_dev_info(struct device_domain_info *info)
4919 struct intel_iommu *iommu;
4920 unsigned long flags;
4922 assert_spin_locked(&device_domain_lock);
4927 iommu = info->iommu;
4930 iommu_disable_dev_iotlb(info);
4931 domain_context_clear(iommu, info->dev);
4932 intel_pasid_free_table(info->dev);
4935 unlink_domain_info(info);
4937 spin_lock_irqsave(&iommu->lock, flags);
4938 domain_detach_iommu(info->domain, iommu);
4939 spin_unlock_irqrestore(&iommu->lock, flags);
4941 free_devinfo_mem(info);
4944 static void dmar_remove_one_dev_info(struct dmar_domain *domain,
4947 struct device_domain_info *info;
4948 unsigned long flags;
4950 spin_lock_irqsave(&device_domain_lock, flags);
4951 info = dev->archdata.iommu;
4952 __dmar_remove_one_dev_info(info);
4953 spin_unlock_irqrestore(&device_domain_lock, flags);
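/*
 * Initialise a domain created through the IOMMU API: set up its IOVA
 * allocator, reserve the special ranges, derive the AGAW from the
 * requested guest address width and allocate the top-level page table.
 */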
4956 static int md_domain_init(struct dmar_domain *domain, int guest_width)
4960 init_iova_domain(&domain->iovad, VTD_PAGE_SIZE, IOVA_START_PFN);
4961 domain_reserve_special_ranges(domain);
4963 /* calculate AGAW */
4964 domain->gaw = guest_width;
4965 adjust_width = guestwidth_to_adjustwidth(guest_width);
4966 domain->agaw = width_to_agaw(adjust_width);
4968 domain->iommu_coherency = 0;
4969 domain->iommu_snooping = 0;
4970 domain->iommu_superpage = 0;
4971 domain->max_addr = 0;
4973 /* always allocate the top pgd */
4974 domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
4977 domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
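/*
 * IOMMU API entry point: only unmanaged (virtual machine) domains are
 * supported; allocate one and describe its aperture to the IOMMU core.
 */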
4981 static struct iommu_domain *intel_iommu_domain_alloc(unsigned type)
4983 struct dmar_domain *dmar_domain;
4984 struct iommu_domain *domain;
4986 if (type != IOMMU_DOMAIN_UNMANAGED)
4989 dmar_domain = alloc_domain(DOMAIN_FLAG_VIRTUAL_MACHINE);
4991 pr_err("Can't allocate dmar_domain\n");
4994 if (md_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
4995 pr_err("Domain initialization failed\n");
4996 domain_exit(dmar_domain);
4999 domain_update_iommu_cap(dmar_domain);
5001 domain = &dmar_domain->domain;
5002 domain->geometry.aperture_start = 0;
5003 domain->geometry.aperture_end = __DOMAIN_MAX_ADDR(dmar_domain->gaw);
5004 domain->geometry.force_aperture = true;
5009 static void intel_iommu_domain_free(struct iommu_domain *domain)
5011 domain_exit(to_dmar_domain(domain));
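/*
 * Attach a device to an API-created domain: refuse RMRR-locked devices,
 * detach any previous domain, clamp the domain to the IOMMU's supported
 * address width and add the device to the domain.
 */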
5014 static int intel_iommu_attach_device(struct iommu_domain *domain,
5017 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5018 struct intel_iommu *iommu;
5022 if (device_is_rmrr_locked(dev)) {
5023 dev_warn(dev, "Device is ineligible for IOMMU domain attach due to platform RMRR requirement. Contact your platform vendor.\n");
5027 /* normally dev is not mapped */
5028 if (unlikely(domain_context_mapped(dev))) {
5029 struct dmar_domain *old_domain;
5031 old_domain = find_domain(dev);
5034 dmar_remove_one_dev_info(old_domain, dev);
5037 if (!domain_type_is_vm_or_si(old_domain) &&
5038 list_empty(&old_domain->devices))
5039 domain_exit(old_domain);
5043 iommu = device_to_iommu(dev, &bus, &devfn);
5047 /* check if this iommu agaw is sufficient for max mapped address */
5048 addr_width = agaw_to_width(iommu->agaw);
5049 if (addr_width > cap_mgaw(iommu->cap))
5050 addr_width = cap_mgaw(iommu->cap);
5052 if (dmar_domain->max_addr > (1LL << addr_width)) {
5053 pr_err("%s: iommu width (%d) is not "
5054 "sufficient for the mapped address (%llx)\n",
5055 __func__, addr_width, dmar_domain->max_addr);
5058 dmar_domain->gaw = addr_width;
5061 * Knock out extra levels of page tables if necessary
5063 while (iommu->agaw < dmar_domain->agaw) {
5064 struct dma_pte *pte;
5066 pte = dmar_domain->pgd;
5067 if (dma_pte_present(pte)) {
5068 dmar_domain->pgd = (struct dma_pte *)
5069 phys_to_virt(dma_pte_addr(pte));
5070 free_pgtable_page(pte);
5072 dmar_domain->agaw--;
5075 return domain_add_dev_info(dmar_domain, dev);
5078 static void intel_iommu_detach_device(struct iommu_domain *domain,
5081 dmar_remove_one_dev_info(to_dmar_domain(domain), dev);
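/*
 * IOMMU API map: translate the generic protection flags into VT-d PTE
 * bits, check the range against the domain's address width and install
 * the page table entries.
 */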
5084 static int intel_iommu_map(struct iommu_domain *domain,
5085 unsigned long iova, phys_addr_t hpa,
5086 size_t size, int iommu_prot)
5088 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5093 if (iommu_prot & IOMMU_READ)
5094 prot |= DMA_PTE_READ;
5095 if (iommu_prot & IOMMU_WRITE)
5096 prot |= DMA_PTE_WRITE;
5097 if ((iommu_prot & IOMMU_CACHE) && dmar_domain->iommu_snooping)
5098 prot |= DMA_PTE_SNP;
5100 max_addr = iova + size;
5101 if (dmar_domain->max_addr < max_addr) {
5104 /* check if minimum agaw is sufficient for mapped address */
5105 end = __DOMAIN_MAX_ADDR(dmar_domain->gaw) + 1;
5106 if (end < max_addr) {
5107 pr_err("%s: iommu width (%d) is not "
5108 "sufficient for the mapped address (%llx)\n",
5109 __func__, dmar_domain->gaw, max_addr);
5112 dmar_domain->max_addr = max_addr;
5114 /* Round up size to next multiple of PAGE_SIZE, if it and
5115 the low bits of hpa would take us onto the next page */
5116 size = aligned_nrpages(hpa, size);
5117 ret = domain_pfn_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT,
5118 hpa >> VTD_PAGE_SHIFT, size, prot);
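/*
 * IOMMU API unmap: clear the page table entries covering the range,
 * flush the IOTLB on every IOMMU in the domain and free the page-table
 * pages that are no longer needed.
 */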
5122 static size_t intel_iommu_unmap(struct iommu_domain *domain,
5123 unsigned long iova, size_t size)
5125 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5126 struct page *freelist = NULL;
5127 unsigned long start_pfn, last_pfn;
5128 unsigned int npages;
5129 int iommu_id, level = 0;
5131 /* Cope with horrid API which requires us to unmap more than the
5132 size argument if it happens to be a large-page mapping. */
5133 BUG_ON(!pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level));
5135 if (size < VTD_PAGE_SIZE << level_to_offset_bits(level))
5136 size = VTD_PAGE_SIZE << level_to_offset_bits(level);
5138 start_pfn = iova >> VTD_PAGE_SHIFT;
5139 last_pfn = (iova + size - 1) >> VTD_PAGE_SHIFT;
5141 freelist = domain_unmap(dmar_domain, start_pfn, last_pfn);
5143 npages = last_pfn - start_pfn + 1;
5145 for_each_domain_iommu(iommu_id, dmar_domain)
5146 iommu_flush_iotlb_psi(g_iommus[iommu_id], dmar_domain,
5147 start_pfn, npages, !freelist, 0);
5149 dma_free_pagelist(freelist);
5151 if (dmar_domain->max_addr == iova + size)
5152 dmar_domain->max_addr = iova;
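/* Walk the domain's page table to translate an IOVA back to a physical address. */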
5157 static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
5160 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5161 struct dma_pte *pte;
5165 pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level);
5167 phys = dma_pte_addr(pte);
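/* Advertise cache-coherency (snooping) and interrupt-remapping support to the core. */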
5172 static bool intel_iommu_capable(enum iommu_cap cap)
5174 if (cap == IOMMU_CAP_CACHE_COHERENCY)
5175 return domain_update_iommu_snooping(NULL) == 1;
5176 if (cap == IOMMU_CAP_INTR_REMAP)
5177 return irq_remapping_enabled == 1;
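/*
 * add_device callback: look up the IOMMU behind a newly added device,
 * link the two in sysfs and place the device in an IOMMU group.
 */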
5182 static int intel_iommu_add_device(struct device *dev)
5184 struct intel_iommu *iommu;
5185 struct iommu_group *group;
5188 iommu = device_to_iommu(dev, &bus, &devfn);
5192 iommu_device_link(&iommu->iommu, dev);
5194 group = iommu_group_get_for_dev(dev);
5197 return PTR_ERR(group);
5199 iommu_group_put(group);
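/* Undo intel_iommu_add_device(): drop the group membership and the sysfs link. */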
5203 static void intel_iommu_remove_device(struct device *dev)
5205 struct intel_iommu *iommu;
5208 iommu = device_to_iommu(dev, &bus, &devfn);
5212 iommu_group_remove_device(dev);
5214 iommu_device_unlink(&iommu->iommu, dev);
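/*
 * Report the reserved regions for a device: any RMRRs that name it, plus
 * the IOAPIC range, which must never be handed out as IOVA space.
 */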
5217 static void intel_iommu_get_resv_regions(struct device *device,
5218 struct list_head *head)
5220 struct iommu_resv_region *reg;
5221 struct dmar_rmrr_unit *rmrr;
5222 struct device *i_dev;
5226 for_each_rmrr_units(rmrr) {
5227 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
5229 if (i_dev != device)
5232 list_add_tail(&rmrr->resv->list, head);
5237 reg = iommu_alloc_resv_region(IOAPIC_RANGE_START,
5238 IOAPIC_RANGE_END - IOAPIC_RANGE_START + 1,
5242 list_add_tail(&reg->list, head);
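/* Release the per-call region list built by intel_iommu_get_resv_regions(). */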
5245 static void intel_iommu_put_resv_regions(struct device *dev,
5246 struct list_head *head)
5248 struct iommu_resv_region *entry, *next;
5250 list_for_each_entry_safe(entry, next, head, list) {
5251 if (entry->type == IOMMU_RESV_RESERVED)
5256 #ifdef CONFIG_INTEL_IOMMU_SVM
5257 #define MAX_NR_PASID_BITS (20)
5258 static inline unsigned long intel_iommu_get_pts(struct device *dev)
5262 max_pasid = intel_pasid_get_dev_max_id(dev);
5263 pts = find_first_bit((unsigned long *)&max_pasid, MAX_NR_PASID_BITS);
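/*
 * Prepare a device's context entry for SVM: point it at the PASID tables,
 * pick a PASID-capable translation type, flush the context cache and
 * enable the device IOTLB if the device supports it.
 */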
5270 int intel_iommu_enable_pasid(struct intel_iommu *iommu, struct intel_svm_dev *sdev)
5272 struct device_domain_info *info;
5273 struct context_entry *context;
5274 struct dmar_domain *domain;
5275 unsigned long flags;
5279 domain = get_valid_domain_for_dev(sdev->dev);
5283 spin_lock_irqsave(&device_domain_lock, flags);
5284 spin_lock(&iommu->lock);
5287 info = sdev->dev->archdata.iommu;
5288 if (!info || !info->pasid_supported)
5291 context = iommu_context_addr(iommu, info->bus, info->devfn, 0);
5292 if (WARN_ON(!context))
5295 ctx_lo = context[0].lo;
5297 sdev->did = domain->iommu_did[iommu->seq_id];
5298 sdev->sid = PCI_DEVID(info->bus, info->devfn);
5300 if (!(ctx_lo & CONTEXT_PASIDE)) {
5301 if (iommu->pasid_state_table)
5302 context[1].hi = (u64)virt_to_phys(iommu->pasid_state_table);
5303 context[1].lo = (u64)virt_to_phys(info->pasid_table->table) |
5304 intel_iommu_get_pts(sdev->dev);
5307 /* CONTEXT_TT_MULTI_LEVEL and CONTEXT_TT_DEV_IOTLB are both
5308 * extended to permit requests-with-PASID if the PASIDE bit
5309 * is set. Which makes sense. For CONTEXT_TT_PASS_THROUGH,
5310 * however, the PASIDE bit is ignored and requests-with-PASID
5311 * are unconditionally blocked. Which makes less sense.
5312 * So convert from CONTEXT_TT_PASS_THROUGH to one of the new
5313 * "guest mode" translation types depending on whether ATS
5314 * is available or not. Annoyingly, we can't use the new
5315 * modes *unless* PASIDE is set. */
5316 if ((ctx_lo & CONTEXT_TT_MASK) == (CONTEXT_TT_PASS_THROUGH << 2)) {
5317 ctx_lo &= ~CONTEXT_TT_MASK;
5318 if (info->ats_supported)
5319 ctx_lo |= CONTEXT_TT_PT_PASID_DEV_IOTLB << 2;
5321 ctx_lo |= CONTEXT_TT_PT_PASID << 2;
5323 ctx_lo |= CONTEXT_PASIDE;
5324 if (iommu->pasid_state_table)
5325 ctx_lo |= CONTEXT_DINVE;
5326 if (info->pri_supported)
5327 ctx_lo |= CONTEXT_PRS;
5328 context[0].lo = ctx_lo;
5330 iommu->flush.flush_context(iommu, sdev->did, sdev->sid,
5331 DMA_CCMD_MASK_NOBIT,
5332 DMA_CCMD_DEVICE_INVL);
5335 /* Enable PASID support in the device, if it wasn't already */
5336 if (!info->pasid_enabled)
5337 iommu_enable_dev_iotlb(info);
5339 if (info->ats_enabled) {
5340 sdev->dev_iotlb = 1;
5341 sdev->qdep = info->ats_qdep;
5342 if (sdev->qdep >= QI_DEV_EIOTLB_MAX_INVS)
5348 spin_unlock(&iommu->lock);
5349 spin_unlock_irqrestore(&device_domain_lock, flags);
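/* SVM helper: return the IOMMU translating @dev, or NULL with an error message. */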
5354 struct intel_iommu *intel_svm_device_to_iommu(struct device *dev)
5356 struct intel_iommu *iommu;
5359 if (iommu_dummy(dev)) {
5361 "No IOMMU translation for device; cannot enable SVM\n");
5365 iommu = device_to_iommu(dev, &bus, &devfn);
5367 dev_err(dev, "No IOMMU for device; cannot enable SVM\n");
5373 #endif /* CONFIG_INTEL_IOMMU_SVM */
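/*
 * Callbacks exposed to the generic IOMMU core; iommu_map(), iommu_unmap()
 * and friends dispatch to the intel_iommu_* implementations above.
 */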
5375 const struct iommu_ops intel_iommu_ops = {
5376 .capable = intel_iommu_capable,
5377 .domain_alloc = intel_iommu_domain_alloc,
5378 .domain_free = intel_iommu_domain_free,
5379 .attach_dev = intel_iommu_attach_device,
5380 .detach_dev = intel_iommu_detach_device,
5381 .map = intel_iommu_map,
5382 .unmap = intel_iommu_unmap,
5383 .iova_to_phys = intel_iommu_iova_to_phys,
5384 .add_device = intel_iommu_add_device,
5385 .remove_device = intel_iommu_remove_device,
5386 .get_resv_regions = intel_iommu_get_resv_regions,
5387 .put_resv_regions = intel_iommu_put_resv_regions,
5388 .device_group = pci_device_group,
5389 .pgsize_bitmap = INTEL_IOMMU_PGSIZES,
5392 static void quirk_iommu_g4x_gfx(struct pci_dev *dev)
5394 /* G4x/GM45 integrated gfx dmar support is totally busted. */
5395 pr_info("Disabling IOMMU for graphics on this chipset\n");
5399 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_g4x_gfx);
5400 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_g4x_gfx);
5401 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_g4x_gfx);
5402 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_g4x_gfx);
5403 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_g4x_gfx);
5404 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_g4x_gfx);
5405 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_g4x_gfx);
5407 static void quirk_iommu_rwbf(struct pci_dev *dev)
5410 * Mobile 4 Series Chipset neglects to set RWBF capability,
5411 * but needs it. Same seems to hold for the desktop versions.
5413 pr_info("Forcing write-buffer flush capability\n");
5417 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf);
5418 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_rwbf);
5419 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_rwbf);
5420 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_rwbf);
5421 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_rwbf);
5422 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_rwbf);
5423 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_rwbf);
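/*
 * Graphics memory allocation bits in the GGC (graphics control) register,
 * used by the Calpella quirk below to check whether the BIOS set aside
 * memory for the shadow GTT.
 */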
5426 #define GGC_MEMORY_SIZE_MASK (0xf << 8)
5427 #define GGC_MEMORY_SIZE_NONE (0x0 << 8)
5428 #define GGC_MEMORY_SIZE_1M (0x1 << 8)
5429 #define GGC_MEMORY_SIZE_2M (0x3 << 8)
5430 #define GGC_MEMORY_VT_ENABLED (0x8 << 8)
5431 #define GGC_MEMORY_SIZE_2M_VT (0x9 << 8)
5432 #define GGC_MEMORY_SIZE_3M_VT (0xa << 8)
5433 #define GGC_MEMORY_SIZE_4M_VT (0xb << 8)
5435 static void quirk_calpella_no_shadow_gtt(struct pci_dev *dev)
5439 if (pci_read_config_word(dev, GGC, &ggc))
5442 if (!(ggc & GGC_MEMORY_VT_ENABLED)) {
5443 pr_info("BIOS has allocated no shadow GTT; disabling IOMMU for graphics\n");
5445 } else if (dmar_map_gfx) {
5446 /* we have to ensure the gfx device is idle before we flush */
5447 pr_info("Disabling batched IOTLB flush on Ironlake\n");
5448 intel_iommu_strict = 1;
5451 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0040, quirk_calpella_no_shadow_gtt);
5452 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0044, quirk_calpella_no_shadow_gtt);
5453 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0062, quirk_calpella_no_shadow_gtt);
5454 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x006a, quirk_calpella_no_shadow_gtt);
5456 /* On Tylersburg chipsets, some BIOSes have been known to enable the
5457 ISOCH DMAR unit for the Azalia sound device, but not give it any
5458 TLB entries, which causes it to deadlock. Check for that. We do
5459 this in a function called from init_dmars(), instead of in a PCI
5460 quirk, because we don't want to print the obnoxious "BIOS broken"
5461 message if VT-d is actually disabled.
5463 static void __init check_tylersburg_isoch(void)
5465 struct pci_dev *pdev;
5466 uint32_t vtisochctrl;
5468 /* If there's no Azalia in the system anyway, forget it. */
5469 pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3a3e, NULL);
5474 /* System Management Registers. Might be hidden, in which case
5475 we can't do the sanity check. But that's OK, because the
5476 known-broken BIOSes _don't_ actually hide it, so far. */
5477 pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x342e, NULL);
5481 if (pci_read_config_dword(pdev, 0x188, &vtisochctrl)) {
5488 /* If Azalia DMA is routed to the non-isoch DMAR unit, fine. */
5489 if (vtisochctrl & 1)
5492 /* Drop all bits other than the number of TLB entries */
5493 vtisochctrl &= 0x1c;
5495 /* If we have the recommended number of TLB entries (16), fine. */
5496 if (vtisochctrl == 0x10)
5499 /* Zero TLB entries? You get to ride the short bus to school. */
5501 WARN(1, "Your BIOS is broken; DMA routed to ISOCH DMAR unit but no TLB space.\n"
5502 "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
5503 dmi_get_system_info(DMI_BIOS_VENDOR),
5504 dmi_get_system_info(DMI_BIOS_VERSION),
5505 dmi_get_system_info(DMI_PRODUCT_VERSION));
5506 iommu_identity_mapping |= IDENTMAP_AZALIA;
5510 pr_warn("Recommended TLB entries for ISOCH unit is 16; your BIOS set %d\n",