1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * Copyright © 2006-2014 Intel Corporation.
4  *
5  * Authors: David Woodhouse <dwmw2@infradead.org>,
6  *          Ashok Raj <ashok.raj@intel.com>,
7  *          Shaohua Li <shaohua.li@intel.com>,
8  *          Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>,
9  *          Fenghua Yu <fenghua.yu@intel.com>
10  *          Joerg Roedel <jroedel@suse.de>
11  */
12
13 #define pr_fmt(fmt)     "DMAR: " fmt
14 #define dev_fmt(fmt)    pr_fmt(fmt)
15
16 #include <linux/init.h>
17 #include <linux/bitmap.h>
18 #include <linux/debugfs.h>
19 #include <linux/export.h>
20 #include <linux/slab.h>
21 #include <linux/irq.h>
22 #include <linux/interrupt.h>
23 #include <linux/spinlock.h>
24 #include <linux/pci.h>
25 #include <linux/dmar.h>
26 #include <linux/dma-map-ops.h>
27 #include <linux/mempool.h>
28 #include <linux/memory.h>
29 #include <linux/cpu.h>
30 #include <linux/timer.h>
31 #include <linux/io.h>
32 #include <linux/iova.h>
33 #include <linux/iommu.h>
34 #include <linux/dma-iommu.h>
35 #include <linux/intel-iommu.h>
36 #include <linux/syscore_ops.h>
37 #include <linux/tboot.h>
38 #include <linux/dmi.h>
39 #include <linux/pci-ats.h>
40 #include <linux/memblock.h>
41 #include <linux/dma-map-ops.h>
42 #include <linux/dma-direct.h>
43 #include <linux/crash_dump.h>
44 #include <linux/numa.h>
45 #include <asm/irq_remapping.h>
46 #include <asm/cacheflush.h>
47 #include <asm/iommu.h>
48 #include <trace/events/intel_iommu.h>
49
50 #include "../irq_remapping.h"
51 #include "pasid.h"
52
53 #define ROOT_SIZE               VTD_PAGE_SIZE
54 #define CONTEXT_SIZE            VTD_PAGE_SIZE
55
56 #define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
57 #define IS_USB_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_SERIAL_USB)
58 #define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
59 #define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e)
60
61 #define IOAPIC_RANGE_START      (0xfee00000)
62 #define IOAPIC_RANGE_END        (0xfeefffff)
63 #define IOVA_START_ADDR         (0x1000)
64
65 #define DEFAULT_DOMAIN_ADDRESS_WIDTH 57
66
67 #define MAX_AGAW_WIDTH 64
68 #define MAX_AGAW_PFN_WIDTH      (MAX_AGAW_WIDTH - VTD_PAGE_SHIFT)
69
70 #define __DOMAIN_MAX_PFN(gaw)  ((((uint64_t)1) << ((gaw) - VTD_PAGE_SHIFT)) - 1)
71 #define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << (gaw)) - 1)
72
73 /* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR
74    to match. That way, we can use 'unsigned long' for PFNs with impunity. */
75 #define DOMAIN_MAX_PFN(gaw)     ((unsigned long) min_t(uint64_t, \
76                                 __DOMAIN_MAX_PFN(gaw), (unsigned long)-1))
77 #define DOMAIN_MAX_ADDR(gaw)    (((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT)
78
79 /* IO virtual address start page frame number */
80 #define IOVA_START_PFN          (1)
81
82 #define IOVA_PFN(addr)          ((addr) >> PAGE_SHIFT)
83
84 /* page table handling */
85 #define LEVEL_STRIDE            (9)
86 #define LEVEL_MASK              (((u64)1 << LEVEL_STRIDE) - 1)
87
88 /*
89  * This bitmap is used to advertise the page sizes our hardware supports
90  * to the IOMMU core, which will then use this information to split
91  * physically contiguous memory regions it is mapping into page sizes
92  * that we support.
93  *
94  * Traditionally the IOMMU core just handed us the mappings directly,
95  * after making sure the size is a power-of-two multiple of 4KiB and that the
96  * mapping has natural alignment.
97  *
98  * To retain this behavior, we currently advertise that we support
99  * all page sizes that are a power-of-two multiple of 4KiB.
100  *
101  * If at some point we'd like to utilize the IOMMU core's new behavior,
102  * we could change this to advertise the real page sizes we support.
103  */
104 #define INTEL_IOMMU_PGSIZES     (~0xFFFUL)
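/*
 * ~0xFFFUL has every bit from 12 upwards set, so all power-of-two sizes
 * from 4KiB up appear supported and the IOMMU core never needs to split
 * a mapping on our behalf; the driver keeps handling arbitrarily sized,
 * 4KiB-aligned ranges itself, as described above.
 */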
105
106 static inline int agaw_to_level(int agaw)
107 {
108         return agaw + 2;
109 }
110
111 static inline int agaw_to_width(int agaw)
112 {
113         return min_t(int, 30 + agaw * LEVEL_STRIDE, MAX_AGAW_WIDTH);
114 }
115
116 static inline int width_to_agaw(int width)
117 {
118         return DIV_ROUND_UP(width - 30, LEVEL_STRIDE);
119 }
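/*
 * Worked example of the mapping implied by the three helpers above:
 * agaw 1 is 3-level paging with a 39-bit address width, agaw 2 is
 * 4-level paging with 48 bits, and agaw 3 is 5-level paging with
 * 57 bits (width = 30 + 9 * agaw, level = agaw + 2).
 */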
120
121 static inline unsigned int level_to_offset_bits(int level)
122 {
123         return (level - 1) * LEVEL_STRIDE;
124 }
125
126 static inline int pfn_level_offset(u64 pfn, int level)
127 {
128         return (pfn >> level_to_offset_bits(level)) & LEVEL_MASK;
129 }
130
131 static inline u64 level_mask(int level)
132 {
133         return -1ULL << level_to_offset_bits(level);
134 }
135
136 static inline u64 level_size(int level)
137 {
138         return 1ULL << level_to_offset_bits(level);
139 }
140
141 static inline u64 align_to_level(u64 pfn, int level)
142 {
143         return (pfn + level_size(level) - 1) & level_mask(level);
144 }
145
146 static inline unsigned long lvl_to_nr_pages(unsigned int lvl)
147 {
148         return 1UL << min_t(int, (lvl - 1) * LEVEL_STRIDE, MAX_AGAW_PFN_WIDTH);
149 }
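/*
 * Each level adds LEVEL_STRIDE (9) address bits, so a level-1 PTE maps a
 * single 4KiB page, a level-2 entry spans 512 pages (2MiB) and a level-3
 * entry spans 256K pages (1GiB); level_size() and lvl_to_nr_pages()
 * return those spans in units of VT-d pages.
 */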
150
151 /* VT-d pages must never be _larger_ than MM pages. Otherwise things
152    are never going to work. */
153 static inline unsigned long dma_to_mm_pfn(unsigned long dma_pfn)
154 {
155         return dma_pfn >> (PAGE_SHIFT - VTD_PAGE_SHIFT);
156 }
157
158 static inline unsigned long mm_to_dma_pfn(unsigned long mm_pfn)
159 {
160         return mm_pfn << (PAGE_SHIFT - VTD_PAGE_SHIFT);
161 }
162 static inline unsigned long page_to_dma_pfn(struct page *pg)
163 {
164         return mm_to_dma_pfn(page_to_pfn(pg));
165 }
166 static inline unsigned long virt_to_dma_pfn(void *p)
167 {
168         return page_to_dma_pfn(virt_to_page(p));
169 }
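/*
 * On x86 both PAGE_SHIFT and VTD_PAGE_SHIFT are 12, so the mm <-> dma
 * pfn conversions above are no-ops; they only shift when the CPU page
 * size is larger than the 4KiB VT-d page size.
 */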
170
171 /* global iommu list, set NULL for ignored DMAR units */
172 static struct intel_iommu **g_iommus;
173
174 static void __init check_tylersburg_isoch(void);
175 static int rwbf_quirk;
176
177 /*
178  * set to 1 to panic the kernel if VT-d can't be enabled successfully
179  * (used when the kernel is launched with TXT)
180  */
181 static int force_on = 0;
182 static int intel_iommu_tboot_noforce;
183 static int no_platform_optin;
184
185 #define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
186
187 /*
188  * Take a root_entry and return the Lower Context Table Pointer (LCTP)
189  * if marked present.
190  */
191 static phys_addr_t root_entry_lctp(struct root_entry *re)
192 {
193         if (!(re->lo & 1))
194                 return 0;
195
196         return re->lo & VTD_PAGE_MASK;
197 }
198
199 /*
200  * Take a root_entry and return the Upper Context Table Pointer (UCTP)
201  * if marked present.
202  */
203 static phys_addr_t root_entry_uctp(struct root_entry *re)
204 {
205         if (!(re->hi & 1))
206                 return 0;
207
208         return re->hi & VTD_PAGE_MASK;
209 }
210
211 static inline void context_clear_pasid_enable(struct context_entry *context)
212 {
213         context->lo &= ~(1ULL << 11);
214 }
215
216 static inline bool context_pasid_enabled(struct context_entry *context)
217 {
218         return !!(context->lo & (1ULL << 11));
219 }
220
221 static inline void context_set_copied(struct context_entry *context)
222 {
223         context->hi |= (1ull << 3);
224 }
225
226 static inline bool context_copied(struct context_entry *context)
227 {
228         return !!(context->hi & (1ULL << 3));
229 }
230
231 static inline bool __context_present(struct context_entry *context)
232 {
233         return (context->lo & 1);
234 }
235
236 bool context_present(struct context_entry *context)
237 {
238         return context_pasid_enabled(context) ?
239              __context_present(context) :
240              __context_present(context) && !context_copied(context);
241 }
242
243 static inline void context_set_present(struct context_entry *context)
244 {
245         context->lo |= 1;
246 }
247
248 static inline void context_set_fault_enable(struct context_entry *context)
249 {
250         context->lo &= (((u64)-1) << 2) | 1;
251 }
252
253 static inline void context_set_translation_type(struct context_entry *context,
254                                                 unsigned long value)
255 {
256         context->lo &= (((u64)-1) << 4) | 3;
257         context->lo |= (value & 3) << 2;
258 }
259
260 static inline void context_set_address_root(struct context_entry *context,
261                                             unsigned long value)
262 {
263         context->lo &= ~VTD_PAGE_MASK;
264         context->lo |= value & VTD_PAGE_MASK;
265 }
266
267 static inline void context_set_address_width(struct context_entry *context,
268                                              unsigned long value)
269 {
270         context->hi |= value & 7;
271 }
272
273 static inline void context_set_domain_id(struct context_entry *context,
274                                          unsigned long value)
275 {
276         context->hi |= (value & ((1 << 16) - 1)) << 8;
277 }
278
279 static inline int context_domain_id(struct context_entry *c)
280 {
281         return((c->hi >> 8) & 0xffff);
282 }
283
284 static inline void context_clear_entry(struct context_entry *context)
285 {
286         context->lo = 0;
287         context->hi = 0;
288 }
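/*
 * Layout implied by the context-entry helpers above: in the low word,
 * bit 0 is the present bit, bit 1 disables fault processing, bits 3:2
 * select the translation type, bit 11 enables PASID and the page-aligned
 * upper bits hold the address root; in the high word, bits 2:0 encode
 * the address width, bit 3 marks an entry copied from a previous kernel
 * and bits 23:8 hold the domain id.
 */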
289
290 /*
291  * This domain is a static identity mapping domain.
292  *      1. This domain creates a static 1:1 mapping to all usable memory.
293  *      2. It maps to each iommu if successful.
294  *      3. Each iommu maps to this domain if successful.
295  */
296 static struct dmar_domain *si_domain;
297 static int hw_pass_through = 1;
298
299 #define for_each_domain_iommu(idx, domain)                      \
300         for (idx = 0; idx < g_num_of_iommus; idx++)             \
301                 if (domain->iommu_refcnt[idx])
302
303 struct dmar_rmrr_unit {
304         struct list_head list;          /* list of rmrr units   */
305         struct acpi_dmar_header *hdr;   /* ACPI header          */
306         u64     base_address;           /* reserved base address*/
307         u64     end_address;            /* reserved end address */
308         struct dmar_dev_scope *devices; /* target devices */
309         int     devices_cnt;            /* target device count */
310 };
311
312 struct dmar_atsr_unit {
313         struct list_head list;          /* list of ATSR units */
314         struct acpi_dmar_header *hdr;   /* ACPI header */
315         struct dmar_dev_scope *devices; /* target devices */
316         int devices_cnt;                /* target device count */
317         u8 include_all:1;               /* include all ports */
318 };
319
320 static LIST_HEAD(dmar_atsr_units);
321 static LIST_HEAD(dmar_rmrr_units);
322
323 #define for_each_rmrr_units(rmrr) \
324         list_for_each_entry(rmrr, &dmar_rmrr_units, list)
325
326 /* number of IOMMUs in the system, used to size and index g_iommus */
327 static int g_num_of_iommus;
328
329 static void domain_exit(struct dmar_domain *domain);
330 static void domain_remove_dev_info(struct dmar_domain *domain);
331 static void dmar_remove_one_dev_info(struct device *dev);
332 static void __dmar_remove_one_dev_info(struct device_domain_info *info);
333 static int intel_iommu_attach_device(struct iommu_domain *domain,
334                                      struct device *dev);
335 static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
336                                             dma_addr_t iova);
337
338 #ifdef CONFIG_INTEL_IOMMU_DEFAULT_ON
339 int dmar_disabled = 0;
340 #else
341 int dmar_disabled = 1;
342 #endif /* CONFIG_INTEL_IOMMU_DEFAULT_ON */
343
344 #ifdef CONFIG_INTEL_IOMMU_SCALABLE_MODE_DEFAULT_ON
345 int intel_iommu_sm = 1;
346 #else
347 int intel_iommu_sm;
348 #endif /* CONFIG_INTEL_IOMMU_SCALABLE_MODE_DEFAULT_ON */
349
350 int intel_iommu_enabled = 0;
351 EXPORT_SYMBOL_GPL(intel_iommu_enabled);
352
353 static int dmar_map_gfx = 1;
354 static int dmar_forcedac;
355 static int intel_iommu_strict;
356 static int intel_iommu_superpage = 1;
357 static int iommu_identity_mapping;
358 static int iommu_skip_te_disable;
359
360 #define IDENTMAP_GFX            2
361 #define IDENTMAP_AZALIA         4
362
363 int intel_iommu_gfx_mapped;
364 EXPORT_SYMBOL_GPL(intel_iommu_gfx_mapped);
365
366 #define DEFER_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-2))
367 struct device_domain_info *get_domain_info(struct device *dev)
368 {
369         struct device_domain_info *info;
370
371         if (!dev)
372                 return NULL;
373
374         info = dev_iommu_priv_get(dev);
375         if (unlikely(info == DEFER_DEVICE_DOMAIN_INFO))
376                 return NULL;
377
378         return info;
379 }
380
381 DEFINE_SPINLOCK(device_domain_lock);
382 static LIST_HEAD(device_domain_list);
383
384 /*
385  * Iterate over elements in device_domain_list and call the specified
386  * callback @fn against each element.
387  */
388 int for_each_device_domain(int (*fn)(struct device_domain_info *info,
389                                      void *data), void *data)
390 {
391         int ret = 0;
392         unsigned long flags;
393         struct device_domain_info *info;
394
395         spin_lock_irqsave(&device_domain_lock, flags);
396         list_for_each_entry(info, &device_domain_list, global) {
397                 ret = fn(info, data);
398                 if (ret) {
399                         spin_unlock_irqrestore(&device_domain_lock, flags);
400                         return ret;
401                 }
402         }
403         spin_unlock_irqrestore(&device_domain_lock, flags);
404
405         return 0;
406 }
407
408 const struct iommu_ops intel_iommu_ops;
409
410 static bool translation_pre_enabled(struct intel_iommu *iommu)
411 {
412         return (iommu->flags & VTD_FLAG_TRANS_PRE_ENABLED);
413 }
414
415 static void clear_translation_pre_enabled(struct intel_iommu *iommu)
416 {
417         iommu->flags &= ~VTD_FLAG_TRANS_PRE_ENABLED;
418 }
419
420 static void init_translation_status(struct intel_iommu *iommu)
421 {
422         u32 gsts;
423
424         gsts = readl(iommu->reg + DMAR_GSTS_REG);
425         if (gsts & DMA_GSTS_TES)
426                 iommu->flags |= VTD_FLAG_TRANS_PRE_ENABLED;
427 }
428
429 static int __init intel_iommu_setup(char *str)
430 {
431         if (!str)
432                 return -EINVAL;
433         while (*str) {
434                 if (!strncmp(str, "on", 2)) {
435                         dmar_disabled = 0;
436                         pr_info("IOMMU enabled\n");
437                 } else if (!strncmp(str, "off", 3)) {
438                         dmar_disabled = 1;
439                         no_platform_optin = 1;
440                         pr_info("IOMMU disabled\n");
441                 } else if (!strncmp(str, "igfx_off", 8)) {
442                         dmar_map_gfx = 0;
443                         pr_info("Disable GFX device mapping\n");
444                 } else if (!strncmp(str, "forcedac", 8)) {
445                         pr_info("Forcing DAC for PCI devices\n");
446                         dmar_forcedac = 1;
447                 } else if (!strncmp(str, "strict", 6)) {
448                         pr_info("Disable batched IOTLB flush\n");
449                         intel_iommu_strict = 1;
450                 } else if (!strncmp(str, "sp_off", 6)) {
451                         pr_info("Disable supported super page\n");
452                         intel_iommu_superpage = 0;
453                 } else if (!strncmp(str, "sm_on", 5)) {
454                         pr_info("Intel-IOMMU: scalable mode supported\n");
455                         intel_iommu_sm = 1;
456                 } else if (!strncmp(str, "tboot_noforce", 13)) {
457                         pr_info("Intel-IOMMU: not forcing on after tboot. This could expose security risk for tboot\n");
458                         intel_iommu_tboot_noforce = 1;
459                 }
460
461                 str += strcspn(str, ",");
462                 while (*str == ',')
463                         str++;
464         }
465         return 0;
466 }
467 __setup("intel_iommu=", intel_iommu_setup);
468
469 static struct kmem_cache *iommu_domain_cache;
470 static struct kmem_cache *iommu_devinfo_cache;
471
472 static struct dmar_domain *get_iommu_domain(struct intel_iommu *iommu, u16 did)
473 {
474         struct dmar_domain **domains;
475         int idx = did >> 8;
476
477         domains = iommu->domains[idx];
478         if (!domains)
479                 return NULL;
480
481         return domains[did & 0xff];
482 }
483
484 static void set_iommu_domain(struct intel_iommu *iommu, u16 did,
485                              struct dmar_domain *domain)
486 {
487         struct dmar_domain **domains;
488         int idx = did >> 8;
489
490         if (!iommu->domains[idx]) {
491                 size_t size = 256 * sizeof(struct dmar_domain *);
492                 iommu->domains[idx] = kzalloc(size, GFP_ATOMIC);
493         }
494
495         domains = iommu->domains[idx];
496         if (WARN_ON(!domains))
497                 return;
498         else
499                 domains[did & 0xff] = domain;
500 }
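/*
 * The pair of helpers above implements a two-level lookup keyed by
 * domain ID: the high byte of the DID indexes iommu->domains[] and the
 * low byte indexes a lazily allocated page of 256 dmar_domain pointers.
 */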
501
502 void *alloc_pgtable_page(int node)
503 {
504         struct page *page;
505         void *vaddr = NULL;
506
507         page = alloc_pages_node(node, GFP_ATOMIC | __GFP_ZERO, 0);
508         if (page)
509                 vaddr = page_address(page);
510         return vaddr;
511 }
512
513 void free_pgtable_page(void *vaddr)
514 {
515         free_page((unsigned long)vaddr);
516 }
517
518 static inline void *alloc_domain_mem(void)
519 {
520         return kmem_cache_alloc(iommu_domain_cache, GFP_ATOMIC);
521 }
522
523 static void free_domain_mem(void *vaddr)
524 {
525         kmem_cache_free(iommu_domain_cache, vaddr);
526 }
527
528 static inline void *alloc_devinfo_mem(void)
529 {
530         return kmem_cache_alloc(iommu_devinfo_cache, GFP_ATOMIC);
531 }
532
533 static inline void free_devinfo_mem(void *vaddr)
534 {
535         kmem_cache_free(iommu_devinfo_cache, vaddr);
536 }
537
538 static inline int domain_type_is_si(struct dmar_domain *domain)
539 {
540         return domain->flags & DOMAIN_FLAG_STATIC_IDENTITY;
541 }
542
543 static inline bool domain_use_first_level(struct dmar_domain *domain)
544 {
545         return domain->flags & DOMAIN_FLAG_USE_FIRST_LEVEL;
546 }
547
548 static inline int domain_pfn_supported(struct dmar_domain *domain,
549                                        unsigned long pfn)
550 {
551         int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
552
553         return !(addr_width < BITS_PER_LONG && pfn >> addr_width);
554 }
555
556 static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw)
557 {
558         unsigned long sagaw;
559         int agaw = -1;
560
561         sagaw = cap_sagaw(iommu->cap);
562         for (agaw = width_to_agaw(max_gaw);
563              agaw >= 0; agaw--) {
564                 if (test_bit(agaw, &sagaw))
565                         break;
566         }
567
568         return agaw;
569 }
570
571 /*
572  * Calculate max SAGAW for each iommu.
573  */
574 int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
575 {
576         return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH);
577 }
578
579 /*
580  * Calculate the agaw for each iommu.
581  * "SAGAW" may be different across iommus: use a default agaw, and
582  * fall back to a smaller supported agaw for iommus that don't support the default.
583  */
584 int iommu_calculate_agaw(struct intel_iommu *iommu)
585 {
586         return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH);
587 }
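/*
 * cap_sagaw() is a bitmap indexed by agaw value, so (consistent with the
 * width mapping above) a set bit 2 means 4-level/48-bit translation is
 * supported and a set bit 3 means 5-level/57-bit. __iommu_calculate_agaw()
 * simply picks the largest supported agaw not exceeding the requested width.
 */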
588
589 /* This function only returns a single iommu in a domain */
590 struct intel_iommu *domain_get_iommu(struct dmar_domain *domain)
591 {
592         int iommu_id;
593
594         /* si_domain and vm domain should not get here. */
595         if (WARN_ON(domain->domain.type != IOMMU_DOMAIN_DMA))
596                 return NULL;
597
598         for_each_domain_iommu(iommu_id, domain)
599                 break;
600
601         if (iommu_id < 0 || iommu_id >= g_num_of_iommus)
602                 return NULL;
603
604         return g_iommus[iommu_id];
605 }
606
607 static inline bool iommu_paging_structure_coherency(struct intel_iommu *iommu)
608 {
609         return sm_supported(iommu) ?
610                         ecap_smpwc(iommu->ecap) : ecap_coherent(iommu->ecap);
611 }
612
613 static void domain_update_iommu_coherency(struct dmar_domain *domain)
614 {
615         struct dmar_drhd_unit *drhd;
616         struct intel_iommu *iommu;
617         bool found = false;
618         int i;
619
620         domain->iommu_coherency = 1;
621
622         for_each_domain_iommu(i, domain) {
623                 found = true;
624                 if (!iommu_paging_structure_coherency(g_iommus[i])) {
625                         domain->iommu_coherency = 0;
626                         break;
627                 }
628         }
629         if (found)
630                 return;
631
632         /* No hardware attached; use lowest common denominator */
633         rcu_read_lock();
634         for_each_active_iommu(iommu, drhd) {
635                 if (!iommu_paging_structure_coherency(iommu)) {
636                         domain->iommu_coherency = 0;
637                         break;
638                 }
639         }
640         rcu_read_unlock();
641 }
642
643 static int domain_update_iommu_snooping(struct intel_iommu *skip)
644 {
645         struct dmar_drhd_unit *drhd;
646         struct intel_iommu *iommu;
647         int ret = 1;
648
649         rcu_read_lock();
650         for_each_active_iommu(iommu, drhd) {
651                 if (iommu != skip) {
652                         if (!ecap_sc_support(iommu->ecap)) {
653                                 ret = 0;
654                                 break;
655                         }
656                 }
657         }
658         rcu_read_unlock();
659
660         return ret;
661 }
662
663 static int domain_update_iommu_superpage(struct dmar_domain *domain,
664                                          struct intel_iommu *skip)
665 {
666         struct dmar_drhd_unit *drhd;
667         struct intel_iommu *iommu;
668         int mask = 0x3;
669
670         if (!intel_iommu_superpage) {
671                 return 0;
672         }
673
674         /* set iommu_superpage to the smallest common denominator */
675         rcu_read_lock();
676         for_each_active_iommu(iommu, drhd) {
677                 if (iommu != skip) {
678                         if (domain && domain_use_first_level(domain)) {
679                                 if (!cap_fl1gp_support(iommu->cap))
680                                         mask = 0x1;
681                         } else {
682                                 mask &= cap_super_page_val(iommu->cap);
683                         }
684
685                         if (!mask)
686                                 break;
687                 }
688         }
689         rcu_read_unlock();
690
691         return fls(mask);
692 }
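/*
 * mask starts out as 0x3 (2MiB and 1GiB candidates) and is narrowed by
 * each IOMMU's capabilities, so the fls() result is the number of usable
 * superpage levels: 0 = none, 1 = 2MiB only, 2 = up to 1GiB.
 */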
693
694 static int domain_update_device_node(struct dmar_domain *domain)
695 {
696         struct device_domain_info *info;
697         int nid = NUMA_NO_NODE;
698
699         assert_spin_locked(&device_domain_lock);
700
701         if (list_empty(&domain->devices))
702                 return NUMA_NO_NODE;
703
704         list_for_each_entry(info, &domain->devices, link) {
705                 if (!info->dev)
706                         continue;
707
708                 /*
709                  * There could possibly be multiple device NUMA nodes, as devices
710                  * within the same domain may sit behind different IOMMUs. There
711                  * isn't a perfect answer in such a situation, so we use a
712                  * first-come, first-served policy.
713                  */
714                 nid = dev_to_node(info->dev);
715                 if (nid != NUMA_NO_NODE)
716                         break;
717         }
718
719         return nid;
720 }
721
722 /* Some capabilities may be different across iommus */
723 static void domain_update_iommu_cap(struct dmar_domain *domain)
724 {
725         domain_update_iommu_coherency(domain);
726         domain->iommu_snooping = domain_update_iommu_snooping(NULL);
727         domain->iommu_superpage = domain_update_iommu_superpage(domain, NULL);
728
729         /*
730          * If RHSA is missing, we should default to the device numa domain
731          * as fall back.
732          */
733         if (domain->nid == NUMA_NO_NODE)
734                 domain->nid = domain_update_device_node(domain);
735
736         /*
737          * First-level translation restricts the input-address to a
738          * canonical address (i.e., address bits 63:N have the same
739          * value as address bit [N-1], where N is 48-bits with 4-level
740          * paging and 57-bits with 5-level paging). Hence, skip bit
741          * [N-1].
742          */
743         if (domain_use_first_level(domain))
744                 domain->domain.geometry.aperture_end = __DOMAIN_MAX_ADDR(domain->gaw - 1);
745         else
746                 domain->domain.geometry.aperture_end = __DOMAIN_MAX_ADDR(domain->gaw);
747 }
748
749 struct context_entry *iommu_context_addr(struct intel_iommu *iommu, u8 bus,
750                                          u8 devfn, int alloc)
751 {
752         struct root_entry *root = &iommu->root_entry[bus];
753         struct context_entry *context;
754         u64 *entry;
755
756         entry = &root->lo;
757         if (sm_supported(iommu)) {
758                 if (devfn >= 0x80) {
759                         devfn -= 0x80;
760                         entry = &root->hi;
761                 }
762                 devfn *= 2;
763         }
764         if (*entry & 1)
765                 context = phys_to_virt(*entry & VTD_PAGE_MASK);
766         else {
767                 unsigned long phy_addr;
768                 if (!alloc)
769                         return NULL;
770
771                 context = alloc_pgtable_page(iommu->node);
772                 if (!context)
773                         return NULL;
774
775                 __iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
776                 phy_addr = virt_to_phys((void *)context);
777                 *entry = phy_addr | 1;
778                 __iommu_flush_cache(iommu, entry, sizeof(*entry));
779         }
780         return &context[devfn];
781 }
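/*
 * In scalable mode a root entry is split in two: the low half covers
 * devfn 0x00-0x7f and the high half covers devfn 0x80-0xff, and each
 * scalable-mode context entry occupies two legacy-sized slots, which is
 * why the devfn is doubled before indexing the table above.
 */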
782
783 static bool attach_deferred(struct device *dev)
784 {
785         return dev_iommu_priv_get(dev) == DEFER_DEVICE_DOMAIN_INFO;
786 }
787
788 /**
789  * is_downstream_to_pci_bridge - test if a device belongs to the PCI
790  *                               sub-hierarchy of a candidate PCI-PCI bridge
791  * @dev: candidate PCI device belonging to @bridge PCI sub-hierarchy
792  * @bridge: the candidate PCI-PCI bridge
793  *
794  * Return: true if @dev belongs to @bridge PCI sub-hierarchy, else false.
795  */
796 static bool
797 is_downstream_to_pci_bridge(struct device *dev, struct device *bridge)
798 {
799         struct pci_dev *pdev, *pbridge;
800
801         if (!dev_is_pci(dev) || !dev_is_pci(bridge))
802                 return false;
803
804         pdev = to_pci_dev(dev);
805         pbridge = to_pci_dev(bridge);
806
807         if (pbridge->subordinate &&
808             pbridge->subordinate->number <= pdev->bus->number &&
809             pbridge->subordinate->busn_res.end >= pdev->bus->number)
810                 return true;
811
812         return false;
813 }
814
815 static bool quirk_ioat_snb_local_iommu(struct pci_dev *pdev)
816 {
817         struct dmar_drhd_unit *drhd;
818         u32 vtbar;
819         int rc;
820
821         /* We know that this device on this chipset has its own IOMMU.
822          * If we find it under a different IOMMU, then the BIOS is lying
823          * to us. Hope that the IOMMU for this device is actually
824          * disabled, and it needs no translation...
825          */
826         rc = pci_bus_read_config_dword(pdev->bus, PCI_DEVFN(0, 0), 0xb0, &vtbar);
827         if (rc) {
828                 /* "can't" happen */
829                 dev_info(&pdev->dev, "failed to run vt-d quirk\n");
830                 return false;
831         }
832         vtbar &= 0xffff0000;
833
834         /* we know that this iommu should be at offset 0xa000 from vtbar */
835         drhd = dmar_find_matched_drhd_unit(pdev);
836         if (!drhd || drhd->reg_base_addr - vtbar != 0xa000) {
837                 pr_warn_once(FW_BUG "BIOS assigned incorrect VT-d unit for Intel(R) QuickData Technology device\n");
838                 add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
839                 return true;
840         }
841
842         return false;
843 }
844
845 static bool iommu_is_dummy(struct intel_iommu *iommu, struct device *dev)
846 {
847         if (!iommu || iommu->drhd->ignored)
848                 return true;
849
850         if (dev_is_pci(dev)) {
851                 struct pci_dev *pdev = to_pci_dev(dev);
852
853                 if (pdev->vendor == PCI_VENDOR_ID_INTEL &&
854                     pdev->device == PCI_DEVICE_ID_INTEL_IOAT_SNB &&
855                     quirk_ioat_snb_local_iommu(pdev))
856                         return true;
857         }
858
859         return false;
860 }
861
862 struct intel_iommu *device_to_iommu(struct device *dev, u8 *bus, u8 *devfn)
863 {
864         struct dmar_drhd_unit *drhd = NULL;
865         struct pci_dev *pdev = NULL;
866         struct intel_iommu *iommu;
867         struct device *tmp;
868         u16 segment = 0;
869         int i;
870
871         if (!dev)
872                 return NULL;
873
874         if (dev_is_pci(dev)) {
875                 struct pci_dev *pf_pdev;
876
877                 pdev = pci_real_dma_dev(to_pci_dev(dev));
878
879                 /* VFs aren't listed in scope tables; we need to look up
880                  * the PF instead to find the IOMMU. */
881                 pf_pdev = pci_physfn(pdev);
882                 dev = &pf_pdev->dev;
883                 segment = pci_domain_nr(pdev->bus);
884         } else if (has_acpi_companion(dev))
885                 dev = &ACPI_COMPANION(dev)->dev;
886
887         rcu_read_lock();
888         for_each_iommu(iommu, drhd) {
889                 if (pdev && segment != drhd->segment)
890                         continue;
891
892                 for_each_active_dev_scope(drhd->devices,
893                                           drhd->devices_cnt, i, tmp) {
894                         if (tmp == dev) {
895                                 /* For a VF use its original BDF# not that of the PF
896                                  * which we used for the IOMMU lookup. Strictly speaking
897                                  * we could do this for all PCI devices; we only need to
898                                  * get the BDF# from the scope table for ACPI matches. */
899                                 if (pdev && pdev->is_virtfn)
900                                         goto got_pdev;
901
902                                 if (bus && devfn) {
903                                         *bus = drhd->devices[i].bus;
904                                         *devfn = drhd->devices[i].devfn;
905                                 }
906                                 goto out;
907                         }
908
909                         if (is_downstream_to_pci_bridge(dev, tmp))
910                                 goto got_pdev;
911                 }
912
913                 if (pdev && drhd->include_all) {
914                 got_pdev:
915                         if (bus && devfn) {
916                                 *bus = pdev->bus->number;
917                                 *devfn = pdev->devfn;
918                         }
919                         goto out;
920                 }
921         }
922         iommu = NULL;
923  out:
924         if (iommu_is_dummy(iommu, dev))
925                 iommu = NULL;
926
927         rcu_read_unlock();
928
929         return iommu;
930 }
931
932 static void domain_flush_cache(struct dmar_domain *domain,
933                                void *addr, int size)
934 {
935         if (!domain->iommu_coherency)
936                 clflush_cache_range(addr, size);
937 }
938
939 static int device_context_mapped(struct intel_iommu *iommu, u8 bus, u8 devfn)
940 {
941         struct context_entry *context;
942         int ret = 0;
943         unsigned long flags;
944
945         spin_lock_irqsave(&iommu->lock, flags);
946         context = iommu_context_addr(iommu, bus, devfn, 0);
947         if (context)
948                 ret = context_present(context);
949         spin_unlock_irqrestore(&iommu->lock, flags);
950         return ret;
951 }
952
953 static void free_context_table(struct intel_iommu *iommu)
954 {
955         int i;
956         unsigned long flags;
957         struct context_entry *context;
958
959         spin_lock_irqsave(&iommu->lock, flags);
960         if (!iommu->root_entry) {
961                 goto out;
962         }
963         for (i = 0; i < ROOT_ENTRY_NR; i++) {
964                 context = iommu_context_addr(iommu, i, 0, 0);
965                 if (context)
966                         free_pgtable_page(context);
967
968                 if (!sm_supported(iommu))
969                         continue;
970
971                 context = iommu_context_addr(iommu, i, 0x80, 0);
972                 if (context)
973                         free_pgtable_page(context);
974
975         }
976         free_pgtable_page(iommu->root_entry);
977         iommu->root_entry = NULL;
978 out:
979         spin_unlock_irqrestore(&iommu->lock, flags);
980 }
981
982 static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
983                                       unsigned long pfn, int *target_level)
984 {
985         struct dma_pte *parent, *pte;
986         int level = agaw_to_level(domain->agaw);
987         int offset;
988
989         BUG_ON(!domain->pgd);
990
991         if (!domain_pfn_supported(domain, pfn))
992                 /* Address beyond IOMMU's addressing capabilities. */
993                 return NULL;
994
995         parent = domain->pgd;
996
997         while (1) {
998                 void *tmp_page;
999
1000                 offset = pfn_level_offset(pfn, level);
1001                 pte = &parent[offset];
1002                 if (!*target_level && (dma_pte_superpage(pte) || !dma_pte_present(pte)))
1003                         break;
1004                 if (level == *target_level)
1005                         break;
1006
1007                 if (!dma_pte_present(pte)) {
1008                         uint64_t pteval;
1009
1010                         tmp_page = alloc_pgtable_page(domain->nid);
1011
1012                         if (!tmp_page)
1013                                 return NULL;
1014
1015                         domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE);
1016                         pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE;
1017                         if (domain_use_first_level(domain))
1018                                 pteval |= DMA_FL_PTE_XD | DMA_FL_PTE_US;
1019                         if (cmpxchg64(&pte->val, 0ULL, pteval))
1020                                 /* Someone else set it while we were thinking; use theirs. */
1021                                 free_pgtable_page(tmp_page);
1022                         else
1023                                 domain_flush_cache(domain, pte, sizeof(*pte));
1024                 }
1025                 if (level == 1)
1026                         break;
1027
1028                 parent = phys_to_virt(dma_pte_addr(pte));
1029                 level--;
1030         }
1031
1032         if (!*target_level)
1033                 *target_level = level;
1034
1035         return pte;
1036 }
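/*
 * pfn_to_dma_pte() contract: with *target_level == 0 the walk stops at
 * the first non-present or superpage entry (or the level-1 leaf) and the
 * level reached is written back through target_level; with a non-zero
 * target it returns the PTE at exactly that level, allocating any
 * missing intermediate tables along the way.
 */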
1037
1038 /* return address's pte at specific level */
1039 static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain,
1040                                          unsigned long pfn,
1041                                          int level, int *large_page)
1042 {
1043         struct dma_pte *parent, *pte;
1044         int total = agaw_to_level(domain->agaw);
1045         int offset;
1046
1047         parent = domain->pgd;
1048         while (level <= total) {
1049                 offset = pfn_level_offset(pfn, total);
1050                 pte = &parent[offset];
1051                 if (level == total)
1052                         return pte;
1053
1054                 if (!dma_pte_present(pte)) {
1055                         *large_page = total;
1056                         break;
1057                 }
1058
1059                 if (dma_pte_superpage(pte)) {
1060                         *large_page = total;
1061                         return pte;
1062                 }
1063
1064                 parent = phys_to_virt(dma_pte_addr(pte));
1065                 total--;
1066         }
1067         return NULL;
1068 }
1069
1070 /* clear last level pte, a tlb flush should follow */
1071 static void dma_pte_clear_range(struct dmar_domain *domain,
1072                                 unsigned long start_pfn,
1073                                 unsigned long last_pfn)
1074 {
1075         unsigned int large_page;
1076         struct dma_pte *first_pte, *pte;
1077
1078         BUG_ON(!domain_pfn_supported(domain, start_pfn));
1079         BUG_ON(!domain_pfn_supported(domain, last_pfn));
1080         BUG_ON(start_pfn > last_pfn);
1081
1082         /* we don't need lock here; nobody else touches the iova range */
1083         do {
1084                 large_page = 1;
1085                 first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1, &large_page);
1086                 if (!pte) {
1087                         start_pfn = align_to_level(start_pfn + 1, large_page + 1);
1088                         continue;
1089                 }
1090                 do {
1091                         dma_clear_pte(pte);
1092                         start_pfn += lvl_to_nr_pages(large_page);
1093                         pte++;
1094                 } while (start_pfn <= last_pfn && !first_pte_in_page(pte));
1095
1096                 domain_flush_cache(domain, first_pte,
1097                                    (void *)pte - (void *)first_pte);
1098
1099         } while (start_pfn && start_pfn <= last_pfn);
1100 }
1101
1102 static void dma_pte_free_level(struct dmar_domain *domain, int level,
1103                                int retain_level, struct dma_pte *pte,
1104                                unsigned long pfn, unsigned long start_pfn,
1105                                unsigned long last_pfn)
1106 {
1107         pfn = max(start_pfn, pfn);
1108         pte = &pte[pfn_level_offset(pfn, level)];
1109
1110         do {
1111                 unsigned long level_pfn;
1112                 struct dma_pte *level_pte;
1113
1114                 if (!dma_pte_present(pte) || dma_pte_superpage(pte))
1115                         goto next;
1116
1117                 level_pfn = pfn & level_mask(level);
1118                 level_pte = phys_to_virt(dma_pte_addr(pte));
1119
1120                 if (level > 2) {
1121                         dma_pte_free_level(domain, level - 1, retain_level,
1122                                            level_pte, level_pfn, start_pfn,
1123                                            last_pfn);
1124                 }
1125
1126                 /*
1127                  * Free the page table if we're below the level we want to
1128                  * retain and the range covers the entire table.
1129                  */
1130                 if (level < retain_level && !(start_pfn > level_pfn ||
1131                       last_pfn < level_pfn + level_size(level) - 1)) {
1132                         dma_clear_pte(pte);
1133                         domain_flush_cache(domain, pte, sizeof(*pte));
1134                         free_pgtable_page(level_pte);
1135                 }
1136 next:
1137                 pfn += level_size(level);
1138         } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1139 }
1140
1141 /*
1142  * clear last level (leaf) ptes and free page table pages below the
1143  * level we wish to keep intact.
1144  */
1145 static void dma_pte_free_pagetable(struct dmar_domain *domain,
1146                                    unsigned long start_pfn,
1147                                    unsigned long last_pfn,
1148                                    int retain_level)
1149 {
1150         BUG_ON(!domain_pfn_supported(domain, start_pfn));
1151         BUG_ON(!domain_pfn_supported(domain, last_pfn));
1152         BUG_ON(start_pfn > last_pfn);
1153
1154         dma_pte_clear_range(domain, start_pfn, last_pfn);
1155
1156         /* We don't need lock here; nobody else touches the iova range */
1157         dma_pte_free_level(domain, agaw_to_level(domain->agaw), retain_level,
1158                            domain->pgd, 0, start_pfn, last_pfn);
1159
1160         /* free pgd */
1161         if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1162                 free_pgtable_page(domain->pgd);
1163                 domain->pgd = NULL;
1164         }
1165 }
1166
1167 /* When a page at a given level is being unlinked from its parent, we don't
1168    need to *modify* it at all. All we need to do is make a list of all the
1169    pages which can be freed just as soon as we've flushed the IOTLB and we
1170    know the hardware page-walk will no longer touch them.
1171    The 'pte' argument is the *parent* PTE, pointing to the page that is to
1172    be freed. */
1173 static struct page *dma_pte_list_pagetables(struct dmar_domain *domain,
1174                                             int level, struct dma_pte *pte,
1175                                             struct page *freelist)
1176 {
1177         struct page *pg;
1178
1179         pg = pfn_to_page(dma_pte_addr(pte) >> PAGE_SHIFT);
1180         pg->freelist = freelist;
1181         freelist = pg;
1182
1183         if (level == 1)
1184                 return freelist;
1185
1186         pte = page_address(pg);
1187         do {
1188                 if (dma_pte_present(pte) && !dma_pte_superpage(pte))
1189                         freelist = dma_pte_list_pagetables(domain, level - 1,
1190                                                            pte, freelist);
1191                 pte++;
1192         } while (!first_pte_in_page(pte));
1193
1194         return freelist;
1195 }
1196
1197 static struct page *dma_pte_clear_level(struct dmar_domain *domain, int level,
1198                                         struct dma_pte *pte, unsigned long pfn,
1199                                         unsigned long start_pfn,
1200                                         unsigned long last_pfn,
1201                                         struct page *freelist)
1202 {
1203         struct dma_pte *first_pte = NULL, *last_pte = NULL;
1204
1205         pfn = max(start_pfn, pfn);
1206         pte = &pte[pfn_level_offset(pfn, level)];
1207
1208         do {
1209                 unsigned long level_pfn;
1210
1211                 if (!dma_pte_present(pte))
1212                         goto next;
1213
1214                 level_pfn = pfn & level_mask(level);
1215
1216                 /* If range covers entire pagetable, free it */
1217                 if (start_pfn <= level_pfn &&
1218                     last_pfn >= level_pfn + level_size(level) - 1) {
1219                         /* These subordinate page tables are going away entirely. Don't
1220                            bother to clear them; we're just going to *free* them. */
1221                         if (level > 1 && !dma_pte_superpage(pte))
1222                                 freelist = dma_pte_list_pagetables(domain, level - 1, pte, freelist);
1223
1224                         dma_clear_pte(pte);
1225                         if (!first_pte)
1226                                 first_pte = pte;
1227                         last_pte = pte;
1228                 } else if (level > 1) {
1229                         /* Recurse down into a level that isn't *entirely* obsolete */
1230                         freelist = dma_pte_clear_level(domain, level - 1,
1231                                                        phys_to_virt(dma_pte_addr(pte)),
1232                                                        level_pfn, start_pfn, last_pfn,
1233                                                        freelist);
1234                 }
1235 next:
1236                 pfn += level_size(level);
1237         } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1238
1239         if (first_pte)
1240                 domain_flush_cache(domain, first_pte,
1241                                    (void *)++last_pte - (void *)first_pte);
1242
1243         return freelist;
1244 }
1245
1246 /* We can't just free the pages because the IOMMU may still be walking
1247    the page tables, and may have cached the intermediate levels. The
1248    pages can only be freed after the IOTLB flush has been done. */
1249 static struct page *domain_unmap(struct dmar_domain *domain,
1250                                  unsigned long start_pfn,
1251                                  unsigned long last_pfn,
1252                                  struct page *freelist)
1253 {
1254         BUG_ON(!domain_pfn_supported(domain, start_pfn));
1255         BUG_ON(!domain_pfn_supported(domain, last_pfn));
1256         BUG_ON(start_pfn > last_pfn);
1257
1258         /* we don't need lock here; nobody else touches the iova range */
1259         freelist = dma_pte_clear_level(domain, agaw_to_level(domain->agaw),
1260                                        domain->pgd, 0, start_pfn, last_pfn,
1261                                        freelist);
1262
1263         /* free pgd */
1264         if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1265                 struct page *pgd_page = virt_to_page(domain->pgd);
1266                 pgd_page->freelist = freelist;
1267                 freelist = pgd_page;
1268
1269                 domain->pgd = NULL;
1270         }
1271
1272         return freelist;
1273 }
1274
1275 static void dma_free_pagelist(struct page *freelist)
1276 {
1277         struct page *pg;
1278
1279         while ((pg = freelist)) {
1280                 freelist = pg->freelist;
1281                 free_pgtable_page(page_address(pg));
1282         }
1283 }
1284
1285 /* iommu handling */
1286 static int iommu_alloc_root_entry(struct intel_iommu *iommu)
1287 {
1288         struct root_entry *root;
1289         unsigned long flags;
1290
1291         root = (struct root_entry *)alloc_pgtable_page(iommu->node);
1292         if (!root) {
1293                 pr_err("Allocating root entry for %s failed\n",
1294                         iommu->name);
1295                 return -ENOMEM;
1296         }
1297
1298         __iommu_flush_cache(iommu, root, ROOT_SIZE);
1299
1300         spin_lock_irqsave(&iommu->lock, flags);
1301         iommu->root_entry = root;
1302         spin_unlock_irqrestore(&iommu->lock, flags);
1303
1304         return 0;
1305 }
1306
1307 static void iommu_set_root_entry(struct intel_iommu *iommu)
1308 {
1309         u64 addr;
1310         u32 sts;
1311         unsigned long flag;
1312
1313         addr = virt_to_phys(iommu->root_entry);
1314         if (sm_supported(iommu))
1315                 addr |= DMA_RTADDR_SMT;
1316
1317         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1318         dmar_writeq(iommu->reg + DMAR_RTADDR_REG, addr);
1319
1320         writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG);
1321
1322         /* Make sure hardware complete it */
1323         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1324                       readl, (sts & DMA_GSTS_RTPS), sts);
1325
1326         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1327 }
1328
1329 void iommu_flush_write_buffer(struct intel_iommu *iommu)
1330 {
1331         u32 val;
1332         unsigned long flag;
1333
1334         if (!rwbf_quirk && !cap_rwbf(iommu->cap))
1335                 return;
1336
1337         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1338         writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG);
1339
1340         /* Make sure hardware complete it */
1341         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1342                       readl, (!(val & DMA_GSTS_WBFS)), val);
1343
1344         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1345 }
1346
1347 /* the return value determines if we need a write buffer flush */
1348 static void __iommu_flush_context(struct intel_iommu *iommu,
1349                                   u16 did, u16 source_id, u8 function_mask,
1350                                   u64 type)
1351 {
1352         u64 val = 0;
1353         unsigned long flag;
1354
1355         switch (type) {
1356         case DMA_CCMD_GLOBAL_INVL:
1357                 val = DMA_CCMD_GLOBAL_INVL;
1358                 break;
1359         case DMA_CCMD_DOMAIN_INVL:
1360                 val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
1361                 break;
1362         case DMA_CCMD_DEVICE_INVL:
1363                 val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
1364                         | DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
1365                 break;
1366         default:
1367                 BUG();
1368         }
1369         val |= DMA_CCMD_ICC;
1370
1371         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1372         dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
1373
1374         /* Make sure hardware complete it */
1375         IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
1376                 dmar_readq, (!(val & DMA_CCMD_ICC)), val);
1377
1378         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1379 }
1380
1381 /* the return value determines if we need a write buffer flush */
1382 static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
1383                                 u64 addr, unsigned int size_order, u64 type)
1384 {
1385         int tlb_offset = ecap_iotlb_offset(iommu->ecap);
1386         u64 val = 0, val_iva = 0;
1387         unsigned long flag;
1388
1389         switch (type) {
1390         case DMA_TLB_GLOBAL_FLUSH:
1391                 /* global flush doesn't need to set IVA_REG */
1392                 val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
1393                 break;
1394         case DMA_TLB_DSI_FLUSH:
1395                 val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1396                 break;
1397         case DMA_TLB_PSI_FLUSH:
1398                 val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1399                 /* IH bit is passed in as part of address */
1400                 val_iva = size_order | addr;
1401                 break;
1402         default:
1403                 BUG();
1404         }
1405         /* Note: set drain read/write */
1406 #if 0
1407         /*
1408          * This is probably meant to be extra safe. Looks like we can
1409          * ignore it without any impact.
1410          */
1411         if (cap_read_drain(iommu->cap))
1412                 val |= DMA_TLB_READ_DRAIN;
1413 #endif
1414         if (cap_write_drain(iommu->cap))
1415                 val |= DMA_TLB_WRITE_DRAIN;
1416
1417         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1418         /* Note: Only uses first TLB reg currently */
1419         if (val_iva)
1420                 dmar_writeq(iommu->reg + tlb_offset, val_iva);
1421         dmar_writeq(iommu->reg + tlb_offset + 8, val);
1422
1423         /* Make sure hardware complete it */
1424         IOMMU_WAIT_OP(iommu, tlb_offset + 8,
1425                 dmar_readq, (!(val & DMA_TLB_IVT)), val);
1426
1427         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1428
1429         /* check IOTLB invalidation granularity */
1430         if (DMA_TLB_IAIG(val) == 0)
1431                 pr_err("Flush IOTLB failed\n");
1432         if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
1433                 pr_debug("TLB flush request %Lx, actual %Lx\n",
1434                         (unsigned long long)DMA_TLB_IIRG(type),
1435                         (unsigned long long)DMA_TLB_IAIG(val));
1436 }
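/*
 * Both flush primitives above use the register-based invalidation
 * interface: the command is written to the context-command or IOTLB
 * register and IOMMU_WAIT_OP() polls until the hardware clears the
 * invalidate bit, all under register_lock with interrupts disabled.
 */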
1437
1438 static struct device_domain_info *
1439 iommu_support_dev_iotlb(struct dmar_domain *domain, struct intel_iommu *iommu,
1440                          u8 bus, u8 devfn)
1441 {
1442         struct device_domain_info *info;
1443
1444         assert_spin_locked(&device_domain_lock);
1445
1446         if (!iommu->qi)
1447                 return NULL;
1448
1449         list_for_each_entry(info, &domain->devices, link)
1450                 if (info->iommu == iommu && info->bus == bus &&
1451                     info->devfn == devfn) {
1452                         if (info->ats_supported && info->dev)
1453                                 return info;
1454                         break;
1455                 }
1456
1457         return NULL;
1458 }
1459
1460 static void domain_update_iotlb(struct dmar_domain *domain)
1461 {
1462         struct device_domain_info *info;
1463         bool has_iotlb_device = false;
1464
1465         assert_spin_locked(&device_domain_lock);
1466
1467         list_for_each_entry(info, &domain->devices, link) {
1468                 struct pci_dev *pdev;
1469
1470                 if (!info->dev || !dev_is_pci(info->dev))
1471                         continue;
1472
1473                 pdev = to_pci_dev(info->dev);
1474                 if (pdev->ats_enabled) {
1475                         has_iotlb_device = true;
1476                         break;
1477                 }
1478         }
1479
1480         domain->has_iotlb_device = has_iotlb_device;
1481 }
1482
1483 static void iommu_enable_dev_iotlb(struct device_domain_info *info)
1484 {
1485         struct pci_dev *pdev;
1486
1487         assert_spin_locked(&device_domain_lock);
1488
1489         if (!info || !dev_is_pci(info->dev))
1490                 return;
1491
1492         pdev = to_pci_dev(info->dev);
1493         /* For an IOMMU that supports device IOTLB throttling (DIT), we assign
1494          * PFSID to the invalidation desc of a VF such that IOMMU HW can gauge
1495          * queue depth at PF level. If DIT is not set, PFSID will be treated as
1496          * reserved, which should be set to 0.
1497          */
1498         if (!ecap_dit(info->iommu->ecap))
1499                 info->pfsid = 0;
1500         else {
1501                 struct pci_dev *pf_pdev;
1502
1503                 /* pdev will be returned if device is not a vf */
1504                 pf_pdev = pci_physfn(pdev);
1505                 info->pfsid = pci_dev_id(pf_pdev);
1506         }
1507
1508 #ifdef CONFIG_INTEL_IOMMU_SVM
1509         /* The PCIe spec, in its wisdom, declares that the behaviour of
1510            the device if you enable PASID support after ATS support is
1511            undefined. So always enable PASID support on devices which
1512            have it, even if we can't yet know if we're ever going to
1513            use it. */
1514         if (info->pasid_supported && !pci_enable_pasid(pdev, info->pasid_supported & ~1))
1515                 info->pasid_enabled = 1;
1516
1517         if (info->pri_supported &&
1518             (info->pasid_enabled ? pci_prg_resp_pasid_required(pdev) : 1)  &&
1519             !pci_reset_pri(pdev) && !pci_enable_pri(pdev, 32))
1520                 info->pri_enabled = 1;
1521 #endif
1522         if (info->ats_supported && pci_ats_page_aligned(pdev) &&
1523             !pci_enable_ats(pdev, VTD_PAGE_SHIFT)) {
1524                 info->ats_enabled = 1;
1525                 domain_update_iotlb(info->domain);
1526                 info->ats_qdep = pci_ats_queue_depth(pdev);
1527         }
1528 }
1529
1530 static void iommu_disable_dev_iotlb(struct device_domain_info *info)
1531 {
1532         struct pci_dev *pdev;
1533
1534         assert_spin_locked(&device_domain_lock);
1535
1536         if (!dev_is_pci(info->dev))
1537                 return;
1538
1539         pdev = to_pci_dev(info->dev);
1540
1541         if (info->ats_enabled) {
1542                 pci_disable_ats(pdev);
1543                 info->ats_enabled = 0;
1544                 domain_update_iotlb(info->domain);
1545         }
1546 #ifdef CONFIG_INTEL_IOMMU_SVM
1547         if (info->pri_enabled) {
1548                 pci_disable_pri(pdev);
1549                 info->pri_enabled = 0;
1550         }
1551         if (info->pasid_enabled) {
1552                 pci_disable_pasid(pdev);
1553                 info->pasid_enabled = 0;
1554         }
1555 #endif
1556 }
1557
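/*
 * Flush the device IOTLB of every ATS-enabled device attached to
 * @domain. Skipped entirely when domain->has_iotlb_device indicates
 * that no attached device has ATS enabled.
 */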
1558 static void iommu_flush_dev_iotlb(struct dmar_domain *domain,
1559                                   u64 addr, unsigned mask)
1560 {
1561         u16 sid, qdep;
1562         unsigned long flags;
1563         struct device_domain_info *info;
1564
1565         if (!domain->has_iotlb_device)
1566                 return;
1567
1568         spin_lock_irqsave(&device_domain_lock, flags);
1569         list_for_each_entry(info, &domain->devices, link) {
1570                 if (!info->ats_enabled)
1571                         continue;
1572
1573                 sid = info->bus << 8 | info->devfn;
1574                 qdep = info->ats_qdep;
1575                 qi_flush_dev_iotlb(info->iommu, sid, info->pfsid,
1576                                 qdep, addr, mask);
1577         }
1578         spin_unlock_irqrestore(&device_domain_lock, flags);
1579 }
1580
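/*
 * PASID-based IOTLB invalidation for first-level translation: flush
 * the default PASID (if set) and, when devices are attached, the
 * RID2PASID entry used for requests without PASID.
 */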
1581 static void domain_flush_piotlb(struct intel_iommu *iommu,
1582                                 struct dmar_domain *domain,
1583                                 u64 addr, unsigned long npages, bool ih)
1584 {
1585         u16 did = domain->iommu_did[iommu->seq_id];
1586
1587         if (domain->default_pasid)
1588                 qi_flush_piotlb(iommu, did, domain->default_pasid,
1589                                 addr, npages, ih);
1590
1591         if (!list_empty(&domain->devices))
1592                 qi_flush_piotlb(iommu, did, PASID_RID2PASID, addr, npages, ih);
1593 }
1594
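/*
 * Flush the IOTLB for a range of @pages pages starting at @pfn. For
 * first-level domains this is a PASID-based flush; otherwise it is a
 * page-selective (PSI) flush, falling back to a domain-selective flush
 * when PSI cannot cover the request. The device IOTLBs are flushed as
 * well, except for map operations under caching mode.
 */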
1595 static void iommu_flush_iotlb_psi(struct intel_iommu *iommu,
1596                                   struct dmar_domain *domain,
1597                                   unsigned long pfn, unsigned int pages,
1598                                   int ih, int map)
1599 {
1600         unsigned int mask = ilog2(__roundup_pow_of_two(pages));
1601         uint64_t addr = (uint64_t)pfn << VTD_PAGE_SHIFT;
1602         u16 did = domain->iommu_did[iommu->seq_id];
1603
1604         BUG_ON(pages == 0);
1605
1606         if (ih)
1607                 ih = 1 << 6;
1608
1609         if (domain_use_first_level(domain)) {
1610                 domain_flush_piotlb(iommu, domain, addr, pages, ih);
1611         } else {
1612                 /*
1613                  * Fall back to domain-selective flush if there is no PSI support
1614                  * or the size is too big. PSI requires the page size to be 2^x,
1615                  * and the base address to be naturally aligned to the size.
1616                  */
1617                 if (!cap_pgsel_inv(iommu->cap) ||
1618                     mask > cap_max_amask_val(iommu->cap))
1619                         iommu->flush.flush_iotlb(iommu, did, 0, 0,
1620                                                         DMA_TLB_DSI_FLUSH);
1621                 else
1622                         iommu->flush.flush_iotlb(iommu, did, addr | ih, mask,
1623                                                         DMA_TLB_PSI_FLUSH);
1624         }
1625
1626         /*
1627          * In caching mode, changes of pages from non-present to present require a
1628          * flush. However, the device IOTLB doesn't need to be flushed in this case.
1629          */
1630         if (!cap_caching_mode(iommu->cap) || !map)
1631                 iommu_flush_dev_iotlb(domain, addr, mask);
1632 }
1633
1634 /* Notification for newly created mappings */
1635 static inline void __mapping_notify_one(struct intel_iommu *iommu,
1636                                         struct dmar_domain *domain,
1637                                         unsigned long pfn, unsigned int pages)
1638 {
1639         /*
1640          * It's a non-present to present mapping. Only flush if caching mode is
1641          * enabled and the domain uses second-level translation.
1642          */
1643         if (cap_caching_mode(iommu->cap) && !domain_use_first_level(domain))
1644                 iommu_flush_iotlb_psi(iommu, domain, pfn, pages, 0, 1);
1645         else
1646                 iommu_flush_write_buffer(iommu);
1647 }
1648
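/*
 * Flush the whole IOTLB footprint of the domain on every IOMMU it is
 * attached to, plus the device IOTLBs when the hardware is not in
 * caching mode.
 */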
1649 static void intel_flush_iotlb_all(struct iommu_domain *domain)
1650 {
1651         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
1652         int idx;
1653
1654         for_each_domain_iommu(idx, dmar_domain) {
1655                 struct intel_iommu *iommu = g_iommus[idx];
1656                 u16 did = dmar_domain->iommu_did[iommu->seq_id];
1657
1658                 if (domain_use_first_level(dmar_domain))
1659                         domain_flush_piotlb(iommu, dmar_domain, 0, -1, 0);
1660                 else
1661                         iommu->flush.flush_iotlb(iommu, did, 0, 0,
1662                                                  DMA_TLB_DSI_FLUSH);
1663
1664                 if (!cap_caching_mode(iommu->cap))
1665                         iommu_flush_dev_iotlb(get_iommu_domain(iommu, did),
1666                                               0, MAX_AGAW_PFN_WIDTH);
1667         }
1668 }
1669
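/*
 * Disable the protected low/high memory regions, if supported, by
 * clearing the Enable Protected Memory bit and waiting for the
 * Protected Region Status bit to clear.
 */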
1670 static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
1671 {
1672         u32 pmen;
1673         unsigned long flags;
1674
1675         if (!cap_plmr(iommu->cap) && !cap_phmr(iommu->cap))
1676                 return;
1677
1678         raw_spin_lock_irqsave(&iommu->register_lock, flags);
1679         pmen = readl(iommu->reg + DMAR_PMEN_REG);
1680         pmen &= ~DMA_PMEN_EPM;
1681         writel(pmen, iommu->reg + DMAR_PMEN_REG);
1682
1683         /* wait for the protected region status bit to clear */
1684         IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
1685                 readl, !(pmen & DMA_PMEN_PRS), pmen);
1686
1687         raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1688 }
1689
1690 static void iommu_enable_translation(struct intel_iommu *iommu)
1691 {
1692         u32 sts;
1693         unsigned long flags;
1694
1695         raw_spin_lock_irqsave(&iommu->register_lock, flags);
1696         iommu->gcmd |= DMA_GCMD_TE;
1697         writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1698
1699         /* Make sure the hardware completes it */
1700         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1701                       readl, (sts & DMA_GSTS_TES), sts);
1702
1703         raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1704 }
1705
1706 static void iommu_disable_translation(struct intel_iommu *iommu)
1707 {
1708         u32 sts;
1709         unsigned long flag;
1710
1711         if (iommu_skip_te_disable && iommu->drhd->gfx_dedicated &&
1712             (cap_read_drain(iommu->cap) || cap_write_drain(iommu->cap)))
1713                 return;
1714
1715         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1716         iommu->gcmd &= ~DMA_GCMD_TE;
1717         writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1718
1719         /* Make sure the hardware completes it */
1720         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1721                       readl, (!(sts & DMA_GSTS_TES)), sts);
1722
1723         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1724 }
1725
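/*
 * Allocate the per-IOMMU domain-id bitmap and the two-level array of
 * domain pointers, and reserve the domain ids that must not be handed
 * out to ordinary domains (id 0, and FLPT_DEFAULT_DID in scalable mode).
 */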
1726 static int iommu_init_domains(struct intel_iommu *iommu)
1727 {
1728         u32 ndomains, nlongs;
1729         size_t size;
1730
1731         ndomains = cap_ndoms(iommu->cap);
1732         pr_debug("%s: Number of Domains supported <%d>\n",
1733                  iommu->name, ndomains);
1734         nlongs = BITS_TO_LONGS(ndomains);
1735
1736         spin_lock_init(&iommu->lock);
1737
1738         iommu->domain_ids = kcalloc(nlongs, sizeof(unsigned long), GFP_KERNEL);
1739         if (!iommu->domain_ids) {
1740                 pr_err("%s: Allocating domain id array failed\n",
1741                        iommu->name);
1742                 return -ENOMEM;
1743         }
1744
1745         size = (ALIGN(ndomains, 256) >> 8) * sizeof(struct dmar_domain **);
1746         iommu->domains = kzalloc(size, GFP_KERNEL);
1747
1748         if (iommu->domains) {
1749                 size = 256 * sizeof(struct dmar_domain *);
1750                 iommu->domains[0] = kzalloc(size, GFP_KERNEL);
1751         }
1752
1753         if (!iommu->domains || !iommu->domains[0]) {
1754                 pr_err("%s: Allocating domain array failed\n",
1755                        iommu->name);
1756                 kfree(iommu->domain_ids);
1757                 kfree(iommu->domains);
1758                 iommu->domain_ids = NULL;
1759                 iommu->domains    = NULL;
1760                 return -ENOMEM;
1761         }
1762
1763         /*
1764          * If Caching mode is set, then invalid translations are tagged
1765          * with domain-id 0, hence we need to pre-allocate it. We also
1766          * use domain-id 0 as a marker for non-allocated domain-id, so
1767          * make sure it is not used for a real domain.
1768          */
1769         set_bit(0, iommu->domain_ids);
1770
1771         /*
1772          * VT-d spec rev 3.0 (section 6.2.3.1) requires that each PASID
1773          * entry for first-level or pass-through translation modes should
1774          * be programmed with a domain id different from those used for
1775          * second-level or nested translation. We reserve a domain id for
1776          * this purpose.
1777          */
1778         if (sm_supported(iommu))
1779                 set_bit(FLPT_DEFAULT_DID, iommu->domain_ids);
1780
1781         return 0;
1782 }
1783
1784 static void disable_dmar_iommu(struct intel_iommu *iommu)
1785 {
1786         struct device_domain_info *info, *tmp;
1787         unsigned long flags;
1788
1789         if (!iommu->domains || !iommu->domain_ids)
1790                 return;
1791
1792         spin_lock_irqsave(&device_domain_lock, flags);
1793         list_for_each_entry_safe(info, tmp, &device_domain_list, global) {
1794                 if (info->iommu != iommu)
1795                         continue;
1796
1797                 if (!info->dev || !info->domain)
1798                         continue;
1799
1800                 __dmar_remove_one_dev_info(info);
1801         }
1802         spin_unlock_irqrestore(&device_domain_lock, flags);
1803
1804         if (iommu->gcmd & DMA_GCMD_TE)
1805                 iommu_disable_translation(iommu);
1806 }
1807
1808 static void free_dmar_iommu(struct intel_iommu *iommu)
1809 {
1810         if ((iommu->domains) && (iommu->domain_ids)) {
1811                 int elems = ALIGN(cap_ndoms(iommu->cap), 256) >> 8;
1812                 int i;
1813
1814                 for (i = 0; i < elems; i++)
1815                         kfree(iommu->domains[i]);
1816                 kfree(iommu->domains);
1817                 kfree(iommu->domain_ids);
1818                 iommu->domains = NULL;
1819                 iommu->domain_ids = NULL;
1820         }
1821
1822         g_iommus[iommu->seq_id] = NULL;
1823
1824         /* free context mapping */
1825         free_context_table(iommu);
1826
1827 #ifdef CONFIG_INTEL_IOMMU_SVM
1828         if (pasid_supported(iommu)) {
1829                 if (ecap_prs(iommu->ecap))
1830                         intel_svm_finish_prq(iommu);
1831         }
1832         if (vccap_pasid(iommu->vccap))
1833                 ioasid_unregister_allocator(&iommu->pasid_allocator);
1834
1835 #endif
1836 }
1837
1838 /*
1839  * Check and return whether first-level translation is used by default
1840  * for DMA.
1841  */
1842 static bool first_level_by_default(void)
1843 {
1844         struct dmar_drhd_unit *drhd;
1845         struct intel_iommu *iommu;
1846         static int first_level_support = -1;
1847
1848         if (likely(first_level_support != -1))
1849                 return first_level_support;
1850
1851         first_level_support = 1;
1852
1853         rcu_read_lock();
1854         for_each_active_iommu(iommu, drhd) {
1855                 if (!sm_supported(iommu) || !ecap_flts(iommu->ecap)) {
1856                         first_level_support = 0;
1857                         break;
1858                 }
1859         }
1860         rcu_read_unlock();
1861
1862         return first_level_support;
1863 }
1864
1865 static struct dmar_domain *alloc_domain(int flags)
1866 {
1867         struct dmar_domain *domain;
1868
1869         domain = alloc_domain_mem();
1870         if (!domain)
1871                 return NULL;
1872
1873         memset(domain, 0, sizeof(*domain));
1874         domain->nid = NUMA_NO_NODE;
1875         domain->flags = flags;
1876         if (first_level_by_default())
1877                 domain->flags |= DOMAIN_FLAG_USE_FIRST_LEVEL;
1878         domain->has_iotlb_device = false;
1879         INIT_LIST_HEAD(&domain->devices);
1880
1881         return domain;
1882 }
1883
1884 /* Must be called with device_domain_lock and iommu->lock held */
1885 static int domain_attach_iommu(struct dmar_domain *domain,
1886                                struct intel_iommu *iommu)
1887 {
1888         unsigned long ndomains;
1889         int num;
1890
1891         assert_spin_locked(&device_domain_lock);
1892         assert_spin_locked(&iommu->lock);
1893
1894         domain->iommu_refcnt[iommu->seq_id] += 1;
1895         domain->iommu_count += 1;
1896         if (domain->iommu_refcnt[iommu->seq_id] == 1) {
1897                 ndomains = cap_ndoms(iommu->cap);
1898                 num      = find_first_zero_bit(iommu->domain_ids, ndomains);
1899
1900                 if (num >= ndomains) {
1901                         pr_err("%s: No free domain ids\n", iommu->name);
1902                         domain->iommu_refcnt[iommu->seq_id] -= 1;
1903                         domain->iommu_count -= 1;
1904                         return -ENOSPC;
1905                 }
1906
1907                 set_bit(num, iommu->domain_ids);
1908                 set_iommu_domain(iommu, num, domain);
1909
1910                 domain->iommu_did[iommu->seq_id] = num;
1911                 domain->nid                      = iommu->node;
1912
1913                 domain_update_iommu_cap(domain);
1914         }
1915
1916         return 0;
1917 }
1918
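/* Must be called with device_domain_lock and iommu->lock held */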
1919 static int domain_detach_iommu(struct dmar_domain *domain,
1920                                struct intel_iommu *iommu)
1921 {
1922         int num, count;
1923
1924         assert_spin_locked(&device_domain_lock);
1925         assert_spin_locked(&iommu->lock);
1926
1927         domain->iommu_refcnt[iommu->seq_id] -= 1;
1928         count = --domain->iommu_count;
1929         if (domain->iommu_refcnt[iommu->seq_id] == 0) {
1930                 num = domain->iommu_did[iommu->seq_id];
1931                 clear_bit(num, iommu->domain_ids);
1932                 set_iommu_domain(iommu, num, NULL);
1933
1934                 domain_update_iommu_cap(domain);
1935                 domain->iommu_did[iommu->seq_id] = 0;
1936         }
1937
1938         return count;
1939 }
1940
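/*
 * Round a guest address width up to the nearest adjusted guest address
 * width supported by the page-table structure (12 + a multiple of 9
 * bits), capped at 64 bits.
 */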
1941 static inline int guestwidth_to_adjustwidth(int gaw)
1942 {
1943         int agaw;
1944         int r = (gaw - 12) % 9;
1945
1946         if (r == 0)
1947                 agaw = gaw;
1948         else
1949                 agaw = gaw + 9 - r;
1950         if (agaw > 64)
1951                 agaw = 64;
1952         return agaw;
1953 }
1954
1955 static void domain_exit(struct dmar_domain *domain)
1956 {
1957
1958         /* Remove associated devices and clear attached or cached domains */
1959         domain_remove_dev_info(domain);
1960
1961         /* destroy iovas */
1962         if (domain->domain.type == IOMMU_DOMAIN_DMA)
1963                 iommu_put_dma_cookie(&domain->domain);
1964
1965         if (domain->pgd) {
1966                 struct page *freelist;
1967
1968                 freelist = domain_unmap(domain, 0,
1969                                         DOMAIN_MAX_PFN(domain->gaw), NULL);
1970                 dma_free_pagelist(freelist);
1971         }
1972
1973         free_domain_mem(domain);
1974 }
1975
1976 /*
1977  * Get the PASID directory size for a scalable-mode context entry.
1978  * A value of X in the PDTS field of a scalable-mode context entry
1979  * indicates a PASID directory with 2^(X + 7) entries.
1980  */
1981 static inline unsigned long context_get_sm_pds(struct pasid_table *table)
1982 {
1983         int pds, max_pde;
1984
1985         max_pde = table->max_pasid >> PASID_PDE_SHIFT;
1986         pds = find_first_bit((unsigned long *)&max_pde, MAX_NR_PASID_BITS);
1987         if (pds < 7)
1988                 return 0;
1989
1990         return pds - 7;
1991 }
1992
1993 /*
1994  * Set the RID_PASID field of a scalable-mode context entry. The
1995  * IOMMU hardware will use the PASID value set in this field when
1996  * translating DMA requests without PASID.
1997  */
1998 static inline void
1999 context_set_sm_rid2pasid(struct context_entry *context, unsigned long pasid)
2000 {
2001         context->hi |= pasid & ((1 << 20) - 1);
2002 }
2003
2004 /*
2005  * Set the DTE(Device-TLB Enable) field of a scalable mode context
2006  * entry.
2007  */
2008 static inline void context_set_sm_dte(struct context_entry *context)
2009 {
2010         context->lo |= (1 << 2);
2011 }
2012
2013 /*
2014  * Set the PRE(Page Request Enable) field of a scalable mode context
2015  * entry.
2016  */
2017 static inline void context_set_sm_pre(struct context_entry *context)
2018 {
2019         context->lo |= (1 << 4);
2020 }
2021
2022 /* Convert value to context PASID directory size field coding. */
2023 #define context_pdts(pds)       (((pds) & 0x7) << 9)
2024
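/*
 * Install the context entry that maps (bus, devfn) on @iommu to
 * @domain: a PASID-directory pointer in scalable mode, or the legacy
 * second-level page-table pointer (or pass-through) otherwise.
 */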
2025 static int domain_context_mapping_one(struct dmar_domain *domain,
2026                                       struct intel_iommu *iommu,
2027                                       struct pasid_table *table,
2028                                       u8 bus, u8 devfn)
2029 {
2030         u16 did = domain->iommu_did[iommu->seq_id];
2031         int translation = CONTEXT_TT_MULTI_LEVEL;
2032         struct device_domain_info *info = NULL;
2033         struct context_entry *context;
2034         unsigned long flags;
2035         int ret;
2036
2037         WARN_ON(did == 0);
2038
2039         if (hw_pass_through && domain_type_is_si(domain))
2040                 translation = CONTEXT_TT_PASS_THROUGH;
2041
2042         pr_debug("Set context mapping for %02x:%02x.%d\n",
2043                 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
2044
2045         BUG_ON(!domain->pgd);
2046
2047         spin_lock_irqsave(&device_domain_lock, flags);
2048         spin_lock(&iommu->lock);
2049
2050         ret = -ENOMEM;
2051         context = iommu_context_addr(iommu, bus, devfn, 1);
2052         if (!context)
2053                 goto out_unlock;
2054
2055         ret = 0;
2056         if (context_present(context))
2057                 goto out_unlock;
2058
2059         /*
2060          * For kdump cases, old valid entries may be cached due to the
2061          * in-flight DMA and the copied page tables, but there is no unmapping
2062          * behaviour for them, so we need an explicit cache flush for the
2063          * newly-mapped device. For kdump, at this point, the device is
2064          * supposed to have finished its reset at the driver probe stage, so
2065          * no in-flight DMA will exist and we don't need to worry about it
2066          * hereafter.
2067          */
2068         if (context_copied(context)) {
2069                 u16 did_old = context_domain_id(context);
2070
2071                 if (did_old < cap_ndoms(iommu->cap)) {
2072                         iommu->flush.flush_context(iommu, did_old,
2073                                                    (((u16)bus) << 8) | devfn,
2074                                                    DMA_CCMD_MASK_NOBIT,
2075                                                    DMA_CCMD_DEVICE_INVL);
2076                         iommu->flush.flush_iotlb(iommu, did_old, 0, 0,
2077                                                  DMA_TLB_DSI_FLUSH);
2078                 }
2079         }
2080
2081         context_clear_entry(context);
2082
2083         if (sm_supported(iommu)) {
2084                 unsigned long pds;
2085
2086                 WARN_ON(!table);
2087
2088                 /* Setup the PASID DIR pointer: */
2089                 pds = context_get_sm_pds(table);
2090                 context->lo = (u64)virt_to_phys(table->table) |
2091                                 context_pdts(pds);
2092
2093                 /* Setup the RID_PASID field: */
2094                 context_set_sm_rid2pasid(context, PASID_RID2PASID);
2095
2096                 /*
2097                  * Setup the Device-TLB enable bit and Page request
2098                  * Enable bit:
2099                  */
2100                 info = iommu_support_dev_iotlb(domain, iommu, bus, devfn);
2101                 if (info && info->ats_supported)
2102                         context_set_sm_dte(context);
2103                 if (info && info->pri_supported)
2104                         context_set_sm_pre(context);
2105         } else {
2106                 struct dma_pte *pgd = domain->pgd;
2107                 int agaw;
2108
2109                 context_set_domain_id(context, did);
2110
2111                 if (translation != CONTEXT_TT_PASS_THROUGH) {
2112                         /*
2113                          * Skip top levels of page tables for an IOMMU which has
2114                          * a smaller AGAW than the default. Unnecessary for PT mode.
2115                          */
2116                         for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) {
2117                                 ret = -ENOMEM;
2118                                 pgd = phys_to_virt(dma_pte_addr(pgd));
2119                                 if (!dma_pte_present(pgd))
2120                                         goto out_unlock;
2121                         }
2122
2123                         info = iommu_support_dev_iotlb(domain, iommu, bus, devfn);
2124                         if (info && info->ats_supported)
2125                                 translation = CONTEXT_TT_DEV_IOTLB;
2126                         else
2127                                 translation = CONTEXT_TT_MULTI_LEVEL;
2128
2129                         context_set_address_root(context, virt_to_phys(pgd));
2130                         context_set_address_width(context, agaw);
2131                 } else {
2132                         /*
2133                          * In pass through mode, AW must be programmed to
2134                          * indicate the largest AGAW value supported by
2135                          * hardware. And ASR is ignored by hardware.
2136                          */
2137                         context_set_address_width(context, iommu->msagaw);
2138                 }
2139
2140                 context_set_translation_type(context, translation);
2141         }
2142
2143         context_set_fault_enable(context);
2144         context_set_present(context);
2145         if (!ecap_coherent(iommu->ecap))
2146                 clflush_cache_range(context, sizeof(*context));
2147
2148         /*
2149          * It's a non-present to present mapping. If the hardware doesn't cache
2150          * non-present entries we only need to flush the write-buffer. If it
2151          * _does_ cache non-present entries, then it does so in the special
2152          * domain #0, which we have to flush:
2153          */
2154         if (cap_caching_mode(iommu->cap)) {
2155                 iommu->flush.flush_context(iommu, 0,
2156                                            (((u16)bus) << 8) | devfn,
2157                                            DMA_CCMD_MASK_NOBIT,
2158                                            DMA_CCMD_DEVICE_INVL);
2159                 iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH);
2160         } else {
2161                 iommu_flush_write_buffer(iommu);
2162         }
2163         iommu_enable_dev_iotlb(info);
2164
2165         ret = 0;
2166
2167 out_unlock:
2168         spin_unlock(&iommu->lock);
2169         spin_unlock_irqrestore(&device_domain_lock, flags);
2170
2171         return ret;
2172 }
2173
2174 struct domain_context_mapping_data {
2175         struct dmar_domain *domain;
2176         struct intel_iommu *iommu;
2177         struct pasid_table *table;
2178 };
2179
2180 static int domain_context_mapping_cb(struct pci_dev *pdev,
2181                                      u16 alias, void *opaque)
2182 {
2183         struct domain_context_mapping_data *data = opaque;
2184
2185         return domain_context_mapping_one(data->domain, data->iommu,
2186                                           data->table, PCI_BUS_NUM(alias),
2187                                           alias & 0xff);
2188 }
2189
2190 static int
2191 domain_context_mapping(struct dmar_domain *domain, struct device *dev)
2192 {
2193         struct domain_context_mapping_data data;
2194         struct pasid_table *table;
2195         struct intel_iommu *iommu;
2196         u8 bus, devfn;
2197
2198         iommu = device_to_iommu(dev, &bus, &devfn);
2199         if (!iommu)
2200                 return -ENODEV;
2201
2202         table = intel_pasid_get_table(dev);
2203
2204         if (!dev_is_pci(dev))
2205                 return domain_context_mapping_one(domain, iommu, table,
2206                                                   bus, devfn);
2207
2208         data.domain = domain;
2209         data.iommu = iommu;
2210         data.table = table;
2211
2212         return pci_for_each_dma_alias(to_pci_dev(dev),
2213                                       &domain_context_mapping_cb, &data);
2214 }
2215
2216 static int domain_context_mapped_cb(struct pci_dev *pdev,
2217                                     u16 alias, void *opaque)
2218 {
2219         struct intel_iommu *iommu = opaque;
2220
2221         return !device_context_mapped(iommu, PCI_BUS_NUM(alias), alias & 0xff);
2222 }
2223
2224 static int domain_context_mapped(struct device *dev)
2225 {
2226         struct intel_iommu *iommu;
2227         u8 bus, devfn;
2228
2229         iommu = device_to_iommu(dev, &bus, &devfn);
2230         if (!iommu)
2231                 return -ENODEV;
2232
2233         if (!dev_is_pci(dev))
2234                 return device_context_mapped(iommu, bus, devfn);
2235
2236         return !pci_for_each_dma_alias(to_pci_dev(dev),
2237                                        domain_context_mapped_cb, iommu);
2238 }
2239
2240 /* Returns the number of VT-d pages, but aligned to the MM page size */
2241 static inline unsigned long aligned_nrpages(unsigned long host_addr,
2242                                             size_t size)
2243 {
2244         host_addr &= ~PAGE_MASK;
2245         return PAGE_ALIGN(host_addr + size) >> VTD_PAGE_SHIFT;
2246 }
2247
2248 /* Return largest possible superpage level for a given mapping */
2249 static inline int hardware_largepage_caps(struct dmar_domain *domain,
2250                                           unsigned long iov_pfn,
2251                                           unsigned long phy_pfn,
2252                                           unsigned long pages)
2253 {
2254         int support, level = 1;
2255         unsigned long pfnmerge;
2256
2257         support = domain->iommu_superpage;
2258
2259         /* To use a large page, the virtual *and* physical addresses
2260            must be aligned to 2MiB/1GiB/etc. Lower bits set in either
2261            of them will mean we have to use smaller pages. So just
2262            merge them and check both at once. */
2263         pfnmerge = iov_pfn | phy_pfn;
2264
2265         while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) {
2266                 pages >>= VTD_STRIDE_SHIFT;
2267                 if (!pages)
2268                         break;
2269                 pfnmerge >>= VTD_STRIDE_SHIFT;
2270                 level++;
2271                 support--;
2272         }
2273         return level;
2274 }
2275
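/*
 * Fill the page table of @domain for [iov_pfn, iov_pfn + nr_pages)
 * with the given physical range and protection bits, using superpages
 * where the alignment and remaining size allow, and flushing the CPU
 * cache for each page of PTEs written.
 */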
2276 static int
2277 __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2278                  unsigned long phys_pfn, unsigned long nr_pages, int prot)
2279 {
2280         struct dma_pte *first_pte = NULL, *pte = NULL;
2281         unsigned int largepage_lvl = 0;
2282         unsigned long lvl_pages = 0;
2283         phys_addr_t pteval;
2284         u64 attr;
2285
2286         BUG_ON(!domain_pfn_supported(domain, iov_pfn + nr_pages - 1));
2287
2288         if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
2289                 return -EINVAL;
2290
2291         attr = prot & (DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP);
2292         if (domain_use_first_level(domain))
2293                 attr |= DMA_FL_PTE_PRESENT | DMA_FL_PTE_XD | DMA_FL_PTE_US;
2294
2295         pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | attr;
2296
2297         while (nr_pages > 0) {
2298                 uint64_t tmp;
2299
2300                 if (!pte) {
2301                         largepage_lvl = hardware_largepage_caps(domain, iov_pfn,
2302                                         phys_pfn, nr_pages);
2303
2304                         first_pte = pte = pfn_to_dma_pte(domain, iov_pfn, &largepage_lvl);
2305                         if (!pte)
2306                                 return -ENOMEM;
2307                         /* It is a large page */
2308                         if (largepage_lvl > 1) {
2309                                 unsigned long nr_superpages, end_pfn;
2310
2311                                 pteval |= DMA_PTE_LARGE_PAGE;
2312                                 lvl_pages = lvl_to_nr_pages(largepage_lvl);
2313
2314                                 nr_superpages = nr_pages / lvl_pages;
2315                                 end_pfn = iov_pfn + nr_superpages * lvl_pages - 1;
2316
2317                                 /*
2318                                  * Ensure that old small page tables are
2319                                  * removed to make room for superpage(s).
2320                                  * We're adding new large pages, so make sure
2321                                  * we don't remove their parent tables.
2322                                  */
2323                                 dma_pte_free_pagetable(domain, iov_pfn, end_pfn,
2324                                                        largepage_lvl + 1);
2325                         } else {
2326                                 pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE;
2327                         }
2328
2329                 }
2330                 /* We don't need a lock here; nobody else
2331                  * touches this IOVA range.
2332                  */
2333                 tmp = cmpxchg64_local(&pte->val, 0ULL, pteval);
2334                 if (tmp) {
2335                         static int dumps = 5;
2336                         pr_crit("ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n",
2337                                 iov_pfn, tmp, (unsigned long long)pteval);
2338                         if (dumps) {
2339                                 dumps--;
2340                                 debug_dma_dump_mappings(NULL);
2341                         }
2342                         WARN_ON(1);
2343                 }
2344
2345                 lvl_pages = lvl_to_nr_pages(largepage_lvl);
2346
2347                 BUG_ON(nr_pages < lvl_pages);
2348
2349                 nr_pages -= lvl_pages;
2350                 iov_pfn += lvl_pages;
2351                 phys_pfn += lvl_pages;
2352                 pteval += lvl_pages * VTD_PAGE_SIZE;
2353
2354                 /* If the next PTE would be the first in a new page, then we
2355                  * need to flush the cache on the entries we've just written.
2356                  * And then we'll need to recalculate 'pte', so clear it and
2357                  * let it get set again in the if (!pte) block above.
2358                  *
2359                  * If we're done (!nr_pages) we need to flush the cache too.
2360                  *
2361                  * Also if we've been setting superpages, we may need to
2362                  * recalculate 'pte' and switch back to smaller pages for the
2363                  * end of the mapping, if the trailing size is not enough to
2364                  * use another superpage (i.e. nr_pages < lvl_pages).
2365                  */
2366                 pte++;
2367                 if (!nr_pages || first_pte_in_page(pte) ||
2368                     (largepage_lvl > 1 && nr_pages < lvl_pages)) {
2369                         domain_flush_cache(domain, first_pte,
2370                                            (void *)pte - (void *)first_pte);
2371                         pte = NULL;
2372                 }
2373         }
2374
2375         return 0;
2376 }
2377
2378 static int
2379 domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2380                unsigned long phys_pfn, unsigned long nr_pages, int prot)
2381 {
2382         int iommu_id, ret;
2383         struct intel_iommu *iommu;
2384
2385         /* Do the real mapping first */
2386         ret = __domain_mapping(domain, iov_pfn, phys_pfn, nr_pages, prot);
2387         if (ret)
2388                 return ret;
2389
2390         for_each_domain_iommu(iommu_id, domain) {
2391                 iommu = g_iommus[iommu_id];
2392                 __mapping_notify_one(iommu, domain, iov_pfn, nr_pages);
2393         }
2394
2395         return 0;
2396 }
2397
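/*
 * Tear down the context entry for (bus, devfn) and invalidate the
 * context-cache and IOTLB entries tagged with the domain id it was
 * using.
 */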
2398 static void domain_context_clear_one(struct intel_iommu *iommu, u8 bus, u8 devfn)
2399 {
2400         unsigned long flags;
2401         struct context_entry *context;
2402         u16 did_old;
2403
2404         if (!iommu)
2405                 return;
2406
2407         spin_lock_irqsave(&iommu->lock, flags);
2408         context = iommu_context_addr(iommu, bus, devfn, 0);
2409         if (!context) {
2410                 spin_unlock_irqrestore(&iommu->lock, flags);
2411                 return;
2412         }
2413         did_old = context_domain_id(context);
2414         context_clear_entry(context);
2415         __iommu_flush_cache(iommu, context, sizeof(*context));
2416         spin_unlock_irqrestore(&iommu->lock, flags);
2417         iommu->flush.flush_context(iommu,
2418                                    did_old,
2419                                    (((u16)bus) << 8) | devfn,
2420                                    DMA_CCMD_MASK_NOBIT,
2421                                    DMA_CCMD_DEVICE_INVL);
2422         iommu->flush.flush_iotlb(iommu,
2423                                  did_old,
2424                                  0,
2425                                  0,
2426                                  DMA_TLB_DSI_FLUSH);
2427 }
2428
2429 static inline void unlink_domain_info(struct device_domain_info *info)
2430 {
2431         assert_spin_locked(&device_domain_lock);
2432         list_del(&info->link);
2433         list_del(&info->global);
2434         if (info->dev)
2435                 dev_iommu_priv_set(info->dev, NULL);
2436 }
2437
2438 static void domain_remove_dev_info(struct dmar_domain *domain)
2439 {
2440         struct device_domain_info *info, *tmp;
2441         unsigned long flags;
2442
2443         spin_lock_irqsave(&device_domain_lock, flags);
2444         list_for_each_entry_safe(info, tmp, &domain->devices, link)
2445                 __dmar_remove_one_dev_info(info);
2446         spin_unlock_irqrestore(&device_domain_lock, flags);
2447 }
2448
2449 struct dmar_domain *find_domain(struct device *dev)
2450 {
2451         struct device_domain_info *info;
2452
2453         if (unlikely(!dev || !dev->iommu))
2454                 return NULL;
2455
2456         if (unlikely(attach_deferred(dev)))
2457                 return NULL;
2458
2459         /* No lock here, assumes no domain exit in normal case */
2460         info = get_domain_info(dev);
2461         if (likely(info))
2462                 return info->domain;
2463
2464         return NULL;
2465 }
2466
2467 static inline struct device_domain_info *
2468 dmar_search_domain_by_dev_info(int segment, int bus, int devfn)
2469 {
2470         struct device_domain_info *info;
2471
2472         list_for_each_entry(info, &device_domain_list, global)
2473                 if (info->segment == segment && info->bus == bus &&
2474                     info->devfn == devfn)
2475                         return info;
2476
2477         return NULL;
2478 }
2479
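/*
 * Set up a first-level PASID-table entry for @dev, skipping page-table
 * levels the IOMMU cannot walk and selecting 4- or 5-level paging from
 * the resulting AGAW.
 */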
2480 static int domain_setup_first_level(struct intel_iommu *iommu,
2481                                     struct dmar_domain *domain,
2482                                     struct device *dev,
2483                                     u32 pasid)
2484 {
2485         int flags = PASID_FLAG_SUPERVISOR_MODE;
2486         struct dma_pte *pgd = domain->pgd;
2487         int agaw, level;
2488
2489         /*
2490          * Skip top levels of page tables for an IOMMU which has
2491          * a smaller AGAW than the default. Unnecessary for PT mode.
2492          */
2493         for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) {
2494                 pgd = phys_to_virt(dma_pte_addr(pgd));
2495                 if (!dma_pte_present(pgd))
2496                         return -ENOMEM;
2497         }
2498
2499         level = agaw_to_level(agaw);
2500         if (level != 4 && level != 5)
2501                 return -EINVAL;
2502
2503         flags |= (level == 5) ? PASID_FLAG_FL5LP : 0;
2504
2505         return intel_pasid_setup_first_level(iommu, dev, (pgd_t *)pgd, pasid,
2506                                              domain->iommu_did[iommu->seq_id],
2507                                              flags);
2508 }
2509
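/*
 * True for a PCI device whose DMA is actually issued on behalf of a
 * different "real DMA" device, i.e. pci_real_dma_dev() points elsewhere.
 */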
2510 static bool dev_is_real_dma_subdevice(struct device *dev)
2511 {
2512         return dev && dev_is_pci(dev) &&
2513                pci_real_dma_dev(to_pci_dev(dev)) != to_pci_dev(dev);
2514 }
2515
2516 static struct dmar_domain *dmar_insert_one_dev_info(struct intel_iommu *iommu,
2517                                                     int bus, int devfn,
2518                                                     struct device *dev,
2519                                                     struct dmar_domain *domain)
2520 {
2521         struct dmar_domain *found = NULL;
2522         struct device_domain_info *info;
2523         unsigned long flags;
2524         int ret;
2525
2526         info = alloc_devinfo_mem();
2527         if (!info)
2528                 return NULL;
2529
2530         if (!dev_is_real_dma_subdevice(dev)) {
2531                 info->bus = bus;
2532                 info->devfn = devfn;
2533                 info->segment = iommu->segment;
2534         } else {
2535                 struct pci_dev *pdev = to_pci_dev(dev);
2536
2537                 info->bus = pdev->bus->number;
2538                 info->devfn = pdev->devfn;
2539                 info->segment = pci_domain_nr(pdev->bus);
2540         }
2541
2542         info->ats_supported = info->pasid_supported = info->pri_supported = 0;
2543         info->ats_enabled = info->pasid_enabled = info->pri_enabled = 0;
2544         info->ats_qdep = 0;
2545         info->dev = dev;
2546         info->domain = domain;
2547         info->iommu = iommu;
2548         info->pasid_table = NULL;
2549         info->auxd_enabled = 0;
2550         INIT_LIST_HEAD(&info->auxiliary_domains);
2551
2552         if (dev && dev_is_pci(dev)) {
2553                 struct pci_dev *pdev = to_pci_dev(info->dev);
2554
2555                 if (ecap_dev_iotlb_support(iommu->ecap) &&
2556                     pci_ats_supported(pdev) &&
2557                     dmar_find_matched_atsr_unit(pdev))
2558                         info->ats_supported = 1;
2559
2560                 if (sm_supported(iommu)) {
2561                         if (pasid_supported(iommu)) {
2562                                 int features = pci_pasid_features(pdev);
2563                                 if (features >= 0)
2564                                         info->pasid_supported = features | 1;
2565                         }
2566
2567                         if (info->ats_supported && ecap_prs(iommu->ecap) &&
2568                             pci_pri_supported(pdev))
2569                                 info->pri_supported = 1;
2570                 }
2571         }
2572
2573         spin_lock_irqsave(&device_domain_lock, flags);
2574         if (dev)
2575                 found = find_domain(dev);
2576
2577         if (!found) {
2578                 struct device_domain_info *info2;
2579                 info2 = dmar_search_domain_by_dev_info(info->segment, info->bus,
2580                                                        info->devfn);
2581                 if (info2) {
2582                         found      = info2->domain;
2583                         info2->dev = dev;
2584                 }
2585         }
2586
2587         if (found) {
2588                 spin_unlock_irqrestore(&device_domain_lock, flags);
2589                 free_devinfo_mem(info);
2590                 /* Caller must free the original domain */
2591                 return found;
2592         }
2593
2594         spin_lock(&iommu->lock);
2595         ret = domain_attach_iommu(domain, iommu);
2596         spin_unlock(&iommu->lock);
2597
2598         if (ret) {
2599                 spin_unlock_irqrestore(&device_domain_lock, flags);
2600                 free_devinfo_mem(info);
2601                 return NULL;
2602         }
2603
2604         list_add(&info->link, &domain->devices);
2605         list_add(&info->global, &device_domain_list);
2606         if (dev)
2607                 dev_iommu_priv_set(dev, info);
2608         spin_unlock_irqrestore(&device_domain_lock, flags);
2609
2610         /* PASID table is mandatory for a PCI device in scalable mode. */
2611         if (dev && dev_is_pci(dev) && sm_supported(iommu)) {
2612                 ret = intel_pasid_alloc_table(dev);
2613                 if (ret) {
2614                         dev_err(dev, "PASID table allocation failed\n");
2615                         dmar_remove_one_dev_info(dev);
2616                         return NULL;
2617                 }
2618
2619                 /* Setup the PASID entry for requests without PASID: */
2620                 spin_lock_irqsave(&iommu->lock, flags);
2621                 if (hw_pass_through && domain_type_is_si(domain))
2622                         ret = intel_pasid_setup_pass_through(iommu, domain,
2623                                         dev, PASID_RID2PASID);
2624                 else if (domain_use_first_level(domain))
2625                         ret = domain_setup_first_level(iommu, domain, dev,
2626                                         PASID_RID2PASID);
2627                 else
2628                         ret = intel_pasid_setup_second_level(iommu, domain,
2629                                         dev, PASID_RID2PASID);
2630                 spin_unlock_irqrestore(&iommu->lock, flags);
2631                 if (ret) {
2632                         dev_err(dev, "Setup RID2PASID failed\n");
2633                         dmar_remove_one_dev_info(dev);
2634                         return NULL;
2635                 }
2636         }
2637
2638         if (dev && domain_context_mapping(domain, dev)) {
2639                 dev_err(dev, "Domain context map failed\n");
2640                 dmar_remove_one_dev_info(dev);
2641                 return NULL;
2642         }
2643
2644         return domain;
2645 }
2646
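/*
 * Install a 1:1 mapping for [first_vpfn, last_vpfn] in @domain,
 * clearing any existing PTEs in the range first.
 */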
2647 static int iommu_domain_identity_map(struct dmar_domain *domain,
2648                                      unsigned long first_vpfn,
2649                                      unsigned long last_vpfn)
2650 {
2651         /*
2652          * The RMRR range might overlap with a physical memory range,
2653          * so clear it first.
2654          */
2655         dma_pte_clear_range(domain, first_vpfn, last_vpfn);
2656
2657         return __domain_mapping(domain, first_vpfn,
2658                                 first_vpfn, last_vpfn - first_vpfn + 1,
2659                                 DMA_PTE_READ|DMA_PTE_WRITE);
2660 }
2661
2662 static int md_domain_init(struct dmar_domain *domain, int guest_width);
2663
2664 static int __init si_domain_init(int hw)
2665 {
2666         struct dmar_rmrr_unit *rmrr;
2667         struct device *dev;
2668         int i, nid, ret;
2669
2670         si_domain = alloc_domain(DOMAIN_FLAG_STATIC_IDENTITY);
2671         if (!si_domain)
2672                 return -EFAULT;
2673
2674         if (md_domain_init(si_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
2675                 domain_exit(si_domain);
2676                 return -EFAULT;
2677         }
2678
2679         if (hw)
2680                 return 0;
2681
2682         for_each_online_node(nid) {
2683                 unsigned long start_pfn, end_pfn;
2684                 int i;
2685
2686                 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
2687                         ret = iommu_domain_identity_map(si_domain,
2688                                         mm_to_dma_pfn(start_pfn),
2689                                         mm_to_dma_pfn(end_pfn));
2690                         if (ret)
2691                                 return ret;
2692                 }
2693         }
2694
2695         /*
2696          * Identity map the RMRRs so that devices with RMRRs can also use
2697          * the si_domain.
2698          */
2699         for_each_rmrr_units(rmrr) {
2700                 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
2701                                           i, dev) {
2702                         unsigned long long start = rmrr->base_address;
2703                         unsigned long long end = rmrr->end_address;
2704
2705                         if (WARN_ON(end < start ||
2706                                     end >> agaw_to_width(si_domain->agaw)))
2707                                 continue;
2708
2709                         ret = iommu_domain_identity_map(si_domain,
2710                                         mm_to_dma_pfn(start >> PAGE_SHIFT),
2711                                         mm_to_dma_pfn(end >> PAGE_SHIFT));
2712                         if (ret)
2713                                 return ret;
2714                 }
2715         }
2716
2717         return 0;
2718 }
2719
2720 static int domain_add_dev_info(struct dmar_domain *domain, struct device *dev)
2721 {
2722         struct dmar_domain *ndomain;
2723         struct intel_iommu *iommu;
2724         u8 bus, devfn;
2725
2726         iommu = device_to_iommu(dev, &bus, &devfn);
2727         if (!iommu)
2728                 return -ENODEV;
2729
2730         ndomain = dmar_insert_one_dev_info(iommu, bus, devfn, dev, domain);
2731         if (ndomain != domain)
2732                 return -EBUSY;
2733
2734         return 0;
2735 }
2736
2737 static bool device_has_rmrr(struct device *dev)
2738 {
2739         struct dmar_rmrr_unit *rmrr;
2740         struct device *tmp;
2741         int i;
2742
2743         rcu_read_lock();
2744         for_each_rmrr_units(rmrr) {
2745                 /*
2746                  * Return TRUE if this RMRR contains the device that
2747                  * is passed in.
2748                  */
2749                 for_each_active_dev_scope(rmrr->devices,
2750                                           rmrr->devices_cnt, i, tmp)
2751                         if (tmp == dev ||
2752                             is_downstream_to_pci_bridge(dev, tmp)) {
2753                                 rcu_read_unlock();
2754                                 return true;
2755                         }
2756         }
2757         rcu_read_unlock();
2758         return false;
2759 }
2760
2761 /**
2762  * device_rmrr_is_relaxable - Test whether the RMRR of this device
2763  * is relaxable (i.e. is allowed to be left unenforced under some conditions)
2764  * @dev: device handle
2765  *
2766  * We assume that PCI USB devices with RMRRs have them largely
2767  * for historical reasons and that the RMRR space is not actively used post
2768  * boot.  This exclusion may change if vendors begin to abuse it.
2769  *
2770  * The same exception is made for graphics devices, with the requirement that
2771  * any use of the RMRR regions will be torn down before assigning the device
2772  * to a guest.
2773  *
2774  * Return: true if the RMRR is relaxable, false otherwise
2775  */
2776 static bool device_rmrr_is_relaxable(struct device *dev)
2777 {
2778         struct pci_dev *pdev;
2779
2780         if (!dev_is_pci(dev))
2781                 return false;
2782
2783         pdev = to_pci_dev(dev);
2784         if (IS_USB_DEVICE(pdev) || IS_GFX_DEVICE(pdev))
2785                 return true;
2786         else
2787                 return false;
2788 }
2789
2790 /*
2791  * There are a couple cases where we need to restrict the functionality of
2792  * devices associated with RMRRs.  The first is when evaluating a device for
2793  * identity mapping because problems exist when devices are moved in and out
2794  * of domains and their respective RMRR information is lost.  This means that
2795  * a device with associated RMRRs will never be in a "passthrough" domain.
2796  * The second is use of the device through the IOMMU API.  This interface
2797  * expects to have full control of the IOVA space for the device.  We cannot
2798  * satisfy both the requirement that RMRR access is maintained and have an
2799  * unencumbered IOVA space.  We also have no ability to quiesce the device's
2800  * use of the RMRR space or even inform the IOMMU API user of the restriction.
2801  * We therefore prevent devices associated with an RMRR from participating in
2802  * the IOMMU API, which eliminates them from device assignment.
2803  *
2804  * In both cases, devices which have relaxable RMRRs are not concerned by this
2805  * restriction. See device_rmrr_is_relaxable comment.
2806  */
2807 static bool device_is_rmrr_locked(struct device *dev)
2808 {
2809         if (!device_has_rmrr(dev))
2810                 return false;
2811
2812         if (device_rmrr_is_relaxable(dev))
2813                 return false;
2814
2815         return true;
2816 }
2817
2818 /*
2819  * Return the required default domain type for a specific device.
2820  *
2821  * @dev: the device in question
2822  *
2823  * Returns:
2824  *  - IOMMU_DOMAIN_DMA: device requires a dynamic mapping domain
2825  *  - IOMMU_DOMAIN_IDENTITY: device requires an identity mapping domain
2826  *  - 0: both identity and dynamic domains work for this device
2827  */
2829 static int device_def_domain_type(struct device *dev)
2830 {
2831         if (dev_is_pci(dev)) {
2832                 struct pci_dev *pdev = to_pci_dev(dev);
2833
2834                 if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev))
2835                         return IOMMU_DOMAIN_IDENTITY;
2836
2837                 if ((iommu_identity_mapping & IDENTMAP_GFX) && IS_GFX_DEVICE(pdev))
2838                         return IOMMU_DOMAIN_IDENTITY;
2839         }
2840
2841         return 0;
2842 }
2843
2844 static void intel_iommu_init_qi(struct intel_iommu *iommu)
2845 {
2846         /*
2847          * Start from a sane IOMMU hardware state.
2848          * If queued invalidation has already been initialized by us
2849          * (for example, while enabling interrupt remapping) then
2850          * things are already rolling from a sane state.
2851          */
2852         if (!iommu->qi) {
2853                 /*
2854                  * Clear any previous faults.
2855                  */
2856                 dmar_fault(-1, iommu);
2857                 /*
2858                  * Disable queued invalidation if supported and already enabled
2859                  * before OS handover.
2860                  */
2861                 dmar_disable_qi(iommu);
2862         }
2863
2864         if (dmar_enable_qi(iommu)) {
2865                 /*
2866                  * Queued Invalidate not enabled, use Register Based Invalidate
2867                  */
2868                 iommu->flush.flush_context = __iommu_flush_context;
2869                 iommu->flush.flush_iotlb = __iommu_flush_iotlb;
2870                 pr_info("%s: Using Register based invalidation\n",
2871                         iommu->name);
2872         } else {
2873                 iommu->flush.flush_context = qi_flush_context;
2874                 iommu->flush.flush_iotlb = qi_flush_iotlb;
2875                 pr_info("%s: Using Queued invalidation\n", iommu->name);
2876         }
2877 }
2878
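/*
 * kdump support: copy the context entries for one bus from the old
 * kernel's tables into newly allocated pages, marking each entry as
 * copied and reserving the domain ids it references.
 */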
2879 static int copy_context_table(struct intel_iommu *iommu,
2880                               struct root_entry *old_re,
2881                               struct context_entry **tbl,
2882                               int bus, bool ext)
2883 {
2884         int tbl_idx, pos = 0, idx, devfn, ret = 0, did;
2885         struct context_entry *new_ce = NULL, ce;
2886         struct context_entry *old_ce = NULL;
2887         struct root_entry re;
2888         phys_addr_t old_ce_phys;
2889
2890         tbl_idx = ext ? bus * 2 : bus;
2891         memcpy(&re, old_re, sizeof(re));
2892
2893         for (devfn = 0; devfn < 256; devfn++) {
2894                 /* First calculate the correct index */
2895                 idx = (ext ? devfn * 2 : devfn) % 256;
2896
2897                 if (idx == 0) {
2898                         /* First save what we may have and clean up */
2899                         if (new_ce) {
2900                                 tbl[tbl_idx] = new_ce;
2901                                 __iommu_flush_cache(iommu, new_ce,
2902                                                     VTD_PAGE_SIZE);
2903                                 pos = 1;
2904                         }
2905
2906                         if (old_ce)
2907                                 memunmap(old_ce);
2908
2909                         ret = 0;
2910                         if (devfn < 0x80)
2911                                 old_ce_phys = root_entry_lctp(&re);
2912                         else
2913                                 old_ce_phys = root_entry_uctp(&re);
2914
2915                         if (!old_ce_phys) {
2916                                 if (ext && devfn == 0) {
2917                                         /* No LCTP, try UCTP */
2918                                         devfn = 0x7f;
2919                                         continue;
2920                                 } else {
2921                                         goto out;
2922                                 }
2923                         }
2924
2925                         ret = -ENOMEM;
2926                         old_ce = memremap(old_ce_phys, PAGE_SIZE,
2927                                         MEMREMAP_WB);
2928                         if (!old_ce)
2929                                 goto out;
2930
2931                         new_ce = alloc_pgtable_page(iommu->node);
2932                         if (!new_ce)
2933                                 goto out_unmap;
2934
2935                         ret = 0;
2936                 }
2937
2938                 /* Now copy the context entry */
2939                 memcpy(&ce, old_ce + idx, sizeof(ce));
2940
2941                 if (!__context_present(&ce))
2942                         continue;
2943
2944                 did = context_domain_id(&ce);
2945                 if (did >= 0 && did < cap_ndoms(iommu->cap))
2946                         set_bit(did, iommu->domain_ids);
2947
2948                 /*
2949                  * We need a marker for copied context entries. This
2950                  * marker needs to work for the old format as well as
2951                  * for extended context entries.
2952                  *
2953                  * Bit 67 of the context entry is used. In the old
2954                  * format this bit is available to software, in the
2955                  * extended format it is the PGE bit, but PGE is ignored
2956                  * by HW if PASIDs are disabled (and thus still
2957                  * available).
2958                  *
2959                  * So disable PASIDs first and then mark the entry
2960                  * copied. This means that we don't copy PASID
2961                  * translations from the old kernel, but this is fine as
2962                  * faults there are not fatal.
2963                  */
2964                 context_clear_pasid_enable(&ce);
2965                 context_set_copied(&ce);
2966
2967                 new_ce[idx] = ce;
2968         }
2969
2970         tbl[tbl_idx + pos] = new_ce;
2971
2972         __iommu_flush_cache(iommu, new_ce, VTD_PAGE_SIZE);
2973
2974 out_unmap:
2975         memunmap(old_ce);
2976
2977 out:
2978         return ret;
2979 }
2980
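/*
 * kdump support: remap the old kernel's root table, copy its context
 * tables bus by bus, and hook the copies into the new root_entry table.
 */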
2981 static int copy_translation_tables(struct intel_iommu *iommu)
2982 {
2983         struct context_entry **ctxt_tbls;
2984         struct root_entry *old_rt;
2985         phys_addr_t old_rt_phys;
2986         int ctxt_table_entries;
2987         unsigned long flags;
2988         u64 rtaddr_reg;
2989         int bus, ret;
2990         bool new_ext, ext;
2991
2992         rtaddr_reg = dmar_readq(iommu->reg + DMAR_RTADDR_REG);
2993         ext        = !!(rtaddr_reg & DMA_RTADDR_RTT);
2994         new_ext    = !!ecap_ecs(iommu->ecap);
2995
2996         /*
2997          * The RTT bit can only be changed when translation is disabled,
2998          * but disabling translation would open a window for data
2999          * corruption. So bail out and don't copy anything if we would
3000          * have to change the bit.
3001          */
3002         if (new_ext != ext)
3003                 return -EINVAL;
3004
3005         old_rt_phys = rtaddr_reg & VTD_PAGE_MASK;
3006         if (!old_rt_phys)
3007                 return -EINVAL;
3008
3009         old_rt = memremap(old_rt_phys, PAGE_SIZE, MEMREMAP_WB);
3010         if (!old_rt)
3011                 return -ENOMEM;
3012
3013         /* This is too big for the stack - allocate it from slab */
3014         ctxt_table_entries = ext ? 512 : 256;
3015         ret = -ENOMEM;
3016         ctxt_tbls = kcalloc(ctxt_table_entries, sizeof(void *), GFP_KERNEL);
3017         if (!ctxt_tbls)
3018                 goto out_unmap;
3019
3020         for (bus = 0; bus < 256; bus++) {
3021                 ret = copy_context_table(iommu, &old_rt[bus],
3022                                          ctxt_tbls, bus, ext);
3023                 if (ret) {
3024                         pr_err("%s: Failed to copy context table for bus %d\n",
3025                                 iommu->name, bus);
3026                         continue;
3027                 }
3028         }
3029
3030         spin_lock_irqsave(&iommu->lock, flags);
3031
3032         /* Context tables are copied, now write them to the root_entry table */
3033         for (bus = 0; bus < 256; bus++) {
3034                 int idx = ext ? bus * 2 : bus;
3035                 u64 val;
3036
3037                 if (ctxt_tbls[idx]) {
3038                         val = virt_to_phys(ctxt_tbls[idx]) | 1;
3039                         iommu->root_entry[bus].lo = val;
3040                 }
3041
3042                 if (!ext || !ctxt_tbls[idx + 1])
3043                         continue;
3044
3045                 val = virt_to_phys(ctxt_tbls[idx + 1]) | 1;
3046                 iommu->root_entry[bus].hi = val;
3047         }
3048
3049         spin_unlock_irqrestore(&iommu->lock, flags);
3050
3051         kfree(ctxt_tbls);
3052
3053         __iommu_flush_cache(iommu, iommu->root_entry, PAGE_SIZE);
3054
3055         ret = 0;
3056
3057 out_unmap:
3058         memunmap(old_rt);
3059
3060         return ret;
3061 }
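
/*
 * Illustrative note for copy_translation_tables() above (a sketch, with
 * the entry layout inferred from the code rather than quoted from the
 * spec): each root entry holds a context-table pointer with the present
 * bit in bit 0, which is why the copied tables are installed as
 *
 *	iommu->root_entry[bus].lo = virt_to_phys(ctxt_tbl) | 1;
 *
 * In the extended (ECS) format each bus has two context tables, so the
 * flat ctxt_tbls[] array is indexed with bus * 2: the lower pointer
 * covers devfn 0x00-0x7f and the upper pointer covers devfn 0x80-0xff,
 * matching the lctp/uctp split in copy_context_table().
 */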
3062
3063 #ifdef CONFIG_INTEL_IOMMU_SVM
3064 static ioasid_t intel_vcmd_ioasid_alloc(ioasid_t min, ioasid_t max, void *data)
3065 {
3066         struct intel_iommu *iommu = data;
3067         ioasid_t ioasid;
3068
3069         if (!iommu)
3070                 return INVALID_IOASID;
3071         /*
3072          * The VT-d virtual command interface always uses the full 20-bit
3073          * PASID range. The host can partition the guest PASID range based
3074          * on its policies, but that is outside the guest's control.
3075          */
3076         if (min < PASID_MIN || max > intel_pasid_max_id)
3077                 return INVALID_IOASID;
3078
3079         if (vcmd_alloc_pasid(iommu, &ioasid))
3080                 return INVALID_IOASID;
3081
3082         return ioasid;
3083 }
3084
3085 static void intel_vcmd_ioasid_free(ioasid_t ioasid, void *data)
3086 {
3087         struct intel_iommu *iommu = data;
3088
3089         if (!iommu)
3090                 return;
3091         /*
3092          * The sanity check of the ioasid owner is done at an upper layer
3093          * (e.g. VFIO). We can only free the PASID when all devices are unbound.
3094          */
3095         if (ioasid_find(NULL, ioasid, NULL)) {
3096                 pr_alert("Cannot free active IOASID %d\n", ioasid);
3097                 return;
3098         }
3099         vcmd_free_pasid(iommu, ioasid);
3100 }
3101
3102 static void register_pasid_allocator(struct intel_iommu *iommu)
3103 {
3104         /*
3105          * If we are running on bare metal, there is no need for a custom
3106          * allocator because PASIDs are allocated system-wide by the host.
3107          */
3108         if (!cap_caching_mode(iommu->cap))
3109                 return;
3110
3111         if (!sm_supported(iommu)) {
3112                 pr_warn("VT-d Scalable Mode not enabled, no PASID allocation\n");
3113                 return;
3114         }
3115
3116         /*
3117          * Register a custom PASID allocator if we are running in a guest:
3118          * guest PASIDs must be obtained via the virtual command interface.
3119          * There can be multiple vIOMMUs in each guest, but only one
3120          * allocator is active. All vIOMMU allocators eventually call into
3121          * the same host allocator.
3122          */
3123         if (!vccap_pasid(iommu->vccap))
3124                 return;
3125
3126         pr_info("Register custom PASID allocator\n");
3127         iommu->pasid_allocator.alloc = intel_vcmd_ioasid_alloc;
3128         iommu->pasid_allocator.free = intel_vcmd_ioasid_free;
3129         iommu->pasid_allocator.pdata = (void *)iommu;
3130         if (ioasid_register_allocator(&iommu->pasid_allocator)) {
3131                 pr_warn("Custom PASID allocator failed, scalable mode disabled\n");
3132                 /*
3133                  * Disable scalable mode on this IOMMU if there is no
3134                  * custom allocator. Mixing SM-capable and non-SM vIOMMUs
3135                  * is not supported.
3136                  */
3137                 intel_iommu_sm = 0;
3138         }
3139 }
3140 #endif
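
/*
 * Illustrative sketch of how the custom allocator registered above is
 * consumed (hypothetical caller, not code in this driver): once
 * ioasid_register_allocator() has succeeded, generic allocations are
 * transparently redirected to the virtual command interface, e.g.
 *
 *	ioasid_t pasid = ioasid_alloc(NULL, PASID_MIN, intel_pasid_max_id - 1, NULL);
 *	if (pasid != INVALID_IOASID) {
 *		...
 *		ioasid_free(pasid);
 *	}
 *
 * where the free path lands in intel_vcmd_ioasid_free() above. If
 * registration fails, intel_iommu_sm is cleared so the vIOMMU falls back
 * to legacy mode instead of mixing SM and non-SM behaviour.
 */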
3141
3142 static int __init init_dmars(void)
3143 {
3144         struct dmar_drhd_unit *drhd;
3145         struct intel_iommu *iommu;
3146         int ret;
3147
3148         /*
3149          * for each drhd
3150          *    allocate root
3151          *    initialize and program root entry to not present
3152          * endfor
3153          */
3154         for_each_drhd_unit(drhd) {
3155                 /*
3156                  * No lock needed: this is only incremented in the
3157                  * single-threaded kernel __init code path; all other
3158                  * accesses are read-only.
3159                  */
3160                 if (g_num_of_iommus < DMAR_UNITS_SUPPORTED) {
3161                         g_num_of_iommus++;
3162                         continue;
3163                 }
3164                 pr_err_once("Exceeded %d IOMMUs\n", DMAR_UNITS_SUPPORTED);
3165         }
3166
3167         /* Preallocate enough resources for IOMMU hot-addition */
3168         if (g_num_of_iommus < DMAR_UNITS_SUPPORTED)
3169                 g_num_of_iommus = DMAR_UNITS_SUPPORTED;
3170
3171         g_iommus = kcalloc(g_num_of_iommus, sizeof(struct intel_iommu *),
3172                         GFP_KERNEL);
3173         if (!g_iommus) {
3174                 pr_err("Allocating global iommu array failed\n");
3175                 ret = -ENOMEM;
3176                 goto error;
3177         }
3178
3179         for_each_iommu(iommu, drhd) {
3180                 if (drhd->ignored) {
3181                         iommu_disable_translation(iommu);
3182                         continue;
3183                 }
3184
3185                 /*
3186                  * Find the smallest PASID capacity of all IOMMUs in the
3187                  * system: the system-wide PASID table must be no bigger
3188                  * than the smallest size any IOMMU supports.
3189                  */
3190                 if (pasid_supported(iommu)) {
3191                         u32 temp = 2 << ecap_pss(iommu->ecap);
3192
3193                         intel_pasid_max_id = min_t(u32, temp,
3194                                                    intel_pasid_max_id);
3195                 }
3196
3197                 g_iommus[iommu->seq_id] = iommu;
3198
3199                 intel_iommu_init_qi(iommu);
3200
3201                 ret = iommu_init_domains(iommu);
3202                 if (ret)
3203                         goto free_iommu;
3204
3205                 init_translation_status(iommu);
3206
3207                 if (translation_pre_enabled(iommu) && !is_kdump_kernel()) {
3208                         iommu_disable_translation(iommu);
3209                         clear_translation_pre_enabled(iommu);
3210                         pr_warn("Translation was enabled for %s but we are not in kdump mode\n",
3211                                 iommu->name);
3212                 }
3213
3214                 /*
3215                  * TBD:
3216                  * We could share the same root & context tables
3217                  * among all IOMMUs. Needs to be split out later.
3218                  */
3219                 ret = iommu_alloc_root_entry(iommu);
3220                 if (ret)
3221                         goto free_iommu;
3222
3223                 if (translation_pre_enabled(iommu)) {
3224                         pr_info("Translation already enabled - trying to copy translation structures\n");
3225
3226                         ret = copy_translation_tables(iommu);
3227                         if (ret) {
3228                                 /*
3229                                  * We found the IOMMU with translation
3230                                  * enabled - but failed to copy over the
3231                                  * old root-entry table. Try to proceed
3232                                  * by disabling translation now and
3233                                  * allocating a clean root-entry table.
3234                                  * This might cause DMAR faults, but
3235                                  * probably the dump will still succeed.
3236                                  */
3237                                 pr_err("Failed to copy translation tables from previous kernel for %s\n",
3238                                        iommu->name);
3239                                 iommu_disable_translation(iommu);
3240                                 clear_translation_pre_enabled(iommu);
3241                         } else {
3242                                 pr_info("Copied translation tables from previous kernel for %s\n",
3243                                         iommu->name);
3244                         }
3245                 }
3246
3247                 if (!ecap_pass_through(iommu->ecap))
3248                         hw_pass_through = 0;
3249                 intel_svm_check(iommu);
3250         }
3251
3252         /*
3253          * Now that QI is enabled on all IOMMUs, set the root entry and flush
3254          * caches. This is required on some Intel X58 chipsets; otherwise the
3255          * flush_context function will loop forever and the boot hangs.
3256          */
3257         for_each_active_iommu(iommu, drhd) {
3258                 iommu_flush_write_buffer(iommu);
3259 #ifdef CONFIG_INTEL_IOMMU_SVM
3260                 register_pasid_allocator(iommu);
3261 #endif
3262                 iommu_set_root_entry(iommu);
3263                 iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
3264                 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
3265         }
3266
3267 #ifdef CONFIG_INTEL_IOMMU_BROKEN_GFX_WA
3268         dmar_map_gfx = 0;
3269 #endif
3270
3271         if (!dmar_map_gfx)
3272                 iommu_identity_mapping |= IDENTMAP_GFX;
3273
3274         check_tylersburg_isoch();
3275
3276         ret = si_domain_init(hw_pass_through);
3277         if (ret)
3278                 goto free_iommu;
3279
3280         /*
3281          * for each drhd
3282          *   enable fault log
3283          *   global invalidate context cache
3284          *   global invalidate iotlb
3285          *   enable translation
3286          */
3287         for_each_iommu(iommu, drhd) {
3288                 if (drhd->ignored) {
3289                         /*
3290                          * we always have to disable PMRs or DMA may fail on
3291                          * this device
3292                          */
3293                         if (force_on)
3294                                 iommu_disable_protect_mem_regions(iommu);
3295                         continue;
3296                 }
3297
3298                 iommu_flush_write_buffer(iommu);
3299
3300 #ifdef CONFIG_INTEL_IOMMU_SVM
3301                 if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
3302                         /*
3303                          * Calling dmar_alloc_hwirq() with dmar_global_lock
3304                          * held could cause a lock race, so drop it here.
3305                          */
3306                         up_write(&dmar_global_lock);
3307                         ret = intel_svm_enable_prq(iommu);
3308                         down_write(&dmar_global_lock);
3309                         if (ret)
3310                                 goto free_iommu;
3311                 }
3312 #endif
3313                 ret = dmar_set_interrupt(iommu);
3314                 if (ret)
3315                         goto free_iommu;
3316         }
3317
3318         return 0;
3319
3320 free_iommu:
3321         for_each_active_iommu(iommu, drhd) {
3322                 disable_dmar_iommu(iommu);
3323                 free_dmar_iommu(iommu);
3324         }
3325
3326         kfree(g_iommus);
3327
3328 error:
3329         return ret;
3330 }
3331
3332 static inline int iommu_domain_cache_init(void)
3333 {
3334         int ret = 0;
3335
3336         iommu_domain_cache = kmem_cache_create("iommu_domain",
3337                                          sizeof(struct dmar_domain),
3338                                          0,
3339                                          SLAB_HWCACHE_ALIGN,
3341                                          NULL);
3342         if (!iommu_domain_cache) {
3343                 pr_err("Couldn't create iommu_domain cache\n");
3344                 ret = -ENOMEM;
3345         }
3346
3347         return ret;
3348 }
3349
3350 static inline int iommu_devinfo_cache_init(void)
3351 {
3352         int ret = 0;
3353
3354         iommu_devinfo_cache = kmem_cache_create("iommu_devinfo",
3355                                          sizeof(struct device_domain_info),
3356                                          0,
3357                                          SLAB_HWCACHE_ALIGN,
3358                                          NULL);
3359         if (!iommu_devinfo_cache) {
3360                 pr_err("Couldn't create devinfo cache\n");
3361                 ret = -ENOMEM;
3362         }
3363
3364         return ret;
3365 }
3366
3367 static int __init iommu_init_mempool(void)
3368 {
3369         int ret;
3370         ret = iova_cache_get();
3371         if (ret)
3372                 return ret;
3373
3374         ret = iommu_domain_cache_init();
3375         if (ret)
3376                 goto domain_error;
3377
3378         ret = iommu_devinfo_cache_init();
3379         if (!ret)
3380                 return ret;
3381
3382         kmem_cache_destroy(iommu_domain_cache);
3383 domain_error:
3384         iova_cache_put();
3385
3386         return -ENOMEM;
3387 }
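
/*
 * Illustrative sketch (hypothetical call site, not code in this file):
 * objects backed by the caches created above are obtained and released
 * with the regular slab API, e.g.
 *
 *	struct dmar_domain *domain;
 *
 *	domain = kmem_cache_alloc(iommu_domain_cache, GFP_KERNEL);
 *	if (!domain)
 *		return -ENOMEM;
 *	...
 *	kmem_cache_free(iommu_domain_cache, domain);
 */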
3388
3389 static void __init iommu_exit_mempool(void)
3390 {
3391         kmem_cache_destroy(iommu_devinfo_cache);
3392         kmem_cache_destroy(iommu_domain_cache);
3393         iova_cache_put();
3394 }
3395
3396 static void __init init_no_remapping_devices(void)
3397 {
3398         struct dmar_drhd_unit *drhd;
3399         struct device *dev;
3400         int i;
3401
3402         for_each_drhd_unit(drhd) {
3403                 if (!drhd->include_all) {
3404                         for_each_active_dev_scope(drhd->devices,
3405                                                   drhd->devices_cnt, i, dev)
3406                                 break;
3407                         /* ignore DMAR unit if no devices exist */
3408                         if (i == drhd->devices_cnt)
3409                                 drhd->ignored = 1;
3410                 }
3411         }
3412
3413         for_each_active_drhd_unit(drhd) {
3414                 if (drhd->include_all)
3415                         continue;
3416
3417                 for_each_active_dev_scope(drhd->devices,
3418                                           drhd->devices_cnt, i, dev)
3419                         if (!dev_is_pci(dev) || !IS_GFX_DEVICE(to_pci_dev(dev)))
3420                                 break;
3421                 if (i < drhd->devices_cnt)
3422                         continue;
3423
3424                 /* This IOMMU has *only* gfx devices. Mark it as dedicated
3425                    to gfx and, if gfx mapping is disabled, bypass it entirely */
3426                 drhd->gfx_dedicated = 1;
3427                 if (!dmar_map_gfx)
3428                         drhd->ignored = 1;
3429         }
3430 }
3431
3432 #ifdef CONFIG_SUSPEND
3433 static int init_iommu_hw(void)
3434 {
3435         struct dmar_drhd_unit *drhd;
3436         struct intel_iommu *iommu = NULL;
3437
3438         for_each_active_iommu(iommu, drhd)
3439                 if (iommu->qi)
3440                         dmar_reenable_qi(iommu);
3441
3442         for_each_iommu(iommu, drhd) {
3443                 if (drhd->ignored) {
3444                         /*
3445                          * we always have to disable PMRs or DMA may fail on
3446                          * this device
3447                          */
3448                         if (force_on)
3449                                 iommu_disable_protect_mem_regions(iommu);
3450                         continue;
3451                 }
3452
3453                 iommu_flush_write_buffer(iommu);
3454
3455                 iommu_set_root_entry(iommu);
3456
3457                 iommu->flush.flush_context(iommu, 0, 0, 0,
3458                                            DMA_CCMD_GLOBAL_INVL);
3459                 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
3460                 iommu_enable_translation(iommu);
3461                 iommu_disable_protect_mem_regions(iommu);
3462         }
3463
3464         return 0;
3465 }
3466
3467 static void iommu_flush_all(void)
3468 {
3469         struct dmar_drhd_unit *drhd;
3470         struct intel_iommu *iommu;
3471
3472         for_each_active_iommu(iommu, drhd) {
3473                 iommu->flush.flush_context(iommu, 0, 0, 0,
3474                                            DMA_CCMD_GLOBAL_INVL);
3475                 iommu->flush.flush_iotlb(iommu, 0, 0, 0,
3476                                          DMA_TLB_GLOBAL_FLUSH);
3477         }
3478 }
3479
3480 static int iommu_suspend(void)
3481 {
3482         struct dmar_drhd_unit *drhd;
3483         struct intel_iommu *iommu = NULL;
3484         unsigned long flag;
3485
3486         for_each_active_iommu(iommu, drhd) {
3487                 iommu->iommu_state = kcalloc(MAX_SR_DMAR_REGS, sizeof(u32),
3488                                              GFP_KERNEL);
3489                 if (!iommu->iommu_state)
3490                         goto nomem;
3491         }
3492
3493         iommu_flush_all();
3494
3495         for_each_active_iommu(iommu, drhd) {
3496                 iommu_disable_translation(iommu);
3497
3498                 raw_spin_lock_irqsave(&iommu->register_lock, flag);
3499
3500                 iommu->iommu_state[SR_DMAR_FECTL_REG] =
3501                         readl(iommu->reg + DMAR_FECTL_REG);
3502                 iommu->iommu_state[SR_DMAR_FEDATA_REG] =
3503                         readl(iommu->reg + DMAR_FEDATA_REG);
3504                 iommu->iommu_state[SR_DMAR_FEADDR_REG] =
3505                         readl(iommu->reg + DMAR_FEADDR_REG);
3506                 iommu->iommu_state[SR_DMAR_FEUADDR_REG] =
3507                         readl(iommu->reg + DMAR_FEUADDR_REG);
3508
3509                 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
3510         }
3511         return 0;
3512
3513 nomem:
3514         for_each_active_iommu(iommu, drhd)
3515                 kfree(iommu->iommu_state);
3516
3517         return -ENOMEM;
3518 }
3519
3520 static void iommu_resume(void)
3521 {
3522         struct dmar_drhd_unit *drhd;
3523         struct intel_iommu *iommu = NULL;
3524         unsigned long flag;
3525
3526         if (init_iommu_hw()) {
3527                 if (force_on)
3528                         panic("tboot: IOMMU setup failed, DMAR can not resume!\n");
3529                 else
3530                         WARN(1, "IOMMU setup failed, DMAR can not resume!\n");
3531                 return;
3532         }
3533
3534         for_each_active_iommu(iommu, drhd) {
3535
3536                 raw_spin_lock_irqsave(&iommu->register_lock, flag);
3537
3538                 writel(iommu->iommu_state[SR_DMAR_FECTL_REG],
3539                         iommu->reg + DMAR_FECTL_REG);
3540                 writel(iommu->iommu_state[SR_DMAR_FEDATA_REG],
3541                         iommu->reg + DMAR_FEDATA_REG);
3542                 writel(iommu->iommu_state[SR_DMAR_FEADDR_REG],
3543                         iommu->reg + DMAR_FEADDR_REG);
3544                 writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG],
3545                         iommu->reg + DMAR_FEUADDR_REG);
3546
3547                 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
3548         }
3549
3550         for_each_active_iommu(iommu, drhd)
3551                 kfree(iommu->iommu_state);
3552 }
3553
3554 static struct syscore_ops iommu_syscore_ops = {
3555         .resume         = iommu_resume,
3556         .suspend        = iommu_suspend,
3557 };
3558
3559 static void __init init_iommu_pm_ops(void)
3560 {
3561         register_syscore_ops(&iommu_syscore_ops);
3562 }
3563
3564 #else
3565 static inline void init_iommu_pm_ops(void) {}
3566 #endif  /* CONFIG_SUSPEND */
3567
3568 static int rmrr_sanity_check(struct acpi_dmar_reserved_memory *rmrr)
3569 {
3570         if (!IS_ALIGNED(rmrr->base_address, PAGE_SIZE) ||
3571             !IS_ALIGNED(rmrr->end_address + 1, PAGE_SIZE) ||
3572             rmrr->end_address <= rmrr->base_address ||
3573             arch_rmrr_sanity_check(rmrr))
3574                 return -EINVAL;
3575
3576         return 0;
3577 }
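
/*
 * Worked example for rmrr_sanity_check() above (addresses are made up
 * for illustration and 4 KiB pages are assumed): an RMRR covering
 * [0x000d8000, 0x000dbfff] passes, because 0xd8000 is page aligned,
 * end + 1 == 0xdc000 is page aligned, and end > base. An entry such as
 * [0x000d8000, 0x000d8000], or one with an unaligned end like
 * 0x000dbffe, fails the check and causes dmar_parse_one_rmrr() below to
 * warn about a broken BIOS and taint the kernel.
 */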
3578
3579 int __init dmar_parse_one_rmrr(struct acpi_dmar_header *header, void *arg)
3580 {
3581         struct acpi_dmar_reserved_memory *rmrr;
3582         struct dmar_rmrr_unit *rmrru;
3583
3584         rmrr = (struct acpi_dmar_reserved_memory *)header;
3585         if (rmrr_sanity_check(rmrr)) {
3586                 pr_warn(FW_BUG
3587                            "Your BIOS is broken; bad RMRR [%#018Lx-%#018Lx]\n"
3588                            "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
3589                            rmrr->base_address, rmrr->end_address,
3590                            dmi_get_system_info(DMI_BIOS_VENDOR),
3591                            dmi_get_system_info(DMI_BIOS_VERSION),
3592                            dmi_get_system_info(DMI_PRODUCT_VERSION));
3593                 add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
3594         }
3595
3596         rmrru = kzalloc(sizeof(*rmrru), GFP_KERNEL);
3597         if (!rmrru)
3598                 goto out;
3599
3600         rmrru->hdr = header;
3601
3602         rmrru->base_address = rmrr->base_address;
3603         rmrru->end_address = rmrr->end_address;
3604
3605         rmrru->devices = dmar_alloc_dev_scope((void *)(rmrr + 1),
3606                                 ((void *)rmrr) + rmrr->header.length,
3607                                 &rmrru->devices_cnt);
3608         if (rmrru->devices_cnt && rmrru->devices == NULL)
3609                 goto free_rmrru;
3610
3611         list_add(&rmrru->list, &dmar_rmrr_units);
3612
3613         return 0;
3614 free_rmrru:
3615         kfree(rmrru);
3616 out:
3617         return -ENOMEM;
3618 }
3619
3620 static struct dmar_atsr_unit *dmar_find_atsr(struct acpi_dmar_atsr *atsr)
3621 {
3622         struct dmar_atsr_unit *atsru;
3623         struct acpi_dmar_atsr *tmp;
3624
3625         list_for_each_entry_rcu(atsru, &dmar_atsr_units, list,
3626                                 dmar_rcu_check()) {
3627                 tmp = (struct acpi_dmar_atsr *)atsru->hdr;
3628                 if (atsr->segment != tmp->segment)
3629                         continue;
3630                 if (atsr->header.length != tmp->header.length)
3631                         continue;
3632                 if (memcmp(atsr, tmp, atsr->header.length) == 0)
3633                         return atsru;
3634         }
3635
3636         return NULL;
3637 }
3638
3639 int dmar_parse_one_atsr(struct acpi_dmar_header *hdr, void *arg)
3640 {
3641         struct acpi_dmar_atsr *atsr;
3642         struct dmar_atsr_unit *atsru;
3643
3644         if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled)
3645                 return 0;
3646
3647         atsr = container_of(hdr, struct acpi_dmar_atsr, header);
3648         atsru = dmar_find_atsr(atsr);
3649         if (atsru)
3650                 return 0;
3651
3652         atsru = kzalloc(sizeof(*atsru) + hdr->length, GFP_KERNEL);
3653         if (!atsru)
3654                 return -ENOMEM;
3655
3656         /*
3657          * If memory is allocated from slab by ACPI _DSM method, we need to
3658          * copy the memory content because the memory buffer will be freed
3659          * on return.
3660          */
3661         atsru->hdr = (void *)(atsru + 1);
3662         memcpy(atsru->hdr, hdr, hdr->length);
3663         atsru->include_all = atsr->flags & 0x1;
3664         if (!atsru->include_all) {
3665                 atsru->devices = dmar_alloc_dev_scope((void *)(atsr + 1),
3666                                 (void *)atsr + atsr->header.length,
3667                                 &atsru->devices_cnt);
3668                 if (atsru->devices_cnt && atsru->devices == NULL) {
3669                         kfree(atsru);
3670                         return -ENOMEM;
3671                 }
3672         }
3673
3674         list_add_rcu(&atsru->list, &dmar_atsr_units);
3675
3676         return 0;
3677 }
3678
3679 static void intel_iommu_free_atsr(struct dmar_atsr_unit *atsru)
3680 {
3681         dmar_free_dev_scope(&atsru->devices, &atsru->devices_cnt);
3682         kfree(atsru);
3683 }
3684
3685 int dmar_release_one_atsr(struct acpi_dmar_header *hdr, void *arg)
3686 {
3687         struct acpi_dmar_atsr *atsr;
3688         struct dmar_atsr_unit *atsru;
3689
3690         atsr = container_of(hdr, struct acpi_dmar_atsr, header);
3691         atsru = dmar_find_atsr(atsr);
3692         if (atsru) {
3693                 list_del_rcu(&atsru->list);
3694                 synchronize_rcu();
3695                 intel_iommu_free_atsr(atsru);
3696         }
3697
3698         return 0;
3699 }
3700
3701 int dmar_check_one_atsr(struct acpi_dmar_header *hdr, void *arg)
3702 {
3703         int i;
3704         struct device *dev;
3705         struct acpi_dmar_atsr *atsr;
3706         struct dmar_atsr_unit *atsru;
3707
3708         atsr = container_of(hdr, struct acpi_dmar_atsr, header);
3709         atsru = dmar_find_atsr(atsr);
3710         if (!atsru)
3711                 return 0;
3712
3713         if (!atsru->include_all && atsru->devices && atsru->devices_cnt) {
3714                 for_each_active_dev_scope(atsru->devices, atsru->devices_cnt,
3715                                           i, dev)
3716                         return -EBUSY;
3717         }
3718
3719         return 0;
3720 }
3721
3722 static int intel_iommu_add(struct dmar_drhd_unit *dmaru)
3723 {
3724         int sp, ret;
3725         struct intel_iommu *iommu = dmaru->iommu;
3726
3727         if (g_iommus[iommu->seq_id])
3728                 return 0;
3729
3730         if (hw_pass_through && !ecap_pass_through(iommu->ecap)) {
3731                 pr_warn("%s: Doesn't support hardware pass through.\n",
3732                         iommu->name);
3733                 return -ENXIO;
3734         }
3735         if (!ecap_sc_support(iommu->ecap) &&
3736             domain_update_iommu_snooping(iommu)) {
3737                 pr_warn("%s: Doesn't support snooping.\n",
3738                         iommu->name);
3739                 return -ENXIO;
3740         }
3741         sp = domain_update_iommu_superpage(NULL, iommu) - 1;
3742         if (sp >= 0 && !(cap_super_page_val(iommu->cap) & (1 << sp))) {
3743                 pr_warn("%s: Doesn't support large page.\n",
3744                         iommu->name);
3745                 return -ENXIO;
3746         }
3747
3748         /*
3749          * Disable translation if already enabled prior to OS handover.
3750          */
3751         if (iommu->gcmd & DMA_GCMD_TE)
3752                 iommu_disable_translation(iommu);
3753
3754         g_iommus[iommu->seq_id] = iommu;
3755         ret = iommu_init_domains(iommu);
3756         if (ret == 0)
3757                 ret = iommu_alloc_root_entry(iommu);
3758         if (ret)
3759                 goto out;
3760
3761         intel_svm_check(iommu);
3762
3763         if (dmaru->ignored) {
3764                 /*
3765                  * we always have to disable PMRs or DMA may fail on this device
3766                  */
3767                 if (force_on)
3768                         iommu_disable_protect_mem_regions(iommu);
3769                 return 0;
3770         }
3771
3772         intel_iommu_init_qi(iommu);
3773         iommu_flush_write_buffer(iommu);
3774
3775 #ifdef CONFIG_INTEL_IOMMU_SVM
3776         if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
3777                 ret = intel_svm_enable_prq(iommu);
3778                 if (ret)
3779                         goto disable_iommu;
3780         }
3781 #endif
3782         ret = dmar_set_interrupt(iommu);
3783         if (ret)
3784                 goto disable_iommu;
3785
3786         iommu_set_root_entry(iommu);
3787         iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
3788         iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
3789         iommu_enable_translation(iommu);
3790
3791         iommu_disable_protect_mem_regions(iommu);
3792         return 0;
3793
3794 disable_iommu:
3795         disable_dmar_iommu(iommu);
3796 out:
3797         free_dmar_iommu(iommu);
3798         return ret;
3799 }
3800
3801 int dmar_iommu_hotplug(struct dmar_drhd_unit *dmaru, bool insert)
3802 {
3803         int ret = 0;
3804         struct intel_iommu *iommu = dmaru->iommu;
3805
3806         if (!intel_iommu_enabled)
3807                 return 0;
3808         if (iommu == NULL)
3809                 return -EINVAL;
3810
3811         if (insert) {
3812                 ret = intel_iommu_add(dmaru);
3813         } else {
3814                 disable_dmar_iommu(iommu);
3815                 free_dmar_iommu(iommu);
3816         }
3817
3818         return ret;
3819 }
3820
3821 static void intel_iommu_free_dmars(void)
3822 {
3823         struct dmar_rmrr_unit *rmrru, *rmrr_n;
3824         struct dmar_atsr_unit *atsru, *atsr_n;
3825
3826         list_for_each_entry_safe(rmrru, rmrr_n, &dmar_rmrr_units, list) {
3827                 list_del(&rmrru->list);
3828                 dmar_free_dev_scope(&rmrru->devices, &rmrru->devices_cnt);
3829                 kfree(rmrru);
3830         }
3831
3832         list_for_each_entry_safe(atsru, atsr_n, &dmar_atsr_units, list) {
3833                 list_del(&atsru->list);
3834                 intel_iommu_free_atsr(atsru);
3835         }
3836 }
3837
3838 int dmar_find_matched_atsr_unit(struct pci_dev *dev)
3839 {
3840         int i, ret = 1;
3841         struct pci_bus *bus;
3842         struct pci_dev *bridge = NULL;
3843         struct device *tmp;
3844         struct acpi_dmar_atsr *atsr;
3845         struct dmar_atsr_unit *atsru;
3846
3847         dev = pci_physfn(dev);
3848         for (bus = dev->bus; bus; bus = bus->parent) {
3849                 bridge = bus->self;
3850                 /* If it's an integrated device, allow ATS */
3851                 if (!bridge)
3852                         return 1;
3853                 /* Connected via non-PCIe: no ATS */
3854                 if (!pci_is_pcie(bridge) ||
3855                     pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE)
3856                         return 0;
3857                 /* If we found the root port, look it up in the ATSR */
3858                 if (pci_pcie_type(bridge) == PCI_EXP_TYPE_ROOT_PORT)
3859                         break;
3860         }
3861
3862         rcu_read_lock();
3863         list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) {
3864                 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
3865                 if (atsr->segment != pci_domain_nr(dev->bus))
3866                         continue;
3867
3868                 for_each_dev_scope(atsru->devices, atsru->devices_cnt, i, tmp)
3869                         if (tmp == &bridge->dev)
3870                                 goto out;
3871
3872                 if (atsru->include_all)
3873                         goto out;
3874         }
3875         ret = 0;
3876 out:
3877         rcu_read_unlock();
3878
3879         return ret;
3880 }
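
/*
 * Worked example for dmar_find_matched_atsr_unit() above (hypothetical
 * topology, for illustration only): for an endpoint at 0000:03:00.0
 * sitting behind root port 0000:00:1c.0, the loop walks bus->parent up
 * to the root port, then compares that bridge's struct device against
 * every ATSR device scope in the same PCI segment. ATS is allowed if
 * the root port is listed or the ATSR is marked include_all; an
 * integrated endpoint with no upstream bridge is allowed unconditionally.
 */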
3881
3882 int dmar_iommu_notify_scope_dev(struct dmar_pci_notify_info *info)
3883 {
3884         int ret;
3885         struct dmar_rmrr_unit *rmrru;
3886         struct dmar_atsr_unit *atsru;
3887         struct acpi_dmar_atsr *atsr;
3888         struct acpi_dmar_reserved_memory *rmrr;
3889
3890         if (!intel_iommu_enabled && system_state >= SYSTEM_RUNNING)
3891                 return 0;
3892
3893         list_for_each_entry(rmrru, &dmar_rmrr_units, list) {
3894                 rmrr = container_of(rmrru->hdr,
3895                                     struct acpi_dmar_reserved_memory, header);
3896                 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
3897                         ret = dmar_insert_dev_scope(info, (void *)(rmrr + 1),
3898                                 ((void *)rmrr) + rmrr->header.length,
3899                                 rmrr->segment, rmrru->devices,
3900                                 rmrru->devices_cnt);
3901                         if (ret < 0)
3902                                 return ret;
3903                 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
3904                         dmar_remove_dev_scope(info, rmrr->segment,
3905                                 rmrru->devices, rmrru->devices_cnt);
3906                 }
3907         }
3908
3909         list_for_each_entry(atsru, &dmar_atsr_units, list) {
3910                 if (atsru->include_all)
3911                         continue;
3912
3913                 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
3914                 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
3915                         ret = dmar_insert_dev_scope(info, (void *)(atsr + 1),
3916                                         (void *)atsr + atsr->header.length,
3917                                         atsr->segment, atsru->devices,
3918                                         atsru->devices_cnt);
3919                         if (ret > 0)
3920                                 break;
3921                         else if (ret < 0)
3922                                 return ret;
3923                 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
3924                         if (dmar_remove_dev_scope(info, atsr->segment,
3925                                         atsru->devices, atsru->devices_cnt))
3926                                 break;
3927                 }
3928         }
3929
3930         return 0;
3931 }
3932
3933 static int intel_iommu_memory_notifier(struct notifier_block *nb,
3934                                        unsigned long val, void *v)
3935 {
3936         struct memory_notify *mhp = v;
3937         unsigned long start_vpfn = mm_to_dma_pfn(mhp->start_pfn);
3938         unsigned long last_vpfn = mm_to_dma_pfn(mhp->start_pfn +
3939                         mhp->nr_pages - 1);
3940
3941         switch (val) {
3942         case MEM_GOING_ONLINE:
3943                 if (iommu_domain_identity_map(si_domain,
3944                                               start_vpfn, last_vpfn)) {
3945                         pr_warn("Failed to build identity map for [%lx-%lx]\n",
3946                                 start_vpfn, last_vpfn);
3947                         return NOTIFY_BAD;
3948                 }
3949                 break;
3950
3951         case MEM_OFFLINE:
3952         case MEM_CANCEL_ONLINE:
3953                 {
3954                         struct dmar_drhd_unit *drhd;
3955                         struct intel_iommu *iommu;
3956                         struct page *freelist;
3957
3958                         freelist = domain_unmap(si_domain,
3959                                                 start_vpfn, last_vpfn,
3960                                                 NULL);
3961
3962                         rcu_read_lock();
3963                         for_each_active_iommu(iommu, drhd)
3964                                 iommu_flush_iotlb_psi(iommu, si_domain,
3965                                         start_vpfn, mhp->nr_pages,
3966                                         !freelist, 0);
3967                         rcu_read_unlock();
3968                         dma_free_pagelist(freelist);
3969                 }
3970                 break;
3971         }
3972
3973         return NOTIFY_OK;
3974 }
3975
3976 static struct notifier_block intel_iommu_memory_nb = {
3977         .notifier_call = intel_iommu_memory_notifier,
3978         .priority = 0
3979 };
3980
3981 static void free_all_cpu_cached_iovas(unsigned int cpu)
3982 {
3983         int i;
3984
3985         for (i = 0; i < g_num_of_iommus; i++) {
3986                 struct intel_iommu *iommu = g_iommus[i];
3987                 struct dmar_domain *domain;
3988                 int did;
3989
3990                 if (!iommu)
3991                         continue;
3992
3993                 for (did = 0; did < cap_ndoms(iommu->cap); did++) {
3994                         domain = get_iommu_domain(iommu, (u16)did);
3995
3996                         if (!domain || domain->domain.type != IOMMU_DOMAIN_DMA)
3997                                 continue;
3998
3999                         iommu_dma_free_cpu_cached_iovas(cpu, &domain->domain);
4000                 }
4001         }
4002 }
4003
4004 static int intel_iommu_cpu_dead(unsigned int cpu)
4005 {
4006         free_all_cpu_cached_iovas(cpu);
4007         return 0;
4008 }
4009
4010 static void intel_disable_iommus(void)
4011 {
4012         struct intel_iommu *iommu = NULL;
4013         struct dmar_drhd_unit *drhd;
4014
4015         for_each_iommu(iommu, drhd)
4016                 iommu_disable_translation(iommu);
4017 }
4018
4019 void intel_iommu_shutdown(void)
4020 {
4021         struct dmar_drhd_unit *drhd;
4022         struct intel_iommu *iommu = NULL;
4023
4024         if (no_iommu || dmar_disabled)
4025                 return;
4026
4027         down_write(&dmar_global_lock);
4028
4029         /* Disable PMRs explicitly here. */
4030         for_each_iommu(iommu, drhd)
4031                 iommu_disable_protect_mem_regions(iommu);
4032
4033         /* Make sure the IOMMUs are switched off */
4034         intel_disable_iommus();
4035
4036         up_write(&dmar_global_lock);
4037 }
4038
4039 static inline struct intel_iommu *dev_to_intel_iommu(struct device *dev)
4040 {
4041         struct iommu_device *iommu_dev = dev_to_iommu_device(dev);
4042
4043         return container_of(iommu_dev, struct intel_iommu, iommu);
4044 }
4045
4046 static ssize_t intel_iommu_show_version(struct device *dev,
4047                                         struct device_attribute *attr,
4048                                         char *buf)
4049 {
4050         struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4051         u32 ver = readl(iommu->reg + DMAR_VER_REG);
4052         return sprintf(buf, "%d:%d\n",
4053                        DMAR_VER_MAJOR(ver), DMAR_VER_MINOR(ver));
4054 }
4055 static DEVICE_ATTR(version, S_IRUGO, intel_iommu_show_version, NULL);
4056
4057 static ssize_t intel_iommu_show_address(struct device *dev,
4058                                         struct device_attribute *attr,
4059                                         char *buf)
4060 {
4061         struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4062         return sprintf(buf, "%llx\n", iommu->reg_phys);
4063 }
4064 static DEVICE_ATTR(address, S_IRUGO, intel_iommu_show_address, NULL);
4065
4066 static ssize_t intel_iommu_show_cap(struct device *dev,
4067                                     struct device_attribute *attr,
4068                                     char *buf)
4069 {
4070         struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4071         return sprintf(buf, "%llx\n", iommu->cap);
4072 }
4073 static DEVICE_ATTR(cap, S_IRUGO, intel_iommu_show_cap, NULL);
4074
4075 static ssize_t intel_iommu_show_ecap(struct device *dev,
4076                                     struct device_attribute *attr,
4077                                     char *buf)
4078 {
4079         struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4080         return sprintf(buf, "%llx\n", iommu->ecap);
4081 }
4082 static DEVICE_ATTR(ecap, S_IRUGO, intel_iommu_show_ecap, NULL);
4083
4084 static ssize_t intel_iommu_show_ndoms(struct device *dev,
4085                                       struct device_attribute *attr,
4086                                       char *buf)
4087 {
4088         struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4089         return sprintf(buf, "%ld\n", cap_ndoms(iommu->cap));
4090 }
4091 static DEVICE_ATTR(domains_supported, S_IRUGO, intel_iommu_show_ndoms, NULL);
4092
4093 static ssize_t intel_iommu_show_ndoms_used(struct device *dev,
4094                                            struct device_attribute *attr,
4095                                            char *buf)
4096 {
4097         struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4098         return sprintf(buf, "%d\n", bitmap_weight(iommu->domain_ids,
4099                                                   cap_ndoms(iommu->cap)));
4100 }
4101 static DEVICE_ATTR(domains_used, S_IRUGO, intel_iommu_show_ndoms_used, NULL);
4102
4103 static struct attribute *intel_iommu_attrs[] = {
4104         &dev_attr_version.attr,
4105         &dev_attr_address.attr,
4106         &dev_attr_cap.attr,
4107         &dev_attr_ecap.attr,
4108         &dev_attr_domains_supported.attr,
4109         &dev_attr_domains_used.attr,
4110         NULL,
4111 };
4112
4113 static struct attribute_group intel_iommu_group = {
4114         .name = "intel-iommu",
4115         .attrs = intel_iommu_attrs,
4116 };
4117
4118 const struct attribute_group *intel_iommu_groups[] = {
4119         &intel_iommu_group,
4120         NULL,
4121 };
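
/*
 * Illustrative note (the sysfs path is an assumption based on the usual
 * iommu class layout, not quoted from documentation): once the group
 * above is registered via iommu_device_sysfs_add() in intel_iommu_init(),
 * each remapping unit exports read-only files such as
 *
 *	/sys/class/iommu/dmar0/intel-iommu/version
 *	/sys/class/iommu/dmar0/intel-iommu/cap
 *	/sys/class/iommu/dmar0/intel-iommu/domains_used
 *
 * which simply dump the capability registers and domain-ID bitmap weight
 * formatted by the show functions above.
 */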
4122
4123 static inline bool has_external_pci(void)
4124 {
4125         struct pci_dev *pdev = NULL;
4126
4127         for_each_pci_dev(pdev)
4128                 if (pdev->external_facing)
4129                         return true;
4130
4131         return false;
4132 }
4133
4134 static int __init platform_optin_force_iommu(void)
4135 {
4136         if (!dmar_platform_optin() || no_platform_optin || !has_external_pci())
4137                 return 0;
4138
4139         if (no_iommu || dmar_disabled)
4140                 pr_info("Intel-IOMMU force enabled due to platform opt in\n");
4141
4142         /*
4143          * If Intel-IOMMU is disabled by default, we apply the identity
4144          * map to all devices except those marked as untrusted.
4145          */
4146         if (dmar_disabled)
4147                 iommu_set_default_passthrough(false);
4148
4149         dmar_disabled = 0;
4150         no_iommu = 0;
4151
4152         return 1;
4153 }
4154
4155 static int __init probe_acpi_namespace_devices(void)
4156 {
4157         struct dmar_drhd_unit *drhd;
4158         /* To avoid a -Wunused-but-set-variable warning. */
4159         struct intel_iommu *iommu __maybe_unused;
4160         struct device *dev;
4161         int i, ret = 0;
4162
4163         for_each_active_iommu(iommu, drhd) {
4164                 for_each_active_dev_scope(drhd->devices,
4165                                           drhd->devices_cnt, i, dev) {
4166                         struct acpi_device_physical_node *pn;
4167                         struct iommu_group *group;
4168                         struct acpi_device *adev;
4169
4170                         if (dev->bus != &acpi_bus_type)
4171                                 continue;
4172
4173                         adev = to_acpi_device(dev);
4174                         mutex_lock(&adev->physical_node_lock);
4175                         list_for_each_entry(pn,
4176                                             &adev->physical_node_list, node) {
4177                                 group = iommu_group_get(pn->dev);
4178                                 if (group) {
4179                                         iommu_group_put(group);
4180                                         continue;
4181                                 }
4182
4183                                 pn->dev->bus->iommu_ops = &intel_iommu_ops;
4184                                 ret = iommu_probe_device(pn->dev);
4185                                 if (ret)
4186                                         break;
4187                         }
4188                         mutex_unlock(&adev->physical_node_lock);
4189
4190                         if (ret)
4191                                 return ret;
4192                 }
4193         }
4194
4195         return 0;
4196 }
4197
4198 int __init intel_iommu_init(void)
4199 {
4200         int ret = -ENODEV;
4201         struct dmar_drhd_unit *drhd;
4202         struct intel_iommu *iommu;
4203
4204         /*
4205          * Intel IOMMU is required for a TXT/tboot launch or platform
4206          * opt in, so enforce that.
4207          */
4208         force_on = (!intel_iommu_tboot_noforce && tboot_force_iommu()) ||
4209                     platform_optin_force_iommu();
4210
4211         if (iommu_init_mempool()) {
4212                 if (force_on)
4213                         panic("tboot: Failed to initialize iommu memory\n");
4214                 return -ENOMEM;
4215         }
4216
4217         down_write(&dmar_global_lock);
4218         if (dmar_table_init()) {
4219                 if (force_on)
4220                         panic("tboot: Failed to initialize DMAR table\n");
4221                 goto out_free_dmar;
4222         }
4223
4224         if (dmar_dev_scope_init() < 0) {
4225                 if (force_on)
4226                         panic("tboot: Failed to initialize DMAR device scope\n");
4227                 goto out_free_dmar;
4228         }
4229
4230         up_write(&dmar_global_lock);
4231
4232         /*
4233          * The bus notifier takes the dmar_global_lock, so lockdep will
4234          * complain later when we register it under the lock.
4235          */
4236         dmar_register_bus_notifier();
4237
4238         down_write(&dmar_global_lock);
4239
4240         if (!no_iommu)
4241                 intel_iommu_debugfs_init();
4242
4243         if (no_iommu || dmar_disabled) {
4244                 /*
4245                  * We exit here to ensure that the IOMMU's remapping and
4246                  * mempool aren't set up, which means the IOMMU's PMRs
4247                  * won't be disabled via the call to init_dmars(). So
4248                  * disable them explicitly here. The PMRs were set up by
4249                  * tboot prior to calling SENTER, but the kernel is
4250                  * expected to reset/tear them down.
4251                  */
4252                 if (intel_iommu_tboot_noforce) {
4253                         for_each_iommu(iommu, drhd)
4254                                 iommu_disable_protect_mem_regions(iommu);
4255                 }
4256
4257                 /*
4258                  * Make sure the IOMMUs are switched off, even when we
4259                  * boot into a kexec kernel and the previous kernel left
4260                  * them enabled
4261                  */
4262                 intel_disable_iommus();
4263                 goto out_free_dmar;
4264         }
4265
4266         if (list_empty(&dmar_rmrr_units))
4267                 pr_info("No RMRR found\n");
4268
4269         if (list_empty(&dmar_atsr_units))
4270                 pr_info("No ATSR found\n");
4271
4272         if (dmar_map_gfx)
4273                 intel_iommu_gfx_mapped = 1;
4274
4275         init_no_remapping_devices();
4276
4277         ret = init_dmars();
4278         if (ret) {
4279                 if (force_on)
4280                         panic("tboot: Failed to initialize DMARs\n");
4281                 pr_err("Initialization failed\n");
4282                 goto out_free_dmar;
4283         }
4284         up_write(&dmar_global_lock);
4285
4286         init_iommu_pm_ops();
4287
4288         down_read(&dmar_global_lock);
4289         for_each_active_iommu(iommu, drhd) {
4290                 iommu_device_sysfs_add(&iommu->iommu, NULL,
4291                                        intel_iommu_groups,
4292                                        "%s", iommu->name);
4293                 iommu_device_set_ops(&iommu->iommu, &intel_iommu_ops);
4294                 iommu_device_register(&iommu->iommu);
4295         }
4296         up_read(&dmar_global_lock);
4297
4298         bus_set_iommu(&pci_bus_type, &intel_iommu_ops);
4299         if (si_domain && !hw_pass_through)
4300                 register_memory_notifier(&intel_iommu_memory_nb);
4301         cpuhp_setup_state(CPUHP_IOMMU_INTEL_DEAD, "iommu/intel:dead", NULL,
4302                           intel_iommu_cpu_dead);
4303
4304         down_read(&dmar_global_lock);
4305         if (probe_acpi_namespace_devices())
4306                 pr_warn("ACPI name space devices didn't probe correctly\n");
4307
4308         /* Finally, we enable the DMA remapping hardware. */
4309         for_each_iommu(iommu, drhd) {
4310                 if (!drhd->ignored && !translation_pre_enabled(iommu))
4311                         iommu_enable_translation(iommu);
4312
4313                 iommu_disable_protect_mem_regions(iommu);
4314         }
4315         up_read(&dmar_global_lock);
4316
4317         pr_info("Intel(R) Virtualization Technology for Directed I/O\n");
4318
4319         intel_iommu_enabled = 1;
4320
4321         return 0;
4322
4323 out_free_dmar:
4324         intel_iommu_free_dmars();
4325         up_write(&dmar_global_lock);
4326         iommu_exit_mempool();
4327         return ret;
4328 }
4329
4330 static int domain_context_clear_one_cb(struct pci_dev *pdev, u16 alias, void *opaque)
4331 {
4332         struct intel_iommu *iommu = opaque;
4333
4334         domain_context_clear_one(iommu, PCI_BUS_NUM(alias), alias & 0xff);
4335         return 0;
4336 }
4337
4338 /*
4339  * NB - intel-iommu lacks any sort of reference counting for the users of
4340  * dependent devices.  If multiple endpoints have intersecting dependent
4341  * devices, unbinding the driver from any one of them will possibly leave
4342  * the others unable to operate.
4343  */
4344 static void domain_context_clear(struct intel_iommu *iommu, struct device *dev)
4345 {
4346         if (!iommu || !dev || !dev_is_pci(dev))
4347                 return;
4348
4349         pci_for_each_dma_alias(to_pci_dev(dev), &domain_context_clear_one_cb, iommu);
4350 }
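
/*
 * Worked example for the DMA-alias walk above (the alias value is made
 * up for illustration): pci_for_each_dma_alias() hands the callback a
 * 16-bit requester ID, so for an alias of 0x03a8 the callback clears the
 * context entry for bus PCI_BUS_NUM(0x03a8) == 0x03 and
 * devfn 0x03a8 & 0xff == 0xa8, i.e. device 21, function 0 on bus 3.
 */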
4351
4352 static void __dmar_remove_one_dev_info(struct device_domain_info *info)
4353 {
4354         struct dmar_domain *domain;
4355         struct intel_iommu *iommu;
4356         unsigned long flags;
4357
4358         assert_spin_locked(&device_domain_lock);
4359
4360         if (WARN_ON(!info))
4361                 return;
4362
4363         iommu = info->iommu;
4364         domain = info->domain;
4365
4366         if (info->dev) {
4367                 if (dev_is_pci(info->dev) && sm_supported(iommu))
4368                         intel_pasid_tear_down_entry(iommu, info->dev,
4369                                         PASID_RID2PASID, false);
4370
4371                 iommu_disable_dev_iotlb(info);
4372                 if (!dev_is_real_dma_subdevice(info->dev))
4373                         domain_context_clear(iommu, info->dev);
4374                 intel_pasid_free_table(info->dev);
4375         }
4376
4377         unlink_domain_info(info);
4378
4379         spin_lock_irqsave(&iommu->lock, flags);
4380         domain_detach_iommu(domain, iommu);
4381         spin_unlock_irqrestore(&iommu->lock, flags);
4382
4383         free_devinfo_mem(info);
4384 }
4385
4386 static void dmar_remove_one_dev_info(struct device *dev)
4387 {
4388         struct device_domain_info *info;
4389         unsigned long flags;
4390
4391         spin_lock_irqsave(&device_domain_lock, flags);
4392         info = get_domain_info(dev);
4393         if (info)
4394                 __dmar_remove_one_dev_info(info);
4395         spin_unlock_irqrestore(&device_domain_lock, flags);
4396 }
4397
4398 static int md_domain_init(struct dmar_domain *domain, int guest_width)
4399 {
4400         int adjust_width;
4401
4402         /* calculate AGAW */
4403         domain->gaw = guest_width;
4404         adjust_width = guestwidth_to_adjustwidth(guest_width);
4405         domain->agaw = width_to_agaw(adjust_width);
4406
4407         domain->iommu_coherency = 0;
4408         domain->iommu_snooping = 0;
4409         domain->iommu_superpage = 0;
4410         domain->max_addr = 0;
4411
4412         /* always allocate the top pgd */
4413         domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
4414         if (!domain->pgd)
4415                 return -ENOMEM;
4416         domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
4417         return 0;
4418 }
4419
4420 static struct iommu_domain *intel_iommu_domain_alloc(unsigned type)
4421 {
4422         struct dmar_domain *dmar_domain;
4423         struct iommu_domain *domain;
4424
4425         switch (type) {
4426         case IOMMU_DOMAIN_DMA:
4427         case IOMMU_DOMAIN_UNMANAGED:
4428                 dmar_domain = alloc_domain(0);
4429                 if (!dmar_domain) {
4430                         pr_err("Can't allocate dmar_domain\n");
4431                         return NULL;
4432                 }
4433                 if (md_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
4434                         pr_err("Domain initialization failed\n");
4435                         domain_exit(dmar_domain);
4436                         return NULL;
4437                 }
4438
4439                 if (type == IOMMU_DOMAIN_DMA &&
4440                     iommu_get_dma_cookie(&dmar_domain->domain))
4441                         return NULL;
4442
4443                 domain = &dmar_domain->domain;
4444                 domain->geometry.aperture_start = 0;
4445                 domain->geometry.aperture_end   =
4446                                 __DOMAIN_MAX_ADDR(dmar_domain->gaw);
4447                 domain->geometry.force_aperture = true;
4448
4449                 return domain;
4450         case IOMMU_DOMAIN_IDENTITY:
4451                 return &si_domain->domain;
4452         default:
4453                 return NULL;
4454         }
4455
4456         return NULL;
4457 }
4458
4459 static void intel_iommu_domain_free(struct iommu_domain *domain)
4460 {
4461         if (domain != &si_domain->domain)
4462                 domain_exit(to_dmar_domain(domain));
4463 }
4464
4465 /*
4466  * Check whether a @domain could be attached to the @dev through the
4467  * aux-domain attach/detach APIs.
4468  */
4469 static inline bool
4470 is_aux_domain(struct device *dev, struct iommu_domain *domain)
4471 {
4472         struct device_domain_info *info = get_domain_info(dev);
4473
4474         return info && info->auxd_enabled &&
4475                         domain->type == IOMMU_DOMAIN_UNMANAGED;
4476 }
4477
4478 static void auxiliary_link_device(struct dmar_domain *domain,
4479                                   struct device *dev)
4480 {
4481         struct device_domain_info *info = get_domain_info(dev);
4482
4483         assert_spin_locked(&device_domain_lock);
4484         if (WARN_ON(!info))
4485                 return;
4486
4487         domain->auxd_refcnt++;
4488         list_add(&domain->auxd, &info->auxiliary_domains);
4489 }
4490
4491 static void auxiliary_unlink_device(struct dmar_domain *domain,
4492                                     struct device *dev)
4493 {
4494         struct device_domain_info *info = get_domain_info(dev);
4495
4496         assert_spin_locked(&device_domain_lock);
4497         if (WARN_ON(!info))
4498                 return;
4499
4500         list_del(&domain->auxd);
4501         domain->auxd_refcnt--;
4502
4503         if (!domain->auxd_refcnt && domain->default_pasid > 0)
4504                 ioasid_put(domain->default_pasid);
4505 }
4506
4507 static int aux_domain_add_dev(struct dmar_domain *domain,
4508                               struct device *dev)
4509 {
4510         int ret;
4511         unsigned long flags;
4512         struct intel_iommu *iommu;
4513
4514         iommu = device_to_iommu(dev, NULL, NULL);
4515         if (!iommu)
4516                 return -ENODEV;
4517
4518         if (domain->default_pasid <= 0) {
4519                 u32 pasid;
4520
4521                 /* No private data needed for the default pasid */
4522                 pasid = ioasid_alloc(NULL, PASID_MIN,
4523                                      pci_max_pasids(to_pci_dev(dev)) - 1,
4524                                      NULL);
4525                 if (pasid == INVALID_IOASID) {
4526                         pr_err("Can't allocate default pasid\n");
4527                         return -ENODEV;
4528                 }
4529                 domain->default_pasid = pasid;
4530         }
4531
4532         spin_lock_irqsave(&device_domain_lock, flags);
4533         /*
4534          * iommu->lock must be held to attach the domain to the iommu and to
4535          * set up the PASID entry for second-level translation.
4536          */
4537         spin_lock(&iommu->lock);
4538         ret = domain_attach_iommu(domain, iommu);
4539         if (ret)
4540                 goto attach_failed;
4541
4542         /* Set up the PASID entry for mediated devices: */
4543         if (domain_use_first_level(domain))
4544                 ret = domain_setup_first_level(iommu, domain, dev,
4545                                                domain->default_pasid);
4546         else
4547                 ret = intel_pasid_setup_second_level(iommu, domain, dev,
4548                                                      domain->default_pasid);
4549         if (ret)
4550                 goto table_failed;
4551         spin_unlock(&iommu->lock);
4552
4553         auxiliary_link_device(domain, dev);
4554
4555         spin_unlock_irqrestore(&device_domain_lock, flags);
4556
4557         return 0;
4558
4559 table_failed:
4560         domain_detach_iommu(domain, iommu);
4561 attach_failed:
4562         spin_unlock(&iommu->lock);
4563         spin_unlock_irqrestore(&device_domain_lock, flags);
4564         if (!domain->auxd_refcnt && domain->default_pasid > 0)
4565                 ioasid_put(domain->default_pasid);
4566
4567         return ret;
4568 }
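
/*
 * Lock ordering used above and in aux_domain_remove_dev() below:
 * device_domain_lock is taken first with interrupts disabled, and
 * iommu->lock is nested inside it:
 *
 *	spin_lock_irqsave(&device_domain_lock, flags);
 *	spin_lock(&iommu->lock);
 *	... domain-id allocation and PASID table updates ...
 *	spin_unlock(&iommu->lock);
 *	spin_unlock_irqrestore(&device_domain_lock, flags);
 */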
4569
4570 static void aux_domain_remove_dev(struct dmar_domain *domain,
4571                                   struct device *dev)
4572 {
4573         struct device_domain_info *info;
4574         struct intel_iommu *iommu;
4575         unsigned long flags;
4576
4577         if (!is_aux_domain(dev, &domain->domain))
4578                 return;
4579
4580         spin_lock_irqsave(&device_domain_lock, flags);
4581         info = get_domain_info(dev);
4582         iommu = info->iommu;
4583
4584         auxiliary_unlink_device(domain, dev);
4585
4586         spin_lock(&iommu->lock);
4587         intel_pasid_tear_down_entry(iommu, dev, domain->default_pasid, false);
4588         domain_detach_iommu(domain, iommu);
4589         spin_unlock(&iommu->lock);
4590
4591         spin_unlock_irqrestore(&device_domain_lock, flags);
4592 }
4593
4594 static int prepare_domain_attach_device(struct iommu_domain *domain,
4595                                         struct device *dev)
4596 {
4597         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4598         struct intel_iommu *iommu;
4599         int addr_width;
4600
4601         iommu = device_to_iommu(dev, NULL, NULL);
4602         if (!iommu)
4603                 return -ENODEV;
4604
4605         /* check if this iommu agaw is sufficient for max mapped address */
4606         addr_width = agaw_to_width(iommu->agaw);
4607         if (addr_width > cap_mgaw(iommu->cap))
4608                 addr_width = cap_mgaw(iommu->cap);
4609
4610         if (dmar_domain->max_addr > (1LL << addr_width)) {
4611                 dev_err(dev, "%s: iommu width (%d) is not "
4612                         "sufficient for the mapped address (%llx)\n",
4613                         __func__, addr_width, dmar_domain->max_addr);
4614                 return -EFAULT;
4615         }
4616         dmar_domain->gaw = addr_width;
4617
4618         /*
4619          * Knock out extra levels of page tables if necessary
4620          */
4621         while (iommu->agaw < dmar_domain->agaw) {
4622                 struct dma_pte *pte;
4623
4624                 pte = dmar_domain->pgd;
4625                 if (dma_pte_present(pte)) {
4626                         dmar_domain->pgd = (struct dma_pte *)
4627                                 phys_to_virt(dma_pte_addr(pte));
4628                         free_pgtable_page(pte);
4629                 }
4630                 dmar_domain->agaw--;
4631         }
4632
4633         return 0;
4634 }
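
/*
 * Example of the level trimming above (assuming the 30 + 9 * agaw width
 * encoding): an UNMANAGED domain starts out with the 57-bit default
 * width, i.e. agaw 3 and a 5-level table. When it is attached to an
 * IOMMU that only supports agaw 2 (48 bits, 4-level), the unused top
 * level is freed and dmar_domain->agaw drops to 2 so that the software
 * table depth matches what the hardware can walk.
 */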
4635
4636 static int intel_iommu_attach_device(struct iommu_domain *domain,
4637                                      struct device *dev)
4638 {
4639         int ret;
4640
4641         if (domain->type == IOMMU_DOMAIN_UNMANAGED &&
4642             device_is_rmrr_locked(dev)) {
4643                 dev_warn(dev, "Device is ineligible for IOMMU domain attach due to platform RMRR requirement.  Contact your platform vendor.\n");
4644                 return -EPERM;
4645         }
4646
4647         if (is_aux_domain(dev, domain))
4648                 return -EPERM;
4649
4650         /* normally dev is not mapped */
4651         if (unlikely(domain_context_mapped(dev))) {
4652                 struct dmar_domain *old_domain;
4653
4654                 old_domain = find_domain(dev);
4655                 if (old_domain)
4656                         dmar_remove_one_dev_info(dev);
4657         }
4658
4659         ret = prepare_domain_attach_device(domain, dev);
4660         if (ret)
4661                 return ret;
4662
4663         return domain_add_dev_info(to_dmar_domain(domain), dev);
4664 }
4665
4666 static int intel_iommu_aux_attach_device(struct iommu_domain *domain,
4667                                          struct device *dev)
4668 {
4669         int ret;
4670
4671         if (!is_aux_domain(dev, domain))
4672                 return -EPERM;
4673
4674         ret = prepare_domain_attach_device(domain, dev);
4675         if (ret)
4676                 return ret;
4677
4678         return aux_domain_add_dev(to_dmar_domain(domain), dev);
4679 }
4680
4681 static void intel_iommu_detach_device(struct iommu_domain *domain,
4682                                       struct device *dev)
4683 {
4684         dmar_remove_one_dev_info(dev);
4685 }
4686
4687 static void intel_iommu_aux_detach_device(struct iommu_domain *domain,
4688                                           struct device *dev)
4689 {
4690         aux_domain_remove_dev(to_dmar_domain(domain), dev);
4691 }
4692
4693 #ifdef CONFIG_INTEL_IOMMU_SVM
4694 /*
4695  * 2D array for converting and sanitizing IOMMU generic TLB granularity to
4696  * VT-d granularity. Invalidation is typically included in the unmap operation
4697  * as a result of DMA or VFIO unmap. However, for assigned devices the guest
4698  * owns the first-level page tables. Invalidations of translation caches in the
4699  * guest are trapped and passed down to the host.
4700  *
4701  * vIOMMU in the guest will only expose first level page tables, therefore
4702  * we do not support IOTLB granularity for requests without PASID (second level).
4703  *
4704  * For example, to find the VT-d granularity encoding for IOTLB
4705  * type and page selective granularity within PASID:
4706  * X: indexed by iommu cache type
4707  * Y: indexed by enum iommu_inv_granularity
4708  * [IOMMU_CACHE_INV_TYPE_IOTLB][IOMMU_INV_GRANU_ADDR]
4709  */
4710
4711 static const int
4712 inv_type_granu_table[IOMMU_CACHE_INV_TYPE_NR][IOMMU_INV_GRANU_NR] = {
4713         /*
4714          * PASID based IOTLB invalidation: PASID selective (per PASID),
4715          * page selective (address granularity)
4716          */
4717         {-EINVAL, QI_GRAN_NONG_PASID, QI_GRAN_PSI_PASID},
4718         /* PASID based dev TLBs */
4719         {-EINVAL, -EINVAL, QI_DEV_IOTLB_GRAN_PASID_SEL},
4720         /* PASID cache */
4721         {-EINVAL, -EINVAL, -EINVAL}
4722 };
4723
4724 static inline int to_vtd_granularity(int type, int granu)
4725 {
4726         return inv_type_granu_table[type][granu];
4727 }
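
/*
 * For example, an address-selective IOTLB invalidation from the guest
 * maps as:
 *
 *	to_vtd_granularity(IOMMU_CACHE_INV_TYPE_IOTLB, IOMMU_INV_GRANU_ADDR)
 *		== QI_GRAN_PSI_PASID
 *
 * while [IOMMU_CACHE_INV_TYPE_DEV_IOTLB][IOMMU_INV_GRANU_ADDR] yields
 * QI_DEV_IOTLB_GRAN_PASID_SEL, and any unsupported combination returns
 * -EINVAL.
 */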
4728
4729 static inline u64 to_vtd_size(u64 granu_size, u64 nr_granules)
4730 {
4731         u64 nr_pages = (granu_size * nr_granules) >> VTD_PAGE_SHIFT;
4732
4733         /* VT-d encodes the size as 2^size 4KiB pages: 0 for 4KiB, 9 for 2MiB, etc.
4734          * The IOMMU cache invalidate API passes granu_size in bytes and the
4735          * number of contiguous granules of that size.
4736          */
4737         return order_base_2(nr_pages);
4738 }
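
/*
 * Worked example: granu_size = 4KiB and nr_granules = 512 describe a
 * 2MiB range; nr_pages = (4096 * 512) >> 12 = 512 and
 * order_base_2(512) = 9, matching the "9 for 2MB" encoding noted above.
 */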
4739
4740 static int
4741 intel_iommu_sva_invalidate(struct iommu_domain *domain, struct device *dev,
4742                            struct iommu_cache_invalidate_info *inv_info)
4743 {
4744         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4745         struct device_domain_info *info;
4746         struct intel_iommu *iommu;
4747         unsigned long flags;
4748         int cache_type;
4749         u8 bus, devfn;
4750         u16 did, sid;
4751         int ret = 0;
4752         u64 size = 0;
4753
4754         if (!inv_info || !dmar_domain)
4755                 return -EINVAL;
4756
4757         if (!dev || !dev_is_pci(dev))
4758                 return -ENODEV;
4759
4760         iommu = device_to_iommu(dev, &bus, &devfn);
4761         if (!iommu)
4762                 return -ENODEV;
4763
4764         if (!(dmar_domain->flags & DOMAIN_FLAG_NESTING_MODE))
4765                 return -EINVAL;
4766
4767         spin_lock_irqsave(&device_domain_lock, flags);
4768         spin_lock(&iommu->lock);
4769         info = get_domain_info(dev);
4770         if (!info) {
4771                 ret = -EINVAL;
4772                 goto out_unlock;
4773         }
4774         did = dmar_domain->iommu_did[iommu->seq_id];
4775         sid = PCI_DEVID(bus, devfn);
4776
4777         /* Size is only valid in address selective invalidation */
4778         if (inv_info->granularity == IOMMU_INV_GRANU_ADDR)
4779                 size = to_vtd_size(inv_info->granu.addr_info.granule_size,
4780                                    inv_info->granu.addr_info.nb_granules);
4781
4782         for_each_set_bit(cache_type,
4783                          (unsigned long *)&inv_info->cache,
4784                          IOMMU_CACHE_INV_TYPE_NR) {
4785                 int granu = 0;
4786                 u64 pasid = 0;
4787                 u64 addr = 0;
4788
4789                 granu = to_vtd_granularity(cache_type, inv_info->granularity);
4790                 if (granu == -EINVAL) {
4791                         pr_err_ratelimited("Invalid cache type and granu combination %d/%d\n",
4792                                            cache_type, inv_info->granularity);
4793                         break;
4794                 }
4795
4796                 /*
4797                  * PASID is stored in different locations based on the
4798                  * granularity.
4799                  */
4800                 if (inv_info->granularity == IOMMU_INV_GRANU_PASID &&
4801                     (inv_info->granu.pasid_info.flags & IOMMU_INV_PASID_FLAGS_PASID))
4802                         pasid = inv_info->granu.pasid_info.pasid;
4803                 else if (inv_info->granularity == IOMMU_INV_GRANU_ADDR &&
4804                          (inv_info->granu.addr_info.flags & IOMMU_INV_ADDR_FLAGS_PASID))
4805                         pasid = inv_info->granu.addr_info.pasid;
4806
4807                 switch (BIT(cache_type)) {
4808                 case IOMMU_CACHE_INV_TYPE_IOTLB:
4809                         /* HW ignores the low-order bits covered by the address mask */
4810                         if (inv_info->granularity == IOMMU_INV_GRANU_ADDR &&
4811                             size &&
4812                             (inv_info->granu.addr_info.addr & ((BIT(VTD_PAGE_SHIFT + size)) - 1))) {
4813                                 pr_err_ratelimited("User address not aligned, 0x%llx, size order %llu\n",
4814                                                    inv_info->granu.addr_info.addr, size);
4815                         }
4816
4817                         /*
4818                          * If granu is PASID-selective, address is ignored.
4819                          * We use npages = -1 to indicate that.
4820                          */
4821                         qi_flush_piotlb(iommu, did, pasid,
4822                                         mm_to_dma_pfn(inv_info->granu.addr_info.addr),
4823                                         (granu == QI_GRAN_NONG_PASID) ? -1 : 1 << size,
4824                                         inv_info->granu.addr_info.flags & IOMMU_INV_ADDR_FLAGS_LEAF);
4825
4826                         if (!info->ats_enabled)
4827                                 break;
4828                         /*
4829                          * Always flush the device IOTLB if ATS is enabled. The vIOMMU
4830                          * in the guest may assume that an IOTLB flush also covers the
4831                          * device IOTLB, which is more efficient.
4832                          */
4833                         fallthrough;
4834                 case IOMMU_CACHE_INV_TYPE_DEV_IOTLB:
4835                         /*
4836                          * PASID-based device TLB invalidation does not support
4837                          * IOMMU_INV_GRANU_PASID granularity; it only supports
4838                          * IOMMU_INV_GRANU_ADDR. The equivalent is to set the size
4839                          * to cover the entire 64-bit address range. The user only
4840                          * provides PASID info without address info, so addr is set
4841                          * to 0.
4842                          */
4843                         if (inv_info->granularity == IOMMU_INV_GRANU_PASID) {
4844                                 size = 64 - VTD_PAGE_SHIFT;
4845                                 addr = 0;
4846                         } else if (inv_info->granularity == IOMMU_INV_GRANU_ADDR) {
4847                                 addr = inv_info->granu.addr_info.addr;
4848                         }
4849
4850                         if (info->ats_enabled)
4851                                 qi_flush_dev_iotlb_pasid(iommu, sid,
4852                                                 info->pfsid, pasid,
4853                                                 info->ats_qdep, addr,
4854                                                 size);
4855                         else
4856                                 pr_warn_ratelimited("Passdown device IOTLB flush w/o ATS!\n");
4857                         break;
4858                 default:
4859                         dev_err_ratelimited(dev, "Unsupported IOMMU invalidation type %d\n",
4860                                             cache_type);
4861                         ret = -EINVAL;
4862                 }
4863         }
4864 out_unlock:
4865         spin_unlock(&iommu->lock);
4866         spin_unlock_irqrestore(&device_domain_lock, flags);
4867
4868         return ret;
4869 }
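
/*
 * A rough sketch of the invalidation descriptor a caller (e.g. VFIO
 * relaying a guest IOTLB flush) might hand to the callback above. Only
 * the fields consumed by intel_iommu_sva_invalidate() are shown; the
 * remaining UAPI fields and versioning are omitted, and the PASID,
 * address, granule count, domain and dev are made-up placeholders:
 */
#if 0
	struct iommu_cache_invalidate_info inv_info = {
		.cache		= IOMMU_CACHE_INV_TYPE_IOTLB,
		.granularity	= IOMMU_INV_GRANU_ADDR,
		.granu.addr_info = {
			.flags		= IOMMU_INV_ADDR_FLAGS_PASID,
			.pasid		= 1,
			.addr		= 0x100000,
			.granule_size	= SZ_4K,
			.nb_granules	= 16,
		},
	};

	int ret = intel_iommu_sva_invalidate(domain, dev, &inv_info);
#endif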
4870 #endif
4871
4872 static int intel_iommu_map(struct iommu_domain *domain,
4873                            unsigned long iova, phys_addr_t hpa,
4874                            size_t size, int iommu_prot, gfp_t gfp)
4875 {
4876         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4877         u64 max_addr;
4878         int prot = 0;
4879         int ret;
4880
4881         if (iommu_prot & IOMMU_READ)
4882                 prot |= DMA_PTE_READ;
4883         if (iommu_prot & IOMMU_WRITE)
4884                 prot |= DMA_PTE_WRITE;
4885         if ((iommu_prot & IOMMU_CACHE) && dmar_domain->iommu_snooping)
4886                 prot |= DMA_PTE_SNP;
4887
4888         max_addr = iova + size;
4889         if (dmar_domain->max_addr < max_addr) {
4890                 u64 end;
4891
4892                 /* check if minimum agaw is sufficient for mapped address */
4893                 end = __DOMAIN_MAX_ADDR(dmar_domain->gaw) + 1;
4894                 if (end < max_addr) {
4895                         pr_err("%s: iommu width (%d) is not "
4896                                "sufficient for the mapped address (%llx)\n",
4897                                __func__, dmar_domain->gaw, max_addr);
4898                         return -EFAULT;
4899                 }
4900                 dmar_domain->max_addr = max_addr;
4901         }
4902         /* Round up size to next multiple of PAGE_SIZE, if it and
4903            the low bits of hpa would take us onto the next page */
4904         size = aligned_nrpages(hpa, size);
4905         ret = domain_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT,
4906                              hpa >> VTD_PAGE_SHIFT, size, prot);
4907         return ret;
4908 }
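
/*
 * Example of the rounding above: size = 0x2000 at hpa = 0x1234 touches
 * three 4KiB pages (0x1000, 0x2000 and 0x3000), so aligned_nrpages()
 * returns 3 rather than 2 and the tail page is mapped as well.
 */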
4909
4910 static size_t intel_iommu_unmap(struct iommu_domain *domain,
4911                                 unsigned long iova, size_t size,
4912                                 struct iommu_iotlb_gather *gather)
4913 {
4914         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4915         unsigned long start_pfn, last_pfn;
4916         int level = 0;
4917
4918         /* Cope with horrid API which requires us to unmap more than the
4919            size argument if it happens to be a large-page mapping. */
4920         BUG_ON(!pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level));
4921
4922         if (size < VTD_PAGE_SIZE << level_to_offset_bits(level))
4923                 size = VTD_PAGE_SIZE << level_to_offset_bits(level);
4924
4925         start_pfn = iova >> VTD_PAGE_SHIFT;
4926         last_pfn = (iova + size - 1) >> VTD_PAGE_SHIFT;
4927
4928         gather->freelist = domain_unmap(dmar_domain, start_pfn,
4929                                         last_pfn, gather->freelist);
4930
4931         if (dmar_domain->max_addr == iova + size)
4932                 dmar_domain->max_addr = iova;
4933
4934         iommu_iotlb_gather_add_page(domain, gather, iova, size);
4935
4936         return size;
4937 }
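
/*
 * Example of the size adjustment above (assuming the usual 9-bit stride
 * per level): if the IOVA is covered by a 2MiB superpage PTE (level 2),
 * a 4KiB unmap request is widened to
 * VTD_PAGE_SIZE << level_to_offset_bits(2) = 2MiB and the whole
 * superpage is torn down.
 */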
4938
4939 static void intel_iommu_tlb_sync(struct iommu_domain *domain,
4940                                  struct iommu_iotlb_gather *gather)
4941 {
4942         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4943         unsigned long iova_pfn = IOVA_PFN(gather->start);
4944         size_t size = gather->end - gather->start;
4945         unsigned long start_pfn;
4946         unsigned long nrpages;
4947         int iommu_id;
4948
4949         nrpages = aligned_nrpages(gather->start, size);
4950         start_pfn = mm_to_dma_pfn(iova_pfn);
4951
4952         for_each_domain_iommu(iommu_id, dmar_domain)
4953                 iommu_flush_iotlb_psi(g_iommus[iommu_id], dmar_domain,
4954                                       start_pfn, nrpages, !gather->freelist, 0);
4955
4956         dma_free_pagelist(gather->freelist);
4957 }
4958
4959 static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
4960                                             dma_addr_t iova)
4961 {
4962         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4963         struct dma_pte *pte;
4964         int level = 0;
4965         u64 phys = 0;
4966
4967         pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level);
4968         if (pte && dma_pte_present(pte))
4969                 phys = dma_pte_addr(pte) +
4970                         (iova & (BIT_MASK(level_to_offset_bits(level) +
4971                                                 VTD_PAGE_SHIFT) - 1));
4972
4973         return phys;
4974 }
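
/*
 * Example (assuming the usual 9-bit stride per level): for a 2MiB
 * superpage PTE (level 2), the low
 * level_to_offset_bits(2) + VTD_PAGE_SHIFT = 9 + 12 = 21 bits of the
 * IOVA are kept as the page offset and added to dma_pte_addr(pte).
 */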
4975
4976 static inline bool scalable_mode_support(void)
4977 {
4978         struct dmar_drhd_unit *drhd;
4979         struct intel_iommu *iommu;
4980         bool ret = true;
4981
4982         rcu_read_lock();
4983         for_each_active_iommu(iommu, drhd) {
4984                 if (!sm_supported(iommu)) {
4985                         ret = false;
4986                         break;
4987                 }
4988         }
4989         rcu_read_unlock();
4990
4991         return ret;
4992 }
4993
4994 static inline bool iommu_pasid_support(void)
4995 {
4996         struct dmar_drhd_unit *drhd;
4997         struct intel_iommu *iommu;
4998         bool ret = true;
4999
5000         rcu_read_lock();
5001         for_each_active_iommu(iommu, drhd) {
5002                 if (!pasid_supported(iommu)) {
5003                         ret = false;
5004                         break;
5005                 }
5006         }
5007         rcu_read_unlock();
5008
5009         return ret;
5010 }
5011
5012 static inline bool nested_mode_support(void)
5013 {
5014         struct dmar_drhd_unit *drhd;
5015         struct intel_iommu *iommu;
5016         bool ret = true;
5017
5018         rcu_read_lock();
5019         for_each_active_iommu(iommu, drhd) {
5020                 if (!sm_supported(iommu) || !ecap_nest(iommu->ecap)) {
5021                         ret = false;
5022                         break;
5023                 }
5024         }
5025         rcu_read_unlock();
5026
5027         return ret;
5028 }
5029
5030 static bool intel_iommu_capable(enum iommu_cap cap)
5031 {
5032         if (cap == IOMMU_CAP_CACHE_COHERENCY)
5033                 return domain_update_iommu_snooping(NULL) == 1;
5034         if (cap == IOMMU_CAP_INTR_REMAP)
5035                 return irq_remapping_enabled == 1;
5036
5037         return false;
5038 }
5039
5040 static struct iommu_device *intel_iommu_probe_device(struct device *dev)
5041 {
5042         struct intel_iommu *iommu;
5043
5044         iommu = device_to_iommu(dev, NULL, NULL);
5045         if (!iommu)
5046                 return ERR_PTR(-ENODEV);
5047
5048         if (translation_pre_enabled(iommu))
5049                 dev_iommu_priv_set(dev, DEFER_DEVICE_DOMAIN_INFO);
5050
5051         return &iommu->iommu;
5052 }
5053
5054 static void intel_iommu_release_device(struct device *dev)
5055 {
5056         struct intel_iommu *iommu;
5057
5058         iommu = device_to_iommu(dev, NULL, NULL);
5059         if (!iommu)
5060                 return;
5061
5062         dmar_remove_one_dev_info(dev);
5063
5064         set_dma_ops(dev, NULL);
5065 }
5066
5067 static void intel_iommu_probe_finalize(struct device *dev)
5068 {
5069         dma_addr_t base = IOVA_START_PFN << VTD_PAGE_SHIFT;
5070         struct iommu_domain *domain = iommu_get_domain_for_dev(dev);
5071         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5072
5073         if (domain && domain->type == IOMMU_DOMAIN_DMA)
5074                 iommu_setup_dma_ops(dev, base,
5075                                     __DOMAIN_MAX_ADDR(dmar_domain->gaw) - base);
5076         else
5077                 set_dma_ops(dev, NULL);
5078 }
5079
5080 static void intel_iommu_get_resv_regions(struct device *device,
5081                                          struct list_head *head)
5082 {
5083         int prot = DMA_PTE_READ | DMA_PTE_WRITE;
5084         struct iommu_resv_region *reg;
5085         struct dmar_rmrr_unit *rmrr;
5086         struct device *i_dev;
5087         int i;
5088
5089         down_read(&dmar_global_lock);
5090         for_each_rmrr_units(rmrr) {
5091                 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
5092                                           i, i_dev) {
5093                         struct iommu_resv_region *resv;
5094                         enum iommu_resv_type type;
5095                         size_t length;
5096
5097                         if (i_dev != device &&
5098                             !is_downstream_to_pci_bridge(device, i_dev))
5099                                 continue;
5100
5101                         length = rmrr->end_address - rmrr->base_address + 1;
5102
5103                         type = device_rmrr_is_relaxable(device) ?
5104                                 IOMMU_RESV_DIRECT_RELAXABLE : IOMMU_RESV_DIRECT;
5105
5106                         resv = iommu_alloc_resv_region(rmrr->base_address,
5107                                                        length, prot, type);
5108                         if (!resv)
5109                                 break;
5110
5111                         list_add_tail(&resv->list, head);
5112                 }
5113         }
5114         up_read(&dmar_global_lock);
5115
5116 #ifdef CONFIG_INTEL_IOMMU_FLOPPY_WA
5117         if (dev_is_pci(device)) {
5118                 struct pci_dev *pdev = to_pci_dev(device);
5119
5120                 if ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA) {
5121                         reg = iommu_alloc_resv_region(0, 1UL << 24, prot,
5122                                                    IOMMU_RESV_DIRECT_RELAXABLE);
5123                         if (reg)
5124                                 list_add_tail(&reg->list, head);
5125                 }
5126         }
5127 #endif /* CONFIG_INTEL_IOMMU_FLOPPY_WA */
5128
5129         reg = iommu_alloc_resv_region(IOAPIC_RANGE_START,
5130                                       IOAPIC_RANGE_END - IOAPIC_RANGE_START + 1,
5131                                       0, IOMMU_RESV_MSI);
5132         if (!reg)
5133                 return;
5134         list_add_tail(&reg->list, head);
5135 }
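
/*
 * A minimal sketch of how the IOMMU core consumes this callback
 * (assumed caller, for illustration only; dev is a placeholder):
 */
#if 0
	LIST_HEAD(resv_regions);
	struct iommu_resv_region *region;

	iommu_get_resv_regions(dev, &resv_regions);
	list_for_each_entry(region, &resv_regions, list)
		pr_info("resv region %pa + 0x%zx, type %d\n",
			&region->start, region->length, region->type);
	iommu_put_resv_regions(dev, &resv_regions);
#endif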
5136
5137 int intel_iommu_enable_pasid(struct intel_iommu *iommu, struct device *dev)
5138 {
5139         struct device_domain_info *info;
5140         struct context_entry *context;
5141         struct dmar_domain *domain;
5142         unsigned long flags;
5143         u64 ctx_lo;
5144         int ret;
5145
5146         domain = find_domain(dev);
5147         if (!domain)
5148                 return -EINVAL;
5149
5150         spin_lock_irqsave(&device_domain_lock, flags);
5151         spin_lock(&iommu->lock);
5152
5153         ret = -EINVAL;
5154         info = get_domain_info(dev);
5155         if (!info || !info->pasid_supported)
5156                 goto out;
5157
5158         context = iommu_context_addr(iommu, info->bus, info->devfn, 0);
5159         if (WARN_ON(!context))
5160                 goto out;
5161
5162         ctx_lo = context[0].lo;
5163
5164         if (!(ctx_lo & CONTEXT_PASIDE)) {
5165                 ctx_lo |= CONTEXT_PASIDE;
5166                 context[0].lo = ctx_lo;
5167                 wmb();
5168                 iommu->flush.flush_context(iommu,
5169                                            domain->iommu_did[iommu->seq_id],
5170                                            PCI_DEVID(info->bus, info->devfn),
5171                                            DMA_CCMD_MASK_NOBIT,
5172                                            DMA_CCMD_DEVICE_INVL);
5173         }
5174
5175         /* Enable PASID support in the device, if it wasn't already */
5176         if (!info->pasid_enabled)
5177                 iommu_enable_dev_iotlb(info);
5178
5179         ret = 0;
5180
5181  out:
5182         spin_unlock(&iommu->lock);
5183         spin_unlock_irqrestore(&device_domain_lock, flags);
5184
5185         return ret;
5186 }
5187
5188 static struct iommu_group *intel_iommu_device_group(struct device *dev)
5189 {
5190         if (dev_is_pci(dev))
5191                 return pci_device_group(dev);
5192         return generic_device_group(dev);
5193 }
5194
5195 static int intel_iommu_enable_auxd(struct device *dev)
5196 {
5197         struct device_domain_info *info;
5198         struct intel_iommu *iommu;
5199         unsigned long flags;
5200         int ret;
5201
5202         iommu = device_to_iommu(dev, NULL, NULL);
5203         if (!iommu || dmar_disabled)
5204                 return -EINVAL;
5205
5206         if (!sm_supported(iommu) || !pasid_supported(iommu))
5207                 return -EINVAL;
5208
5209         ret = intel_iommu_enable_pasid(iommu, dev);
5210         if (ret)
5211                 return -ENODEV;
5212
5213         spin_lock_irqsave(&device_domain_lock, flags);
5214         info = get_domain_info(dev);
5215         info->auxd_enabled = 1;
5216         spin_unlock_irqrestore(&device_domain_lock, flags);
5217
5218         return 0;
5219 }
5220
5221 static int intel_iommu_disable_auxd(struct device *dev)
5222 {
5223         struct device_domain_info *info;
5224         unsigned long flags;
5225
5226         spin_lock_irqsave(&device_domain_lock, flags);
5227         info = get_domain_info(dev);
5228         if (!WARN_ON(!info))
5229                 info->auxd_enabled = 0;
5230         spin_unlock_irqrestore(&device_domain_lock, flags);
5231
5232         return 0;
5233 }
5234
5235 /*
5236  * A PCI Express Designated Vendor-Specific Extended Capability is defined
5237  * in section 3.7 of the Intel Scalable I/O Virtualization technical spec
5238  * so that system software and tools can detect endpoint devices supporting
5239  * Intel Scalable I/O Virtualization without a host driver dependency.
5240  *
5241  * Returns the address of the matching extended capability structure within
5242  * the device's PCI configuration space or 0 if the device does not support
5243  * it.
5244  */
5245 static int siov_find_pci_dvsec(struct pci_dev *pdev)
5246 {
5247         int pos;
5248         u16 vendor, id;
5249
5250         pos = pci_find_next_ext_capability(pdev, 0, 0x23);
5251         while (pos) {
5252                 pci_read_config_word(pdev, pos + 4, &vendor);
5253                 pci_read_config_word(pdev, pos + 8, &id);
5254                 if (vendor == PCI_VENDOR_ID_INTEL && id == 5)
5255                         return pos;
5256
5257                 pos = pci_find_next_ext_capability(pdev, pos, 0x23);
5258         }
5259
5260         return 0;
5261 }
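
/*
 * Offsets assumed above, per the PCIe DVSEC definition: 0x23 is the
 * Designated Vendor-Specific Extended Capability ID, the word at
 * pos + 4 holds the DVSEC vendor ID and the word at pos + 8 holds the
 * DVSEC ID, which the Scalable IOV spec assigns the value 5.
 */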
5262
5263 static bool
5264 intel_iommu_dev_has_feat(struct device *dev, enum iommu_dev_features feat)
5265 {
5266         if (feat == IOMMU_DEV_FEAT_AUX) {
5267                 int ret;
5268
5269                 if (!dev_is_pci(dev) || dmar_disabled ||
5270                     !scalable_mode_support() || !iommu_pasid_support())
5271                         return false;
5272
5273                 ret = pci_pasid_features(to_pci_dev(dev));
5274                 if (ret < 0)
5275                         return false;
5276
5277                 return !!siov_find_pci_dvsec(to_pci_dev(dev));
5278         }
5279
5280         if (feat == IOMMU_DEV_FEAT_SVA) {
5281                 struct device_domain_info *info = get_domain_info(dev);
5282
5283                 return info && (info->iommu->flags & VTD_FLAG_SVM_CAPABLE) &&
5284                         info->pasid_supported && info->pri_supported &&
5285                         info->ats_supported;
5286         }
5287
5288         return false;
5289 }
5290
5291 static int
5292 intel_iommu_dev_enable_feat(struct device *dev, enum iommu_dev_features feat)
5293 {
5294         if (feat == IOMMU_DEV_FEAT_AUX)
5295                 return intel_iommu_enable_auxd(dev);
5296
5297         if (feat == IOMMU_DEV_FEAT_SVA) {
5298                 struct device_domain_info *info = get_domain_info(dev);
5299
5300                 if (!info)
5301                         return -EINVAL;
5302
5303                 if (info->iommu->flags & VTD_FLAG_SVM_CAPABLE)
5304                         return 0;
5305         }
5306
5307         return -ENODEV;
5308 }
5309
5310 static int
5311 intel_iommu_dev_disable_feat(struct device *dev, enum iommu_dev_features feat)
5312 {
5313         if (feat == IOMMU_DEV_FEAT_AUX)
5314                 return intel_iommu_disable_auxd(dev);
5315
5316         return -ENODEV;
5317 }
5318
5319 static bool
5320 intel_iommu_dev_feat_enabled(struct device *dev, enum iommu_dev_features feat)
5321 {
5322         struct device_domain_info *info = get_domain_info(dev);
5323
5324         if (feat == IOMMU_DEV_FEAT_AUX)
5325                 return scalable_mode_support() && info && info->auxd_enabled;
5326
5327         return false;
5328 }
5329
5330 static int
5331 intel_iommu_aux_get_pasid(struct iommu_domain *domain, struct device *dev)
5332 {
5333         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5334
5335         return dmar_domain->default_pasid > 0 ?
5336                         dmar_domain->default_pasid : -EINVAL;
5337 }
5338
5339 static bool intel_iommu_is_attach_deferred(struct iommu_domain *domain,
5340                                            struct device *dev)
5341 {
5342         return attach_deferred(dev);
5343 }
5344
5345 static int
5346 intel_iommu_domain_set_attr(struct iommu_domain *domain,
5347                             enum iommu_attr attr, void *data)
5348 {
5349         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5350         unsigned long flags;
5351         int ret = 0;
5352
5353         if (domain->type != IOMMU_DOMAIN_UNMANAGED)
5354                 return -EINVAL;
5355
5356         switch (attr) {
5357         case DOMAIN_ATTR_NESTING:
5358                 spin_lock_irqsave(&device_domain_lock, flags);
5359                 if (nested_mode_support() &&
5360                     list_empty(&dmar_domain->devices)) {
5361                         dmar_domain->flags |= DOMAIN_FLAG_NESTING_MODE;
5362                         dmar_domain->flags &= ~DOMAIN_FLAG_USE_FIRST_LEVEL;
5363                 } else {
5364                         ret = -ENODEV;
5365                 }
5366                 spin_unlock_irqrestore(&device_domain_lock, flags);
5367                 break;
5368         default:
5369                 ret = -EINVAL;
5370                 break;
5371         }
5372
5373         return ret;
5374 }
5375
5376 static int
5377 intel_iommu_domain_get_attr(struct iommu_domain *domain,
5378                             enum iommu_attr attr, void *data)
5379 {
5380         switch (domain->type) {
5381         case IOMMU_DOMAIN_UNMANAGED:
5382                 return -ENODEV;
5383         case IOMMU_DOMAIN_DMA:
5384                 switch (attr) {
5385                 case DOMAIN_ATTR_DMA_USE_FLUSH_QUEUE:
5386                         *(int *)data = !intel_iommu_strict;
5387                         return 0;
5388                 default:
5389                         return -ENODEV;
5390                 }
5391                 break;
5392         default:
5393                 return -EINVAL;
5394         }
5395 }
5396
5397 /*
5398  * Check that the device does not live on an external-facing PCI port that is
5399  * marked as untrusted. Such devices should not be able to apply quirks and
5400  * thereby bypass the IOMMU restrictions.
5401  */
5402 static bool risky_device(struct pci_dev *pdev)
5403 {
5404         if (pdev->untrusted) {
5405                 pci_info(pdev,
5406                          "Skipping IOMMU quirk for dev [%04X:%04X] on untrusted PCI link\n",
5407                          pdev->vendor, pdev->device);
5408                 pci_info(pdev, "Please check with your BIOS/Platform vendor about this\n");
5409                 return true;
5410         }
5411         return false;
5412 }
5413
5414 const struct iommu_ops intel_iommu_ops = {
5415         .capable                = intel_iommu_capable,
5416         .domain_alloc           = intel_iommu_domain_alloc,
5417         .domain_free            = intel_iommu_domain_free,
5418         .domain_get_attr        = intel_iommu_domain_get_attr,
5419         .domain_set_attr        = intel_iommu_domain_set_attr,
5420         .attach_dev             = intel_iommu_attach_device,
5421         .detach_dev             = intel_iommu_detach_device,
5422         .aux_attach_dev         = intel_iommu_aux_attach_device,
5423         .aux_detach_dev         = intel_iommu_aux_detach_device,
5424         .aux_get_pasid          = intel_iommu_aux_get_pasid,
5425         .map                    = intel_iommu_map,
5426         .unmap                  = intel_iommu_unmap,
5427         .flush_iotlb_all        = intel_flush_iotlb_all,
5428         .iotlb_sync             = intel_iommu_tlb_sync,
5429         .iova_to_phys           = intel_iommu_iova_to_phys,
5430         .probe_device           = intel_iommu_probe_device,
5431         .probe_finalize         = intel_iommu_probe_finalize,
5432         .release_device         = intel_iommu_release_device,
5433         .get_resv_regions       = intel_iommu_get_resv_regions,
5434         .put_resv_regions       = generic_iommu_put_resv_regions,
5435         .device_group           = intel_iommu_device_group,
5436         .dev_has_feat           = intel_iommu_dev_has_feat,
5437         .dev_feat_enabled       = intel_iommu_dev_feat_enabled,
5438         .dev_enable_feat        = intel_iommu_dev_enable_feat,
5439         .dev_disable_feat       = intel_iommu_dev_disable_feat,
5440         .is_attach_deferred     = intel_iommu_is_attach_deferred,
5441         .def_domain_type        = device_def_domain_type,
5442         .pgsize_bitmap          = INTEL_IOMMU_PGSIZES,
5443 #ifdef CONFIG_INTEL_IOMMU_SVM
5444         .cache_invalidate       = intel_iommu_sva_invalidate,
5445         .sva_bind_gpasid        = intel_svm_bind_gpasid,
5446         .sva_unbind_gpasid      = intel_svm_unbind_gpasid,
5447         .sva_bind               = intel_svm_bind,
5448         .sva_unbind             = intel_svm_unbind,
5449         .sva_get_pasid          = intel_svm_get_pasid,
5450         .page_response          = intel_svm_page_response,
5451 #endif
5452 };
5453
5454 static void quirk_iommu_igfx(struct pci_dev *dev)
5455 {
5456         if (risky_device(dev))
5457                 return;
5458
5459         pci_info(dev, "Disabling IOMMU for graphics on this chipset\n");
5460         dmar_map_gfx = 0;
5461 }
5462
5463 /* G4x/GM45 integrated gfx dmar support is totally busted. */
5464 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_igfx);
5465 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_igfx);
5466 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_igfx);
5467 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_igfx);
5468 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_igfx);
5469 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_igfx);
5470 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_igfx);
5471
5472 /* Broadwell igfx malfunctions with dmar */
5473 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1606, quirk_iommu_igfx);
5474 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160B, quirk_iommu_igfx);
5475 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160E, quirk_iommu_igfx);
5476 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1602, quirk_iommu_igfx);
5477 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160A, quirk_iommu_igfx);
5478 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160D, quirk_iommu_igfx);
5479 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1616, quirk_iommu_igfx);
5480 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161B, quirk_iommu_igfx);
5481 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161E, quirk_iommu_igfx);
5482 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1612, quirk_iommu_igfx);
5483 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161A, quirk_iommu_igfx);
5484 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161D, quirk_iommu_igfx);
5485 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1626, quirk_iommu_igfx);
5486 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162B, quirk_iommu_igfx);
5487 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162E, quirk_iommu_igfx);
5488 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1622, quirk_iommu_igfx);
5489 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162A, quirk_iommu_igfx);
5490 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162D, quirk_iommu_igfx);
5491 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1636, quirk_iommu_igfx);
5492 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163B, quirk_iommu_igfx);
5493 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163E, quirk_iommu_igfx);
5494 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1632, quirk_iommu_igfx);
5495 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163A, quirk_iommu_igfx);
5496 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163D, quirk_iommu_igfx);
5497
5498 static void quirk_iommu_rwbf(struct pci_dev *dev)
5499 {
5500         if (risky_device(dev))
5501                 return;
5502
5503         /*
5504          * Mobile 4 Series Chipset neglects to set RWBF capability,
5505          * but needs it. Same seems to hold for the desktop versions.
5506          */
5507         pci_info(dev, "Forcing write-buffer flush capability\n");
5508         rwbf_quirk = 1;
5509 }
5510
5511 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf);
5512 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_rwbf);
5513 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_rwbf);
5514 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_rwbf);
5515 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_rwbf);
5516 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_rwbf);
5517 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_rwbf);
5518
5519 #define GGC 0x52
5520 #define GGC_MEMORY_SIZE_MASK    (0xf << 8)
5521 #define GGC_MEMORY_SIZE_NONE    (0x0 << 8)
5522 #define GGC_MEMORY_SIZE_1M      (0x1 << 8)
5523 #define GGC_MEMORY_SIZE_2M      (0x3 << 8)
5524 #define GGC_MEMORY_VT_ENABLED   (0x8 << 8)
5525 #define GGC_MEMORY_SIZE_2M_VT   (0x9 << 8)
5526 #define GGC_MEMORY_SIZE_3M_VT   (0xa << 8)
5527 #define GGC_MEMORY_SIZE_4M_VT   (0xb << 8)
5528
5529 static void quirk_calpella_no_shadow_gtt(struct pci_dev *dev)
5530 {
5531         unsigned short ggc;
5532
5533         if (risky_device(dev))
5534                 return;
5535
5536         if (pci_read_config_word(dev, GGC, &ggc))
5537                 return;
5538
5539         if (!(ggc & GGC_MEMORY_VT_ENABLED)) {
5540                 pci_info(dev, "BIOS has allocated no shadow GTT; disabling IOMMU for graphics\n");
5541                 dmar_map_gfx = 0;
5542         } else if (dmar_map_gfx) {
5543                 /* we have to ensure the gfx device is idle before we flush */
5544                 pci_info(dev, "Disabling batched IOTLB flush on Ironlake\n");
5545                 intel_iommu_strict = 1;
5546         }
5547 }
5548 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0040, quirk_calpella_no_shadow_gtt);
5549 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0044, quirk_calpella_no_shadow_gtt);
5550 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0062, quirk_calpella_no_shadow_gtt);
5551 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x006a, quirk_calpella_no_shadow_gtt);
5552
5553 static void quirk_igfx_skip_te_disable(struct pci_dev *dev)
5554 {
5555         unsigned short ver;
5556
5557         if (!IS_GFX_DEVICE(dev))
5558                 return;
5559
5560         ver = (dev->device >> 8) & 0xff;
5561         if (ver != 0x45 && ver != 0x46 && ver != 0x4c &&
5562             ver != 0x4e && ver != 0x8a && ver != 0x98 &&
5563             ver != 0x9a)
5564                 return;
5565
5566         if (risky_device(dev))
5567                 return;
5568
5569         pci_info(dev, "Skip IOMMU disabling for graphics\n");
5570         iommu_skip_te_disable = 1;
5571 }
5572 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_ANY_ID, quirk_igfx_skip_te_disable);
5573
5574 /* On Tylersburg chipsets, some BIOSes have been known to enable the
5575    ISOCH DMAR unit for the Azalia sound device, but not give it any
5576    TLB entries, which causes it to deadlock. Check for that.  We do
5577    this in a function called from init_dmars(), instead of in a PCI
5578    quirk, because we don't want to print the obnoxious "BIOS broken"
5579    message if VT-d is actually disabled.
5580 */
5581 static void __init check_tylersburg_isoch(void)
5582 {
5583         struct pci_dev *pdev;
5584         uint32_t vtisochctrl;
5585
5586         /* If there's no Azalia in the system anyway, forget it. */
5587         pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3a3e, NULL);
5588         if (!pdev)
5589                 return;
5590
5591         if (risky_device(pdev)) {
5592                 pci_dev_put(pdev);
5593                 return;
5594         }
5595
5596         pci_dev_put(pdev);
5597
5598         /* System Management Registers. Might be hidden, in which case
5599            we can't do the sanity check. But that's OK, because the
5600            known-broken BIOSes _don't_ actually hide it, so far. */
5601         pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x342e, NULL);
5602         if (!pdev)
5603                 return;
5604
5605         if (risky_device(pdev)) {
5606                 pci_dev_put(pdev);
5607                 return;
5608         }
5609
5610         if (pci_read_config_dword(pdev, 0x188, &vtisochctrl)) {
5611                 pci_dev_put(pdev);
5612                 return;
5613         }
5614
5615         pci_dev_put(pdev);
5616
5617         /* If Azalia DMA is routed to the non-isoch DMAR unit, fine. */
5618         if (vtisochctrl & 1)
5619                 return;
5620
5621         /* Drop all bits other than the number of TLB entries */
5622         vtisochctrl &= 0x1c;
5623
5624         /* If we have the recommended number of TLB entries (16), fine. */
5625         if (vtisochctrl == 0x10)
5626                 return;
5627
5628         /* Zero TLB entries? You get to ride the short bus to school. */
5629         if (!vtisochctrl) {
5630                 WARN(1, "Your BIOS is broken; DMA routed to ISOCH DMAR unit but no TLB space.\n"
5631                      "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
5632                      dmi_get_system_info(DMI_BIOS_VENDOR),
5633                      dmi_get_system_info(DMI_BIOS_VERSION),
5634                      dmi_get_system_info(DMI_PRODUCT_VERSION));
5635                 iommu_identity_mapping |= IDENTMAP_AZALIA;
5636                 return;
5637         }
5638
5639         pr_warn("Recommended TLB entries for ISOCH unit is 16; your BIOS set %d\n",
5640                vtisochctrl);
5641 }
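
/*
 * Register decoding assumed above: bit 0 of VTISOCHCTRL selects whether
 * Azalia DMA is routed to the non-isoch DMAR unit, and the field masked
 * with 0x1c holds the number of isoch TLB entries, so 0x10 corresponds
 * to the recommended 16 entries while 0 means the broken "no TLB space"
 * case that forces identity mapping for Azalia.
 */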