drivers/iommu/intel/iommu.c
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * Copyright © 2006-2014 Intel Corporation.
4  *
5  * Authors: David Woodhouse <dwmw2@infradead.org>,
6  *          Ashok Raj <ashok.raj@intel.com>,
7  *          Shaohua Li <shaohua.li@intel.com>,
8  *          Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>,
9  *          Fenghua Yu <fenghua.yu@intel.com>
10  *          Joerg Roedel <jroedel@suse.de>
11  */
12
13 #define pr_fmt(fmt)     "DMAR: " fmt
14 #define dev_fmt(fmt)    pr_fmt(fmt)
15
16 #include <linux/init.h>
17 #include <linux/bitmap.h>
18 #include <linux/debugfs.h>
19 #include <linux/export.h>
20 #include <linux/slab.h>
21 #include <linux/irq.h>
22 #include <linux/interrupt.h>
23 #include <linux/spinlock.h>
24 #include <linux/pci.h>
25 #include <linux/dmar.h>
26 #include <linux/dma-mapping.h>
27 #include <linux/mempool.h>
28 #include <linux/memory.h>
29 #include <linux/cpu.h>
30 #include <linux/timer.h>
31 #include <linux/io.h>
32 #include <linux/iova.h>
33 #include <linux/iommu.h>
34 #include <linux/intel-iommu.h>
35 #include <linux/syscore_ops.h>
36 #include <linux/tboot.h>
37 #include <linux/dmi.h>
38 #include <linux/pci-ats.h>
39 #include <linux/memblock.h>
40 #include <linux/dma-contiguous.h>
41 #include <linux/dma-direct.h>
42 #include <linux/crash_dump.h>
43 #include <linux/numa.h>
44 #include <linux/swiotlb.h>
45 #include <asm/irq_remapping.h>
46 #include <asm/cacheflush.h>
47 #include <asm/iommu.h>
48 #include <trace/events/intel_iommu.h>
49
50 #include "../irq_remapping.h"
51 #include "intel-pasid.h"
52
53 #define ROOT_SIZE               VTD_PAGE_SIZE
54 #define CONTEXT_SIZE            VTD_PAGE_SIZE
55
56 #define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
57 #define IS_USB_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_SERIAL_USB)
58 #define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
59 #define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e)
60
61 #define IOAPIC_RANGE_START      (0xfee00000)
62 #define IOAPIC_RANGE_END        (0xfeefffff)
63 #define IOVA_START_ADDR         (0x1000)
64
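/*
 * Default address width assumed for a domain: 57 bits corresponds to a
 * 5-level page table (see agaw_to_width() below).  iommu_calculate_agaw()
 * narrows this to a width the individual IOMMU actually supports.
 */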
65 #define DEFAULT_DOMAIN_ADDRESS_WIDTH 57
66
67 #define MAX_AGAW_WIDTH 64
68 #define MAX_AGAW_PFN_WIDTH      (MAX_AGAW_WIDTH - VTD_PAGE_SHIFT)
69
70 #define __DOMAIN_MAX_PFN(gaw)  ((((uint64_t)1) << (gaw-VTD_PAGE_SHIFT)) - 1)
71 #define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << gaw) - 1)
72
73 /* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR
74    to match. That way, we can use 'unsigned long' for PFNs with impunity. */
75 #define DOMAIN_MAX_PFN(gaw)     ((unsigned long) min_t(uint64_t, \
76                                 __DOMAIN_MAX_PFN(gaw), (unsigned long)-1))
77 #define DOMAIN_MAX_ADDR(gaw)    (((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT)
78
79 /* IO virtual address start page frame number */
80 #define IOVA_START_PFN          (1)
81
82 #define IOVA_PFN(addr)          ((addr) >> PAGE_SHIFT)
83
84 /* page table handling */
85 #define LEVEL_STRIDE            (9)
86 #define LEVEL_MASK              (((u64)1 << LEVEL_STRIDE) - 1)
87
88 /*
89  * This bitmap is used to advertise the page sizes our hardware supports
90  * to the IOMMU core, which will then use this information to split
91  * physically contiguous memory regions it is mapping into page sizes
92  * that we support.
93  *
94  * Traditionally the IOMMU core just handed us the mappings directly,
95  * after making sure the size is an order of a 4KiB page and that the
96  * mapping has natural alignment.
97  *
98  * To retain this behavior, we currently advertise that we support
99  * all page sizes that are an order of 4KiB.
100  *
101  * If at some point we'd like to utilize the IOMMU core's new behavior,
102  * we could change this to advertise the real page sizes we support.
103  */
104 #define INTEL_IOMMU_PGSIZES     (~0xFFFUL)
105
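/*
 * AGAW (adjusted guest address width) is encoded in 9-bit steps above a
 * 30-bit, 2-level baseline: agaw 0 = 30 bits/2 levels, agaw 1 = 39 bits/
 * 3 levels, and so on, capped at MAX_AGAW_WIDTH.
 */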
106 static inline int agaw_to_level(int agaw)
107 {
108         return agaw + 2;
109 }
110
111 static inline int agaw_to_width(int agaw)
112 {
113         return min_t(int, 30 + agaw * LEVEL_STRIDE, MAX_AGAW_WIDTH);
114 }
115
116 static inline int width_to_agaw(int width)
117 {
118         return DIV_ROUND_UP(width - 30, LEVEL_STRIDE);
119 }
120
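/*
 * Each page-table level resolves LEVEL_STRIDE (9) bits of the DMA PFN;
 * level 1 is the leaf level covering 4KiB pages.
 */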
121 static inline unsigned int level_to_offset_bits(int level)
122 {
123         return (level - 1) * LEVEL_STRIDE;
124 }
125
126 static inline int pfn_level_offset(unsigned long pfn, int level)
127 {
128         return (pfn >> level_to_offset_bits(level)) & LEVEL_MASK;
129 }
130
131 static inline unsigned long level_mask(int level)
132 {
133         return -1UL << level_to_offset_bits(level);
134 }
135
136 static inline unsigned long level_size(int level)
137 {
138         return 1UL << level_to_offset_bits(level);
139 }
140
141 static inline unsigned long align_to_level(unsigned long pfn, int level)
142 {
143         return (pfn + level_size(level) - 1) & level_mask(level);
144 }
145
146 static inline unsigned long lvl_to_nr_pages(unsigned int lvl)
147 {
148         return  1 << min_t(int, (lvl - 1) * LEVEL_STRIDE, MAX_AGAW_PFN_WIDTH);
149 }
150
151 /* VT-d pages must always be _smaller_ than MM pages. Otherwise things
152    are never going to work. */
153 static inline unsigned long dma_to_mm_pfn(unsigned long dma_pfn)
154 {
155         return dma_pfn >> (PAGE_SHIFT - VTD_PAGE_SHIFT);
156 }
157
158 static inline unsigned long mm_to_dma_pfn(unsigned long mm_pfn)
159 {
160         return mm_pfn << (PAGE_SHIFT - VTD_PAGE_SHIFT);
161 }
162 static inline unsigned long page_to_dma_pfn(struct page *pg)
163 {
164         return mm_to_dma_pfn(page_to_pfn(pg));
165 }
166 static inline unsigned long virt_to_dma_pfn(void *p)
167 {
168         return page_to_dma_pfn(virt_to_page(p));
169 }
170
171 /* global iommu list, set NULL for ignored DMAR units */
172 static struct intel_iommu **g_iommus;
173
174 static void __init check_tylersburg_isoch(void);
175 static int rwbf_quirk;
176
177 /*
178  * Set to 1 to panic the kernel if VT-d can't be successfully enabled
179  * (used when the kernel is launched with TXT).
180  */
181 static int force_on = 0;
182 int intel_iommu_tboot_noforce;
183 static int no_platform_optin;
184
185 #define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
186
187 /*
188  * Take a root_entry and return the Lower Context Table Pointer (LCTP)
189  * if marked present.
190  */
191 static phys_addr_t root_entry_lctp(struct root_entry *re)
192 {
193         if (!(re->lo & 1))
194                 return 0;
195
196         return re->lo & VTD_PAGE_MASK;
197 }
198
199 /*
200  * Take a root_entry and return the Upper Context Table Pointer (UCTP)
201  * if marked present.
202  */
203 static phys_addr_t root_entry_uctp(struct root_entry *re)
204 {
205         if (!(re->hi & 1))
206                 return 0;
207
208         return re->hi & VTD_PAGE_MASK;
209 }
210
211 static inline void context_clear_pasid_enable(struct context_entry *context)
212 {
213         context->lo &= ~(1ULL << 11);
214 }
215
216 static inline bool context_pasid_enabled(struct context_entry *context)
217 {
218         return !!(context->lo & (1ULL << 11));
219 }
220
221 static inline void context_set_copied(struct context_entry *context)
222 {
223         context->hi |= (1ull << 3);
224 }
225
226 static inline bool context_copied(struct context_entry *context)
227 {
228         return !!(context->hi & (1ULL << 3));
229 }
230
231 static inline bool __context_present(struct context_entry *context)
232 {
233         return (context->lo & 1);
234 }
235
236 bool context_present(struct context_entry *context)
237 {
238         return context_pasid_enabled(context) ?
239              __context_present(context) :
240              __context_present(context) && !context_copied(context);
241 }
242
243 static inline void context_set_present(struct context_entry *context)
244 {
245         context->lo |= 1;
246 }
247
248 static inline void context_set_fault_enable(struct context_entry *context)
249 {
250         context->lo &= (((u64)-1) << 2) | 1;
251 }
252
253 static inline void context_set_translation_type(struct context_entry *context,
254                                                 unsigned long value)
255 {
256         context->lo &= (((u64)-1) << 4) | 3;
257         context->lo |= (value & 3) << 2;
258 }
259
260 static inline void context_set_address_root(struct context_entry *context,
261                                             unsigned long value)
262 {
263         context->lo &= ~VTD_PAGE_MASK;
264         context->lo |= value & VTD_PAGE_MASK;
265 }
266
267 static inline void context_set_address_width(struct context_entry *context,
268                                              unsigned long value)
269 {
270         context->hi |= value & 7;
271 }
272
273 static inline void context_set_domain_id(struct context_entry *context,
274                                          unsigned long value)
275 {
276         context->hi |= (value & ((1 << 16) - 1)) << 8;
277 }
278
279 static inline int context_domain_id(struct context_entry *c)
280 {
281         return((c->hi >> 8) & 0xffff);
282 }
283
284 static inline void context_clear_entry(struct context_entry *context)
285 {
286         context->lo = 0;
287         context->hi = 0;
288 }
289
290 /*
291  * This domain is a static identity mapping domain.
292  *      1. This domain creates a static 1:1 mapping to all usable memory.
293  *      2. It maps to each iommu if successful.
294  *      3. Each iommu maps to this domain if successful.
295  */
296 static struct dmar_domain *si_domain;
297 static int hw_pass_through = 1;
298
299 #define for_each_domain_iommu(idx, domain)                      \
300         for (idx = 0; idx < g_num_of_iommus; idx++)             \
301                 if (domain->iommu_refcnt[idx])
302
303 struct dmar_rmrr_unit {
304         struct list_head list;          /* list of rmrr units   */
305         struct acpi_dmar_header *hdr;   /* ACPI header          */
306         u64     base_address;           /* reserved base address*/
307         u64     end_address;            /* reserved end address */
308         struct dmar_dev_scope *devices; /* target devices */
309         int     devices_cnt;            /* target device count */
310 };
311
312 struct dmar_atsr_unit {
313         struct list_head list;          /* list of ATSR units */
314         struct acpi_dmar_header *hdr;   /* ACPI header */
315         struct dmar_dev_scope *devices; /* target devices */
316         int devices_cnt;                /* target device count */
317         u8 include_all:1;               /* include all ports */
318 };
319
320 static LIST_HEAD(dmar_atsr_units);
321 static LIST_HEAD(dmar_rmrr_units);
322
323 #define for_each_rmrr_units(rmrr) \
324         list_for_each_entry(rmrr, &dmar_rmrr_units, list)
325
326 /* number of entries in g_iommus[] */
327 static int g_num_of_iommus;
328
329 static void domain_exit(struct dmar_domain *domain);
330 static void domain_remove_dev_info(struct dmar_domain *domain);
331 static void dmar_remove_one_dev_info(struct device *dev);
332 static void __dmar_remove_one_dev_info(struct device_domain_info *info);
333 static int intel_iommu_attach_device(struct iommu_domain *domain,
334                                      struct device *dev);
335 static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
336                                             dma_addr_t iova);
337
338 #ifdef CONFIG_INTEL_IOMMU_DEFAULT_ON
339 int dmar_disabled = 0;
340 #else
341 int dmar_disabled = 1;
342 #endif /* CONFIG_INTEL_IOMMU_DEFAULT_ON */
343
344 #ifdef CONFIG_INTEL_IOMMU_SCALABLE_MODE_DEFAULT_ON
345 int intel_iommu_sm = 1;
346 #else
347 int intel_iommu_sm;
348 #endif /* CONFIG_INTEL_IOMMU_SCALABLE_MODE_DEFAULT_ON */
349
350 int intel_iommu_enabled = 0;
351 EXPORT_SYMBOL_GPL(intel_iommu_enabled);
352
353 static int dmar_map_gfx = 1;
354 static int dmar_forcedac;
355 static int intel_iommu_strict;
356 static int intel_iommu_superpage = 1;
357 static int iommu_identity_mapping;
358 static int intel_no_bounce;
359
360 #define IDENTMAP_GFX            2
361 #define IDENTMAP_AZALIA         4
362
363 int intel_iommu_gfx_mapped;
364 EXPORT_SYMBOL_GPL(intel_iommu_gfx_mapped);
365
366 #define DUMMY_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-1))
367 #define DEFER_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-2))
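/*
 * Return the device_domain_info hanging off dev->archdata.iommu, or NULL
 * if the device has none or only carries the DUMMY/DEFER placeholder.
 */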
368 struct device_domain_info *get_domain_info(struct device *dev)
369 {
370         struct device_domain_info *info;
371
372         if (!dev)
373                 return NULL;
374
375         info = dev->archdata.iommu;
376         if (unlikely(info == DUMMY_DEVICE_DOMAIN_INFO ||
377                      info == DEFER_DEVICE_DOMAIN_INFO))
378                 return NULL;
379
380         return info;
381 }
382
383 DEFINE_SPINLOCK(device_domain_lock);
384 static LIST_HEAD(device_domain_list);
385
386 #define device_needs_bounce(d) (!intel_no_bounce && dev_is_pci(d) &&    \
387                                 to_pci_dev(d)->untrusted)
388
389 /*
390  * Iterate over elements in device_domain_list and call the specified
391  * callback @fn against each element.
392  */
393 int for_each_device_domain(int (*fn)(struct device_domain_info *info,
394                                      void *data), void *data)
395 {
396         int ret = 0;
397         unsigned long flags;
398         struct device_domain_info *info;
399
400         spin_lock_irqsave(&device_domain_lock, flags);
401         list_for_each_entry(info, &device_domain_list, global) {
402                 ret = fn(info, data);
403                 if (ret) {
404                         spin_unlock_irqrestore(&device_domain_lock, flags);
405                         return ret;
406                 }
407         }
408         spin_unlock_irqrestore(&device_domain_lock, flags);
409
410         return 0;
411 }
412
413 const struct iommu_ops intel_iommu_ops;
414
415 static bool translation_pre_enabled(struct intel_iommu *iommu)
416 {
417         return (iommu->flags & VTD_FLAG_TRANS_PRE_ENABLED);
418 }
419
420 static void clear_translation_pre_enabled(struct intel_iommu *iommu)
421 {
422         iommu->flags &= ~VTD_FLAG_TRANS_PRE_ENABLED;
423 }
424
425 static void init_translation_status(struct intel_iommu *iommu)
426 {
427         u32 gsts;
428
429         gsts = readl(iommu->reg + DMAR_GSTS_REG);
430         if (gsts & DMA_GSTS_TES)
431                 iommu->flags |= VTD_FLAG_TRANS_PRE_ENABLED;
432 }
433
434 static int __init intel_iommu_setup(char *str)
435 {
436         if (!str)
437                 return -EINVAL;
438         while (*str) {
439                 if (!strncmp(str, "on", 2)) {
440                         dmar_disabled = 0;
441                         pr_info("IOMMU enabled\n");
442                 } else if (!strncmp(str, "off", 3)) {
443                         dmar_disabled = 1;
444                         no_platform_optin = 1;
445                         pr_info("IOMMU disabled\n");
446                 } else if (!strncmp(str, "igfx_off", 8)) {
447                         dmar_map_gfx = 0;
448                         pr_info("Disable GFX device mapping\n");
449                 } else if (!strncmp(str, "forcedac", 8)) {
450                         pr_info("Forcing DAC for PCI devices\n");
451                         dmar_forcedac = 1;
452                 } else if (!strncmp(str, "strict", 6)) {
453                         pr_info("Disable batched IOTLB flush\n");
454                         intel_iommu_strict = 1;
455                 } else if (!strncmp(str, "sp_off", 6)) {
456                         pr_info("Disable supported super page\n");
457                         intel_iommu_superpage = 0;
458                 } else if (!strncmp(str, "sm_on", 5)) {
459                         pr_info("Intel-IOMMU: scalable mode supported\n");
460                         intel_iommu_sm = 1;
461                 } else if (!strncmp(str, "tboot_noforce", 13)) {
462                         pr_info("Intel-IOMMU: not forcing on after tboot. This could expose security risk for tboot\n");
463                         intel_iommu_tboot_noforce = 1;
464                 } else if (!strncmp(str, "nobounce", 8)) {
465                         pr_info("Intel-IOMMU: No bounce buffer. This could expose security risks of DMA attacks\n");
466                         intel_no_bounce = 1;
467                 }
468
469                 str += strcspn(str, ",");
470                 while (*str == ',')
471                         str++;
472         }
473         return 0;
474 }
475 __setup("intel_iommu=", intel_iommu_setup);
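
/*
 * Example (illustrative): booting with "intel_iommu=on,sm_on,strict"
 * enables the IOMMU, turns on scalable mode and disables batched IOTLB
 * flushing; the options are comma separated and parsed above.
 */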
476
477 static struct kmem_cache *iommu_domain_cache;
478 static struct kmem_cache *iommu_devinfo_cache;
479
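/*
 * Domain-ID to dmar_domain lookup: iommu->domains is a two-level table
 * where the high byte of the DID selects a lazily allocated array of 256
 * pointers and the low byte indexes within it.
 */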
480 static struct dmar_domain* get_iommu_domain(struct intel_iommu *iommu, u16 did)
481 {
482         struct dmar_domain **domains;
483         int idx = did >> 8;
484
485         domains = iommu->domains[idx];
486         if (!domains)
487                 return NULL;
488
489         return domains[did & 0xff];
490 }
491
492 static void set_iommu_domain(struct intel_iommu *iommu, u16 did,
493                              struct dmar_domain *domain)
494 {
495         struct dmar_domain **domains;
496         int idx = did >> 8;
497
498         if (!iommu->domains[idx]) {
499                 size_t size = 256 * sizeof(struct dmar_domain *);
500                 iommu->domains[idx] = kzalloc(size, GFP_ATOMIC);
501         }
502
503         domains = iommu->domains[idx];
504         if (WARN_ON(!domains))
505                 return;
506         else
507                 domains[did & 0xff] = domain;
508 }
509
510 void *alloc_pgtable_page(int node)
511 {
512         struct page *page;
513         void *vaddr = NULL;
514
515         page = alloc_pages_node(node, GFP_ATOMIC | __GFP_ZERO, 0);
516         if (page)
517                 vaddr = page_address(page);
518         return vaddr;
519 }
520
521 void free_pgtable_page(void *vaddr)
522 {
523         free_page((unsigned long)vaddr);
524 }
525
526 static inline void *alloc_domain_mem(void)
527 {
528         return kmem_cache_alloc(iommu_domain_cache, GFP_ATOMIC);
529 }
530
531 static void free_domain_mem(void *vaddr)
532 {
533         kmem_cache_free(iommu_domain_cache, vaddr);
534 }
535
536 static inline void * alloc_devinfo_mem(void)
537 {
538         return kmem_cache_alloc(iommu_devinfo_cache, GFP_ATOMIC);
539 }
540
541 static inline void free_devinfo_mem(void *vaddr)
542 {
543         kmem_cache_free(iommu_devinfo_cache, vaddr);
544 }
545
546 static inline int domain_type_is_si(struct dmar_domain *domain)
547 {
548         return domain->flags & DOMAIN_FLAG_STATIC_IDENTITY;
549 }
550
551 static inline bool domain_use_first_level(struct dmar_domain *domain)
552 {
553         return domain->flags & DOMAIN_FLAG_USE_FIRST_LEVEL;
554 }
555
556 static inline int domain_pfn_supported(struct dmar_domain *domain,
557                                        unsigned long pfn)
558 {
559         int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
560
561         return !(addr_width < BITS_PER_LONG && pfn >> addr_width);
562 }
563
564 static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw)
565 {
566         unsigned long sagaw;
567         int agaw = -1;
568
569         sagaw = cap_sagaw(iommu->cap);
570         for (agaw = width_to_agaw(max_gaw);
571              agaw >= 0; agaw--) {
572                 if (test_bit(agaw, &sagaw))
573                         break;
574         }
575
576         return agaw;
577 }
578
579 /*
580  * Calculate max SAGAW for each iommu.
581  */
582 int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
583 {
584         return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH);
585 }
586
587 /*
588  * Calculate the agaw for each iommu.
589  * "SAGAW" may be different across iommus; use a default agaw, and
590  * fall back to a smaller supported agaw for iommus that don't support the default.
591  */
592 int iommu_calculate_agaw(struct intel_iommu *iommu)
593 {
594         return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH);
595 }
596
597 /* This function only returns a single iommu in a domain */
598 struct intel_iommu *domain_get_iommu(struct dmar_domain *domain)
599 {
600         int iommu_id;
601
602         /* si_domain and vm domain should not get here. */
603         if (WARN_ON(domain->domain.type != IOMMU_DOMAIN_DMA))
604                 return NULL;
605
606         for_each_domain_iommu(iommu_id, domain)
607                 break;
608
609         if (iommu_id < 0 || iommu_id >= g_num_of_iommus)
610                 return NULL;
611
612         return g_iommus[iommu_id];
613 }
614
615 static inline bool iommu_paging_structure_coherency(struct intel_iommu *iommu)
616 {
617         return sm_supported(iommu) ?
618                         ecap_smpwc(iommu->ecap) : ecap_coherent(iommu->ecap);
619 }
620
621 static void domain_update_iommu_coherency(struct dmar_domain *domain)
622 {
623         struct dmar_drhd_unit *drhd;
624         struct intel_iommu *iommu;
625         bool found = false;
626         int i;
627
628         domain->iommu_coherency = 1;
629
630         for_each_domain_iommu(i, domain) {
631                 found = true;
632                 if (!iommu_paging_structure_coherency(g_iommus[i])) {
633                         domain->iommu_coherency = 0;
634                         break;
635                 }
636         }
637         if (found)
638                 return;
639
640         /* No hardware attached; use lowest common denominator */
641         rcu_read_lock();
642         for_each_active_iommu(iommu, drhd) {
643                 if (!iommu_paging_structure_coherency(iommu)) {
644                         domain->iommu_coherency = 0;
645                         break;
646                 }
647         }
648         rcu_read_unlock();
649 }
650
651 static int domain_update_iommu_snooping(struct intel_iommu *skip)
652 {
653         struct dmar_drhd_unit *drhd;
654         struct intel_iommu *iommu;
655         int ret = 1;
656
657         rcu_read_lock();
658         for_each_active_iommu(iommu, drhd) {
659                 if (iommu != skip) {
660                         if (!ecap_sc_support(iommu->ecap)) {
661                                 ret = 0;
662                                 break;
663                         }
664                 }
665         }
666         rcu_read_unlock();
667
668         return ret;
669 }
670
671 static int domain_update_iommu_superpage(struct dmar_domain *domain,
672                                          struct intel_iommu *skip)
673 {
674         struct dmar_drhd_unit *drhd;
675         struct intel_iommu *iommu;
676         int mask = 0x3;
677
678         if (!intel_iommu_superpage) {
679                 return 0;
680         }
681
682         /* set iommu_superpage to the smallest common denominator */
683         rcu_read_lock();
684         for_each_active_iommu(iommu, drhd) {
685                 if (iommu != skip) {
686                         if (domain && domain_use_first_level(domain)) {
687                                 if (!cap_fl1gp_support(iommu->cap))
688                                         mask = 0x1;
689                         } else {
690                                 mask &= cap_super_page_val(iommu->cap);
691                         }
692
693                         if (!mask)
694                                 break;
695                 }
696         }
697         rcu_read_unlock();
698
699         return fls(mask);
700 }
701
702 /* Some capabilities may be different across iommus */
703 static void domain_update_iommu_cap(struct dmar_domain *domain)
704 {
705         domain_update_iommu_coherency(domain);
706         domain->iommu_snooping = domain_update_iommu_snooping(NULL);
707         domain->iommu_superpage = domain_update_iommu_superpage(domain, NULL);
708 }
709
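/*
 * Return the context entry for source-id (bus, devfn), allocating the
 * context table on demand when @alloc is set.  In scalable mode a context
 * entry is 256 bits, so a 4KiB table only covers 128 devfns: devfns 0x80
 * and above live in the table referenced by root->hi.
 */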
710 struct context_entry *iommu_context_addr(struct intel_iommu *iommu, u8 bus,
711                                          u8 devfn, int alloc)
712 {
713         struct root_entry *root = &iommu->root_entry[bus];
714         struct context_entry *context;
715         u64 *entry;
716
717         entry = &root->lo;
718         if (sm_supported(iommu)) {
719                 if (devfn >= 0x80) {
720                         devfn -= 0x80;
721                         entry = &root->hi;
722                 }
723                 devfn *= 2;
724         }
725         if (*entry & 1)
726                 context = phys_to_virt(*entry & VTD_PAGE_MASK);
727         else {
728                 unsigned long phy_addr;
729                 if (!alloc)
730                         return NULL;
731
732                 context = alloc_pgtable_page(iommu->node);
733                 if (!context)
734                         return NULL;
735
736                 __iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
737                 phy_addr = virt_to_phys((void *)context);
738                 *entry = phy_addr | 1;
739                 __iommu_flush_cache(iommu, entry, sizeof(*entry));
740         }
741         return &context[devfn];
742 }
743
744 static int iommu_dummy(struct device *dev)
745 {
746         return dev->archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO;
747 }
748
749 static bool attach_deferred(struct device *dev)
750 {
751         return dev->archdata.iommu == DEFER_DEVICE_DOMAIN_INFO;
752 }
753
754 /**
755  * is_downstream_to_pci_bridge - test if a device belongs to the PCI
756  *                               sub-hierarchy of a candidate PCI-PCI bridge
757  * @dev: candidate PCI device belonging to @bridge PCI sub-hierarchy
758  * @bridge: the candidate PCI-PCI bridge
759  *
760  * Return: true if @dev belongs to @bridge PCI sub-hierarchy, else false.
761  */
762 static bool
763 is_downstream_to_pci_bridge(struct device *dev, struct device *bridge)
764 {
765         struct pci_dev *pdev, *pbridge;
766
767         if (!dev_is_pci(dev) || !dev_is_pci(bridge))
768                 return false;
769
770         pdev = to_pci_dev(dev);
771         pbridge = to_pci_dev(bridge);
772
773         if (pbridge->subordinate &&
774             pbridge->subordinate->number <= pdev->bus->number &&
775             pbridge->subordinate->busn_res.end >= pdev->bus->number)
776                 return true;
777
778         return false;
779 }
780
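/*
 * Walk the DRHD device-scope lists to find the IOMMU that covers @dev,
 * returning the bus/devfn to use for context programming.  VFs are looked
 * up via their PF, and include_all units act as the catch-all for their
 * segment.
 */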
781 static struct intel_iommu *device_to_iommu(struct device *dev, u8 *bus, u8 *devfn)
782 {
783         struct dmar_drhd_unit *drhd = NULL;
784         struct intel_iommu *iommu;
785         struct device *tmp;
786         struct pci_dev *pdev = NULL;
787         u16 segment = 0;
788         int i;
789
790         if (iommu_dummy(dev))
791                 return NULL;
792
793         if (dev_is_pci(dev)) {
794                 struct pci_dev *pf_pdev;
795
796                 pdev = pci_real_dma_dev(to_pci_dev(dev));
797
798                 /* VFs aren't listed in scope tables; we need to look up
799                  * the PF instead to find the IOMMU. */
800                 pf_pdev = pci_physfn(pdev);
801                 dev = &pf_pdev->dev;
802                 segment = pci_domain_nr(pdev->bus);
803         } else if (has_acpi_companion(dev))
804                 dev = &ACPI_COMPANION(dev)->dev;
805
806         rcu_read_lock();
807         for_each_active_iommu(iommu, drhd) {
808                 if (pdev && segment != drhd->segment)
809                         continue;
810
811                 for_each_active_dev_scope(drhd->devices,
812                                           drhd->devices_cnt, i, tmp) {
813                         if (tmp == dev) {
814                                 /* For a VF use its original BDF# not that of the PF
815                                  * which we used for the IOMMU lookup. Strictly speaking
816                                  * we could do this for all PCI devices; we only need to
817                                  * get the BDF# from the scope table for ACPI matches. */
818                                 if (pdev && pdev->is_virtfn)
819                                         goto got_pdev;
820
821                                 *bus = drhd->devices[i].bus;
822                                 *devfn = drhd->devices[i].devfn;
823                                 goto out;
824                         }
825
826                         if (is_downstream_to_pci_bridge(dev, tmp))
827                                 goto got_pdev;
828                 }
829
830                 if (pdev && drhd->include_all) {
831                 got_pdev:
832                         *bus = pdev->bus->number;
833                         *devfn = pdev->devfn;
834                         goto out;
835                 }
836         }
837         iommu = NULL;
838  out:
839         rcu_read_unlock();
840
841         return iommu;
842 }
843
844 static void domain_flush_cache(struct dmar_domain *domain,
845                                void *addr, int size)
846 {
847         if (!domain->iommu_coherency)
848                 clflush_cache_range(addr, size);
849 }
850
851 static int device_context_mapped(struct intel_iommu *iommu, u8 bus, u8 devfn)
852 {
853         struct context_entry *context;
854         int ret = 0;
855         unsigned long flags;
856
857         spin_lock_irqsave(&iommu->lock, flags);
858         context = iommu_context_addr(iommu, bus, devfn, 0);
859         if (context)
860                 ret = context_present(context);
861         spin_unlock_irqrestore(&iommu->lock, flags);
862         return ret;
863 }
864
865 static void free_context_table(struct intel_iommu *iommu)
866 {
867         int i;
868         unsigned long flags;
869         struct context_entry *context;
870
871         spin_lock_irqsave(&iommu->lock, flags);
872         if (!iommu->root_entry) {
873                 goto out;
874         }
875         for (i = 0; i < ROOT_ENTRY_NR; i++) {
876                 context = iommu_context_addr(iommu, i, 0, 0);
877                 if (context)
878                         free_pgtable_page(context);
879
880                 if (!sm_supported(iommu))
881                         continue;
882
883                 context = iommu_context_addr(iommu, i, 0x80, 0);
884                 if (context)
885                         free_pgtable_page(context);
886
887         }
888         free_pgtable_page(iommu->root_entry);
889         iommu->root_entry = NULL;
890 out:
891         spin_unlock_irqrestore(&iommu->lock, flags);
892 }
893
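/*
 * Walk (and, if needed, build) the page table down to *target_level and
 * return the PTE that maps @pfn.  A *target_level of 0 means "whatever
 * level is already present", and the level found is passed back.
 */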
894 static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
895                                       unsigned long pfn, int *target_level)
896 {
897         struct dma_pte *parent, *pte;
898         int level = agaw_to_level(domain->agaw);
899         int offset;
900
901         BUG_ON(!domain->pgd);
902
903         if (!domain_pfn_supported(domain, pfn))
904                 /* Address beyond IOMMU's addressing capabilities. */
905                 return NULL;
906
907         parent = domain->pgd;
908
909         while (1) {
910                 void *tmp_page;
911
912                 offset = pfn_level_offset(pfn, level);
913                 pte = &parent[offset];
914                 if (!*target_level && (dma_pte_superpage(pte) || !dma_pte_present(pte)))
915                         break;
916                 if (level == *target_level)
917                         break;
918
919                 if (!dma_pte_present(pte)) {
920                         uint64_t pteval;
921
922                         tmp_page = alloc_pgtable_page(domain->nid);
923
924                         if (!tmp_page)
925                                 return NULL;
926
927                         domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE);
928                         pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE;
929                         if (domain_use_first_level(domain))
930                                 pteval |= DMA_FL_PTE_XD | DMA_FL_PTE_US;
931                         if (cmpxchg64(&pte->val, 0ULL, pteval))
932                                 /* Someone else set it while we were thinking; use theirs. */
933                                 free_pgtable_page(tmp_page);
934                         else
935                                 domain_flush_cache(domain, pte, sizeof(*pte));
936                 }
937                 if (level == 1)
938                         break;
939
940                 parent = phys_to_virt(dma_pte_addr(pte));
941                 level--;
942         }
943
944         if (!*target_level)
945                 *target_level = level;
946
947         return pte;
948 }
949
950 /* return address's pte at specific level */
951 static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain,
952                                          unsigned long pfn,
953                                          int level, int *large_page)
954 {
955         struct dma_pte *parent, *pte;
956         int total = agaw_to_level(domain->agaw);
957         int offset;
958
959         parent = domain->pgd;
960         while (level <= total) {
961                 offset = pfn_level_offset(pfn, total);
962                 pte = &parent[offset];
963                 if (level == total)
964                         return pte;
965
966                 if (!dma_pte_present(pte)) {
967                         *large_page = total;
968                         break;
969                 }
970
971                 if (dma_pte_superpage(pte)) {
972                         *large_page = total;
973                         return pte;
974                 }
975
976                 parent = phys_to_virt(dma_pte_addr(pte));
977                 total--;
978         }
979         return NULL;
980 }
981
982 /* clear last level ptes; a tlb flush must follow */
983 static void dma_pte_clear_range(struct dmar_domain *domain,
984                                 unsigned long start_pfn,
985                                 unsigned long last_pfn)
986 {
987         unsigned int large_page;
988         struct dma_pte *first_pte, *pte;
989
990         BUG_ON(!domain_pfn_supported(domain, start_pfn));
991         BUG_ON(!domain_pfn_supported(domain, last_pfn));
992         BUG_ON(start_pfn > last_pfn);
993
994         /* we don't need lock here; nobody else touches the iova range */
995         do {
996                 large_page = 1;
997                 first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1, &large_page);
998                 if (!pte) {
999                         start_pfn = align_to_level(start_pfn + 1, large_page + 1);
1000                         continue;
1001                 }
1002                 do {
1003                         dma_clear_pte(pte);
1004                         start_pfn += lvl_to_nr_pages(large_page);
1005                         pte++;
1006                 } while (start_pfn <= last_pfn && !first_pte_in_page(pte));
1007
1008                 domain_flush_cache(domain, first_pte,
1009                                    (void *)pte - (void *)first_pte);
1010
1011         } while (start_pfn && start_pfn <= last_pfn);
1012 }
1013
1014 static void dma_pte_free_level(struct dmar_domain *domain, int level,
1015                                int retain_level, struct dma_pte *pte,
1016                                unsigned long pfn, unsigned long start_pfn,
1017                                unsigned long last_pfn)
1018 {
1019         pfn = max(start_pfn, pfn);
1020         pte = &pte[pfn_level_offset(pfn, level)];
1021
1022         do {
1023                 unsigned long level_pfn;
1024                 struct dma_pte *level_pte;
1025
1026                 if (!dma_pte_present(pte) || dma_pte_superpage(pte))
1027                         goto next;
1028
1029                 level_pfn = pfn & level_mask(level);
1030                 level_pte = phys_to_virt(dma_pte_addr(pte));
1031
1032                 if (level > 2) {
1033                         dma_pte_free_level(domain, level - 1, retain_level,
1034                                            level_pte, level_pfn, start_pfn,
1035                                            last_pfn);
1036                 }
1037
1038                 /*
1039                  * Free the page table if we're below the level we want to
1040                  * retain and the range covers the entire table.
1041                  */
1042                 if (level < retain_level && !(start_pfn > level_pfn ||
1043                       last_pfn < level_pfn + level_size(level) - 1)) {
1044                         dma_clear_pte(pte);
1045                         domain_flush_cache(domain, pte, sizeof(*pte));
1046                         free_pgtable_page(level_pte);
1047                 }
1048 next:
1049                 pfn += level_size(level);
1050         } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1051 }
1052
1053 /*
1054  * clear last level (leaf) ptes and free page table pages below the
1055  * level we wish to keep intact.
1056  */
1057 static void dma_pte_free_pagetable(struct dmar_domain *domain,
1058                                    unsigned long start_pfn,
1059                                    unsigned long last_pfn,
1060                                    int retain_level)
1061 {
1062         BUG_ON(!domain_pfn_supported(domain, start_pfn));
1063         BUG_ON(!domain_pfn_supported(domain, last_pfn));
1064         BUG_ON(start_pfn > last_pfn);
1065
1066         dma_pte_clear_range(domain, start_pfn, last_pfn);
1067
1068         /* We don't need lock here; nobody else touches the iova range */
1069         dma_pte_free_level(domain, agaw_to_level(domain->agaw), retain_level,
1070                            domain->pgd, 0, start_pfn, last_pfn);
1071
1072         /* free pgd */
1073         if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1074                 free_pgtable_page(domain->pgd);
1075                 domain->pgd = NULL;
1076         }
1077 }
1078
1079 /* When a page at a given level is being unlinked from its parent, we don't
1080    need to *modify* it at all. All we need to do is make a list of all the
1081    pages which can be freed just as soon as we've flushed the IOTLB and we
1082    know the hardware page-walk will no longer touch them.
1083    The 'pte' argument is the *parent* PTE, pointing to the page that is to
1084    be freed. */
1085 static struct page *dma_pte_list_pagetables(struct dmar_domain *domain,
1086                                             int level, struct dma_pte *pte,
1087                                             struct page *freelist)
1088 {
1089         struct page *pg;
1090
1091         pg = pfn_to_page(dma_pte_addr(pte) >> PAGE_SHIFT);
1092         pg->freelist = freelist;
1093         freelist = pg;
1094
1095         if (level == 1)
1096                 return freelist;
1097
1098         pte = page_address(pg);
1099         do {
1100                 if (dma_pte_present(pte) && !dma_pte_superpage(pte))
1101                         freelist = dma_pte_list_pagetables(domain, level - 1,
1102                                                            pte, freelist);
1103                 pte++;
1104         } while (!first_pte_in_page(pte));
1105
1106         return freelist;
1107 }
1108
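/*
 * Recursively clear the leaf PTEs covering [start_pfn, last_pfn] and
 * chain any page-table pages that become entirely unused onto @freelist
 * so they can be freed once the IOTLB has been flushed.
 */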
1109 static struct page *dma_pte_clear_level(struct dmar_domain *domain, int level,
1110                                         struct dma_pte *pte, unsigned long pfn,
1111                                         unsigned long start_pfn,
1112                                         unsigned long last_pfn,
1113                                         struct page *freelist)
1114 {
1115         struct dma_pte *first_pte = NULL, *last_pte = NULL;
1116
1117         pfn = max(start_pfn, pfn);
1118         pte = &pte[pfn_level_offset(pfn, level)];
1119
1120         do {
1121                 unsigned long level_pfn;
1122
1123                 if (!dma_pte_present(pte))
1124                         goto next;
1125
1126                 level_pfn = pfn & level_mask(level);
1127
1128                 /* If range covers entire pagetable, free it */
1129                 if (start_pfn <= level_pfn &&
1130                     last_pfn >= level_pfn + level_size(level) - 1) {
1131                         /* These subordinate page tables are going away entirely. Don't
1132                            bother to clear them; we're just going to *free* them. */
1133                         if (level > 1 && !dma_pte_superpage(pte))
1134                                 freelist = dma_pte_list_pagetables(domain, level - 1, pte, freelist);
1135
1136                         dma_clear_pte(pte);
1137                         if (!first_pte)
1138                                 first_pte = pte;
1139                         last_pte = pte;
1140                 } else if (level > 1) {
1141                         /* Recurse down into a level that isn't *entirely* obsolete */
1142                         freelist = dma_pte_clear_level(domain, level - 1,
1143                                                        phys_to_virt(dma_pte_addr(pte)),
1144                                                        level_pfn, start_pfn, last_pfn,
1145                                                        freelist);
1146                 }
1147 next:
1148                 pfn += level_size(level);
1149         } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1150
1151         if (first_pte)
1152                 domain_flush_cache(domain, first_pte,
1153                                    (void *)++last_pte - (void *)first_pte);
1154
1155         return freelist;
1156 }
1157
1158 /* We can't just free the pages because the IOMMU may still be walking
1159    the page tables, and may have cached the intermediate levels. The
1160    pages can only be freed after the IOTLB flush has been done. */
1161 static struct page *domain_unmap(struct dmar_domain *domain,
1162                                  unsigned long start_pfn,
1163                                  unsigned long last_pfn)
1164 {
1165         struct page *freelist;
1166
1167         BUG_ON(!domain_pfn_supported(domain, start_pfn));
1168         BUG_ON(!domain_pfn_supported(domain, last_pfn));
1169         BUG_ON(start_pfn > last_pfn);
1170
1171         /* we don't need lock here; nobody else touches the iova range */
1172         freelist = dma_pte_clear_level(domain, agaw_to_level(domain->agaw),
1173                                        domain->pgd, 0, start_pfn, last_pfn, NULL);
1174
1175         /* free pgd */
1176         if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1177                 struct page *pgd_page = virt_to_page(domain->pgd);
1178                 pgd_page->freelist = freelist;
1179                 freelist = pgd_page;
1180
1181                 domain->pgd = NULL;
1182         }
1183
1184         return freelist;
1185 }
1186
1187 static void dma_free_pagelist(struct page *freelist)
1188 {
1189         struct page *pg;
1190
1191         while ((pg = freelist)) {
1192                 freelist = pg->freelist;
1193                 free_pgtable_page(page_address(pg));
1194         }
1195 }
1196
1197 static void iova_entry_free(unsigned long data)
1198 {
1199         struct page *freelist = (struct page *)data;
1200
1201         dma_free_pagelist(freelist);
1202 }
1203
1204 /* iommu handling */
1205 static int iommu_alloc_root_entry(struct intel_iommu *iommu)
1206 {
1207         struct root_entry *root;
1208         unsigned long flags;
1209
1210         root = (struct root_entry *)alloc_pgtable_page(iommu->node);
1211         if (!root) {
1212                 pr_err("Allocating root entry for %s failed\n",
1213                         iommu->name);
1214                 return -ENOMEM;
1215         }
1216
1217         __iommu_flush_cache(iommu, root, ROOT_SIZE);
1218
1219         spin_lock_irqsave(&iommu->lock, flags);
1220         iommu->root_entry = root;
1221         spin_unlock_irqrestore(&iommu->lock, flags);
1222
1223         return 0;
1224 }
1225
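/*
 * Program the root table address (tagging it as a scalable-mode table
 * when supported) and issue a Set Root Table Pointer command, waiting
 * for the hardware to acknowledge it.
 */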
1226 static void iommu_set_root_entry(struct intel_iommu *iommu)
1227 {
1228         u64 addr;
1229         u32 sts;
1230         unsigned long flag;
1231
1232         addr = virt_to_phys(iommu->root_entry);
1233         if (sm_supported(iommu))
1234                 addr |= DMA_RTADDR_SMT;
1235
1236         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1237         dmar_writeq(iommu->reg + DMAR_RTADDR_REG, addr);
1238
1239         writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG);
1240
1241         /* Make sure hardware completes it */
1242         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1243                       readl, (sts & DMA_GSTS_RTPS), sts);
1244
1245         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1246 }
1247
1248 void iommu_flush_write_buffer(struct intel_iommu *iommu)
1249 {
1250         u32 val;
1251         unsigned long flag;
1252
1253         if (!rwbf_quirk && !cap_rwbf(iommu->cap))
1254                 return;
1255
1256         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1257         writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG);
1258
1259         /* Make sure hardware completes it */
1260         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1261                       readl, (!(val & DMA_GSTS_WBFS)), val);
1262
1263         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1264 }
1265
1266 /* return value determines if we need a write buffer flush */
1267 static void __iommu_flush_context(struct intel_iommu *iommu,
1268                                   u16 did, u16 source_id, u8 function_mask,
1269                                   u64 type)
1270 {
1271         u64 val = 0;
1272         unsigned long flag;
1273
1274         switch (type) {
1275         case DMA_CCMD_GLOBAL_INVL:
1276                 val = DMA_CCMD_GLOBAL_INVL;
1277                 break;
1278         case DMA_CCMD_DOMAIN_INVL:
1279                 val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
1280                 break;
1281         case DMA_CCMD_DEVICE_INVL:
1282                 val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
1283                         | DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
1284                 break;
1285         default:
1286                 BUG();
1287         }
1288         val |= DMA_CCMD_ICC;
1289
1290         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1291         dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
1292
1293         /* Make sure hardware completes it */
1294         IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
1295                 dmar_readq, (!(val & DMA_CCMD_ICC)), val);
1296
1297         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1298 }
1299
1300 /* return value determines if we need a write buffer flush */
1301 static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
1302                                 u64 addr, unsigned int size_order, u64 type)
1303 {
1304         int tlb_offset = ecap_iotlb_offset(iommu->ecap);
1305         u64 val = 0, val_iva = 0;
1306         unsigned long flag;
1307
1308         switch (type) {
1309         case DMA_TLB_GLOBAL_FLUSH:
1310                 /* global flush doesn't need to set IVA_REG */
1311                 val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
1312                 break;
1313         case DMA_TLB_DSI_FLUSH:
1314                 val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1315                 break;
1316         case DMA_TLB_PSI_FLUSH:
1317                 val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1318                 /* IH bit is passed in as part of address */
1319                 val_iva = size_order | addr;
1320                 break;
1321         default:
1322                 BUG();
1323         }
1324         /* Note: set drain read/write */
1325 #if 0
1326         /*
1327          * This is probably only here to be extra safe. Looks like we can
1328          * ignore it without any impact.
1329          */
1330         if (cap_read_drain(iommu->cap))
1331                 val |= DMA_TLB_READ_DRAIN;
1332 #endif
1333         if (cap_write_drain(iommu->cap))
1334                 val |= DMA_TLB_WRITE_DRAIN;
1335
1336         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1337         /* Note: Only uses first TLB reg currently */
1338         if (val_iva)
1339                 dmar_writeq(iommu->reg + tlb_offset, val_iva);
1340         dmar_writeq(iommu->reg + tlb_offset + 8, val);
1341
1342         /* Make sure hardware completes it */
1343         IOMMU_WAIT_OP(iommu, tlb_offset + 8,
1344                 dmar_readq, (!(val & DMA_TLB_IVT)), val);
1345
1346         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1347
1348         /* check IOTLB invalidation granularity */
1349         if (DMA_TLB_IAIG(val) == 0)
1350                 pr_err("Flush IOTLB failed\n");
1351         if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
1352                 pr_debug("TLB flush request %Lx, actual %Lx\n",
1353                         (unsigned long long)DMA_TLB_IIRG(type),
1354                         (unsigned long long)DMA_TLB_IAIG(val));
1355 }
1356
1357 static struct device_domain_info *
1358 iommu_support_dev_iotlb(struct dmar_domain *domain, struct intel_iommu *iommu,
1359                          u8 bus, u8 devfn)
1360 {
1361         struct device_domain_info *info;
1362
1363         assert_spin_locked(&device_domain_lock);
1364
1365         if (!iommu->qi)
1366                 return NULL;
1367
1368         list_for_each_entry(info, &domain->devices, link)
1369                 if (info->iommu == iommu && info->bus == bus &&
1370                     info->devfn == devfn) {
1371                         if (info->ats_supported && info->dev)
1372                                 return info;
1373                         break;
1374                 }
1375
1376         return NULL;
1377 }
1378
1379 static void domain_update_iotlb(struct dmar_domain *domain)
1380 {
1381         struct device_domain_info *info;
1382         bool has_iotlb_device = false;
1383
1384         assert_spin_locked(&device_domain_lock);
1385
1386         list_for_each_entry(info, &domain->devices, link) {
1387                 struct pci_dev *pdev;
1388
1389                 if (!info->dev || !dev_is_pci(info->dev))
1390                         continue;
1391
1392                 pdev = to_pci_dev(info->dev);
1393                 if (pdev->ats_enabled) {
1394                         has_iotlb_device = true;
1395                         break;
1396                 }
1397         }
1398
1399         domain->has_iotlb_device = has_iotlb_device;
1400 }
1401
1402 static void iommu_enable_dev_iotlb(struct device_domain_info *info)
1403 {
1404         struct pci_dev *pdev;
1405
1406         assert_spin_locked(&device_domain_lock);
1407
1408         if (!info || !dev_is_pci(info->dev))
1409                 return;
1410
1411         pdev = to_pci_dev(info->dev);
1412         /* For IOMMUs that support device IOTLB throttling (DIT), we assign
1413          * PFSID to the invalidation desc of a VF such that IOMMU HW can gauge
1414          * queue depth at PF level. If DIT is not set, PFSID will be treated as
1415          * reserved, which should be set to 0.
1416          */
1417         if (!ecap_dit(info->iommu->ecap))
1418                 info->pfsid = 0;
1419         else {
1420                 struct pci_dev *pf_pdev;
1421
1422                 /* pdev will be returned if device is not a vf */
1423                 pf_pdev = pci_physfn(pdev);
1424                 info->pfsid = pci_dev_id(pf_pdev);
1425         }
1426
1427 #ifdef CONFIG_INTEL_IOMMU_SVM
1428         /* The PCIe spec, in its wisdom, declares that the behaviour of
1429            the device if you enable PASID support after ATS support is
1430            undefined. So always enable PASID support on devices which
1431            have it, even if we can't yet know if we're ever going to
1432            use it. */
1433         if (info->pasid_supported && !pci_enable_pasid(pdev, info->pasid_supported & ~1))
1434                 info->pasid_enabled = 1;
1435
1436         if (info->pri_supported &&
1437             (info->pasid_enabled ? pci_prg_resp_pasid_required(pdev) : 1)  &&
1438             !pci_reset_pri(pdev) && !pci_enable_pri(pdev, 32))
1439                 info->pri_enabled = 1;
1440 #endif
1441         if (info->ats_supported && pci_ats_page_aligned(pdev) &&
1442             !pci_enable_ats(pdev, VTD_PAGE_SHIFT)) {
1443                 info->ats_enabled = 1;
1444                 domain_update_iotlb(info->domain);
1445                 info->ats_qdep = pci_ats_queue_depth(pdev);
1446         }
1447 }
1448
1449 static void iommu_disable_dev_iotlb(struct device_domain_info *info)
1450 {
1451         struct pci_dev *pdev;
1452
1453         assert_spin_locked(&device_domain_lock);
1454
1455         if (!dev_is_pci(info->dev))
1456                 return;
1457
1458         pdev = to_pci_dev(info->dev);
1459
1460         if (info->ats_enabled) {
1461                 pci_disable_ats(pdev);
1462                 info->ats_enabled = 0;
1463                 domain_update_iotlb(info->domain);
1464         }
1465 #ifdef CONFIG_INTEL_IOMMU_SVM
1466         if (info->pri_enabled) {
1467                 pci_disable_pri(pdev);
1468                 info->pri_enabled = 0;
1469         }
1470         if (info->pasid_enabled) {
1471                 pci_disable_pasid(pdev);
1472                 info->pasid_enabled = 0;
1473         }
1474 #endif
1475 }
1476
1477 static void iommu_flush_dev_iotlb(struct dmar_domain *domain,
1478                                   u64 addr, unsigned mask)
1479 {
1480         u16 sid, qdep;
1481         unsigned long flags;
1482         struct device_domain_info *info;
1483
1484         if (!domain->has_iotlb_device)
1485                 return;
1486
1487         spin_lock_irqsave(&device_domain_lock, flags);
1488         list_for_each_entry(info, &domain->devices, link) {
1489                 if (!info->ats_enabled)
1490                         continue;
1491
1492                 sid = info->bus << 8 | info->devfn;
1493                 qdep = info->ats_qdep;
1494                 qi_flush_dev_iotlb(info->iommu, sid, info->pfsid,
1495                                 qdep, addr, mask);
1496         }
1497         spin_unlock_irqrestore(&device_domain_lock, flags);
1498 }
1499
1500 static void domain_flush_piotlb(struct intel_iommu *iommu,
1501                                 struct dmar_domain *domain,
1502                                 u64 addr, unsigned long npages, bool ih)
1503 {
1504         u16 did = domain->iommu_did[iommu->seq_id];
1505
1506         if (domain->default_pasid)
1507                 qi_flush_piotlb(iommu, did, domain->default_pasid,
1508                                 addr, npages, ih);
1509
1510         if (!list_empty(&domain->devices))
1511                 qi_flush_piotlb(iommu, did, PASID_RID2PASID, addr, npages, ih);
1512 }
1513
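/*
 * Page-selective-within-domain IOTLB flush: @pages is rounded up to a
 * power of two and encoded as an address mask.  First-level (PASID)
 * domains are flushed via the PASID-based interface instead.
 */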
1514 static void iommu_flush_iotlb_psi(struct intel_iommu *iommu,
1515                                   struct dmar_domain *domain,
1516                                   unsigned long pfn, unsigned int pages,
1517                                   int ih, int map)
1518 {
1519         unsigned int mask = ilog2(__roundup_pow_of_two(pages));
1520         uint64_t addr = (uint64_t)pfn << VTD_PAGE_SHIFT;
1521         u16 did = domain->iommu_did[iommu->seq_id];
1522
1523         BUG_ON(pages == 0);
1524
1525         if (ih)
1526                 ih = 1 << 6;
1527
1528         if (domain_use_first_level(domain)) {
1529                 domain_flush_piotlb(iommu, domain, addr, pages, ih);
1530         } else {
1531                 /*
1532                  * Fall back to domain-selective flush if there is no PSI support or
1533                  * the size is too big. PSI requires the page size to be 2 ^ x,
1534                  * and the base address to be naturally aligned to the size.
1535                  */
1536                 if (!cap_pgsel_inv(iommu->cap) ||
1537                     mask > cap_max_amask_val(iommu->cap))
1538                         iommu->flush.flush_iotlb(iommu, did, 0, 0,
1539                                                         DMA_TLB_DSI_FLUSH);
1540                 else
1541                         iommu->flush.flush_iotlb(iommu, did, addr | ih, mask,
1542                                                         DMA_TLB_PSI_FLUSH);
1543         }
1544
1545         /*
1546          * In caching mode, changes of pages from non-present to present require
1547          * a flush. However, the device IOTLB doesn't need to be flushed in this case.
1548          */
1549         if (!cap_caching_mode(iommu->cap) || !map)
1550                 iommu_flush_dev_iotlb(domain, addr, mask);
1551 }
1552
1553 /* Notification for newly created mappings */
1554 static inline void __mapping_notify_one(struct intel_iommu *iommu,
1555                                         struct dmar_domain *domain,
1556                                         unsigned long pfn, unsigned int pages)
1557 {
1558         /*
1559          * It's a non-present-to-present mapping. Only flush if caching mode
1560          * is enabled and the domain uses second-level translation.
1561          */
1562         if (cap_caching_mode(iommu->cap) && !domain_use_first_level(domain))
1563                 iommu_flush_iotlb_psi(iommu, domain, pfn, pages, 0, 1);
1564         else
1565                 iommu_flush_write_buffer(iommu);
1566 }
1567
1568 static void iommu_flush_iova(struct iova_domain *iovad)
1569 {
1570         struct dmar_domain *domain;
1571         int idx;
1572
1573         domain = container_of(iovad, struct dmar_domain, iovad);
1574
1575         for_each_domain_iommu(idx, domain) {
1576                 struct intel_iommu *iommu = g_iommus[idx];
1577                 u16 did = domain->iommu_did[iommu->seq_id];
1578
1579                 if (domain_use_first_level(domain))
1580                         domain_flush_piotlb(iommu, domain, 0, -1, 0);
1581                 else
1582                         iommu->flush.flush_iotlb(iommu, did, 0, 0,
1583                                                  DMA_TLB_DSI_FLUSH);
1584
1585                 if (!cap_caching_mode(iommu->cap))
1586                         iommu_flush_dev_iotlb(get_iommu_domain(iommu, did),
1587                                               0, MAX_AGAW_PFN_WIDTH);
1588         }
1589 }
1590
1591 static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
1592 {
1593         u32 pmen;
1594         unsigned long flags;
1595
1596         if (!cap_plmr(iommu->cap) && !cap_phmr(iommu->cap))
1597                 return;
1598
1599         raw_spin_lock_irqsave(&iommu->register_lock, flags);
1600         pmen = readl(iommu->reg + DMAR_PMEN_REG);
1601         pmen &= ~DMA_PMEN_EPM;
1602         writel(pmen, iommu->reg + DMAR_PMEN_REG);
1603
1604         /* wait for the protected region status bit to clear */
1605         IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
1606                 readl, !(pmen & DMA_PMEN_PRS), pmen);
1607
1608         raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1609 }
1610
1611 static void iommu_enable_translation(struct intel_iommu *iommu)
1612 {
1613         u32 sts;
1614         unsigned long flags;
1615
1616         raw_spin_lock_irqsave(&iommu->register_lock, flags);
1617         iommu->gcmd |= DMA_GCMD_TE;
1618         writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1619
1620         /* Make sure hardware completes it */
1621         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1622                       readl, (sts & DMA_GSTS_TES), sts);
1623
1624         raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1625 }
1626
1627 static void iommu_disable_translation(struct intel_iommu *iommu)
1628 {
1629         u32 sts;
1630         unsigned long flag;
1631
1632         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1633         iommu->gcmd &= ~DMA_GCMD_TE;
1634         writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1635
1636         /* Make sure hardware completes it */
1637         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1638                       readl, (!(sts & DMA_GSTS_TES)), sts);
1639
1640         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1641 }
1642
1643 static int iommu_init_domains(struct intel_iommu *iommu)
1644 {
1645         u32 ndomains, nlongs;
1646         size_t size;
1647
1648         ndomains = cap_ndoms(iommu->cap);
1649         pr_debug("%s: Number of Domains supported <%d>\n",
1650                  iommu->name, ndomains);
1651         nlongs = BITS_TO_LONGS(ndomains);
1652
1653         spin_lock_init(&iommu->lock);
1654
1655         iommu->domain_ids = kcalloc(nlongs, sizeof(unsigned long), GFP_KERNEL);
1656         if (!iommu->domain_ids) {
1657                 pr_err("%s: Allocating domain id array failed\n",
1658                        iommu->name);
1659                 return -ENOMEM;
1660         }
1661
1662         size = (ALIGN(ndomains, 256) >> 8) * sizeof(struct dmar_domain **);
1663         iommu->domains = kzalloc(size, GFP_KERNEL);
1664
1665         if (iommu->domains) {
1666                 size = 256 * sizeof(struct dmar_domain *);
1667                 iommu->domains[0] = kzalloc(size, GFP_KERNEL);
1668         }
1669
1670         if (!iommu->domains || !iommu->domains[0]) {
1671                 pr_err("%s: Allocating domain array failed\n",
1672                        iommu->name);
1673                 kfree(iommu->domain_ids);
1674                 kfree(iommu->domains);
1675                 iommu->domain_ids = NULL;
1676                 iommu->domains    = NULL;
1677                 return -ENOMEM;
1678         }
1679
1680         /*
1681          * If Caching mode is set, then invalid translations are tagged
1682          * with domain-id 0, hence we need to pre-allocate it. We also
1683          * use domain-id 0 as a marker for non-allocated domain-id, so
1684          * make sure it is not used for a real domain.
1685          */
1686         set_bit(0, iommu->domain_ids);
1687
1688         /*
1689          * VT-d spec rev 3.0 (section 6.2.3.1) requires that each PASID
1690          * entry for first-level or pass-through translation modes should
1691          * be programmed with a domain id different from those used for
1692          * second-level or nested translation. We reserve a domain id for
1693          * this purpose.
1694          */
1695         if (sm_supported(iommu))
1696                 set_bit(FLPT_DEFAULT_DID, iommu->domain_ids);
1697
1698         return 0;
1699 }
1700
1701 static void disable_dmar_iommu(struct intel_iommu *iommu)
1702 {
1703         struct device_domain_info *info, *tmp;
1704         unsigned long flags;
1705
1706         if (!iommu->domains || !iommu->domain_ids)
1707                 return;
1708
1709         spin_lock_irqsave(&device_domain_lock, flags);
1710         list_for_each_entry_safe(info, tmp, &device_domain_list, global) {
1711                 if (info->iommu != iommu)
1712                         continue;
1713
1714                 if (!info->dev || !info->domain)
1715                         continue;
1716
1717                 __dmar_remove_one_dev_info(info);
1718         }
1719         spin_unlock_irqrestore(&device_domain_lock, flags);
1720
1721         if (iommu->gcmd & DMA_GCMD_TE)
1722                 iommu_disable_translation(iommu);
1723 }
1724
1725 static void free_dmar_iommu(struct intel_iommu *iommu)
1726 {
1727         if ((iommu->domains) && (iommu->domain_ids)) {
1728                 int elems = ALIGN(cap_ndoms(iommu->cap), 256) >> 8;
1729                 int i;
1730
1731                 for (i = 0; i < elems; i++)
1732                         kfree(iommu->domains[i]);
1733                 kfree(iommu->domains);
1734                 kfree(iommu->domain_ids);
1735                 iommu->domains = NULL;
1736                 iommu->domain_ids = NULL;
1737         }
1738
1739         g_iommus[iommu->seq_id] = NULL;
1740
1741         /* free context mapping */
1742         free_context_table(iommu);
1743
1744 #ifdef CONFIG_INTEL_IOMMU_SVM
1745         if (pasid_supported(iommu)) {
1746                 if (ecap_prs(iommu->ecap))
1747                         intel_svm_finish_prq(iommu);
1748         }
1749         if (ecap_vcs(iommu->ecap) && vccap_pasid(iommu->vccap))
1750                 ioasid_unregister_allocator(&iommu->pasid_allocator);
1751
1752 #endif
1753 }
1754
1755 /*
1756  * Check and return whether first level is used by default for
1757  * DMA translation.
1758  */
1759 static bool first_level_by_default(void)
1760 {
1761         struct dmar_drhd_unit *drhd;
1762         struct intel_iommu *iommu;
1763         static int first_level_support = -1;
1764
1765         if (likely(first_level_support != -1))
1766                 return first_level_support;
1767
1768         first_level_support = 1;
1769
1770         rcu_read_lock();
1771         for_each_active_iommu(iommu, drhd) {
1772                 if (!sm_supported(iommu) || !ecap_flts(iommu->ecap)) {
1773                         first_level_support = 0;
1774                         break;
1775                 }
1776         }
1777         rcu_read_unlock();
1778
1779         return first_level_support;
1780 }
1781
1782 static struct dmar_domain *alloc_domain(int flags)
1783 {
1784         struct dmar_domain *domain;
1785
1786         domain = alloc_domain_mem();
1787         if (!domain)
1788                 return NULL;
1789
1790         memset(domain, 0, sizeof(*domain));
1791         domain->nid = NUMA_NO_NODE;
1792         domain->flags = flags;
1793         if (first_level_by_default())
1794                 domain->flags |= DOMAIN_FLAG_USE_FIRST_LEVEL;
1795         domain->has_iotlb_device = false;
1796         INIT_LIST_HEAD(&domain->devices);
1797
1798         return domain;
1799 }
1800
1801 /* Must be called with device_domain_lock and iommu->lock held */
1802 static int domain_attach_iommu(struct dmar_domain *domain,
1803                                struct intel_iommu *iommu)
1804 {
1805         unsigned long ndomains;
1806         int num;
1807
1808         assert_spin_locked(&device_domain_lock);
1809         assert_spin_locked(&iommu->lock);
1810
1811         domain->iommu_refcnt[iommu->seq_id] += 1;
1812         domain->iommu_count += 1;
1813         if (domain->iommu_refcnt[iommu->seq_id] == 1) {
1814                 ndomains = cap_ndoms(iommu->cap);
1815                 num      = find_first_zero_bit(iommu->domain_ids, ndomains);
1816
1817                 if (num >= ndomains) {
1818                         pr_err("%s: No free domain ids\n", iommu->name);
1819                         domain->iommu_refcnt[iommu->seq_id] -= 1;
1820                         domain->iommu_count -= 1;
1821                         return -ENOSPC;
1822                 }
1823
1824                 set_bit(num, iommu->domain_ids);
1825                 set_iommu_domain(iommu, num, domain);
1826
1827                 domain->iommu_did[iommu->seq_id] = num;
1828                 domain->nid                      = iommu->node;
1829
1830                 domain_update_iommu_cap(domain);
1831         }
1832
1833         return 0;
1834 }
1835
1836 static int domain_detach_iommu(struct dmar_domain *domain,
1837                                struct intel_iommu *iommu)
1838 {
1839         int num, count;
1840
1841         assert_spin_locked(&device_domain_lock);
1842         assert_spin_locked(&iommu->lock);
1843
1844         domain->iommu_refcnt[iommu->seq_id] -= 1;
1845         count = --domain->iommu_count;
1846         if (domain->iommu_refcnt[iommu->seq_id] == 0) {
1847                 num = domain->iommu_did[iommu->seq_id];
1848                 clear_bit(num, iommu->domain_ids);
1849                 set_iommu_domain(iommu, num, NULL);
1850
1851                 domain_update_iommu_cap(domain);
1852                 domain->iommu_did[iommu->seq_id] = 0;
1853         }
1854
1855         return count;
1856 }
1857
1858 static struct iova_domain reserved_iova_list;
1859 static struct lock_class_key reserved_rbtree_key;
1860
1861 static int dmar_init_reserved_ranges(void)
1862 {
1863         struct pci_dev *pdev = NULL;
1864         struct iova *iova;
1865         int i;
1866
1867         init_iova_domain(&reserved_iova_list, VTD_PAGE_SIZE, IOVA_START_PFN);
1868
1869         lockdep_set_class(&reserved_iova_list.iova_rbtree_lock,
1870                 &reserved_rbtree_key);
1871
1872         /* IOAPIC ranges shouldn't be accessed by DMA */
1873         iova = reserve_iova(&reserved_iova_list, IOVA_PFN(IOAPIC_RANGE_START),
1874                 IOVA_PFN(IOAPIC_RANGE_END));
1875         if (!iova) {
1876                 pr_err("Reserve IOAPIC range failed\n");
1877                 return -ENODEV;
1878         }
1879
1880         /* Reserve all PCI MMIO to avoid peer-to-peer access */
1881         for_each_pci_dev(pdev) {
1882                 struct resource *r;
1883
1884                 for (i = 0; i < PCI_NUM_RESOURCES; i++) {
1885                         r = &pdev->resource[i];
1886                         if (!r->flags || !(r->flags & IORESOURCE_MEM))
1887                                 continue;
1888                         iova = reserve_iova(&reserved_iova_list,
1889                                             IOVA_PFN(r->start),
1890                                             IOVA_PFN(r->end));
1891                         if (!iova) {
1892                                 pci_err(pdev, "Reserve iova for %pR failed\n", r);
1893                                 return -ENODEV;
1894                         }
1895                 }
1896         }
1897         return 0;
1898 }
1899
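     /*
      * Round the guest address width (gaw) up to an adjusted guest address
      * width (agaw) supported by the page-table layout, i.e. 12 plus a
      * multiple of 9, capped at 64. Illustrative example: gaw = 40 gives
      * r = (40 - 12) % 9 = 1, so agaw = 40 + 9 - 1 = 48.
      */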
1900 static inline int guestwidth_to_adjustwidth(int gaw)
1901 {
1902         int agaw;
1903         int r = (gaw - 12) % 9;
1904
1905         if (r == 0)
1906                 agaw = gaw;
1907         else
1908                 agaw = gaw + 9 - r;
1909         if (agaw > 64)
1910                 agaw = 64;
1911         return agaw;
1912 }
1913
1914 static void domain_exit(struct dmar_domain *domain)
1915 {
1916
1917         /* Remove associated devices and clear attached or cached domains */
1918         domain_remove_dev_info(domain);
1919
1920         /* destroy iovas */
1921         if (domain->domain.type == IOMMU_DOMAIN_DMA)
1922                 put_iova_domain(&domain->iovad);
1923
1924         if (domain->pgd) {
1925                 struct page *freelist;
1926
1927                 freelist = domain_unmap(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
1928                 dma_free_pagelist(freelist);
1929         }
1930
1931         free_domain_mem(domain);
1932 }
1933
1934 /*
1935  * Get the PASID directory size for a scalable mode context entry.
1936  * A value of X in the PDTS field of a scalable mode context entry
1937  * indicates a PASID directory with 2^(X + 7) entries.
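      * For example, assuming PASID_PDE_SHIFT is 6, max_pasid = 0x10000 gives
      * max_pde = 0x400, so the function returns 3 and the directory holds
      * 2^(3 + 7) = 1024 entries.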
1938  */
1939 static inline unsigned long context_get_sm_pds(struct pasid_table *table)
1940 {
1941         int pds, max_pde;
1942
1943         max_pde = table->max_pasid >> PASID_PDE_SHIFT;
1944         pds = find_first_bit((unsigned long *)&max_pde, MAX_NR_PASID_BITS);
1945         if (pds < 7)
1946                 return 0;
1947
1948         return pds - 7;
1949 }
1950
1951 /*
1952  * Set the RID_PASID field of a scalable mode context entry. The
1953  * IOMMU hardware will use the PASID value set in this field when
1954  * translating DMA requests that carry no PASID.
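      * PASID values are at most 20 bits wide, hence the 20-bit mask below.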
1955  */
1956 static inline void
1957 context_set_sm_rid2pasid(struct context_entry *context, unsigned long pasid)
1958 {
1959         context->hi |= pasid & ((1 << 20) - 1);
1960 }
1961
1962 /*
1963  * Set the DTE (Device-TLB Enable) field of a scalable mode context
1964  * entry.
1965  */
1966 static inline void context_set_sm_dte(struct context_entry *context)
1967 {
1968         context->lo |= (1 << 2);
1969 }
1970
1971 /*
1972  * Set the PRE (Page Request Enable) field of a scalable mode context
1973  * entry.
1974  */
1975 static inline void context_set_sm_pre(struct context_entry *context)
1976 {
1977         context->lo |= (1 << 4);
1978 }
1979
1980 /* Convert value to context PASID directory size field coding. */
1981 #define context_pdts(pds)       (((pds) & 0x7) << 9)
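     /* e.g. pds = 3 is encoded into bits 11:9 of the context entry's low u64 */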
1982
1983 static int domain_context_mapping_one(struct dmar_domain *domain,
1984                                       struct intel_iommu *iommu,
1985                                       struct pasid_table *table,
1986                                       u8 bus, u8 devfn)
1987 {
1988         u16 did = domain->iommu_did[iommu->seq_id];
1989         int translation = CONTEXT_TT_MULTI_LEVEL;
1990         struct device_domain_info *info = NULL;
1991         struct context_entry *context;
1992         unsigned long flags;
1993         int ret;
1994
1995         WARN_ON(did == 0);
1996
1997         if (hw_pass_through && domain_type_is_si(domain))
1998                 translation = CONTEXT_TT_PASS_THROUGH;
1999
2000         pr_debug("Set context mapping for %02x:%02x.%d\n",
2001                 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
2002
2003         BUG_ON(!domain->pgd);
2004
2005         spin_lock_irqsave(&device_domain_lock, flags);
2006         spin_lock(&iommu->lock);
2007
2008         ret = -ENOMEM;
2009         context = iommu_context_addr(iommu, bus, devfn, 1);
2010         if (!context)
2011                 goto out_unlock;
2012
2013         ret = 0;
2014         if (context_present(context))
2015                 goto out_unlock;
2016
2017         /*
2018          * For kdump cases, old valid entries may be cached due to in-flight
2019          * DMA and the copied page table, but they are never unmapped, so we
2020          * need an explicit cache flush for the newly-mapped device. For
2021          * kdump, at this point the device is expected to have completed its
2022          * reset during the driver probe stage, so no in-flight DMA will
2023          * exist and we do not need to worry about stale entries
2024          * hereafter.
2025          */
2026         if (context_copied(context)) {
2027                 u16 did_old = context_domain_id(context);
2028
2029                 if (did_old < cap_ndoms(iommu->cap)) {
2030                         iommu->flush.flush_context(iommu, did_old,
2031                                                    (((u16)bus) << 8) | devfn,
2032                                                    DMA_CCMD_MASK_NOBIT,
2033                                                    DMA_CCMD_DEVICE_INVL);
2034                         iommu->flush.flush_iotlb(iommu, did_old, 0, 0,
2035                                                  DMA_TLB_DSI_FLUSH);
2036                 }
2037         }
2038
2039         context_clear_entry(context);
2040
2041         if (sm_supported(iommu)) {
2042                 unsigned long pds;
2043
2044                 WARN_ON(!table);
2045
2046                 /* Setup the PASID DIR pointer: */
2047                 pds = context_get_sm_pds(table);
2048                 context->lo = (u64)virt_to_phys(table->table) |
2049                                 context_pdts(pds);
2050
2051                 /* Setup the RID_PASID field: */
2052                 context_set_sm_rid2pasid(context, PASID_RID2PASID);
2053
2054                 /*
2055                  * Setup the Device-TLB enable bit and Page request
2056                  * Enable bit:
2057                  */
2058                 info = iommu_support_dev_iotlb(domain, iommu, bus, devfn);
2059                 if (info && info->ats_supported)
2060                         context_set_sm_dte(context);
2061                 if (info && info->pri_supported)
2062                         context_set_sm_pre(context);
2063         } else {
2064                 struct dma_pte *pgd = domain->pgd;
2065                 int agaw;
2066
2067                 context_set_domain_id(context, did);
2068
2069                 if (translation != CONTEXT_TT_PASS_THROUGH) {
2070                         /*
2071                          * Skip top levels of the page table for an IOMMU with
2072                          * a smaller agaw than the default. Unnecessary for PT mode.
2073                          */
2074                         for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) {
2075                                 ret = -ENOMEM;
2076                                 pgd = phys_to_virt(dma_pte_addr(pgd));
2077                                 if (!dma_pte_present(pgd))
2078                                         goto out_unlock;
2079                         }
2080
2081                         info = iommu_support_dev_iotlb(domain, iommu, bus, devfn);
2082                         if (info && info->ats_supported)
2083                                 translation = CONTEXT_TT_DEV_IOTLB;
2084                         else
2085                                 translation = CONTEXT_TT_MULTI_LEVEL;
2086
2087                         context_set_address_root(context, virt_to_phys(pgd));
2088                         context_set_address_width(context, agaw);
2089                 } else {
2090                         /*
2091                          * In pass-through mode, AW must be programmed to
2092                          * indicate the largest AGAW value supported by
2093                          * hardware, and ASR is ignored by hardware.
2094                          */
2095                         context_set_address_width(context, iommu->msagaw);
2096                 }
2097
2098                 context_set_translation_type(context, translation);
2099         }
2100
2101         context_set_fault_enable(context);
2102         context_set_present(context);
2103         if (!ecap_coherent(iommu->ecap))
2104                 clflush_cache_range(context, sizeof(*context));
2105
2106         /*
2107          * It's a non-present-to-present mapping. If the hardware doesn't cache
2108          * non-present entries, we only need to flush the write-buffer. If it
2109          * _does_ cache non-present entries, then it does so in the special
2110          * domain #0, which we have to flush:
2111          */
2112         if (cap_caching_mode(iommu->cap)) {
2113                 iommu->flush.flush_context(iommu, 0,
2114                                            (((u16)bus) << 8) | devfn,
2115                                            DMA_CCMD_MASK_NOBIT,
2116                                            DMA_CCMD_DEVICE_INVL);
2117                 iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH);
2118         } else {
2119                 iommu_flush_write_buffer(iommu);
2120         }
2121         iommu_enable_dev_iotlb(info);
2122
2123         ret = 0;
2124
2125 out_unlock:
2126         spin_unlock(&iommu->lock);
2127         spin_unlock_irqrestore(&device_domain_lock, flags);
2128
2129         return ret;
2130 }
2131
2132 struct domain_context_mapping_data {
2133         struct dmar_domain *domain;
2134         struct intel_iommu *iommu;
2135         struct pasid_table *table;
2136 };
2137
2138 static int domain_context_mapping_cb(struct pci_dev *pdev,
2139                                      u16 alias, void *opaque)
2140 {
2141         struct domain_context_mapping_data *data = opaque;
2142
2143         return domain_context_mapping_one(data->domain, data->iommu,
2144                                           data->table, PCI_BUS_NUM(alias),
2145                                           alias & 0xff);
2146 }
2147
2148 static int
2149 domain_context_mapping(struct dmar_domain *domain, struct device *dev)
2150 {
2151         struct domain_context_mapping_data data;
2152         struct pasid_table *table;
2153         struct intel_iommu *iommu;
2154         u8 bus, devfn;
2155
2156         iommu = device_to_iommu(dev, &bus, &devfn);
2157         if (!iommu)
2158                 return -ENODEV;
2159
2160         table = intel_pasid_get_table(dev);
2161
2162         if (!dev_is_pci(dev))
2163                 return domain_context_mapping_one(domain, iommu, table,
2164                                                   bus, devfn);
2165
2166         data.domain = domain;
2167         data.iommu = iommu;
2168         data.table = table;
2169
2170         return pci_for_each_dma_alias(to_pci_dev(dev),
2171                                       &domain_context_mapping_cb, &data);
2172 }
2173
2174 static int domain_context_mapped_cb(struct pci_dev *pdev,
2175                                     u16 alias, void *opaque)
2176 {
2177         struct intel_iommu *iommu = opaque;
2178
2179         return !device_context_mapped(iommu, PCI_BUS_NUM(alias), alias & 0xff);
2180 }
2181
2182 static int domain_context_mapped(struct device *dev)
2183 {
2184         struct intel_iommu *iommu;
2185         u8 bus, devfn;
2186
2187         iommu = device_to_iommu(dev, &bus, &devfn);
2188         if (!iommu)
2189                 return -ENODEV;
2190
2191         if (!dev_is_pci(dev))
2192                 return device_context_mapped(iommu, bus, devfn);
2193
2194         return !pci_for_each_dma_alias(to_pci_dev(dev),
2195                                        domain_context_mapped_cb, iommu);
2196 }
2197
2198 /* Return the number of VT-d pages, but aligned to the MM page size */
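     /*
      * e.g. with 4KiB pages, an 0x1000-byte buffer starting at page offset
      * 0x800 spans PAGE_ALIGN(0x800 + 0x1000) >> VTD_PAGE_SHIFT == 2 VT-d pages
      */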
2199 static inline unsigned long aligned_nrpages(unsigned long host_addr,
2200                                             size_t size)
2201 {
2202         host_addr &= ~PAGE_MASK;
2203         return PAGE_ALIGN(host_addr + size) >> VTD_PAGE_SHIFT;
2204 }
2205
2206 /* Return largest possible superpage level for a given mapping */
2207 static inline int hardware_largepage_caps(struct dmar_domain *domain,
2208                                           unsigned long iov_pfn,
2209                                           unsigned long phy_pfn,
2210                                           unsigned long pages)
2211 {
2212         int support, level = 1;
2213         unsigned long pfnmerge;
2214
2215         support = domain->iommu_superpage;
2216
2217         /* To use a large page, the virtual *and* physical addresses
2218            must be aligned to 2MiB/1GiB/etc. Lower bits set in either
2219            of them will mean we have to use smaller pages. So just
2220            merge them and check both at once. */
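             /* Example: iov_pfn = 0x200 and phy_pfn = 0x600 both have their low
                9 bits clear (2MiB-aligned with 4KiB base pages), so given enough
                pages and superpage support, the loop below settles on level 2 (2MiB). */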
2221         pfnmerge = iov_pfn | phy_pfn;
2222
2223         while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) {
2224                 pages >>= VTD_STRIDE_SHIFT;
2225                 if (!pages)
2226                         break;
2227                 pfnmerge >>= VTD_STRIDE_SHIFT;
2228                 level++;
2229                 support--;
2230         }
2231         return level;
2232 }
2233
2234 static int __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2235                             struct scatterlist *sg, unsigned long phys_pfn,
2236                             unsigned long nr_pages, int prot)
2237 {
2238         struct dma_pte *first_pte = NULL, *pte = NULL;
2239         phys_addr_t pteval;
2240         unsigned long sg_res = 0;
2241         unsigned int largepage_lvl = 0;
2242         unsigned long lvl_pages = 0;
2243         u64 attr;
2244
2245         BUG_ON(!domain_pfn_supported(domain, iov_pfn + nr_pages - 1));
2246
2247         if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
2248                 return -EINVAL;
2249
2250         attr = prot & (DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP);
2251         if (domain_use_first_level(domain))
2252                 attr |= DMA_FL_PTE_PRESENT | DMA_FL_PTE_XD | DMA_FL_PTE_US;
2253
2254         if (!sg) {
2255                 sg_res = nr_pages;
2256                 pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | attr;
2257         }
2258
2259         while (nr_pages > 0) {
2260                 uint64_t tmp;
2261
2262                 if (!sg_res) {
2263                         unsigned int pgoff = sg->offset & ~PAGE_MASK;
2264
2265                         sg_res = aligned_nrpages(sg->offset, sg->length);
2266                         sg->dma_address = ((dma_addr_t)iov_pfn << VTD_PAGE_SHIFT) + pgoff;
2267                         sg->dma_length = sg->length;
2268                         pteval = (sg_phys(sg) - pgoff) | attr;
2269                         phys_pfn = pteval >> VTD_PAGE_SHIFT;
2270                 }
2271
2272                 if (!pte) {
2273                         largepage_lvl = hardware_largepage_caps(domain, iov_pfn, phys_pfn, sg_res);
2274
2275                         first_pte = pte = pfn_to_dma_pte(domain, iov_pfn, &largepage_lvl);
2276                         if (!pte)
2277                                 return -ENOMEM;
2278                         /* It is a large page */
2279                         if (largepage_lvl > 1) {
2280                                 unsigned long nr_superpages, end_pfn;
2281
2282                                 pteval |= DMA_PTE_LARGE_PAGE;
2283                                 lvl_pages = lvl_to_nr_pages(largepage_lvl);
2284
2285                                 nr_superpages = sg_res / lvl_pages;
2286                                 end_pfn = iov_pfn + nr_superpages * lvl_pages - 1;
2287
2288                                 /*
2289                                  * Ensure that old small page tables are
2290                                  * removed to make room for superpage(s).
2291                                  * We're adding new large pages, so make sure
2292                                  * we don't remove their parent tables.
2293                                  */
2294                                 dma_pte_free_pagetable(domain, iov_pfn, end_pfn,
2295                                                        largepage_lvl + 1);
2296                         } else {
2297                                 pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE;
2298                         }
2299
2300                 }
2301                 /* We don't need a lock here; nobody else
2302                  * touches this iova range.
2303                  */
2304                 tmp = cmpxchg64_local(&pte->val, 0ULL, pteval);
2305                 if (tmp) {
2306                         static int dumps = 5;
2307                         pr_crit("ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n",
2308                                 iov_pfn, tmp, (unsigned long long)pteval);
2309                         if (dumps) {
2310                                 dumps--;
2311                                 debug_dma_dump_mappings(NULL);
2312                         }
2313                         WARN_ON(1);
2314                 }
2315
2316                 lvl_pages = lvl_to_nr_pages(largepage_lvl);
2317
2318                 BUG_ON(nr_pages < lvl_pages);
2319                 BUG_ON(sg_res < lvl_pages);
2320
2321                 nr_pages -= lvl_pages;
2322                 iov_pfn += lvl_pages;
2323                 phys_pfn += lvl_pages;
2324                 pteval += lvl_pages * VTD_PAGE_SIZE;
2325                 sg_res -= lvl_pages;
2326
2327                 /* If the next PTE would be the first in a new page, then we
2328                    need to flush the cache on the entries we've just written.
2329                    And then we'll need to recalculate 'pte', so clear it and
2330                    let it get set again in the if (!pte) block above.
2331
2332                    If we're done (!nr_pages) we need to flush the cache too.
2333
2334                    Also if we've been setting superpages, we may need to
2335                    recalculate 'pte' and switch back to smaller pages for the
2336                    end of the mapping, if the trailing size is not enough to
2337                    use another superpage (i.e. sg_res < lvl_pages). */
2338                 pte++;
2339                 if (!nr_pages || first_pte_in_page(pte) ||
2340                     (largepage_lvl > 1 && sg_res < lvl_pages)) {
2341                         domain_flush_cache(domain, first_pte,
2342                                            (void *)pte - (void *)first_pte);
2343                         pte = NULL;
2344                 }
2345
2346                 if (!sg_res && nr_pages)
2347                         sg = sg_next(sg);
2348         }
2349         return 0;
2350 }
2351
2352 static int domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2353                           struct scatterlist *sg, unsigned long phys_pfn,
2354                           unsigned long nr_pages, int prot)
2355 {
2356         int iommu_id, ret;
2357         struct intel_iommu *iommu;
2358
2359         /* Do the real mapping first */
2360         ret = __domain_mapping(domain, iov_pfn, sg, phys_pfn, nr_pages, prot);
2361         if (ret)
2362                 return ret;
2363
2364         for_each_domain_iommu(iommu_id, domain) {
2365                 iommu = g_iommus[iommu_id];
2366                 __mapping_notify_one(iommu, domain, iov_pfn, nr_pages);
2367         }
2368
2369         return 0;
2370 }
2371
2372 static inline int domain_sg_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2373                                     struct scatterlist *sg, unsigned long nr_pages,
2374                                     int prot)
2375 {
2376         return domain_mapping(domain, iov_pfn, sg, 0, nr_pages, prot);
2377 }
2378
2379 static inline int domain_pfn_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2380                                      unsigned long phys_pfn, unsigned long nr_pages,
2381                                      int prot)
2382 {
2383         return domain_mapping(domain, iov_pfn, NULL, phys_pfn, nr_pages, prot);
2384 }
2385
2386 static void domain_context_clear_one(struct intel_iommu *iommu, u8 bus, u8 devfn)
2387 {
2388         unsigned long flags;
2389         struct context_entry *context;
2390         u16 did_old;
2391
2392         if (!iommu)
2393                 return;
2394
2395         spin_lock_irqsave(&iommu->lock, flags);
2396         context = iommu_context_addr(iommu, bus, devfn, 0);
2397         if (!context) {
2398                 spin_unlock_irqrestore(&iommu->lock, flags);
2399                 return;
2400         }
2401         did_old = context_domain_id(context);
2402         context_clear_entry(context);
2403         __iommu_flush_cache(iommu, context, sizeof(*context));
2404         spin_unlock_irqrestore(&iommu->lock, flags);
2405         iommu->flush.flush_context(iommu,
2406                                    did_old,
2407                                    (((u16)bus) << 8) | devfn,
2408                                    DMA_CCMD_MASK_NOBIT,
2409                                    DMA_CCMD_DEVICE_INVL);
2410         iommu->flush.flush_iotlb(iommu,
2411                                  did_old,
2412                                  0,
2413                                  0,
2414                                  DMA_TLB_DSI_FLUSH);
2415 }
2416
2417 static inline void unlink_domain_info(struct device_domain_info *info)
2418 {
2419         assert_spin_locked(&device_domain_lock);
2420         list_del(&info->link);
2421         list_del(&info->global);
2422         if (info->dev)
2423                 info->dev->archdata.iommu = NULL;
2424 }
2425
2426 static void domain_remove_dev_info(struct dmar_domain *domain)
2427 {
2428         struct device_domain_info *info, *tmp;
2429         unsigned long flags;
2430
2431         spin_lock_irqsave(&device_domain_lock, flags);
2432         list_for_each_entry_safe(info, tmp, &domain->devices, link)
2433                 __dmar_remove_one_dev_info(info);
2434         spin_unlock_irqrestore(&device_domain_lock, flags);
2435 }
2436
2437 struct dmar_domain *find_domain(struct device *dev)
2438 {
2439         struct device_domain_info *info;
2440
2441         if (unlikely(attach_deferred(dev) || iommu_dummy(dev)))
2442                 return NULL;
2443
2444         /* No lock here; assumes no domain exit in the normal case */
2445         info = get_domain_info(dev);
2446         if (likely(info))
2447                 return info->domain;
2448
2449         return NULL;
2450 }
2451
2452 static void do_deferred_attach(struct device *dev)
2453 {
2454         struct iommu_domain *domain;
2455
2456         dev->archdata.iommu = NULL;
2457         domain = iommu_get_domain_for_dev(dev);
2458         if (domain)
2459                 intel_iommu_attach_device(domain, dev);
2460 }
2461
2462 static inline struct device_domain_info *
2463 dmar_search_domain_by_dev_info(int segment, int bus, int devfn)
2464 {
2465         struct device_domain_info *info;
2466
2467         list_for_each_entry(info, &device_domain_list, global)
2468                 if (info->segment == segment && info->bus == bus &&
2469                     info->devfn == devfn)
2470                         return info;
2471
2472         return NULL;
2473 }
2474
2475 static int domain_setup_first_level(struct intel_iommu *iommu,
2476                                     struct dmar_domain *domain,
2477                                     struct device *dev,
2478                                     int pasid)
2479 {
2480         int flags = PASID_FLAG_SUPERVISOR_MODE;
2481         struct dma_pte *pgd = domain->pgd;
2482         int agaw, level;
2483
2484         /*
2485          * Skip top levels of the page table for an IOMMU with
2486          * a smaller agaw than the default. Unnecessary for PT mode.
2487          */
2488         for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) {
2489                 pgd = phys_to_virt(dma_pte_addr(pgd));
2490                 if (!dma_pte_present(pgd))
2491                         return -ENOMEM;
2492         }
2493
2494         level = agaw_to_level(agaw);
2495         if (level != 4 && level != 5)
2496                 return -EINVAL;
2497
2498         flags |= (level == 5) ? PASID_FLAG_FL5LP : 0;
2499
2500         return intel_pasid_setup_first_level(iommu, dev, (pgd_t *)pgd, pasid,
2501                                              domain->iommu_did[iommu->seq_id],
2502                                              flags);
2503 }
2504
2505 static bool dev_is_real_dma_subdevice(struct device *dev)
2506 {
2507         return dev && dev_is_pci(dev) &&
2508                pci_real_dma_dev(to_pci_dev(dev)) != to_pci_dev(dev);
2509 }
2510
2511 static struct dmar_domain *dmar_insert_one_dev_info(struct intel_iommu *iommu,
2512                                                     int bus, int devfn,
2513                                                     struct device *dev,
2514                                                     struct dmar_domain *domain)
2515 {
2516         struct dmar_domain *found = NULL;
2517         struct device_domain_info *info;
2518         unsigned long flags;
2519         int ret;
2520
2521         info = alloc_devinfo_mem();
2522         if (!info)
2523                 return NULL;
2524
2525         if (!dev_is_real_dma_subdevice(dev)) {
2526                 info->bus = bus;
2527                 info->devfn = devfn;
2528                 info->segment = iommu->segment;
2529         } else {
2530                 struct pci_dev *pdev = to_pci_dev(dev);
2531
2532                 info->bus = pdev->bus->number;
2533                 info->devfn = pdev->devfn;
2534                 info->segment = pci_domain_nr(pdev->bus);
2535         }
2536
2537         info->ats_supported = info->pasid_supported = info->pri_supported = 0;
2538         info->ats_enabled = info->pasid_enabled = info->pri_enabled = 0;
2539         info->ats_qdep = 0;
2540         info->dev = dev;
2541         info->domain = domain;
2542         info->iommu = iommu;
2543         info->pasid_table = NULL;
2544         info->auxd_enabled = 0;
2545         INIT_LIST_HEAD(&info->auxiliary_domains);
2546
2547         if (dev && dev_is_pci(dev)) {
2548                 struct pci_dev *pdev = to_pci_dev(info->dev);
2549
2550                 if (ecap_dev_iotlb_support(iommu->ecap) &&
2551                     pci_ats_supported(pdev) &&
2552                     dmar_find_matched_atsr_unit(pdev))
2553                         info->ats_supported = 1;
2554
2555                 if (sm_supported(iommu)) {
2556                         if (pasid_supported(iommu)) {
2557                                 int features = pci_pasid_features(pdev);
2558                                 if (features >= 0)
2559                                         info->pasid_supported = features | 1;
2560                         }
2561
2562                         if (info->ats_supported && ecap_prs(iommu->ecap) &&
2563                             pci_pri_supported(pdev))
2564                                 info->pri_supported = 1;
2565                 }
2566         }
2567
2568         spin_lock_irqsave(&device_domain_lock, flags);
2569         if (dev)
2570                 found = find_domain(dev);
2571
2572         if (!found) {
2573                 struct device_domain_info *info2;
2574                 info2 = dmar_search_domain_by_dev_info(info->segment, info->bus,
2575                                                        info->devfn);
2576                 if (info2) {
2577                         found      = info2->domain;
2578                         info2->dev = dev;
2579                 }
2580         }
2581
2582         if (found) {
2583                 spin_unlock_irqrestore(&device_domain_lock, flags);
2584                 free_devinfo_mem(info);
2585                 /* Caller must free the original domain */
2586                 return found;
2587         }
2588
2589         spin_lock(&iommu->lock);
2590         ret = domain_attach_iommu(domain, iommu);
2591         spin_unlock(&iommu->lock);
2592
2593         if (ret) {
2594                 spin_unlock_irqrestore(&device_domain_lock, flags);
2595                 free_devinfo_mem(info);
2596                 return NULL;
2597         }
2598
2599         list_add(&info->link, &domain->devices);
2600         list_add(&info->global, &device_domain_list);
2601         if (dev)
2602                 dev->archdata.iommu = info;
2603         spin_unlock_irqrestore(&device_domain_lock, flags);
2604
2605         /* PASID table is mandatory for a PCI device in scalable mode. */
2606         if (dev && dev_is_pci(dev) && sm_supported(iommu)) {
2607                 ret = intel_pasid_alloc_table(dev);
2608                 if (ret) {
2609                         dev_err(dev, "PASID table allocation failed\n");
2610                         dmar_remove_one_dev_info(dev);
2611                         return NULL;
2612                 }
2613
2614                 /* Setup the PASID entry for requests without PASID: */
2615                 spin_lock(&iommu->lock);
2616                 if (hw_pass_through && domain_type_is_si(domain))
2617                         ret = intel_pasid_setup_pass_through(iommu, domain,
2618                                         dev, PASID_RID2PASID);
2619                 else if (domain_use_first_level(domain))
2620                         ret = domain_setup_first_level(iommu, domain, dev,
2621                                         PASID_RID2PASID);
2622                 else
2623                         ret = intel_pasid_setup_second_level(iommu, domain,
2624                                         dev, PASID_RID2PASID);
2625                 spin_unlock(&iommu->lock);
2626                 if (ret) {
2627                         dev_err(dev, "Setup RID2PASID failed\n");
2628                         dmar_remove_one_dev_info(dev);
2629                         return NULL;
2630                 }
2631         }
2632
2633         if (dev && domain_context_mapping(domain, dev)) {
2634                 dev_err(dev, "Domain context map failed\n");
2635                 dmar_remove_one_dev_info(dev);
2636                 return NULL;
2637         }
2638
2639         return domain;
2640 }
2641
2642 static int iommu_domain_identity_map(struct dmar_domain *domain,
2643                                      unsigned long first_vpfn,
2644                                      unsigned long last_vpfn)
2645 {
2646         /*
2647          * The RMRR range might overlap with the physical memory range,
2648          * so clear it first.
2649          */
2650         dma_pte_clear_range(domain, first_vpfn, last_vpfn);
2651
2652         return __domain_mapping(domain, first_vpfn, NULL,
2653                                 first_vpfn, last_vpfn - first_vpfn + 1,
2654                                 DMA_PTE_READ|DMA_PTE_WRITE);
2655 }
2656
2657 static int md_domain_init(struct dmar_domain *domain, int guest_width);
2658
2659 static int __init si_domain_init(int hw)
2660 {
2661         struct dmar_rmrr_unit *rmrr;
2662         struct device *dev;
2663         int i, nid, ret;
2664
2665         si_domain = alloc_domain(DOMAIN_FLAG_STATIC_IDENTITY);
2666         if (!si_domain)
2667                 return -EFAULT;
2668
2669         if (md_domain_init(si_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
2670                 domain_exit(si_domain);
2671                 return -EFAULT;
2672         }
2673
2674         if (hw)
2675                 return 0;
2676
2677         for_each_online_node(nid) {
2678                 unsigned long start_pfn, end_pfn;
2679                 int i;
2680
2681                 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
2682                         ret = iommu_domain_identity_map(si_domain,
2683                                         mm_to_dma_pfn(start_pfn),
2684                                         mm_to_dma_pfn(end_pfn));
2685                         if (ret)
2686                                 return ret;
2687                 }
2688         }
2689
2690         /*
2691          * Identity map the RMRRs so that devices with RMRRs can also use
2692          * the si_domain.
2693          */
2694         for_each_rmrr_units(rmrr) {
2695                 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
2696                                           i, dev) {
2697                         unsigned long long start = rmrr->base_address;
2698                         unsigned long long end = rmrr->end_address;
2699
2700                         if (WARN_ON(end < start ||
2701                                     end >> agaw_to_width(si_domain->agaw)))
2702                                 continue;
2703
2704                         ret = iommu_domain_identity_map(si_domain,
2705                                         mm_to_dma_pfn(start >> PAGE_SHIFT),
2706                                         mm_to_dma_pfn(end >> PAGE_SHIFT));
2707                         if (ret)
2708                                 return ret;
2709                 }
2710         }
2711
2712         return 0;
2713 }
2714
2715 static int domain_add_dev_info(struct dmar_domain *domain, struct device *dev)
2716 {
2717         struct dmar_domain *ndomain;
2718         struct intel_iommu *iommu;
2719         u8 bus, devfn;
2720
2721         iommu = device_to_iommu(dev, &bus, &devfn);
2722         if (!iommu)
2723                 return -ENODEV;
2724
2725         ndomain = dmar_insert_one_dev_info(iommu, bus, devfn, dev, domain);
2726         if (ndomain != domain)
2727                 return -EBUSY;
2728
2729         return 0;
2730 }
2731
2732 static bool device_has_rmrr(struct device *dev)
2733 {
2734         struct dmar_rmrr_unit *rmrr;
2735         struct device *tmp;
2736         int i;
2737
2738         rcu_read_lock();
2739         for_each_rmrr_units(rmrr) {
2740                 /*
2741                  * Return TRUE if this RMRR contains the device that
2742                  * is passed in.
2743                  */
2744                 for_each_active_dev_scope(rmrr->devices,
2745                                           rmrr->devices_cnt, i, tmp)
2746                         if (tmp == dev ||
2747                             is_downstream_to_pci_bridge(dev, tmp)) {
2748                                 rcu_read_unlock();
2749                                 return true;
2750                         }
2751         }
2752         rcu_read_unlock();
2753         return false;
2754 }
2755
2756 /**
2757  * device_rmrr_is_relaxable - Test whether the RMRR of this device
2758  * is relaxable (i.e. is allowed to be left unenforced under some conditions)
2759  * @dev: device handle
2760  *
2761  * We assume that PCI USB devices with RMRRs have them largely
2762  * for historical reasons and that the RMRR space is not actively used post
2763  * boot.  This exclusion may change if vendors begin to abuse it.
2764  *
2765  * The same exception is made for graphics devices, with the requirement that
2766  * any use of the RMRR regions will be torn down before assigning the device
2767  * to a guest.
2768  *
2769  * Return: true if the RMRR is relaxable, false otherwise
2770  */
2771 static bool device_rmrr_is_relaxable(struct device *dev)
2772 {
2773         struct pci_dev *pdev;
2774
2775         if (!dev_is_pci(dev))
2776                 return false;
2777
2778         pdev = to_pci_dev(dev);
2779         if (IS_USB_DEVICE(pdev) || IS_GFX_DEVICE(pdev))
2780                 return true;
2781         else
2782                 return false;
2783 }
2784
2785 /*
2786  * There are a couple of cases where we need to restrict the functionality of
2787  * devices associated with RMRRs.  The first is when evaluating a device for
2788  * identity mapping because problems exist when devices are moved in and out
2789  * of domains and their respective RMRR information is lost.  This means that
2790  * a device with associated RMRRs will never be in a "passthrough" domain.
2791  * The second is use of the device through the IOMMU API.  This interface
2792  * expects to have full control of the IOVA space for the device.  We cannot
2793  * satisfy both the requirement that RMRR access is maintained and have an
2794  * unencumbered IOVA space.  We also have no ability to quiesce the device's
2795  * use of the RMRR space or even inform the IOMMU API user of the restriction.
2796  * We therefore prevent devices associated with an RMRR from participating in
2797  * the IOMMU API, which eliminates them from device assignment.
2798  *
2799  * In both cases, devices which have relaxable RMRRs are not concerned by this
2800  * restriction. See device_rmrr_is_relaxable comment.
2801  */
2802 static bool device_is_rmrr_locked(struct device *dev)
2803 {
2804         if (!device_has_rmrr(dev))
2805                 return false;
2806
2807         if (device_rmrr_is_relaxable(dev))
2808                 return false;
2809
2810         return true;
2811 }
2812
2813 /*
2814  * Return the required default domain type for a specific device.
2815  *
2816  * @dev: the device in question
2818  *
2819  * Returns:
2820  *  - IOMMU_DOMAIN_DMA: device requires a dynamic mapping domain
2821  *  - IOMMU_DOMAIN_IDENTITY: device requires an identity mapping domain
2822  *  - 0: both identity and dynamic domains work for this device
2823  */
2824 static int device_def_domain_type(struct device *dev)
2825 {
2826         if (dev_is_pci(dev)) {
2827                 struct pci_dev *pdev = to_pci_dev(dev);
2828
2829                 /*
2830                  * Prevent any device marked as untrusted from getting
2831                  * placed into the static identity mapping domain.
2832                  */
2833                 if (pdev->untrusted)
2834                         return IOMMU_DOMAIN_DMA;
2835
2836                 if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev))
2837                         return IOMMU_DOMAIN_IDENTITY;
2838
2839                 if ((iommu_identity_mapping & IDENTMAP_GFX) && IS_GFX_DEVICE(pdev))
2840                         return IOMMU_DOMAIN_IDENTITY;
2841         }
2842
2843         return 0;
2844 }
2845
2846 static void intel_iommu_init_qi(struct intel_iommu *iommu)
2847 {
2848         /*
2849          * Start from a sane IOMMU hardware state.
2850          * If queued invalidation was already initialized by us
2851          * (for example, while enabling interrupt remapping), then
2852          * things are already rolling from a sane state.
2853          */
2854         if (!iommu->qi) {
2855                 /*
2856                  * Clear any previous faults.
2857                  */
2858                 dmar_fault(-1, iommu);
2859                 /*
2860                  * Disable queued invalidation if supported and already enabled
2861                  * before OS handover.
2862                  */
2863                 dmar_disable_qi(iommu);
2864         }
2865
2866         if (dmar_enable_qi(iommu)) {
2867                 /*
2868                  * Queued invalidation is not enabled; use register-based invalidation
2869                  */
2870                 iommu->flush.flush_context = __iommu_flush_context;
2871                 iommu->flush.flush_iotlb = __iommu_flush_iotlb;
2872                 pr_info("%s: Using Register based invalidation\n",
2873                         iommu->name);
2874         } else {
2875                 iommu->flush.flush_context = qi_flush_context;
2876                 iommu->flush.flush_iotlb = qi_flush_iotlb;
2877                 pr_info("%s: Using Queued invalidation\n", iommu->name);
2878         }
2879 }
2880
2881 static int copy_context_table(struct intel_iommu *iommu,
2882                               struct root_entry *old_re,
2883                               struct context_entry **tbl,
2884                               int bus, bool ext)
2885 {
2886         int tbl_idx, pos = 0, idx, devfn, ret = 0, did;
2887         struct context_entry *new_ce = NULL, ce;
2888         struct context_entry *old_ce = NULL;
2889         struct root_entry re;
2890         phys_addr_t old_ce_phys;
2891
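             /*
              * With extended root entries, each bus has two context tables
              * (lower and upper devfn halves), hence two slots per bus in tbl[].
              */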
2892         tbl_idx = ext ? bus * 2 : bus;
2893         memcpy(&re, old_re, sizeof(re));
2894
2895         for (devfn = 0; devfn < 256; devfn++) {
2896                 /* First calculate the correct index */
2897                 idx = (ext ? devfn * 2 : devfn) % 256;
2898
2899                 if (idx == 0) {
2900                         /* First save what we may have and clean up */
2901                         if (new_ce) {
2902                                 tbl[tbl_idx] = new_ce;
2903                                 __iommu_flush_cache(iommu, new_ce,
2904                                                     VTD_PAGE_SIZE);
2905                                 pos = 1;
2906                         }
2907
2908                         if (old_ce)
2909                                 memunmap(old_ce);
2910
2911                         ret = 0;
2912                         if (devfn < 0x80)
2913                                 old_ce_phys = root_entry_lctp(&re);
2914                         else
2915                                 old_ce_phys = root_entry_uctp(&re);
2916
2917                         if (!old_ce_phys) {
2918                                 if (ext && devfn == 0) {
2919                                         /* No LCTP, try UCTP */
2920                                         devfn = 0x7f;
2921                                         continue;
2922                                 } else {
2923                                         goto out;
2924                                 }
2925                         }
2926
2927                         ret = -ENOMEM;
2928                         old_ce = memremap(old_ce_phys, PAGE_SIZE,
2929                                         MEMREMAP_WB);
2930                         if (!old_ce)
2931                                 goto out;
2932
2933                         new_ce = alloc_pgtable_page(iommu->node);
2934                         if (!new_ce)
2935                                 goto out_unmap;
2936
2937                         ret = 0;
2938                 }
2939
2940                 /* Now copy the context entry */
2941                 memcpy(&ce, old_ce + idx, sizeof(ce));
2942
2943                 if (!__context_present(&ce))
2944                         continue;
2945
2946                 did = context_domain_id(&ce);
2947                 if (did >= 0 && did < cap_ndoms(iommu->cap))
2948                         set_bit(did, iommu->domain_ids);
2949
2950                 /*
2951                  * We need a marker for copied context entries. This
2952                  * marker needs to work for the old format as well as
2953                  * for extended context entries.
2954                  *
2955                  * Bit 67 of the context entry is used. In the old
2956                  * format this bit is available to software, in the
2957                  * extended format it is the PGE bit, but PGE is ignored
2958                  * by HW if PASIDs are disabled (and thus still
2959                  * available).
2960                  *
2961                  * So disable PASIDs first and then mark the entry
2962                  * copied. This means that we don't copy PASID
2963                  * translations from the old kernel, but this is fine as
2964                  * faults there are not fatal.
2965                  */
2966                 context_clear_pasid_enable(&ce);
2967                 context_set_copied(&ce);
2968
2969                 new_ce[idx] = ce;
2970         }
2971
2972         tbl[tbl_idx + pos] = new_ce;
2973
2974         __iommu_flush_cache(iommu, new_ce, VTD_PAGE_SIZE);
2975
2976 out_unmap:
2977         memunmap(old_ce);
2978
2979 out:
2980         return ret;
2981 }
2982
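/*
 * Copy the root and context tables that the previous kernel left programmed
 * in hardware, so that in-flight DMA keeps working while this (kdump) kernel
 * takes over.  Domain IDs found in the old context entries are marked as in
 * use so that new allocations cannot clash with them.
 */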
2983 static int copy_translation_tables(struct intel_iommu *iommu)
2984 {
2985         struct context_entry **ctxt_tbls;
2986         struct root_entry *old_rt;
2987         phys_addr_t old_rt_phys;
2988         int ctxt_table_entries;
2989         unsigned long flags;
2990         u64 rtaddr_reg;
2991         int bus, ret;
2992         bool new_ext, ext;
2993
2994         rtaddr_reg = dmar_readq(iommu->reg + DMAR_RTADDR_REG);
2995         ext        = !!(rtaddr_reg & DMA_RTADDR_RTT);
2996         new_ext    = !!ecap_ecs(iommu->ecap);
2997
2998         /*
2999          * The RTT bit can only be changed when translation is disabled,
3000          * but disabling translation would open a window for data
3001          * corruption. So bail out and don't copy anything if we would
3002          * have to change the bit.
3003          */
3004         if (new_ext != ext)
3005                 return -EINVAL;
3006
3007         old_rt_phys = rtaddr_reg & VTD_PAGE_MASK;
3008         if (!old_rt_phys)
3009                 return -EINVAL;
3010
3011         old_rt = memremap(old_rt_phys, PAGE_SIZE, MEMREMAP_WB);
3012         if (!old_rt)
3013                 return -ENOMEM;
3014
3015         /* This is too big for the stack - allocate it from slab */
3016         ctxt_table_entries = ext ? 512 : 256;
3017         ret = -ENOMEM;
3018         ctxt_tbls = kcalloc(ctxt_table_entries, sizeof(void *), GFP_KERNEL);
3019         if (!ctxt_tbls)
3020                 goto out_unmap;
3021
3022         for (bus = 0; bus < 256; bus++) {
3023                 ret = copy_context_table(iommu, &old_rt[bus],
3024                                          ctxt_tbls, bus, ext);
3025                 if (ret) {
3026                         pr_err("%s: Failed to copy context table for bus %d\n",
3027                                 iommu->name, bus);
3028                         continue;
3029                 }
3030         }
3031
3032         spin_lock_irqsave(&iommu->lock, flags);
3033
3034         /* Context tables are copied; now write them (with the present bit set) to the root_entry table */
3035         for (bus = 0; bus < 256; bus++) {
3036                 int idx = ext ? bus * 2 : bus;
3037                 u64 val;
3038
3039                 if (ctxt_tbls[idx]) {
3040                         val = virt_to_phys(ctxt_tbls[idx]) | 1;
3041                         iommu->root_entry[bus].lo = val;
3042                 }
3043
3044                 if (!ext || !ctxt_tbls[idx + 1])
3045                         continue;
3046
3047                 val = virt_to_phys(ctxt_tbls[idx + 1]) | 1;
3048                 iommu->root_entry[bus].hi = val;
3049         }
3050
3051         spin_unlock_irqrestore(&iommu->lock, flags);
3052
3053         kfree(ctxt_tbls);
3054
3055         __iommu_flush_cache(iommu, iommu->root_entry, PAGE_SIZE);
3056
3057         ret = 0;
3058
3059 out_unmap:
3060         memunmap(old_rt);
3061
3062         return ret;
3063 }
3064
3065 #ifdef CONFIG_INTEL_IOMMU_SVM
3066 static ioasid_t intel_vcmd_ioasid_alloc(ioasid_t min, ioasid_t max, void *data)
3067 {
3068         struct intel_iommu *iommu = data;
3069         ioasid_t ioasid;
3070
3071         if (!iommu)
3072                 return INVALID_IOASID;
3073         /*
3074          * The VT-d virtual command interface always uses the full 20-bit
3075          * PASID range. The host can partition the guest PASID range based
3076          * on policies, but that is outside the guest's control.
3077          */
3078         if (min < PASID_MIN || max > intel_pasid_max_id)
3079                 return INVALID_IOASID;
3080
3081         if (vcmd_alloc_pasid(iommu, &ioasid))
3082                 return INVALID_IOASID;
3083
3084         return ioasid;
3085 }
3086
3087 static void intel_vcmd_ioasid_free(ioasid_t ioasid, void *data)
3088 {
3089         struct intel_iommu *iommu = data;
3090
3091         if (!iommu)
3092                 return;
3093         /*
3094          * The IOASID ownership sanity check is done at the upper layer,
3095          * e.g. VFIO. We can only free the PASID once all devices are unbound.
3096          */
3097         if (ioasid_find(NULL, ioasid, NULL)) {
3098                 pr_alert("Cannot free active IOASID %d\n", ioasid);
3099                 return;
3100         }
3101         vcmd_free_pasid(iommu, ioasid);
3102 }
3103
3104 static void register_pasid_allocator(struct intel_iommu *iommu)
3105 {
3106         /*
3107          * If we are running in the host, there is no need for a custom
3108          * allocator because PASIDs are allocated system-wide by the host.
3109          */
3110         if (!cap_caching_mode(iommu->cap))
3111                 return;
3112
3113         if (!sm_supported(iommu)) {
3114                 pr_warn("VT-d Scalable Mode not enabled, no PASID allocation\n");
3115                 return;
3116         }
3117
3118         /*
3119          * Register a custom PASID allocator if we are running in a guest;
3120          * guest PASIDs must be obtained via the virtual command interface.
3121          * There can be multiple vIOMMUs in each guest but only one allocator
3122          * is active. All vIOMMU allocators eventually call the same host
3123          * allocator.
3124          */
3125         if (!ecap_vcs(iommu->ecap) || !vccap_pasid(iommu->vccap))
3126                 return;
3127
3128         pr_info("Register custom PASID allocator\n");
3129         iommu->pasid_allocator.alloc = intel_vcmd_ioasid_alloc;
3130         iommu->pasid_allocator.free = intel_vcmd_ioasid_free;
3131         iommu->pasid_allocator.pdata = (void *)iommu;
3132         if (ioasid_register_allocator(&iommu->pasid_allocator)) {
3133                 pr_warn("Custom PASID allocator failed, scalable mode disabled\n");
3134                 /*
3135                  * Disable scalable mode on this IOMMU if there is no
3136                  * custom allocator. Mixing SM-capable and non-SM vIOMMUs
3137                  * is not supported.
3138                  */
3139                 intel_iommu_sm = 0;
3140         }
3141 }
3142 #endif
3143
3144 static int __init init_dmars(void)
3145 {
3146         struct dmar_drhd_unit *drhd;
3147         struct intel_iommu *iommu;
3148         int ret;
3149
3150         /*
3151          * for each drhd
3152          *    allocate root
3153          *    initialize and program root entry to not present
3154          * endfor
3155          */
3156         for_each_drhd_unit(drhd) {
3157                 /*
3158                  * No lock needed: this is only incremented in the
3159                  * single-threaded kernel __init code path; all other
3160                  * accesses are read-only.
3161                  */
3162                 if (g_num_of_iommus < DMAR_UNITS_SUPPORTED) {
3163                         g_num_of_iommus++;
3164                         continue;
3165                 }
3166                 pr_err_once("Exceeded %d IOMMUs\n", DMAR_UNITS_SUPPORTED);
3167         }
3168
3169         /* Preallocate enough resources for IOMMU hot-addition */
3170         if (g_num_of_iommus < DMAR_UNITS_SUPPORTED)
3171                 g_num_of_iommus = DMAR_UNITS_SUPPORTED;
3172
3173         g_iommus = kcalloc(g_num_of_iommus, sizeof(struct intel_iommu *),
3174                         GFP_KERNEL);
3175         if (!g_iommus) {
3176                 pr_err("Allocating global iommu array failed\n");
3177                 ret = -ENOMEM;
3178                 goto error;
3179         }
3180
3181         for_each_iommu(iommu, drhd) {
3182                 if (drhd->ignored) {
3183                         iommu_disable_translation(iommu);
3184                         continue;
3185                 }
3186
3187                 /*
3188                  * Cap the system-wide PASID limit at the smallest PASID
3189                  * size supported by any IOMMU in the system, so that the
3190                  * PASID table never exceeds what every IOMMU can handle.
3191                  */
3192                 if (pasid_supported(iommu)) {
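                        /* ecap_pss() is the supported PASID width minus one, so 2 << pss is the PASID count. */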
3193                         u32 temp = 2 << ecap_pss(iommu->ecap);
3194
3195                         intel_pasid_max_id = min_t(u32, temp,
3196                                                    intel_pasid_max_id);
3197                 }
3198
3199                 g_iommus[iommu->seq_id] = iommu;
3200
3201                 intel_iommu_init_qi(iommu);
3202
3203                 ret = iommu_init_domains(iommu);
3204                 if (ret)
3205                         goto free_iommu;
3206
3207                 init_translation_status(iommu);
3208
3209                 if (translation_pre_enabled(iommu) && !is_kdump_kernel()) {
3210                         iommu_disable_translation(iommu);
3211                         clear_translation_pre_enabled(iommu);
3212                         pr_warn("Translation was enabled for %s but we are not in kdump mode\n",
3213                                 iommu->name);
3214                 }
3215
3216                 /*
3217                  * TBD:
3218                  * we could share the same root & context tables
3219                  * among all IOMMUs. Needs to be split out later.
3220                  */
3221                 ret = iommu_alloc_root_entry(iommu);
3222                 if (ret)
3223                         goto free_iommu;
3224
3225                 if (translation_pre_enabled(iommu)) {
3226                         pr_info("Translation already enabled - trying to copy translation structures\n");
3227
3228                         ret = copy_translation_tables(iommu);
3229                         if (ret) {
3230                                 /*
3231                                  * We found the IOMMU with translation
3232                                  * enabled - but failed to copy over the
3233                                  * old root-entry table. Try to proceed
3234                                  * by disabling translation now and
3235                                  * allocating a clean root-entry table.
3236                                  * This might cause DMAR faults, but
3237                                  * probably the dump will still succeed.
3238                                  */
3239                                 pr_err("Failed to copy translation tables from previous kernel for %s\n",
3240                                        iommu->name);
3241                                 iommu_disable_translation(iommu);
3242                                 clear_translation_pre_enabled(iommu);
3243                         } else {
3244                                 pr_info("Copied translation tables from previous kernel for %s\n",
3245                                         iommu->name);
3246                         }
3247                 }
3248
3249                 if (!ecap_pass_through(iommu->ecap))
3250                         hw_pass_through = 0;
3251                 intel_svm_check(iommu);
3252         }
3253
3254         /*
3255          * Now that qi is enabled on all iommus, set the root entry and flush
3256          * caches. This is required on some Intel X58 chipsets, otherwise the
3257          * flush_context function will loop forever and the boot hangs.
3258          */
3259         for_each_active_iommu(iommu, drhd) {
3260                 iommu_flush_write_buffer(iommu);
3261 #ifdef CONFIG_INTEL_IOMMU_SVM
3262                 register_pasid_allocator(iommu);
3263 #endif
3264                 iommu_set_root_entry(iommu);
3265                 iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
3266                 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
3267         }
3268
3269 #ifdef CONFIG_INTEL_IOMMU_BROKEN_GFX_WA
3270         dmar_map_gfx = 0;
3271 #endif
3272
3273         if (!dmar_map_gfx)
3274                 iommu_identity_mapping |= IDENTMAP_GFX;
3275
3276         check_tylersburg_isoch();
3277
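        /* Set up the static identity (si) domain used for devices that get 1:1 mappings. */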
3278         ret = si_domain_init(hw_pass_through);
3279         if (ret)
3280                 goto free_iommu;
3281
3282         /*
3283          * for each drhd
3284          *   enable fault log
3285          *   global invalidate context cache
3286          *   global invalidate iotlb
3287          *   enable translation
3288          */
3289         for_each_iommu(iommu, drhd) {
3290                 if (drhd->ignored) {
3291                         /*
3292                          * we always have to disable PMRs or DMA may fail on
3293                          * this device
3294                          */
3295                         if (force_on)
3296                                 iommu_disable_protect_mem_regions(iommu);
3297                         continue;
3298                 }
3299
3300                 iommu_flush_write_buffer(iommu);
3301
3302 #ifdef CONFIG_INTEL_IOMMU_SVM
3303                 if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
3304                         /*
3305                          * Calling dmar_alloc_hwirq() with dmar_global_lock held
3306                          * could cause a lock race, so drop the lock around it.
3307                          */
3308                         up_write(&dmar_global_lock);
3309                         ret = intel_svm_enable_prq(iommu);
3310                         down_write(&dmar_global_lock);
3311                         if (ret)
3312                                 goto free_iommu;
3313                 }
3314 #endif
3315                 ret = dmar_set_interrupt(iommu);
3316                 if (ret)
3317                         goto free_iommu;
3318         }
3319
3320         return 0;
3321
3322 free_iommu:
3323         for_each_active_iommu(iommu, drhd) {
3324                 disable_dmar_iommu(iommu);
3325                 free_dmar_iommu(iommu);
3326         }
3327
3328         kfree(g_iommus);
3329
3330 error:
3331         return ret;
3332 }
3333
3334 /* This takes a number of _MM_ pages, not VTD pages */
3335 static unsigned long intel_alloc_iova(struct device *dev,
3336                                      struct dmar_domain *domain,
3337                                      unsigned long nrpages, uint64_t dma_mask)
3338 {
3339         unsigned long iova_pfn;
3340
3341         /*
3342          * Restrict dma_mask to the width that the iommu can handle.
3343          * First-level translation restricts the input-address to a
3344          * canonical address (i.e., address bits 63:N have the same
3345          * value as address bit [N-1], where N is 48 with 4-level
3346          * paging and 57 with 5-level paging). Hence, skip bit
3347          * [N-1].
3348          */
3349         if (domain_use_first_level(domain))
3350                 dma_mask = min_t(uint64_t, DOMAIN_MAX_ADDR(domain->gaw - 1),
3351                                  dma_mask);
3352         else
3353                 dma_mask = min_t(uint64_t, DOMAIN_MAX_ADDR(domain->gaw),
3354                                  dma_mask);
3355
3356         /* Ensure we reserve the whole size-aligned region */
3357         nrpages = __roundup_pow_of_two(nrpages);
3358
3359         if (!dmar_forcedac && dma_mask > DMA_BIT_MASK(32)) {
3360                 /*
3361                  * First try to allocate an io virtual address in
3362                  * DMA_BIT_MASK(32) and if that fails then try allocating
3363                  * from higher range
3364                  */
3365                 iova_pfn = alloc_iova_fast(&domain->iovad, nrpages,
3366                                            IOVA_PFN(DMA_BIT_MASK(32)), false);
3367                 if (iova_pfn)
3368                         return iova_pfn;
3369         }
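        /* Fall back to the full DMA mask (or start here if forcedac is set or the mask is 32-bit). */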
3370         iova_pfn = alloc_iova_fast(&domain->iovad, nrpages,
3371                                    IOVA_PFN(dma_mask), true);
3372         if (unlikely(!iova_pfn)) {
3373                 dev_err_once(dev, "Allocating %ld-page iova failed\n",
3374                              nrpages);
3375                 return 0;
3376         }
3377
3378         return iova_pfn;
3379 }
3380
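/*
 * Map @size bytes of physically contiguous memory at @paddr for @dev and
 * return the bus address to hand to the device, or DMA_MAPPING_ERROR.
 */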
3381 static dma_addr_t __intel_map_single(struct device *dev, phys_addr_t paddr,
3382                                      size_t size, int dir, u64 dma_mask)
3383 {
3384         struct dmar_domain *domain;
3385         phys_addr_t start_paddr;
3386         unsigned long iova_pfn;
3387         int prot = 0;
3388         int ret;
3389         struct intel_iommu *iommu;
3390         unsigned long paddr_pfn = paddr >> PAGE_SHIFT;
3391
3392         BUG_ON(dir == DMA_NONE);
3393
3394         if (unlikely(attach_deferred(dev)))
3395                 do_deferred_attach(dev);
3396
3397         domain = find_domain(dev);
3398         if (!domain)
3399                 return DMA_MAPPING_ERROR;
3400
3401         iommu = domain_get_iommu(domain);
3402         size = aligned_nrpages(paddr, size);
3403
3404         iova_pfn = intel_alloc_iova(dev, domain, dma_to_mm_pfn(size), dma_mask);
3405         if (!iova_pfn)
3406                 goto error;
3407
3408         /*
3409          * Check if the DMAR supports zero-length reads on write-only
3410          * mappings.
3411          */
3412         if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL ||
3413                         !cap_zlr(iommu->cap))
3414                 prot |= DMA_PTE_READ;
3415         if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
3416                 prot |= DMA_PTE_WRITE;
3417         /*
3418          * paddr .. paddr + size might cover only part of a page, so map the
3419          * whole page.  Note: if two parts of one page are mapped separately,
3420          * we might end up with two guest addresses mapping to the same host
3421          * paddr, but this is not a big problem.
3422          */
3423         ret = domain_pfn_mapping(domain, mm_to_dma_pfn(iova_pfn),
3424                                  mm_to_dma_pfn(paddr_pfn), size, prot);
3425         if (ret)
3426                 goto error;
3427
3428         start_paddr = (phys_addr_t)iova_pfn << PAGE_SHIFT;
3429         start_paddr += paddr & ~PAGE_MASK;
3430
3431         trace_map_single(dev, start_paddr, paddr, size << VTD_PAGE_SHIFT);
3432
3433         return start_paddr;
3434
3435 error:
3436         if (iova_pfn)
3437                 free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(size));
3438         dev_err(dev, "Device request: %zx@%llx dir %d --- failed\n",
3439                 size, (unsigned long long)paddr, dir);
3440         return DMA_MAPPING_ERROR;
3441 }
3442
3443 static dma_addr_t intel_map_page(struct device *dev, struct page *page,
3444                                  unsigned long offset, size_t size,
3445                                  enum dma_data_direction dir,
3446                                  unsigned long attrs)
3447 {
3448         return __intel_map_single(dev, page_to_phys(page) + offset,
3449                                   size, dir, *dev->dma_mask);
3450 }
3451
3452 static dma_addr_t intel_map_resource(struct device *dev, phys_addr_t phys_addr,
3453                                      size_t size, enum dma_data_direction dir,
3454                                      unsigned long attrs)
3455 {
3456         return __intel_map_single(dev, phys_addr, size, dir, *dev->dma_mask);
3457 }
3458
3459 static void intel_unmap(struct device *dev, dma_addr_t dev_addr, size_t size)
3460 {
3461         struct dmar_domain *domain;
3462         unsigned long start_pfn, last_pfn;
3463         unsigned long nrpages;
3464         unsigned long iova_pfn;
3465         struct intel_iommu *iommu;
3466         struct page *freelist;
3467         struct pci_dev *pdev = NULL;
3468
3469         domain = find_domain(dev);
3470         BUG_ON(!domain);
3471
3472         iommu = domain_get_iommu(domain);
3473
3474         iova_pfn = IOVA_PFN(dev_addr);
3475
3476         nrpages = aligned_nrpages(dev_addr, size);
3477         start_pfn = mm_to_dma_pfn(iova_pfn);
3478         last_pfn = start_pfn + nrpages - 1;
3479
3480         if (dev_is_pci(dev))
3481                 pdev = to_pci_dev(dev);
3482
3483         freelist = domain_unmap(domain, start_pfn, last_pfn);
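        /*
         * Strict mode, untrusted devices, and the lack of an IOVA flush
         * queue all require a synchronous IOTLB flush; otherwise the flush
         * and page freeing are deferred via the flush queue.
         */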
3484         if (intel_iommu_strict || (pdev && pdev->untrusted) ||
3485                         !has_iova_flush_queue(&domain->iovad)) {
3486                 iommu_flush_iotlb_psi(iommu, domain, start_pfn,
3487                                       nrpages, !freelist, 0);
3488                 /* free iova */
3489                 free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(nrpages));
3490                 dma_free_pagelist(freelist);
3491         } else {
3492                 queue_iova(&domain->iovad, iova_pfn, nrpages,
3493                            (unsigned long)freelist);
3494                 /*
3495                  * Queue up the release of the unmap to save the roughly 1/6th
3496                  * of the CPU time otherwise spent on the iotlb flush operation.
3497                  */
3498         }
3499
3500         trace_unmap_single(dev, dev_addr, size);
3501 }
3502
3503 static void intel_unmap_page(struct device *dev, dma_addr_t dev_addr,
3504                              size_t size, enum dma_data_direction dir,
3505                              unsigned long attrs)
3506 {
3507         intel_unmap(dev, dev_addr, size);
3508 }
3509
3510 static void intel_unmap_resource(struct device *dev, dma_addr_t dev_addr,
3511                 size_t size, enum dma_data_direction dir, unsigned long attrs)
3512 {
3513         intel_unmap(dev, dev_addr, size);
3514 }
3515
3516 static void *intel_alloc_coherent(struct device *dev, size_t size,
3517                                   dma_addr_t *dma_handle, gfp_t flags,
3518                                   unsigned long attrs)
3519 {
3520         struct page *page = NULL;
3521         int order;
3522
3523         if (unlikely(attach_deferred(dev)))
3524                 do_deferred_attach(dev);
3525
3526         size = PAGE_ALIGN(size);
3527         order = get_order(size);
3528
3529         if (gfpflags_allow_blocking(flags)) {
3530                 unsigned int count = size >> PAGE_SHIFT;
3531
3532                 page = dma_alloc_from_contiguous(dev, count, order,
3533                                                  flags & __GFP_NOWARN);
3534         }
3535
3536         if (!page)
3537                 page = alloc_pages(flags, order);
3538         if (!page)
3539                 return NULL;
3540         memset(page_address(page), 0, size);
3541
3542         *dma_handle = __intel_map_single(dev, page_to_phys(page), size,
3543                                          DMA_BIDIRECTIONAL,
3544                                          dev->coherent_dma_mask);
3545         if (*dma_handle != DMA_MAPPING_ERROR)
3546                 return page_address(page);
3547         if (!dma_release_from_contiguous(dev, page, size >> PAGE_SHIFT))
3548                 __free_pages(page, order);
3549
3550         return NULL;
3551 }
3552
3553 static void intel_free_coherent(struct device *dev, size_t size, void *vaddr,
3554                                 dma_addr_t dma_handle, unsigned long attrs)
3555 {
3556         int order;
3557         struct page *page = virt_to_page(vaddr);
3558
3559         size = PAGE_ALIGN(size);
3560         order = get_order(size);
3561
3562         intel_unmap(dev, dma_handle, size);
3563         if (!dma_release_from_contiguous(dev, page, size >> PAGE_SHIFT))
3564                 __free_pages(page, order);
3565 }
3566
3567 static void intel_unmap_sg(struct device *dev, struct scatterlist *sglist,
3568                            int nelems, enum dma_data_direction dir,
3569                            unsigned long attrs)
3570 {
3571         dma_addr_t startaddr = sg_dma_address(sglist) & PAGE_MASK;
3572         unsigned long nrpages = 0;
3573         struct scatterlist *sg;
3574         int i;
3575
3576         for_each_sg(sglist, sg, nelems, i) {
3577                 nrpages += aligned_nrpages(sg_dma_address(sg), sg_dma_len(sg));
3578         }
3579
3580         intel_unmap(dev, startaddr, nrpages << VTD_PAGE_SHIFT);
3581
3582         trace_unmap_sg(dev, startaddr, nrpages << VTD_PAGE_SHIFT);
3583 }
3584
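/*
 * Map a scatterlist into one contiguous IOVA range.  Returns the number of
 * elements mapped, or 0 on failure (after tearing down any partial mapping).
 */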
3585 static int intel_map_sg(struct device *dev, struct scatterlist *sglist, int nelems,
3586                         enum dma_data_direction dir, unsigned long attrs)
3587 {
3588         int i;
3589         struct dmar_domain *domain;
3590         size_t size = 0;
3591         int prot = 0;
3592         unsigned long iova_pfn;
3593         int ret;
3594         struct scatterlist *sg;
3595         unsigned long start_vpfn;
3596         struct intel_iommu *iommu;
3597
3598         BUG_ON(dir == DMA_NONE);
3599
3600         if (unlikely(attach_deferred(dev)))
3601                 do_deferred_attach(dev);
3602
3603         domain = find_domain(dev);
3604         if (!domain)
3605                 return 0;
3606
3607         iommu = domain_get_iommu(domain);
3608
3609         for_each_sg(sglist, sg, nelems, i)
3610                 size += aligned_nrpages(sg->offset, sg->length);
3611
3612         iova_pfn = intel_alloc_iova(dev, domain, dma_to_mm_pfn(size),
3613                                 *dev->dma_mask);
3614         if (!iova_pfn) {
3615                 sglist->dma_length = 0;
3616                 return 0;
3617         }
3618
3619         /*
3620          * Check if the DMAR supports zero-length reads on write-only
3621          * mappings.
3622          */
3623         if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL ||
3624                         !cap_zlr(iommu->cap))
3625                 prot |= DMA_PTE_READ;
3626         if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
3627                 prot |= DMA_PTE_WRITE;
3628
3629         start_vpfn = mm_to_dma_pfn(iova_pfn);
3630
3631         ret = domain_sg_mapping(domain, start_vpfn, sglist, size, prot);
3632         if (unlikely(ret)) {
3633                 dma_pte_free_pagetable(domain, start_vpfn,
3634                                        start_vpfn + size - 1,
3635                                        agaw_to_level(domain->agaw) + 1);
3636                 free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(size));
3637                 return 0;
3638         }
3639
3640         for_each_sg(sglist, sg, nelems, i)
3641                 trace_map_sg(dev, i + 1, nelems, sg);
3642
3643         return nelems;
3644 }
3645
3646 static u64 intel_get_required_mask(struct device *dev)
3647 {
3648         return DMA_BIT_MASK(32);
3649 }
3650
3651 static const struct dma_map_ops intel_dma_ops = {
3652         .alloc = intel_alloc_coherent,
3653         .free = intel_free_coherent,
3654         .map_sg = intel_map_sg,
3655         .unmap_sg = intel_unmap_sg,
3656         .map_page = intel_map_page,
3657         .unmap_page = intel_unmap_page,
3658         .map_resource = intel_map_resource,
3659         .unmap_resource = intel_unmap_resource,
3660         .dma_supported = dma_direct_supported,
3661         .mmap = dma_common_mmap,
3662         .get_sgtable = dma_common_get_sgtable,
3663         .get_required_mask = intel_get_required_mask,
3664 };
3665
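/*
 * The bounce_* helpers below implement the DMA ops used for untrusted
 * devices: buffers that are not aligned to the VT-d page size are bounced
 * through swiotlb so the device can never reach memory outside its own
 * page-granular mappings.
 */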
3666 static void
3667 bounce_sync_single(struct device *dev, dma_addr_t addr, size_t size,
3668                    enum dma_data_direction dir, enum dma_sync_target target)
3669 {
3670         struct dmar_domain *domain;
3671         phys_addr_t tlb_addr;
3672
3673         domain = find_domain(dev);
3674         if (WARN_ON(!domain))
3675                 return;
3676
3677         tlb_addr = intel_iommu_iova_to_phys(&domain->domain, addr);
3678         if (is_swiotlb_buffer(tlb_addr))
3679                 swiotlb_tbl_sync_single(dev, tlb_addr, size, dir, target);
3680 }
3681
3682 static dma_addr_t
3683 bounce_map_single(struct device *dev, phys_addr_t paddr, size_t size,
3684                   enum dma_data_direction dir, unsigned long attrs,
3685                   u64 dma_mask)
3686 {
3687         size_t aligned_size = ALIGN(size, VTD_PAGE_SIZE);
3688         struct dmar_domain *domain;
3689         struct intel_iommu *iommu;
3690         unsigned long iova_pfn;
3691         unsigned long nrpages;
3692         phys_addr_t tlb_addr;
3693         int prot = 0;
3694         int ret;
3695
3696         if (unlikely(attach_deferred(dev)))
3697                 do_deferred_attach(dev);
3698
3699         domain = find_domain(dev);
3700
3701         if (WARN_ON(dir == DMA_NONE || !domain))
3702                 return DMA_MAPPING_ERROR;
3703
3704         iommu = domain_get_iommu(domain);
3705         if (WARN_ON(!iommu))
3706                 return DMA_MAPPING_ERROR;
3707
3708         nrpages = aligned_nrpages(0, size);
3709         iova_pfn = intel_alloc_iova(dev, domain,
3710                                     dma_to_mm_pfn(nrpages), dma_mask);
3711         if (!iova_pfn)
3712                 return DMA_MAPPING_ERROR;
3713
3714         /*
3715          * Check if the DMAR supports zero-length reads on write-only
3716          * mappings.
3717          */
3718         if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL ||
3719                         !cap_zlr(iommu->cap))
3720                 prot |= DMA_PTE_READ;
3721         if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
3722                 prot |= DMA_PTE_WRITE;
3723
3724         /*
3725          * If both the physical buffer start address and size are
3726          * page aligned, we don't need to use a bounce page.
3727          */
3728         if (!IS_ALIGNED(paddr | size, VTD_PAGE_SIZE)) {
3729                 tlb_addr = swiotlb_tbl_map_single(dev,
3730                                 __phys_to_dma(dev, io_tlb_start),
3731                                 paddr, size, aligned_size, dir, attrs);
3732                 if (tlb_addr == DMA_MAPPING_ERROR) {
3733                         goto swiotlb_error;
3734                 } else {
3735                         /* Zero the padding so no stale swiotlb data leaks to the device. */
3736                         void *padding_start = phys_to_virt(tlb_addr);
3737                         size_t padding_size = aligned_size;
3738
3739                         if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC) &&
3740                             (dir == DMA_TO_DEVICE ||
3741                              dir == DMA_BIDIRECTIONAL)) {
3742                                 padding_start += size;
3743                                 padding_size -= size;
3744                         }
3745
3746                         memset(padding_start, 0, padding_size);
3747                 }
3748         } else {
3749                 tlb_addr = paddr;
3750         }
3751
3752         ret = domain_pfn_mapping(domain, mm_to_dma_pfn(iova_pfn),
3753                                  tlb_addr >> VTD_PAGE_SHIFT, nrpages, prot);
3754         if (ret)
3755                 goto mapping_error;
3756
3757         trace_bounce_map_single(dev, iova_pfn << PAGE_SHIFT, paddr, size);
3758
3759         return (phys_addr_t)iova_pfn << PAGE_SHIFT;
3760
3761 mapping_error:
3762         if (is_swiotlb_buffer(tlb_addr))
3763                 swiotlb_tbl_unmap_single(dev, tlb_addr, size,
3764                                          aligned_size, dir, attrs);
3765 swiotlb_error:
3766         free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(nrpages));
3767         dev_err(dev, "Device bounce map: %zx@%llx dir %d --- failed\n",
3768                 size, (unsigned long long)paddr, dir);
3769
3770         return DMA_MAPPING_ERROR;
3771 }
3772
3773 static void
3774 bounce_unmap_single(struct device *dev, dma_addr_t dev_addr, size_t size,
3775                     enum dma_data_direction dir, unsigned long attrs)
3776 {
3777         size_t aligned_size = ALIGN(size, VTD_PAGE_SIZE);
3778         struct dmar_domain *domain;
3779         phys_addr_t tlb_addr;
3780
3781         domain = find_domain(dev);
3782         if (WARN_ON(!domain))
3783                 return;
3784
3785         tlb_addr = intel_iommu_iova_to_phys(&domain->domain, dev_addr);
3786         if (WARN_ON(!tlb_addr))
3787                 return;
3788
3789         intel_unmap(dev, dev_addr, size);
3790         if (is_swiotlb_buffer(tlb_addr))
3791                 swiotlb_tbl_unmap_single(dev, tlb_addr, size,
3792                                          aligned_size, dir, attrs);
3793
3794         trace_bounce_unmap_single(dev, dev_addr, size);
3795 }
3796
3797 static dma_addr_t
3798 bounce_map_page(struct device *dev, struct page *page, unsigned long offset,
3799                 size_t size, enum dma_data_direction dir, unsigned long attrs)
3800 {
3801         return bounce_map_single(dev, page_to_phys(page) + offset,
3802                                  size, dir, attrs, *dev->dma_mask);
3803 }
3804
3805 static dma_addr_t
3806 bounce_map_resource(struct device *dev, phys_addr_t phys_addr, size_t size,
3807                     enum dma_data_direction dir, unsigned long attrs)
3808 {
3809         return bounce_map_single(dev, phys_addr, size,
3810                                  dir, attrs, *dev->dma_mask);
3811 }
3812
3813 static void
3814 bounce_unmap_page(struct device *dev, dma_addr_t dev_addr, size_t size,
3815                   enum dma_data_direction dir, unsigned long attrs)
3816 {
3817         bounce_unmap_single(dev, dev_addr, size, dir, attrs);
3818 }
3819
3820 static void
3821 bounce_unmap_resource(struct device *dev, dma_addr_t dev_addr, size_t size,
3822                       enum dma_data_direction dir, unsigned long attrs)
3823 {
3824         bounce_unmap_single(dev, dev_addr, size, dir, attrs);
3825 }
3826
3827 static void
3828 bounce_unmap_sg(struct device *dev, struct scatterlist *sglist, int nelems,
3829                 enum dma_data_direction dir, unsigned long attrs)
3830 {
3831         struct scatterlist *sg;
3832         int i;
3833
3834         for_each_sg(sglist, sg, nelems, i)
3835                 bounce_unmap_page(dev, sg->dma_address,
3836                                   sg_dma_len(sg), dir, attrs);
3837 }
3838
3839 static int
3840 bounce_map_sg(struct device *dev, struct scatterlist *sglist, int nelems,
3841               enum dma_data_direction dir, unsigned long attrs)
3842 {
3843         int i;
3844         struct scatterlist *sg;
3845
3846         for_each_sg(sglist, sg, nelems, i) {
3847                 sg->dma_address = bounce_map_page(dev, sg_page(sg),
3848                                                   sg->offset, sg->length,
3849                                                   dir, attrs);
3850                 if (sg->dma_address == DMA_MAPPING_ERROR)
3851                         goto out_unmap;
3852                 sg_dma_len(sg) = sg->length;
3853         }
3854
3855         for_each_sg(sglist, sg, nelems, i)
3856                 trace_bounce_map_sg(dev, i + 1, nelems, sg);
3857
3858         return nelems;
3859
3860 out_unmap:
3861         bounce_unmap_sg(dev, sglist, i, dir, attrs | DMA_ATTR_SKIP_CPU_SYNC);
3862         return 0;
3863 }
3864
3865 static void
3866 bounce_sync_single_for_cpu(struct device *dev, dma_addr_t addr,
3867                            size_t size, enum dma_data_direction dir)
3868 {
3869         bounce_sync_single(dev, addr, size, dir, SYNC_FOR_CPU);
3870 }
3871
3872 static void
3873 bounce_sync_single_for_device(struct device *dev, dma_addr_t addr,
3874                               size_t size, enum dma_data_direction dir)
3875 {
3876         bounce_sync_single(dev, addr, size, dir, SYNC_FOR_DEVICE);
3877 }
3878
3879 static void
3880 bounce_sync_sg_for_cpu(struct device *dev, struct scatterlist *sglist,
3881                        int nelems, enum dma_data_direction dir)
3882 {
3883         struct scatterlist *sg;
3884         int i;
3885
3886         for_each_sg(sglist, sg, nelems, i)
3887                 bounce_sync_single(dev, sg_dma_address(sg),
3888                                    sg_dma_len(sg), dir, SYNC_FOR_CPU);
3889 }
3890
3891 static void
3892 bounce_sync_sg_for_device(struct device *dev, struct scatterlist *sglist,
3893                           int nelems, enum dma_data_direction dir)
3894 {
3895         struct scatterlist *sg;
3896         int i;
3897
3898         for_each_sg(sglist, sg, nelems, i)
3899                 bounce_sync_single(dev, sg_dma_address(sg),
3900                                    sg_dma_len(sg), dir, SYNC_FOR_DEVICE);
3901 }
3902
3903 static const struct dma_map_ops bounce_dma_ops = {
3904         .alloc                  = intel_alloc_coherent,
3905         .free                   = intel_free_coherent,
3906         .map_sg                 = bounce_map_sg,
3907         .unmap_sg               = bounce_unmap_sg,
3908         .map_page               = bounce_map_page,
3909         .unmap_page             = bounce_unmap_page,
3910         .sync_single_for_cpu    = bounce_sync_single_for_cpu,
3911         .sync_single_for_device = bounce_sync_single_for_device,
3912         .sync_sg_for_cpu        = bounce_sync_sg_for_cpu,
3913         .sync_sg_for_device     = bounce_sync_sg_for_device,
3914         .map_resource           = bounce_map_resource,
3915         .unmap_resource         = bounce_unmap_resource,
3916         .dma_supported          = dma_direct_supported,
3917 };
3918
3919 static inline int iommu_domain_cache_init(void)
3920 {
3921         int ret = 0;
3922
3923         iommu_domain_cache = kmem_cache_create("iommu_domain",
3924                                          sizeof(struct dmar_domain),
3925                                          0,
3926                                          SLAB_HWCACHE_ALIGN,
3928                                          NULL);
3929         if (!iommu_domain_cache) {
3930                 pr_err("Couldn't create iommu_domain cache\n");
3931                 ret = -ENOMEM;
3932         }
3933
3934         return ret;
3935 }
3936
3937 static inline int iommu_devinfo_cache_init(void)
3938 {
3939         int ret = 0;
3940
3941         iommu_devinfo_cache = kmem_cache_create("iommu_devinfo",
3942                                          sizeof(struct device_domain_info),
3943                                          0,
3944                                          SLAB_HWCACHE_ALIGN,
3945                                          NULL);
3946         if (!iommu_devinfo_cache) {
3947                 pr_err("Couldn't create devinfo cache\n");
3948                 ret = -ENOMEM;
3949         }
3950
3951         return ret;
3952 }
3953
3954 static int __init iommu_init_mempool(void)
3955 {
3956         int ret;
3957         ret = iova_cache_get();
3958         if (ret)
3959                 return ret;
3960
3961         ret = iommu_domain_cache_init();
3962         if (ret)
3963                 goto domain_error;
3964
3965         ret = iommu_devinfo_cache_init();
3966         if (!ret)
3967                 return ret;
3968
3969         kmem_cache_destroy(iommu_domain_cache);
3970 domain_error:
3971         iova_cache_put();
3972
3973         return -ENOMEM;
3974 }
3975
3976 static void __init iommu_exit_mempool(void)
3977 {
3978         kmem_cache_destroy(iommu_devinfo_cache);
3979         kmem_cache_destroy(iommu_domain_cache);
3980         iova_cache_put();
3981 }
3982
3983 static void quirk_ioat_snb_local_iommu(struct pci_dev *pdev)
3984 {
3985         struct dmar_drhd_unit *drhd;
3986         u32 vtbar;
3987         int rc;
3988
3989         /* We know that this device on this chipset has its own IOMMU.
3990          * If we find it under a different IOMMU, then the BIOS is lying
3991          * to us. Hope that the IOMMU for this device is actually
3992          * disabled, and it needs no translation...
3993          */
3994         rc = pci_bus_read_config_dword(pdev->bus, PCI_DEVFN(0, 0), 0xb0, &vtbar);
3995         if (rc) {
3996                 /* "can't" happen */
3997                 dev_info(&pdev->dev, "failed to run vt-d quirk\n");
3998                 return;
3999         }
4000         vtbar &= 0xffff0000;
4001
4002         /* we know that this iommu should be at offset 0xa000 from vtbar */
4003         drhd = dmar_find_matched_drhd_unit(pdev);
4004         if (!drhd || drhd->reg_base_addr - vtbar != 0xa000) {
4005                 pr_warn_once(FW_BUG "BIOS assigned incorrect VT-d unit for Intel(R) QuickData Technology device\n");
4006                 add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
4007                 pdev->dev.archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
4008         }
4009 }
4010 DECLARE_PCI_FIXUP_ENABLE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IOAT_SNB, quirk_ioat_snb_local_iommu);
4011
4012 static void __init init_no_remapping_devices(void)
4013 {
4014         struct dmar_drhd_unit *drhd;
4015         struct device *dev;
4016         int i;
4017
4018         for_each_drhd_unit(drhd) {
4019                 if (!drhd->include_all) {
4020                         for_each_active_dev_scope(drhd->devices,
4021                                                   drhd->devices_cnt, i, dev)
4022                                 break;
4023                         /* ignore DMAR unit if no devices exist */
4024                         if (i == drhd->devices_cnt)
4025                                 drhd->ignored = 1;
4026                 }
4027         }
4028
4029         for_each_active_drhd_unit(drhd) {
4030                 if (drhd->include_all)
4031                         continue;
4032
4033                 for_each_active_dev_scope(drhd->devices,
4034                                           drhd->devices_cnt, i, dev)
4035                         if (!dev_is_pci(dev) || !IS_GFX_DEVICE(to_pci_dev(dev)))
4036                                 break;
4037                 if (i < drhd->devices_cnt)
4038                         continue;
4039
4040                 /* This IOMMU has *only* gfx devices. If we are not
4041                    mapping gfx, bypass the unit entirely. */
4042                 if (!dmar_map_gfx) {
4043                         drhd->ignored = 1;
4044                         for_each_active_dev_scope(drhd->devices,
4045                                                   drhd->devices_cnt, i, dev)
4046                                 dev->archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
4047                 }
4048         }
4049 }
4050
4051 #ifdef CONFIG_SUSPEND
4052 static int init_iommu_hw(void)
4053 {
4054         struct dmar_drhd_unit *drhd;
4055         struct intel_iommu *iommu = NULL;
4056
4057         for_each_active_iommu(iommu, drhd)
4058                 if (iommu->qi)
4059                         dmar_reenable_qi(iommu);
4060
4061         for_each_iommu(iommu, drhd) {
4062                 if (drhd->ignored) {
4063                         /*
4064                          * we always have to disable PMRs or DMA may fail on
4065                          * this device
4066                          */
4067                         if (force_on)
4068                                 iommu_disable_protect_mem_regions(iommu);
4069                         continue;
4070                 }
4071
4072                 iommu_flush_write_buffer(iommu);
4073
4074                 iommu_set_root_entry(iommu);
4075
4076                 iommu->flush.flush_context(iommu, 0, 0, 0,
4077                                            DMA_CCMD_GLOBAL_INVL);
4078                 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
4079                 iommu_enable_translation(iommu);
4080                 iommu_disable_protect_mem_regions(iommu);
4081         }
4082
4083         return 0;
4084 }
4085
4086 static void iommu_flush_all(void)
4087 {
4088         struct dmar_drhd_unit *drhd;
4089         struct intel_iommu *iommu;
4090
4091         for_each_active_iommu(iommu, drhd) {
4092                 iommu->flush.flush_context(iommu, 0, 0, 0,
4093                                            DMA_CCMD_GLOBAL_INVL);
4094                 iommu->flush.flush_iotlb(iommu, 0, 0, 0,
4095                                          DMA_TLB_GLOBAL_FLUSH);
4096         }
4097 }
4098
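/*
 * Save the fault-event registers (FECTL/FEDATA/FEADDR/FEUADDR) and disable
 * translation across suspend; iommu_resume() restores the registers after
 * init_iommu_hw() has re-enabled the hardware.
 */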
4099 static int iommu_suspend(void)
4100 {
4101         struct dmar_drhd_unit *drhd;
4102         struct intel_iommu *iommu = NULL;
4103         unsigned long flag;
4104
4105         for_each_active_iommu(iommu, drhd) {
4106                 iommu->iommu_state = kcalloc(MAX_SR_DMAR_REGS, sizeof(u32),
4107                                                  GFP_ATOMIC);
4108                 if (!iommu->iommu_state)
4109                         goto nomem;
4110         }
4111
4112         iommu_flush_all();
4113
4114         for_each_active_iommu(iommu, drhd) {
4115                 iommu_disable_translation(iommu);
4116
4117                 raw_spin_lock_irqsave(&iommu->register_lock, flag);
4118
4119                 iommu->iommu_state[SR_DMAR_FECTL_REG] =
4120                         readl(iommu->reg + DMAR_FECTL_REG);
4121                 iommu->iommu_state[SR_DMAR_FEDATA_REG] =
4122                         readl(iommu->reg + DMAR_FEDATA_REG);
4123                 iommu->iommu_state[SR_DMAR_FEADDR_REG] =
4124                         readl(iommu->reg + DMAR_FEADDR_REG);
4125                 iommu->iommu_state[SR_DMAR_FEUADDR_REG] =
4126                         readl(iommu->reg + DMAR_FEUADDR_REG);
4127
4128                 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
4129         }
4130         return 0;
4131
4132 nomem:
4133         for_each_active_iommu(iommu, drhd)
4134                 kfree(iommu->iommu_state);
4135
4136         return -ENOMEM;
4137 }
4138
4139 static void iommu_resume(void)
4140 {
4141         struct dmar_drhd_unit *drhd;
4142         struct intel_iommu *iommu = NULL;
4143         unsigned long flag;
4144
4145         if (init_iommu_hw()) {
4146                 if (force_on)
4147                         panic("tboot: IOMMU setup failed, DMAR can not resume!\n");
4148                 else
4149                         WARN(1, "IOMMU setup failed, DMAR can not resume!\n");
4150                 return;
4151         }
4152
4153         for_each_active_iommu(iommu, drhd) {
4154
4155                 raw_spin_lock_irqsave(&iommu->register_lock, flag);
4156
4157                 writel(iommu->iommu_state[SR_DMAR_FECTL_REG],
4158                         iommu->reg + DMAR_FECTL_REG);
4159                 writel(iommu->iommu_state[SR_DMAR_FEDATA_REG],
4160                         iommu->reg + DMAR_FEDATA_REG);
4161                 writel(iommu->iommu_state[SR_DMAR_FEADDR_REG],
4162                         iommu->reg + DMAR_FEADDR_REG);
4163                 writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG],
4164                         iommu->reg + DMAR_FEUADDR_REG);
4165
4166                 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
4167         }
4168
4169         for_each_active_iommu(iommu, drhd)
4170                 kfree(iommu->iommu_state);
4171 }
4172
4173 static struct syscore_ops iommu_syscore_ops = {
4174         .resume         = iommu_resume,
4175         .suspend        = iommu_suspend,
4176 };
4177
4178 static void __init init_iommu_pm_ops(void)
4179 {
4180         register_syscore_ops(&iommu_syscore_ops);
4181 }
4182
4183 #else
4184 static inline void init_iommu_pm_ops(void) {}
4185 #endif  /* CONFIG_SUSPEND */
4186
4187 static int rmrr_sanity_check(struct acpi_dmar_reserved_memory *rmrr)
4188 {
4189         if (!IS_ALIGNED(rmrr->base_address, PAGE_SIZE) ||
4190             !IS_ALIGNED(rmrr->end_address + 1, PAGE_SIZE) ||
4191             rmrr->end_address <= rmrr->base_address ||
4192             arch_rmrr_sanity_check(rmrr))
4193                 return -EINVAL;
4194
4195         return 0;
4196 }
4197
4198 int __init dmar_parse_one_rmrr(struct acpi_dmar_header *header, void *arg)
4199 {
4200         struct acpi_dmar_reserved_memory *rmrr;
4201         struct dmar_rmrr_unit *rmrru;
4202
4203         rmrr = (struct acpi_dmar_reserved_memory *)header;
4204         if (rmrr_sanity_check(rmrr)) {
4205                 pr_warn(FW_BUG
4206                            "Your BIOS is broken; bad RMRR [%#018Lx-%#018Lx]\n"
4207                            "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
4208                            rmrr->base_address, rmrr->end_address,
4209                            dmi_get_system_info(DMI_BIOS_VENDOR),
4210                            dmi_get_system_info(DMI_BIOS_VERSION),
4211                            dmi_get_system_info(DMI_PRODUCT_VERSION));
4212                 add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
4213         }
4214
4215         rmrru = kzalloc(sizeof(*rmrru), GFP_KERNEL);
4216         if (!rmrru)
4217                 goto out;
4218
4219         rmrru->hdr = header;
4220
4221         rmrru->base_address = rmrr->base_address;
4222         rmrru->end_address = rmrr->end_address;
4223
4224         rmrru->devices = dmar_alloc_dev_scope((void *)(rmrr + 1),
4225                                 ((void *)rmrr) + rmrr->header.length,
4226                                 &rmrru->devices_cnt);
4227         if (rmrru->devices_cnt && rmrru->devices == NULL)
4228                 goto free_rmrru;
4229
4230         list_add(&rmrru->list, &dmar_rmrr_units);
4231
4232         return 0;
4233 free_rmrru:
4234         kfree(rmrru);
4235 out:
4236         return -ENOMEM;
4237 }
4238
4239 static struct dmar_atsr_unit *dmar_find_atsr(struct acpi_dmar_atsr *atsr)
4240 {
4241         struct dmar_atsr_unit *atsru;
4242         struct acpi_dmar_atsr *tmp;
4243
4244         list_for_each_entry_rcu(atsru, &dmar_atsr_units, list,
4245                                 dmar_rcu_check()) {
4246                 tmp = (struct acpi_dmar_atsr *)atsru->hdr;
4247                 if (atsr->segment != tmp->segment)
4248                         continue;
4249                 if (atsr->header.length != tmp->header.length)
4250                         continue;
4251                 if (memcmp(atsr, tmp, atsr->header.length) == 0)
4252                         return atsru;
4253         }
4254
4255         return NULL;
4256 }
4257
4258 int dmar_parse_one_atsr(struct acpi_dmar_header *hdr, void *arg)
4259 {
4260         struct acpi_dmar_atsr *atsr;
4261         struct dmar_atsr_unit *atsru;
4262
4263         if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled)
4264                 return 0;
4265
4266         atsr = container_of(hdr, struct acpi_dmar_atsr, header);
4267         atsru = dmar_find_atsr(atsr);
4268         if (atsru)
4269                 return 0;
4270
4271         atsru = kzalloc(sizeof(*atsru) + hdr->length, GFP_KERNEL);
4272         if (!atsru)
4273                 return -ENOMEM;
4274
4275         /*
4276          * If memory is allocated from slab by ACPI _DSM method, we need to
4277          * copy the memory content because the memory buffer will be freed
4278          * on return.
4279          */
4280         atsru->hdr = (void *)(atsru + 1);
4281         memcpy(atsru->hdr, hdr, hdr->length);
4282         atsru->include_all = atsr->flags & 0x1;
4283         if (!atsru->include_all) {
4284                 atsru->devices = dmar_alloc_dev_scope((void *)(atsr + 1),
4285                                 (void *)atsr + atsr->header.length,
4286                                 &atsru->devices_cnt);
4287                 if (atsru->devices_cnt && atsru->devices == NULL) {
4288                         kfree(atsru);
4289                         return -ENOMEM;
4290                 }
4291         }
4292
4293         list_add_rcu(&atsru->list, &dmar_atsr_units);
4294
4295         return 0;
4296 }
4297
4298 static void intel_iommu_free_atsr(struct dmar_atsr_unit *atsru)
4299 {
4300         dmar_free_dev_scope(&atsru->devices, &atsru->devices_cnt);
4301         kfree(atsru);
4302 }
4303
4304 int dmar_release_one_atsr(struct acpi_dmar_header *hdr, void *arg)
4305 {
4306         struct acpi_dmar_atsr *atsr;
4307         struct dmar_atsr_unit *atsru;
4308
4309         atsr = container_of(hdr, struct acpi_dmar_atsr, header);
4310         atsru = dmar_find_atsr(atsr);
4311         if (atsru) {
4312                 list_del_rcu(&atsru->list);
4313                 synchronize_rcu();
4314                 intel_iommu_free_atsr(atsru);
4315         }
4316
4317         return 0;
4318 }
4319
4320 int dmar_check_one_atsr(struct acpi_dmar_header *hdr, void *arg)
4321 {
4322         int i;
4323         struct device *dev;
4324         struct acpi_dmar_atsr *atsr;
4325         struct dmar_atsr_unit *atsru;
4326
4327         atsr = container_of(hdr, struct acpi_dmar_atsr, header);
4328         atsru = dmar_find_atsr(atsr);
4329         if (!atsru)
4330                 return 0;
4331
4332         if (!atsru->include_all && atsru->devices && atsru->devices_cnt) {
4333                 for_each_active_dev_scope(atsru->devices, atsru->devices_cnt,
4334                                           i, dev)
4335                         return -EBUSY;
4336         }
4337
4338         return 0;
4339 }
4340
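/*
 * Bring up a hot-added DMAR unit: verify it matches the capabilities the
 * running configuration relies on (pass-through, snooping, superpages),
 * then allocate its domains and root entry and enable QI, fault handling
 * and translation.
 */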
4341 static int intel_iommu_add(struct dmar_drhd_unit *dmaru)
4342 {
4343         int sp, ret;
4344         struct intel_iommu *iommu = dmaru->iommu;
4345
4346         if (g_iommus[iommu->seq_id])
4347                 return 0;
4348
4349         if (hw_pass_through && !ecap_pass_through(iommu->ecap)) {
4350                 pr_warn("%s: Doesn't support hardware pass through.\n",
4351                         iommu->name);
4352                 return -ENXIO;
4353         }
4354         if (!ecap_sc_support(iommu->ecap) &&
4355             domain_update_iommu_snooping(iommu)) {
4356                 pr_warn("%s: Doesn't support snooping.\n",
4357                         iommu->name);
4358                 return -ENXIO;
4359         }
4360         sp = domain_update_iommu_superpage(NULL, iommu) - 1;
4361         if (sp >= 0 && !(cap_super_page_val(iommu->cap) & (1 << sp))) {
4362                 pr_warn("%s: Doesn't support large page.\n",
4363                         iommu->name);
4364                 return -ENXIO;
4365         }
4366
4367         /*
4368          * Disable translation if already enabled prior to OS handover.
4369          */
4370         if (iommu->gcmd & DMA_GCMD_TE)
4371                 iommu_disable_translation(iommu);
4372
4373         g_iommus[iommu->seq_id] = iommu;
4374         ret = iommu_init_domains(iommu);
4375         if (ret == 0)
4376                 ret = iommu_alloc_root_entry(iommu);
4377         if (ret)
4378                 goto out;
4379
4380         intel_svm_check(iommu);
4381
4382         if (dmaru->ignored) {
4383                 /*
4384                  * we always have to disable PMRs or DMA may fail on this device
4385                  */
4386                 if (force_on)
4387                         iommu_disable_protect_mem_regions(iommu);
4388                 return 0;
4389         }
4390
4391         intel_iommu_init_qi(iommu);
4392         iommu_flush_write_buffer(iommu);
4393
4394 #ifdef CONFIG_INTEL_IOMMU_SVM
4395         if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
4396                 ret = intel_svm_enable_prq(iommu);
4397                 if (ret)
4398                         goto disable_iommu;
4399         }
4400 #endif
4401         ret = dmar_set_interrupt(iommu);
4402         if (ret)
4403                 goto disable_iommu;
4404
4405         iommu_set_root_entry(iommu);
4406         iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
4407         iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
4408         iommu_enable_translation(iommu);
4409
4410         iommu_disable_protect_mem_regions(iommu);
4411         return 0;
4412
4413 disable_iommu:
4414         disable_dmar_iommu(iommu);
4415 out:
4416         free_dmar_iommu(iommu);
4417         return ret;
4418 }
4419
4420 int dmar_iommu_hotplug(struct dmar_drhd_unit *dmaru, bool insert)
4421 {
4422         int ret = 0;
4423         struct intel_iommu *iommu = dmaru->iommu;
4424
4425         if (!intel_iommu_enabled)
4426                 return 0;
4427         if (iommu == NULL)
4428                 return -EINVAL;
4429
4430         if (insert) {
4431                 ret = intel_iommu_add(dmaru);
4432         } else {
4433                 disable_dmar_iommu(iommu);
4434                 free_dmar_iommu(iommu);
4435         }
4436
4437         return ret;
4438 }
4439
4440 static void intel_iommu_free_dmars(void)
4441 {
4442         struct dmar_rmrr_unit *rmrru, *rmrr_n;
4443         struct dmar_atsr_unit *atsru, *atsr_n;
4444
4445         list_for_each_entry_safe(rmrru, rmrr_n, &dmar_rmrr_units, list) {
4446                 list_del(&rmrru->list);
4447                 dmar_free_dev_scope(&rmrru->devices, &rmrru->devices_cnt);
4448                 kfree(rmrru);
4449         }
4450
4451         list_for_each_entry_safe(atsru, atsr_n, &dmar_atsr_units, list) {
4452                 list_del(&atsru->list);
4453                 intel_iommu_free_atsr(atsru);
4454         }
4455 }
4456
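/*
 * Return 1 if ATS is allowed for @dev: either it is a root-integrated
 * device, or its root port is covered by an ATSR unit; return 0 otherwise.
 */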
4457 int dmar_find_matched_atsr_unit(struct pci_dev *dev)
4458 {
4459         int i, ret = 1;
4460         struct pci_bus *bus;
4461         struct pci_dev *bridge = NULL;
4462         struct device *tmp;
4463         struct acpi_dmar_atsr *atsr;
4464         struct dmar_atsr_unit *atsru;
4465
4466         dev = pci_physfn(dev);
4467         for (bus = dev->bus; bus; bus = bus->parent) {
4468                 bridge = bus->self;
4469                 /* If it's an integrated device, allow ATS */
4470                 if (!bridge)
4471                         return 1;
4472                 /* Connected via non-PCIe: no ATS */
4473                 if (!pci_is_pcie(bridge) ||
4474                     pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE)
4475                         return 0;
4476                 /* If we found the root port, look it up in the ATSR */
4477                 if (pci_pcie_type(bridge) == PCI_EXP_TYPE_ROOT_PORT)
4478                         break;
4479         }
4480
4481         rcu_read_lock();
4482         list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) {
4483                 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
4484                 if (atsr->segment != pci_domain_nr(dev->bus))
4485                         continue;
4486
4487                 for_each_dev_scope(atsru->devices, atsru->devices_cnt, i, tmp)
4488                         if (tmp == &bridge->dev)
4489                                 goto out;
4490
4491                 if (atsru->include_all)
4492                         goto out;
4493         }
4494         ret = 0;
4495 out:
4496         rcu_read_unlock();
4497
4498         return ret;
4499 }
4500
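/*
 * Keep the cached RMRR and ATSR device scopes in sync when PCI devices are
 * hot-added to or removed from a DMAR-described segment.
 */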
4501 int dmar_iommu_notify_scope_dev(struct dmar_pci_notify_info *info)
4502 {
4503         int ret;
4504         struct dmar_rmrr_unit *rmrru;
4505         struct dmar_atsr_unit *atsru;
4506         struct acpi_dmar_atsr *atsr;
4507         struct acpi_dmar_reserved_memory *rmrr;
4508
4509         if (!intel_iommu_enabled && system_state >= SYSTEM_RUNNING)
4510                 return 0;
4511
4512         list_for_each_entry(rmrru, &dmar_rmrr_units, list) {
4513                 rmrr = container_of(rmrru->hdr,
4514                                     struct acpi_dmar_reserved_memory, header);
4515                 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
4516                         ret = dmar_insert_dev_scope(info, (void *)(rmrr + 1),
4517                                 ((void *)rmrr) + rmrr->header.length,
4518                                 rmrr->segment, rmrru->devices,
4519                                 rmrru->devices_cnt);
4520                         if (ret < 0)
4521                                 return ret;
4522                 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
4523                         dmar_remove_dev_scope(info, rmrr->segment,
4524                                 rmrru->devices, rmrru->devices_cnt);
4525                 }
4526         }
4527
4528         list_for_each_entry(atsru, &dmar_atsr_units, list) {
4529                 if (atsru->include_all)
4530                         continue;
4531
4532                 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
4533                 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
4534                         ret = dmar_insert_dev_scope(info, (void *)(atsr + 1),
4535                                         (void *)atsr + atsr->header.length,
4536                                         atsr->segment, atsru->devices,
4537                                         atsru->devices_cnt);
4538                         if (ret > 0)
4539                                 break;
4540                         else if (ret < 0)
4541                                 return ret;
4542                 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
4543                         if (dmar_remove_dev_scope(info, atsr->segment,
4544                                         atsru->devices, atsru->devices_cnt))
4545                                 break;
4546                 }
4547         }
4548
4549         return 0;
4550 }
4551
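/*
 * Memory hotplug notifier: extend the static identity mapping (si_domain)
 * when memory goes online, and unmap and flush it again when the memory is
 * offlined or the online operation is cancelled.
 */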
4552 static int intel_iommu_memory_notifier(struct notifier_block *nb,
4553                                        unsigned long val, void *v)
4554 {
4555         struct memory_notify *mhp = v;
4556         unsigned long start_vpfn = mm_to_dma_pfn(mhp->start_pfn);
4557         unsigned long last_vpfn = mm_to_dma_pfn(mhp->start_pfn +
4558                         mhp->nr_pages - 1);
4559
4560         switch (val) {
4561         case MEM_GOING_ONLINE:
4562                 if (iommu_domain_identity_map(si_domain,
4563                                               start_vpfn, last_vpfn)) {
4564                         pr_warn("Failed to build identity map for [%lx-%lx]\n",
4565                                 start_vpfn, last_vpfn);
4566                         return NOTIFY_BAD;
4567                 }
4568                 break;
4569
4570         case MEM_OFFLINE:
4571         case MEM_CANCEL_ONLINE:
4572                 {
4573                         struct dmar_drhd_unit *drhd;
4574                         struct intel_iommu *iommu;
4575                         struct page *freelist;
4576
4577                         freelist = domain_unmap(si_domain,
4578                                                 start_vpfn, last_vpfn);
4579
4580                         rcu_read_lock();
4581                         for_each_active_iommu(iommu, drhd)
4582                                 iommu_flush_iotlb_psi(iommu, si_domain,
4583                                         start_vpfn, mhp->nr_pages,
4584                                         !freelist, 0);
4585                         rcu_read_unlock();
4586                         dma_free_pagelist(freelist);
4587                 }
4588                 break;
4589         }
4590
4591         return NOTIFY_OK;
4592 }
4593
4594 static struct notifier_block intel_iommu_memory_nb = {
4595         .notifier_call = intel_iommu_memory_notifier,
4596         .priority = 0
4597 };
4598
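/* Drop the IOVAs cached on @cpu for every IOMMU_DOMAIN_DMA domain. */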
4599 static void free_all_cpu_cached_iovas(unsigned int cpu)
4600 {
4601         int i;
4602
4603         for (i = 0; i < g_num_of_iommus; i++) {
4604                 struct intel_iommu *iommu = g_iommus[i];
4605                 struct dmar_domain *domain;
4606                 int did;
4607
4608                 if (!iommu)
4609                         continue;
4610
4611                 for (did = 0; did < cap_ndoms(iommu->cap); did++) {
4612                         domain = get_iommu_domain(iommu, (u16)did);
4613
4614                         if (!domain || domain->domain.type != IOMMU_DOMAIN_DMA)
4615                                 continue;
4616
4617                         free_cpu_cached_iovas(cpu, &domain->iovad);
4618                 }
4619         }
4620 }
4621
4622 static int intel_iommu_cpu_dead(unsigned int cpu)
4623 {
4624         free_all_cpu_cached_iovas(cpu);
4625         return 0;
4626 }
4627
4628 static void intel_disable_iommus(void)
4629 {
4630         struct intel_iommu *iommu = NULL;
4631         struct dmar_drhd_unit *drhd;
4632
4633         for_each_iommu(iommu, drhd)
4634                 iommu_disable_translation(iommu);
4635 }
4636
4637 void intel_iommu_shutdown(void)
4638 {
4639         struct dmar_drhd_unit *drhd;
4640         struct intel_iommu *iommu = NULL;
4641
4642         if (no_iommu || dmar_disabled)
4643                 return;
4644
4645         down_write(&dmar_global_lock);
4646
4647         /* Disable PMRs explicitly here. */
4648         for_each_iommu(iommu, drhd)
4649                 iommu_disable_protect_mem_regions(iommu);
4650
4651         /* Make sure the IOMMUs are switched off */
4652         intel_disable_iommus();
4653
4654         up_write(&dmar_global_lock);
4655 }
4656
4657 static inline struct intel_iommu *dev_to_intel_iommu(struct device *dev)
4658 {
4659         struct iommu_device *iommu_dev = dev_to_iommu_device(dev);
4660
4661         return container_of(iommu_dev, struct intel_iommu, iommu);
4662 }
4663
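/*
 * sysfs attributes exported for each DMAR unit.  With sysfs mounted in the
 * usual place these typically appear as, for example,
 * /sys/class/iommu/dmar0/intel-iommu/{version,address,cap,ecap,...}
 * (illustrative path; the node name follows iommu->name).
 */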
4664 static ssize_t intel_iommu_show_version(struct device *dev,
4665                                         struct device_attribute *attr,
4666                                         char *buf)
4667 {
4668         struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4669         u32 ver = readl(iommu->reg + DMAR_VER_REG);
4670         return sprintf(buf, "%d:%d\n",
4671                        DMAR_VER_MAJOR(ver), DMAR_VER_MINOR(ver));
4672 }
4673 static DEVICE_ATTR(version, S_IRUGO, intel_iommu_show_version, NULL);
4674
4675 static ssize_t intel_iommu_show_address(struct device *dev,
4676                                         struct device_attribute *attr,
4677                                         char *buf)
4678 {
4679         struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4680         return sprintf(buf, "%llx\n", iommu->reg_phys);
4681 }
4682 static DEVICE_ATTR(address, S_IRUGO, intel_iommu_show_address, NULL);
4683
4684 static ssize_t intel_iommu_show_cap(struct device *dev,
4685                                     struct device_attribute *attr,
4686                                     char *buf)
4687 {
4688         struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4689         return sprintf(buf, "%llx\n", iommu->cap);
4690 }
4691 static DEVICE_ATTR(cap, S_IRUGO, intel_iommu_show_cap, NULL);
4692
4693 static ssize_t intel_iommu_show_ecap(struct device *dev,
4694                                     struct device_attribute *attr,
4695                                     char *buf)
4696 {
4697         struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4698         return sprintf(buf, "%llx\n", iommu->ecap);
4699 }
4700 static DEVICE_ATTR(ecap, S_IRUGO, intel_iommu_show_ecap, NULL);
4701
4702 static ssize_t intel_iommu_show_ndoms(struct device *dev,
4703                                       struct device_attribute *attr,
4704                                       char *buf)
4705 {
4706         struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4707         return sprintf(buf, "%ld\n", cap_ndoms(iommu->cap));
4708 }
4709 static DEVICE_ATTR(domains_supported, S_IRUGO, intel_iommu_show_ndoms, NULL);
4710
4711 static ssize_t intel_iommu_show_ndoms_used(struct device *dev,
4712                                            struct device_attribute *attr,
4713                                            char *buf)
4714 {
4715         struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4716         return sprintf(buf, "%d\n", bitmap_weight(iommu->domain_ids,
4717                                                   cap_ndoms(iommu->cap)));
4718 }
4719 static DEVICE_ATTR(domains_used, S_IRUGO, intel_iommu_show_ndoms_used, NULL);
4720
4721 static struct attribute *intel_iommu_attrs[] = {
4722         &dev_attr_version.attr,
4723         &dev_attr_address.attr,
4724         &dev_attr_cap.attr,
4725         &dev_attr_ecap.attr,
4726         &dev_attr_domains_supported.attr,
4727         &dev_attr_domains_used.attr,
4728         NULL,
4729 };
4730
4731 static struct attribute_group intel_iommu_group = {
4732         .name = "intel-iommu",
4733         .attrs = intel_iommu_attrs,
4734 };
4735
4736 const struct attribute_group *intel_iommu_groups[] = {
4737         &intel_iommu_group,
4738         NULL,
4739 };
4740
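/* Return true if any PCI device in the system is marked external-facing. */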
4741 static inline bool has_external_pci(void)
4742 {
4743         struct pci_dev *pdev = NULL;
4744
4745         for_each_pci_dev(pdev)
4746                 if (pdev->external_facing)
4747                         return true;
4748
4749         return false;
4750 }
4751
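/*
 * Honor the platform opt-in flag in the DMAR table: if the firmware requests
 * DMA protection and an external-facing PCI device is present, force the
 * IOMMU on even when it was disabled on the command line.
 */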
4752 static int __init platform_optin_force_iommu(void)
4753 {
4754         if (!dmar_platform_optin() || no_platform_optin || !has_external_pci())
4755                 return 0;
4756
4757         if (no_iommu || dmar_disabled)
4758                 pr_info("Intel-IOMMU force enabled due to platform opt in\n");
4759
4760         /*
4761          * If Intel-IOMMU is disabled by default, we will apply identity
4762          * map for all devices except those marked as being untrusted.
4763          */
4764         if (dmar_disabled)
4765                 iommu_set_default_passthrough(false);
4766
4767         dmar_disabled = 0;
4768         no_iommu = 0;
4769
4770         return 1;
4771 }
4772
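/*
 * Walk the ACPI namespace devices listed in the DRHD device scopes and probe
 * their physical companion devices that are not yet part of an IOMMU group.
 */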
4773 static int __init probe_acpi_namespace_devices(void)
4774 {
4775         struct dmar_drhd_unit *drhd;
4776         /* To avoid a -Wunused-but-set-variable warning. */
4777         struct intel_iommu *iommu __maybe_unused;
4778         struct device *dev;
4779         int i, ret = 0;
4780
4781         for_each_active_iommu(iommu, drhd) {
4782                 for_each_active_dev_scope(drhd->devices,
4783                                           drhd->devices_cnt, i, dev) {
4784                         struct acpi_device_physical_node *pn;
4785                         struct iommu_group *group;
4786                         struct acpi_device *adev;
4787
4788                         if (dev->bus != &acpi_bus_type)
4789                                 continue;
4790
4791                         adev = to_acpi_device(dev);
4792                         mutex_lock(&adev->physical_node_lock);
4793                         list_for_each_entry(pn,
4794                                             &adev->physical_node_list, node) {
4795                                 group = iommu_group_get(pn->dev);
4796                                 if (group) {
4797                                         iommu_group_put(group);
4798                                         continue;
4799                                 }
4800
4801                                 pn->dev->bus->iommu_ops = &intel_iommu_ops;
4802                                 ret = iommu_probe_device(pn->dev);
4803                                 if (ret)
4804                                         break;
4805                         }
4806                         mutex_unlock(&adev->physical_node_lock);
4807
4808                         if (ret)
4809                                 return ret;
4810                 }
4811         }
4812
4813         return 0;
4814 }
4815
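/*
 * Main VT-d entry point: parse the DMAR table, initialize each IOMMU unit
 * via init_dmars(), register the sysfs and IOMMU-core hooks, and finally
 * enable DMA remapping.  Behaviour can be influenced from the kernel command
 * line (for example "intel_iommu=off" sets dmar_disabled), subject to tboot
 * or platform opt-in forcing the IOMMU back on.
 */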
4816 int __init intel_iommu_init(void)
4817 {
4818         int ret = -ENODEV;
4819         struct dmar_drhd_unit *drhd;
4820         struct intel_iommu *iommu;
4821
4822         /*
4823          * Intel IOMMU is required for a TXT/tboot launch or platform
4824          * opt in, so enforce that.
4825          */
4826         force_on = tboot_force_iommu() || platform_optin_force_iommu();
4827
4828         if (iommu_init_mempool()) {
4829                 if (force_on)
4830                         panic("tboot: Failed to initialize iommu memory\n");
4831                 return -ENOMEM;
4832         }
4833
4834         down_write(&dmar_global_lock);
4835         if (dmar_table_init()) {
4836                 if (force_on)
4837                         panic("tboot: Failed to initialize DMAR table\n");
4838                 goto out_free_dmar;
4839         }
4840
4841         if (dmar_dev_scope_init() < 0) {
4842                 if (force_on)
4843                         panic("tboot: Failed to initialize DMAR device scope\n");
4844                 goto out_free_dmar;
4845         }
4846
4847         up_write(&dmar_global_lock);
4848
4849         /*
4850          * The bus notifier takes the dmar_global_lock, so lockdep would
4851          * complain if we registered it while still holding the lock.
4852          */
4853         dmar_register_bus_notifier();
4854
4855         down_write(&dmar_global_lock);
4856
4857         if (!no_iommu)
4858                 intel_iommu_debugfs_init();
4859
4860         if (no_iommu || dmar_disabled) {
4861                 /*
4862                  * We exit the function here to ensure the IOMMU's remapping and
4863                  * mempool aren't set up, which means that the IOMMU's PMRs
4864                  * won't be disabled via the call to init_dmars(). So disable
4865                  * them explicitly here. The PMRs were set up by tboot prior to
4866                  * calling SENTER, but the kernel is expected to reset/tear
4867                  * down the PMRs.
4868                  */
4869                 if (intel_iommu_tboot_noforce) {
4870                         for_each_iommu(iommu, drhd)
4871                                 iommu_disable_protect_mem_regions(iommu);
4872                 }
4873
4874                 /*
4875                  * Make sure the IOMMUs are switched off, even when we
4876                  * boot into a kexec kernel and the previous kernel left
4877                  * them enabled
4878                  */
4879                 intel_disable_iommus();
4880                 goto out_free_dmar;
4881         }
4882
4883         if (list_empty(&dmar_rmrr_units))
4884                 pr_info("No RMRR found\n");
4885
4886         if (list_empty(&dmar_atsr_units))
4887                 pr_info("No ATSR found\n");
4888
4889         if (dmar_init_reserved_ranges()) {
4890                 if (force_on)
4891                         panic("tboot: Failed to reserve iommu ranges\n");
4892                 goto out_free_reserved_range;
4893         }
4894
4895         if (dmar_map_gfx)
4896                 intel_iommu_gfx_mapped = 1;
4897
4898         init_no_remapping_devices();
4899
4900         ret = init_dmars();
4901         if (ret) {
4902                 if (force_on)
4903                         panic("tboot: Failed to initialize DMARs\n");
4904                 pr_err("Initialization failed\n");
4905                 goto out_free_reserved_range;
4906         }
4907         up_write(&dmar_global_lock);
4908
4909         init_iommu_pm_ops();
4910
4911         down_read(&dmar_global_lock);
4912         for_each_active_iommu(iommu, drhd) {
4913                 iommu_device_sysfs_add(&iommu->iommu, NULL,
4914                                        intel_iommu_groups,
4915                                        "%s", iommu->name);
4916                 iommu_device_set_ops(&iommu->iommu, &intel_iommu_ops);
4917                 iommu_device_register(&iommu->iommu);
4918         }
4919         up_read(&dmar_global_lock);
4920
4921         bus_set_iommu(&pci_bus_type, &intel_iommu_ops);
4922         if (si_domain && !hw_pass_through)
4923                 register_memory_notifier(&intel_iommu_memory_nb);
4924         cpuhp_setup_state(CPUHP_IOMMU_INTEL_DEAD, "iommu/intel:dead", NULL,
4925                           intel_iommu_cpu_dead);
4926
4927         down_read(&dmar_global_lock);
4928         if (probe_acpi_namespace_devices())
4929                 pr_warn("ACPI name space devices didn't probe correctly\n");
4930
4931         /* Finally, we enable the DMA remapping hardware. */
4932         for_each_iommu(iommu, drhd) {
4933                 if (!drhd->ignored && !translation_pre_enabled(iommu))
4934                         iommu_enable_translation(iommu);
4935
4936                 iommu_disable_protect_mem_regions(iommu);
4937         }
4938         up_read(&dmar_global_lock);
4939
4940         pr_info("Intel(R) Virtualization Technology for Directed I/O\n");
4941
4942         intel_iommu_enabled = 1;
4943
4944         return 0;
4945
4946 out_free_reserved_range:
4947         put_iova_domain(&reserved_iova_list);
4948 out_free_dmar:
4949         intel_iommu_free_dmars();
4950         up_write(&dmar_global_lock);
4951         iommu_exit_mempool();
4952         return ret;
4953 }
4954
4955 static int domain_context_clear_one_cb(struct pci_dev *pdev, u16 alias, void *opaque)
4956 {
4957         struct intel_iommu *iommu = opaque;
4958
4959         domain_context_clear_one(iommu, PCI_BUS_NUM(alias), alias & 0xff);
4960         return 0;
4961 }
4962
4963 /*
4964  * NB - intel-iommu lacks any sort of reference counting for the users of
4965  * dependent devices.  If multiple endpoints have intersecting dependent
4966  * devices, unbinding the driver from any one of them will possibly leave
4967  * the others unable to operate.
4968  */
4969 static void domain_context_clear(struct intel_iommu *iommu, struct device *dev)
4970 {
4971         if (!iommu || !dev || !dev_is_pci(dev))
4972                 return;
4973
4974         pci_for_each_dma_alias(to_pci_dev(dev), &domain_context_clear_one_cb, iommu);
4975 }
4976
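/*
 * Tear down everything that binds @info's device to its domain: the PASID
 * table and device-TLB state, the context entry, and finally the
 * device_domain_info itself.  The caller must hold device_domain_lock.
 */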
4977 static void __dmar_remove_one_dev_info(struct device_domain_info *info)
4978 {
4979         struct dmar_domain *domain;
4980         struct intel_iommu *iommu;
4981         unsigned long flags;
4982
4983         assert_spin_locked(&device_domain_lock);
4984
4985         if (WARN_ON(!info))
4986                 return;
4987
4988         iommu = info->iommu;
4989         domain = info->domain;
4990
4991         if (info->dev) {
4992                 if (dev_is_pci(info->dev) && sm_supported(iommu))
4993                         intel_pasid_tear_down_entry(iommu, info->dev,
4994                                         PASID_RID2PASID, false);
4995
4996                 iommu_disable_dev_iotlb(info);
4997                 if (!dev_is_real_dma_subdevice(info->dev))
4998                         domain_context_clear(iommu, info->dev);
4999                 intel_pasid_free_table(info->dev);
5000         }
5001
5002         unlink_domain_info(info);
5003
5004         spin_lock_irqsave(&iommu->lock, flags);
5005         domain_detach_iommu(domain, iommu);
5006         spin_unlock_irqrestore(&iommu->lock, flags);
5007
5008         free_devinfo_mem(info);
5009 }
5010
5011 static void dmar_remove_one_dev_info(struct device *dev)
5012 {
5013         struct device_domain_info *info;
5014         unsigned long flags;
5015
5016         spin_lock_irqsave(&device_domain_lock, flags);
5017         info = get_domain_info(dev);
5018         if (info)
5019                 __dmar_remove_one_dev_info(info);
5020         spin_unlock_irqrestore(&device_domain_lock, flags);
5021 }
5022
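/*
 * Minimal setup for domains allocated through the IOMMU API: compute the
 * AGAW for the requested guest address width and allocate the top-level
 * page directory.
 */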
5023 static int md_domain_init(struct dmar_domain *domain, int guest_width)
5024 {
5025         int adjust_width;
5026
5027         /* calculate AGAW */
5028         domain->gaw = guest_width;
5029         adjust_width = guestwidth_to_adjustwidth(guest_width);
5030         domain->agaw = width_to_agaw(adjust_width);
5031
5032         domain->iommu_coherency = 0;
5033         domain->iommu_snooping = 0;
5034         domain->iommu_superpage = 0;
5035         domain->max_addr = 0;
5036
5037         /* always allocate the top pgd */
5038         domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
5039         if (!domain->pgd)
5040                 return -ENOMEM;
5041         domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
5042         return 0;
5043 }
5044
5045 static void intel_init_iova_domain(struct dmar_domain *dmar_domain)
5046 {
5047         init_iova_domain(&dmar_domain->iovad, VTD_PAGE_SIZE, IOVA_START_PFN);
5048         copy_reserved_iova(&reserved_iova_list, &dmar_domain->iovad);
5049
5050         if (!intel_iommu_strict &&
5051             init_iova_flush_queue(&dmar_domain->iovad,
5052                                   iommu_flush_iova, iova_entry_free))
5053                 pr_info("iova flush queue initialization failed\n");
5054 }
5055
5056 static struct iommu_domain *intel_iommu_domain_alloc(unsigned type)
5057 {
5058         struct dmar_domain *dmar_domain;
5059         struct iommu_domain *domain;
5060
5061         switch (type) {
5062         case IOMMU_DOMAIN_DMA:
5063         /* fallthrough */
5064         case IOMMU_DOMAIN_UNMANAGED:
5065                 dmar_domain = alloc_domain(0);
5066                 if (!dmar_domain) {
5067                         pr_err("Can't allocate dmar_domain\n");
5068                         return NULL;
5069                 }
5070                 if (md_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
5071                         pr_err("Domain initialization failed\n");
5072                         domain_exit(dmar_domain);
5073                         return NULL;
5074                 }
5075
5076                 if (type == IOMMU_DOMAIN_DMA)
5077                         intel_init_iova_domain(dmar_domain);
5078
5079                 domain_update_iommu_cap(dmar_domain);
5080
5081                 domain = &dmar_domain->domain;
5082                 domain->geometry.aperture_start = 0;
5083                 domain->geometry.aperture_end   =
5084                                 __DOMAIN_MAX_ADDR(dmar_domain->gaw);
5085                 domain->geometry.force_aperture = true;
5086
5087                 return domain;
5088         case IOMMU_DOMAIN_IDENTITY:
5089                 return &si_domain->domain;
5090         default:
5091                 return NULL;
5092         }
5093
5094         return NULL;
5095 }
5096
5097 static void intel_iommu_domain_free(struct iommu_domain *domain)
5098 {
5099         if (domain != &si_domain->domain)
5100                 domain_exit(to_dmar_domain(domain));
5101 }
5102
5103 /*
5104  * Check whether a @domain could be attached to the @dev through the
5105  * aux-domain attach/detach APIs.
5106  */
5107 static inline bool
5108 is_aux_domain(struct device *dev, struct iommu_domain *domain)
5109 {
5110         struct device_domain_info *info = get_domain_info(dev);
5111
5112         return info && info->auxd_enabled &&
5113                         domain->type == IOMMU_DOMAIN_UNMANAGED;
5114 }
5115
5116 static void auxiliary_link_device(struct dmar_domain *domain,
5117                                   struct device *dev)
5118 {
5119         struct device_domain_info *info = get_domain_info(dev);
5120
5121         assert_spin_locked(&device_domain_lock);
5122         if (WARN_ON(!info))
5123                 return;
5124
5125         domain->auxd_refcnt++;
5126         list_add(&domain->auxd, &info->auxiliary_domains);
5127 }
5128
5129 static void auxiliary_unlink_device(struct dmar_domain *domain,
5130                                     struct device *dev)
5131 {
5132         struct device_domain_info *info = get_domain_info(dev);
5133
5134         assert_spin_locked(&device_domain_lock);
5135         if (WARN_ON(!info))
5136                 return;
5137
5138         list_del(&domain->auxd);
5139         domain->auxd_refcnt--;
5140
5141         if (!domain->auxd_refcnt && domain->default_pasid > 0)
5142                 ioasid_free(domain->default_pasid);
5143 }
5144
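/*
 * Attach @domain to @dev as an auxiliary domain: allocate the domain's
 * default PASID on first use, install a first- or second-level PASID table
 * entry for it, and link the domain into the device's auxiliary list.
 */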
5145 static int aux_domain_add_dev(struct dmar_domain *domain,
5146                               struct device *dev)
5147 {
5148         int ret;
5149         u8 bus, devfn;
5150         unsigned long flags;
5151         struct intel_iommu *iommu;
5152
5153         iommu = device_to_iommu(dev, &bus, &devfn);
5154         if (!iommu)
5155                 return -ENODEV;
5156
5157         if (domain->default_pasid <= 0) {
5158                 int pasid;
5159
5160                 /* No private data needed for the default pasid */
5161                 pasid = ioasid_alloc(NULL, PASID_MIN,
5162                                      pci_max_pasids(to_pci_dev(dev)) - 1,
5163                                      NULL);
5164                 if (pasid == INVALID_IOASID) {
5165                         pr_err("Can't allocate default pasid\n");
5166                         return -ENODEV;
5167                 }
5168                 domain->default_pasid = pasid;
5169         }
5170
5171         spin_lock_irqsave(&device_domain_lock, flags);
5172         /*
5173          * iommu->lock must be held to attach the domain to the iommu and to
5174          * set up the pasid entry for second-level translation.
5175          */
5176         spin_lock(&iommu->lock);
5177         ret = domain_attach_iommu(domain, iommu);
5178         if (ret)
5179                 goto attach_failed;
5180
5181         /* Set up the PASID entry for mediated devices: */
5182         if (domain_use_first_level(domain))
5183                 ret = domain_setup_first_level(iommu, domain, dev,
5184                                                domain->default_pasid);
5185         else
5186                 ret = intel_pasid_setup_second_level(iommu, domain, dev,
5187                                                      domain->default_pasid);
5188         if (ret)
5189                 goto table_failed;
5190         spin_unlock(&iommu->lock);
5191
5192         auxiliary_link_device(domain, dev);
5193
5194         spin_unlock_irqrestore(&device_domain_lock, flags);
5195
5196         return 0;
5197
5198 table_failed:
5199         domain_detach_iommu(domain, iommu);
5200 attach_failed:
5201         spin_unlock(&iommu->lock);
5202         spin_unlock_irqrestore(&device_domain_lock, flags);
5203         if (!domain->auxd_refcnt && domain->default_pasid > 0)
5204                 ioasid_free(domain->default_pasid);
5205
5206         return ret;
5207 }
5208
5209 static void aux_domain_remove_dev(struct dmar_domain *domain,
5210                                   struct device *dev)
5211 {
5212         struct device_domain_info *info;
5213         struct intel_iommu *iommu;
5214         unsigned long flags;
5215
5216         if (!is_aux_domain(dev, &domain->domain))
5217                 return;
5218
5219         spin_lock_irqsave(&device_domain_lock, flags);
5220         info = get_domain_info(dev);
5221         iommu = info->iommu;
5222
5223         auxiliary_unlink_device(domain, dev);
5224
5225         spin_lock(&iommu->lock);
5226         intel_pasid_tear_down_entry(iommu, dev, domain->default_pasid, false);
5227         domain_detach_iommu(domain, iommu);
5228         spin_unlock(&iommu->lock);
5229
5230         spin_unlock_irqrestore(&device_domain_lock, flags);
5231 }
5232
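/*
 * Common checks before attaching @domain to @dev: verify that the IOMMU's
 * address width covers everything already mapped in the domain, then trim
 * the domain's page-table depth to match the IOMMU's AGAW.
 */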
5233 static int prepare_domain_attach_device(struct iommu_domain *domain,
5234                                         struct device *dev)
5235 {
5236         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5237         struct intel_iommu *iommu;
5238         int addr_width;
5239         u8 bus, devfn;
5240
5241         iommu = device_to_iommu(dev, &bus, &devfn);
5242         if (!iommu)
5243                 return -ENODEV;
5244
5245         /* check if this iommu agaw is sufficient for max mapped address */
5246         addr_width = agaw_to_width(iommu->agaw);
5247         if (addr_width > cap_mgaw(iommu->cap))
5248                 addr_width = cap_mgaw(iommu->cap);
5249
5250         if (dmar_domain->max_addr > (1LL << addr_width)) {
5251                 dev_err(dev,
5252                         "%s: iommu width (%d) is not sufficient for the mapped address (%llx)\n",
5253                         __func__, addr_width, dmar_domain->max_addr);
5254                 return -EFAULT;
5255         }
5256         dmar_domain->gaw = addr_width;
5257
5258         /*
5259          * Knock out extra levels of page tables if necessary
5260          */
5261         while (iommu->agaw < dmar_domain->agaw) {
5262                 struct dma_pte *pte;
5263
5264                 pte = dmar_domain->pgd;
5265                 if (dma_pte_present(pte)) {
5266                         dmar_domain->pgd = (struct dma_pte *)
5267                                 phys_to_virt(dma_pte_addr(pte));
5268                         free_pgtable_page(pte);
5269                 }
5270                 dmar_domain->agaw--;
5271         }
5272
5273         return 0;
5274 }
5275
5276 static int intel_iommu_attach_device(struct iommu_domain *domain,
5277                                      struct device *dev)
5278 {
5279         int ret;
5280
5281         if (domain->type == IOMMU_DOMAIN_UNMANAGED &&
5282             device_is_rmrr_locked(dev)) {
5283                 dev_warn(dev, "Device is ineligible for IOMMU domain attach due to platform RMRR requirement.  Contact your platform vendor.\n");
5284                 return -EPERM;
5285         }
5286
5287         if (is_aux_domain(dev, domain))
5288                 return -EPERM;
5289
5290         /* normally dev is not mapped */
5291         if (unlikely(domain_context_mapped(dev))) {
5292                 struct dmar_domain *old_domain;
5293
5294                 old_domain = find_domain(dev);
5295                 if (old_domain)
5296                         dmar_remove_one_dev_info(dev);
5297         }
5298
5299         ret = prepare_domain_attach_device(domain, dev);
5300         if (ret)
5301                 return ret;
5302
5303         return domain_add_dev_info(to_dmar_domain(domain), dev);
5304 }
5305
5306 static int intel_iommu_aux_attach_device(struct iommu_domain *domain,
5307                                          struct device *dev)
5308 {
5309         int ret;
5310
5311         if (!is_aux_domain(dev, domain))
5312                 return -EPERM;
5313
5314         ret = prepare_domain_attach_device(domain, dev);
5315         if (ret)
5316                 return ret;
5317
5318         return aux_domain_add_dev(to_dmar_domain(domain), dev);
5319 }
5320
5321 static void intel_iommu_detach_device(struct iommu_domain *domain,
5322                                       struct device *dev)
5323 {
5324         dmar_remove_one_dev_info(dev);
5325 }
5326
5327 static void intel_iommu_aux_detach_device(struct iommu_domain *domain,
5328                                           struct device *dev)
5329 {
5330         aux_domain_remove_dev(to_dmar_domain(domain), dev);
5331 }
5332
5333 /*
5334  * 2D array for converting and sanitizing IOMMU generic TLB granularity to
5335  * VT-d granularity. Invalidation is typically included in the unmap operation
5336  * as a result of DMA or VFIO unmap. However, for assigned devices guest
5337  * owns the first level page tables. Invalidations of translation caches in the
5338  * guest are trapped and passed down to the host.
5339  *
5340  * vIOMMU in the guest will only expose first level page tables, therefore
5341  * we do not support IOTLB granularity for requests without PASID (second level).
5342  *
5343  * For example, to find the VT-d granularity encoding for IOTLB
5344  * type and page selective granularity within PASID:
5345  * X: indexed by iommu cache type
5346  * Y: indexed by enum iommu_inv_granularity
5347  * [IOMMU_CACHE_INV_TYPE_IOTLB][IOMMU_INV_GRANU_ADDR]
5348  */
5349
5350 static const int
5351 inv_type_granu_table[IOMMU_CACHE_INV_TYPE_NR][IOMMU_INV_GRANU_NR] = {
5352         /*
5353          * PASID based IOTLB invalidation: PASID selective (per PASID),
5354          * page selective (address granularity)
5355          */
5356         {-EINVAL, QI_GRAN_NONG_PASID, QI_GRAN_PSI_PASID},
5357         /* PASID based dev TLBs */
5358         {-EINVAL, -EINVAL, QI_DEV_IOTLB_GRAN_PASID_SEL},
5359         /* PASID cache */
5360         {-EINVAL, -EINVAL, -EINVAL}
5361 };
5362
5363 static inline int to_vtd_granularity(int type, int granu)
5364 {
5365         return inv_type_granu_table[type][granu];
5366 }
5367
5368 static inline u64 to_vtd_size(u64 granu_size, u64 nr_granules)
5369 {
5370         u64 nr_pages = (granu_size * nr_granules) >> VTD_PAGE_SHIFT;
5371
5372         /* VT-d size is encoded as 2^size of 4KiB pages: 0 for 4KiB, 9 for 2MiB, etc.
5373          * The IOMMU cache invalidate API passes granu_size in bytes and the
5374          * number of granules of contiguous memory.
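          * For example, granu_size = 4KiB with nr_granules = 512 describes 2MiB
          * of contiguous memory and yields order_base_2(512) = 9.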
5375          */
5376         return order_base_2(nr_pages);
5377 }
5378
5379 #ifdef CONFIG_INTEL_IOMMU_SVM
5380 static int
5381 intel_iommu_sva_invalidate(struct iommu_domain *domain, struct device *dev,
5382                            struct iommu_cache_invalidate_info *inv_info)
5383 {
5384         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5385         struct device_domain_info *info;
5386         struct intel_iommu *iommu;
5387         unsigned long flags;
5388         int cache_type;
5389         u8 bus, devfn;
5390         u16 did, sid;
5391         int ret = 0;
5392         u64 size = 0;
5393
5394         if (!inv_info || !dmar_domain ||
5395             inv_info->version != IOMMU_CACHE_INVALIDATE_INFO_VERSION_1)
5396                 return -EINVAL;
5397
5398         if (!dev || !dev_is_pci(dev))
5399                 return -ENODEV;
5400
5401         iommu = device_to_iommu(dev, &bus, &devfn);
5402         if (!iommu)
5403                 return -ENODEV;
5404
5405         if (!(dmar_domain->flags & DOMAIN_FLAG_NESTING_MODE))
5406                 return -EINVAL;
5407
5408         spin_lock_irqsave(&device_domain_lock, flags);
5409         spin_lock(&iommu->lock);
5410         info = get_domain_info(dev);
5411         if (!info) {
5412                 ret = -EINVAL;
5413                 goto out_unlock;
5414         }
5415         did = dmar_domain->iommu_did[iommu->seq_id];
5416         sid = PCI_DEVID(bus, devfn);
5417
5418         /* Size is only valid in address selective invalidation */
5419         if (inv_info->granularity != IOMMU_INV_GRANU_PASID)
5420                 size = to_vtd_size(inv_info->addr_info.granule_size,
5421                                    inv_info->addr_info.nb_granules);
5422
5423         for_each_set_bit(cache_type,
5424                          (unsigned long *)&inv_info->cache,
5425                          IOMMU_CACHE_INV_TYPE_NR) {
5426                 int granu = 0;
5427                 u64 pasid = 0;
5428
5429                 granu = to_vtd_granularity(cache_type, inv_info->granularity);
5430                 if (granu == -EINVAL) {
5431                         pr_err_ratelimited("Invalid cache type and granu combination %d/%d\n",
5432                                            cache_type, inv_info->granularity);
5433                         break;
5434                 }
5435
5436                 /*
5437                  * PASID is stored in different locations based on the
5438                  * granularity.
5439                  */
5440                 if (inv_info->granularity == IOMMU_INV_GRANU_PASID &&
5441                     (inv_info->pasid_info.flags & IOMMU_INV_PASID_FLAGS_PASID))
5442                         pasid = inv_info->pasid_info.pasid;
5443                 else if (inv_info->granularity == IOMMU_INV_GRANU_ADDR &&
5444                          (inv_info->addr_info.flags & IOMMU_INV_ADDR_FLAGS_PASID))
5445                         pasid = inv_info->addr_info.pasid;
5446
5447                 switch (BIT(cache_type)) {
5448                 case IOMMU_CACHE_INV_TYPE_IOTLB:
5449                         if (inv_info->granularity == IOMMU_INV_GRANU_ADDR &&
5450                             size &&
5451                             (inv_info->addr_info.addr & ((BIT(VTD_PAGE_SHIFT + size)) - 1))) {
5452                                 pr_err_ratelimited("Address out of range, 0x%llx, size order %llu\n",
5453                                                    inv_info->addr_info.addr, size);
5454                                 ret = -ERANGE;
5455                                 goto out_unlock;
5456                         }
5457
5458                         /*
5459                          * If granu is PASID-selective, address is ignored.
5460                          * We use npages = -1 to indicate that.
5461                          */
5462                         qi_flush_piotlb(iommu, did, pasid,
5463                                         mm_to_dma_pfn(inv_info->addr_info.addr),
5464                                         (granu == QI_GRAN_NONG_PASID) ? -1 : 1 << size,
5465                                         inv_info->addr_info.flags & IOMMU_INV_ADDR_FLAGS_LEAF);
5466
5467                         /*
5468                          * Always flush device IOTLB if ATS is enabled. vIOMMU
5469                          * in the guest may assume IOTLB flush is inclusive,
5470                          * which is more efficient.
5471                          */
5472                         if (info->ats_enabled)
5473                                 qi_flush_dev_iotlb_pasid(iommu, sid,
5474                                                 info->pfsid, pasid,
5475                                                 info->ats_qdep,
5476                                                 inv_info->addr_info.addr,
5477                                                 size, granu);
5478                         break;
5479                 case IOMMU_CACHE_INV_TYPE_DEV_IOTLB:
5480                         if (info->ats_enabled)
5481                                 qi_flush_dev_iotlb_pasid(iommu, sid,
5482                                                 info->pfsid, pasid,
5483                                                 info->ats_qdep,
5484                                                 inv_info->addr_info.addr,
5485                                                 size, granu);
5486                         else
5487                                 pr_warn_ratelimited("Passdown device IOTLB flush w/o ATS!\n");
5488                         break;
5489                 default:
5490                         dev_err_ratelimited(dev, "Unsupported IOMMU invalidation type %d\n",
5491                                             cache_type);
5492                         ret = -EINVAL;
5493                 }
5494         }
5495 out_unlock:
5496         spin_unlock(&iommu->lock);
5497         spin_unlock_irqrestore(&device_domain_lock, flags);
5498
5499         return ret;
5500 }
5501 #endif
5502
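/*
 * Map [iova, iova + size) to @hpa in the domain's page tables, growing
 * max_addr as needed and rejecting mappings that exceed the domain's
 * address width.
 */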
5503 static int intel_iommu_map(struct iommu_domain *domain,
5504                            unsigned long iova, phys_addr_t hpa,
5505                            size_t size, int iommu_prot, gfp_t gfp)
5506 {
5507         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5508         u64 max_addr;
5509         int prot = 0;
5510         int ret;
5511
5512         if (iommu_prot & IOMMU_READ)
5513                 prot |= DMA_PTE_READ;
5514         if (iommu_prot & IOMMU_WRITE)
5515                 prot |= DMA_PTE_WRITE;
5516         if ((iommu_prot & IOMMU_CACHE) && dmar_domain->iommu_snooping)
5517                 prot |= DMA_PTE_SNP;
5518
5519         max_addr = iova + size;
5520         if (dmar_domain->max_addr < max_addr) {
5521                 u64 end;
5522
5523                 /* check if minimum agaw is sufficient for mapped address */
5524                 end = __DOMAIN_MAX_ADDR(dmar_domain->gaw) + 1;
5525                 if (end < max_addr) {
5526                         pr_err("%s: iommu width (%d) is not sufficient for the mapped address (%llx)\n",
5527                                __func__, dmar_domain->gaw,
5528                                max_addr);
5529                         return -EFAULT;
5530                 }
5531                 dmar_domain->max_addr = max_addr;
5532         }
5533         /* Round up size to next multiple of PAGE_SIZE, if it and
5534            the low bits of hpa would take us onto the next page */
5535         size = aligned_nrpages(hpa, size);
5536         ret = domain_pfn_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT,
5537                                  hpa >> VTD_PAGE_SHIFT, size, prot);
5538         return ret;
5539 }
5540
5541 static size_t intel_iommu_unmap(struct iommu_domain *domain,
5542                                 unsigned long iova, size_t size,
5543                                 struct iommu_iotlb_gather *gather)
5544 {
5545         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5546         struct page *freelist = NULL;
5547         unsigned long start_pfn, last_pfn;
5548         unsigned int npages;
5549         int iommu_id, level = 0;
5550
5551         /* Cope with horrid API which requires us to unmap more than the
5552            size argument if it happens to be a large-page mapping. */
5553         BUG_ON(!pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level));
5554
5555         if (size < VTD_PAGE_SIZE << level_to_offset_bits(level))
5556                 size = VTD_PAGE_SIZE << level_to_offset_bits(level);
5557
5558         start_pfn = iova >> VTD_PAGE_SHIFT;
5559         last_pfn = (iova + size - 1) >> VTD_PAGE_SHIFT;
5560
5561         freelist = domain_unmap(dmar_domain, start_pfn, last_pfn);
5562
5563         npages = last_pfn - start_pfn + 1;
5564
5565         for_each_domain_iommu(iommu_id, dmar_domain)
5566                 iommu_flush_iotlb_psi(g_iommus[iommu_id], dmar_domain,
5567                                       start_pfn, npages, !freelist, 0);
5568
5569         dma_free_pagelist(freelist);
5570
5571         if (dmar_domain->max_addr == iova + size)
5572                 dmar_domain->max_addr = iova;
5573
5574         return size;
5575 }
5576
5577 static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
5578                                             dma_addr_t iova)
5579 {
5580         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5581         struct dma_pte *pte;
5582         int level = 0;
5583         u64 phys = 0;
5584
5585         pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level);
5586         if (pte && dma_pte_present(pte))
5587                 phys = dma_pte_addr(pte) +
5588                         (iova & (BIT_MASK(level_to_offset_bits(level) +
5589                                                 VTD_PAGE_SHIFT) - 1));
5590
5591         return phys;
5592 }
5593
5594 static inline bool scalable_mode_support(void)
5595 {
5596         struct dmar_drhd_unit *drhd;
5597         struct intel_iommu *iommu;
5598         bool ret = true;
5599
5600         rcu_read_lock();
5601         for_each_active_iommu(iommu, drhd) {
5602                 if (!sm_supported(iommu)) {
5603                         ret = false;
5604                         break;
5605                 }
5606         }
5607         rcu_read_unlock();
5608
5609         return ret;
5610 }
5611
5612 static inline bool iommu_pasid_support(void)
5613 {
5614         struct dmar_drhd_unit *drhd;
5615         struct intel_iommu *iommu;
5616         bool ret = true;
5617
5618         rcu_read_lock();
5619         for_each_active_iommu(iommu, drhd) {
5620                 if (!pasid_supported(iommu)) {
5621                         ret = false;
5622                         break;
5623                 }
5624         }
5625         rcu_read_unlock();
5626
5627         return ret;
5628 }
5629
5630 static inline bool nested_mode_support(void)
5631 {
5632         struct dmar_drhd_unit *drhd;
5633         struct intel_iommu *iommu;
5634         bool ret = true;
5635
5636         rcu_read_lock();
5637         for_each_active_iommu(iommu, drhd) {
5638                 if (!sm_supported(iommu) || !ecap_nest(iommu->ecap)) {
5639                         ret = false;
5640                         break;
5641                 }
5642         }
5643         rcu_read_unlock();
5644
5645         return ret;
5646 }
5647
5648 static bool intel_iommu_capable(enum iommu_cap cap)
5649 {
5650         if (cap == IOMMU_CAP_CACHE_COHERENCY)
5651                 return domain_update_iommu_snooping(NULL) == 1;
5652         if (cap == IOMMU_CAP_INTR_REMAP)
5653                 return irq_remapping_enabled == 1;
5654
5655         return false;
5656 }
5657
5658 static struct iommu_device *intel_iommu_probe_device(struct device *dev)
5659 {
5660         struct intel_iommu *iommu;
5661         u8 bus, devfn;
5662
5663         iommu = device_to_iommu(dev, &bus, &devfn);
5664         if (!iommu)
5665                 return ERR_PTR(-ENODEV);
5666
5667         if (translation_pre_enabled(iommu))
5668                 dev->archdata.iommu = DEFER_DEVICE_DOMAIN_INFO;
5669
5670         return &iommu->iommu;
5671 }
5672
5673 static void intel_iommu_release_device(struct device *dev)
5674 {
5675         struct intel_iommu *iommu;
5676         u8 bus, devfn;
5677
5678         iommu = device_to_iommu(dev, &bus, &devfn);
5679         if (!iommu)
5680                 return;
5681
5682         dmar_remove_one_dev_info(dev);
5683
5684         set_dma_ops(dev, NULL);
5685 }
5686
5687 static void intel_iommu_probe_finalize(struct device *dev)
5688 {
5689         struct iommu_domain *domain;
5690
5691         domain = iommu_get_domain_for_dev(dev);
5692         if (device_needs_bounce(dev))
5693                 set_dma_ops(dev, &bounce_dma_ops);
5694         else if (domain && domain->type == IOMMU_DOMAIN_DMA)
5695                 set_dma_ops(dev, &intel_dma_ops);
5696         else
5697                 set_dma_ops(dev, NULL);
5698 }
5699
5700 static void intel_iommu_get_resv_regions(struct device *device,
5701                                          struct list_head *head)
5702 {
5703         int prot = DMA_PTE_READ | DMA_PTE_WRITE;
5704         struct iommu_resv_region *reg;
5705         struct dmar_rmrr_unit *rmrr;
5706         struct device *i_dev;
5707         int i;
5708
5709         down_read(&dmar_global_lock);
5710         for_each_rmrr_units(rmrr) {
5711                 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
5712                                           i, i_dev) {
5713                         struct iommu_resv_region *resv;
5714                         enum iommu_resv_type type;
5715                         size_t length;
5716
5717                         if (i_dev != device &&
5718                             !is_downstream_to_pci_bridge(device, i_dev))
5719                                 continue;
5720
5721                         length = rmrr->end_address - rmrr->base_address + 1;
5722
5723                         type = device_rmrr_is_relaxable(device) ?
5724                                 IOMMU_RESV_DIRECT_RELAXABLE : IOMMU_RESV_DIRECT;
5725
5726                         resv = iommu_alloc_resv_region(rmrr->base_address,
5727                                                        length, prot, type);
5728                         if (!resv)
5729                                 break;
5730
5731                         list_add_tail(&resv->list, head);
5732                 }
5733         }
5734         up_read(&dmar_global_lock);
5735
5736 #ifdef CONFIG_INTEL_IOMMU_FLOPPY_WA
5737         if (dev_is_pci(device)) {
5738                 struct pci_dev *pdev = to_pci_dev(device);
5739
5740                 if ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA) {
5741                         reg = iommu_alloc_resv_region(0, 1UL << 24, prot,
5742                                                    IOMMU_RESV_DIRECT_RELAXABLE);
5743                         if (reg)
5744                                 list_add_tail(&reg->list, head);
5745                 }
5746         }
5747 #endif /* CONFIG_INTEL_IOMMU_FLOPPY_WA */
5748
5749         reg = iommu_alloc_resv_region(IOAPIC_RANGE_START,
5750                                       IOAPIC_RANGE_END - IOAPIC_RANGE_START + 1,
5751                                       0, IOMMU_RESV_MSI);
5752         if (!reg)
5753                 return;
5754         list_add_tail(&reg->list, head);
5755 }
5756
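/*
 * Enable PASID support for @dev: set the PASID-enable bit in its legacy
 * context entry, flushing the context cache if the entry changed, and
 * enable PASID on the device itself if it was not already enabled.
 */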
5757 int intel_iommu_enable_pasid(struct intel_iommu *iommu, struct device *dev)
5758 {
5759         struct device_domain_info *info;
5760         struct context_entry *context;
5761         struct dmar_domain *domain;
5762         unsigned long flags;
5763         u64 ctx_lo;
5764         int ret;
5765
5766         domain = find_domain(dev);
5767         if (!domain)
5768                 return -EINVAL;
5769
5770         spin_lock_irqsave(&device_domain_lock, flags);
5771         spin_lock(&iommu->lock);
5772
5773         ret = -EINVAL;
5774         info = get_domain_info(dev);
5775         if (!info || !info->pasid_supported)
5776                 goto out;
5777
5778         context = iommu_context_addr(iommu, info->bus, info->devfn, 0);
5779         if (WARN_ON(!context))
5780                 goto out;
5781
5782         ctx_lo = context[0].lo;
5783
5784         if (!(ctx_lo & CONTEXT_PASIDE)) {
5785                 ctx_lo |= CONTEXT_PASIDE;
5786                 context[0].lo = ctx_lo;
5787                 wmb();
5788                 iommu->flush.flush_context(iommu,
5789                                            domain->iommu_did[iommu->seq_id],
5790                                            PCI_DEVID(info->bus, info->devfn),
5791                                            DMA_CCMD_MASK_NOBIT,
5792                                            DMA_CCMD_DEVICE_INVL);
5793         }
5794
5795         /* Enable PASID support in the device, if it wasn't already */
5796         if (!info->pasid_enabled)
5797                 iommu_enable_dev_iotlb(info);
5798
5799         ret = 0;
5800
5801  out:
5802         spin_unlock(&iommu->lock);
5803         spin_unlock_irqrestore(&device_domain_lock, flags);
5804
5805         return ret;
5806 }
5807
5808 static void intel_iommu_apply_resv_region(struct device *dev,
5809                                           struct iommu_domain *domain,
5810                                           struct iommu_resv_region *region)
5811 {
5812         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5813         unsigned long start, end;
5814
5815         start = IOVA_PFN(region->start);
5816         end   = IOVA_PFN(region->start + region->length - 1);
5817
5818         WARN_ON_ONCE(!reserve_iova(&dmar_domain->iovad, start, end));
5819 }
5820
5821 static struct iommu_group *intel_iommu_device_group(struct device *dev)
5822 {
5823         if (dev_is_pci(dev))
5824                 return pci_device_group(dev);
5825         return generic_device_group(dev);
5826 }
5827
5828 #ifdef CONFIG_INTEL_IOMMU_SVM
5829 struct intel_iommu *intel_svm_device_to_iommu(struct device *dev)
5830 {
5831         struct intel_iommu *iommu;
5832         u8 bus, devfn;
5833
5834         if (iommu_dummy(dev)) {
5835                 dev_warn(dev,
5836                          "No IOMMU translation for device; cannot enable SVM\n");
5837                 return NULL;
5838         }
5839
5840         iommu = device_to_iommu(dev, &bus, &devfn);
5841         if (!iommu) {
5842                 dev_err(dev, "No IOMMU for device; cannot enable SVM\n");
5843                 return NULL;
5844         }
5845
5846         return iommu;
5847 }
5848 #endif /* CONFIG_INTEL_IOMMU_SVM */
5849
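/*
 * Mark @dev as able to use auxiliary domains (e.g. for scalable-mode
 * mediated devices); requires scalable mode and PASID support on the
 * device's IOMMU.
 */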
5850 static int intel_iommu_enable_auxd(struct device *dev)
5851 {
5852         struct device_domain_info *info;
5853         struct intel_iommu *iommu;
5854         unsigned long flags;
5855         u8 bus, devfn;
5856         int ret;
5857
5858         iommu = device_to_iommu(dev, &bus, &devfn);
5859         if (!iommu || dmar_disabled)
5860                 return -EINVAL;
5861
5862         if (!sm_supported(iommu) || !pasid_supported(iommu))
5863                 return -EINVAL;
5864
5865         ret = intel_iommu_enable_pasid(iommu, dev);
5866         if (ret)
5867                 return -ENODEV;
5868
5869         spin_lock_irqsave(&device_domain_lock, flags);
5870         info = get_domain_info(dev);
5871         info->auxd_enabled = 1;
5872         spin_unlock_irqrestore(&device_domain_lock, flags);
5873
5874         return 0;
5875 }
5876
5877 static int intel_iommu_disable_auxd(struct device *dev)
5878 {
5879         struct device_domain_info *info;
5880         unsigned long flags;
5881
5882         spin_lock_irqsave(&device_domain_lock, flags);
5883         info = get_domain_info(dev);
5884         if (!WARN_ON(!info))
5885                 info->auxd_enabled = 0;
5886         spin_unlock_irqrestore(&device_domain_lock, flags);
5887
5888         return 0;
5889 }
5890
5891 /*
5892  * A PCI Express Designated Vendor-Specific Extended Capability (DVSEC) is
5893  * defined in section 3.7 of the Intel Scalable I/O Virtualization spec so
5894  * that system software and tools can detect endpoint devices supporting
5895  * Intel Scalable I/O Virtualization without a host driver dependency.
5896  *
5897  * Returns the address of the matching extended capability structure within
5898  * the device's PCI configuration space or 0 if the device does not support
5899  * it.
5900  */
5901 static int siov_find_pci_dvsec(struct pci_dev *pdev)
5902 {
5903         int pos;
5904         u16 vendor, id;
5905
5906         pos = pci_find_next_ext_capability(pdev, 0, 0x23);
5907         while (pos) {
5908                 pci_read_config_word(pdev, pos + 4, &vendor);
5909                 pci_read_config_word(pdev, pos + 8, &id);
5910                 if (vendor == PCI_VENDOR_ID_INTEL && id == 5)
5911                         return pos;
5912
5913                 pos = pci_find_next_ext_capability(pdev, pos, 0x23);
5914         }
5915
5916         return 0;
5917 }
5918
5919 static bool
5920 intel_iommu_dev_has_feat(struct device *dev, enum iommu_dev_features feat)
5921 {
5922         if (feat == IOMMU_DEV_FEAT_AUX) {
5923                 int ret;
5924
5925                 if (!dev_is_pci(dev) || dmar_disabled ||
5926                     !scalable_mode_support() || !iommu_pasid_support())
5927                         return false;
5928
5929                 ret = pci_pasid_features(to_pci_dev(dev));
5930                 if (ret < 0)
5931                         return false;
5932
5933                 return !!siov_find_pci_dvsec(to_pci_dev(dev));
5934         }
5935
5936         if (feat == IOMMU_DEV_FEAT_SVA) {
5937                 struct device_domain_info *info = get_domain_info(dev);
5938
5939                 return info && (info->iommu->flags & VTD_FLAG_SVM_CAPABLE) &&
5940                         info->pasid_supported && info->pri_supported &&
5941                         info->ats_supported;
5942         }
5943
5944         return false;
5945 }
5946
5947 static int
5948 intel_iommu_dev_enable_feat(struct device *dev, enum iommu_dev_features feat)
5949 {
5950         if (feat == IOMMU_DEV_FEAT_AUX)
5951                 return intel_iommu_enable_auxd(dev);
5952
5953         if (feat == IOMMU_DEV_FEAT_SVA) {
5954                 struct device_domain_info *info = get_domain_info(dev);
5955
5956                 if (!info)
5957                         return -EINVAL;
5958
5959                 if (info->iommu->flags & VTD_FLAG_SVM_CAPABLE)
5960                         return 0;
5961         }
5962
5963         return -ENODEV;
5964 }
5965
5966 static int
5967 intel_iommu_dev_disable_feat(struct device *dev, enum iommu_dev_features feat)
5968 {
5969         if (feat == IOMMU_DEV_FEAT_AUX)
5970                 return intel_iommu_disable_auxd(dev);
5971
5972         return -ENODEV;
5973 }
5974
5975 static bool
5976 intel_iommu_dev_feat_enabled(struct device *dev, enum iommu_dev_features feat)
5977 {
5978         struct device_domain_info *info = get_domain_info(dev);
5979
5980         if (feat == IOMMU_DEV_FEAT_AUX)
5981                 return scalable_mode_support() && info && info->auxd_enabled;
5982
5983         return false;
5984 }
5985
5986 static int
5987 intel_iommu_aux_get_pasid(struct iommu_domain *domain, struct device *dev)
5988 {
5989         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5990
5991         return dmar_domain->default_pasid > 0 ?
5992                         dmar_domain->default_pasid : -EINVAL;
5993 }
5994
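     /*
      * Illustrative sketch only, not part of the driver: how a client driver
      * might exercise the aux-domain hooks above through the generic IOMMU
      * API, assuming the iommu_dev_*_feature()/iommu_aux_*() wrappers exported
      * by <linux/iommu.h> at this point in time. The function name and error
      * handling are hypothetical.
      */
     static int __maybe_unused example_aux_domain_setup(struct device *dev,
                                                        struct iommu_domain **out)
     {
             struct iommu_domain *domain;
             int pasid, ret;

             if (!iommu_dev_has_feature(dev, IOMMU_DEV_FEAT_AUX))
                     return -ENODEV;

             ret = iommu_dev_enable_feature(dev, IOMMU_DEV_FEAT_AUX);
             if (ret)
                     return ret;

             /* Each auxiliary address space is backed by an UNMANAGED domain. */
             domain = iommu_domain_alloc(dev->bus);
             if (!domain) {
                     iommu_dev_disable_feature(dev, IOMMU_DEV_FEAT_AUX);
                     return -ENOMEM;
             }

             ret = iommu_aux_attach_device(domain, dev);
             if (ret) {
                     iommu_domain_free(domain);
                     iommu_dev_disable_feature(dev, IOMMU_DEV_FEAT_AUX);
                     return ret;
             }

             /* The PASID to program into the device's work submissions. */
             pasid = iommu_aux_get_pasid(domain, dev);
             if (pasid < 0) {
                     iommu_aux_detach_device(domain, dev);
                     iommu_domain_free(domain);
                     iommu_dev_disable_feature(dev, IOMMU_DEV_FEAT_AUX);
                     return pasid;
             }

             *out = domain;
             return 0;
     }
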
5995 static bool intel_iommu_is_attach_deferred(struct iommu_domain *domain,
5996                                            struct device *dev)
5997 {
5998         return attach_deferred(dev);
5999 }
6000
6001 static int
6002 intel_iommu_domain_set_attr(struct iommu_domain *domain,
6003                             enum iommu_attr attr, void *data)
6004 {
6005         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
6006         unsigned long flags;
6007         int ret = 0;
6008
6009         if (domain->type != IOMMU_DOMAIN_UNMANAGED)
6010                 return -EINVAL;
6011
6012         switch (attr) {
6013         case DOMAIN_ATTR_NESTING:
6014                 spin_lock_irqsave(&device_domain_lock, flags);
6015                 if (nested_mode_support() &&
6016                     list_empty(&dmar_domain->devices)) {
6017                         dmar_domain->flags |= DOMAIN_FLAG_NESTING_MODE;
6018                         dmar_domain->flags &= ~DOMAIN_FLAG_USE_FIRST_LEVEL;
6019                 } else {
6020                         ret = -ENODEV;
6021                 }
6022                 spin_unlock_irqrestore(&device_domain_lock, flags);
6023                 break;
6024         default:
6025                 ret = -EINVAL;
6026                 break;
6027         }
6028
6029         return ret;
6030 }
6031
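     /*
      * Illustrative sketch only, not part of the driver: per the checks in
      * intel_iommu_domain_set_attr() above, nesting has to be requested on an
      * UNMANAGED domain before any device is attached to it. Assumes the
      * generic iommu_domain_set_attr() wrapper; the function name is
      * hypothetical.
      */
     static int __maybe_unused example_enable_nesting(struct iommu_domain *domain)
     {
             int enable = 1;

             /* Must run while the domain's device list is still empty. */
             return iommu_domain_set_attr(domain, DOMAIN_ATTR_NESTING, &enable);
     }
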
6032 /*
6033  * Check that the device does not live on an external-facing PCI port that is
6034  * marked as untrusted. Such devices should not be able to apply quirks and
6035  * thus not be able to bypass the IOMMU restrictions.
6036  */
6037 static bool risky_device(struct pci_dev *pdev)
6038 {
6039         if (pdev->untrusted) {
6040                 pci_info(pdev,
6041                          "Skipping IOMMU quirk for dev [%04X:%04X] on untrusted PCI link\n",
6042                          pdev->vendor, pdev->device);
6043                 pci_info(pdev, "Please check with your BIOS/Platform vendor about this\n");
6044                 return true;
6045         }
6046         return false;
6047 }
6048
6049 const struct iommu_ops intel_iommu_ops = {
6050         .capable                = intel_iommu_capable,
6051         .domain_alloc           = intel_iommu_domain_alloc,
6052         .domain_free            = intel_iommu_domain_free,
6053         .domain_set_attr        = intel_iommu_domain_set_attr,
6054         .attach_dev             = intel_iommu_attach_device,
6055         .detach_dev             = intel_iommu_detach_device,
6056         .aux_attach_dev         = intel_iommu_aux_attach_device,
6057         .aux_detach_dev         = intel_iommu_aux_detach_device,
6058         .aux_get_pasid          = intel_iommu_aux_get_pasid,
6059         .map                    = intel_iommu_map,
6060         .unmap                  = intel_iommu_unmap,
6061         .iova_to_phys           = intel_iommu_iova_to_phys,
6062         .probe_device           = intel_iommu_probe_device,
6063         .probe_finalize         = intel_iommu_probe_finalize,
6064         .release_device         = intel_iommu_release_device,
6065         .get_resv_regions       = intel_iommu_get_resv_regions,
6066         .put_resv_regions       = generic_iommu_put_resv_regions,
6067         .apply_resv_region      = intel_iommu_apply_resv_region,
6068         .device_group           = intel_iommu_device_group,
6069         .dev_has_feat           = intel_iommu_dev_has_feat,
6070         .dev_feat_enabled       = intel_iommu_dev_feat_enabled,
6071         .dev_enable_feat        = intel_iommu_dev_enable_feat,
6072         .dev_disable_feat       = intel_iommu_dev_disable_feat,
6073         .is_attach_deferred     = intel_iommu_is_attach_deferred,
6074         .def_domain_type        = device_def_domain_type,
6075         .pgsize_bitmap          = INTEL_IOMMU_PGSIZES,
6076 #ifdef CONFIG_INTEL_IOMMU_SVM
6077         .cache_invalidate       = intel_iommu_sva_invalidate,
6078         .sva_bind_gpasid        = intel_svm_bind_gpasid,
6079         .sva_unbind_gpasid      = intel_svm_unbind_gpasid,
6080         .sva_bind               = intel_svm_bind,
6081         .sva_unbind             = intel_svm_unbind,
6082         .sva_get_pasid          = intel_svm_get_pasid,
6083 #endif
6084 };
6085
6086 static void quirk_iommu_igfx(struct pci_dev *dev)
6087 {
6088         if (risky_device(dev))
6089                 return;
6090
6091         pci_info(dev, "Disabling IOMMU for graphics on this chipset\n");
6092         dmar_map_gfx = 0;
6093 }
6094
6095 /* G4x/GM45 integrated gfx dmar support is totally busted. */
6096 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_igfx);
6097 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_igfx);
6098 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_igfx);
6099 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_igfx);
6100 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_igfx);
6101 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_igfx);
6102 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_igfx);
6103
6104 /* Broadwell igfx malfunctions with dmar */
6105 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1606, quirk_iommu_igfx);
6106 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160B, quirk_iommu_igfx);
6107 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160E, quirk_iommu_igfx);
6108 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1602, quirk_iommu_igfx);
6109 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160A, quirk_iommu_igfx);
6110 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160D, quirk_iommu_igfx);
6111 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1616, quirk_iommu_igfx);
6112 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161B, quirk_iommu_igfx);
6113 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161E, quirk_iommu_igfx);
6114 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1612, quirk_iommu_igfx);
6115 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161A, quirk_iommu_igfx);
6116 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161D, quirk_iommu_igfx);
6117 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1626, quirk_iommu_igfx);
6118 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162B, quirk_iommu_igfx);
6119 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162E, quirk_iommu_igfx);
6120 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1622, quirk_iommu_igfx);
6121 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162A, quirk_iommu_igfx);
6122 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162D, quirk_iommu_igfx);
6123 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1636, quirk_iommu_igfx);
6124 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163B, quirk_iommu_igfx);
6125 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163E, quirk_iommu_igfx);
6126 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1632, quirk_iommu_igfx);
6127 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163A, quirk_iommu_igfx);
6128 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163D, quirk_iommu_igfx);
6129
6130 static void quirk_iommu_rwbf(struct pci_dev *dev)
6131 {
6132         if (risky_device(dev))
6133                 return;
6134
6135         /*
6136          * Mobile 4 Series Chipset neglects to set RWBF capability,
6137          * but needs it. Same seems to hold for the desktop versions.
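              * Setting rwbf_quirk below makes iommu_flush_write_buffer() act
              * even when cap_rwbf() is not advertised by the hardware.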
6138          */
6139         pci_info(dev, "Forcing write-buffer flush capability\n");
6140         rwbf_quirk = 1;
6141 }
6142
6143 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf);
6144 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_rwbf);
6145 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_rwbf);
6146 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_rwbf);
6147 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_rwbf);
6148 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_rwbf);
6149 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_rwbf);
6150
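     /*
      * GGC is the GMCH Graphics Control register in the IGD host bridge's
      * config space (per the chipset datasheets for the Ironlake/Arrandale
      * parts targeted below). The fields masked here describe how much stolen
      * memory the BIOS set aside for the GTT and whether that allocation
      * includes room for the VT-d shadow GTT (the "VT" encodings).
      */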
6151 #define GGC 0x52
6152 #define GGC_MEMORY_SIZE_MASK    (0xf << 8)
6153 #define GGC_MEMORY_SIZE_NONE    (0x0 << 8)
6154 #define GGC_MEMORY_SIZE_1M      (0x1 << 8)
6155 #define GGC_MEMORY_SIZE_2M      (0x3 << 8)
6156 #define GGC_MEMORY_VT_ENABLED   (0x8 << 8)
6157 #define GGC_MEMORY_SIZE_2M_VT   (0x9 << 8)
6158 #define GGC_MEMORY_SIZE_3M_VT   (0xa << 8)
6159 #define GGC_MEMORY_SIZE_4M_VT   (0xb << 8)
6160
6161 static void quirk_calpella_no_shadow_gtt(struct pci_dev *dev)
6162 {
6163         unsigned short ggc;
6164
6165         if (risky_device(dev))
6166                 return;
6167
6168         if (pci_read_config_word(dev, GGC, &ggc))
6169                 return;
6170
6171         if (!(ggc & GGC_MEMORY_VT_ENABLED)) {
6172                 pci_info(dev, "BIOS has allocated no shadow GTT; disabling IOMMU for graphics\n");
6173                 dmar_map_gfx = 0;
6174         } else if (dmar_map_gfx) {
6175                 /* we have to ensure the gfx device is idle before we flush */
6176                 pci_info(dev, "Disabling batched IOTLB flush on Ironlake\n");
6177                 intel_iommu_strict = 1;
6178         }
6179 }
6180 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0040, quirk_calpella_no_shadow_gtt);
6181 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0044, quirk_calpella_no_shadow_gtt);
6182 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0062, quirk_calpella_no_shadow_gtt);
6183 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x006a, quirk_calpella_no_shadow_gtt);
6184
6185 /* On Tylersburg chipsets, some BIOSes have been known to enable the
6186    ISOCH DMAR unit for the Azalia sound device, but not give it any
6187    TLB entries, which causes it to deadlock. Check for that.  We do
6188    this in a function called from init_dmars(), instead of in a PCI
6189    quirk, because we don't want to print the obnoxious "BIOS broken"
6190    message if VT-d is actually disabled.
6191 */
6192 static void __init check_tylersburg_isoch(void)
6193 {
6194         struct pci_dev *pdev;
6195         uint32_t vtisochctrl;
6196
6197         /* If there's no Azalia in the system anyway, forget it. */
6198         pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3a3e, NULL);
6199         if (!pdev)
6200                 return;
6201
6202         if (risky_device(pdev)) {
6203                 pci_dev_put(pdev);
6204                 return;
6205         }
6206
6207         pci_dev_put(pdev);
6208
6209         /* System Management Registers. Might be hidden, in which case
6210            we can't do the sanity check. But that's OK, because the
6211            known-broken BIOSes _don't_ actually hide it, so far. */
6212         pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x342e, NULL);
6213         if (!pdev)
6214                 return;
6215
6216         if (risky_device(pdev)) {
6217                 pci_dev_put(pdev);
6218                 return;
6219         }
6220
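             /*
              * Config offset 0x188 of that device appears to hold the VT-d
              * isochrony controls (hence the local name): bit 0 selects which
              * DMAR unit handles Azalia DMA, and the bits masked further down
              * give the isoch unit's TLB allocation.
              */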
6221         if (pci_read_config_dword(pdev, 0x188, &vtisochctrl)) {
6222                 pci_dev_put(pdev);
6223                 return;
6224         }
6225
6226         pci_dev_put(pdev);
6227
6228         /* If Azalia DMA is routed to the non-isoch DMAR unit, fine. */
6229         if (vtisochctrl & 1)
6230                 return;
6231
6232         /* Drop all bits other than the number of TLB entries (bits 4:2) */
6233         vtisochctrl &= 0x1c;
6234
6235         /* If we have the recommended number of TLB entries (16), fine. */
6236         if (vtisochctrl == 0x10)
6237                 return;
6238
6239         /* Zero TLB entries? Then the deadlock described above is guaranteed. */
6240         if (!vtisochctrl) {
6241                 WARN(1, "Your BIOS is broken; DMA routed to ISOCH DMAR unit but no TLB space.\n"
6242                      "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
6243                      dmi_get_system_info(DMI_BIOS_VENDOR),
6244                      dmi_get_system_info(DMI_BIOS_VERSION),
6245                      dmi_get_system_info(DMI_PRODUCT_VERSION));
6246                 iommu_identity_mapping |= IDENTMAP_AZALIA;
6247                 return;
6248         }
6249
6250         pr_warn("Recommended TLB entries for ISOCH unit is 16; your BIOS set %d\n",
6251                vtisochctrl);
6252 }