iommu/vt-d: Delegate the dma domain to upper layer
drivers/iommu/intel-iommu.c (linux-2.6-block.git)
1 /*
2  * Copyright © 2006-2014 Intel Corporation.
3  *
4  * This program is free software; you can redistribute it and/or modify it
5  * under the terms and conditions of the GNU General Public License,
6  * version 2, as published by the Free Software Foundation.
7  *
8  * This program is distributed in the hope it will be useful, but WITHOUT
9  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
10  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
11  * more details.
12  *
13  * Authors: David Woodhouse <dwmw2@infradead.org>,
14  *          Ashok Raj <ashok.raj@intel.com>,
15  *          Shaohua Li <shaohua.li@intel.com>,
16  *          Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>,
17  *          Fenghua Yu <fenghua.yu@intel.com>
18  *          Joerg Roedel <jroedel@suse.de>
19  */
20
21 #define pr_fmt(fmt)     "DMAR: " fmt
22 #define dev_fmt(fmt)    pr_fmt(fmt)
23
24 #include <linux/init.h>
25 #include <linux/bitmap.h>
26 #include <linux/debugfs.h>
27 #include <linux/export.h>
28 #include <linux/slab.h>
29 #include <linux/irq.h>
30 #include <linux/interrupt.h>
31 #include <linux/spinlock.h>
32 #include <linux/pci.h>
33 #include <linux/dmar.h>
34 #include <linux/dma-mapping.h>
35 #include <linux/mempool.h>
36 #include <linux/memory.h>
37 #include <linux/cpu.h>
38 #include <linux/timer.h>
39 #include <linux/io.h>
40 #include <linux/iova.h>
41 #include <linux/iommu.h>
42 #include <linux/intel-iommu.h>
43 #include <linux/syscore_ops.h>
44 #include <linux/tboot.h>
45 #include <linux/dmi.h>
46 #include <linux/pci-ats.h>
47 #include <linux/memblock.h>
48 #include <linux/dma-contiguous.h>
49 #include <linux/dma-direct.h>
50 #include <linux/crash_dump.h>
51 #include <linux/numa.h>
52 #include <asm/irq_remapping.h>
53 #include <asm/cacheflush.h>
54 #include <asm/iommu.h>
55
56 #include "irq_remapping.h"
57 #include "intel-pasid.h"
58
59 #define ROOT_SIZE               VTD_PAGE_SIZE
60 #define CONTEXT_SIZE            VTD_PAGE_SIZE
61
62 #define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
63 #define IS_USB_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_SERIAL_USB)
64 #define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
65 #define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e)
66
67 #define IOAPIC_RANGE_START      (0xfee00000)
68 #define IOAPIC_RANGE_END        (0xfeefffff)
69 #define IOVA_START_ADDR         (0x1000)
70
71 #define DEFAULT_DOMAIN_ADDRESS_WIDTH 57
72
73 #define MAX_AGAW_WIDTH 64
74 #define MAX_AGAW_PFN_WIDTH      (MAX_AGAW_WIDTH - VTD_PAGE_SHIFT)
75
76 #define __DOMAIN_MAX_PFN(gaw)  ((((uint64_t)1) << (gaw-VTD_PAGE_SHIFT)) - 1)
77 #define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << gaw) - 1)
78
79 /* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR
80    to match. That way, we can use 'unsigned long' for PFNs with impunity. */
81 #define DOMAIN_MAX_PFN(gaw)     ((unsigned long) min_t(uint64_t, \
82                                 __DOMAIN_MAX_PFN(gaw), (unsigned long)-1))
83 #define DOMAIN_MAX_ADDR(gaw)    (((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT)
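/*
 * Example: with gaw == 48, __DOMAIN_MAX_PFN(48) is (1ULL << 36) - 1 and
 * DOMAIN_MAX_ADDR(48) is 0xfffffffff000, the last 4KiB-aligned address
 * below 2^48.  The min_t() clamp to ULONG_MAX only matters on 32-bit
 * builds, where PFNs are stored in a 32-bit unsigned long.
 */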
84
85 /* IO virtual address start page frame number */
86 #define IOVA_START_PFN          (1)
87
88 #define IOVA_PFN(addr)          ((addr) >> PAGE_SHIFT)
89
90 /* page table handling */
91 #define LEVEL_STRIDE            (9)
92 #define LEVEL_MASK              (((u64)1 << LEVEL_STRIDE) - 1)
93
94 /*
95  * This bitmap is used to advertise the page sizes our hardware supports
96  * to the IOMMU core, which will then use this information to split
97  * physically contiguous memory regions it is mapping into page sizes
98  * that we support.
99  *
100  * Traditionally the IOMMU core just handed us the mappings directly,
101  * after making sure the size is an order of a 4KiB page and that the
102  * mapping has natural alignment.
103  *
104  * To retain this behavior, we currently advertise that we support
105  * all page sizes that are an order of 4KiB.
106  *
107  * If at some point we'd like to utilize the IOMMU core's new behavior,
108  * we could change this to advertise the real page sizes we support.
109  */
110 #define INTEL_IOMMU_PGSIZES     (~0xFFFUL)
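/*
 * ~0xFFFUL sets every bit from bit 12 upwards, so the bitmap advertises
 * 4KiB, 8KiB, 16KiB, ... - every power-of-two size that is a multiple of
 * 4KiB - matching the "all orders of 4KiB" behaviour described above.
 */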
111
112 static inline int agaw_to_level(int agaw)
113 {
114         return agaw + 2;
115 }
116
117 static inline int agaw_to_width(int agaw)
118 {
119         return min_t(int, 30 + agaw * LEVEL_STRIDE, MAX_AGAW_WIDTH);
120 }
121
122 static inline int width_to_agaw(int width)
123 {
124         return DIV_ROUND_UP(width - 30, LEVEL_STRIDE);
125 }
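/*
 * Worked example: a 48-bit address width gives width_to_agaw(48) == 2,
 * agaw_to_width(2) == 48 and agaw_to_level(2) == 4, i.e. a 4-level page
 * table; DEFAULT_DOMAIN_ADDRESS_WIDTH (57) maps to agaw 3 and a 5-level
 * table.
 */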
126
127 static inline unsigned int level_to_offset_bits(int level)
128 {
129         return (level - 1) * LEVEL_STRIDE;
130 }
131
132 static inline int pfn_level_offset(unsigned long pfn, int level)
133 {
134         return (pfn >> level_to_offset_bits(level)) & LEVEL_MASK;
135 }
136
137 static inline unsigned long level_mask(int level)
138 {
139         return -1UL << level_to_offset_bits(level);
140 }
141
142 static inline unsigned long level_size(int level)
143 {
144         return 1UL << level_to_offset_bits(level);
145 }
146
147 static inline unsigned long align_to_level(unsigned long pfn, int level)
148 {
149         return (pfn + level_size(level) - 1) & level_mask(level);
150 }
151
152 static inline unsigned long lvl_to_nr_pages(unsigned int lvl)
153 {
154         return  1 << min_t(int, (lvl - 1) * LEVEL_STRIDE, MAX_AGAW_PFN_WIDTH);
155 }
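/*
 * Example: a level-1 entry covers a single 4KiB page, a level-2 entry
 * covers level_size(2) == 512 pages (2MiB) and a level-3 entry covers
 * 512 * 512 pages (1GiB); pfn_level_offset() extracts the 9-bit index
 * into the table at the given level.
 */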
156
157 /* VT-d pages must always be _smaller_ than MM pages. Otherwise things
158    are never going to work. */
159 static inline unsigned long dma_to_mm_pfn(unsigned long dma_pfn)
160 {
161         return dma_pfn >> (PAGE_SHIFT - VTD_PAGE_SHIFT);
162 }
163
164 static inline unsigned long mm_to_dma_pfn(unsigned long mm_pfn)
165 {
166         return mm_pfn << (PAGE_SHIFT - VTD_PAGE_SHIFT);
167 }
168 static inline unsigned long page_to_dma_pfn(struct page *pg)
169 {
170         return mm_to_dma_pfn(page_to_pfn(pg));
171 }
172 static inline unsigned long virt_to_dma_pfn(void *p)
173 {
174         return page_to_dma_pfn(virt_to_page(p));
175 }
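/*
 * On x86, PAGE_SHIFT == VTD_PAGE_SHIFT == 12, so the conversions above
 * are identities; with a larger MM page size each MM pfn would map to
 * PAGE_SIZE / VTD_PAGE_SIZE consecutive DMA pfns.
 */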
176
177 /* global iommu list, set NULL for ignored DMAR units */
178 static struct intel_iommu **g_iommus;
179
180 static void __init check_tylersburg_isoch(void);
181 static int rwbf_quirk;
182
183 /*
184  * set to 1 to panic the kernel if VT-d can't be successfully enabled
185  * (used when the kernel is launched with TXT)
186  */
187 static int force_on = 0;
188 int intel_iommu_tboot_noforce;
189 static int no_platform_optin;
190
191 #define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
192
193 /*
194  * Take a root_entry and return the Lower Context Table Pointer (LCTP)
195  * if marked present.
196  */
197 static phys_addr_t root_entry_lctp(struct root_entry *re)
198 {
199         if (!(re->lo & 1))
200                 return 0;
201
202         return re->lo & VTD_PAGE_MASK;
203 }
204
205 /*
206  * Take a root_entry and return the Upper Context Table Pointer (UCTP)
207  * if marked present.
208  */
209 static phys_addr_t root_entry_uctp(struct root_entry *re)
210 {
211         if (!(re->hi & 1))
212                 return 0;
213
214         return re->hi & VTD_PAGE_MASK;
215 }
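/*
 * In both halves of a root entry, bit 0 is the present bit and the
 * page-aligned remainder is the physical address of a context table.
 * Legacy mode uses only the lower half; in scalable mode the lower half
 * covers devfns 0x00-0x7f and the upper half 0x80-0xff (see
 * iommu_context_addr()).
 */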
216
217 static inline void context_clear_pasid_enable(struct context_entry *context)
218 {
219         context->lo &= ~(1ULL << 11);
220 }
221
222 static inline bool context_pasid_enabled(struct context_entry *context)
223 {
224         return !!(context->lo & (1ULL << 11));
225 }
226
227 static inline void context_set_copied(struct context_entry *context)
228 {
229         context->hi |= (1ull << 3);
230 }
231
232 static inline bool context_copied(struct context_entry *context)
233 {
234         return !!(context->hi & (1ULL << 3));
235 }
236
237 static inline bool __context_present(struct context_entry *context)
238 {
239         return (context->lo & 1);
240 }
241
242 bool context_present(struct context_entry *context)
243 {
244         return context_pasid_enabled(context) ?
245              __context_present(context) :
246              __context_present(context) && !context_copied(context);
247 }
248
249 static inline void context_set_present(struct context_entry *context)
250 {
251         context->lo |= 1;
252 }
253
254 static inline void context_set_fault_enable(struct context_entry *context)
255 {
256         context->lo &= (((u64)-1) << 2) | 1;
257 }
258
259 static inline void context_set_translation_type(struct context_entry *context,
260                                                 unsigned long value)
261 {
262         context->lo &= (((u64)-1) << 4) | 3;
263         context->lo |= (value & 3) << 2;
264 }
265
266 static inline void context_set_address_root(struct context_entry *context,
267                                             unsigned long value)
268 {
269         context->lo &= ~VTD_PAGE_MASK;
270         context->lo |= value & VTD_PAGE_MASK;
271 }
272
273 static inline void context_set_address_width(struct context_entry *context,
274                                              unsigned long value)
275 {
276         context->hi |= value & 7;
277 }
278
279 static inline void context_set_domain_id(struct context_entry *context,
280                                          unsigned long value)
281 {
282         context->hi |= (value & ((1 << 16) - 1)) << 8;
283 }
284
285 static inline int context_domain_id(struct context_entry *c)
286 {
287         return((c->hi >> 8) & 0xffff);
288 }
289
290 static inline void context_clear_entry(struct context_entry *context)
291 {
292         context->lo = 0;
293         context->hi = 0;
294 }
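/*
 * Context entry layout as used by the helpers above: lo holds the present
 * bit (0), fault processing disable (1), translation type (2-3), PASID
 * enable (11) and the address root (12-63); hi holds the address width
 * (0-2), the "copied" marker for entries inherited from a previous kernel
 * (3) and the domain id (8-23).
 */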
295
296 /*
297  * This domain is a static identity mapping domain.
298  *      1. This domain creates a static 1:1 mapping to all usable memory.
299  *      2. It maps to each iommu if successful.
300  *      3. Each iommu maps to this domain if successful.
301  */
302 static struct dmar_domain *si_domain;
303 static int hw_pass_through = 1;
304
305 /* si_domain contains multiple devices */
306 #define DOMAIN_FLAG_STATIC_IDENTITY             BIT(0)
307
308 #define for_each_domain_iommu(idx, domain)                      \
309         for (idx = 0; idx < g_num_of_iommus; idx++)             \
310                 if (domain->iommu_refcnt[idx])
311
312 struct dmar_rmrr_unit {
313         struct list_head list;          /* list of rmrr units   */
314         struct acpi_dmar_header *hdr;   /* ACPI header          */
315         u64     base_address;           /* reserved base address*/
316         u64     end_address;            /* reserved end address */
317         struct dmar_dev_scope *devices; /* target devices */
318         int     devices_cnt;            /* target device count */
319         struct iommu_resv_region *resv; /* reserved region handle */
320 };
321
322 struct dmar_atsr_unit {
323         struct list_head list;          /* list of ATSR units */
324         struct acpi_dmar_header *hdr;   /* ACPI header */
325         struct dmar_dev_scope *devices; /* target devices */
326         int devices_cnt;                /* target device count */
327         u8 include_all:1;               /* include all ports */
328 };
329
330 static LIST_HEAD(dmar_atsr_units);
331 static LIST_HEAD(dmar_rmrr_units);
332
333 #define for_each_rmrr_units(rmrr) \
334         list_for_each_entry(rmrr, &dmar_rmrr_units, list)
335
336 /* number of registered intel_iommus; sizes g_iommus and per-domain iommu arrays */
337 static int g_num_of_iommus;
338
339 static void domain_exit(struct dmar_domain *domain);
340 static void domain_remove_dev_info(struct dmar_domain *domain);
341 static void dmar_remove_one_dev_info(struct device *dev);
342 static void __dmar_remove_one_dev_info(struct device_domain_info *info);
343 static void domain_context_clear(struct intel_iommu *iommu,
344                                  struct device *dev);
345 static int domain_detach_iommu(struct dmar_domain *domain,
346                                struct intel_iommu *iommu);
347 static bool device_is_rmrr_locked(struct device *dev);
348
349 #ifdef CONFIG_INTEL_IOMMU_DEFAULT_ON
350 int dmar_disabled = 0;
351 #else
352 int dmar_disabled = 1;
353 #endif /*CONFIG_INTEL_IOMMU_DEFAULT_ON*/
354
355 int intel_iommu_sm;
356 int intel_iommu_enabled = 0;
357 EXPORT_SYMBOL_GPL(intel_iommu_enabled);
358
359 static int dmar_map_gfx = 1;
360 static int dmar_forcedac;
361 static int intel_iommu_strict;
362 static int intel_iommu_superpage = 1;
363 static int iommu_identity_mapping;
364
365 #define IDENTMAP_ALL            1
366 #define IDENTMAP_GFX            2
367 #define IDENTMAP_AZALIA         4
368
369 int intel_iommu_gfx_mapped;
370 EXPORT_SYMBOL_GPL(intel_iommu_gfx_mapped);
371
372 #define DUMMY_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-1))
373 static DEFINE_SPINLOCK(device_domain_lock);
374 static LIST_HEAD(device_domain_list);
375
376 /*
377  * Iterate over elements in device_domain_list and call the specified
378  * callback @fn against each element.
379  */
380 int for_each_device_domain(int (*fn)(struct device_domain_info *info,
381                                      void *data), void *data)
382 {
383         int ret = 0;
384         unsigned long flags;
385         struct device_domain_info *info;
386
387         spin_lock_irqsave(&device_domain_lock, flags);
388         list_for_each_entry(info, &device_domain_list, global) {
389                 ret = fn(info, data);
390                 if (ret) {
391                         spin_unlock_irqrestore(&device_domain_lock, flags);
392                         return ret;
393                 }
394         }
395         spin_unlock_irqrestore(&device_domain_lock, flags);
396
397         return 0;
398 }
399
400 const struct iommu_ops intel_iommu_ops;
401
402 static bool translation_pre_enabled(struct intel_iommu *iommu)
403 {
404         return (iommu->flags & VTD_FLAG_TRANS_PRE_ENABLED);
405 }
406
407 static void clear_translation_pre_enabled(struct intel_iommu *iommu)
408 {
409         iommu->flags &= ~VTD_FLAG_TRANS_PRE_ENABLED;
410 }
411
412 static void init_translation_status(struct intel_iommu *iommu)
413 {
414         u32 gsts;
415
416         gsts = readl(iommu->reg + DMAR_GSTS_REG);
417         if (gsts & DMA_GSTS_TES)
418                 iommu->flags |= VTD_FLAG_TRANS_PRE_ENABLED;
419 }
420
421 /* Convert generic 'struct iommu_domain' to private 'struct dmar_domain' */
422 static struct dmar_domain *to_dmar_domain(struct iommu_domain *dom)
423 {
424         return container_of(dom, struct dmar_domain, domain);
425 }
426
427 static int __init intel_iommu_setup(char *str)
428 {
429         if (!str)
430                 return -EINVAL;
431         while (*str) {
432                 if (!strncmp(str, "on", 2)) {
433                         dmar_disabled = 0;
434                         pr_info("IOMMU enabled\n");
435                 } else if (!strncmp(str, "off", 3)) {
436                         dmar_disabled = 1;
437                         no_platform_optin = 1;
438                         pr_info("IOMMU disabled\n");
439                 } else if (!strncmp(str, "igfx_off", 8)) {
440                         dmar_map_gfx = 0;
441                         pr_info("Disable GFX device mapping\n");
442                 } else if (!strncmp(str, "forcedac", 8)) {
443                         pr_info("Forcing DAC for PCI devices\n");
444                         dmar_forcedac = 1;
445                 } else if (!strncmp(str, "strict", 6)) {
446                         pr_info("Disable batched IOTLB flush\n");
447                         intel_iommu_strict = 1;
448                 } else if (!strncmp(str, "sp_off", 6)) {
449                         pr_info("Disable supported super page\n");
450                         intel_iommu_superpage = 0;
451                 } else if (!strncmp(str, "sm_on", 5)) {
452                         pr_info("Intel-IOMMU: scalable mode supported\n");
453                         intel_iommu_sm = 1;
454                 } else if (!strncmp(str, "tboot_noforce", 13)) {
455                         printk(KERN_INFO
456                                 "Intel-IOMMU: not forcing on after tboot. This could expose security risk for tboot\n");
457                         intel_iommu_tboot_noforce = 1;
458                 }
459
460                 str += strcspn(str, ",");
461                 while (*str == ',')
462                         str++;
463         }
464         return 0;
465 }
466 __setup("intel_iommu=", intel_iommu_setup);
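/*
 * Example usage: booting with "intel_iommu=on,strict,sp_off" enables the
 * IOMMU, disables batched IOTLB flushing and disables superpage support;
 * options are comma separated and parsed in order by the loop above.
 */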
467
468 static struct kmem_cache *iommu_domain_cache;
469 static struct kmem_cache *iommu_devinfo_cache;
470
471 static struct dmar_domain* get_iommu_domain(struct intel_iommu *iommu, u16 did)
472 {
473         struct dmar_domain **domains;
474         int idx = did >> 8;
475
476         domains = iommu->domains[idx];
477         if (!domains)
478                 return NULL;
479
480         return domains[did & 0xff];
481 }
482
483 static void set_iommu_domain(struct intel_iommu *iommu, u16 did,
484                              struct dmar_domain *domain)
485 {
486         struct dmar_domain **domains;
487         int idx = did >> 8;
488
489         if (!iommu->domains[idx]) {
490                 size_t size = 256 * sizeof(struct dmar_domain *);
491                 iommu->domains[idx] = kzalloc(size, GFP_ATOMIC);
492         }
493
494         domains = iommu->domains[idx];
495         if (WARN_ON(!domains))
496                 return;
497         else
498                 domains[did & 0xff] = domain;
499 }
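/*
 * Domain pointers are kept in a two-level table indexed by domain id:
 * iommu->domains[did >> 8][did & 0xff].  The 256-entry second-level
 * arrays are allocated lazily in set_iommu_domain().
 */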
500
501 void *alloc_pgtable_page(int node)
502 {
503         struct page *page;
504         void *vaddr = NULL;
505
506         page = alloc_pages_node(node, GFP_ATOMIC | __GFP_ZERO, 0);
507         if (page)
508                 vaddr = page_address(page);
509         return vaddr;
510 }
511
512 void free_pgtable_page(void *vaddr)
513 {
514         free_page((unsigned long)vaddr);
515 }
516
517 static inline void *alloc_domain_mem(void)
518 {
519         return kmem_cache_alloc(iommu_domain_cache, GFP_ATOMIC);
520 }
521
522 static void free_domain_mem(void *vaddr)
523 {
524         kmem_cache_free(iommu_domain_cache, vaddr);
525 }
526
527 static inline void *alloc_devinfo_mem(void)
528 {
529         return kmem_cache_alloc(iommu_devinfo_cache, GFP_ATOMIC);
530 }
531
532 static inline void free_devinfo_mem(void *vaddr)
533 {
534         kmem_cache_free(iommu_devinfo_cache, vaddr);
535 }
536
537 static inline int domain_type_is_si(struct dmar_domain *domain)
538 {
539         return domain->flags & DOMAIN_FLAG_STATIC_IDENTITY;
540 }
541
542 static inline int domain_pfn_supported(struct dmar_domain *domain,
543                                        unsigned long pfn)
544 {
545         int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
546
547         return !(addr_width < BITS_PER_LONG && pfn >> addr_width);
548 }
549
550 static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw)
551 {
552         unsigned long sagaw;
553         int agaw = -1;
554
555         sagaw = cap_sagaw(iommu->cap);
556         for (agaw = width_to_agaw(max_gaw);
557              agaw >= 0; agaw--) {
558                 if (test_bit(agaw, &sagaw))
559                         break;
560         }
561
562         return agaw;
563 }
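/*
 * Example: cap_sagaw() is a bitmap of the AGAW values the hardware
 * supports.  If only bits 1 and 2 are set, a request for the 57-bit
 * default width (agaw 3) falls back to agaw 2, i.e. a 48-bit, 4-level
 * page table.
 */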
564
565 /*
566  * Calculate max SAGAW for each iommu.
567  */
568 int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
569 {
570         return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH);
571 }
572
573 /*
574  * Calculate agaw for each iommu.
575  * "SAGAW" may be different across iommus; use a default agaw and fall
576  * back to a smaller supported agaw for iommus that lack the default.
577  */
578 int iommu_calculate_agaw(struct intel_iommu *iommu)
579 {
580         return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH);
581 }
582
583 /* This function only returns a single iommu in a domain */
584 struct intel_iommu *domain_get_iommu(struct dmar_domain *domain)
585 {
586         int iommu_id;
587
588         /* si_domain and vm domain should not get here. */
589         if (WARN_ON(domain->domain.type != IOMMU_DOMAIN_DMA))
590                 return NULL;
591
592         for_each_domain_iommu(iommu_id, domain)
593                 break;
594
595         if (iommu_id < 0 || iommu_id >= g_num_of_iommus)
596                 return NULL;
597
598         return g_iommus[iommu_id];
599 }
600
601 static void domain_update_iommu_coherency(struct dmar_domain *domain)
602 {
603         struct dmar_drhd_unit *drhd;
604         struct intel_iommu *iommu;
605         bool found = false;
606         int i;
607
608         domain->iommu_coherency = 1;
609
610         for_each_domain_iommu(i, domain) {
611                 found = true;
612                 if (!ecap_coherent(g_iommus[i]->ecap)) {
613                         domain->iommu_coherency = 0;
614                         break;
615                 }
616         }
617         if (found)
618                 return;
619
620         /* No hardware attached; use lowest common denominator */
621         rcu_read_lock();
622         for_each_active_iommu(iommu, drhd) {
623                 if (!ecap_coherent(iommu->ecap)) {
624                         domain->iommu_coherency = 0;
625                         break;
626                 }
627         }
628         rcu_read_unlock();
629 }
630
631 static int domain_update_iommu_snooping(struct intel_iommu *skip)
632 {
633         struct dmar_drhd_unit *drhd;
634         struct intel_iommu *iommu;
635         int ret = 1;
636
637         rcu_read_lock();
638         for_each_active_iommu(iommu, drhd) {
639                 if (iommu != skip) {
640                         if (!ecap_sc_support(iommu->ecap)) {
641                                 ret = 0;
642                                 break;
643                         }
644                 }
645         }
646         rcu_read_unlock();
647
648         return ret;
649 }
650
651 static int domain_update_iommu_superpage(struct intel_iommu *skip)
652 {
653         struct dmar_drhd_unit *drhd;
654         struct intel_iommu *iommu;
655         int mask = 0xf;
656
657         if (!intel_iommu_superpage) {
658                 return 0;
659         }
660
661         /* set iommu_superpage to the smallest common denominator */
662         rcu_read_lock();
663         for_each_active_iommu(iommu, drhd) {
664                 if (iommu != skip) {
665                         mask &= cap_super_page_val(iommu->cap);
666                         if (!mask)
667                                 break;
668                 }
669         }
670         rcu_read_unlock();
671
672         return fls(mask);
673 }
674
675 /* Some capabilities may be different across iommus */
676 static void domain_update_iommu_cap(struct dmar_domain *domain)
677 {
678         domain_update_iommu_coherency(domain);
679         domain->iommu_snooping = domain_update_iommu_snooping(NULL);
680         domain->iommu_superpage = domain_update_iommu_superpage(NULL);
681 }
682
683 struct context_entry *iommu_context_addr(struct intel_iommu *iommu, u8 bus,
684                                          u8 devfn, int alloc)
685 {
686         struct root_entry *root = &iommu->root_entry[bus];
687         struct context_entry *context;
688         u64 *entry;
689
690         entry = &root->lo;
691         if (sm_supported(iommu)) {
692                 if (devfn >= 0x80) {
693                         devfn -= 0x80;
694                         entry = &root->hi;
695                 }
696                 devfn *= 2;
697         }
698         if (*entry & 1)
699                 context = phys_to_virt(*entry & VTD_PAGE_MASK);
700         else {
701                 unsigned long phy_addr;
702                 if (!alloc)
703                         return NULL;
704
705                 context = alloc_pgtable_page(iommu->node);
706                 if (!context)
707                         return NULL;
708
709                 __iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
710                 phy_addr = virt_to_phys((void *)context);
711                 *entry = phy_addr | 1;
712                 __iommu_flush_cache(iommu, entry, sizeof(*entry));
713         }
714         return &context[devfn];
715 }
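/*
 * In scalable mode a root entry is split in two: devfns 0x00-0x7f are
 * looked up through root->lo and devfns 0x80-0xff through root->hi, and
 * each scalable-mode context entry is twice the legacy size, hence the
 * "devfn *= 2" before indexing the table above.
 */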
716
717 static int iommu_dummy(struct device *dev)
718 {
719         return dev->archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO;
720 }
721
722 static struct intel_iommu *device_to_iommu(struct device *dev, u8 *bus, u8 *devfn)
723 {
724         struct dmar_drhd_unit *drhd = NULL;
725         struct intel_iommu *iommu;
726         struct device *tmp;
727         struct pci_dev *ptmp, *pdev = NULL;
728         u16 segment = 0;
729         int i;
730
731         if (iommu_dummy(dev))
732                 return NULL;
733
734         if (dev_is_pci(dev)) {
735                 struct pci_dev *pf_pdev;
736
737                 pdev = to_pci_dev(dev);
738
739 #ifdef CONFIG_X86
740                 /* VMD child devices currently cannot be handled individually */
741                 if (is_vmd(pdev->bus))
742                         return NULL;
743 #endif
744
745                 /* VFs aren't listed in scope tables; we need to look up
746                  * the PF instead to find the IOMMU. */
747                 pf_pdev = pci_physfn(pdev);
748                 dev = &pf_pdev->dev;
749                 segment = pci_domain_nr(pdev->bus);
750         } else if (has_acpi_companion(dev))
751                 dev = &ACPI_COMPANION(dev)->dev;
752
753         rcu_read_lock();
754         for_each_active_iommu(iommu, drhd) {
755                 if (pdev && segment != drhd->segment)
756                         continue;
757
758                 for_each_active_dev_scope(drhd->devices,
759                                           drhd->devices_cnt, i, tmp) {
760                         if (tmp == dev) {
761                                 /* For a VF use its original BDF# not that of the PF
762                                  * which we used for the IOMMU lookup. Strictly speaking
763                                  * we could do this for all PCI devices; we only need to
764                                  * get the BDF# from the scope table for ACPI matches. */
765                                 if (pdev && pdev->is_virtfn)
766                                         goto got_pdev;
767
768                                 *bus = drhd->devices[i].bus;
769                                 *devfn = drhd->devices[i].devfn;
770                                 goto out;
771                         }
772
773                         if (!pdev || !dev_is_pci(tmp))
774                                 continue;
775
776                         ptmp = to_pci_dev(tmp);
777                         if (ptmp->subordinate &&
778                             ptmp->subordinate->number <= pdev->bus->number &&
779                             ptmp->subordinate->busn_res.end >= pdev->bus->number)
780                                 goto got_pdev;
781                 }
782
783                 if (pdev && drhd->include_all) {
784                 got_pdev:
785                         *bus = pdev->bus->number;
786                         *devfn = pdev->devfn;
787                         goto out;
788                 }
789         }
790         iommu = NULL;
791  out:
792         rcu_read_unlock();
793
794         return iommu;
795 }
796
797 static void domain_flush_cache(struct dmar_domain *domain,
798                                void *addr, int size)
799 {
800         if (!domain->iommu_coherency)
801                 clflush_cache_range(addr, size);
802 }
803
804 static int device_context_mapped(struct intel_iommu *iommu, u8 bus, u8 devfn)
805 {
806         struct context_entry *context;
807         int ret = 0;
808         unsigned long flags;
809
810         spin_lock_irqsave(&iommu->lock, flags);
811         context = iommu_context_addr(iommu, bus, devfn, 0);
812         if (context)
813                 ret = context_present(context);
814         spin_unlock_irqrestore(&iommu->lock, flags);
815         return ret;
816 }
817
818 static void free_context_table(struct intel_iommu *iommu)
819 {
820         int i;
821         unsigned long flags;
822         struct context_entry *context;
823
824         spin_lock_irqsave(&iommu->lock, flags);
825         if (!iommu->root_entry) {
826                 goto out;
827         }
828         for (i = 0; i < ROOT_ENTRY_NR; i++) {
829                 context = iommu_context_addr(iommu, i, 0, 0);
830                 if (context)
831                         free_pgtable_page(context);
832
833                 if (!sm_supported(iommu))
834                         continue;
835
836                 context = iommu_context_addr(iommu, i, 0x80, 0);
837                 if (context)
838                         free_pgtable_page(context);
839
840         }
841         free_pgtable_page(iommu->root_entry);
842         iommu->root_entry = NULL;
843 out:
844         spin_unlock_irqrestore(&iommu->lock, flags);
845 }
846
847 static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
848                                       unsigned long pfn, int *target_level)
849 {
850         struct dma_pte *parent, *pte;
851         int level = agaw_to_level(domain->agaw);
852         int offset;
853
854         BUG_ON(!domain->pgd);
855
856         if (!domain_pfn_supported(domain, pfn))
857                 /* Address beyond IOMMU's addressing capabilities. */
858                 return NULL;
859
860         parent = domain->pgd;
861
862         while (1) {
863                 void *tmp_page;
864
865                 offset = pfn_level_offset(pfn, level);
866                 pte = &parent[offset];
867                 if (!*target_level && (dma_pte_superpage(pte) || !dma_pte_present(pte)))
868                         break;
869                 if (level == *target_level)
870                         break;
871
872                 if (!dma_pte_present(pte)) {
873                         uint64_t pteval;
874
875                         tmp_page = alloc_pgtable_page(domain->nid);
876
877                         if (!tmp_page)
878                                 return NULL;
879
880                         domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE);
881                         pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE;
882                         if (cmpxchg64(&pte->val, 0ULL, pteval))
883                                 /* Someone else set it while we were thinking; use theirs. */
884                                 free_pgtable_page(tmp_page);
885                         else
886                                 domain_flush_cache(domain, pte, sizeof(*pte));
887                 }
888                 if (level == 1)
889                         break;
890
891                 parent = phys_to_virt(dma_pte_addr(pte));
892                 level--;
893         }
894
895         if (!*target_level)
896                 *target_level = level;
897
898         return pte;
899 }
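/*
 * The walk above descends from the top level towards *target_level,
 * allocating missing intermediate tables on the way.  cmpxchg64() lets
 * concurrent walkers race to install the same table without a lock; the
 * loser simply frees its freshly allocated page.  With *target_level == 0
 * the walk stops at the first superpage or non-present entry and reports
 * the level actually reached back through *target_level.
 */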
900
901
902 /* return the address's pte at a specific level */
903 static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain,
904                                          unsigned long pfn,
905                                          int level, int *large_page)
906 {
907         struct dma_pte *parent, *pte;
908         int total = agaw_to_level(domain->agaw);
909         int offset;
910
911         parent = domain->pgd;
912         while (level <= total) {
913                 offset = pfn_level_offset(pfn, total);
914                 pte = &parent[offset];
915                 if (level == total)
916                         return pte;
917
918                 if (!dma_pte_present(pte)) {
919                         *large_page = total;
920                         break;
921                 }
922
923                 if (dma_pte_superpage(pte)) {
924                         *large_page = total;
925                         return pte;
926                 }
927
928                 parent = phys_to_virt(dma_pte_addr(pte));
929                 total--;
930         }
931         return NULL;
932 }
933
934 /* clear last level pte; a tlb flush should follow */
935 static void dma_pte_clear_range(struct dmar_domain *domain,
936                                 unsigned long start_pfn,
937                                 unsigned long last_pfn)
938 {
939         unsigned int large_page;
940         struct dma_pte *first_pte, *pte;
941
942         BUG_ON(!domain_pfn_supported(domain, start_pfn));
943         BUG_ON(!domain_pfn_supported(domain, last_pfn));
944         BUG_ON(start_pfn > last_pfn);
945
946         /* we don't need lock here; nobody else touches the iova range */
947         do {
948                 large_page = 1;
949                 first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1, &large_page);
950                 if (!pte) {
951                         start_pfn = align_to_level(start_pfn + 1, large_page + 1);
952                         continue;
953                 }
954                 do {
955                         dma_clear_pte(pte);
956                         start_pfn += lvl_to_nr_pages(large_page);
957                         pte++;
958                 } while (start_pfn <= last_pfn && !first_pte_in_page(pte));
959
960                 domain_flush_cache(domain, first_pte,
961                                    (void *)pte - (void *)first_pte);
962
963         } while (start_pfn && start_pfn <= last_pfn);
964 }
965
966 static void dma_pte_free_level(struct dmar_domain *domain, int level,
967                                int retain_level, struct dma_pte *pte,
968                                unsigned long pfn, unsigned long start_pfn,
969                                unsigned long last_pfn)
970 {
971         pfn = max(start_pfn, pfn);
972         pte = &pte[pfn_level_offset(pfn, level)];
973
974         do {
975                 unsigned long level_pfn;
976                 struct dma_pte *level_pte;
977
978                 if (!dma_pte_present(pte) || dma_pte_superpage(pte))
979                         goto next;
980
981                 level_pfn = pfn & level_mask(level);
982                 level_pte = phys_to_virt(dma_pte_addr(pte));
983
984                 if (level > 2) {
985                         dma_pte_free_level(domain, level - 1, retain_level,
986                                            level_pte, level_pfn, start_pfn,
987                                            last_pfn);
988                 }
989
990                 /*
991                  * Free the page table if we're below the level we want to
992                  * retain and the range covers the entire table.
993                  */
994                 if (level < retain_level && !(start_pfn > level_pfn ||
995                       last_pfn < level_pfn + level_size(level) - 1)) {
996                         dma_clear_pte(pte);
997                         domain_flush_cache(domain, pte, sizeof(*pte));
998                         free_pgtable_page(level_pte);
999                 }
1000 next:
1001                 pfn += level_size(level);
1002         } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1003 }
1004
1005 /*
1006  * clear last level (leaf) ptes and free page table pages below the
1007  * level we wish to keep intact.
1008  */
1009 static void dma_pte_free_pagetable(struct dmar_domain *domain,
1010                                    unsigned long start_pfn,
1011                                    unsigned long last_pfn,
1012                                    int retain_level)
1013 {
1014         BUG_ON(!domain_pfn_supported(domain, start_pfn));
1015         BUG_ON(!domain_pfn_supported(domain, last_pfn));
1016         BUG_ON(start_pfn > last_pfn);
1017
1018         dma_pte_clear_range(domain, start_pfn, last_pfn);
1019
1020         /* We don't need lock here; nobody else touches the iova range */
1021         dma_pte_free_level(domain, agaw_to_level(domain->agaw), retain_level,
1022                            domain->pgd, 0, start_pfn, last_pfn);
1023
1024         /* free pgd */
1025         if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1026                 free_pgtable_page(domain->pgd);
1027                 domain->pgd = NULL;
1028         }
1029 }
1030
1031 /* When a page at a given level is being unlinked from its parent, we don't
1032    need to *modify* it at all. All we need to do is make a list of all the
1033    pages which can be freed just as soon as we've flushed the IOTLB and we
1034    know the hardware page-walk will no longer touch them.
1035    The 'pte' argument is the *parent* PTE, pointing to the page that is to
1036    be freed. */
1037 static struct page *dma_pte_list_pagetables(struct dmar_domain *domain,
1038                                             int level, struct dma_pte *pte,
1039                                             struct page *freelist)
1040 {
1041         struct page *pg;
1042
1043         pg = pfn_to_page(dma_pte_addr(pte) >> PAGE_SHIFT);
1044         pg->freelist = freelist;
1045         freelist = pg;
1046
1047         if (level == 1)
1048                 return freelist;
1049
1050         pte = page_address(pg);
1051         do {
1052                 if (dma_pte_present(pte) && !dma_pte_superpage(pte))
1053                         freelist = dma_pte_list_pagetables(domain, level - 1,
1054                                                            pte, freelist);
1055                 pte++;
1056         } while (!first_pte_in_page(pte));
1057
1058         return freelist;
1059 }
1060
1061 static struct page *dma_pte_clear_level(struct dmar_domain *domain, int level,
1062                                         struct dma_pte *pte, unsigned long pfn,
1063                                         unsigned long start_pfn,
1064                                         unsigned long last_pfn,
1065                                         struct page *freelist)
1066 {
1067         struct dma_pte *first_pte = NULL, *last_pte = NULL;
1068
1069         pfn = max(start_pfn, pfn);
1070         pte = &pte[pfn_level_offset(pfn, level)];
1071
1072         do {
1073                 unsigned long level_pfn;
1074
1075                 if (!dma_pte_present(pte))
1076                         goto next;
1077
1078                 level_pfn = pfn & level_mask(level);
1079
1080                 /* If range covers entire pagetable, free it */
1081                 if (start_pfn <= level_pfn &&
1082                     last_pfn >= level_pfn + level_size(level) - 1) {
1083                         /* These subordinate page tables are going away entirely. Don't
1084                            bother to clear them; we're just going to *free* them. */
1085                         if (level > 1 && !dma_pte_superpage(pte))
1086                                 freelist = dma_pte_list_pagetables(domain, level - 1, pte, freelist);
1087
1088                         dma_clear_pte(pte);
1089                         if (!first_pte)
1090                                 first_pte = pte;
1091                         last_pte = pte;
1092                 } else if (level > 1) {
1093                         /* Recurse down into a level that isn't *entirely* obsolete */
1094                         freelist = dma_pte_clear_level(domain, level - 1,
1095                                                        phys_to_virt(dma_pte_addr(pte)),
1096                                                        level_pfn, start_pfn, last_pfn,
1097                                                        freelist);
1098                 }
1099 next:
1100                 pfn += level_size(level);
1101         } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1102
1103         if (first_pte)
1104                 domain_flush_cache(domain, first_pte,
1105                                    (void *)++last_pte - (void *)first_pte);
1106
1107         return freelist;
1108 }
1109
1110 /* We can't just free the pages because the IOMMU may still be walking
1111    the page tables, and may have cached the intermediate levels. The
1112    pages can only be freed after the IOTLB flush has been done. */
1113 static struct page *domain_unmap(struct dmar_domain *domain,
1114                                  unsigned long start_pfn,
1115                                  unsigned long last_pfn)
1116 {
1117         struct page *freelist;
1118
1119         BUG_ON(!domain_pfn_supported(domain, start_pfn));
1120         BUG_ON(!domain_pfn_supported(domain, last_pfn));
1121         BUG_ON(start_pfn > last_pfn);
1122
1123         /* we don't need lock here; nobody else touches the iova range */
1124         freelist = dma_pte_clear_level(domain, agaw_to_level(domain->agaw),
1125                                        domain->pgd, 0, start_pfn, last_pfn, NULL);
1126
1127         /* free pgd */
1128         if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1129                 struct page *pgd_page = virt_to_page(domain->pgd);
1130                 pgd_page->freelist = freelist;
1131                 freelist = pgd_page;
1132
1133                 domain->pgd = NULL;
1134         }
1135
1136         return freelist;
1137 }
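/*
 * Typical sequence: domain_unmap() detaches the page-table pages and
 * chains them through page->freelist, the caller flushes the IOTLB, and
 * only then are the pages returned to the allocator with
 * dma_free_pagelist() (iova_entry_free() below does the same for pages
 * handed back through the IOVA code).
 */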
1138
1139 static void dma_free_pagelist(struct page *freelist)
1140 {
1141         struct page *pg;
1142
1143         while ((pg = freelist)) {
1144                 freelist = pg->freelist;
1145                 free_pgtable_page(page_address(pg));
1146         }
1147 }
1148
1149 static void iova_entry_free(unsigned long data)
1150 {
1151         struct page *freelist = (struct page *)data;
1152
1153         dma_free_pagelist(freelist);
1154 }
1155
1156 /* iommu handling */
1157 static int iommu_alloc_root_entry(struct intel_iommu *iommu)
1158 {
1159         struct root_entry *root;
1160         unsigned long flags;
1161
1162         root = (struct root_entry *)alloc_pgtable_page(iommu->node);
1163         if (!root) {
1164                 pr_err("Allocating root entry for %s failed\n",
1165                         iommu->name);
1166                 return -ENOMEM;
1167         }
1168
1169         __iommu_flush_cache(iommu, root, ROOT_SIZE);
1170
1171         spin_lock_irqsave(&iommu->lock, flags);
1172         iommu->root_entry = root;
1173         spin_unlock_irqrestore(&iommu->lock, flags);
1174
1175         return 0;
1176 }
1177
1178 static void iommu_set_root_entry(struct intel_iommu *iommu)
1179 {
1180         u64 addr;
1181         u32 sts;
1182         unsigned long flag;
1183
1184         addr = virt_to_phys(iommu->root_entry);
1185         if (sm_supported(iommu))
1186                 addr |= DMA_RTADDR_SMT;
1187
1188         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1189         dmar_writeq(iommu->reg + DMAR_RTADDR_REG, addr);
1190
1191         writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG);
1192
1193         /* Make sure hardware completes it */
1194         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1195                       readl, (sts & DMA_GSTS_RTPS), sts);
1196
1197         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1198 }
1199
1200 void iommu_flush_write_buffer(struct intel_iommu *iommu)
1201 {
1202         u32 val;
1203         unsigned long flag;
1204
1205         if (!rwbf_quirk && !cap_rwbf(iommu->cap))
1206                 return;
1207
1208         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1209         writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG);
1210
1211         /* Make sure hardware completes it */
1212         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1213                       readl, (!(val & DMA_GSTS_WBFS)), val);
1214
1215         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1216 }
1217
1218 /* return value determines whether we need a write buffer flush */
1219 static void __iommu_flush_context(struct intel_iommu *iommu,
1220                                   u16 did, u16 source_id, u8 function_mask,
1221                                   u64 type)
1222 {
1223         u64 val = 0;
1224         unsigned long flag;
1225
1226         switch (type) {
1227         case DMA_CCMD_GLOBAL_INVL:
1228                 val = DMA_CCMD_GLOBAL_INVL;
1229                 break;
1230         case DMA_CCMD_DOMAIN_INVL:
1231                 val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
1232                 break;
1233         case DMA_CCMD_DEVICE_INVL:
1234                 val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
1235                         | DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
1236                 break;
1237         default:
1238                 BUG();
1239         }
1240         val |= DMA_CCMD_ICC;
1241
1242         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1243         dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
1244
1245         /* Make sure hardware completes it */
1246         IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
1247                 dmar_readq, (!(val & DMA_CCMD_ICC)), val);
1248
1249         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1250 }
1251
1252 /* return value determines whether we need a write buffer flush */
1253 static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
1254                                 u64 addr, unsigned int size_order, u64 type)
1255 {
1256         int tlb_offset = ecap_iotlb_offset(iommu->ecap);
1257         u64 val = 0, val_iva = 0;
1258         unsigned long flag;
1259
1260         switch (type) {
1261         case DMA_TLB_GLOBAL_FLUSH:
1262                 /* global flush doesn't need to set IVA_REG */
1263                 val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
1264                 break;
1265         case DMA_TLB_DSI_FLUSH:
1266                 val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1267                 break;
1268         case DMA_TLB_PSI_FLUSH:
1269                 val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1270                 /* IH bit is passed in as part of address */
1271                 val_iva = size_order | addr;
1272                 break;
1273         default:
1274                 BUG();
1275         }
1276         /* Note: set drain read/write */
1277 #if 0
1278         /*
1279          * This is probably meant to be super secure. It looks like we can
1280          * ignore it without any impact.
1281          */
1282         if (cap_read_drain(iommu->cap))
1283                 val |= DMA_TLB_READ_DRAIN;
1284 #endif
1285         if (cap_write_drain(iommu->cap))
1286                 val |= DMA_TLB_WRITE_DRAIN;
1287
1288         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1289         /* Note: Only uses first TLB reg currently */
1290         if (val_iva)
1291                 dmar_writeq(iommu->reg + tlb_offset, val_iva);
1292         dmar_writeq(iommu->reg + tlb_offset + 8, val);
1293
1294         /* Make sure hardware completes it */
1295         IOMMU_WAIT_OP(iommu, tlb_offset + 8,
1296                 dmar_readq, (!(val & DMA_TLB_IVT)), val);
1297
1298         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1299
1300         /* check IOTLB invalidation granularity */
1301         if (DMA_TLB_IAIG(val) == 0)
1302                 pr_err("Flush IOTLB failed\n");
1303         if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
1304                 pr_debug("TLB flush request %Lx, actual %Lx\n",
1305                         (unsigned long long)DMA_TLB_IIRG(type),
1306                         (unsigned long long)DMA_TLB_IAIG(val));
1307 }
1308
1309 static struct device_domain_info *
1310 iommu_support_dev_iotlb(struct dmar_domain *domain, struct intel_iommu *iommu,
1311                          u8 bus, u8 devfn)
1312 {
1313         struct device_domain_info *info;
1314
1315         assert_spin_locked(&device_domain_lock);
1316
1317         if (!iommu->qi)
1318                 return NULL;
1319
1320         list_for_each_entry(info, &domain->devices, link)
1321                 if (info->iommu == iommu && info->bus == bus &&
1322                     info->devfn == devfn) {
1323                         if (info->ats_supported && info->dev)
1324                                 return info;
1325                         break;
1326                 }
1327
1328         return NULL;
1329 }
1330
1331 static void domain_update_iotlb(struct dmar_domain *domain)
1332 {
1333         struct device_domain_info *info;
1334         bool has_iotlb_device = false;
1335
1336         assert_spin_locked(&device_domain_lock);
1337
1338         list_for_each_entry(info, &domain->devices, link) {
1339                 struct pci_dev *pdev;
1340
1341                 if (!info->dev || !dev_is_pci(info->dev))
1342                         continue;
1343
1344                 pdev = to_pci_dev(info->dev);
1345                 if (pdev->ats_enabled) {
1346                         has_iotlb_device = true;
1347                         break;
1348                 }
1349         }
1350
1351         domain->has_iotlb_device = has_iotlb_device;
1352 }
1353
1354 static void iommu_enable_dev_iotlb(struct device_domain_info *info)
1355 {
1356         struct pci_dev *pdev;
1357
1358         assert_spin_locked(&device_domain_lock);
1359
1360         if (!info || !dev_is_pci(info->dev))
1361                 return;
1362
1363         pdev = to_pci_dev(info->dev);
1364         /* For an IOMMU that supports device IOTLB throttling (DIT), we assign
1365          * PFSID to the invalidation desc of a VF such that IOMMU HW can gauge
1366          * queue depth at PF level. If DIT is not set, PFSID will be treated as
1367          * reserved, which should be set to 0.
1368          */
1369         if (!ecap_dit(info->iommu->ecap))
1370                 info->pfsid = 0;
1371         else {
1372                 struct pci_dev *pf_pdev;
1373
1374                 /* pdev will be returned if device is not a vf */
1375                 pf_pdev = pci_physfn(pdev);
1376                 info->pfsid = pci_dev_id(pf_pdev);
1377         }
1378
1379 #ifdef CONFIG_INTEL_IOMMU_SVM
1380         /* The PCIe spec, in its wisdom, declares that the behaviour of
1381            the device if you enable PASID support after ATS support is
1382            undefined. So always enable PASID support on devices which
1383            have it, even if we can't yet know if we're ever going to
1384            use it. */
1385         if (info->pasid_supported && !pci_enable_pasid(pdev, info->pasid_supported & ~1))
1386                 info->pasid_enabled = 1;
1387
1388         if (info->pri_supported &&
1389             (info->pasid_enabled ? pci_prg_resp_pasid_required(pdev) : 1)  &&
1390             !pci_reset_pri(pdev) && !pci_enable_pri(pdev, 32))
1391                 info->pri_enabled = 1;
1392 #endif
1393         if (!pdev->untrusted && info->ats_supported &&
1394             pci_ats_page_aligned(pdev) &&
1395             !pci_enable_ats(pdev, VTD_PAGE_SHIFT)) {
1396                 info->ats_enabled = 1;
1397                 domain_update_iotlb(info->domain);
1398                 info->ats_qdep = pci_ats_queue_depth(pdev);
1399         }
1400 }
1401
1402 static void iommu_disable_dev_iotlb(struct device_domain_info *info)
1403 {
1404         struct pci_dev *pdev;
1405
1406         assert_spin_locked(&device_domain_lock);
1407
1408         if (!dev_is_pci(info->dev))
1409                 return;
1410
1411         pdev = to_pci_dev(info->dev);
1412
1413         if (info->ats_enabled) {
1414                 pci_disable_ats(pdev);
1415                 info->ats_enabled = 0;
1416                 domain_update_iotlb(info->domain);
1417         }
1418 #ifdef CONFIG_INTEL_IOMMU_SVM
1419         if (info->pri_enabled) {
1420                 pci_disable_pri(pdev);
1421                 info->pri_enabled = 0;
1422         }
1423         if (info->pasid_enabled) {
1424                 pci_disable_pasid(pdev);
1425                 info->pasid_enabled = 0;
1426         }
1427 #endif
1428 }
1429
1430 static void iommu_flush_dev_iotlb(struct dmar_domain *domain,
1431                                   u64 addr, unsigned mask)
1432 {
1433         u16 sid, qdep;
1434         unsigned long flags;
1435         struct device_domain_info *info;
1436
1437         if (!domain->has_iotlb_device)
1438                 return;
1439
1440         spin_lock_irqsave(&device_domain_lock, flags);
1441         list_for_each_entry(info, &domain->devices, link) {
1442                 if (!info->ats_enabled)
1443                         continue;
1444
1445                 sid = info->bus << 8 | info->devfn;
1446                 qdep = info->ats_qdep;
1447                 qi_flush_dev_iotlb(info->iommu, sid, info->pfsid,
1448                                 qdep, addr, mask);
1449         }
1450         spin_unlock_irqrestore(&device_domain_lock, flags);
1451 }
1452
1453 static void iommu_flush_iotlb_psi(struct intel_iommu *iommu,
1454                                   struct dmar_domain *domain,
1455                                   unsigned long pfn, unsigned int pages,
1456                                   int ih, int map)
1457 {
1458         unsigned int mask = ilog2(__roundup_pow_of_two(pages));
1459         uint64_t addr = (uint64_t)pfn << VTD_PAGE_SHIFT;
1460         u16 did = domain->iommu_did[iommu->seq_id];
1461
1462         BUG_ON(pages == 0);
1463
1464         if (ih)
1465                 ih = 1 << 6;
1466         /*
1467          * Fall back to domain-selective flush if there is no PSI support or
1468          * the size is too big.
1469          * PSI requires the page size to be 2 ^ x, and the base address to be
1470          * naturally aligned to that size.
1471          */
1472         if (!cap_pgsel_inv(iommu->cap) || mask > cap_max_amask_val(iommu->cap))
1473                 iommu->flush.flush_iotlb(iommu, did, 0, 0,
1474                                                 DMA_TLB_DSI_FLUSH);
1475         else
1476                 iommu->flush.flush_iotlb(iommu, did, addr | ih, mask,
1477                                                 DMA_TLB_PSI_FLUSH);
1478
1479         /*
1480          * In caching mode, changes of pages from non-present to present require
1481          * a flush. However, the device IOTLB doesn't need to be flushed in this case.
1482          */
1483         if (!cap_caching_mode(iommu->cap) || !map)
1484                 iommu_flush_dev_iotlb(domain, addr, mask);
1485 }
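/*
 * Example: flushing 16 pages gives mask == 4, i.e. a 64KiB (2^4 * 4KiB)
 * invalidation with addr aligned to that size; requests the hardware
 * cannot express this way fall back to the domain-selective flush above.
 */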
1486
1487 /* Notification for newly created mappings */
1488 static inline void __mapping_notify_one(struct intel_iommu *iommu,
1489                                         struct dmar_domain *domain,
1490                                         unsigned long pfn, unsigned int pages)
1491 {
1492         /* It's a non-present to present mapping. Only flush if in caching mode */
1493         if (cap_caching_mode(iommu->cap))
1494                 iommu_flush_iotlb_psi(iommu, domain, pfn, pages, 0, 1);
1495         else
1496                 iommu_flush_write_buffer(iommu);
1497 }
1498
1499 static void iommu_flush_iova(struct iova_domain *iovad)
1500 {
1501         struct dmar_domain *domain;
1502         int idx;
1503
1504         domain = container_of(iovad, struct dmar_domain, iovad);
1505
1506         for_each_domain_iommu(idx, domain) {
1507                 struct intel_iommu *iommu = g_iommus[idx];
1508                 u16 did = domain->iommu_did[iommu->seq_id];
1509
1510                 iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH);
1511
1512                 if (!cap_caching_mode(iommu->cap))
1513                         iommu_flush_dev_iotlb(get_iommu_domain(iommu, did),
1514                                               0, MAX_AGAW_PFN_WIDTH);
1515         }
1516 }
1517
1518 static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
1519 {
1520         u32 pmen;
1521         unsigned long flags;
1522
1523         if (!cap_plmr(iommu->cap) && !cap_phmr(iommu->cap))
1524                 return;
1525
1526         raw_spin_lock_irqsave(&iommu->register_lock, flags);
1527         pmen = readl(iommu->reg + DMAR_PMEN_REG);
1528         pmen &= ~DMA_PMEN_EPM;
1529         writel(pmen, iommu->reg + DMAR_PMEN_REG);
1530
1531         /* wait for the protected region status bit to clear */
1532         IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
1533                 readl, !(pmen & DMA_PMEN_PRS), pmen);
1534
1535         raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1536 }
1537
1538 static void iommu_enable_translation(struct intel_iommu *iommu)
1539 {
1540         u32 sts;
1541         unsigned long flags;
1542
1543         raw_spin_lock_irqsave(&iommu->register_lock, flags);
1544         iommu->gcmd |= DMA_GCMD_TE;
1545         writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1546
1547         /* Make sure hardware completes it */
1548         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1549                       readl, (sts & DMA_GSTS_TES), sts);
1550
1551         raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1552 }
1553
1554 static void iommu_disable_translation(struct intel_iommu *iommu)
1555 {
1556         u32 sts;
1557         unsigned long flag;
1558
1559         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1560         iommu->gcmd &= ~DMA_GCMD_TE;
1561         writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1562
1563         /* Make sure hardware completes it */
1564         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1565                       readl, (!(sts & DMA_GSTS_TES)), sts);
1566
1567         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1568 }
1569
1571 static int iommu_init_domains(struct intel_iommu *iommu)
1572 {
1573         u32 ndomains, nlongs;
1574         size_t size;
1575
1576         ndomains = cap_ndoms(iommu->cap);
1577         pr_debug("%s: Number of Domains supported <%d>\n",
1578                  iommu->name, ndomains);
1579         nlongs = BITS_TO_LONGS(ndomains);
1580
1581         spin_lock_init(&iommu->lock);
1582
1583         iommu->domain_ids = kcalloc(nlongs, sizeof(unsigned long), GFP_KERNEL);
1584         if (!iommu->domain_ids) {
1585                 pr_err("%s: Allocating domain id array failed\n",
1586                        iommu->name);
1587                 return -ENOMEM;
1588         }
1589
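        /*
         * iommu->domains is a two-level array: one pointer per chunk of
         * 256 domains at the top level. Only chunk 0 is allocated here;
         * further chunks are allocated on demand when higher domain ids
         * are used.
         */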
1590         size = (ALIGN(ndomains, 256) >> 8) * sizeof(struct dmar_domain **);
1591         iommu->domains = kzalloc(size, GFP_KERNEL);
1592
1593         if (iommu->domains) {
1594                 size = 256 * sizeof(struct dmar_domain *);
1595                 iommu->domains[0] = kzalloc(size, GFP_KERNEL);
1596         }
1597
1598         if (!iommu->domains || !iommu->domains[0]) {
1599                 pr_err("%s: Allocating domain array failed\n",
1600                        iommu->name);
1601                 kfree(iommu->domain_ids);
1602                 kfree(iommu->domains);
1603                 iommu->domain_ids = NULL;
1604                 iommu->domains    = NULL;
1605                 return -ENOMEM;
1606         }
1607
1610         /*
1611          * If Caching mode is set, then invalid translations are tagged
1612          * with domain-id 0, hence we need to pre-allocate it. We also
1613          * use domain-id 0 as a marker for non-allocated domain-id, so
1614          * make sure it is not used for a real domain.
1615          */
1616         set_bit(0, iommu->domain_ids);
1617
1618         /*
1619          * VT-d spec rev 3.0 (section 6.2.3.1) requires that each PASID
1620          * entry for first-level or pass-through translation modes should
1621          * be programmed with a domain id different from those used for
1622          * second-level or nested translation. We reserve a domain id for
1623          * this purpose.
1624          */
1625         if (sm_supported(iommu))
1626                 set_bit(FLPT_DEFAULT_DID, iommu->domain_ids);
1627
1628         return 0;
1629 }
1630
1631 static void disable_dmar_iommu(struct intel_iommu *iommu)
1632 {
1633         struct device_domain_info *info, *tmp;
1634         unsigned long flags;
1635
1636         if (!iommu->domains || !iommu->domain_ids)
1637                 return;
1638
1639         spin_lock_irqsave(&device_domain_lock, flags);
1640         list_for_each_entry_safe(info, tmp, &device_domain_list, global) {
1641                 struct dmar_domain *domain;
1642
1643                 if (info->iommu != iommu)
1644                         continue;
1645
1646                 if (!info->dev || !info->domain)
1647                         continue;
1648
1649                 domain = info->domain;
1650
1651                 __dmar_remove_one_dev_info(info);
1652         }
1653         spin_unlock_irqrestore(&device_domain_lock, flags);
1654
1655         if (iommu->gcmd & DMA_GCMD_TE)
1656                 iommu_disable_translation(iommu);
1657 }
1658
1659 static void free_dmar_iommu(struct intel_iommu *iommu)
1660 {
1661         if ((iommu->domains) && (iommu->domain_ids)) {
1662                 int elems = ALIGN(cap_ndoms(iommu->cap), 256) >> 8;
1663                 int i;
1664
1665                 for (i = 0; i < elems; i++)
1666                         kfree(iommu->domains[i]);
1667                 kfree(iommu->domains);
1668                 kfree(iommu->domain_ids);
1669                 iommu->domains = NULL;
1670                 iommu->domain_ids = NULL;
1671         }
1672
1673         g_iommus[iommu->seq_id] = NULL;
1674
1675         /* free context mapping */
1676         free_context_table(iommu);
1677
1678 #ifdef CONFIG_INTEL_IOMMU_SVM
1679         if (pasid_supported(iommu)) {
1680                 if (ecap_prs(iommu->ecap))
1681                         intel_svm_finish_prq(iommu);
1682         }
1683 #endif
1684 }
1685
1686 static struct dmar_domain *alloc_domain(int flags)
1687 {
1688         struct dmar_domain *domain;
1689
1690         domain = alloc_domain_mem();
1691         if (!domain)
1692                 return NULL;
1693
1694         memset(domain, 0, sizeof(*domain));
1695         domain->nid = NUMA_NO_NODE;
1696         domain->flags = flags;
1697         domain->has_iotlb_device = false;
1698         INIT_LIST_HEAD(&domain->devices);
1699
1700         return domain;
1701 }
1702
1703 /* Must be called with device_domain_lock and iommu->lock held */
1704 static int domain_attach_iommu(struct dmar_domain *domain,
1705                                struct intel_iommu *iommu)
1706 {
1707         unsigned long ndomains;
1708         int num;
1709
1710         assert_spin_locked(&device_domain_lock);
1711         assert_spin_locked(&iommu->lock);
1712
1713         domain->iommu_refcnt[iommu->seq_id] += 1;
1714         domain->iommu_count += 1;
1715         if (domain->iommu_refcnt[iommu->seq_id] == 1) {
1716                 ndomains = cap_ndoms(iommu->cap);
1717                 num      = find_first_zero_bit(iommu->domain_ids, ndomains);
1718
1719                 if (num >= ndomains) {
1720                         pr_err("%s: No free domain ids\n", iommu->name);
1721                         domain->iommu_refcnt[iommu->seq_id] -= 1;
1722                         domain->iommu_count -= 1;
1723                         return -ENOSPC;
1724                 }
1725
1726                 set_bit(num, iommu->domain_ids);
1727                 set_iommu_domain(iommu, num, domain);
1728
1729                 domain->iommu_did[iommu->seq_id] = num;
1730                 domain->nid                      = iommu->node;
1731
1732                 domain_update_iommu_cap(domain);
1733         }
1734
1735         return 0;
1736 }
1737
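/*
 * Companion to domain_attach_iommu(); must be called with
 * device_domain_lock and iommu->lock held. Drops one reference and
 * releases the domain id on this IOMMU when the last reference goes.
 * Returns the domain's remaining attachment count.
 */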
1738 static int domain_detach_iommu(struct dmar_domain *domain,
1739                                struct intel_iommu *iommu)
1740 {
1741         int num, count;
1742
1743         assert_spin_locked(&device_domain_lock);
1744         assert_spin_locked(&iommu->lock);
1745
1746         domain->iommu_refcnt[iommu->seq_id] -= 1;
1747         count = --domain->iommu_count;
1748         if (domain->iommu_refcnt[iommu->seq_id] == 0) {
1749                 num = domain->iommu_did[iommu->seq_id];
1750                 clear_bit(num, iommu->domain_ids);
1751                 set_iommu_domain(iommu, num, NULL);
1752
1753                 domain_update_iommu_cap(domain);
1754                 domain->iommu_did[iommu->seq_id] = 0;
1755         }
1756
1757         return count;
1758 }
1759
1760 static struct iova_domain reserved_iova_list;
1761 static struct lock_class_key reserved_rbtree_key;
1762
1763 static int dmar_init_reserved_ranges(void)
1764 {
1765         struct pci_dev *pdev = NULL;
1766         struct iova *iova;
1767         int i;
1768
1769         init_iova_domain(&reserved_iova_list, VTD_PAGE_SIZE, IOVA_START_PFN);
1770
1771         lockdep_set_class(&reserved_iova_list.iova_rbtree_lock,
1772                 &reserved_rbtree_key);
1773
1774         /* IOAPIC ranges shouldn't be accessed by DMA */
1775         iova = reserve_iova(&reserved_iova_list, IOVA_PFN(IOAPIC_RANGE_START),
1776                 IOVA_PFN(IOAPIC_RANGE_END));
1777         if (!iova) {
1778                 pr_err("Reserve IOAPIC range failed\n");
1779                 return -ENODEV;
1780         }
1781
1782         /* Reserve all PCI MMIO to avoid peer-to-peer access */
1783         for_each_pci_dev(pdev) {
1784                 struct resource *r;
1785
1786                 for (i = 0; i < PCI_NUM_RESOURCES; i++) {
1787                         r = &pdev->resource[i];
1788                         if (!r->flags || !(r->flags & IORESOURCE_MEM))
1789                                 continue;
1790                         iova = reserve_iova(&reserved_iova_list,
1791                                             IOVA_PFN(r->start),
1792                                             IOVA_PFN(r->end));
1793                         if (!iova) {
1794                                 pci_err(pdev, "Reserve iova for %pR failed\n", r);
1795                                 return -ENODEV;
1796                         }
1797                 }
1798         }
1799         return 0;
1800 }
1801
1802 static void domain_reserve_special_ranges(struct dmar_domain *domain)
1803 {
1804         copy_reserved_iova(&reserved_iova_list, &domain->iovad);
1805 }
1806
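/*
 * Round a guest address width up to an adjusted width the page tables can
 * represent: 12 bits of page offset plus a whole number of 9-bit levels,
 * capped at 64. For example, a 40-bit guest width rounds up to 48 bits.
 */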
1807 static inline int guestwidth_to_adjustwidth(int gaw)
1808 {
1809         int agaw;
1810         int r = (gaw - 12) % 9;
1811
1812         if (r == 0)
1813                 agaw = gaw;
1814         else
1815                 agaw = gaw + 9 - r;
1816         if (agaw > 64)
1817                 agaw = 64;
1818         return agaw;
1819 }
1820
1821 static int domain_init(struct dmar_domain *domain, struct intel_iommu *iommu,
1822                        int guest_width)
1823 {
1824         int adjust_width, agaw;
1825         unsigned long sagaw;
1826         int err;
1827
1828         init_iova_domain(&domain->iovad, VTD_PAGE_SIZE, IOVA_START_PFN);
1829
1830         err = init_iova_flush_queue(&domain->iovad,
1831                                     iommu_flush_iova, iova_entry_free);
1832         if (err)
1833                 return err;
1834
1835         domain_reserve_special_ranges(domain);
1836
1837         /* calculate AGAW */
1838         if (guest_width > cap_mgaw(iommu->cap))
1839                 guest_width = cap_mgaw(iommu->cap);
1840         domain->gaw = guest_width;
1841         adjust_width = guestwidth_to_adjustwidth(guest_width);
1842         agaw = width_to_agaw(adjust_width);
1843         sagaw = cap_sagaw(iommu->cap);
1844         if (!test_bit(agaw, &sagaw)) {
1845                 /* hardware doesn't support it, choose a bigger one */
1846                 pr_debug("Hardware doesn't support agaw %d\n", agaw);
1847                 agaw = find_next_bit(&sagaw, 5, agaw);
1848                 if (agaw >= 5)
1849                         return -ENODEV;
1850         }
1851         domain->agaw = agaw;
1852
1853         if (ecap_coherent(iommu->ecap))
1854                 domain->iommu_coherency = 1;
1855         else
1856                 domain->iommu_coherency = 0;
1857
1858         if (ecap_sc_support(iommu->ecap))
1859                 domain->iommu_snooping = 1;
1860         else
1861                 domain->iommu_snooping = 0;
1862
1863         if (intel_iommu_superpage)
1864                 domain->iommu_superpage = fls(cap_super_page_val(iommu->cap));
1865         else
1866                 domain->iommu_superpage = 0;
1867
1868         domain->nid = iommu->node;
1869
1870         /* always allocate the top pgd */
1871         domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
1872         if (!domain->pgd)
1873                 return -ENOMEM;
1874         __iommu_flush_cache(iommu, domain->pgd, PAGE_SIZE);
1875         return 0;
1876 }
1877
1878 static void domain_exit(struct dmar_domain *domain)
1879 {
1880         struct page *freelist;
1881
1882         /* Remove associated devices and clear attached or cached domains */
1883         domain_remove_dev_info(domain);
1884
1885         /* destroy iovas */
1886         put_iova_domain(&domain->iovad);
1887
1888         freelist = domain_unmap(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
1889
1890         dma_free_pagelist(freelist);
1891
1892         free_domain_mem(domain);
1893 }
1894
1895 /*
1896  * Get the PASID directory size for scalable mode context entry.
1897  * Value of X in the PDTS field of a scalable mode context entry
1898  * indicates PASID directory with 2^(X + 7) entries.
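 * For example, with 64 PASIDs covered by each directory entry, a 20-bit
 * PASID space needs 2^14 directory entries, i.e. X = 7.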
1899  */
1900 static inline unsigned long context_get_sm_pds(struct pasid_table *table)
1901 {
1902         int pds, max_pde;
1903
1904         max_pde = table->max_pasid >> PASID_PDE_SHIFT;
1905         pds = find_first_bit((unsigned long *)&max_pde, MAX_NR_PASID_BITS);
1906         if (pds < 7)
1907                 return 0;
1908
1909         return pds - 7;
1910 }
1911
1912 /*
1913  * Set the RID_PASID field of a scalable mode context entry. The
1914  * IOMMU hardware will use the PASID value set in this field for
1915  * DMA translations of DMA requests without PASID.
1916  */
1917 static inline void
1918 context_set_sm_rid2pasid(struct context_entry *context, unsigned long pasid)
1919 {
1920         context->hi |= pasid & ((1 << 20) - 1);
1921         context->hi |= (1 << 20);
1922 }
1923
1924 /*
1925  * Set the DTE(Device-TLB Enable) field of a scalable mode context
1926  * entry.
1927  */
1928 static inline void context_set_sm_dte(struct context_entry *context)
1929 {
1930         context->lo |= (1 << 2);
1931 }
1932
1933 /*
1934  * Set the PRE(Page Request Enable) field of a scalable mode context
1935  * entry.
1936  */
1937 static inline void context_set_sm_pre(struct context_entry *context)
1938 {
1939         context->lo |= (1 << 4);
1940 }
1941
1942 /* Convert value to context PASID directory size field coding. */
1943 #define context_pdts(pds)       (((pds) & 0x7) << 9)
1944
1945 static int domain_context_mapping_one(struct dmar_domain *domain,
1946                                       struct intel_iommu *iommu,
1947                                       struct pasid_table *table,
1948                                       u8 bus, u8 devfn)
1949 {
1950         u16 did = domain->iommu_did[iommu->seq_id];
1951         int translation = CONTEXT_TT_MULTI_LEVEL;
1952         struct device_domain_info *info = NULL;
1953         struct context_entry *context;
1954         unsigned long flags;
1955         int ret;
1956
1957         WARN_ON(did == 0);
1958
1959         if (hw_pass_through && domain_type_is_si(domain))
1960                 translation = CONTEXT_TT_PASS_THROUGH;
1961
1962         pr_debug("Set context mapping for %02x:%02x.%d\n",
1963                 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1964
1965         BUG_ON(!domain->pgd);
1966
1967         spin_lock_irqsave(&device_domain_lock, flags);
1968         spin_lock(&iommu->lock);
1969
1970         ret = -ENOMEM;
1971         context = iommu_context_addr(iommu, bus, devfn, 1);
1972         if (!context)
1973                 goto out_unlock;
1974
1975         ret = 0;
1976         if (context_present(context))
1977                 goto out_unlock;
1978
1979         /*
1980          * For kdump cases, old valid entries may be cached due to the
1981          * in-flight DMA and copied pgtable, but there is no unmapping
1982          * behaviour for them, thus we need an explicit cache flush for
1983          * the newly-mapped device. For kdump, at this point, the device
1984          * is supposed to have finished reset at its driver probe stage, so no
1985          * in-flight DMA will exist, and we don't need to worry about it
1986          * hereafter.
1987          */
1988         if (context_copied(context)) {
1989                 u16 did_old = context_domain_id(context);
1990
1991                 if (did_old < cap_ndoms(iommu->cap)) {
1992                         iommu->flush.flush_context(iommu, did_old,
1993                                                    (((u16)bus) << 8) | devfn,
1994                                                    DMA_CCMD_MASK_NOBIT,
1995                                                    DMA_CCMD_DEVICE_INVL);
1996                         iommu->flush.flush_iotlb(iommu, did_old, 0, 0,
1997                                                  DMA_TLB_DSI_FLUSH);
1998                 }
1999         }
2000
2001         context_clear_entry(context);
2002
2003         if (sm_supported(iommu)) {
2004                 unsigned long pds;
2005
2006                 WARN_ON(!table);
2007
2008                 /* Setup the PASID DIR pointer: */
2009                 pds = context_get_sm_pds(table);
2010                 context->lo = (u64)virt_to_phys(table->table) |
2011                                 context_pdts(pds);
2012
2013                 /* Setup the RID_PASID field: */
2014                 context_set_sm_rid2pasid(context, PASID_RID2PASID);
2015
2016                 /*
2017                  * Setup the Device-TLB enable bit and Page request
2018                  * Enable bit:
2019                  */
2020                 info = iommu_support_dev_iotlb(domain, iommu, bus, devfn);
2021                 if (info && info->ats_supported)
2022                         context_set_sm_dte(context);
2023                 if (info && info->pri_supported)
2024                         context_set_sm_pre(context);
2025         } else {
2026                 struct dma_pte *pgd = domain->pgd;
2027                 int agaw;
2028
2029                 context_set_domain_id(context, did);
2030
2031                 if (translation != CONTEXT_TT_PASS_THROUGH) {
2032                         /*
2033                          * Skip top levels of page tables for iommu which has
2034                          * less agaw than default. Unnecessary for PT mode.
2035                          */
2036                         for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) {
2037                                 ret = -ENOMEM;
2038                                 pgd = phys_to_virt(dma_pte_addr(pgd));
2039                                 if (!dma_pte_present(pgd))
2040                                         goto out_unlock;
2041                         }
2042
2043                         info = iommu_support_dev_iotlb(domain, iommu, bus, devfn);
2044                         if (info && info->ats_supported)
2045                                 translation = CONTEXT_TT_DEV_IOTLB;
2046                         else
2047                                 translation = CONTEXT_TT_MULTI_LEVEL;
2048
2049                         context_set_address_root(context, virt_to_phys(pgd));
2050                         context_set_address_width(context, agaw);
2051                 } else {
2052                         /*
2053                          * In pass through mode, AW must be programmed to
2054                          * indicate the largest AGAW value supported by
2055                          * hardware. And ASR is ignored by hardware.
2056                          */
2057                         context_set_address_width(context, iommu->msagaw);
2058                 }
2059
2060                 context_set_translation_type(context, translation);
2061         }
2062
2063         context_set_fault_enable(context);
2064         context_set_present(context);
2065         domain_flush_cache(domain, context, sizeof(*context));
2066
2067         /*
2068          * It's a non-present to present mapping. If hardware doesn't cache
2069          * non-present entries we only need to flush the write-buffer. If the
2070          * hardware _does_ cache non-present entries, then it does so in the special
2071          * domain #0, which we have to flush:
2072          */
2073         if (cap_caching_mode(iommu->cap)) {
2074                 iommu->flush.flush_context(iommu, 0,
2075                                            (((u16)bus) << 8) | devfn,
2076                                            DMA_CCMD_MASK_NOBIT,
2077                                            DMA_CCMD_DEVICE_INVL);
2078                 iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH);
2079         } else {
2080                 iommu_flush_write_buffer(iommu);
2081         }
2082         iommu_enable_dev_iotlb(info);
2083
2084         ret = 0;
2085
2086 out_unlock:
2087         spin_unlock(&iommu->lock);
2088         spin_unlock_irqrestore(&device_domain_lock, flags);
2089
2090         return ret;
2091 }
2092
2093 struct domain_context_mapping_data {
2094         struct dmar_domain *domain;
2095         struct intel_iommu *iommu;
2096         struct pasid_table *table;
2097 };
2098
2099 static int domain_context_mapping_cb(struct pci_dev *pdev,
2100                                      u16 alias, void *opaque)
2101 {
2102         struct domain_context_mapping_data *data = opaque;
2103
2104         return domain_context_mapping_one(data->domain, data->iommu,
2105                                           data->table, PCI_BUS_NUM(alias),
2106                                           alias & 0xff);
2107 }
2108
2109 static int
2110 domain_context_mapping(struct dmar_domain *domain, struct device *dev)
2111 {
2112         struct domain_context_mapping_data data;
2113         struct pasid_table *table;
2114         struct intel_iommu *iommu;
2115         u8 bus, devfn;
2116
2117         iommu = device_to_iommu(dev, &bus, &devfn);
2118         if (!iommu)
2119                 return -ENODEV;
2120
2121         table = intel_pasid_get_table(dev);
2122
2123         if (!dev_is_pci(dev))
2124                 return domain_context_mapping_one(domain, iommu, table,
2125                                                   bus, devfn);
2126
2127         data.domain = domain;
2128         data.iommu = iommu;
2129         data.table = table;
2130
2131         return pci_for_each_dma_alias(to_pci_dev(dev),
2132                                       &domain_context_mapping_cb, &data);
2133 }
2134
2135 static int domain_context_mapped_cb(struct pci_dev *pdev,
2136                                     u16 alias, void *opaque)
2137 {
2138         struct intel_iommu *iommu = opaque;
2139
2140         return !device_context_mapped(iommu, PCI_BUS_NUM(alias), alias & 0xff);
2141 }
2142
2143 static int domain_context_mapped(struct device *dev)
2144 {
2145         struct intel_iommu *iommu;
2146         u8 bus, devfn;
2147
2148         iommu = device_to_iommu(dev, &bus, &devfn);
2149         if (!iommu)
2150                 return -ENODEV;
2151
2152         if (!dev_is_pci(dev))
2153                 return device_context_mapped(iommu, bus, devfn);
2154
2155         return !pci_for_each_dma_alias(to_pci_dev(dev),
2156                                        domain_context_mapped_cb, iommu);
2157 }
2158
2159 /* Return the number of VT-d pages covered, rounded up to the MM page size */
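/*
 * e.g. with 4KiB pages, an 0x800-byte offset plus a 0x1000-byte length
 * spans two pages, so this returns 2.
 */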
2160 static inline unsigned long aligned_nrpages(unsigned long host_addr,
2161                                             size_t size)
2162 {
2163         host_addr &= ~PAGE_MASK;
2164         return PAGE_ALIGN(host_addr + size) >> VTD_PAGE_SHIFT;
2165 }
2166
2167 /* Return largest possible superpage level for a given mapping */
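/*
 * e.g. with only 2MiB superpages supported, a mapping whose IOVA and
 * physical PFNs are both 512-page aligned and which covers at least 512
 * pages is given level 2.
 */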
2168 static inline int hardware_largepage_caps(struct dmar_domain *domain,
2169                                           unsigned long iov_pfn,
2170                                           unsigned long phy_pfn,
2171                                           unsigned long pages)
2172 {
2173         int support, level = 1;
2174         unsigned long pfnmerge;
2175
2176         support = domain->iommu_superpage;
2177
2178         /* To use a large page, the virtual *and* physical addresses
2179            must be aligned to 2MiB/1GiB/etc. Lower bits set in either
2180            of them will mean we have to use smaller pages. So just
2181            merge them and check both at once. */
2182         pfnmerge = iov_pfn | phy_pfn;
2183
2184         while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) {
2185                 pages >>= VTD_STRIDE_SHIFT;
2186                 if (!pages)
2187                         break;
2188                 pfnmerge >>= VTD_STRIDE_SHIFT;
2189                 level++;
2190                 support--;
2191         }
2192         return level;
2193 }
2194
2195 static int __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2196                             struct scatterlist *sg, unsigned long phys_pfn,
2197                             unsigned long nr_pages, int prot)
2198 {
2199         struct dma_pte *first_pte = NULL, *pte = NULL;
2200         phys_addr_t uninitialized_var(pteval);
2201         unsigned long sg_res = 0;
2202         unsigned int largepage_lvl = 0;
2203         unsigned long lvl_pages = 0;
2204
2205         BUG_ON(!domain_pfn_supported(domain, iov_pfn + nr_pages - 1));
2206
2207         if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
2208                 return -EINVAL;
2209
2210         prot &= DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP;
2211
2212         if (!sg) {
2213                 sg_res = nr_pages;
2214                 pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | prot;
2215         }
2216
2217         while (nr_pages > 0) {
2218                 uint64_t tmp;
2219
2220                 if (!sg_res) {
2221                         unsigned int pgoff = sg->offset & ~PAGE_MASK;
2222
2223                         sg_res = aligned_nrpages(sg->offset, sg->length);
2224                         sg->dma_address = ((dma_addr_t)iov_pfn << VTD_PAGE_SHIFT) + pgoff;
2225                         sg->dma_length = sg->length;
2226                         pteval = (sg_phys(sg) - pgoff) | prot;
2227                         phys_pfn = pteval >> VTD_PAGE_SHIFT;
2228                 }
2229
2230                 if (!pte) {
2231                         largepage_lvl = hardware_largepage_caps(domain, iov_pfn, phys_pfn, sg_res);
2232
2233                         first_pte = pte = pfn_to_dma_pte(domain, iov_pfn, &largepage_lvl);
2234                         if (!pte)
2235                                 return -ENOMEM;
2236                         /* It is a large page */
2237                         if (largepage_lvl > 1) {
2238                                 unsigned long nr_superpages, end_pfn;
2239
2240                                 pteval |= DMA_PTE_LARGE_PAGE;
2241                                 lvl_pages = lvl_to_nr_pages(largepage_lvl);
2242
2243                                 nr_superpages = sg_res / lvl_pages;
2244                                 end_pfn = iov_pfn + nr_superpages * lvl_pages - 1;
2245
2246                                 /*
2247                                  * Ensure that old small page tables are
2248                                  * removed to make room for superpage(s).
2249                                  * We're adding new large pages, so make sure
2250                                  * we don't remove their parent tables.
2251                                  */
2252                                 dma_pte_free_pagetable(domain, iov_pfn, end_pfn,
2253                                                        largepage_lvl + 1);
2254                         } else {
2255                                 pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE;
2256                         }
2257
2258                 }
2259                 /* We don't need a lock here; nobody else
2260                  * touches this IOVA range.
2261                  */
2262                 tmp = cmpxchg64_local(&pte->val, 0ULL, pteval);
2263                 if (tmp) {
2264                         static int dumps = 5;
2265                         pr_crit("ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n",
2266                                 iov_pfn, tmp, (unsigned long long)pteval);
2267                         if (dumps) {
2268                                 dumps--;
2269                                 debug_dma_dump_mappings(NULL);
2270                         }
2271                         WARN_ON(1);
2272                 }
2273
2274                 lvl_pages = lvl_to_nr_pages(largepage_lvl);
2275
2276                 BUG_ON(nr_pages < lvl_pages);
2277                 BUG_ON(sg_res < lvl_pages);
2278
2279                 nr_pages -= lvl_pages;
2280                 iov_pfn += lvl_pages;
2281                 phys_pfn += lvl_pages;
2282                 pteval += lvl_pages * VTD_PAGE_SIZE;
2283                 sg_res -= lvl_pages;
2284
2285                 /* If the next PTE would be the first in a new page, then we
2286                    need to flush the cache on the entries we've just written.
2287                    And then we'll need to recalculate 'pte', so clear it and
2288                    let it get set again in the if (!pte) block above.
2289
2290                    If we're done (!nr_pages) we need to flush the cache too.
2291
2292                    Also if we've been setting superpages, we may need to
2293                    recalculate 'pte' and switch back to smaller pages for the
2294                    end of the mapping, if the trailing size is not enough to
2295                    use another superpage (i.e. sg_res < lvl_pages). */
2296                 pte++;
2297                 if (!nr_pages || first_pte_in_page(pte) ||
2298                     (largepage_lvl > 1 && sg_res < lvl_pages)) {
2299                         domain_flush_cache(domain, first_pte,
2300                                            (void *)pte - (void *)first_pte);
2301                         pte = NULL;
2302                 }
2303
2304                 if (!sg_res && nr_pages)
2305                         sg = sg_next(sg);
2306         }
2307         return 0;
2308 }
2309
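/*
 * Map the range, then notify every IOMMU attached to the domain of the
 * non-present to present change (PSI flush in caching mode, write-buffer
 * flush otherwise).
 */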
2310 static int domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2311                           struct scatterlist *sg, unsigned long phys_pfn,
2312                           unsigned long nr_pages, int prot)
2313 {
2314         int iommu_id, ret;
2315         struct intel_iommu *iommu;
2316
2317         /* Do the real mapping first */
2318         ret = __domain_mapping(domain, iov_pfn, sg, phys_pfn, nr_pages, prot);
2319         if (ret)
2320                 return ret;
2321
2322         for_each_domain_iommu(iommu_id, domain) {
2323                 iommu = g_iommus[iommu_id];
2324                 __mapping_notify_one(iommu, domain, iov_pfn, nr_pages);
2325         }
2326
2327         return 0;
2328 }
2329
2330 static inline int domain_sg_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2331                                     struct scatterlist *sg, unsigned long nr_pages,
2332                                     int prot)
2333 {
2334         return domain_mapping(domain, iov_pfn, sg, 0, nr_pages, prot);
2335 }
2336
2337 static inline int domain_pfn_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2338                                      unsigned long phys_pfn, unsigned long nr_pages,
2339                                      int prot)
2340 {
2341         return domain_mapping(domain, iov_pfn, NULL, phys_pfn, nr_pages, prot);
2342 }
2343
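/*
 * Clear the context entry for (bus, devfn) and invalidate the context
 * cache and IOTLB for the domain id it was using.
 */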
2344 static void domain_context_clear_one(struct intel_iommu *iommu, u8 bus, u8 devfn)
2345 {
2346         unsigned long flags;
2347         struct context_entry *context;
2348         u16 did_old;
2349
2350         if (!iommu)
2351                 return;
2352
2353         spin_lock_irqsave(&iommu->lock, flags);
2354         context = iommu_context_addr(iommu, bus, devfn, 0);
2355         if (!context) {
2356                 spin_unlock_irqrestore(&iommu->lock, flags);
2357                 return;
2358         }
2359         did_old = context_domain_id(context);
2360         context_clear_entry(context);
2361         __iommu_flush_cache(iommu, context, sizeof(*context));
2362         spin_unlock_irqrestore(&iommu->lock, flags);
2363         iommu->flush.flush_context(iommu,
2364                                    did_old,
2365                                    (((u16)bus) << 8) | devfn,
2366                                    DMA_CCMD_MASK_NOBIT,
2367                                    DMA_CCMD_DEVICE_INVL);
2368         iommu->flush.flush_iotlb(iommu,
2369                                  did_old,
2370                                  0,
2371                                  0,
2372                                  DMA_TLB_DSI_FLUSH);
2373 }
2374
2375 static inline void unlink_domain_info(struct device_domain_info *info)
2376 {
2377         assert_spin_locked(&device_domain_lock);
2378         list_del(&info->link);
2379         list_del(&info->global);
2380         if (info->dev)
2381                 info->dev->archdata.iommu = NULL;
2382 }
2383
2384 static void domain_remove_dev_info(struct dmar_domain *domain)
2385 {
2386         struct device_domain_info *info, *tmp;
2387         unsigned long flags;
2388
2389         spin_lock_irqsave(&device_domain_lock, flags);
2390         list_for_each_entry_safe(info, tmp, &domain->devices, link)
2391                 __dmar_remove_one_dev_info(info);
2392         spin_unlock_irqrestore(&device_domain_lock, flags);
2393 }
2394
2395 /*
2396  * find_domain
2397  * Note: we use struct device->archdata.iommu to store the info
2398  */
2399 static struct dmar_domain *find_domain(struct device *dev)
2400 {
2401         struct device_domain_info *info;
2402
2403         /* No lock here, assumes no domain exit in normal case */
2404         info = dev->archdata.iommu;
2405         if (likely(info))
2406                 return info->domain;
2407         return NULL;
2408 }
2409
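/*
 * Look up a device_domain_info by segment/bus/devfn. Called with
 * device_domain_lock held.
 */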
2410 static inline struct device_domain_info *
2411 dmar_search_domain_by_dev_info(int segment, int bus, int devfn)
2412 {
2413         struct device_domain_info *info;
2414
2415         list_for_each_entry(info, &device_domain_list, global)
2416                 if (info->iommu->segment == segment && info->bus == bus &&
2417                     info->devfn == devfn)
2418                         return info;
2419
2420         return NULL;
2421 }
2422
2423 static struct dmar_domain *dmar_insert_one_dev_info(struct intel_iommu *iommu,
2424                                                     int bus, int devfn,
2425                                                     struct device *dev,
2426                                                     struct dmar_domain *domain)
2427 {
2428         struct dmar_domain *found = NULL;
2429         struct device_domain_info *info;
2430         unsigned long flags;
2431         int ret;
2432
2433         info = alloc_devinfo_mem();
2434         if (!info)
2435                 return NULL;
2436
2437         info->bus = bus;
2438         info->devfn = devfn;
2439         info->ats_supported = info->pasid_supported = info->pri_supported = 0;
2440         info->ats_enabled = info->pasid_enabled = info->pri_enabled = 0;
2441         info->ats_qdep = 0;
2442         info->dev = dev;
2443         info->domain = domain;
2444         info->iommu = iommu;
2445         info->pasid_table = NULL;
2446         info->auxd_enabled = 0;
2447         INIT_LIST_HEAD(&info->auxiliary_domains);
2448
2449         if (dev && dev_is_pci(dev)) {
2450                 struct pci_dev *pdev = to_pci_dev(info->dev);
2451
2452                 if (!pdev->untrusted &&
2453                     !pci_ats_disabled() &&
2454                     ecap_dev_iotlb_support(iommu->ecap) &&
2455                     pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_ATS) &&
2456                     dmar_find_matched_atsr_unit(pdev))
2457                         info->ats_supported = 1;
2458
2459                 if (sm_supported(iommu)) {
2460                         if (pasid_supported(iommu)) {
2461                                 int features = pci_pasid_features(pdev);
2462                                 if (features >= 0)
2463                                         info->pasid_supported = features | 1;
2464                         }
2465
2466                         if (info->ats_supported && ecap_prs(iommu->ecap) &&
2467                             pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_PRI))
2468                                 info->pri_supported = 1;
2469                 }
2470         }
2471
2472         spin_lock_irqsave(&device_domain_lock, flags);
2473         if (dev)
2474                 found = find_domain(dev);
2475
2476         if (!found) {
2477                 struct device_domain_info *info2;
2478                 info2 = dmar_search_domain_by_dev_info(iommu->segment, bus, devfn);
2479                 if (info2) {
2480                         found      = info2->domain;
2481                         info2->dev = dev;
2482                 }
2483         }
2484
2485         if (found) {
2486                 spin_unlock_irqrestore(&device_domain_lock, flags);
2487                 free_devinfo_mem(info);
2488                 /* Caller must free the original domain */
2489                 return found;
2490         }
2491
2492         spin_lock(&iommu->lock);
2493         ret = domain_attach_iommu(domain, iommu);
2494         spin_unlock(&iommu->lock);
2495
2496         if (ret) {
2497                 spin_unlock_irqrestore(&device_domain_lock, flags);
2498                 free_devinfo_mem(info);
2499                 return NULL;
2500         }
2501
2502         list_add(&info->link, &domain->devices);
2503         list_add(&info->global, &device_domain_list);
2504         if (dev)
2505                 dev->archdata.iommu = info;
2506         spin_unlock_irqrestore(&device_domain_lock, flags);
2507
2508         /* PASID table is mandatory for a PCI device in scalable mode. */
2509         if (dev && dev_is_pci(dev) && sm_supported(iommu)) {
2510                 ret = intel_pasid_alloc_table(dev);
2511                 if (ret) {
2512                         dev_err(dev, "PASID table allocation failed\n");
2513                         dmar_remove_one_dev_info(dev);
2514                         return NULL;
2515                 }
2516
2517                 /* Setup the PASID entry for requests without PASID: */
2518                 spin_lock(&iommu->lock);
2519                 if (hw_pass_through && domain_type_is_si(domain))
2520                         ret = intel_pasid_setup_pass_through(iommu, domain,
2521                                         dev, PASID_RID2PASID);
2522                 else
2523                         ret = intel_pasid_setup_second_level(iommu, domain,
2524                                         dev, PASID_RID2PASID);
2525                 spin_unlock(&iommu->lock);
2526                 if (ret) {
2527                         dev_err(dev, "Setup RID2PASID failed\n");
2528                         dmar_remove_one_dev_info(dev);
2529                         return NULL;
2530                 }
2531         }
2532
2533         if (dev && domain_context_mapping(domain, dev)) {
2534                 dev_err(dev, "Domain context map failed\n");
2535                 dmar_remove_one_dev_info(dev);
2536                 return NULL;
2537         }
2538
2539         return domain;
2540 }
2541
2542 static int get_last_alias(struct pci_dev *pdev, u16 alias, void *opaque)
2543 {
2544         *(u16 *)opaque = alias;
2545         return 0;
2546 }
2547
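/*
 * Reuse the private domain of dev's PCI DMA alias if one already exists;
 * otherwise allocate and initialize a new domain of the given width. The
 * caller still has to attach it with set_domain_for_dev().
 */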
2548 static struct dmar_domain *find_or_alloc_domain(struct device *dev, int gaw)
2549 {
2550         struct device_domain_info *info;
2551         struct dmar_domain *domain = NULL;
2552         struct intel_iommu *iommu;
2553         u16 dma_alias;
2554         unsigned long flags;
2555         u8 bus, devfn;
2556
2557         iommu = device_to_iommu(dev, &bus, &devfn);
2558         if (!iommu)
2559                 return NULL;
2560
2561         if (dev_is_pci(dev)) {
2562                 struct pci_dev *pdev = to_pci_dev(dev);
2563
2564                 pci_for_each_dma_alias(pdev, get_last_alias, &dma_alias);
2565
2566                 spin_lock_irqsave(&device_domain_lock, flags);
2567                 info = dmar_search_domain_by_dev_info(pci_domain_nr(pdev->bus),
2568                                                       PCI_BUS_NUM(dma_alias),
2569                                                       dma_alias & 0xff);
2570                 if (info) {
2571                         iommu = info->iommu;
2572                         domain = info->domain;
2573                 }
2574                 spin_unlock_irqrestore(&device_domain_lock, flags);
2575
2576                 /* DMA alias already has a domain, use it */
2577                 if (info)
2578                         goto out;
2579         }
2580
2581         /* Allocate and initialize new domain for the device */
2582         domain = alloc_domain(0);
2583         if (!domain)
2584                 return NULL;
2585         if (domain_init(domain, iommu, gaw)) {
2586                 domain_exit(domain);
2587                 return NULL;
2588         }
2589
2590 out:
2591
2592         return domain;
2593 }
2594
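/*
 * Attach the domain to the device (and to the device's PCI DMA alias
 * first, when the alias differs from the requester id). Returns the
 * domain actually in use, which may be a pre-existing one found for the
 * alias, or NULL on failure.
 */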
2595 static struct dmar_domain *set_domain_for_dev(struct device *dev,
2596                                               struct dmar_domain *domain)
2597 {
2598         struct intel_iommu *iommu;
2599         struct dmar_domain *tmp;
2600         u16 req_id, dma_alias;
2601         u8 bus, devfn;
2602
2603         iommu = device_to_iommu(dev, &bus, &devfn);
2604         if (!iommu)
2605                 return NULL;
2606
2607         req_id = ((u16)bus << 8) | devfn;
2608
2609         if (dev_is_pci(dev)) {
2610                 struct pci_dev *pdev = to_pci_dev(dev);
2611
2612                 pci_for_each_dma_alias(pdev, get_last_alias, &dma_alias);
2613
2614                 /* register PCI DMA alias device */
2615                 if (req_id != dma_alias) {
2616                         tmp = dmar_insert_one_dev_info(iommu, PCI_BUS_NUM(dma_alias),
2617                                         dma_alias & 0xff, NULL, domain);
2618
2619                         if (!tmp || tmp != domain)
2620                                 return tmp;
2621                 }
2622         }
2623
2624         tmp = dmar_insert_one_dev_info(iommu, bus, devfn, dev, domain);
2625         if (!tmp || tmp != domain)
2626                 return tmp;
2627
2628         return domain;
2629 }
2630
2631 static struct dmar_domain *get_domain_for_dev(struct device *dev, int gaw)
2632 {
2633         struct dmar_domain *domain, *tmp;
2634
2635         domain = find_domain(dev);
2636         if (domain)
2637                 goto out;
2638
2639         domain = find_or_alloc_domain(dev, gaw);
2640         if (!domain)
2641                 goto out;
2642
2643         tmp = set_domain_for_dev(dev, domain);
2644         if (!tmp || domain != tmp) {
2645                 domain_exit(domain);
2646                 domain = tmp;
2647         }
2648
2649 out:
2650
2651         return domain;
2652 }
2653
2654 static int iommu_domain_identity_map(struct dmar_domain *domain,
2655                                      unsigned long long start,
2656                                      unsigned long long end)
2657 {
2658         unsigned long first_vpfn = start >> VTD_PAGE_SHIFT;
2659         unsigned long last_vpfn = end >> VTD_PAGE_SHIFT;
2660
2661         if (!reserve_iova(&domain->iovad, dma_to_mm_pfn(first_vpfn),
2662                           dma_to_mm_pfn(last_vpfn))) {
2663                 pr_err("Reserving iova failed\n");
2664                 return -ENOMEM;
2665         }
2666
2667         pr_debug("Mapping reserved region %llx-%llx\n", start, end);
2668         /*
2669          * RMRR range might have overlap with physical memory range,
2670          * clear it first
2671          */
2672         dma_pte_clear_range(domain, first_vpfn, last_vpfn);
2673
2674         return __domain_mapping(domain, first_vpfn, NULL,
2675                                 first_vpfn, last_vpfn - first_vpfn + 1,
2676                                 DMA_PTE_READ|DMA_PTE_WRITE);
2677 }
2678
2679 static int domain_prepare_identity_map(struct device *dev,
2680                                        struct dmar_domain *domain,
2681                                        unsigned long long start,
2682                                        unsigned long long end)
2683 {
2684         /* For _hardware_ passthrough, don't bother. But for software
2685            passthrough, we do it anyway -- it may indicate a memory
2686            range which is reserved in E820 and therefore didn't get set
2687            up in si_domain to start with */
2688         if (domain == si_domain && hw_pass_through) {
2689                 dev_warn(dev, "Ignoring identity map for HW passthrough [0x%Lx - 0x%Lx]\n",
2690                          start, end);
2691                 return 0;
2692         }
2693
2694         dev_info(dev, "Setting identity map [0x%Lx - 0x%Lx]\n", start, end);
2695
2696         if (end < start) {
2697                 WARN(1, "Your BIOS is broken; RMRR ends before it starts!\n"
2698                         "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
2699                         dmi_get_system_info(DMI_BIOS_VENDOR),
2700                         dmi_get_system_info(DMI_BIOS_VERSION),
2701                         dmi_get_system_info(DMI_PRODUCT_VERSION));
2702                 return -EIO;
2703         }
2704
2705         if (end >> agaw_to_width(domain->agaw)) {
2706                 WARN(1, "Your BIOS is broken; RMRR exceeds permitted address width (%d bits)\n"
2707                      "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
2708                      agaw_to_width(domain->agaw),
2709                      dmi_get_system_info(DMI_BIOS_VENDOR),
2710                      dmi_get_system_info(DMI_BIOS_VERSION),
2711                      dmi_get_system_info(DMI_PRODUCT_VERSION));
2712                 return -EIO;
2713         }
2714
2715         return iommu_domain_identity_map(domain, start, end);
2716 }
2717
2718 static int iommu_prepare_identity_map(struct device *dev,
2719                                       unsigned long long start,
2720                                       unsigned long long end)
2721 {
2722         struct dmar_domain *domain;
2723         int ret;
2724
2725         domain = get_domain_for_dev(dev, DEFAULT_DOMAIN_ADDRESS_WIDTH);
2726         if (!domain)
2727                 return -ENOMEM;
2728
2729         ret = domain_prepare_identity_map(dev, domain, start, end);
2730         if (ret)
2731                 domain_exit(domain);
2732
2733         return ret;
2734 }
2735
2736 static inline int iommu_prepare_rmrr_dev(struct dmar_rmrr_unit *rmrr,
2737                                          struct device *dev)
2738 {
2739         if (dev->archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
2740                 return 0;
2741         return iommu_prepare_identity_map(dev, rmrr->base_address,
2742                                           rmrr->end_address);
2743 }
2744
2745 #ifdef CONFIG_INTEL_IOMMU_FLOPPY_WA
2746 static inline void iommu_prepare_isa(void)
2747 {
2748         struct pci_dev *pdev;
2749         int ret;
2750
2751         pdev = pci_get_class(PCI_CLASS_BRIDGE_ISA << 8, NULL);
2752         if (!pdev)
2753                 return;
2754
2755         pr_info("Prepare 0-16MiB unity mapping for LPC\n");
2756         ret = iommu_prepare_identity_map(&pdev->dev, 0, 16*1024*1024 - 1);
2757
2758         if (ret)
2759                 pr_err("Failed to create 0-16MiB identity map - floppy might not work\n");
2760
2761         pci_dev_put(pdev);
2762 }
2763 #else
2764 static inline void iommu_prepare_isa(void)
2765 {
2766         return;
2767 }
2768 #endif /* !CONFIG_INTEL_IOMMU_FLOPPY_WA */
2769
2770 static int md_domain_init(struct dmar_domain *domain, int guest_width);
2771
2772 static int __init si_domain_init(int hw)
2773 {
2774         struct dmar_rmrr_unit *rmrr;
2775         struct device *dev;
2776         int i, nid, ret;
2777
2778         si_domain = alloc_domain(DOMAIN_FLAG_STATIC_IDENTITY);
2779         if (!si_domain)
2780                 return -EFAULT;
2781
2782         if (md_domain_init(si_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
2783                 domain_exit(si_domain);
2784                 return -EFAULT;
2785         }
2786
2787         if (hw)
2788                 return 0;
2789
2790         for_each_online_node(nid) {
2791                 unsigned long start_pfn, end_pfn;
2792                 int i;
2793
2794                 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
2795                         ret = iommu_domain_identity_map(si_domain,
2796                                         PFN_PHYS(start_pfn), PFN_PHYS(end_pfn));
2797                         if (ret)
2798                                 return ret;
2799                 }
2800         }
2801
2802         /*
2803          * Normally we use DMA domains for devices which have RMRRs. But we
2804          * relax this requirement for graphics and USB devices. Identity map
2805          * the RMRRs for graphics and USB devices so that they can use the
2806          * si_domain.
2807          */
2808         for_each_rmrr_units(rmrr) {
2809                 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
2810                                           i, dev) {
2811                         unsigned long long start = rmrr->base_address;
2812                         unsigned long long end = rmrr->end_address;
2813
2814                         if (device_is_rmrr_locked(dev))
2815                                 continue;
2816
2817                         if (WARN_ON(end < start ||
2818                                     end >> agaw_to_width(si_domain->agaw)))
2819                                 continue;
2820
2821                         ret = iommu_domain_identity_map(si_domain, start, end);
2822                         if (ret)
2823                                 return ret;
2824                 }
2825         }
2826
2827         return 0;
2828 }
2829
2830 static int identity_mapping(struct device *dev)
2831 {
2832         struct device_domain_info *info;
2833
2834         info = dev->archdata.iommu;
2835         if (info && info != DUMMY_DEVICE_DOMAIN_INFO)
2836                 return (info->domain == si_domain);
2837
2838         return 0;
2839 }
2840
2841 static int domain_add_dev_info(struct dmar_domain *domain, struct device *dev)
2842 {
2843         struct dmar_domain *ndomain;
2844         struct intel_iommu *iommu;
2845         u8 bus, devfn;
2846
2847         iommu = device_to_iommu(dev, &bus, &devfn);
2848         if (!iommu)
2849                 return -ENODEV;
2850
2851         ndomain = dmar_insert_one_dev_info(iommu, bus, devfn, dev, domain);
2852         if (ndomain != domain)
2853                 return -EBUSY;
2854
2855         return 0;
2856 }
2857
2858 static bool device_has_rmrr(struct device *dev)
2859 {
2860         struct dmar_rmrr_unit *rmrr;
2861         struct device *tmp;
2862         int i;
2863
2864         rcu_read_lock();
2865         for_each_rmrr_units(rmrr) {
2866                 /*
2867                  * Return TRUE if this RMRR contains the device that
2868                  * is passed in.
2869                  */
2870                 for_each_active_dev_scope(rmrr->devices,
2871                                           rmrr->devices_cnt, i, tmp)
2872                         if (tmp == dev) {
2873                                 rcu_read_unlock();
2874                                 return true;
2875                         }
2876         }
2877         rcu_read_unlock();
2878         return false;
2879 }
2880
2881 /*
2882  * There are a couple of cases where we need to restrict the functionality of
2883  * devices associated with RMRRs.  The first is when evaluating a device for
2884  * identity mapping because problems exist when devices are moved in and out
2885  * of domains and their respective RMRR information is lost.  This means that
2886  * a device with associated RMRRs will never be in a "passthrough" domain.
2887  * The second is use of the device through the IOMMU API.  This interface
2888  * expects to have full control of the IOVA space for the device.  We cannot
2889  * satisfy both the requirement that RMRR access is maintained and have an
2890  * unencumbered IOVA space.  We also have no ability to quiesce the device's
2891  * use of the RMRR space or even inform the IOMMU API user of the restriction.
2892  * We therefore prevent devices associated with an RMRR from participating in
2893  * the IOMMU API, which eliminates them from device assignment.
2894  *
2895  * In both cases we assume that PCI USB devices with RMRRs have them largely
2896  * for historical reasons and that the RMRR space is not actively used post
2897  * boot.  This exclusion may change if vendors begin to abuse it.
2898  *
2899  * The same exception is made for graphics devices, with the requirement that
2900  * any use of the RMRR regions will be torn down before assigning the device
2901  * to a guest.
2902  */
2903 static bool device_is_rmrr_locked(struct device *dev)
2904 {
2905         if (!device_has_rmrr(dev))
2906                 return false;
2907
2908         if (dev_is_pci(dev)) {
2909                 struct pci_dev *pdev = to_pci_dev(dev);
2910
2911                 if (IS_USB_DEVICE(pdev) || IS_GFX_DEVICE(pdev))
2912                         return false;
2913         }
2914
2915         return true;
2916 }
2917
2918 /*
2919  * Return the required default domain type for a specific device.
2920  *
2921  * @dev: the device in query
2922  * @startup: true if this is during early boot
2923  *
2924  * Returns:
2925  *  - IOMMU_DOMAIN_DMA: device requires a dynamic mapping domain
2926  *  - IOMMU_DOMAIN_IDENTITY: device requires an identity mapping domain
2927  *  - 0: both identity and dynamic domains work for this device
2928  */
2929 static int device_def_domain_type(struct device *dev, int startup)
2930 {
2931         if (dev_is_pci(dev)) {
2932                 struct pci_dev *pdev = to_pci_dev(dev);
2933
2934                 if (device_is_rmrr_locked(dev))
2935                         return IOMMU_DOMAIN_DMA;
2936
2937                 /*
2938                  * Prevent any device marked as untrusted from getting
2939                  * placed into the static identity mapping domain.
2940                  */
2941                 if (pdev->untrusted)
2942                         return IOMMU_DOMAIN_DMA;
2943
2944                 if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev))
2945                         return IOMMU_DOMAIN_IDENTITY;
2946
2947                 if ((iommu_identity_mapping & IDENTMAP_GFX) && IS_GFX_DEVICE(pdev))
2948                         return IOMMU_DOMAIN_IDENTITY;
2949
2950                 /*
2951                  * We want to start off with all devices in the 1:1 domain, and
2952                  * take them out later if we find they can't access all of memory.
2953                  *
2954                  * However, we can't do this for PCI devices behind bridges,
2955                  * because all PCI devices behind the same bridge will end up
2956                  * with the same source-id on their transactions.
2957                  *
2958                  * Practically speaking, we can't change things around for these
2959                  * devices at run-time, because we can't be sure there'll be no
2960                  * DMA transactions in flight for any of their siblings.
2961                  *
2962                  * So PCI devices (unless they're on the root bus) as well as
2963                  * their parent PCI-PCI or PCIe-PCI bridges must be left _out_ of
2964                  * the 1:1 domain, just in _case_ one of their siblings turns out
2965                  * not to be able to map all of memory.
2966                  */
2967                 if (!pci_is_pcie(pdev)) {
2968                         if (!pci_is_root_bus(pdev->bus))
2969                                 return IOMMU_DOMAIN_DMA;
2970                         if (pdev->class >> 8 == PCI_CLASS_BRIDGE_PCI)
2971                                 return IOMMU_DOMAIN_DMA;
2972                 } else if (pci_pcie_type(pdev) == PCI_EXP_TYPE_PCI_BRIDGE)
2973                         return IOMMU_DOMAIN_DMA;
2974         } else {
2975                 if (device_has_rmrr(dev))
2976                         return IOMMU_DOMAIN_DMA;
2977         }
2978
2979         /*
2980          * At boot time, we don't yet know if devices will be 64-bit capable.
2981          * Assume that they will — if they turn out not to be, then we can
2982          * take them out of the 1:1 domain later.
2983          */
2984         if (!startup) {
2985                 /*
2986                  * If the device's dma_mask is less than the system's memory
2987                  * size then this is not a candidate for identity mapping.
2988                  */
2989                 u64 dma_mask = *dev->dma_mask;
2990
2991                 if (dev->coherent_dma_mask &&
2992                     dev->coherent_dma_mask < dma_mask)
2993                         dma_mask = dev->coherent_dma_mask;
2994
2995                 return dma_mask >= dma_get_required_mask(dev) ?
                                IOMMU_DOMAIN_IDENTITY : IOMMU_DOMAIN_DMA;
2996         }
2997
2998         return (iommu_identity_mapping & IDENTMAP_ALL) ?
2999                         IOMMU_DOMAIN_IDENTITY : 0;
3000 }
3001
3002 static inline int iommu_should_identity_map(struct device *dev, int startup)
3003 {
3004         return device_def_domain_type(dev, startup) == IOMMU_DOMAIN_IDENTITY;
3005 }
3006
3007 static int __init dev_prepare_static_identity_mapping(struct device *dev, int hw)
3008 {
3009         int ret;
3010
3011         if (!iommu_should_identity_map(dev, 1))
3012                 return 0;
3013
3014         ret = domain_add_dev_info(si_domain, dev);
3015         if (!ret)
3016                 dev_info(dev, "%s identity mapping\n",
3017                          hw ? "Hardware" : "Software");
3018         else if (ret == -ENODEV)
3019                 /* device not associated with an iommu */
3020                 ret = 0;
3021
3022         return ret;
3023 }
3024
3025
3026 static int __init iommu_prepare_static_identity_mapping(int hw)
3027 {
3028         struct pci_dev *pdev = NULL;
3029         struct dmar_drhd_unit *drhd;
3030         struct intel_iommu *iommu;
3031         struct device *dev;
3032         int i;
3033         int ret = 0;
3034
3035         for_each_pci_dev(pdev) {
3036                 ret = dev_prepare_static_identity_mapping(&pdev->dev, hw);
3037                 if (ret)
3038                         return ret;
3039         }
3040
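        /*
         * ACPI name-space devices reported in a DRHD device scope do not
         * issue DMA themselves; their physical companion devices do. Walk
         * each ACPI device's physical node list and prepare the identity
         * mapping for those companion devices instead.
         */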
3041         for_each_active_iommu(iommu, drhd)
3042                 for_each_active_dev_scope(drhd->devices, drhd->devices_cnt, i, dev) {
3043                         struct acpi_device_physical_node *pn;
3044                         struct acpi_device *adev;
3045
3046                         if (dev->bus != &acpi_bus_type)
3047                                 continue;
3048
3049                 adev = to_acpi_device(dev);
3050                         mutex_lock(&adev->physical_node_lock);
3051                         list_for_each_entry(pn, &adev->physical_node_list, node) {
3052                                 ret = dev_prepare_static_identity_mapping(pn->dev, hw);
3053                                 if (ret)
3054                                         break;
3055                         }
3056                         mutex_unlock(&adev->physical_node_lock);
3057                         if (ret)
3058                                 return ret;
3059                 }
3060
3061         return 0;
3062 }
3063
3064 static void intel_iommu_init_qi(struct intel_iommu *iommu)
3065 {
3066         /*
3067          * Start from a sane IOMMU hardware state.
3068          * If queued invalidation was already initialized by us
3069          * (for example, while enabling interrupt remapping), then
3070          * things are already rolling from a sane state.
3071          */
3072         if (!iommu->qi) {
3073                 /*
3074                  * Clear any previous faults.
3075                  */
3076                 dmar_fault(-1, iommu);
3077                 /*
3078                  * Disable queued invalidation if supported and already enabled
3079                  * before OS handover.
3080                  */
3081                 dmar_disable_qi(iommu);
3082         }
3083
3084         if (dmar_enable_qi(iommu)) {
3085                 /*
3086                  * Queued Invalidate not enabled, use Register Based Invalidate
3087                  */
3088                 iommu->flush.flush_context = __iommu_flush_context;
3089                 iommu->flush.flush_iotlb = __iommu_flush_iotlb;
3090                 pr_info("%s: Using Register based invalidation\n",
3091                         iommu->name);
3092         } else {
3093                 iommu->flush.flush_context = qi_flush_context;
3094                 iommu->flush.flush_iotlb = qi_flush_iotlb;
3095                 pr_info("%s: Using Queued invalidation\n", iommu->name);
3096         }
3097 }
3098
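/*
 * Copy one bus worth of context entries from the old kernel's tables into a
 * freshly allocated table. In extended root-entry mode a context entry is
 * twice as large, so each context table only covers 128 device functions:
 * devfn 0x00-0x7f sit behind the lower context-table pointer and devfn
 * 0x80-0xff behind the upper one. That is why the table index below is
 * "bus * 2" and the entry index is "(devfn * 2) % 256".
 */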
3099 static int copy_context_table(struct intel_iommu *iommu,
3100                               struct root_entry *old_re,
3101                               struct context_entry **tbl,
3102                               int bus, bool ext)
3103 {
3104         int tbl_idx, pos = 0, idx, devfn, ret = 0, did;
3105         struct context_entry *new_ce = NULL, ce;
3106         struct context_entry *old_ce = NULL;
3107         struct root_entry re;
3108         phys_addr_t old_ce_phys;
3109
3110         tbl_idx = ext ? bus * 2 : bus;
3111         memcpy(&re, old_re, sizeof(re));
3112
3113         for (devfn = 0; devfn < 256; devfn++) {
3114                 /* First calculate the correct index */
3115                 idx = (ext ? devfn * 2 : devfn) % 256;
3116
3117                 if (idx == 0) {
3118                         /* First save what we may have and clean up */
3119                         if (new_ce) {
3120                                 tbl[tbl_idx] = new_ce;
3121                                 __iommu_flush_cache(iommu, new_ce,
3122                                                     VTD_PAGE_SIZE);
3123                                 pos = 1;
3124                         }
3125
3126                         if (old_ce)
3127                                 memunmap(old_ce);
3128
3129                         ret = 0;
3130                         if (devfn < 0x80)
3131                                 old_ce_phys = root_entry_lctp(&re);
3132                         else
3133                                 old_ce_phys = root_entry_uctp(&re);
3134
3135                         if (!old_ce_phys) {
3136                                 if (ext && devfn == 0) {
3137                                         /* No LCTP, try UCTP */
3138                                         devfn = 0x7f;
3139                                         continue;
3140                                 } else {
3141                                         goto out;
3142                                 }
3143                         }
3144
3145                         ret = -ENOMEM;
3146                         old_ce = memremap(old_ce_phys, PAGE_SIZE,
3147                                         MEMREMAP_WB);
3148                         if (!old_ce)
3149                                 goto out;
3150
3151                         new_ce = alloc_pgtable_page(iommu->node);
3152                         if (!new_ce)
3153                                 goto out_unmap;
3154
3155                         ret = 0;
3156                 }
3157
3158                 /* Now copy the context entry */
3159                 memcpy(&ce, old_ce + idx, sizeof(ce));
3160
3161                 if (!__context_present(&ce))
3162                         continue;
3163
3164                 did = context_domain_id(&ce);
3165                 if (did >= 0 && did < cap_ndoms(iommu->cap))
3166                         set_bit(did, iommu->domain_ids);
3167
3168                 /*
3169                  * We need a marker for copied context entries. This
3170                  * marker needs to work for the old format as well as
3171                  * for extended context entries.
3172                  *
3173                  * Bit 67 of the context entry is used. In the old
3174                  * format this bit is available to software, in the
3175                  * extended format it is the PGE bit, but PGE is ignored
3176                  * by HW if PASIDs are disabled (and thus still
3177                  * available).
3178                  *
3179                  * So disable PASIDs first and then mark the entry
3180                  * copied. This means that we don't copy PASID
3181                  * translations from the old kernel, but this is fine as
3182                  * faults there are not fatal.
3183                  */
3184                 context_clear_pasid_enable(&ce);
3185                 context_set_copied(&ce);
3186
3187                 new_ce[idx] = ce;
3188         }
3189
3190         tbl[tbl_idx + pos] = new_ce;
3191
3192         __iommu_flush_cache(iommu, new_ce, VTD_PAGE_SIZE);
3193
3194 out_unmap:
3195         memunmap(old_ce);
3196
3197 out:
3198         return ret;
3199 }
3200
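/*
 * If the previous kernel (the crashed one in the kdump case) left DMA
 * translation enabled, copy its root and context tables so that in-flight
 * DMA keeps working while this kernel takes over. The copied context
 * entries are marked so they can be replaced later, once the corresponding
 * device drivers set up their own domains.
 */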
3201 static int copy_translation_tables(struct intel_iommu *iommu)
3202 {
3203         struct context_entry **ctxt_tbls;
3204         struct root_entry *old_rt;
3205         phys_addr_t old_rt_phys;
3206         int ctxt_table_entries;
3207         unsigned long flags;
3208         u64 rtaddr_reg;
3209         int bus, ret;
3210         bool new_ext, ext;
3211
3212         rtaddr_reg = dmar_readq(iommu->reg + DMAR_RTADDR_REG);
3213         ext        = !!(rtaddr_reg & DMA_RTADDR_RTT);
3214         new_ext    = !!ecap_ecs(iommu->ecap);
3215
3216         /*
3217          * The RTT bit can only be changed when translation is disabled,
3218          * but disabling translation means to open a window for data
3219          * corruption. So bail out and don't copy anything if we would
3220          * have to change the bit.
3221          */
3222         if (new_ext != ext)
3223                 return -EINVAL;
3224
3225         old_rt_phys = rtaddr_reg & VTD_PAGE_MASK;
3226         if (!old_rt_phys)
3227                 return -EINVAL;
3228
3229         old_rt = memremap(old_rt_phys, PAGE_SIZE, MEMREMAP_WB);
3230         if (!old_rt)
3231                 return -ENOMEM;
3232
3233         /* This is too big for the stack - allocate it from slab */
3234         ctxt_table_entries = ext ? 512 : 256;
3235         ret = -ENOMEM;
3236         ctxt_tbls = kcalloc(ctxt_table_entries, sizeof(void *), GFP_KERNEL);
3237         if (!ctxt_tbls)
3238                 goto out_unmap;
3239
3240         for (bus = 0; bus < 256; bus++) {
3241                 ret = copy_context_table(iommu, &old_rt[bus],
3242                                          ctxt_tbls, bus, ext);
3243                 if (ret) {
3244                         pr_err("%s: Failed to copy context table for bus %d\n",
3245                                 iommu->name, bus);
3246                         continue;
3247                 }
3248         }
3249
3250         spin_lock_irqsave(&iommu->lock, flags);
3251
3252         /* Context tables are copied, now write them to the root_entry table */
3253         for (bus = 0; bus < 256; bus++) {
3254                 int idx = ext ? bus * 2 : bus;
3255                 u64 val;
3256
3257                 if (ctxt_tbls[idx]) {
3258                         val = virt_to_phys(ctxt_tbls[idx]) | 1;
3259                         iommu->root_entry[bus].lo = val;
3260                 }
3261
3262                 if (!ext || !ctxt_tbls[idx + 1])
3263                         continue;
3264
3265                 val = virt_to_phys(ctxt_tbls[idx + 1]) | 1;
3266                 iommu->root_entry[bus].hi = val;
3267         }
3268
3269         spin_unlock_irqrestore(&iommu->lock, flags);
3270
3271         kfree(ctxt_tbls);
3272
3273         __iommu_flush_cache(iommu, iommu->root_entry, PAGE_SIZE);
3274
3275         ret = 0;
3276
3277 out_unmap:
3278         memunmap(old_rt);
3279
3280         return ret;
3281 }
3282
3283 static int __init init_dmars(void)
3284 {
3285         struct dmar_drhd_unit *drhd;
3286         struct dmar_rmrr_unit *rmrr;
3287         bool copied_tables = false;
3288         struct device *dev;
3289         struct intel_iommu *iommu;
3290         int i, ret;
3291
3292         /*
3293          * for each drhd
3294          *    allocate root
3295          *    initialize and program root entry to not present
3296          * endfor
3297          */
3298         for_each_drhd_unit(drhd) {
3299                 /*
3300                  * Lock not needed as this is only incremented in the
3301                  * single-threaded kernel __init code path; all other
3302                  * accesses are read-only.
3303                  */
3304                 if (g_num_of_iommus < DMAR_UNITS_SUPPORTED) {
3305                         g_num_of_iommus++;
3306                         continue;
3307                 }
3308                 pr_err_once("Exceeded %d IOMMUs\n", DMAR_UNITS_SUPPORTED);
3309         }
3310
3311         /* Preallocate enough resources for IOMMU hot-addition */
3312         if (g_num_of_iommus < DMAR_UNITS_SUPPORTED)
3313                 g_num_of_iommus = DMAR_UNITS_SUPPORTED;
3314
3315         g_iommus = kcalloc(g_num_of_iommus, sizeof(struct intel_iommu *),
3316                         GFP_KERNEL);
3317         if (!g_iommus) {
3318                 pr_err("Allocating global iommu array failed\n");
3319                 ret = -ENOMEM;
3320                 goto error;
3321         }
3322
3323         for_each_active_iommu(iommu, drhd) {
3324                 /*
3325                  * Find the minimum of the maximum PASID sizes supported
3326                  * by the IOMMUs in the system; the system-wide PASID table
3327                  * must be no bigger than the smallest one supported.
3328                  */
3329                 if (pasid_supported(iommu)) {
3330                         u32 temp = 2 << ecap_pss(iommu->ecap);
3331
3332                         intel_pasid_max_id = min_t(u32, temp,
3333                                                    intel_pasid_max_id);
3334                 }
3335
3336                 g_iommus[iommu->seq_id] = iommu;
3337
3338                 intel_iommu_init_qi(iommu);
3339
3340                 ret = iommu_init_domains(iommu);
3341                 if (ret)
3342                         goto free_iommu;
3343
3344                 init_translation_status(iommu);
3345
3346                 if (translation_pre_enabled(iommu) && !is_kdump_kernel()) {
3347                         iommu_disable_translation(iommu);
3348                         clear_translation_pre_enabled(iommu);
3349                         pr_warn("Translation was enabled for %s but we are not in kdump mode\n",
3350                                 iommu->name);
3351                 }
3352
3353                 /*
3354                  * TBD:
3355                  * we could share the same root & context tables
3356                  * among all IOMMUs; need to split this out later.
3357                  */
3358                 ret = iommu_alloc_root_entry(iommu);
3359                 if (ret)
3360                         goto free_iommu;
3361
3362                 if (translation_pre_enabled(iommu)) {
3363                         pr_info("Translation already enabled - trying to copy translation structures\n");
3364
3365                         ret = copy_translation_tables(iommu);
3366                         if (ret) {
3367                                 /*
3368                                  * We found the IOMMU with translation
3369                                  * enabled - but failed to copy over the
3370                                  * old root-entry table. Try to proceed
3371                                  * by disabling translation now and
3372                                  * allocating a clean root-entry table.
3373                                  * This might cause DMAR faults, but
3374                                  * probably the dump will still succeed.
3375                                  */
3376                                 pr_err("Failed to copy translation tables from previous kernel for %s\n",
3377                                        iommu->name);
3378                                 iommu_disable_translation(iommu);
3379                                 clear_translation_pre_enabled(iommu);
3380                         } else {
3381                                 pr_info("Copied translation tables from previous kernel for %s\n",
3382                                         iommu->name);
3383                                 copied_tables = true;
3384                         }
3385                 }
3386
3387                 if (!ecap_pass_through(iommu->ecap))
3388                         hw_pass_through = 0;
3389 #ifdef CONFIG_INTEL_IOMMU_SVM
3390                 if (pasid_supported(iommu))
3391                         intel_svm_init(iommu);
3392 #endif
3393         }
3394
3395         /*
3396          * Now that qi is enabled on all iommus, set the root entry and flush
3397          * caches. This is required on some Intel X58 chipsets, otherwise the
3398          * flush_context function will loop forever and the boot hangs.
3399          */
3400         for_each_active_iommu(iommu, drhd) {
3401                 iommu_flush_write_buffer(iommu);
3402                 iommu_set_root_entry(iommu);
3403                 iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
3404                 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
3405         }
3406
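        /*
         * Pass-through mode (iommu_pass_through, e.g. set via iommu=pt)
         * requests the static identity (si) domain for every device;
         * likewise, when the graphics devices are not being remapped they
         * are kept identity mapped via IDENTMAP_GFX.
         */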
3407         if (iommu_pass_through)
3408                 iommu_identity_mapping |= IDENTMAP_ALL;
3409
3410 #ifdef CONFIG_INTEL_IOMMU_BROKEN_GFX_WA
3411         dmar_map_gfx = 0;
3412 #endif
3413
3414         if (!dmar_map_gfx)
3415                 iommu_identity_mapping |= IDENTMAP_GFX;
3416
3417         check_tylersburg_isoch();
3418
3419         ret = si_domain_init(hw_pass_through);
3420         if (ret)
3421                 goto free_iommu;
3422
3423
3424         /*
3425          * If we copied translations from a previous kernel in the kdump
3426          * case, we can not assign the devices to domains now, as that
3427          * would eliminate the old mappings. So skip this part and defer
3428          * the assignment to device driver initialization time.
3429          */
3430         if (copied_tables)
3431                 goto domains_done;
3432
3433         /*
3434          * If pass-through is not set or not enabled, set up context entries
3435          * with identity mappings for RMRR, graphics and ISA devices, and fall
3436          * back to static identity mapping if iommu_identity_mapping is set.
3437          */
3438         if (iommu_identity_mapping) {
3439                 ret = iommu_prepare_static_identity_mapping(hw_pass_through);
3440                 if (ret) {
3441                         pr_crit("Failed to setup IOMMU pass-through\n");
3442                         goto free_iommu;
3443                 }
3444         }
3445         /*
3446          * For each rmrr
3447          *   for each dev attached to rmrr
3448          *   do
3449          *     locate drhd for dev, alloc domain for dev
3450          *     allocate free domain
3451          *     allocate page table entries for rmrr
3452          *     if context not allocated for bus
3453          *           allocate and init context
3454          *           set present in root table for this bus
3455          *     init context with domain, translation etc
3456          *    endfor
3457          * endfor
3458          */
3459         pr_info("Setting RMRR:\n");
3460         for_each_rmrr_units(rmrr) {
3461                 /* Some BIOSes list non-existent devices in the DMAR table. */
3462                 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
3463                                           i, dev) {
3464                         ret = iommu_prepare_rmrr_dev(rmrr, dev);
3465                         if (ret)
3466                                 pr_err("Mapping reserved region failed\n");
3467                 }
3468         }
3469
3470         iommu_prepare_isa();
3471
3472 domains_done:
3473
3474         /*
3475          * for each drhd
3476          *   enable fault log
3477          *   global invalidate context cache
3478          *   global invalidate iotlb
3479          *   enable translation
3480          */
3481         for_each_iommu(iommu, drhd) {
3482                 if (drhd->ignored) {
3483                         /*
3484                          * we always have to disable PMRs or DMA may fail on
3485                          * this device
3486                          */
3487                         if (force_on)
3488                                 iommu_disable_protect_mem_regions(iommu);
3489                         continue;
3490                 }
3491
3492                 iommu_flush_write_buffer(iommu);
3493
3494 #ifdef CONFIG_INTEL_IOMMU_SVM
3495                 if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
3496                         /*
3497                          * Call dmar_alloc_hwirq() with dmar_global_lock held,
3498                          * could cause possible lock race condition.
3499                          */
3500                         up_write(&dmar_global_lock);
3501                         ret = intel_svm_enable_prq(iommu);
3502                         down_write(&dmar_global_lock);
3503                         if (ret)
3504                                 goto free_iommu;
3505                 }
3506 #endif
3507                 ret = dmar_set_interrupt(iommu);
3508                 if (ret)
3509                         goto free_iommu;
3510         }
3511
3512         return 0;
3513
3514 free_iommu:
3515         for_each_active_iommu(iommu, drhd) {
3516                 disable_dmar_iommu(iommu);
3517                 free_dmar_iommu(iommu);
3518         }
3519
3520         kfree(g_iommus);
3521
3522 error:
3523         return ret;
3524 }
3525
3526 /* This takes a number of _MM_ pages, not VTD pages */
3527 static unsigned long intel_alloc_iova(struct device *dev,
3528                                      struct dmar_domain *domain,
3529                                      unsigned long nrpages, uint64_t dma_mask)
3530 {
3531         unsigned long iova_pfn;
3532
3533         /* Restrict dma_mask to the width that the iommu can handle */
3534         dma_mask = min_t(uint64_t, DOMAIN_MAX_ADDR(domain->gaw), dma_mask);
3535         /* Ensure we reserve the whole size-aligned region */
3536         nrpages = __roundup_pow_of_two(nrpages);
3537
3538         if (!dmar_forcedac && dma_mask > DMA_BIT_MASK(32)) {
3539                 /*
3540                  * First try to allocate an io virtual address in
3541                  * DMA_BIT_MASK(32) and if that fails then try allocating
3542                  * from higher range
3543                  */
3544                 iova_pfn = alloc_iova_fast(&domain->iovad, nrpages,
3545                                            IOVA_PFN(DMA_BIT_MASK(32)), false);
3546                 if (iova_pfn)
3547                         return iova_pfn;
3548         }
3549         iova_pfn = alloc_iova_fast(&domain->iovad, nrpages,
3550                                    IOVA_PFN(dma_mask), true);
3551         if (unlikely(!iova_pfn)) {
3552                 dev_err(dev, "Allocating %lu-page iova failed\n", nrpages);
3553                 return 0;
3554         }
3555
3556         return iova_pfn;
3557 }
3558
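/*
 * Find the dmar_domain already attached to @dev, or lazily allocate one on
 * the first DMA mapping request. Any RMRRs that target the device are
 * replayed into the newly allocated domain so that ongoing DMA to those
 * reserved regions keeps working.
 */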
3559 struct dmar_domain *get_valid_domain_for_dev(struct device *dev)
3560 {
3561         struct dmar_domain *domain, *tmp;
3562         struct dmar_rmrr_unit *rmrr;
3563         struct device *i_dev;
3564         int i, ret;
3565
3566         domain = find_domain(dev);
3567         if (domain)
3568                 goto out;
3569
3570         domain = find_or_alloc_domain(dev, DEFAULT_DOMAIN_ADDRESS_WIDTH);
3571         if (!domain)
3572                 goto out;
3573
3574         /* We have a new domain - setup possible RMRRs for the device */
3575         rcu_read_lock();
3576         for_each_rmrr_units(rmrr) {
3577                 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
3578                                           i, i_dev) {
3579                         if (i_dev != dev)
3580                                 continue;
3581
3582                         ret = domain_prepare_identity_map(dev, domain,
3583                                                           rmrr->base_address,
3584                                                           rmrr->end_address);
3585                         if (ret)
3586                                 dev_err(dev, "Mapping reserved region failed\n");
3587                 }
3588         }
3589         rcu_read_unlock();
3590
3591         tmp = set_domain_for_dev(dev, domain);
3592         if (!tmp || domain != tmp) {
3593                 domain_exit(domain);
3594                 domain = tmp;
3595         }
3596
3597 out:
3598
3599         if (!domain)
3600                 dev_err(dev, "Allocating domain failed\n");
3601
3602
3603         return domain;
3604 }
3605
3606 /* Check if the device needs to go through the non-identity map/unmap process. */
3607 static bool iommu_need_mapping(struct device *dev)
3608 {
3609         int found;
3610
3611         if (iommu_dummy(dev))
3612                 return false;
3613
3614         found = identity_mapping(dev);
3615         if (found) {
3616                 if (iommu_should_identity_map(dev, 0))
3617                         return false;
3618
3619                 /*
3620                  * A device limited to 32-bit DMA is removed from si_domain
3621                  * and falls back to non-identity mapping.
3622                  */
3623                 dmar_remove_one_dev_info(dev);
3624                 dev_info(dev, "32bit DMA uses non-identity mapping\n");
3625         } else {
3626                 /*
3627                  * If a 64-bit DMA capable device was detached from a VM, put
3628                  * it back into si_domain for identity mapping.
3629                  */
3630                 if (iommu_should_identity_map(dev, 0) &&
3631                     !domain_add_dev_info(si_domain, dev)) {
3632                         dev_info(dev, "64bit DMA uses identity mapping\n");
3633                         return false;
3634                 }
3635         }
3636
3637         return true;
3638 }
3639
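/*
 * Map a single physically contiguous buffer for DMA:
 *   1. find (or lazily create) the device's DMA remapping domain,
 *   2. allocate an IOVA range that fits below the device's dma_mask,
 *   3. install page-table entries with the right read/write permissions,
 * and return the bus (IOVA) address the device should use.
 */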
3640 static dma_addr_t __intel_map_single(struct device *dev, phys_addr_t paddr,
3641                                      size_t size, int dir, u64 dma_mask)
3642 {
3643         struct dmar_domain *domain;
3644         phys_addr_t start_paddr;
3645         unsigned long iova_pfn;
3646         int prot = 0;
3647         int ret;
3648         struct intel_iommu *iommu;
3649         unsigned long paddr_pfn = paddr >> PAGE_SHIFT;
3650
3651         BUG_ON(dir == DMA_NONE);
3652
3653         domain = get_valid_domain_for_dev(dev);
3654         if (!domain)
3655                 return DMA_MAPPING_ERROR;
3656
3657         iommu = domain_get_iommu(domain);
3658         size = aligned_nrpages(paddr, size);
3659
3660         iova_pfn = intel_alloc_iova(dev, domain, dma_to_mm_pfn(size), dma_mask);
3661         if (!iova_pfn)
3662                 goto error;
3663
3664         /*
3665          * Check if DMAR supports zero-length reads on write only
3666          * mappings.
3667          */
3668         if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
3669                         !cap_zlr(iommu->cap))
3670                 prot |= DMA_PTE_READ;
3671         if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
3672                 prot |= DMA_PTE_WRITE;
3673         /*
3674          * The range paddr .. (paddr + size) might cover a partial page, so
3675          * we map the whole page.  Note: if two parts of one page are mapped
3676          * separately, we might have two guest addresses mapping to the same
3677          * host paddr, but this is not a big problem.
3678          */
3679         ret = domain_pfn_mapping(domain, mm_to_dma_pfn(iova_pfn),
3680                                  mm_to_dma_pfn(paddr_pfn), size, prot);
3681         if (ret)
3682                 goto error;
3683
3684         start_paddr = (phys_addr_t)iova_pfn << PAGE_SHIFT;
3685         start_paddr += paddr & ~PAGE_MASK;
3686         return start_paddr;
3687
3688 error:
3689         if (iova_pfn)
3690                 free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(size));
3691         dev_err(dev, "Device request: %zx@%llx dir %d --- failed\n",
3692                 size, (unsigned long long)paddr, dir);
3693         return DMA_MAPPING_ERROR;
3694 }
3695
3696 static dma_addr_t intel_map_page(struct device *dev, struct page *page,
3697                                  unsigned long offset, size_t size,
3698                                  enum dma_data_direction dir,
3699                                  unsigned long attrs)
3700 {
3701         if (iommu_need_mapping(dev))
3702                 return __intel_map_single(dev, page_to_phys(page) + offset,
3703                                 size, dir, *dev->dma_mask);
3704         return dma_direct_map_page(dev, page, offset, size, dir, attrs);
3705 }
3706
3707 static dma_addr_t intel_map_resource(struct device *dev, phys_addr_t phys_addr,
3708                                      size_t size, enum dma_data_direction dir,
3709                                      unsigned long attrs)
3710 {
3711         if (iommu_need_mapping(dev))
3712                 return __intel_map_single(dev, phys_addr, size, dir,
3713                                 *dev->dma_mask);
3714         return dma_direct_map_resource(dev, phys_addr, size, dir, attrs);
3715 }
3716
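/*
 * Tear down the IOVA range backing a previous map operation. In strict mode,
 * or for untrusted devices, the IOTLB is flushed synchronously and the IOVA
 * is freed immediately; otherwise the range is queued and flushed in batches
 * to amortize the cost of the invalidation.
 */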
3717 static void intel_unmap(struct device *dev, dma_addr_t dev_addr, size_t size)
3718 {
3719         struct dmar_domain *domain;
3720         unsigned long start_pfn, last_pfn;
3721         unsigned long nrpages;
3722         unsigned long iova_pfn;
3723         struct intel_iommu *iommu;
3724         struct page *freelist;
3725         struct pci_dev *pdev = NULL;
3726
3727         domain = find_domain(dev);
3728         BUG_ON(!domain);
3729
3730         iommu = domain_get_iommu(domain);
3731
3732         iova_pfn = IOVA_PFN(dev_addr);
3733
3734         nrpages = aligned_nrpages(dev_addr, size);
3735         start_pfn = mm_to_dma_pfn(iova_pfn);
3736         last_pfn = start_pfn + nrpages - 1;
3737
3738         if (dev_is_pci(dev))
3739                 pdev = to_pci_dev(dev);
3740
3741         dev_dbg(dev, "Device unmapping: pfn %lx-%lx\n", start_pfn, last_pfn);
3742
3743         freelist = domain_unmap(domain, start_pfn, last_pfn);
3744
3745         if (intel_iommu_strict || (pdev && pdev->untrusted)) {
3746                 iommu_flush_iotlb_psi(iommu, domain, start_pfn,
3747                                       nrpages, !freelist, 0);
3748                 /* free iova */
3749                 free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(nrpages));
3750                 dma_free_pagelist(freelist);
3751         } else {
3752                 queue_iova(&domain->iovad, iova_pfn, nrpages,
3753                            (unsigned long)freelist);
3754                 /*
3755                  * Queue up the release of the unmap to save roughly 1/6th of
3756                  * the CPU time used up by the iotlb flush operation...
3757                  */
3758         }
3759 }
3760
3761 static void intel_unmap_page(struct device *dev, dma_addr_t dev_addr,
3762                              size_t size, enum dma_data_direction dir,
3763                              unsigned long attrs)
3764 {
3765         if (iommu_need_mapping(dev))
3766                 intel_unmap(dev, dev_addr, size);
3767         else
3768                 dma_direct_unmap_page(dev, dev_addr, size, dir, attrs);
3769 }
3770
3771 static void intel_unmap_resource(struct device *dev, dma_addr_t dev_addr,
3772                 size_t size, enum dma_data_direction dir, unsigned long attrs)
3773 {
3774         if (iommu_need_mapping(dev))
3775                 intel_unmap(dev, dev_addr, size);
3776 }
3777
3778 static void *intel_alloc_coherent(struct device *dev, size_t size,
3779                                   dma_addr_t *dma_handle, gfp_t flags,
3780                                   unsigned long attrs)
3781 {
3782         struct page *page = NULL;
3783         int order;
3784
3785         if (!iommu_need_mapping(dev))
3786                 return dma_direct_alloc(dev, size, dma_handle, flags, attrs);
3787
3788         size = PAGE_ALIGN(size);
3789         order = get_order(size);
3790
3791         if (gfpflags_allow_blocking(flags)) {
3792                 unsigned int count = size >> PAGE_SHIFT;
3793
3794                 page = dma_alloc_from_contiguous(dev, count, order,
3795                                                  flags & __GFP_NOWARN);
3796         }
3797
3798         if (!page)
3799                 page = alloc_pages(flags, order);
3800         if (!page)
3801                 return NULL;
3802         memset(page_address(page), 0, size);
3803
3804         *dma_handle = __intel_map_single(dev, page_to_phys(page), size,
3805                                          DMA_BIDIRECTIONAL,
3806                                          dev->coherent_dma_mask);
3807         if (*dma_handle != DMA_MAPPING_ERROR)
3808                 return page_address(page);
3809         if (!dma_release_from_contiguous(dev, page, size >> PAGE_SHIFT))
3810                 __free_pages(page, order);
3811
3812         return NULL;
3813 }
3814
3815 static void intel_free_coherent(struct device *dev, size_t size, void *vaddr,
3816                                 dma_addr_t dma_handle, unsigned long attrs)
3817 {
3818         int order;
3819         struct page *page = virt_to_page(vaddr);
3820
3821         if (!iommu_need_mapping(dev))
3822                 return dma_direct_free(dev, size, vaddr, dma_handle, attrs);
3823
3824         size = PAGE_ALIGN(size);
3825         order = get_order(size);
3826
3827         intel_unmap(dev, dma_handle, size);
3828         if (!dma_release_from_contiguous(dev, page, size >> PAGE_SHIFT))
3829                 __free_pages(page, order);
3830 }
3831
3832 static void intel_unmap_sg(struct device *dev, struct scatterlist *sglist,
3833                            int nelems, enum dma_data_direction dir,
3834                            unsigned long attrs)
3835 {
3836         dma_addr_t startaddr = sg_dma_address(sglist) & PAGE_MASK;
3837         unsigned long nrpages = 0;
3838         struct scatterlist *sg;
3839         int i;
3840
3841         if (!iommu_need_mapping(dev))
3842                 return dma_direct_unmap_sg(dev, sglist, nelems, dir, attrs);
3843
3844         for_each_sg(sglist, sg, nelems, i) {
3845                 nrpages += aligned_nrpages(sg_dma_address(sg), sg_dma_len(sg));
3846         }
3847
3848         intel_unmap(dev, startaddr, nrpages << VTD_PAGE_SHIFT);
3849 }
3850
3851 static int intel_map_sg(struct device *dev, struct scatterlist *sglist, int nelems,
3852                         enum dma_data_direction dir, unsigned long attrs)
3853 {
3854         int i;
3855         struct dmar_domain *domain;
3856         size_t size = 0;
3857         int prot = 0;
3858         unsigned long iova_pfn;
3859         int ret;
3860         struct scatterlist *sg;
3861         unsigned long start_vpfn;
3862         struct intel_iommu *iommu;
3863
3864         BUG_ON(dir == DMA_NONE);
3865         if (!iommu_need_mapping(dev))
3866                 return dma_direct_map_sg(dev, sglist, nelems, dir, attrs);
3867
3868         domain = get_valid_domain_for_dev(dev);
3869         if (!domain)
3870                 return 0;
3871
3872         iommu = domain_get_iommu(domain);
3873
3874         for_each_sg(sglist, sg, nelems, i)
3875                 size += aligned_nrpages(sg->offset, sg->length);
3876
3877         iova_pfn = intel_alloc_iova(dev, domain, dma_to_mm_pfn(size),
3878                                 *dev->dma_mask);
3879         if (!iova_pfn) {
3880                 sglist->dma_length = 0;
3881                 return 0;
3882         }
3883
3884         /*
3885          * Check if DMAR supports zero-length reads on write only
3886          * mappings.
3887          */
3888         if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
3889                         !cap_zlr(iommu->cap))
3890                 prot |= DMA_PTE_READ;
3891         if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
3892                 prot |= DMA_PTE_WRITE;
3893
3894         start_vpfn = mm_to_dma_pfn(iova_pfn);
3895
3896         ret = domain_sg_mapping(domain, start_vpfn, sglist, size, prot);
3897         if (unlikely(ret)) {
3898                 dma_pte_free_pagetable(domain, start_vpfn,
3899                                        start_vpfn + size - 1,
3900                                        agaw_to_level(domain->agaw) + 1);
3901                 free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(size));
3902                 return 0;
3903         }
3904
3905         return nelems;
3906 }
3907
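/*
 * DMA API callbacks used while a device sits behind a remapping (DMA)
 * domain. Devices that stay in the static identity domain are detected by
 * iommu_need_mapping() inside each callback and handed to the dma-direct
 * code instead. A rough sketch of the resulting call path for a driver
 * (the driver-side names "handle" and "len" are illustrative only):
 *
 *	dma_addr_t handle = dma_map_page(dev, page, 0, len, DMA_TO_DEVICE);
 *		-> intel_map_page()
 *		   -> __intel_map_single()    if the device needs remapping
 *		   -> dma_direct_map_page()   if the device is identity mapped
 */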
3908 static const struct dma_map_ops intel_dma_ops = {
3909         .alloc = intel_alloc_coherent,
3910         .free = intel_free_coherent,
3911         .map_sg = intel_map_sg,
3912         .unmap_sg = intel_unmap_sg,
3913         .map_page = intel_map_page,
3914         .unmap_page = intel_unmap_page,
3915         .map_resource = intel_map_resource,
3916         .unmap_resource = intel_unmap_resource,
3917         .dma_supported = dma_direct_supported,
3918 };
3919
3920 static inline int iommu_domain_cache_init(void)
3921 {
3922         int ret = 0;
3923
3924         iommu_domain_cache = kmem_cache_create("iommu_domain",
3925                                          sizeof(struct dmar_domain),
3926                                          0,
3927                                          SLAB_HWCACHE_ALIGN,
3928
3929                                          NULL);
3930         if (!iommu_domain_cache) {
3931                 pr_err("Couldn't create iommu_domain cache\n");
3932                 ret = -ENOMEM;
3933         }
3934
3935         return ret;
3936 }
3937
3938 static inline int iommu_devinfo_cache_init(void)
3939 {
3940         int ret = 0;
3941
3942         iommu_devinfo_cache = kmem_cache_create("iommu_devinfo",
3943                                          sizeof(struct device_domain_info),
3944                                          0,
3945                                          SLAB_HWCACHE_ALIGN,
3946                                          NULL);
3947         if (!iommu_devinfo_cache) {
3948                 pr_err("Couldn't create devinfo cache\n");
3949                 ret = -ENOMEM;
3950         }
3951
3952         return ret;
3953 }
3954
3955 static int __init iommu_init_mempool(void)
3956 {
3957         int ret;
3958         ret = iova_cache_get();
3959         if (ret)
3960                 return ret;
3961
3962         ret = iommu_domain_cache_init();
3963         if (ret)
3964                 goto domain_error;
3965
3966         ret = iommu_devinfo_cache_init();
3967         if (!ret)
3968                 return ret;
3969
3970         kmem_cache_destroy(iommu_domain_cache);
3971 domain_error:
3972         iova_cache_put();
3973
3974         return -ENOMEM;
3975 }
3976
3977 static void __init iommu_exit_mempool(void)
3978 {
3979         kmem_cache_destroy(iommu_devinfo_cache);
3980         kmem_cache_destroy(iommu_domain_cache);
3981         iova_cache_put();
3982 }
3983
3984 static void quirk_ioat_snb_local_iommu(struct pci_dev *pdev)
3985 {
3986         struct dmar_drhd_unit *drhd;
3987         u32 vtbar;
3988         int rc;
3989
3990         /* We know that this device on this chipset has its own IOMMU.
3991          * If we find it under a different IOMMU, then the BIOS is lying
3992          * to us. Hope that the IOMMU for this device is actually
3993          * disabled, and it needs no translation...
3994          */
3995         rc = pci_bus_read_config_dword(pdev->bus, PCI_DEVFN(0, 0), 0xb0, &vtbar);
3996         if (rc) {
3997                 /* "can't" happen */
3998                 dev_info(&pdev->dev, "failed to run vt-d quirk\n");
3999                 return;
4000         }
4001         vtbar &= 0xffff0000;
4002
4003         /* We know that this iommu should be at offset 0xa000 from vtbar */
4004         drhd = dmar_find_matched_drhd_unit(pdev);
4005         if (WARN_TAINT_ONCE(!drhd || drhd->reg_base_addr - vtbar != 0xa000,
4006                             TAINT_FIRMWARE_WORKAROUND,
4007                             "BIOS assigned incorrect VT-d unit for Intel(R) QuickData Technology device\n"))
4008                 pdev->dev.archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
4009 }
4010 DECLARE_PCI_FIXUP_ENABLE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IOAT_SNB, quirk_ioat_snb_local_iommu);
4011
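/*
 * Mark DMAR units that do not need remapping as ignored: units whose device
 * scope is empty, and, when dmar_map_gfx is clear, units that cover nothing
 * but graphics devices. Devices behind an ignored unit get the dummy domain
 * info so the DMA layer bypasses translation for them entirely.
 */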
4012 static void __init init_no_remapping_devices(void)
4013 {
4014         struct dmar_drhd_unit *drhd;
4015         struct device *dev;
4016         int i;
4017
4018         for_each_drhd_unit(drhd) {
4019                 if (!drhd->include_all) {
4020                         for_each_active_dev_scope(drhd->devices,
4021                                                   drhd->devices_cnt, i, dev)
4022                                 break;
4023                         /* ignore DMAR unit if no devices exist */
4024                         if (i == drhd->devices_cnt)
4025                                 drhd->ignored = 1;
4026                 }
4027         }
4028
4029         for_each_active_drhd_unit(drhd) {
4030                 if (drhd->include_all)
4031                         continue;
4032
4033                 for_each_active_dev_scope(drhd->devices,
4034                                           drhd->devices_cnt, i, dev)
4035                         if (!dev_is_pci(dev) || !IS_GFX_DEVICE(to_pci_dev(dev)))
4036                                 break;
4037                 if (i < drhd->devices_cnt)
4038                         continue;
4039
4040                 /* This IOMMU has *only* gfx devices. If we are not mapping
4041                    the graphics devices, bypass it by marking it ignored. */
4042                 if (!dmar_map_gfx) {
4043                         drhd->ignored = 1;
4044                         for_each_active_dev_scope(drhd->devices,
4045                                                   drhd->devices_cnt, i, dev)
4046                                 dev->archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
4047                 }
4048         }
4049 }
4050
4051 #ifdef CONFIG_SUSPEND
4052 static int init_iommu_hw(void)
4053 {
4054         struct dmar_drhd_unit *drhd;
4055         struct intel_iommu *iommu = NULL;
4056
4057         for_each_active_iommu(iommu, drhd)
4058                 if (iommu->qi)
4059                         dmar_reenable_qi(iommu);
4060
4061         for_each_iommu(iommu, drhd) {
4062                 if (drhd->ignored) {
4063                         /*
4064                          * we always have to disable PMRs or DMA may fail on
4065                          * this device
4066                          */
4067                         if (force_on)
4068                                 iommu_disable_protect_mem_regions(iommu);
4069                         continue;
4070                 }
4071
4072                 iommu_flush_write_buffer(iommu);
4073
4074                 iommu_set_root_entry(iommu);
4075
4076                 iommu->flush.flush_context(iommu, 0, 0, 0,
4077                                            DMA_CCMD_GLOBAL_INVL);
4078                 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
4079                 iommu_enable_translation(iommu);
4080                 iommu_disable_protect_mem_regions(iommu);
4081         }
4082
4083         return 0;
4084 }
4085
4086 static void iommu_flush_all(void)
4087 {
4088         struct dmar_drhd_unit *drhd;
4089         struct intel_iommu *iommu;
4090
4091         for_each_active_iommu(iommu, drhd) {
4092                 iommu->flush.flush_context(iommu, 0, 0, 0,
4093                                            DMA_CCMD_GLOBAL_INVL);
4094                 iommu->flush.flush_iotlb(iommu, 0, 0, 0,
4095                                          DMA_TLB_GLOBAL_FLUSH);
4096         }
4097 }
4098
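/*
 * On suspend, flush all caches, disable translation, and save the fault
 * event registers (FECTL/FEDATA/FEADDR/FEUADDR) so that iommu_resume() can
 * restore them verbatim.
 */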
4099 static int iommu_suspend(void)
4100 {
4101         struct dmar_drhd_unit *drhd;
4102         struct intel_iommu *iommu = NULL;
4103         unsigned long flag;
4104
4105         for_each_active_iommu(iommu, drhd) {
4106                 iommu->iommu_state = kcalloc(MAX_SR_DMAR_REGS, sizeof(u32),
4107                                                  GFP_ATOMIC);
4108                 if (!iommu->iommu_state)
4109                         goto nomem;
4110         }
4111
4112         iommu_flush_all();
4113
4114         for_each_active_iommu(iommu, drhd) {
4115                 iommu_disable_translation(iommu);
4116
4117                 raw_spin_lock_irqsave(&iommu->register_lock, flag);
4118
4119                 iommu->iommu_state[SR_DMAR_FECTL_REG] =
4120                         readl(iommu->reg + DMAR_FECTL_REG);
4121                 iommu->iommu_state[SR_DMAR_FEDATA_REG] =
4122                         readl(iommu->reg + DMAR_FEDATA_REG);
4123                 iommu->iommu_state[SR_DMAR_FEADDR_REG] =
4124                         readl(iommu->reg + DMAR_FEADDR_REG);
4125                 iommu->iommu_state[SR_DMAR_FEUADDR_REG] =
4126                         readl(iommu->reg + DMAR_FEUADDR_REG);
4127
4128                 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
4129         }
4130         return 0;
4131
4132 nomem:
4133         for_each_active_iommu(iommu, drhd)
4134                 kfree(iommu->iommu_state);
4135
4136         return -ENOMEM;
4137 }
4138
4139 static void iommu_resume(void)
4140 {
4141         struct dmar_drhd_unit *drhd;
4142         struct intel_iommu *iommu = NULL;
4143         unsigned long flag;
4144
4145         if (init_iommu_hw()) {
4146                 if (force_on)
4147                         panic("tboot: IOMMU setup failed, DMAR can not resume!\n");
4148                 else
4149                         WARN(1, "IOMMU setup failed, DMAR can not resume!\n");
4150                 return;
4151         }
4152
4153         for_each_active_iommu(iommu, drhd) {
4154
4155                 raw_spin_lock_irqsave(&iommu->register_lock, flag);
4156
4157                 writel(iommu->iommu_state[SR_DMAR_FECTL_REG],
4158                         iommu->reg + DMAR_FECTL_REG);
4159                 writel(iommu->iommu_state[SR_DMAR_FEDATA_REG],
4160                         iommu->reg + DMAR_FEDATA_REG);
4161                 writel(iommu->iommu_state[SR_DMAR_FEADDR_REG],
4162                         iommu->reg + DMAR_FEADDR_REG);
4163                 writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG],
4164                         iommu->reg + DMAR_FEUADDR_REG);
4165
4166                 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
4167         }
4168
4169         for_each_active_iommu(iommu, drhd)
4170                 kfree(iommu->iommu_state);
4171 }
4172
4173 static struct syscore_ops iommu_syscore_ops = {
4174         .resume         = iommu_resume,
4175         .suspend        = iommu_suspend,
4176 };
4177
4178 static void __init init_iommu_pm_ops(void)
4179 {
4180         register_syscore_ops(&iommu_syscore_ops);
4181 }
4182
4183 #else
4184 static inline void init_iommu_pm_ops(void) {}
4185 #endif  /* CONFIG_SUSPEND */
4186
4187
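/*
 * Parse one ACPI RMRR (Reserved Memory Region Reporting) structure: record
 * the region, register it as a direct-mapped reserved IOMMU region, and
 * remember the device scope it applies to.
 */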
4188 int __init dmar_parse_one_rmrr(struct acpi_dmar_header *header, void *arg)
4189 {
4190         struct acpi_dmar_reserved_memory *rmrr;
4191         int prot = DMA_PTE_READ|DMA_PTE_WRITE;
4192         struct dmar_rmrr_unit *rmrru;
4193         size_t length;
4194
4195         rmrru = kzalloc(sizeof(*rmrru), GFP_KERNEL);
4196         if (!rmrru)
4197                 goto out;
4198
4199         rmrru->hdr = header;
4200         rmrr = (struct acpi_dmar_reserved_memory *)header;
4201         rmrru->base_address = rmrr->base_address;
4202         rmrru->end_address = rmrr->end_address;
4203
4204         length = rmrr->end_address - rmrr->base_address + 1;
4205         rmrru->resv = iommu_alloc_resv_region(rmrr->base_address, length, prot,
4206                                               IOMMU_RESV_DIRECT);
4207         if (!rmrru->resv)
4208                 goto free_rmrru;
4209
4210         rmrru->devices = dmar_alloc_dev_scope((void *)(rmrr + 1),
4211                                 ((void *)rmrr) + rmrr->header.length,
4212                                 &rmrru->devices_cnt);
4213         if (rmrru->devices_cnt && rmrru->devices == NULL)
4214                 goto free_all;
4215
4216         list_add(&rmrru->list, &dmar_rmrr_units);
4217
4218         return 0;
4219 free_all:
4220         kfree(rmrru->resv);
4221 free_rmrru:
4222         kfree(rmrru);
4223 out:
4224         return -ENOMEM;
4225 }
4226
4227 static struct dmar_atsr_unit *dmar_find_atsr(struct acpi_dmar_atsr *atsr)
4228 {
4229         struct dmar_atsr_unit *atsru;
4230         struct acpi_dmar_atsr *tmp;
4231
4232         list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) {
4233                 tmp = (struct acpi_dmar_atsr *)atsru->hdr;
4234                 if (atsr->segment != tmp->segment)
4235                         continue;
4236                 if (atsr->header.length != tmp->header.length)
4237                         continue;
4238                 if (memcmp(atsr, tmp, atsr->header.length) == 0)
4239                         return atsru;
4240         }
4241
4242         return NULL;
4243 }
4244
4245 int dmar_parse_one_atsr(struct acpi_dmar_header *hdr, void *arg)
4246 {
4247         struct acpi_dmar_atsr *atsr;
4248         struct dmar_atsr_unit *atsru;
4249
4250         if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled)
4251                 return 0;
4252
4253         atsr = container_of(hdr, struct acpi_dmar_atsr, header);
4254         atsru = dmar_find_atsr(atsr);
4255         if (atsru)
4256                 return 0;
4257
4258         atsru = kzalloc(sizeof(*atsru) + hdr->length, GFP_KERNEL);
4259         if (!atsru)
4260                 return -ENOMEM;
4261
4262         /*
4263          * If memory is allocated from slab by ACPI _DSM method, we need to
4264          * copy the memory content because the memory buffer will be freed
4265          * on return.
4266          */
4267         atsru->hdr = (void *)(atsru + 1);
4268         memcpy(atsru->hdr, hdr, hdr->length);
4269         atsru->include_all = atsr->flags & 0x1;
4270         if (!atsru->include_all) {
4271                 atsru->devices = dmar_alloc_dev_scope((void *)(atsr + 1),
4272                                 (void *)atsr + atsr->header.length,
4273                                 &atsru->devices_cnt);
4274                 if (atsru->devices_cnt && atsru->devices == NULL) {
4275                         kfree(atsru);
4276                         return -ENOMEM;
4277                 }
4278         }
4279
4280         list_add_rcu(&atsru->list, &dmar_atsr_units);
4281
4282         return 0;
4283 }
4284
4285 static void intel_iommu_free_atsr(struct dmar_atsr_unit *atsru)
4286 {
4287         dmar_free_dev_scope(&atsru->devices, &atsru->devices_cnt);
4288         kfree(atsru);
4289 }
4290
4291 int dmar_release_one_atsr(struct acpi_dmar_header *hdr, void *arg)
4292 {
4293         struct acpi_dmar_atsr *atsr;
4294         struct dmar_atsr_unit *atsru;
4295
4296         atsr = container_of(hdr, struct acpi_dmar_atsr, header);
4297         atsru = dmar_find_atsr(atsr);
4298         if (atsru) {
4299                 list_del_rcu(&atsru->list);
4300                 synchronize_rcu();
4301                 intel_iommu_free_atsr(atsru);
4302         }
4303
4304         return 0;
4305 }
4306
4307 int dmar_check_one_atsr(struct acpi_dmar_header *hdr, void *arg)
4308 {
4309         int i;
4310         struct device *dev;
4311         struct acpi_dmar_atsr *atsr;
4312         struct dmar_atsr_unit *atsru;
4313
4314         atsr = container_of(hdr, struct acpi_dmar_atsr, header);
4315         atsru = dmar_find_atsr(atsr);
4316         if (!atsru)
4317                 return 0;
4318
4319         if (!atsru->include_all && atsru->devices && atsru->devices_cnt) {
4320                 for_each_active_dev_scope(atsru->devices, atsru->devices_cnt,
4321                                           i, dev)
4322                         return -EBUSY;
4323         }
4324
4325         return 0;
4326 }
4327
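/*
 * Bring a hot-added DMAR unit on line: check that it supports the features
 * already relied upon system-wide (pass-through, snooping, large pages),
 * then initialize domains, the root entry, invalidation and interrupts,
 * and finally enable translation.
 */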
4328 static int intel_iommu_add(struct dmar_drhd_unit *dmaru)
4329 {
4330         int sp, ret;
4331         struct intel_iommu *iommu = dmaru->iommu;
4332
4333         if (g_iommus[iommu->seq_id])
4334                 return 0;
4335
4336         if (hw_pass_through && !ecap_pass_through(iommu->ecap)) {
4337                 pr_warn("%s: Doesn't support hardware pass through.\n",
4338                         iommu->name);
4339                 return -ENXIO;
4340         }
4341         if (!ecap_sc_support(iommu->ecap) &&
4342             domain_update_iommu_snooping(iommu)) {
4343                 pr_warn("%s: Doesn't support snooping.\n",
4344                         iommu->name);
4345                 return -ENXIO;
4346         }
4347         sp = domain_update_iommu_superpage(iommu) - 1;
4348         if (sp >= 0 && !(cap_super_page_val(iommu->cap) & (1 << sp))) {
4349                 pr_warn("%s: Doesn't support large page.\n",
4350                         iommu->name);
4351                 return -ENXIO;
4352         }
4353
4354         /*
4355          * Disable translation if already enabled prior to OS handover.
4356          */
4357         if (iommu->gcmd & DMA_GCMD_TE)
4358                 iommu_disable_translation(iommu);
4359
4360         g_iommus[iommu->seq_id] = iommu;
4361         ret = iommu_init_domains(iommu);
4362         if (ret == 0)
4363                 ret = iommu_alloc_root_entry(iommu);
4364         if (ret)
4365                 goto out;
4366
4367 #ifdef CONFIG_INTEL_IOMMU_SVM
4368         if (pasid_supported(iommu))
4369                 intel_svm_init(iommu);
4370 #endif
4371
4372         if (dmaru->ignored) {
4373                 /*
4374                  * we always have to disable PMRs or DMA may fail on this device
4375                  */
4376                 if (force_on)
4377                         iommu_disable_protect_mem_regions(iommu);
4378                 return 0;
4379         }
4380
4381         intel_iommu_init_qi(iommu);
4382         iommu_flush_write_buffer(iommu);
4383
4384 #ifdef CONFIG_INTEL_IOMMU_SVM
4385         if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
4386                 ret = intel_svm_enable_prq(iommu);
4387                 if (ret)
4388                         goto disable_iommu;
4389         }
4390 #endif
4391         ret = dmar_set_interrupt(iommu);
4392         if (ret)
4393                 goto disable_iommu;
4394
4395         iommu_set_root_entry(iommu);
4396         iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
4397         iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
4398         iommu_enable_translation(iommu);
4399
4400         iommu_disable_protect_mem_regions(iommu);
4401         return 0;
4402
4403 disable_iommu:
4404         disable_dmar_iommu(iommu);
4405 out:
4406         free_dmar_iommu(iommu);
4407         return ret;
4408 }
4409
4410 int dmar_iommu_hotplug(struct dmar_drhd_unit *dmaru, bool insert)
4411 {
4412         int ret = 0;
4413         struct intel_iommu *iommu = dmaru->iommu;
4414
4415         if (!intel_iommu_enabled)
4416                 return 0;
4417         if (iommu == NULL)
4418                 return -EINVAL;
4419
4420         if (insert) {
4421                 ret = intel_iommu_add(dmaru);
4422         } else {
4423                 disable_dmar_iommu(iommu);
4424                 free_dmar_iommu(iommu);
4425         }
4426
4427         return ret;
4428 }
4429
4430 static void intel_iommu_free_dmars(void)
4431 {
4432         struct dmar_rmrr_unit *rmrru, *rmrr_n;
4433         struct dmar_atsr_unit *atsru, *atsr_n;
4434
4435         list_for_each_entry_safe(rmrru, rmrr_n, &dmar_rmrr_units, list) {
4436                 list_del(&rmrru->list);
4437                 dmar_free_dev_scope(&rmrru->devices, &rmrru->devices_cnt);
4438                 kfree(rmrru->resv);
4439                 kfree(rmrru);
4440         }
4441
4442         list_for_each_entry_safe(atsru, atsr_n, &dmar_atsr_units, list) {
4443                 list_del(&atsru->list);
4444                 intel_iommu_free_atsr(atsru);
4445         }
4446 }
4447
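/*
 * Decide whether ATS may be used for @dev by walking up to its root port:
 * integrated (root-bus) devices are always allowed, anything reached over
 * conventional PCI is not, and for the rest the root port must be listed in
 * (or covered by an include-all) ATSR structure for the segment.
 */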
4448 int dmar_find_matched_atsr_unit(struct pci_dev *dev)
4449 {
4450         int i, ret = 1;
4451         struct pci_bus *bus;
4452         struct pci_dev *bridge = NULL;
4453         struct device *tmp;
4454         struct acpi_dmar_atsr *atsr;
4455         struct dmar_atsr_unit *atsru;
4456
4457         dev = pci_physfn(dev);
4458         for (bus = dev->bus; bus; bus = bus->parent) {
4459                 bridge = bus->self;
4460                 /* If it's an integrated device, allow ATS */
4461                 if (!bridge)
4462                         return 1;
4463                 /* Connected via non-PCIe: no ATS */
4464                 if (!pci_is_pcie(bridge) ||
4465                     pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE)
4466                         return 0;
4467                 /* If we found the root port, look it up in the ATSR */
4468                 if (pci_pcie_type(bridge) == PCI_EXP_TYPE_ROOT_PORT)
4469                         break;
4470         }
4471
4472         rcu_read_lock();
4473         list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) {
4474                 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
4475                 if (atsr->segment != pci_domain_nr(dev->bus))
4476                         continue;
4477
4478                 for_each_dev_scope(atsru->devices, atsru->devices_cnt, i, tmp)
4479                         if (tmp == &bridge->dev)
4480                                 goto out;
4481
4482                 if (atsru->include_all)
4483                         goto out;
4484         }
4485         ret = 0;
4486 out:
4487         rcu_read_unlock();
4488
4489         return ret;
4490 }
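
/*
 * Illustrative sketch, not part of this driver: a caller could use the ATSR
 * lookup above to decide whether to enable ATS on an endpoint. The helper
 * name vtd_maybe_enable_ats() is hypothetical; pci_enable_ats() is the
 * generic PCI ATS helper.
 *
 *      static void vtd_maybe_enable_ats(struct pci_dev *pdev)
 *      {
 *              if (dmar_find_matched_atsr_unit(pdev) &&
 *                  !pci_enable_ats(pdev, VTD_PAGE_SHIFT))
 *                      pci_info(pdev, "ATS enabled\n");
 *      }
 */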
4491
4492 int dmar_iommu_notify_scope_dev(struct dmar_pci_notify_info *info)
4493 {
4494         int ret;
4495         struct dmar_rmrr_unit *rmrru;
4496         struct dmar_atsr_unit *atsru;
4497         struct acpi_dmar_atsr *atsr;
4498         struct acpi_dmar_reserved_memory *rmrr;
4499
4500         if (!intel_iommu_enabled && system_state >= SYSTEM_RUNNING)
4501                 return 0;
4502
4503         list_for_each_entry(rmrru, &dmar_rmrr_units, list) {
4504                 rmrr = container_of(rmrru->hdr,
4505                                     struct acpi_dmar_reserved_memory, header);
4506                 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
4507                         ret = dmar_insert_dev_scope(info, (void *)(rmrr + 1),
4508                                 ((void *)rmrr) + rmrr->header.length,
4509                                 rmrr->segment, rmrru->devices,
4510                                 rmrru->devices_cnt);
4511                         if (ret < 0)
4512                                 return ret;
4513                 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
4514                         dmar_remove_dev_scope(info, rmrr->segment,
4515                                 rmrru->devices, rmrru->devices_cnt);
4516                 }
4517         }
4518
4519         list_for_each_entry(atsru, &dmar_atsr_units, list) {
4520                 if (atsru->include_all)
4521                         continue;
4522
4523                 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
4524                 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
4525                         ret = dmar_insert_dev_scope(info, (void *)(atsr + 1),
4526                                         (void *)atsr + atsr->header.length,
4527                                         atsr->segment, atsru->devices,
4528                                         atsru->devices_cnt);
4529                         if (ret > 0)
4530                                 break;
4531                         else if (ret < 0)
4532                                 return ret;
4533                 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
4534                         if (dmar_remove_dev_scope(info, atsr->segment,
4535                                         atsru->devices, atsru->devices_cnt))
4536                                 break;
4537                 }
4538         }
4539
4540         return 0;
4541 }
4542
4543 /*
4544  * Here we only respond to a device being unbound from its driver.
4545  *
4546  * A newly added device is not attached to its DMAR domain here yet. That
4547  * will happen when the device is mapped to an IOVA.
4548  */
4549 static int device_notifier(struct notifier_block *nb,
4550                                   unsigned long action, void *data)
4551 {
4552         struct device *dev = data;
4553         struct dmar_domain *domain;
4554
4555         if (iommu_dummy(dev))
4556                 return 0;
4557
4558         if (action == BUS_NOTIFY_REMOVED_DEVICE) {
4559                 domain = find_domain(dev);
4560                 if (!domain)
4561                         return 0;
4562
4563                 dmar_remove_one_dev_info(dev);
4564         } else if (action == BUS_NOTIFY_ADD_DEVICE) {
4565                 if (iommu_should_identity_map(dev, 1))
4566                         domain_add_dev_info(si_domain, dev);
4567         }
4568
4569         return 0;
4570 }
4571
4572 static struct notifier_block device_nb = {
4573         .notifier_call = device_notifier,
4574 };
4575
4576 static int intel_iommu_memory_notifier(struct notifier_block *nb,
4577                                        unsigned long val, void *v)
4578 {
4579         struct memory_notify *mhp = v;
4580         unsigned long long start, end;
4581         unsigned long start_vpfn, last_vpfn;
4582
4583         switch (val) {
4584         case MEM_GOING_ONLINE:
4585                 start = mhp->start_pfn << PAGE_SHIFT;
4586                 end = ((mhp->start_pfn + mhp->nr_pages) << PAGE_SHIFT) - 1;
4587                 if (iommu_domain_identity_map(si_domain, start, end)) {
4588                         pr_warn("Failed to build identity map for [%llx-%llx]\n",
4589                                 start, end);
4590                         return NOTIFY_BAD;
4591                 }
4592                 break;
4593
4594         case MEM_OFFLINE:
4595         case MEM_CANCEL_ONLINE:
4596                 start_vpfn = mm_to_dma_pfn(mhp->start_pfn);
4597                 last_vpfn = mm_to_dma_pfn(mhp->start_pfn + mhp->nr_pages - 1);
4598                 while (start_vpfn <= last_vpfn) {
4599                         struct iova *iova;
4600                         struct dmar_drhd_unit *drhd;
4601                         struct intel_iommu *iommu;
4602                         struct page *freelist;
4603
4604                         iova = find_iova(&si_domain->iovad, start_vpfn);
4605                         if (iova == NULL) {
4606                                 pr_debug("Failed to get IOVA for PFN %lx\n",
4607                                          start_vpfn);
4608                                 break;
4609                         }
4610
4611                         iova = split_and_remove_iova(&si_domain->iovad, iova,
4612                                                      start_vpfn, last_vpfn);
4613                         if (iova == NULL) {
4614                                 pr_warn("Failed to split IOVA PFN [%lx-%lx]\n",
4615                                         start_vpfn, last_vpfn);
4616                                 return NOTIFY_BAD;
4617                         }
4618
4619                         freelist = domain_unmap(si_domain, iova->pfn_lo,
4620                                                iova->pfn_hi);
4621
4622                         rcu_read_lock();
4623                         for_each_active_iommu(iommu, drhd)
4624                                 iommu_flush_iotlb_psi(iommu, si_domain,
4625                                         iova->pfn_lo, iova_size(iova),
4626                                         !freelist, 0);
4627                         rcu_read_unlock();
4628                         dma_free_pagelist(freelist);
4629
4630                         start_vpfn = iova->pfn_hi + 1;
4631                         free_iova_mem(iova);
4632                 }
4633                 break;
4634         }
4635
4636         return NOTIFY_OK;
4637 }
4638
4639 static struct notifier_block intel_iommu_memory_nb = {
4640         .notifier_call = intel_iommu_memory_notifier,
4641         .priority = 0
4642 };
4643
4644 static void free_all_cpu_cached_iovas(unsigned int cpu)
4645 {
4646         int i;
4647
4648         for (i = 0; i < g_num_of_iommus; i++) {
4649                 struct intel_iommu *iommu = g_iommus[i];
4650                 struct dmar_domain *domain;
4651                 int did;
4652
4653                 if (!iommu)
4654                         continue;
4655
4656                 for (did = 0; did < cap_ndoms(iommu->cap); did++) {
4657                         domain = get_iommu_domain(iommu, (u16)did);
4658
4659                         if (!domain)
4660                                 continue;
4661                         free_cpu_cached_iovas(cpu, &domain->iovad);
4662                 }
4663         }
4664 }
4665
4666 static int intel_iommu_cpu_dead(unsigned int cpu)
4667 {
4668         free_all_cpu_cached_iovas(cpu);
4669         return 0;
4670 }
4671
4672 static void intel_disable_iommus(void)
4673 {
4674         struct intel_iommu *iommu = NULL;
4675         struct dmar_drhd_unit *drhd;
4676
4677         for_each_iommu(iommu, drhd)
4678                 iommu_disable_translation(iommu);
4679 }
4680
4681 static inline struct intel_iommu *dev_to_intel_iommu(struct device *dev)
4682 {
4683         struct iommu_device *iommu_dev = dev_to_iommu_device(dev);
4684
4685         return container_of(iommu_dev, struct intel_iommu, iommu);
4686 }
4687
4688 static ssize_t intel_iommu_show_version(struct device *dev,
4689                                         struct device_attribute *attr,
4690                                         char *buf)
4691 {
4692         struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4693         u32 ver = readl(iommu->reg + DMAR_VER_REG);
4694         return sprintf(buf, "%d:%d\n",
4695                        DMAR_VER_MAJOR(ver), DMAR_VER_MINOR(ver));
4696 }
4697 static DEVICE_ATTR(version, S_IRUGO, intel_iommu_show_version, NULL);
4698
4699 static ssize_t intel_iommu_show_address(struct device *dev,
4700                                         struct device_attribute *attr,
4701                                         char *buf)
4702 {
4703         struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4704         return sprintf(buf, "%llx\n", iommu->reg_phys);
4705 }
4706 static DEVICE_ATTR(address, S_IRUGO, intel_iommu_show_address, NULL);
4707
4708 static ssize_t intel_iommu_show_cap(struct device *dev,
4709                                     struct device_attribute *attr,
4710                                     char *buf)
4711 {
4712         struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4713         return sprintf(buf, "%llx\n", iommu->cap);
4714 }
4715 static DEVICE_ATTR(cap, S_IRUGO, intel_iommu_show_cap, NULL);
4716
4717 static ssize_t intel_iommu_show_ecap(struct device *dev,
4718                                     struct device_attribute *attr,
4719                                     char *buf)
4720 {
4721         struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4722         return sprintf(buf, "%llx\n", iommu->ecap);
4723 }
4724 static DEVICE_ATTR(ecap, S_IRUGO, intel_iommu_show_ecap, NULL);
4725
4726 static ssize_t intel_iommu_show_ndoms(struct device *dev,
4727                                       struct device_attribute *attr,
4728                                       char *buf)
4729 {
4730         struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4731         return sprintf(buf, "%ld\n", cap_ndoms(iommu->cap));
4732 }
4733 static DEVICE_ATTR(domains_supported, S_IRUGO, intel_iommu_show_ndoms, NULL);
4734
4735 static ssize_t intel_iommu_show_ndoms_used(struct device *dev,
4736                                            struct device_attribute *attr,
4737                                            char *buf)
4738 {
4739         struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4740         return sprintf(buf, "%d\n", bitmap_weight(iommu->domain_ids,
4741                                                   cap_ndoms(iommu->cap)));
4742 }
4743 static DEVICE_ATTR(domains_used, S_IRUGO, intel_iommu_show_ndoms_used, NULL);
4744
4745 static struct attribute *intel_iommu_attrs[] = {
4746         &dev_attr_version.attr,
4747         &dev_attr_address.attr,
4748         &dev_attr_cap.attr,
4749         &dev_attr_ecap.attr,
4750         &dev_attr_domains_supported.attr,
4751         &dev_attr_domains_used.attr,
4752         NULL,
4753 };
4754
4755 static struct attribute_group intel_iommu_group = {
4756         .name = "intel-iommu",
4757         .attrs = intel_iommu_attrs,
4758 };
4759
4760 const struct attribute_group *intel_iommu_groups[] = {
4761         &intel_iommu_group,
4762         NULL,
4763 };
4764
4765 static int __init platform_optin_force_iommu(void)
4766 {
4767         struct pci_dev *pdev = NULL;
4768         bool has_untrusted_dev = false;
4769
4770         if (!dmar_platform_optin() || no_platform_optin)
4771                 return 0;
4772
4773         for_each_pci_dev(pdev) {
4774                 if (pdev->untrusted) {
4775                         has_untrusted_dev = true;
4776                         break;
4777                 }
4778         }
4779
4780         if (!has_untrusted_dev)
4781                 return 0;
4782
4783         if (no_iommu || dmar_disabled)
4784                 pr_info("Intel-IOMMU force enabled due to platform opt in\n");
4785
4786         /*
4787          * If the Intel IOMMU is disabled by default, apply the identity
4788          * map to all devices except those marked as untrusted.
4789          */
4790         if (dmar_disabled)
4791                 iommu_identity_mapping |= IDENTMAP_ALL;
4792
4793         dmar_disabled = 0;
4794 #if defined(CONFIG_X86) && defined(CONFIG_SWIOTLB)
4795         swiotlb = 0;
4796 #endif
4797         no_iommu = 0;
4798
4799         return 1;
4800 }
4801
4802 int __init intel_iommu_init(void)
4803 {
4804         int ret = -ENODEV;
4805         struct dmar_drhd_unit *drhd;
4806         struct intel_iommu *iommu;
4807
4808         /*
4809          * Intel IOMMU is required for a TXT/tboot launch or platform
4810          * opt in, so enforce that.
4811          */
4812         force_on = tboot_force_iommu() || platform_optin_force_iommu();
4813
4814         if (iommu_init_mempool()) {
4815                 if (force_on)
4816                         panic("tboot: Failed to initialize iommu memory\n");
4817                 return -ENOMEM;
4818         }
4819
4820         down_write(&dmar_global_lock);
4821         if (dmar_table_init()) {
4822                 if (force_on)
4823                         panic("tboot: Failed to initialize DMAR table\n");
4824                 goto out_free_dmar;
4825         }
4826
4827         if (dmar_dev_scope_init() < 0) {
4828                 if (force_on)
4829                         panic("tboot: Failed to initialize DMAR device scope\n");
4830                 goto out_free_dmar;
4831         }
4832
4833         up_write(&dmar_global_lock);
4834
4835         /*
4836          * The bus notifier takes the dmar_global_lock, so lockdep will
4837          * complain later when we register it under the lock.
4838          */
4839         dmar_register_bus_notifier();
4840
4841         down_write(&dmar_global_lock);
4842
4843         if (no_iommu || dmar_disabled) {
4844                 /*
4845                  * We exit the function here to ensure the IOMMU's remapping and
4846                  * mempool aren't set up, which means that the IOMMU's PMRs
4847                  * won't be disabled via the call to init_dmars(). So disable
4848                  * them explicitly here. The PMRs were set up by tboot prior to
4849                  * calling SENTER, but the kernel is expected to reset/tear
4850                  * down the PMRs.
4851                  */
4852                 if (intel_iommu_tboot_noforce) {
4853                         for_each_iommu(iommu, drhd)
4854                                 iommu_disable_protect_mem_regions(iommu);
4855                 }
4856
4857                 /*
4858                  * Make sure the IOMMUs are switched off, even when we
4859                  * boot into a kexec kernel and the previous kernel left
4860                  * them enabled
4861                  */
4862                 intel_disable_iommus();
4863                 goto out_free_dmar;
4864         }
4865
4866         if (list_empty(&dmar_rmrr_units))
4867                 pr_info("No RMRR found\n");
4868
4869         if (list_empty(&dmar_atsr_units))
4870                 pr_info("No ATSR found\n");
4871
4872         if (dmar_init_reserved_ranges()) {
4873                 if (force_on)
4874                         panic("tboot: Failed to reserve iommu ranges\n");
4875                 goto out_free_reserved_range;
4876         }
4877
4878         if (dmar_map_gfx)
4879                 intel_iommu_gfx_mapped = 1;
4880
4881         init_no_remapping_devices();
4882
4883         ret = init_dmars();
4884         if (ret) {
4885                 if (force_on)
4886                         panic("tboot: Failed to initialize DMARs\n");
4887                 pr_err("Initialization failed\n");
4888                 goto out_free_reserved_range;
4889         }
4890         up_write(&dmar_global_lock);
4891
4892 #if defined(CONFIG_X86) && defined(CONFIG_SWIOTLB)
4893         swiotlb = 0;
4894 #endif
4895         dma_ops = &intel_dma_ops;
4896
4897         init_iommu_pm_ops();
4898
4899         for_each_active_iommu(iommu, drhd) {
4900                 iommu_device_sysfs_add(&iommu->iommu, NULL,
4901                                        intel_iommu_groups,
4902                                        "%s", iommu->name);
4903                 iommu_device_set_ops(&iommu->iommu, &intel_iommu_ops);
4904                 iommu_device_register(&iommu->iommu);
4905         }
4906
4907         bus_set_iommu(&pci_bus_type, &intel_iommu_ops);
4908         bus_register_notifier(&pci_bus_type, &device_nb);
4909         if (si_domain && !hw_pass_through)
4910                 register_memory_notifier(&intel_iommu_memory_nb);
4911         cpuhp_setup_state(CPUHP_IOMMU_INTEL_DEAD, "iommu/intel:dead", NULL,
4912                           intel_iommu_cpu_dead);
4913
4914         /* Finally, we enable the DMA remapping hardware. */
4915         for_each_iommu(iommu, drhd) {
4916                 if (!translation_pre_enabled(iommu))
4917                         iommu_enable_translation(iommu);
4918
4919                 iommu_disable_protect_mem_regions(iommu);
4920         }
4921         pr_info("Intel(R) Virtualization Technology for Directed I/O\n");
4922
4923         intel_iommu_enabled = 1;
4924         intel_iommu_debugfs_init();
4925
4926         return 0;
4927
4928 out_free_reserved_range:
4929         put_iova_domain(&reserved_iova_list);
4930 out_free_dmar:
4931         intel_iommu_free_dmars();
4932         up_write(&dmar_global_lock);
4933         iommu_exit_mempool();
4934         return ret;
4935 }
4936
4937 static int domain_context_clear_one_cb(struct pci_dev *pdev, u16 alias, void *opaque)
4938 {
4939         struct intel_iommu *iommu = opaque;
4940
4941         domain_context_clear_one(iommu, PCI_BUS_NUM(alias), alias & 0xff);
4942         return 0;
4943 }
4944
4945 /*
4946  * NB - intel-iommu lacks any sort of reference counting for the users of
4947  * dependent devices.  If multiple endpoints have intersecting dependent
4948  * devices, unbinding the driver from any one of them will possibly leave
4949  * the others unable to operate.
4950  */
4951 static void domain_context_clear(struct intel_iommu *iommu, struct device *dev)
4952 {
4953         if (!iommu || !dev || !dev_is_pci(dev))
4954                 return;
4955
4956         pci_for_each_dma_alias(to_pci_dev(dev), &domain_context_clear_one_cb, iommu);
4957 }
4958
4959 static void __dmar_remove_one_dev_info(struct device_domain_info *info)
4960 {
4961         struct intel_iommu *iommu;
4962         unsigned long flags;
4963
4964         assert_spin_locked(&device_domain_lock);
4965
4966         if (WARN_ON(!info))
4967                 return;
4968
4969         iommu = info->iommu;
4970
4971         if (info->dev) {
4972                 if (dev_is_pci(info->dev) && sm_supported(iommu))
4973                         intel_pasid_tear_down_entry(iommu, info->dev,
4974                                         PASID_RID2PASID);
4975
4976                 iommu_disable_dev_iotlb(info);
4977                 domain_context_clear(iommu, info->dev);
4978                 intel_pasid_free_table(info->dev);
4979         }
4980
4981         unlink_domain_info(info);
4982
4983         spin_lock_irqsave(&iommu->lock, flags);
4984         domain_detach_iommu(info->domain, iommu);
4985         spin_unlock_irqrestore(&iommu->lock, flags);
4986
4987         free_devinfo_mem(info);
4988 }
4989
4990 static void dmar_remove_one_dev_info(struct device *dev)
4991 {
4992         struct device_domain_info *info;
4993         unsigned long flags;
4994
4995         spin_lock_irqsave(&device_domain_lock, flags);
4996         info = dev->archdata.iommu;
4997         __dmar_remove_one_dev_info(info);
4998         spin_unlock_irqrestore(&device_domain_lock, flags);
4999 }
5000
5001 static int md_domain_init(struct dmar_domain *domain, int guest_width)
5002 {
5003         int adjust_width;
5004
5005         init_iova_domain(&domain->iovad, VTD_PAGE_SIZE, IOVA_START_PFN);
5006         domain_reserve_special_ranges(domain);
5007
5008         /* calculate AGAW */
5009         domain->gaw = guest_width;
5010         adjust_width = guestwidth_to_adjustwidth(guest_width);
5011         domain->agaw = width_to_agaw(adjust_width);
5012
5013         domain->iommu_coherency = 0;
5014         domain->iommu_snooping = 0;
5015         domain->iommu_superpage = 0;
5016         domain->max_addr = 0;
5017
5018         /* always allocate the top pgd */
5019         domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
5020         if (!domain->pgd)
5021                 return -ENOMEM;
5022         domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
5023         return 0;
5024 }
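
/*
 * Worked example of the AGAW arithmetic above (illustrative, assuming the
 * width helpers round (width - 30) up to a multiple of LEVEL_STRIDE): with
 * the default guest width of DEFAULT_DOMAIN_ADDRESS_WIDTH (57 bits),
 * guestwidth_to_adjustwidth(57) returns 57 and width_to_agaw(57) gives
 * DIV_ROUND_UP(57 - 30, 9) = 3, i.e. a 5-level page table; a 48-bit guest
 * width would yield agaw 2 and a 4-level table.
 */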
5025
5026 static struct iommu_domain *intel_iommu_domain_alloc(unsigned type)
5027 {
5028         struct dmar_domain *dmar_domain;
5029         struct iommu_domain *domain;
5030
5031         switch (type) {
5032         case IOMMU_DOMAIN_DMA:
5033         /* fallthrough */
5034         case IOMMU_DOMAIN_UNMANAGED:
5035                 dmar_domain = alloc_domain(0);
5036                 if (!dmar_domain) {
5037                         pr_err("Can't allocate dmar_domain\n");
5038                         return NULL;
5039                 }
5040                 if (md_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
5041                         pr_err("Domain initialization failed\n");
5042                         domain_exit(dmar_domain);
5043                         return NULL;
5044                 }
5045
5046                 if (type == IOMMU_DOMAIN_DMA &&
5047                     init_iova_flush_queue(&dmar_domain->iovad,
5048                                           iommu_flush_iova, iova_entry_free)) {
5049                         pr_warn("iova flush queue initialization failed\n");
5050                         intel_iommu_strict = 1;
5051                 }
5052
5053                 domain_update_iommu_cap(dmar_domain);
5054
5055                 domain = &dmar_domain->domain;
5056                 domain->geometry.aperture_start = 0;
5057                 domain->geometry.aperture_end   =
5058                                 __DOMAIN_MAX_ADDR(dmar_domain->gaw);
5059                 domain->geometry.force_aperture = true;
5060
5061                 return domain;
5062         case IOMMU_DOMAIN_IDENTITY:
5063                 return &si_domain->domain;
5064         default:
5065                 return NULL;
5066         }
5067
5068         return NULL;
5069 }
5070
5071 static void intel_iommu_domain_free(struct iommu_domain *domain)
5072 {
5073         if (domain != &si_domain->domain)
5074                 domain_exit(to_dmar_domain(domain));
5075 }
5076
5077 /*
5078  * Check whether a @domain could be attached to the @dev through the
5079  * aux-domain attach/detach APIs.
5080  */
5081 static inline bool
5082 is_aux_domain(struct device *dev, struct iommu_domain *domain)
5083 {
5084         struct device_domain_info *info = dev->archdata.iommu;
5085
5086         return info && info->auxd_enabled &&
5087                         domain->type == IOMMU_DOMAIN_UNMANAGED;
5088 }
5089
5090 static void auxiliary_link_device(struct dmar_domain *domain,
5091                                   struct device *dev)
5092 {
5093         struct device_domain_info *info = dev->archdata.iommu;
5094
5095         assert_spin_locked(&device_domain_lock);
5096         if (WARN_ON(!info))
5097                 return;
5098
5099         domain->auxd_refcnt++;
5100         list_add(&domain->auxd, &info->auxiliary_domains);
5101 }
5102
5103 static void auxiliary_unlink_device(struct dmar_domain *domain,
5104                                     struct device *dev)
5105 {
5106         struct device_domain_info *info = dev->archdata.iommu;
5107
5108         assert_spin_locked(&device_domain_lock);
5109         if (WARN_ON(!info))
5110                 return;
5111
5112         list_del(&domain->auxd);
5113         domain->auxd_refcnt--;
5114
5115         if (!domain->auxd_refcnt && domain->default_pasid > 0)
5116                 intel_pasid_free_id(domain->default_pasid);
5117 }
5118
5119 static int aux_domain_add_dev(struct dmar_domain *domain,
5120                               struct device *dev)
5121 {
5122         int ret;
5123         u8 bus, devfn;
5124         unsigned long flags;
5125         struct intel_iommu *iommu;
5126
5127         iommu = device_to_iommu(dev, &bus, &devfn);
5128         if (!iommu)
5129                 return -ENODEV;
5130
5131         if (domain->default_pasid <= 0) {
5132                 int pasid;
5133
5134                 pasid = intel_pasid_alloc_id(domain, PASID_MIN,
5135                                              pci_max_pasids(to_pci_dev(dev)),
5136                                              GFP_KERNEL);
5137                 if (pasid <= 0) {
5138                         pr_err("Can't allocate default pasid\n");
5139                         return -ENODEV;
5140                 }
5141                 domain->default_pasid = pasid;
5142         }
5143
5144         spin_lock_irqsave(&device_domain_lock, flags);
5145         /*
5146          * iommu->lock must be held to attach the domain to the iommu and to
5147          * set up the PASID entry for second-level translation.
5148          */
5149         spin_lock(&iommu->lock);
5150         ret = domain_attach_iommu(domain, iommu);
5151         if (ret)
5152                 goto attach_failed;
5153
5154         /* Set up the PASID entry for mediated devices: */
5155         ret = intel_pasid_setup_second_level(iommu, domain, dev,
5156                                              domain->default_pasid);
5157         if (ret)
5158                 goto table_failed;
5159         spin_unlock(&iommu->lock);
5160
5161         auxiliary_link_device(domain, dev);
5162
5163         spin_unlock_irqrestore(&device_domain_lock, flags);
5164
5165         return 0;
5166
5167 table_failed:
5168         domain_detach_iommu(domain, iommu);
5169 attach_failed:
5170         spin_unlock(&iommu->lock);
5171         spin_unlock_irqrestore(&device_domain_lock, flags);
5172         if (!domain->auxd_refcnt && domain->default_pasid > 0)
5173                 intel_pasid_free_id(domain->default_pasid);
5174
5175         return ret;
5176 }
5177
5178 static void aux_domain_remove_dev(struct dmar_domain *domain,
5179                                   struct device *dev)
5180 {
5181         struct device_domain_info *info;
5182         struct intel_iommu *iommu;
5183         unsigned long flags;
5184
5185         if (!is_aux_domain(dev, &domain->domain))
5186                 return;
5187
5188         spin_lock_irqsave(&device_domain_lock, flags);
5189         info = dev->archdata.iommu;
5190         iommu = info->iommu;
5191
5192         auxiliary_unlink_device(domain, dev);
5193
5194         spin_lock(&iommu->lock);
5195         intel_pasid_tear_down_entry(iommu, dev, domain->default_pasid);
5196         domain_detach_iommu(domain, iommu);
5197         spin_unlock(&iommu->lock);
5198
5199         spin_unlock_irqrestore(&device_domain_lock, flags);
5200 }
5201
5202 static int prepare_domain_attach_device(struct iommu_domain *domain,
5203                                         struct device *dev)
5204 {
5205         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5206         struct intel_iommu *iommu;
5207         int addr_width;
5208         u8 bus, devfn;
5209
5210         iommu = device_to_iommu(dev, &bus, &devfn);
5211         if (!iommu)
5212                 return -ENODEV;
5213
5214         /* check if this iommu agaw is sufficient for max mapped address */
5215         addr_width = agaw_to_width(iommu->agaw);
5216         if (addr_width > cap_mgaw(iommu->cap))
5217                 addr_width = cap_mgaw(iommu->cap);
5218
5219         if (dmar_domain->max_addr > (1LL << addr_width)) {
5220                 dev_err(dev, "%s: iommu width (%d) is not "
5221                         "sufficient for the mapped address (%llx)\n",
5222                         __func__, addr_width, dmar_domain->max_addr);
5223                 return -EFAULT;
5224         }
5225         dmar_domain->gaw = addr_width;
5226
5227         /*
5228          * Knock out extra levels of page tables if necessary
5229          */
5230         while (iommu->agaw < dmar_domain->agaw) {
5231                 struct dma_pte *pte;
5232
5233                 pte = dmar_domain->pgd;
5234                 if (dma_pte_present(pte)) {
5235                         dmar_domain->pgd = (struct dma_pte *)
5236                                 phys_to_virt(dma_pte_addr(pte));
5237                         free_pgtable_page(pte);
5238                 }
5239                 dmar_domain->agaw--;
5240         }
5241
5242         return 0;
5243 }
5244
5245 static int intel_iommu_attach_device(struct iommu_domain *domain,
5246                                      struct device *dev)
5247 {
5248         int ret;
5249
5250         if (device_is_rmrr_locked(dev)) {
5251                 dev_warn(dev, "Device is ineligible for IOMMU domain attach due to platform RMRR requirement.  Contact your platform vendor.\n");
5252                 return -EPERM;
5253         }
5254
5255         if (is_aux_domain(dev, domain))
5256                 return -EPERM;
5257
5258         /* normally dev is not mapped */
5259         if (unlikely(domain_context_mapped(dev))) {
5260                 struct dmar_domain *old_domain;
5261
5262                 old_domain = find_domain(dev);
5263                 if (old_domain)
5264                         dmar_remove_one_dev_info(dev);
5265         }
5266
5267         ret = prepare_domain_attach_device(domain, dev);
5268         if (ret)
5269                 return ret;
5270
5271         return domain_add_dev_info(to_dmar_domain(domain), dev);
5272 }
5273
5274 static int intel_iommu_aux_attach_device(struct iommu_domain *domain,
5275                                          struct device *dev)
5276 {
5277         int ret;
5278
5279         if (!is_aux_domain(dev, domain))
5280                 return -EPERM;
5281
5282         ret = prepare_domain_attach_device(domain, dev);
5283         if (ret)
5284                 return ret;
5285
5286         return aux_domain_add_dev(to_dmar_domain(domain), dev);
5287 }
5288
5289 static void intel_iommu_detach_device(struct iommu_domain *domain,
5290                                       struct device *dev)
5291 {
5292         dmar_remove_one_dev_info(dev);
5293 }
5294
5295 static void intel_iommu_aux_detach_device(struct iommu_domain *domain,
5296                                           struct device *dev)
5297 {
5298         aux_domain_remove_dev(to_dmar_domain(domain), dev);
5299 }
5300
5301 static int intel_iommu_map(struct iommu_domain *domain,
5302                            unsigned long iova, phys_addr_t hpa,
5303                            size_t size, int iommu_prot)
5304 {
5305         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5306         u64 max_addr;
5307         int prot = 0;
5308         int ret;
5309
5310         if (iommu_prot & IOMMU_READ)
5311                 prot |= DMA_PTE_READ;
5312         if (iommu_prot & IOMMU_WRITE)
5313                 prot |= DMA_PTE_WRITE;
5314         if ((iommu_prot & IOMMU_CACHE) && dmar_domain->iommu_snooping)
5315                 prot |= DMA_PTE_SNP;
5316
5317         max_addr = iova + size;
5318         if (dmar_domain->max_addr < max_addr) {
5319                 u64 end;
5320
5321                 /* check if minimum agaw is sufficient for mapped address */
5322                 end = __DOMAIN_MAX_ADDR(dmar_domain->gaw) + 1;
5323                 if (end < max_addr) {
5324                         pr_err("%s: iommu width (%d) is not "
5325                                "sufficient for the mapped address (%llx)\n",
5326                                __func__, dmar_domain->gaw, max_addr);
5327                         return -EFAULT;
5328                 }
5329                 dmar_domain->max_addr = max_addr;
5330         }
5331         /* Round up size to next multiple of PAGE_SIZE, if it and
5332            the low bits of hpa would take us onto the next page */
5333         size = aligned_nrpages(hpa, size);
5334         ret = domain_pfn_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT,
5335                                  hpa >> VTD_PAGE_SHIFT, size, prot);
5336         return ret;
5337 }
5338
5339 static size_t intel_iommu_unmap(struct iommu_domain *domain,
5340                                 unsigned long iova, size_t size)
5341 {
5342         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5343         struct page *freelist = NULL;
5344         unsigned long start_pfn, last_pfn;
5345         unsigned int npages;
5346         int iommu_id, level = 0;
5347
5348         /* Cope with horrid API which requires us to unmap more than the
5349            size argument if it happens to be a large-page mapping. */
5350         BUG_ON(!pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level));
5351
5352         if (size < VTD_PAGE_SIZE << level_to_offset_bits(level))
5353                 size = VTD_PAGE_SIZE << level_to_offset_bits(level);
5354
5355         start_pfn = iova >> VTD_PAGE_SHIFT;
5356         last_pfn = (iova + size - 1) >> VTD_PAGE_SHIFT;
5357
5358         freelist = domain_unmap(dmar_domain, start_pfn, last_pfn);
5359
5360         npages = last_pfn - start_pfn + 1;
5361
5362         for_each_domain_iommu(iommu_id, dmar_domain)
5363                 iommu_flush_iotlb_psi(g_iommus[iommu_id], dmar_domain,
5364                                       start_pfn, npages, !freelist, 0);
5365
5366         dma_free_pagelist(freelist);
5367
5368         if (dmar_domain->max_addr == iova + size)
5369                 dmar_domain->max_addr = iova;
5370
5371         return size;
5372 }
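
/*
 * Worked example of the size rounding in intel_iommu_unmap() above
 * (illustrative): a 4KiB unmap request that lands inside a 2MiB superpage
 * resolves to level 2, so the size is widened to
 * VTD_PAGE_SIZE << level_to_offset_bits(2) = 4KiB << 9 = 2MiB; the whole
 * superpage is unmapped and flushed, and 2MiB is returned to the caller.
 */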
5373
5374 static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
5375                                             dma_addr_t iova)
5376 {
5377         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5378         struct dma_pte *pte;
5379         int level = 0;
5380         u64 phys = 0;
5381
5382         pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level);
5383         if (pte)
5384                 phys = dma_pte_addr(pte);
5385
5386         return phys;
5387 }
5388
5389 static inline bool scalable_mode_support(void)
5390 {
5391         struct dmar_drhd_unit *drhd;
5392         struct intel_iommu *iommu;
5393         bool ret = true;
5394
5395         rcu_read_lock();
5396         for_each_active_iommu(iommu, drhd) {
5397                 if (!sm_supported(iommu)) {
5398                         ret = false;
5399                         break;
5400                 }
5401         }
5402         rcu_read_unlock();
5403
5404         return ret;
5405 }
5406
5407 static inline bool iommu_pasid_support(void)
5408 {
5409         struct dmar_drhd_unit *drhd;
5410         struct intel_iommu *iommu;
5411         bool ret = true;
5412
5413         rcu_read_lock();
5414         for_each_active_iommu(iommu, drhd) {
5415                 if (!pasid_supported(iommu)) {
5416                         ret = false;
5417                         break;
5418                 }
5419         }
5420         rcu_read_unlock();
5421
5422         return ret;
5423 }
5424
5425 static bool intel_iommu_capable(enum iommu_cap cap)
5426 {
5427         if (cap == IOMMU_CAP_CACHE_COHERENCY)
5428                 return domain_update_iommu_snooping(NULL) == 1;
5429         if (cap == IOMMU_CAP_INTR_REMAP)
5430                 return irq_remapping_enabled == 1;
5431
5432         return false;
5433 }
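
/*
 * Illustrative sketch, not part of this driver: intel_iommu_capable() is
 * normally reached through the core helper iommu_capable(), e.g.
 *
 *      if (iommu_capable(&pci_bus_type, IOMMU_CAP_INTR_REMAP))
 *              pr_info("Interrupt remapping is available\n");
 */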
5434
5435 static int intel_iommu_add_device(struct device *dev)
5436 {
5437         struct intel_iommu *iommu;
5438         struct iommu_group *group;
5439         u8 bus, devfn;
5440
5441         iommu = device_to_iommu(dev, &bus, &devfn);
5442         if (!iommu)
5443                 return -ENODEV;
5444
5445         iommu_device_link(&iommu->iommu, dev);
5446
5447         group = iommu_group_get_for_dev(dev);
5448
5449         if (IS_ERR(group))
5450                 return PTR_ERR(group);
5451
5452         iommu_group_put(group);
5453         return 0;
5454 }
5455
5456 static void intel_iommu_remove_device(struct device *dev)
5457 {
5458         struct intel_iommu *iommu;
5459         u8 bus, devfn;
5460
5461         iommu = device_to_iommu(dev, &bus, &devfn);
5462         if (!iommu)
5463                 return;
5464
5465         iommu_group_remove_device(dev);
5466
5467         iommu_device_unlink(&iommu->iommu, dev);
5468 }
5469
5470 static void intel_iommu_get_resv_regions(struct device *device,
5471                                          struct list_head *head)
5472 {
5473         struct iommu_resv_region *reg;
5474         struct dmar_rmrr_unit *rmrr;
5475         struct device *i_dev;
5476         int i;
5477
5478         rcu_read_lock();
5479         for_each_rmrr_units(rmrr) {
5480                 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
5481                                           i, i_dev) {
5482                         if (i_dev != device)
5483                                 continue;
5484
5485                         list_add_tail(&rmrr->resv->list, head);
5486                 }
5487         }
5488         rcu_read_unlock();
5489
5490 #ifdef CONFIG_INTEL_IOMMU_FLOPPY_WA
5491         if (dev_is_pci(device)) {
5492                 struct pci_dev *pdev = to_pci_dev(device);
5493
5494                 if ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA) {
5495                         reg = iommu_alloc_resv_region(0, 1UL << 24, 0,
5496                                                       IOMMU_RESV_DIRECT);
5497                         if (reg)
5498                                 list_add_tail(&reg->list, head);
5499                 }
5500         }
5501 #endif /* CONFIG_INTEL_IOMMU_FLOPPY_WA */
5502
5503         reg = iommu_alloc_resv_region(IOAPIC_RANGE_START,
5504                                       IOAPIC_RANGE_END - IOAPIC_RANGE_START + 1,
5505                                       0, IOMMU_RESV_MSI);
5506         if (!reg)
5507                 return;
5508         list_add_tail(&reg->list, head);
5509 }
5510
5511 static void intel_iommu_put_resv_regions(struct device *dev,
5512                                          struct list_head *head)
5513 {
5514         struct iommu_resv_region *entry, *next;
5515
5516         list_for_each_entry_safe(entry, next, head, list) {
5517                 if (entry->type == IOMMU_RESV_MSI)
5518                         kfree(entry);
5519         }
5520 }
5521
5522 int intel_iommu_enable_pasid(struct intel_iommu *iommu, struct device *dev)
5523 {
5524         struct device_domain_info *info;
5525         struct context_entry *context;
5526         struct dmar_domain *domain;
5527         unsigned long flags;
5528         u64 ctx_lo;
5529         int ret;
5530
5531         domain = get_valid_domain_for_dev(dev);
5532         if (!domain)
5533                 return -EINVAL;
5534
5535         spin_lock_irqsave(&device_domain_lock, flags);
5536         spin_lock(&iommu->lock);
5537
5538         ret = -EINVAL;
5539         info = dev->archdata.iommu;
5540         if (!info || !info->pasid_supported)
5541                 goto out;
5542
5543         context = iommu_context_addr(iommu, info->bus, info->devfn, 0);
5544         if (WARN_ON(!context))
5545                 goto out;
5546
5547         ctx_lo = context[0].lo;
5548
5549         if (!(ctx_lo & CONTEXT_PASIDE)) {
5550                 ctx_lo |= CONTEXT_PASIDE;
5551                 context[0].lo = ctx_lo;
5552                 wmb();
5553                 iommu->flush.flush_context(iommu,
5554                                            domain->iommu_did[iommu->seq_id],
5555                                            PCI_DEVID(info->bus, info->devfn),
5556                                            DMA_CCMD_MASK_NOBIT,
5557                                            DMA_CCMD_DEVICE_INVL);
5558         }
5559
5560         /* Enable PASID support in the device, if it wasn't already */
5561         if (!info->pasid_enabled)
5562                 iommu_enable_dev_iotlb(info);
5563
5564         ret = 0;
5565
5566  out:
5567         spin_unlock(&iommu->lock);
5568         spin_unlock_irqrestore(&device_domain_lock, flags);
5569
5570         return ret;
5571 }
5572
5573 static void intel_iommu_apply_resv_region(struct device *dev,
5574                                           struct iommu_domain *domain,
5575                                           struct iommu_resv_region *region)
5576 {
5577         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5578         unsigned long start, end;
5579
5580         start = IOVA_PFN(region->start);
5581         end   = IOVA_PFN(region->start + region->length - 1);
5582
5583         WARN_ON_ONCE(!reserve_iova(&dmar_domain->iovad, start, end));
5584 }
5585
5586 #ifdef CONFIG_INTEL_IOMMU_SVM
5587 struct intel_iommu *intel_svm_device_to_iommu(struct device *dev)
5588 {
5589         struct intel_iommu *iommu;
5590         u8 bus, devfn;
5591
5592         if (iommu_dummy(dev)) {
5593                 dev_warn(dev,
5594                          "No IOMMU translation for device; cannot enable SVM\n");
5595                 return NULL;
5596         }
5597
5598         iommu = device_to_iommu(dev, &bus, &devfn);
5599         if (!iommu) {
5600                 dev_err(dev, "No IOMMU for device; cannot enable SVM\n");
5601                 return NULL;
5602         }
5603
5604         return iommu;
5605 }
5606 #endif /* CONFIG_INTEL_IOMMU_SVM */
5607
5608 static int intel_iommu_enable_auxd(struct device *dev)
5609 {
5610         struct device_domain_info *info;
5611         struct intel_iommu *iommu;
5612         unsigned long flags;
5613         u8 bus, devfn;
5614         int ret;
5615
5616         iommu = device_to_iommu(dev, &bus, &devfn);
5617         if (!iommu || dmar_disabled)
5618                 return -EINVAL;
5619
5620         if (!sm_supported(iommu) || !pasid_supported(iommu))
5621                 return -EINVAL;
5622
5623         ret = intel_iommu_enable_pasid(iommu, dev);
5624         if (ret)
5625                 return -ENODEV;
5626
5627         spin_lock_irqsave(&device_domain_lock, flags);
5628         info = dev->archdata.iommu;
5629         info->auxd_enabled = 1;
5630         spin_unlock_irqrestore(&device_domain_lock, flags);
5631
5632         return 0;
5633 }
5634
5635 static int intel_iommu_disable_auxd(struct device *dev)
5636 {
5637         struct device_domain_info *info;
5638         unsigned long flags;
5639
5640         spin_lock_irqsave(&device_domain_lock, flags);
5641         info = dev->archdata.iommu;
5642         if (!WARN_ON(!info))
5643                 info->auxd_enabled = 0;
5644         spin_unlock_irqrestore(&device_domain_lock, flags);
5645
5646         return 0;
5647 }
5648
5649 /*
5650  * A PCI Express Designated Vendor Specific Extended Capability is defined
5651  * in section 3.7 of the Intel Scalable I/O Virtualization technical spec,
5652  * allowing system software and tools to detect endpoint devices that
5653  * support Intel Scalable I/O Virtualization without a host driver dependency.
5654  *
5655  * Returns the address of the matching extended capability structure within
5656  * the device's PCI configuration space or 0 if the device does not support
5657  * it.
5658  */
5659 static int siov_find_pci_dvsec(struct pci_dev *pdev)
5660 {
5661         int pos;
5662         u16 vendor, id;
5663
5664         pos = pci_find_next_ext_capability(pdev, 0, 0x23);
5665         while (pos) {
5666                 pci_read_config_word(pdev, pos + 4, &vendor);
5667                 pci_read_config_word(pdev, pos + 8, &id);
5668                 if (vendor == PCI_VENDOR_ID_INTEL && id == 5)
5669                         return pos;
5670
5671                 pos = pci_find_next_ext_capability(pdev, pos, 0x23);
5672         }
5673
5674         return 0;
5675 }
5676
5677 static bool
5678 intel_iommu_dev_has_feat(struct device *dev, enum iommu_dev_features feat)
5679 {
5680         if (feat == IOMMU_DEV_FEAT_AUX) {
5681                 int ret;
5682
5683                 if (!dev_is_pci(dev) || dmar_disabled ||
5684                     !scalable_mode_support() || !iommu_pasid_support())
5685                         return false;
5686
5687                 ret = pci_pasid_features(to_pci_dev(dev));
5688                 if (ret < 0)
5689                         return false;
5690
5691                 return !!siov_find_pci_dvsec(to_pci_dev(dev));
5692         }
5693
5694         return false;
5695 }
5696
5697 static int
5698 intel_iommu_dev_enable_feat(struct device *dev, enum iommu_dev_features feat)
5699 {
5700         if (feat == IOMMU_DEV_FEAT_AUX)
5701                 return intel_iommu_enable_auxd(dev);
5702
5703         return -ENODEV;
5704 }
5705
5706 static int
5707 intel_iommu_dev_disable_feat(struct device *dev, enum iommu_dev_features feat)
5708 {
5709         if (feat == IOMMU_DEV_FEAT_AUX)
5710                 return intel_iommu_disable_auxd(dev);
5711
5712         return -ENODEV;
5713 }
5714
5715 static bool
5716 intel_iommu_dev_feat_enabled(struct device *dev, enum iommu_dev_features feat)
5717 {
5718         struct device_domain_info *info = dev->archdata.iommu;
5719
5720         if (feat == IOMMU_DEV_FEAT_AUX)
5721                 return scalable_mode_support() && info && info->auxd_enabled;
5722
5723         return false;
5724 }
5725
5726 static int
5727 intel_iommu_aux_get_pasid(struct iommu_domain *domain, struct device *dev)
5728 {
5729         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5730
5731         return dmar_domain->default_pasid > 0 ?
5732                         dmar_domain->default_pasid : -EINVAL;
5733 }
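
/*
 * Illustrative sketch, not part of this driver, assuming the generic
 * aux-domain wrappers exported by the IOMMU core in this kernel generation
 * (iommu_dev_enable_feature(), iommu_aux_attach_device(),
 * iommu_aux_get_pasid()): roughly how a mediated-device vendor driver could
 * reach the aux callbacks above to get a PASID-tagged second-level
 * translation for an UNMANAGED domain. example_attach_aux() is hypothetical.
 *
 *      static int example_attach_aux(struct iommu_domain *domain,
 *                                    struct device *dev)
 *      {
 *              int ret, pasid;
 *
 *              ret = iommu_dev_enable_feature(dev, IOMMU_DEV_FEAT_AUX);
 *              if (ret)
 *                      return ret;
 *
 *              ret = iommu_aux_attach_device(domain, dev);
 *              if (ret) {
 *                      iommu_dev_disable_feature(dev, IOMMU_DEV_FEAT_AUX);
 *                      return ret;
 *              }
 *
 *              pasid = iommu_aux_get_pasid(domain, dev);
 *              return pasid < 0 ? pasid : 0;
 *      }
 */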
5734
5735 const struct iommu_ops intel_iommu_ops = {
5736         .capable                = intel_iommu_capable,
5737         .domain_alloc           = intel_iommu_domain_alloc,
5738         .domain_free            = intel_iommu_domain_free,
5739         .attach_dev             = intel_iommu_attach_device,
5740         .detach_dev             = intel_iommu_detach_device,
5741         .aux_attach_dev         = intel_iommu_aux_attach_device,
5742         .aux_detach_dev         = intel_iommu_aux_detach_device,
5743         .aux_get_pasid          = intel_iommu_aux_get_pasid,
5744         .map                    = intel_iommu_map,
5745         .unmap                  = intel_iommu_unmap,
5746         .iova_to_phys           = intel_iommu_iova_to_phys,
5747         .add_device             = intel_iommu_add_device,
5748         .remove_device          = intel_iommu_remove_device,
5749         .get_resv_regions       = intel_iommu_get_resv_regions,
5750         .put_resv_regions       = intel_iommu_put_resv_regions,
5751         .apply_resv_region      = intel_iommu_apply_resv_region,
5752         .device_group           = pci_device_group,
5753         .dev_has_feat           = intel_iommu_dev_has_feat,
5754         .dev_feat_enabled       = intel_iommu_dev_feat_enabled,
5755         .dev_enable_feat        = intel_iommu_dev_enable_feat,
5756         .dev_disable_feat       = intel_iommu_dev_disable_feat,
5757         .pgsize_bitmap          = INTEL_IOMMU_PGSIZES,
5758 };
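
/*
 * Illustrative sketch, not part of this driver: the ops table above is
 * exercised through the generic IOMMU API (e.g. by VFIO). A minimal,
 * hypothetical consumer would look roughly like this; iommu_domain_alloc()
 * ends up in intel_iommu_domain_alloc(), iommu_map()/iommu_unmap() in
 * intel_iommu_map()/intel_iommu_unmap(), and so on.
 *
 *      static int example_map_one_page(struct pci_dev *pdev,
 *                                      unsigned long iova, phys_addr_t paddr)
 *      {
 *              struct iommu_domain *domain;
 *              int ret;
 *
 *              domain = iommu_domain_alloc(&pci_bus_type);
 *              if (!domain)
 *                      return -ENOMEM;
 *
 *              ret = iommu_attach_device(domain, &pdev->dev);
 *              if (ret)
 *                      goto out_free;
 *
 *              ret = iommu_map(domain, iova, paddr, VTD_PAGE_SIZE,
 *                              IOMMU_READ | IOMMU_WRITE);
 *              if (!ret)
 *                      iommu_unmap(domain, iova, VTD_PAGE_SIZE);
 *
 *              iommu_detach_device(domain, &pdev->dev);
 *      out_free:
 *              iommu_domain_free(domain);
 *              return ret;
 *      }
 */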
5759
5760 static void quirk_iommu_g4x_gfx(struct pci_dev *dev)
5761 {
5762         /* G4x/GM45 integrated gfx dmar support is totally busted. */
5763         pci_info(dev, "Disabling IOMMU for graphics on this chipset\n");
5764         dmar_map_gfx = 0;
5765 }
5766
5767 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_g4x_gfx);
5768 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_g4x_gfx);
5769 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_g4x_gfx);
5770 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_g4x_gfx);
5771 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_g4x_gfx);
5772 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_g4x_gfx);
5773 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_g4x_gfx);
5774
5775 static void quirk_iommu_rwbf(struct pci_dev *dev)
5776 {
5777         /*
5778          * Mobile 4 Series Chipset neglects to set RWBF capability,
5779          * but needs it. Same seems to hold for the desktop versions.
5780          */
5781         pci_info(dev, "Forcing write-buffer flush capability\n");
5782         rwbf_quirk = 1;
5783 }
5784
5785 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf);
5786 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_rwbf);
5787 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_rwbf);
5788 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_rwbf);
5789 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_rwbf);
5790 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_rwbf);
5791 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_rwbf);
5792
5793 #define GGC 0x52
5794 #define GGC_MEMORY_SIZE_MASK    (0xf << 8)
5795 #define GGC_MEMORY_SIZE_NONE    (0x0 << 8)
5796 #define GGC_MEMORY_SIZE_1M      (0x1 << 8)
5797 #define GGC_MEMORY_SIZE_2M      (0x3 << 8)
5798 #define GGC_MEMORY_VT_ENABLED   (0x8 << 8)
5799 #define GGC_MEMORY_SIZE_2M_VT   (0x9 << 8)
5800 #define GGC_MEMORY_SIZE_3M_VT   (0xa << 8)
5801 #define GGC_MEMORY_SIZE_4M_VT   (0xb << 8)
5802
5803 static void quirk_calpella_no_shadow_gtt(struct pci_dev *dev)
5804 {
5805         unsigned short ggc;
5806
5807         if (pci_read_config_word(dev, GGC, &ggc))
5808                 return;
5809
5810         if (!(ggc & GGC_MEMORY_VT_ENABLED)) {
5811                 pci_info(dev, "BIOS has allocated no shadow GTT; disabling IOMMU for graphics\n");
5812                 dmar_map_gfx = 0;
5813         } else if (dmar_map_gfx) {
5814                 /* we have to ensure the gfx device is idle before we flush */
5815                 pci_info(dev, "Disabling batched IOTLB flush on Ironlake\n");
5816                 intel_iommu_strict = 1;
5817         }
5818 }
5819 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0040, quirk_calpella_no_shadow_gtt);
5820 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0044, quirk_calpella_no_shadow_gtt);
5821 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0062, quirk_calpella_no_shadow_gtt);
5822 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x006a, quirk_calpella_no_shadow_gtt);
5823
5824 /* On Tylersburg chipsets, some BIOSes have been known to enable the
5825    ISOCH DMAR unit for the Azalia sound device, but not give it any
5826    TLB entries, which causes it to deadlock. Check for that.  We do
5827    this in a function called from init_dmars(), instead of in a PCI
5828    quirk, because we don't want to print the obnoxious "BIOS broken"
5829    message if VT-d is actually disabled.
5830 */
5831 static void __init check_tylersburg_isoch(void)
5832 {
5833         struct pci_dev *pdev;
5834         uint32_t vtisochctrl;
5835
5836         /* If there's no Azalia in the system anyway, forget it. */
5837         pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3a3e, NULL);
5838         if (!pdev)
5839                 return;
5840         pci_dev_put(pdev);
5841
5842         /* System Management Registers. Might be hidden, in which case
5843            we can't do the sanity check. But that's OK, because the
5844            known-broken BIOSes _don't_ actually hide it, so far. */
5845         pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x342e, NULL);
5846         if (!pdev)
5847                 return;
5848
5849         if (pci_read_config_dword(pdev, 0x188, &vtisochctrl)) {
5850                 pci_dev_put(pdev);
5851                 return;
5852         }
5853
5854         pci_dev_put(pdev);
5855
5856         /* If Azalia DMA is routed to the non-isoch DMAR unit, fine. */
5857         if (vtisochctrl & 1)
5858                 return;
5859
5860         /* Drop all bits other than the number of TLB entries */
5861         vtisochctrl &= 0x1c;
5862
5863         /* If we have the recommended number of TLB entries (16), fine. */
5864         if (vtisochctrl == 0x10)
5865                 return;
5866
5867         /* Zero TLB entries? You get to ride the short bus to school. */
5868         if (!vtisochctrl) {
5869                 WARN(1, "Your BIOS is broken; DMA routed to ISOCH DMAR unit but no TLB space.\n"
5870                      "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
5871                      dmi_get_system_info(DMI_BIOS_VENDOR),
5872                      dmi_get_system_info(DMI_BIOS_VERSION),
5873                      dmi_get_system_info(DMI_PRODUCT_VERSION));
5874                 iommu_identity_mapping |= IDENTMAP_AZALIA;
5875                 return;
5876         }
5877
5878         pr_warn("Recommended number of TLB entries for ISOCH unit is 16; your BIOS set %d\n",
5879                vtisochctrl);
5880 }