drivers/iommu/intel/iommu.c
2025cf9e 1// SPDX-License-Identifier: GPL-2.0-only
ba395927 2/*
ea8ea460 3 * Copyright © 2006-2014 Intel Corporation.
ba395927 4 *
ea8ea460 5 * Authors: David Woodhouse <dwmw2@infradead.org>,
6 * Ashok Raj <ashok.raj@intel.com>,
7 * Shaohua Li <shaohua.li@intel.com>,
8 * Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>,
9 * Fenghua Yu <fenghua.yu@intel.com>
9f10e5bf 10 * Joerg Roedel <jroedel@suse.de>
ba395927 11 */
12
9f10e5bf 13#define pr_fmt(fmt) "DMAR: " fmt
932a6523 14#define dev_fmt(fmt) pr_fmt(fmt)
9f10e5bf 15
763e656c 16#include <linux/crash_dump.h>
17#include <linux/dma-direct.h>
763e656c 18#include <linux/dmi.h>
763e656c 19#include <linux/memory.h>
20#include <linux/pci.h>
21#include <linux/pci-ats.h>
22#include <linux/spinlock.h>
134fac3f 23#include <linux/syscore_ops.h>
69575d38 24#include <linux/tboot.h>
55243393 25#include <uapi/linux/iommufd.h>
ba395927 26
2585a279 27#include "iommu.h"
f2042ed2 28#include "../dma-iommu.h"
672cf6df 29#include "../irq_remapping.h"
757636ed 30#include "../iommu-sva.h"
02f3effd 31#include "pasid.h"
ad3d1902 32#include "cap_audit.h"
d8a7c0cf 33#include "perfmon.h"
078e1ee2 34
5b6985ce 35#define ROOT_SIZE VTD_PAGE_SIZE
36#define CONTEXT_SIZE VTD_PAGE_SIZE
37
ba395927 38#define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
18436afd 39#define IS_USB_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_SERIAL_USB)
ba395927 40#define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
e0fc7e0b 41#define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e)
ba395927
KA
42
43#define IOAPIC_RANGE_START (0xfee00000)
44#define IOAPIC_RANGE_END (0xfeefffff)
45#define IOVA_START_ADDR (0x1000)
46
5e3b4a15 47#define DEFAULT_DOMAIN_ADDRESS_WIDTH 57
ba395927 48
4ed0d3e6 49#define MAX_AGAW_WIDTH 64
5c645b35 50#define MAX_AGAW_PFN_WIDTH (MAX_AGAW_WIDTH - VTD_PAGE_SHIFT)
4ed0d3e6 51
c062db03 52#define __DOMAIN_MAX_PFN(gaw) ((((uint64_t)1) << ((gaw) - VTD_PAGE_SHIFT)) - 1)
53#define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << (gaw)) - 1)
2ebe3151 54
55/* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR
56 to match. That way, we can use 'unsigned long' for PFNs with impunity. */
57#define DOMAIN_MAX_PFN(gaw) ((unsigned long) min_t(uint64_t, \
58 __DOMAIN_MAX_PFN(gaw), (unsigned long)-1))
59#define DOMAIN_MAX_ADDR(gaw) (((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT)
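/*
 * Worked example (illustrative): with the default 57-bit width,
 * __DOMAIN_MAX_PFN(57) is (1ULL << 45) - 1. On 64-bit builds
 * DOMAIN_MAX_PFN(57) is the same value; on a 32-bit build it would be
 * clamped to ULONG_MAX so that PFNs still fit in an unsigned long.
 */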
ba395927 60
1b722500 61/* IO virtual address start page frame number */
62#define IOVA_START_PFN (1)
63
f27be03b 64#define IOVA_PFN(addr) ((addr) >> PAGE_SHIFT)
5e0d2a6f 65
df08cdc7 66/* page table handling */
67#define LEVEL_STRIDE (9)
68#define LEVEL_MASK (((u64)1 << LEVEL_STRIDE) - 1)
69
70static inline int agaw_to_level(int agaw)
71{
72 return agaw + 2;
73}
74
75static inline int agaw_to_width(int agaw)
76{
5c645b35 77 return min_t(int, 30 + agaw * LEVEL_STRIDE, MAX_AGAW_WIDTH);
df08cdc7 78}
79
80static inline int width_to_agaw(int width)
81{
5c645b35 82 return DIV_ROUND_UP(width - 30, LEVEL_STRIDE);
df08cdc7 83}
84
85static inline unsigned int level_to_offset_bits(int level)
86{
87 return (level - 1) * LEVEL_STRIDE;
88}
89
29aaebbc 90static inline int pfn_level_offset(u64 pfn, int level)
df08cdc7 91{
92 return (pfn >> level_to_offset_bits(level)) & LEVEL_MASK;
93}
94
29aaebbc 95static inline u64 level_mask(int level)
df08cdc7 96{
29aaebbc 97 return -1ULL << level_to_offset_bits(level);
df08cdc7 98}
99
29aaebbc 100static inline u64 level_size(int level)
df08cdc7 101{
29aaebbc 102 return 1ULL << level_to_offset_bits(level);
df08cdc7 103}
104
29aaebbc 105static inline u64 align_to_level(u64 pfn, int level)
df08cdc7 106{
107 return (pfn + level_size(level) - 1) & level_mask(level);
108}
fd18de50 109
6dd9a7c7 110static inline unsigned long lvl_to_nr_pages(unsigned int lvl)
111{
29aaebbc 112 return 1UL << min_t(int, (lvl - 1) * LEVEL_STRIDE, MAX_AGAW_PFN_WIDTH);
6dd9a7c7 113}
114
dd4e8319 115/* VT-d pages must always be _smaller_ than MM pages. Otherwise things
116 are never going to work. */
fb5f50a4 117static inline unsigned long mm_to_dma_pfn_start(unsigned long mm_pfn)
dd4e8319 118{
119 return mm_pfn << (PAGE_SHIFT - VTD_PAGE_SHIFT);
120}
fb5f50a4 121static inline unsigned long mm_to_dma_pfn_end(unsigned long mm_pfn)
122{
123 return ((mm_pfn + 1) << (PAGE_SHIFT - VTD_PAGE_SHIFT)) - 1;
124}
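/*
 * Illustration (hypothetical config): with 64KiB MM pages and 4KiB VT-d
 * pages, PAGE_SHIFT - VTD_PAGE_SHIFT is 4, so mm_to_dma_pfn_start(1) is
 * 16 and mm_to_dma_pfn_end(1) is 31, i.e. one MM page spans 16 VT-d
 * pages. With equal 4KiB page sizes both conversions are the identity.
 */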
dd4e8319 125static inline unsigned long page_to_dma_pfn(struct page *pg)
126{
fb5f50a4 127 return mm_to_dma_pfn_start(page_to_pfn(pg));
dd4e8319 128}
129static inline unsigned long virt_to_dma_pfn(void *p)
130{
131 return page_to_dma_pfn(virt_to_page(p));
132}
133
e0fc7e0b 134static void __init check_tylersburg_isoch(void);
9af88143 135static int rwbf_quirk;
136
b779260b 137/*
138 * set to 1 to panic the kernel if VT-d cannot be successfully enabled
139 * (used when the kernel is launched with TXT)
140 */
141static int force_on = 0;
4d213e76 142static int intel_iommu_tboot_noforce;
89a6079d 143static int no_platform_optin;
b779260b 144
46b08e1a 145#define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
46b08e1a 146
091d42e4 147/*
148 * Take a root_entry and return the Lower Context Table Pointer (LCTP)
149 * if marked present.
150 */
151static phys_addr_t root_entry_lctp(struct root_entry *re)
152{
153 if (!(re->lo & 1))
154 return 0;
155
156 return re->lo & VTD_PAGE_MASK;
157}
158
159/*
160 * Take a root_entry and return the Upper Context Table Pointer (UCTP)
161 * if marked present.
162 */
163static phys_addr_t root_entry_uctp(struct root_entry *re)
164{
165 if (!(re->hi & 1))
166 return 0;
46b08e1a 167
091d42e4
JR
168 return re->hi & VTD_PAGE_MASK;
169}
c07e7d21 170
c07e7d21
MM
171static inline void context_set_present(struct context_entry *context)
172{
173 context->lo |= 1;
174}
175
176static inline void context_set_fault_enable(struct context_entry *context)
177{
178 context->lo &= (((u64)-1) << 2) | 1;
179}
180
c07e7d21
MM
181static inline void context_set_translation_type(struct context_entry *context,
182 unsigned long value)
183{
184 context->lo &= (((u64)-1) << 4) | 3;
185 context->lo |= (value & 3) << 2;
186}
187
188static inline void context_set_address_root(struct context_entry *context,
189 unsigned long value)
190{
1a2262f9 191 context->lo &= ~VTD_PAGE_MASK;
c07e7d21
MM
192 context->lo |= value & VTD_PAGE_MASK;
193}
194
195static inline void context_set_address_width(struct context_entry *context,
196 unsigned long value)
197{
198 context->hi |= value & 7;
199}
200
201static inline void context_set_domain_id(struct context_entry *context,
202 unsigned long value)
203{
204 context->hi |= (value & ((1 << 16) - 1)) << 8;
205}
206
0faa19a1 207static inline void context_set_pasid(struct context_entry *context)
208{
209 context->lo |= CONTEXT_PASIDE;
210}
211
dbcd861f 212static inline int context_domain_id(struct context_entry *c)
213{
214 return((c->hi >> 8) & 0xffff);
215}
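/*
 * Layout note (derived from the helpers above): the 16-bit domain ID
 * lives in hi[23:8] and the address width in hi[2:0], so
 * context_domain_id() simply reads back what context_set_domain_id()
 * and context_set_address_width() stored.
 */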
216
c07e7d21 217static inline void context_clear_entry(struct context_entry *context)
218{
219 context->lo = 0;
220 context->hi = 0;
221}
7a8fc25e 222
0c5f6c0d
LB
223static inline bool context_copied(struct intel_iommu *iommu, u8 bus, u8 devfn)
224{
225 if (!iommu->copied_tables)
226 return false;
227
228 return test_bit(((long)bus << 8) | devfn, iommu->copied_tables);
229}
230
231static inline void
232set_context_copied(struct intel_iommu *iommu, u8 bus, u8 devfn)
233{
234 set_bit(((long)bus << 8) | devfn, iommu->copied_tables);
235}
236
237static inline void
238clear_context_copied(struct intel_iommu *iommu, u8 bus, u8 devfn)
239{
240 clear_bit(((long)bus << 8) | devfn, iommu->copied_tables);
241}
242
2c2e2c38 243/*
244 * This domain is a static identity mapping domain.
245 * 1. This domain creates a static 1:1 mapping to all usable memory.
246 * 2. It maps to each iommu if successful.
247 * 3. Each iommu maps to this domain if successful.
248 */
19943b0e
DW
249static struct dmar_domain *si_domain;
250static int hw_pass_through = 1;
2c2e2c38 251
b94e4117
JL
252struct dmar_rmrr_unit {
253 struct list_head list; /* list of rmrr units */
254 struct acpi_dmar_header *hdr; /* ACPI header */
255 u64 base_address; /* reserved base address*/
256 u64 end_address; /* reserved end address */
832bd858 257 struct dmar_dev_scope *devices; /* target devices */
b94e4117
JL
258 int devices_cnt; /* target device count */
259};
260
261struct dmar_atsr_unit {
262 struct list_head list; /* list of ATSR units */
263 struct acpi_dmar_header *hdr; /* ACPI header */
832bd858 264 struct dmar_dev_scope *devices; /* target devices */
b94e4117
JL
265 int devices_cnt; /* target device count */
266 u8 include_all:1; /* include all ports */
267};
268
31a75cbb
YC
269struct dmar_satc_unit {
270 struct list_head list; /* list of SATC units */
271 struct acpi_dmar_header *hdr; /* ACPI header */
272 struct dmar_dev_scope *devices; /* target devices */
273 struct intel_iommu *iommu; /* the corresponding iommu */
274 int devices_cnt; /* target device count */
275 u8 atc_required:1; /* ATS is required */
276};
277
b94e4117
JL
278static LIST_HEAD(dmar_atsr_units);
279static LIST_HEAD(dmar_rmrr_units);
31a75cbb 280static LIST_HEAD(dmar_satc_units);
b94e4117
JL
281
282#define for_each_rmrr_units(rmrr) \
283 list_for_each_entry(rmrr, &dmar_rmrr_units, list)
284
c7be17c2 285static void device_block_translation(struct device *dev);
35a99c54 286static void intel_iommu_domain_free(struct iommu_domain *domain);
ba395927 287
01dac2d9
LB
288int dmar_disabled = !IS_ENABLED(CONFIG_INTEL_IOMMU_DEFAULT_ON);
289int intel_iommu_sm = IS_ENABLED(CONFIG_INTEL_IOMMU_SCALABLE_MODE_DEFAULT_ON);
04618252 290
8bc1f85c
ED
291int intel_iommu_enabled = 0;
292EXPORT_SYMBOL_GPL(intel_iommu_enabled);
293
2d9e667e 294static int dmar_map_gfx = 1;
6dd9a7c7 295static int intel_iommu_superpage = 1;
ae853ddb 296static int iommu_identity_mapping;
b1012ca8 297static int iommu_skip_te_disable;
c83b2f20 298
ae853ddb
DW
299#define IDENTMAP_GFX 2
300#define IDENTMAP_AZALIA 4
c83b2f20 301
b0119e87 302const struct iommu_ops intel_iommu_ops;
a8bcbb0d 303
4158c2ec
JR
304static bool translation_pre_enabled(struct intel_iommu *iommu)
305{
306 return (iommu->flags & VTD_FLAG_TRANS_PRE_ENABLED);
307}
308
091d42e4
JR
309static void clear_translation_pre_enabled(struct intel_iommu *iommu)
310{
311 iommu->flags &= ~VTD_FLAG_TRANS_PRE_ENABLED;
312}
313
4158c2ec
JR
314static void init_translation_status(struct intel_iommu *iommu)
315{
316 u32 gsts;
317
318 gsts = readl(iommu->reg + DMAR_GSTS_REG);
319 if (gsts & DMA_GSTS_TES)
320 iommu->flags |= VTD_FLAG_TRANS_PRE_ENABLED;
321}
322
ba395927
KA
323static int __init intel_iommu_setup(char *str)
324{
325 if (!str)
326 return -EINVAL;
5240aed2 327
ba395927 328 while (*str) {
0cd5c3c8
KM
329 if (!strncmp(str, "on", 2)) {
330 dmar_disabled = 0;
9f10e5bf 331 pr_info("IOMMU enabled\n");
0cd5c3c8 332 } else if (!strncmp(str, "off", 3)) {
ba395927 333 dmar_disabled = 1;
89a6079d 334 no_platform_optin = 1;
9f10e5bf 335 pr_info("IOMMU disabled\n");
ba395927
KA
336 } else if (!strncmp(str, "igfx_off", 8)) {
337 dmar_map_gfx = 0;
9f10e5bf 338 pr_info("Disable GFX device mapping\n");
7d3b03ce 339 } else if (!strncmp(str, "forcedac", 8)) {
3542dcb1
RM
340 pr_warn("intel_iommu=forcedac deprecated; use iommu.forcedac instead\n");
341 iommu_dma_forcedac = true;
5e0d2a6f 342 } else if (!strncmp(str, "strict", 6)) {
1d479f16 343 pr_warn("intel_iommu=strict deprecated; use iommu.strict=1 instead\n");
308723e3 344 iommu_set_dma_strict();
6dd9a7c7 345 } else if (!strncmp(str, "sp_off", 6)) {
9f10e5bf 346 pr_info("Disable supported super page\n");
6dd9a7c7 347 intel_iommu_superpage = 0;
8950dcd8 348 } else if (!strncmp(str, "sm_on", 5)) {
792fb43c 349 pr_info("Enable scalable mode if hardware supports\n");
8950dcd8 350 intel_iommu_sm = 1;
792fb43c
LB
351 } else if (!strncmp(str, "sm_off", 6)) {
352 pr_info("Scalable mode is disallowed\n");
353 intel_iommu_sm = 0;
bfd20f1c 354 } else if (!strncmp(str, "tboot_noforce", 13)) {
8627892a 355 pr_info("Intel-IOMMU: not forcing on after tboot. This could expose security risk for tboot\n");
bfd20f1c 356 intel_iommu_tboot_noforce = 1;
5240aed2
TU
357 } else {
358 pr_notice("Unknown option - '%s'\n", str);
ba395927
KA
359 }
360
361 str += strcspn(str, ",");
362 while (*str == ',')
363 str++;
364 }
5240aed2 365
366 return 1;
ba395927 367}
368__setup("intel_iommu=", intel_iommu_setup);
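/*
 * Example (illustrative): booting with "intel_iommu=on,sm_on" is parsed
 * option by option above, clearing dmar_disabled and setting
 * intel_iommu_sm = 1; unknown options only trigger the pr_notice().
 */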
369
2552d3a2 370void *alloc_pgtable_page(int node, gfp_t gfp)
eb3fa7cb 371{
4c923d47
SS
372 struct page *page;
373 void *vaddr = NULL;
eb3fa7cb 374
2552d3a2 375 page = alloc_pages_node(node, gfp | __GFP_ZERO, 0);
4c923d47
SS
376 if (page)
377 vaddr = page_address(page);
eb3fa7cb 378 return vaddr;
ba395927
KA
379}
380
9ddbfb42 381void free_pgtable_page(void *vaddr)
ba395927
KA
382{
383 free_page((unsigned long)vaddr);
384}
385
28ccce0d
JR
386static inline int domain_type_is_si(struct dmar_domain *domain)
387{
b34380a6 388 return domain->domain.type == IOMMU_DOMAIN_IDENTITY;
28ccce0d
JR
389}
390
162d1b10
JL
391static inline int domain_pfn_supported(struct dmar_domain *domain,
392 unsigned long pfn)
393{
394 int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
395
396 return !(addr_width < BITS_PER_LONG && pfn >> addr_width);
397}
398
53fc7ad6 399/*
400 * Calculate the Supported Adjusted Guest Address Widths of an IOMMU.
401 * Refer to 11.4.2 of the VT-d spec for the encoding of each bit of
402 * the returned SAGAW.
403 */
404static unsigned long __iommu_calculate_sagaw(struct intel_iommu *iommu)
405{
406 unsigned long fl_sagaw, sl_sagaw;
407
b722cb32 408 fl_sagaw = BIT(2) | (cap_fl5lp_support(iommu->cap) ? BIT(3) : 0);
53fc7ad6 409 sl_sagaw = cap_sagaw(iommu->cap);
410
411 /* Second level only. */
412 if (!sm_supported(iommu) || !ecap_flts(iommu->ecap))
413 return sl_sagaw;
414
415 /* First level only. */
416 if (!ecap_slts(iommu->ecap))
417 return fl_sagaw;
418
419 return fl_sagaw & sl_sagaw;
420}
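/*
 * Worked example (illustrative): on a scalable-mode IOMMU that supports
 * both translation types, with 5-level first-level paging available and
 * cap_sagaw() == BIT(2) (4-level second level only), fl_sagaw is
 * BIT(2) | BIT(3) and the intersection returned is BIT(2), i.e. only
 * the 48-bit AGAW is usable by both levels.
 */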
421
4ed0d3e6 422static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw)
1b573683
WH
423{
424 unsigned long sagaw;
05d2cbf9 425 int agaw;
1b573683 426
53fc7ad6
LB
427 sagaw = __iommu_calculate_sagaw(iommu);
428 for (agaw = width_to_agaw(max_gaw); agaw >= 0; agaw--) {
1b573683
WH
429 if (test_bit(agaw, &sagaw))
430 break;
431 }
432
433 return agaw;
434}
435
4ed0d3e6 436/*
437 * Calculate max SAGAW for each iommu.
438 */
439int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
440{
441 return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH);
442}
443
444/*
445 * Calculate the agaw for each iommu.
446 * "SAGAW" may be different across iommus; use a default agaw and,
447 * for iommus that don't support it, fall back to a smaller supported agaw.
448 */
449int iommu_calculate_agaw(struct intel_iommu *iommu)
450{
451 return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH);
452}
453
04c00956
LB
454static inline bool iommu_paging_structure_coherency(struct intel_iommu *iommu)
455{
456 return sm_supported(iommu) ?
457 ecap_smpwc(iommu->ecap) : ecap_coherent(iommu->ecap);
458}
459
8e604097
WH
460static void domain_update_iommu_coherency(struct dmar_domain *domain)
461{
ba949f4c 462 struct iommu_domain_info *info;
d0501960
DW
463 struct dmar_drhd_unit *drhd;
464 struct intel_iommu *iommu;
2f119c78 465 bool found = false;
ba949f4c 466 unsigned long i;
2e12bc29 467
1f106ff0 468 domain->iommu_coherency = true;
ba949f4c 469 xa_for_each(&domain->iommu_array, i, info) {
2f119c78 470 found = true;
ba949f4c 471 if (!iommu_paging_structure_coherency(info->iommu)) {
1f106ff0 472 domain->iommu_coherency = false;
8e604097
WH
473 break;
474 }
8e604097 475 }
d0501960
DW
476 if (found)
477 return;
478
479 /* No hardware attached; use lowest common denominator */
480 rcu_read_lock();
481 for_each_active_iommu(iommu, drhd) {
04c00956 482 if (!iommu_paging_structure_coherency(iommu)) {
1f106ff0 483 domain->iommu_coherency = false;
d0501960
DW
484 break;
485 }
486 }
487 rcu_read_unlock();
8e604097
WH
488}
489
64229e8f
LB
490static int domain_update_iommu_superpage(struct dmar_domain *domain,
491 struct intel_iommu *skip)
6dd9a7c7 492{
8140a95d 493 struct dmar_drhd_unit *drhd;
161f6934 494 struct intel_iommu *iommu;
64229e8f 495 int mask = 0x3;
6dd9a7c7 496
cee57d4f 497 if (!intel_iommu_superpage)
161f6934 498 return 0;
6dd9a7c7 499
8140a95d 500 /* set iommu_superpage to the smallest common denominator */
0e242612 501 rcu_read_lock();
8140a95d 502 for_each_active_iommu(iommu, drhd) {
161f6934 503 if (iommu != skip) {
e5b0feb4 504 if (domain && domain->use_first_level) {
64229e8f
LB
505 if (!cap_fl1gp_support(iommu->cap))
506 mask = 0x1;
507 } else {
508 mask &= cap_super_page_val(iommu->cap);
509 }
510
161f6934
JL
511 if (!mask)
512 break;
6dd9a7c7
YS
513 }
514 }
0e242612
JL
515 rcu_read_unlock();
516
161f6934 517 return fls(mask);
6dd9a7c7
YS
518}
519
d2ef0962
LB
520static int domain_update_device_node(struct dmar_domain *domain)
521{
522 struct device_domain_info *info;
523 int nid = NUMA_NO_NODE;
a349ffcb 524 unsigned long flags;
d2ef0962 525
a349ffcb 526 spin_lock_irqsave(&domain->lock, flags);
d2ef0962 527 list_for_each_entry(info, &domain->devices, link) {
d2ef0962 528 /*
529 * There could possibly be multiple device numa nodes as devices
530 * within the same domain may sit behind different IOMMUs. There
531 * isn't a perfect answer in such a situation, so we select the
532 * first-come, first-served policy.
533 */
534 nid = dev_to_node(info->dev);
535 if (nid != NUMA_NO_NODE)
536 break;
537 }
a349ffcb 538 spin_unlock_irqrestore(&domain->lock, flags);
d2ef0962
LB
539
540 return nid;
541}
542
7c29ada5
LY
543static void domain_update_iotlb(struct dmar_domain *domain);
544
a886d5a7 545/* Return the super pagesize bitmap if supported. */
546static unsigned long domain_super_pgsize_bitmap(struct dmar_domain *domain)
547{
548 unsigned long bitmap = 0;
549
550 /*
551 * 1-level super page supports page size of 2MiB, 2-level super page
552 * supports page size of both 2MiB and 1GiB.
553 */
554 if (domain->iommu_superpage == 1)
555 bitmap |= SZ_2M;
556 else if (domain->iommu_superpage == 2)
557 bitmap |= SZ_2M | SZ_1G;
558
559 return bitmap;
560}
561
58c610bd 562/* Some capabilities may be different across iommus */
563static void domain_update_iommu_cap(struct dmar_domain *domain)
564{
565 domain_update_iommu_coherency(domain);
64229e8f 566 domain->iommu_superpage = domain_update_iommu_superpage(domain, NULL);
d2ef0962
LB
567
568 /*
569 * If RHSA is missing, we should default to the device numa domain
570 * as fall back.
571 */
572 if (domain->nid == NUMA_NO_NODE)
573 domain->nid = domain_update_device_node(domain);
c062db03
LB
574
575 /*
576 * First-level translation restricts the input-address to a
577 * canonical address (i.e., address bits 63:N have the same
578 * value as address bit [N-1], where N is 48-bits with 4-level
579 * paging and 57-bits with 5-level paging). Hence, skip bit
580 * [N-1].
581 */
e5b0feb4 582 if (domain->use_first_level)
c062db03
LB
583 domain->domain.geometry.aperture_end = __DOMAIN_MAX_ADDR(domain->gaw - 1);
584 else
585 domain->domain.geometry.aperture_end = __DOMAIN_MAX_ADDR(domain->gaw);
7c29ada5 586
a886d5a7 587 domain->domain.pgsize_bitmap |= domain_super_pgsize_bitmap(domain);
7c29ada5 588 domain_update_iotlb(domain);
58c610bd
SY
589}
590
26b86092
SM
591struct context_entry *iommu_context_addr(struct intel_iommu *iommu, u8 bus,
592 u8 devfn, int alloc)
03ecc32c
DW
593{
594 struct root_entry *root = &iommu->root_entry[bus];
595 struct context_entry *context;
596 u64 *entry;
597
0c5f6c0d 598 /*
599 * Unless the caller requested to allocate a new entry,
600 * returning a copied context entry makes no sense.
601 */
602 if (!alloc && context_copied(iommu, bus, devfn))
603 return NULL;
604
4df4eab1 605 entry = &root->lo;
765b6a98 606 if (sm_supported(iommu)) {
03ecc32c
DW
607 if (devfn >= 0x80) {
608 devfn -= 0x80;
609 entry = &root->hi;
610 }
611 devfn *= 2;
612 }
03ecc32c
DW
613 if (*entry & 1)
614 context = phys_to_virt(*entry & VTD_PAGE_MASK);
615 else {
616 unsigned long phy_addr;
617 if (!alloc)
618 return NULL;
619
2552d3a2 620 context = alloc_pgtable_page(iommu->node, GFP_ATOMIC);
03ecc32c
DW
621 if (!context)
622 return NULL;
623
624 __iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
625 phy_addr = virt_to_phys((void *)context);
626 *entry = phy_addr | 1;
627 __iommu_flush_cache(iommu, entry, sizeof(*entry));
628 }
629 return &context[devfn];
630}
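/*
 * Example (illustrative): in scalable mode the 256 context entries for a
 * bus are split across root_entry->lo and ->hi; a lookup for devfn 0x85
 * uses the upper table with index (0x85 - 0x80) * 2 == 10, while legacy
 * mode indexes a single table directly by devfn.
 */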
631
b9a7f981 632/**
633 * is_downstream_to_pci_bridge - test if a device belongs to the PCI
634 * sub-hierarchy of a candidate PCI-PCI bridge
635 * @dev: candidate PCI device belonging to @bridge PCI sub-hierarchy
636 * @bridge: the candidate PCI-PCI bridge
637 *
638 * Return: true if @dev belongs to @bridge PCI sub-hierarchy, else false.
639 */
640static bool
641is_downstream_to_pci_bridge(struct device *dev, struct device *bridge)
642{
643 struct pci_dev *pdev, *pbridge;
644
645 if (!dev_is_pci(dev) || !dev_is_pci(bridge))
646 return false;
647
648 pdev = to_pci_dev(dev);
649 pbridge = to_pci_dev(bridge);
650
651 if (pbridge->subordinate &&
652 pbridge->subordinate->number <= pdev->bus->number &&
653 pbridge->subordinate->busn_res.end >= pdev->bus->number)
654 return true;
655
656 return false;
657}
658
2d33b7d6
LB
659static bool quirk_ioat_snb_local_iommu(struct pci_dev *pdev)
660{
661 struct dmar_drhd_unit *drhd;
662 u32 vtbar;
663 int rc;
664
665 /* We know that this device on this chipset has its own IOMMU.
666 * If we find it under a different IOMMU, then the BIOS is lying
667 * to us. Hope that the IOMMU for this device is actually
668 * disabled, and it needs no translation...
669 */
670 rc = pci_bus_read_config_dword(pdev->bus, PCI_DEVFN(0, 0), 0xb0, &vtbar);
671 if (rc) {
672 /* "can't" happen */
673 dev_info(&pdev->dev, "failed to run vt-d quirk\n");
674 return false;
675 }
676 vtbar &= 0xffff0000;
677
678 /* we know that this iommu should be at offset 0xa000 from vtbar */
679 drhd = dmar_find_matched_drhd_unit(pdev);
680 if (!drhd || drhd->reg_base_addr - vtbar != 0xa000) {
681 pr_warn_once(FW_BUG "BIOS assigned incorrect VT-d unit for Intel(R) QuickData Technology device\n");
682 add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
683 return true;
684 }
685
686 return false;
687}
688
689static bool iommu_is_dummy(struct intel_iommu *iommu, struct device *dev)
690{
691 if (!iommu || iommu->drhd->ignored)
692 return true;
693
694 if (dev_is_pci(dev)) {
695 struct pci_dev *pdev = to_pci_dev(dev);
696
697 if (pdev->vendor == PCI_VENDOR_ID_INTEL &&
698 pdev->device == PCI_DEVICE_ID_INTEL_IOAT_SNB &&
699 quirk_ioat_snb_local_iommu(pdev))
700 return true;
701 }
702
703 return false;
704}
705
dd6692f1 706struct intel_iommu *device_to_iommu(struct device *dev, u8 *bus, u8 *devfn)
c7151a8d
WH
707{
708 struct dmar_drhd_unit *drhd = NULL;
dd6692f1 709 struct pci_dev *pdev = NULL;
b683b230 710 struct intel_iommu *iommu;
156baca8 711 struct device *tmp;
aa4d066a 712 u16 segment = 0;
c7151a8d
WH
713 int i;
714
2d33b7d6 715 if (!dev)
4ed6a540
DW
716 return NULL;
717
156baca8 718 if (dev_is_pci(dev)) {
1c387188
AR
719 struct pci_dev *pf_pdev;
720
e3560ee4 721 pdev = pci_real_dma_dev(to_pci_dev(dev));
5823e330 722
1c387188 723 /* VFs aren't listed in scope tables; we need to look up
724 * the PF instead to find the IOMMU. */
725 pf_pdev = pci_physfn(pdev);
726 dev = &pf_pdev->dev;
156baca8 727 segment = pci_domain_nr(pdev->bus);
ca5b74d2 728 } else if (has_acpi_companion(dev))
156baca8
DW
729 dev = &ACPI_COMPANION(dev)->dev;
730
0e242612 731 rcu_read_lock();
2d33b7d6 732 for_each_iommu(iommu, drhd) {
156baca8 733 if (pdev && segment != drhd->segment)
276dbf99 734 continue;
c7151a8d 735
b683b230 736 for_each_active_dev_scope(drhd->devices,
156baca8
DW
737 drhd->devices_cnt, i, tmp) {
738 if (tmp == dev) {
1c387188 739 /* For a VF use its original BDF# not that of the PF
740 * which we used for the IOMMU lookup. Strictly speaking
741 * we could do this for all PCI devices; we only need to
742 * get the BDF# from the scope table for ACPI matches. */
5003ae1e 743 if (pdev && pdev->is_virtfn)
1c387188
AR
744 goto got_pdev;
745
dd6692f1
LB
746 if (bus && devfn) {
747 *bus = drhd->devices[i].bus;
748 *devfn = drhd->devices[i].devfn;
749 }
b683b230 750 goto out;
156baca8
DW
751 }
752
b9a7f981 753 if (is_downstream_to_pci_bridge(dev, tmp))
156baca8 754 goto got_pdev;
924b6231 755 }
c7151a8d 756
156baca8 757 if (pdev && drhd->include_all) {
2187a57e 758got_pdev:
dd6692f1
LB
759 if (bus && devfn) {
760 *bus = pdev->bus->number;
761 *devfn = pdev->devfn;
762 }
b683b230 763 goto out;
156baca8 764 }
c7151a8d 765 }
b683b230 766 iommu = NULL;
2187a57e 767out:
2d33b7d6
LB
768 if (iommu_is_dummy(iommu, dev))
769 iommu = NULL;
770
0e242612 771 rcu_read_unlock();
c7151a8d 772
b683b230 773 return iommu;
c7151a8d
WH
774}
775
5331fe6f
WH
776static void domain_flush_cache(struct dmar_domain *domain,
777 void *addr, int size)
778{
779 if (!domain->iommu_coherency)
780 clflush_cache_range(addr, size);
781}
782
ba395927
KA
783static void free_context_table(struct intel_iommu *iommu)
784{
ba395927 785 struct context_entry *context;
2e1c8daf
LB
786 int i;
787
788 if (!iommu->root_entry)
789 return;
ba395927 790
ba395927 791 for (i = 0; i < ROOT_ENTRY_NR; i++) {
03ecc32c 792 context = iommu_context_addr(iommu, i, 0, 0);
ba395927
KA
793 if (context)
794 free_pgtable_page(context);
03ecc32c 795
765b6a98 796 if (!sm_supported(iommu))
03ecc32c
DW
797 continue;
798
799 context = iommu_context_addr(iommu, i, 0x80, 0);
800 if (context)
801 free_pgtable_page(context);
ba395927 802 }
2e1c8daf 803
ba395927
KA
804 free_pgtable_page(iommu->root_entry);
805 iommu->root_entry = NULL;
ba395927
KA
806}
807
914ff771 808#ifdef CONFIG_DMAR_DEBUG
35bf49e0
LB
809static void pgtable_walk(struct intel_iommu *iommu, unsigned long pfn,
810 u8 bus, u8 devfn, struct dma_pte *parent, int level)
914ff771 811{
35bf49e0
LB
812 struct dma_pte *pte;
813 int offset;
914ff771
KMP
814
815 while (1) {
816 offset = pfn_level_offset(pfn, level);
817 pte = &parent[offset];
818 if (!pte || (dma_pte_superpage(pte) || !dma_pte_present(pte))) {
819 pr_info("PTE not present at level %d\n", level);
820 break;
821 }
822
823 pr_info("pte level: %d, pte value: 0x%016llx\n", level, pte->val);
824
825 if (level == 1)
826 break;
827
828 parent = phys_to_virt(dma_pte_addr(pte));
829 level--;
830 }
831}
832
833void dmar_fault_dump_ptes(struct intel_iommu *iommu, u16 source_id,
834 unsigned long long addr, u32 pasid)
835{
836 struct pasid_dir_entry *dir, *pde;
837 struct pasid_entry *entries, *pte;
838 struct context_entry *ctx_entry;
839 struct root_entry *rt_entry;
35bf49e0 840 int i, dir_index, index, level;
914ff771
KMP
841 u8 devfn = source_id & 0xff;
842 u8 bus = source_id >> 8;
35bf49e0 843 struct dma_pte *pgtable;
914ff771
KMP
844
845 pr_info("Dump %s table entries for IOVA 0x%llx\n", iommu->name, addr);
846
847 /* root entry dump */
848 rt_entry = &iommu->root_entry[bus];
849 if (!rt_entry) {
850 pr_info("root table entry is not present\n");
851 return;
852 }
853
854 if (sm_supported(iommu))
855 pr_info("scalable mode root entry: hi 0x%016llx, low 0x%016llx\n",
856 rt_entry->hi, rt_entry->lo);
857 else
858 pr_info("root entry: 0x%016llx", rt_entry->lo);
859
860 /* context entry dump */
861 ctx_entry = iommu_context_addr(iommu, bus, devfn, 0);
862 if (!ctx_entry) {
863 pr_info("context table entry is not present\n");
864 return;
865 }
866
867 pr_info("context entry: hi 0x%016llx, low 0x%016llx\n",
868 ctx_entry->hi, ctx_entry->lo);
869
870 /* legacy mode does not require PASID entries */
35bf49e0
LB
871 if (!sm_supported(iommu)) {
872 level = agaw_to_level(ctx_entry->hi & 7);
873 pgtable = phys_to_virt(ctx_entry->lo & VTD_PAGE_MASK);
914ff771 874 goto pgtable_walk;
35bf49e0 875 }
914ff771
KMP
876
877 /* get the pointer to pasid directory entry */
878 dir = phys_to_virt(ctx_entry->lo & VTD_PAGE_MASK);
879 if (!dir) {
880 pr_info("pasid directory entry is not present\n");
881 return;
882 }
883 /* For request-without-pasid, get the pasid from context entry */
fffaed1e 884 if (intel_iommu_sm && pasid == IOMMU_PASID_INVALID)
42987801 885 pasid = IOMMU_NO_PASID;
914ff771
KMP
886
887 dir_index = pasid >> PASID_PDE_SHIFT;
888 pde = &dir[dir_index];
889 pr_info("pasid dir entry: 0x%016llx\n", pde->val);
890
891 /* get the pointer to the pasid table entry */
892 entries = get_pasid_table_from_pde(pde);
893 if (!entries) {
894 pr_info("pasid table entry is not present\n");
895 return;
896 }
897 index = pasid & PASID_PTE_MASK;
898 pte = &entries[index];
899 for (i = 0; i < ARRAY_SIZE(pte->val); i++)
900 pr_info("pasid table entry[%d]: 0x%016llx\n", i, pte->val[i]);
901
35bf49e0
LB
902 if (pasid_pte_get_pgtt(pte) == PASID_ENTRY_PGTT_FL_ONLY) {
903 level = pte->val[2] & BIT_ULL(2) ? 5 : 4;
904 pgtable = phys_to_virt(pte->val[2] & VTD_PAGE_MASK);
905 } else {
906 level = agaw_to_level((pte->val[0] >> 2) & 0x7);
907 pgtable = phys_to_virt(pte->val[0] & VTD_PAGE_MASK);
908 }
909
914ff771 910pgtable_walk:
35bf49e0 911 pgtable_walk(iommu, addr >> VTD_PAGE_SHIFT, bus, devfn, pgtable, level);
914ff771
KMP
912}
913#endif
914
b026fd28 915static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
2d4d7676
JG
916 unsigned long pfn, int *target_level,
917 gfp_t gfp)
ba395927 918{
e083ea5b 919 struct dma_pte *parent, *pte;
ba395927 920 int level = agaw_to_level(domain->agaw);
4399c8bf 921 int offset;
ba395927 922
162d1b10 923 if (!domain_pfn_supported(domain, pfn))
f9423606 924 /* Address beyond IOMMU's addressing capabilities. */
925 return NULL;
926
ba395927
KA
927 parent = domain->pgd;
928
5cf0a76f 929 while (1) {
ba395927
KA
930 void *tmp_page;
931
b026fd28 932 offset = pfn_level_offset(pfn, level);
ba395927 933 pte = &parent[offset];
5cf0a76f 934 if (!*target_level && (dma_pte_superpage(pte) || !dma_pte_present(pte)))
6dd9a7c7 935 break;
5cf0a76f 936 if (level == *target_level)
ba395927
KA
937 break;
938
19c239ce 939 if (!dma_pte_present(pte)) {
c85994e4
DW
940 uint64_t pteval;
941
2d4d7676 942 tmp_page = alloc_pgtable_page(domain->nid, gfp);
ba395927 943
206a73c1 944 if (!tmp_page)
ba395927 945 return NULL;
206a73c1 946
c85994e4 947 domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE);
64de5af0 948 pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE;
e5b0feb4 949 if (domain->use_first_level)
242b0aae
TZ
950 pteval |= DMA_FL_PTE_XD | DMA_FL_PTE_US | DMA_FL_PTE_ACCESS;
951
effad4b5 952 if (cmpxchg64(&pte->val, 0ULL, pteval))
c85994e4 953 /* Someone else set it while we were thinking; use theirs. */
954 free_pgtable_page(tmp_page);
effad4b5 955 else
c85994e4 956 domain_flush_cache(domain, pte, sizeof(*pte));
ba395927 957 }
5cf0a76f
DW
958 if (level == 1)
959 break;
960
19c239ce 961 parent = phys_to_virt(dma_pte_addr(pte));
ba395927
KA
962 level--;
963 }
964
5cf0a76f
DW
965 if (!*target_level)
966 *target_level = level;
967
ba395927
KA
968 return pte;
969}
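/*
 * Usage note (illustrative): callers pass *target_level == 1 to get (or
 * allocate on the way down) the leaf PTE for a 4KiB mapping, while
 * *target_level == 0 makes the walk stop at the first superpage or
 * non-present entry and report that level back through *target_level.
 */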
970
971/* return address's pte at specific level */
90dcfb5e
DW
972static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain,
973 unsigned long pfn,
6dd9a7c7 974 int level, int *large_page)
ba395927 975{
e083ea5b 976 struct dma_pte *parent, *pte;
ba395927
KA
977 int total = agaw_to_level(domain->agaw);
978 int offset;
979
980 parent = domain->pgd;
981 while (level <= total) {
90dcfb5e 982 offset = pfn_level_offset(pfn, total);
ba395927
KA
983 pte = &parent[offset];
984 if (level == total)
985 return pte;
986
6dd9a7c7
YS
987 if (!dma_pte_present(pte)) {
988 *large_page = total;
ba395927 989 break;
6dd9a7c7
YS
990 }
991
e16922af 992 if (dma_pte_superpage(pte)) {
6dd9a7c7
YS
993 *large_page = total;
994 return pte;
995 }
996
19c239ce 997 parent = phys_to_virt(dma_pte_addr(pte));
ba395927
KA
998 total--;
999 }
1000 return NULL;
1001}
1002
ba395927 1003/* clear last level pte, a tlb flush should be followed */
5cf0a76f 1004static void dma_pte_clear_range(struct dmar_domain *domain,
595badf5
DW
1005 unsigned long start_pfn,
1006 unsigned long last_pfn)
ba395927 1007{
e083ea5b 1008 unsigned int large_page;
310a5ab9 1009 struct dma_pte *first_pte, *pte;
66eae846 1010
35dc5d89
TZ
1011 if (WARN_ON(!domain_pfn_supported(domain, last_pfn)) ||
1012 WARN_ON(start_pfn > last_pfn))
1013 return;
ba395927 1014
04b18e65 1015 /* we don't need lock here; nobody else touches the iova range */
59c36286 1016 do {
6dd9a7c7
YS
1017 large_page = 1;
1018 first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1, &large_page);
310a5ab9 1019 if (!pte) {
6dd9a7c7 1020 start_pfn = align_to_level(start_pfn + 1, large_page + 1);
310a5ab9
DW
1021 continue;
1022 }
6dd9a7c7 1023 do {
310a5ab9 1024 dma_clear_pte(pte);
6dd9a7c7 1025 start_pfn += lvl_to_nr_pages(large_page);
310a5ab9 1026 pte++;
75e6bf96
DW
1027 } while (start_pfn <= last_pfn && !first_pte_in_page(pte));
1028
310a5ab9
DW
1029 domain_flush_cache(domain, first_pte,
1030 (void *)pte - (void *)first_pte);
59c36286
DW
1031
1032 } while (start_pfn && start_pfn <= last_pfn);
ba395927
KA
1033}
1034
3269ee0b 1035static void dma_pte_free_level(struct dmar_domain *domain, int level,
bc24c571
DD
1036 int retain_level, struct dma_pte *pte,
1037 unsigned long pfn, unsigned long start_pfn,
1038 unsigned long last_pfn)
3269ee0b
AW
1039{
1040 pfn = max(start_pfn, pfn);
1041 pte = &pte[pfn_level_offset(pfn, level)];
1042
1043 do {
1044 unsigned long level_pfn;
1045 struct dma_pte *level_pte;
1046
1047 if (!dma_pte_present(pte) || dma_pte_superpage(pte))
1048 goto next;
1049
f7116e11 1050 level_pfn = pfn & level_mask(level);
3269ee0b
AW
1051 level_pte = phys_to_virt(dma_pte_addr(pte));
1052
bc24c571
DD
1053 if (level > 2) {
1054 dma_pte_free_level(domain, level - 1, retain_level,
1055 level_pte, level_pfn, start_pfn,
1056 last_pfn);
1057 }
3269ee0b 1058
bc24c571
DD
1059 /*
1060 * Free the page table if we're below the level we want to
1061 * retain and the range covers the entire table.
1062 */
1063 if (level < retain_level && !(start_pfn > level_pfn ||
08336fd2 1064 last_pfn < level_pfn + level_size(level) - 1)) {
3269ee0b
AW
1065 dma_clear_pte(pte);
1066 domain_flush_cache(domain, pte, sizeof(*pte));
1067 free_pgtable_page(level_pte);
1068 }
1069next:
1070 pfn += level_size(level);
1071 } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1072}
1073
bc24c571
DD
1074/*
1075 * clear last level (leaf) ptes and free page table pages below the
1076 * level we wish to keep intact.
1077 */
ba395927 1078static void dma_pte_free_pagetable(struct dmar_domain *domain,
d794dc9b 1079 unsigned long start_pfn,
bc24c571
DD
1080 unsigned long last_pfn,
1081 int retain_level)
ba395927 1082{
d41a4adb
JL
1083 dma_pte_clear_range(domain, start_pfn, last_pfn);
1084
f3a0a52f 1085 /* We don't need lock here; nobody else touches the iova range */
bc24c571 1086 dma_pte_free_level(domain, agaw_to_level(domain->agaw), retain_level,
3269ee0b 1087 domain->pgd, 0, start_pfn, last_pfn);
6660c63a 1088
ba395927 1089 /* free pgd */
d794dc9b 1090 if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
ba395927
KA
1091 free_pgtable_page(domain->pgd);
1092 domain->pgd = NULL;
1093 }
1094}
1095
ea8ea460 1096/* When a page at a given level is being unlinked from its parent, we don't
1097 need to *modify* it at all. All we need to do is make a list of all the
1098 pages which can be freed just as soon as we've flushed the IOTLB and we
1099 know the hardware page-walk will no longer touch them.
1100 The 'pte' argument is the *parent* PTE, pointing to the page that is to
1101 be freed. */
87f60cc6
MWO
1102static void dma_pte_list_pagetables(struct dmar_domain *domain,
1103 int level, struct dma_pte *pte,
1104 struct list_head *freelist)
ea8ea460
DW
1105{
1106 struct page *pg;
1107
1108 pg = pfn_to_page(dma_pte_addr(pte) >> PAGE_SHIFT);
87f60cc6 1109 list_add_tail(&pg->lru, freelist);
ea8ea460
DW
1110
1111 if (level == 1)
87f60cc6 1112 return;
ea8ea460 1113
adeb2590
JL
1114 pte = page_address(pg);
1115 do {
ea8ea460 1116 if (dma_pte_present(pte) && !dma_pte_superpage(pte))
87f60cc6 1117 dma_pte_list_pagetables(domain, level - 1, pte, freelist);
adeb2590
JL
1118 pte++;
1119 } while (!first_pte_in_page(pte));
ea8ea460
DW
1120}
1121
87f60cc6
MWO
1122static void dma_pte_clear_level(struct dmar_domain *domain, int level,
1123 struct dma_pte *pte, unsigned long pfn,
1124 unsigned long start_pfn, unsigned long last_pfn,
1125 struct list_head *freelist)
ea8ea460
DW
1126{
1127 struct dma_pte *first_pte = NULL, *last_pte = NULL;
1128
1129 pfn = max(start_pfn, pfn);
1130 pte = &pte[pfn_level_offset(pfn, level)];
1131
1132 do {
86dc40c7 1133 unsigned long level_pfn = pfn & level_mask(level);
ea8ea460
DW
1134
1135 if (!dma_pte_present(pte))
1136 goto next;
1137
ea8ea460
DW
1138 /* If range covers entire pagetable, free it */
1139 if (start_pfn <= level_pfn &&
1140 last_pfn >= level_pfn + level_size(level) - 1) {
1141 /* These subordinate page tables are going away entirely. Don't
1142 bother to clear them; we're just going to *free* them. */
1143 if (level > 1 && !dma_pte_superpage(pte))
87f60cc6 1144 dma_pte_list_pagetables(domain, level - 1, pte, freelist);
ea8ea460
DW
1145
1146 dma_clear_pte(pte);
1147 if (!first_pte)
1148 first_pte = pte;
1149 last_pte = pte;
1150 } else if (level > 1) {
1151 /* Recurse down into a level that isn't *entirely* obsolete */
87f60cc6
MWO
1152 dma_pte_clear_level(domain, level - 1,
1153 phys_to_virt(dma_pte_addr(pte)),
1154 level_pfn, start_pfn, last_pfn,
1155 freelist);
ea8ea460
DW
1156 }
1157next:
86dc40c7 1158 pfn = level_pfn + level_size(level);
ea8ea460
DW
1159 } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1160
1161 if (first_pte)
1162 domain_flush_cache(domain, first_pte,
1163 (void *)++last_pte - (void *)first_pte);
ea8ea460
DW
1164}
1165
1166/* We can't just free the pages because the IOMMU may still be walking
1167 the page tables, and may have cached the intermediate levels. The
1168 pages can only be freed after the IOTLB flush has been done. */
87f60cc6
MWO
1169static void domain_unmap(struct dmar_domain *domain, unsigned long start_pfn,
1170 unsigned long last_pfn, struct list_head *freelist)
ea8ea460 1171{
35dc5d89
TZ
1172 if (WARN_ON(!domain_pfn_supported(domain, last_pfn)) ||
1173 WARN_ON(start_pfn > last_pfn))
1174 return;
ea8ea460
DW
1175
1176 /* we don't need lock here; nobody else touches the iova range */
87f60cc6
MWO
1177 dma_pte_clear_level(domain, agaw_to_level(domain->agaw),
1178 domain->pgd, 0, start_pfn, last_pfn, freelist);
ea8ea460
DW
1179
1180 /* free pgd */
1181 if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1182 struct page *pgd_page = virt_to_page(domain->pgd);
87f60cc6 1183 list_add_tail(&pgd_page->lru, freelist);
ea8ea460
DW
1184 domain->pgd = NULL;
1185 }
ea8ea460
DW
1186}
1187
ba395927
KA
1188/* iommu handling */
1189static int iommu_alloc_root_entry(struct intel_iommu *iommu)
1190{
1191 struct root_entry *root;
ba395927 1192
82d9654f 1193 root = alloc_pgtable_page(iommu->node, GFP_ATOMIC);
ffebeb46 1194 if (!root) {
9f10e5bf 1195 pr_err("Allocating root entry for %s failed\n",
ffebeb46 1196 iommu->name);
ba395927 1197 return -ENOMEM;
ffebeb46 1198 }
ba395927 1199
5b6985ce 1200 __iommu_flush_cache(iommu, root, ROOT_SIZE);
ba395927 1201 iommu->root_entry = root;
ba395927
KA
1202
1203 return 0;
1204}
1205
ba395927
KA
1206static void iommu_set_root_entry(struct intel_iommu *iommu)
1207{
03ecc32c 1208 u64 addr;
c416daa9 1209 u32 sts;
ba395927
KA
1210 unsigned long flag;
1211
03ecc32c 1212 addr = virt_to_phys(iommu->root_entry);
7373a8cc
LB
1213 if (sm_supported(iommu))
1214 addr |= DMA_RTADDR_SMT;
ba395927 1215
1f5b3c3f 1216 raw_spin_lock_irqsave(&iommu->register_lock, flag);
03ecc32c 1217 dmar_writeq(iommu->reg + DMAR_RTADDR_REG, addr);
ba395927 1218
c416daa9 1219 writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG);
ba395927
KA
1220
1221 /* Make sure hardware complete it */
1222 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
c416daa9 1223 readl, (sts & DMA_GSTS_RTPS), sts);
ba395927 1224
1f5b3c3f 1225 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
c0474a60 1226
6ad931a2
LB
1227 /*
1228 * Hardware invalidates all DMA remapping hardware translation
1229 * caches as part of SRTP flow.
1230 */
1231 if (cap_esrtps(iommu->cap))
1232 return;
1233
c0474a60
LB
1234 iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
1235 if (sm_supported(iommu))
1236 qi_flush_pasid_cache(iommu, 0, QI_PC_GLOBAL, 0);
1237 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
ba395927
KA
1238}
1239
6f7db75e 1240void iommu_flush_write_buffer(struct intel_iommu *iommu)
ba395927
KA
1241{
1242 u32 val;
1243 unsigned long flag;
1244
9af88143 1245 if (!rwbf_quirk && !cap_rwbf(iommu->cap))
ba395927 1246 return;
ba395927 1247
1f5b3c3f 1248 raw_spin_lock_irqsave(&iommu->register_lock, flag);
462b60f6 1249 writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG);
ba395927
KA
1250
1251 /* Make sure hardware complete it */
1252 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
c416daa9 1253 readl, (!(val & DMA_GSTS_WBFS)), val);
ba395927 1254
1f5b3c3f 1255 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
ba395927
KA
1256}
1257
1258/* return value determines if we need a write buffer flush */
4c25a2c1
DW
1259static void __iommu_flush_context(struct intel_iommu *iommu,
1260 u16 did, u16 source_id, u8 function_mask,
1261 u64 type)
ba395927
KA
1262{
1263 u64 val = 0;
1264 unsigned long flag;
1265
ba395927
KA
1266 switch (type) {
1267 case DMA_CCMD_GLOBAL_INVL:
1268 val = DMA_CCMD_GLOBAL_INVL;
1269 break;
1270 case DMA_CCMD_DOMAIN_INVL:
1271 val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
1272 break;
1273 case DMA_CCMD_DEVICE_INVL:
1274 val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
1275 | DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
1276 break;
1277 default:
4a627a25
TZ
1278 pr_warn("%s: Unexpected context-cache invalidation type 0x%llx\n",
1279 iommu->name, type);
1280 return;
ba395927
KA
1281 }
1282 val |= DMA_CCMD_ICC;
1283
1f5b3c3f 1284 raw_spin_lock_irqsave(&iommu->register_lock, flag);
ba395927
KA
1285 dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
1286
1287 /* Make sure hardware complete it */
1288 IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
1289 dmar_readq, (!(val & DMA_CCMD_ICC)), val);
1290
1f5b3c3f 1291 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
ba395927
KA
1292}
1293
ba395927 1294/* return value determines if we need a write buffer flush */
1f0ef2aa
DW
1295static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
1296 u64 addr, unsigned int size_order, u64 type)
ba395927
KA
1297{
1298 int tlb_offset = ecap_iotlb_offset(iommu->ecap);
1299 u64 val = 0, val_iva = 0;
1300 unsigned long flag;
1301
ba395927
KA
1302 switch (type) {
1303 case DMA_TLB_GLOBAL_FLUSH:
1304 /* global flush doesn't need set IVA_REG */
1305 val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
1306 break;
1307 case DMA_TLB_DSI_FLUSH:
1308 val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1309 break;
1310 case DMA_TLB_PSI_FLUSH:
1311 val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
ea8ea460 1312 /* IH bit is passed in as part of address */
ba395927
KA
1313 val_iva = size_order | addr;
1314 break;
1315 default:
4a627a25
TZ
1316 pr_warn("%s: Unexpected iotlb invalidation type 0x%llx\n",
1317 iommu->name, type);
1318 return;
ba395927 1319 }
b4da4e11 1320
ba395927
KA
1321 if (cap_write_drain(iommu->cap))
1322 val |= DMA_TLB_WRITE_DRAIN;
1323
1f5b3c3f 1324 raw_spin_lock_irqsave(&iommu->register_lock, flag);
ba395927
KA
1325 /* Note: Only uses first TLB reg currently */
1326 if (val_iva)
1327 dmar_writeq(iommu->reg + tlb_offset, val_iva);
1328 dmar_writeq(iommu->reg + tlb_offset + 8, val);
1329
1330 /* Make sure hardware complete it */
1331 IOMMU_WAIT_OP(iommu, tlb_offset + 8,
1332 dmar_readq, (!(val & DMA_TLB_IVT)), val);
1333
1f5b3c3f 1334 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
ba395927
KA
1335
1336 /* check IOTLB invalidation granularity */
1337 if (DMA_TLB_IAIG(val) == 0)
9f10e5bf 1338 pr_err("Flush IOTLB failed\n");
ba395927 1339 if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
9f10e5bf 1340 pr_debug("TLB flush request %Lx, actual %Lx\n",
5b6985ce
FY
1341 (unsigned long long)DMA_TLB_IIRG(type),
1342 (unsigned long long)DMA_TLB_IAIG(val));
ba395927
KA
1343}
1344
64ae892b 1345static struct device_domain_info *
0faa19a1
LB
1346domain_lookup_dev_info(struct dmar_domain *domain,
1347 struct intel_iommu *iommu, u8 bus, u8 devfn)
93a23a72 1348{
93a23a72 1349 struct device_domain_info *info;
a349ffcb 1350 unsigned long flags;
93a23a72 1351
a349ffcb 1352 spin_lock_irqsave(&domain->lock, flags);
969aaefb 1353 list_for_each_entry(info, &domain->devices, link) {
c3b497c6
JL
1354 if (info->iommu == iommu && info->bus == bus &&
1355 info->devfn == devfn) {
a349ffcb 1356 spin_unlock_irqrestore(&domain->lock, flags);
0faa19a1 1357 return info;
93a23a72 1358 }
969aaefb 1359 }
a349ffcb 1360 spin_unlock_irqrestore(&domain->lock, flags);
93a23a72 1361
b16d0cb9 1362 return NULL;
93a23a72
YZ
1363}
1364
0824c592
OP
1365static void domain_update_iotlb(struct dmar_domain *domain)
1366{
7d0c9da6 1367 struct dev_pasid_info *dev_pasid;
0824c592
OP
1368 struct device_domain_info *info;
1369 bool has_iotlb_device = false;
a349ffcb 1370 unsigned long flags;
0824c592 1371
a349ffcb 1372 spin_lock_irqsave(&domain->lock, flags);
969aaefb 1373 list_for_each_entry(info, &domain->devices, link) {
7c29ada5 1374 if (info->ats_enabled) {
0824c592
OP
1375 has_iotlb_device = true;
1376 break;
1377 }
969aaefb 1378 }
7d0c9da6
LB
1379
1380 list_for_each_entry(dev_pasid, &domain->dev_pasids, link_domain) {
1381 info = dev_iommu_priv_get(dev_pasid->dev);
1382 if (info->ats_enabled) {
1383 has_iotlb_device = true;
1384 break;
1385 }
1386 }
0824c592 1387 domain->has_iotlb_device = has_iotlb_device;
a349ffcb 1388 spin_unlock_irqrestore(&domain->lock, flags);
0824c592
OP
1389}
1390
e65a6897 1391/*
1392 * The extra devTLB flush quirk impacts those QAT devices with PCI device
1393 * IDs ranging from 0x4940 to 0x4943. It is exempted from risky_device()
1394 * check because it applies only to the built-in QAT devices and it doesn't
1395 * grant additional privileges.
1396 */
81c95fba 1397#define BUGGY_QAT_DEVID_MASK 0x4940
e65a6897
JP
1398static bool dev_needs_extra_dtlb_flush(struct pci_dev *pdev)
1399{
1400 if (pdev->vendor != PCI_VENDOR_ID_INTEL)
1401 return false;
1402
1403 if ((pdev->device & 0xfffc) != BUGGY_QAT_DEVID_MASK)
1404 return false;
1405
1406 return true;
1407}
1408
0faa19a1 1409static void iommu_enable_pci_caps(struct device_domain_info *info)
ba395927 1410{
fb0cc3aa
BH
1411 struct pci_dev *pdev;
1412
c7be17c2 1413 if (!dev_is_pci(info->dev))
93a23a72
YZ
1414 return;
1415
fb0cc3aa 1416 pdev = to_pci_dev(info->dev);
fb0cc3aa 1417
b16d0cb9
DW
1418 /* The PCIe spec, in its wisdom, declares that the behaviour of
1419 the device if you enable PASID support after ATS support is
1420 undefined. So always enable PASID support on devices which
1421 have it, even if we can't yet know if we're ever going to
1422 use it. */
1423 if (info->pasid_supported && !pci_enable_pasid(pdev, info->pasid_supported & ~1))
1424 info->pasid_enabled = 1;
1425
da656a04 1426 if (info->ats_supported && pci_ats_page_aligned(pdev) &&
fb58fdcd 1427 !pci_enable_ats(pdev, VTD_PAGE_SHIFT)) {
b16d0cb9 1428 info->ats_enabled = 1;
0824c592 1429 domain_update_iotlb(info->domain);
b16d0cb9 1430 }
93a23a72
YZ
1431}
1432
ba502132 1433static void iommu_disable_pci_caps(struct device_domain_info *info)
93a23a72 1434{
b16d0cb9
DW
1435 struct pci_dev *pdev;
1436
da972fb1 1437 if (!dev_is_pci(info->dev))
93a23a72
YZ
1438 return;
1439
b16d0cb9
DW
1440 pdev = to_pci_dev(info->dev);
1441
1442 if (info->ats_enabled) {
1443 pci_disable_ats(pdev);
1444 info->ats_enabled = 0;
0824c592 1445 domain_update_iotlb(info->domain);
b16d0cb9 1446 }
0faa19a1 1447
b16d0cb9
DW
1448 if (info->pasid_enabled) {
1449 pci_disable_pasid(pdev);
1450 info->pasid_enabled = 0;
1451 }
93a23a72
YZ
1452}
1453
7c29ada5
LY
1454static void __iommu_flush_dev_iotlb(struct device_domain_info *info,
1455 u64 addr, unsigned int mask)
1456{
1457 u16 sid, qdep;
1458
1459 if (!info || !info->ats_enabled)
1460 return;
1461
1462 sid = info->bus << 8 | info->devfn;
1463 qdep = info->ats_qdep;
1464 qi_flush_dev_iotlb(info->iommu, sid, info->pfsid,
1465 qdep, addr, mask);
42987801 1466 quirk_extra_dev_tlb_flush(info, addr, mask, IOMMU_NO_PASID, qdep);
7c29ada5
LY
1467}
1468
93a23a72
YZ
1469static void iommu_flush_dev_iotlb(struct dmar_domain *domain,
1470 u64 addr, unsigned mask)
1471{
7d0c9da6 1472 struct dev_pasid_info *dev_pasid;
93a23a72 1473 struct device_domain_info *info;
a349ffcb 1474 unsigned long flags;
93a23a72 1475
0824c592
OP
1476 if (!domain->has_iotlb_device)
1477 return;
1478
a349ffcb 1479 spin_lock_irqsave(&domain->lock, flags);
7c29ada5
LY
1480 list_for_each_entry(info, &domain->devices, link)
1481 __iommu_flush_dev_iotlb(info, addr, mask);
7d0c9da6
LB
1482
1483 list_for_each_entry(dev_pasid, &domain->dev_pasids, link_domain) {
1484 info = dev_iommu_priv_get(dev_pasid->dev);
1485
1486 if (!info->ats_enabled)
1487 continue;
1488
1489 qi_flush_dev_iotlb_pasid(info->iommu,
1490 PCI_DEVID(info->bus, info->devfn),
1491 info->pfsid, dev_pasid->pasid,
1492 info->ats_qdep, addr,
1493 mask);
1494 }
a349ffcb 1495 spin_unlock_irqrestore(&domain->lock, flags);
93a23a72
YZ
1496}
1497
ac1a3483
LB
1498static void domain_flush_pasid_iotlb(struct intel_iommu *iommu,
1499 struct dmar_domain *domain, u64 addr,
1500 unsigned long npages, bool ih)
1501{
1502 u16 did = domain_id_iommu(domain, iommu);
7d0c9da6 1503 struct dev_pasid_info *dev_pasid;
ac1a3483
LB
1504 unsigned long flags;
1505
1506 spin_lock_irqsave(&domain->lock, flags);
7d0c9da6
LB
1507 list_for_each_entry(dev_pasid, &domain->dev_pasids, link_domain)
1508 qi_flush_piotlb(iommu, did, dev_pasid->pasid, addr, npages, ih);
1509
37f900e7
LB
1510 if (!list_empty(&domain->devices))
1511 qi_flush_piotlb(iommu, did, IOMMU_NO_PASID, addr, npages, ih);
a349ffcb 1512 spin_unlock_irqrestore(&domain->lock, flags);
93a23a72
YZ
1513}
1514
a1ddcbe9
JR
1515static void iommu_flush_iotlb_psi(struct intel_iommu *iommu,
1516 struct dmar_domain *domain,
1517 unsigned long pfn, unsigned int pages,
1518 int ih, int map)
ba395927 1519{
59bf3557
DS
1520 unsigned int aligned_pages = __roundup_pow_of_two(pages);
1521 unsigned int mask = ilog2(aligned_pages);
03d6a246 1522 uint64_t addr = (uint64_t)pfn << VTD_PAGE_SHIFT;
ba949f4c 1523 u16 did = domain_id_iommu(domain, iommu);
ba395927 1524
4a627a25
TZ
1525 if (WARN_ON(!pages))
1526 return;
ba395927 1527
ea8ea460
DW
1528 if (ih)
1529 ih = 1 << 6;
33cd6e64 1530
e5b0feb4 1531 if (domain->use_first_level) {
ac1a3483 1532 domain_flush_pasid_iotlb(iommu, domain, addr, pages, ih);
33cd6e64 1533 } else {
59bf3557 1534 unsigned long bitmask = aligned_pages - 1;
1535
1536 /*
1537 * PSI masks the low order bits of the base address. If the
1538 * address isn't aligned to the mask, then compute a mask value
1539 * needed to ensure the target range is flushed.
1540 */
1541 if (unlikely(bitmask & pfn)) {
1542 unsigned long end_pfn = pfn + pages - 1, shared_bits;
1543
1544 /*
1545 * Since end_pfn <= pfn + bitmask, the only way bits
1546 * higher than bitmask can differ in pfn and end_pfn is
1547 * by carrying. This means after masking out bitmask,
1548 * high bits starting with the first set bit in
1549 * shared_bits are all equal in both pfn and end_pfn.
1550 */
1551 shared_bits = ~(pfn ^ end_pfn) & ~bitmask;
1552 mask = shared_bits ? __ffs(shared_bits) : BITS_PER_LONG;
1553 }
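/*
 * Worked example (illustrative): flushing 2 pages at pfn 0x1003 gives
 * aligned_pages = 2 and mask = 1, but a 2-page aligned PSI based at
 * 0x1002 would miss 0x1004; shared_bits widens mask to 3 so the
 * flushed region 0x1000-0x1007 covers the whole range.
 */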
1554
33cd6e64
LB
1555 /*
1556 * Fallback to domain selective flush if no PSI support or
59bf3557 1557 * the size is too big.
33cd6e64
LB
1558 */
1559 if (!cap_pgsel_inv(iommu->cap) ||
1560 mask > cap_max_amask_val(iommu->cap))
1561 iommu->flush.flush_iotlb(iommu, did, 0, 0,
1562 DMA_TLB_DSI_FLUSH);
1563 else
1564 iommu->flush.flush_iotlb(iommu, did, addr | ih, mask,
1565 DMA_TLB_PSI_FLUSH);
1566 }
bf92df30
YZ
1567
1568 /*
82653633 1569 * In caching mode, changes of pages from non-present to present require
1570 * flush. However, device IOTLB doesn't need to be flushed in this case.
bf92df30 1571 */
82653633 1572 if (!cap_caching_mode(iommu->cap) || !map)
9d2e6505 1573 iommu_flush_dev_iotlb(domain, addr, mask);
ba395927
KA
1574}
1575
eed91a0b
PX
1576/* Notification for newly created mappings */
1577static inline void __mapping_notify_one(struct intel_iommu *iommu,
1578 struct dmar_domain *domain,
1579 unsigned long pfn, unsigned int pages)
1580{
33cd6e64
LB
1581 /*
1582 * It's a non-present to present mapping. Only flush if caching mode
1583 * and second level.
1584 */
e5b0feb4 1585 if (cap_caching_mode(iommu->cap) && !domain->use_first_level)
eed91a0b
PX
1586 iommu_flush_iotlb_psi(iommu, domain, pfn, pages, 0, 1);
1587 else
1588 iommu_flush_write_buffer(iommu);
1589}
1590
c588072b 1591static void intel_flush_iotlb_all(struct iommu_domain *domain)
13cf0174 1592{
c588072b 1593 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
ba949f4c
LB
1594 struct iommu_domain_info *info;
1595 unsigned long idx;
13cf0174 1596
ba949f4c
LB
1597 xa_for_each(&dmar_domain->iommu_array, idx, info) {
1598 struct intel_iommu *iommu = info->iommu;
1599 u16 did = domain_id_iommu(dmar_domain, iommu);
13cf0174 1600
e5b0feb4 1601 if (dmar_domain->use_first_level)
ac1a3483 1602 domain_flush_pasid_iotlb(iommu, dmar_domain, 0, -1, 0);
33cd6e64
LB
1603 else
1604 iommu->flush.flush_iotlb(iommu, did, 0, 0,
1605 DMA_TLB_DSI_FLUSH);
13cf0174
JR
1606
1607 if (!cap_caching_mode(iommu->cap))
402e6688 1608 iommu_flush_dev_iotlb(dmar_domain, 0, MAX_AGAW_PFN_WIDTH);
13cf0174
JR
1609 }
1610}
1611
f8bab735 1612static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
1613{
1614 u32 pmen;
1615 unsigned long flags;
1616
5bb71fc7
LB
1617 if (!cap_plmr(iommu->cap) && !cap_phmr(iommu->cap))
1618 return;
1619
1f5b3c3f 1620 raw_spin_lock_irqsave(&iommu->register_lock, flags);
f8bab735 1621 pmen = readl(iommu->reg + DMAR_PMEN_REG);
1622 pmen &= ~DMA_PMEN_EPM;
1623 writel(pmen, iommu->reg + DMAR_PMEN_REG);
1624
1625 /* wait for the protected region status bit to clear */
1626 IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
1627 readl, !(pmen & DMA_PMEN_PRS), pmen);
1628
1f5b3c3f 1629 raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
f8bab735 1630}
1631
2a41ccee 1632static void iommu_enable_translation(struct intel_iommu *iommu)
ba395927
KA
1633{
1634 u32 sts;
1635 unsigned long flags;
1636
1f5b3c3f 1637 raw_spin_lock_irqsave(&iommu->register_lock, flags);
c416daa9
DW
1638 iommu->gcmd |= DMA_GCMD_TE;
1639 writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
ba395927
KA
1640
1641 /* Make sure hardware complete it */
1642 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
c416daa9 1643 readl, (sts & DMA_GSTS_TES), sts);
ba395927 1644
1f5b3c3f 1645 raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
ba395927
KA
1646}
1647
2a41ccee 1648static void iommu_disable_translation(struct intel_iommu *iommu)
ba395927
KA
1649{
1650 u32 sts;
1651 unsigned long flag;
1652
b1012ca8
LB
1653 if (iommu_skip_te_disable && iommu->drhd->gfx_dedicated &&
1654 (cap_read_drain(iommu->cap) || cap_write_drain(iommu->cap)))
1655 return;
1656
1f5b3c3f 1657 raw_spin_lock_irqsave(&iommu->register_lock, flag);
ba395927
KA
1658 iommu->gcmd &= ~DMA_GCMD_TE;
1659 writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1660
1661 /* Make sure hardware complete it */
1662 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
c416daa9 1663 readl, (!(sts & DMA_GSTS_TES)), sts);
ba395927 1664
1f5b3c3f 1665 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
ba395927
KA
1666}
1667
1668static int iommu_init_domains(struct intel_iommu *iommu)
1669{
bb712573 1670 u32 ndomains;
ba395927
KA
1671
1672 ndomains = cap_ndoms(iommu->cap);
8bf47816 1673 pr_debug("%s: Number of Domains supported <%d>\n",
9f10e5bf 1674 iommu->name, ndomains);
ba395927 1675
94a91b50
DD
1676 spin_lock_init(&iommu->lock);
1677
bb712573 1678 iommu->domain_ids = bitmap_zalloc(ndomains, GFP_KERNEL);
5e41c998 1679 if (!iommu->domain_ids)
ba395927 1680 return -ENOMEM;
8bf47816 1681
ba395927 1682 /*
c0e8a6c8 1683 * If Caching mode is set, then invalid translations are tagged
1684 * with domain-id 0, hence we need to pre-allocate it. We also
1685 * use domain-id 0 as a marker for non-allocated domain-id, so
1686 * make sure it is not used for a real domain.
ba395927 1687 */
c0e8a6c8
JR
1688 set_bit(0, iommu->domain_ids);
1689
3b33d4ab 1690 /*
1691 * Vt-d spec rev3.0 (section 6.2.3.1) requires that each pasid
1692 * entry for first-level or pass-through translation modes should
1693 * be programmed with a domain id different from those used for
1694 * second-level or nested translation. We reserve a domain id for
1695 * this purpose.
1696 */
1697 if (sm_supported(iommu))
1698 set_bit(FLPT_DEFAULT_DID, iommu->domain_ids);
1699
ba395927
KA
1700 return 0;
1701}
ba395927 1702
ffebeb46 1703static void disable_dmar_iommu(struct intel_iommu *iommu)
ba395927 1704{
402e6688 1705 if (!iommu->domain_ids)
29a27719 1706 return;
a4eaa86c 1707
98f7b0db
LB
1708 /*
1709 * All iommu domains must have been detached from the devices,
1710 * hence there should be no domain IDs in use.
1711 */
1712 if (WARN_ON(bitmap_weight(iommu->domain_ids, cap_ndoms(iommu->cap))
1713 > NUM_RESERVED_DID))
1714 return;
ba395927
KA
1715
1716 if (iommu->gcmd & DMA_GCMD_TE)
1717 iommu_disable_translation(iommu);
ffebeb46 1718}
ba395927 1719
ffebeb46
JL
1720static void free_dmar_iommu(struct intel_iommu *iommu)
1721{
402e6688 1722 if (iommu->domain_ids) {
bb712573 1723 bitmap_free(iommu->domain_ids);
ffebeb46
JL
1724 iommu->domain_ids = NULL;
1725 }
ba395927 1726
0c5f6c0d
LB
1727 if (iommu->copied_tables) {
1728 bitmap_free(iommu->copied_tables);
1729 iommu->copied_tables = NULL;
1730 }
1731
ba395927
KA
1732 /* free context mapping */
1733 free_context_table(iommu);
8a94ade4
DW
1734
1735#ifdef CONFIG_INTEL_IOMMU_SVM
765b6a98 1736 if (pasid_supported(iommu)) {
a222a7f0
DW
1737 if (ecap_prs(iommu->ecap))
1738 intel_svm_finish_prq(iommu);
a222a7f0 1739 }
8a94ade4 1740#endif
ba395927
KA
1741}
1742
a1948f2e
LB
1743/*
1744 * Check and return whether first level is used by default for
b802d070 1745 * DMA translation.
a1948f2e 1746 */
032c5ee4 1747static bool first_level_by_default(unsigned int type)
a1948f2e 1748{
032c5ee4
LB
1749 /* Only SL is available in legacy mode */
1750 if (!scalable_mode_support())
1751 return false;
1752
1753 /* Only level (either FL or SL) is available, just use it */
1754 if (intel_cap_flts_sanity() ^ intel_cap_slts_sanity())
1755 return intel_cap_flts_sanity();
1756
1757 /* Both levels are available, decide it based on domain type */
1758 return type != IOMMU_DOMAIN_UNMANAGED;
a1948f2e
LB
1759}
1760
b34380a6 1761static struct dmar_domain *alloc_domain(unsigned int type)
ba395927 1762{
ba395927 1763 struct dmar_domain *domain;
ba395927 1764
ee2653bb 1765 domain = kzalloc(sizeof(*domain), GFP_KERNEL);
ba395927
KA
1766 if (!domain)
1767 return NULL;
1768
98fa15f3 1769 domain->nid = NUMA_NO_NODE;
032c5ee4 1770 if (first_level_by_default(type))
e5b0feb4 1771 domain->use_first_level = true;
0824c592 1772 domain->has_iotlb_device = false;
92d03cc8 1773 INIT_LIST_HEAD(&domain->devices);
7d0c9da6 1774 INIT_LIST_HEAD(&domain->dev_pasids);
5eaafdf0 1775 spin_lock_init(&domain->lock);
ba949f4c 1776 xa_init(&domain->iommu_array);
2c2e2c38
FY
1777
1778 return domain;
1779}
1780
d160aca5 1781static int domain_attach_iommu(struct dmar_domain *domain,
fb170fb4
JL
1782 struct intel_iommu *iommu)
1783{
ba949f4c 1784 struct iommu_domain_info *info, *curr;
44bde614 1785 unsigned long ndomains;
ba949f4c 1786 int num, ret = -ENOSPC;
44bde614 1787
ba949f4c
LB
1788 info = kzalloc(sizeof(*info), GFP_KERNEL);
1789 if (!info)
1790 return -ENOMEM;
ba395927 1791
2c3262f9 1792 spin_lock(&iommu->lock);
ba949f4c
LB
1793 curr = xa_load(&domain->iommu_array, iommu->seq_id);
1794 if (curr) {
1795 curr->refcnt++;
1796 spin_unlock(&iommu->lock);
1797 kfree(info);
1798 return 0;
1799 }
d160aca5 1800
ba949f4c
LB
1801 ndomains = cap_ndoms(iommu->cap);
1802 num = find_first_zero_bit(iommu->domain_ids, ndomains);
1803 if (num >= ndomains) {
1804 pr_err("%s: No free domain ids\n", iommu->name);
1805 goto err_unlock;
fb170fb4 1806 }
ba395927 1807
ba949f4c
LB
1808 set_bit(num, iommu->domain_ids);
1809 info->refcnt = 1;
1810 info->did = num;
1811 info->iommu = iommu;
1812 curr = xa_cmpxchg(&domain->iommu_array, iommu->seq_id,
1813 NULL, info, GFP_ATOMIC);
1814 if (curr) {
1815 ret = xa_err(curr) ? : -EBUSY;
1816 goto err_clear;
fb170fb4 1817 }
ba949f4c 1818 domain_update_iommu_cap(domain);
d160aca5 1819
2c3262f9 1820 spin_unlock(&iommu->lock);
55d94043 1821 return 0;
ba949f4c
LB
1822
1823err_clear:
1824 clear_bit(info->did, iommu->domain_ids);
1825err_unlock:
1826 spin_unlock(&iommu->lock);
1827 kfree(info);
2c3262f9 1828 return ret;
fb170fb4
JL
1829}
1830
74f6d776
PP
1831static void domain_detach_iommu(struct dmar_domain *domain,
1832 struct intel_iommu *iommu)
fb170fb4 1833{
ba949f4c 1834 struct iommu_domain_info *info;
fb170fb4 1835
2c3262f9 1836 spin_lock(&iommu->lock);
ba949f4c
LB
1837 info = xa_load(&domain->iommu_array, iommu->seq_id);
1838 if (--info->refcnt == 0) {
1839 clear_bit(info->did, iommu->domain_ids);
1840 xa_erase(&domain->iommu_array, iommu->seq_id);
1841 domain->nid = NUMA_NO_NODE;
fb170fb4 1842 domain_update_iommu_cap(domain);
ba949f4c 1843 kfree(info);
fb170fb4 1844 }
2c3262f9 1845 spin_unlock(&iommu->lock);
fb170fb4
JL
1846}
1847
ba395927
KA
1848static inline int guestwidth_to_adjustwidth(int gaw)
1849{
1850 int agaw;
1851 int r = (gaw - 12) % 9;
1852
1853 if (r == 0)
1854 agaw = gaw;
1855 else
1856 agaw = gaw + 9 - r;
1857 if (agaw > 64)
1858 agaw = 64;
1859 return agaw;
1860}
1861
ba395927
KA
1862static void domain_exit(struct dmar_domain *domain)
1863{
3ee9eca7 1864 if (domain->pgd) {
87f60cc6 1865 LIST_HEAD(freelist);
ba395927 1866
87f60cc6
MWO
1867 domain_unmap(domain, 0, DOMAIN_MAX_PFN(domain->gaw), &freelist);
1868 put_pages_list(&freelist);
3ee9eca7 1869 }
ea8ea460 1870
79d82ce4
LB
1871 if (WARN_ON(!list_empty(&domain->devices)))
1872 return;
1873
ee2653bb 1874 kfree(domain);
ba395927
KA
1875}
1876
7373a8cc
LB
1877/*
1878 * Get the PASID directory size for scalable mode context entry.
1879 * A value of X in the PDTS field of a scalable mode context entry
1880 * indicates a PASID directory with 2^(X + 7) entries.
1881 */
1882static inline unsigned long context_get_sm_pds(struct pasid_table *table)
1883{
4599d78a 1884 unsigned long pds, max_pde;
7373a8cc
LB
1885
1886 max_pde = table->max_pasid >> PASID_PDE_SHIFT;
4599d78a 1887 pds = find_first_bit(&max_pde, MAX_NR_PASID_BITS);
7373a8cc
LB
1888 if (pds < 7)
1889 return 0;
1890
1891 return pds - 7;
1892}
1893
1894/*
1895 * Set the RID_PASID field of a scalable mode context entry. The
1896 * IOMMU hardware will use the PASID value set in this field for
1897 * DMA translations of DMA requests without PASID.
1898 */
1899static inline void
1900context_set_sm_rid2pasid(struct context_entry *context, unsigned long pasid)
1901{
1902 context->hi |= pasid & ((1 << 20) - 1);
7373a8cc
LB
1903}
1904
1905/*
1906 * Set the DTE(Device-TLB Enable) field of a scalable mode context
1907 * entry.
1908 */
1909static inline void context_set_sm_dte(struct context_entry *context)
1910{
b31064f8 1911 context->lo |= BIT_ULL(2);
7373a8cc
LB
1912}
1913
1914/*
1915 * Set the PRE(Page Request Enable) field of a scalable mode context
1916 * entry.
1917 */
1918static inline void context_set_sm_pre(struct context_entry *context)
1919{
b31064f8 1920 context->lo |= BIT_ULL(4);
7373a8cc
LB
1921}
1922
1923/* Convert value to context PASID directory size field coding. */
1924#define context_pdts(pds) (((pds) & 0x7) << 9)
1925
64ae892b
DW
1926static int domain_context_mapping_one(struct dmar_domain *domain,
1927 struct intel_iommu *iommu,
ca6e322d 1928 struct pasid_table *table,
28ccce0d 1929 u8 bus, u8 devfn)
ba395927 1930{
969aaefb 1931 struct device_domain_info *info =
0faa19a1 1932 domain_lookup_dev_info(domain, iommu, bus, devfn);
ba949f4c 1933 u16 did = domain_id_iommu(domain, iommu);
28ccce0d 1934 int translation = CONTEXT_TT_MULTI_LEVEL;
ba395927 1935 struct context_entry *context;
7373a8cc 1936 int ret;
28ccce0d
JR
1937
1938 if (hw_pass_through && domain_type_is_si(domain))
1939 translation = CONTEXT_TT_PASS_THROUGH;
ba395927
KA
1940
1941 pr_debug("Set context mapping for %02x:%02x.%d\n",
1942 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
4ed0d3e6 1943
55d94043 1944 spin_lock(&iommu->lock);
55d94043 1945 ret = -ENOMEM;
03ecc32c 1946 context = iommu_context_addr(iommu, bus, devfn, 1);
ba395927 1947 if (!context)
55d94043 1948 goto out_unlock;
ba395927 1949
55d94043 1950 ret = 0;
0c5f6c0d 1951 if (context_present(context) && !context_copied(iommu, bus, devfn))
55d94043 1952 goto out_unlock;
cf484d0e 1953
aec0e861
XP
1954 /*
1955 * For kdump cases, old valid entries may be cached due to
1956 * in-flight DMA and the copied pgtable, but there is no unmapping
1957 * behaviour for them, thus we need an explicit cache flush for
1958 * the newly-mapped device. For kdump, at this point, the device
1959 * is supposed to have finished reset at its driver probe stage,
1960 * so no in-flight DMA will exist, and we don't need to worry
1961 * about it hereafter.
1962 */
0c5f6c0d 1963 if (context_copied(iommu, bus, devfn)) {
aec0e861
XP
1964 u16 did_old = context_domain_id(context);
1965
b117e038 1966 if (did_old < cap_ndoms(iommu->cap)) {
aec0e861
XP
1967 iommu->flush.flush_context(iommu, did_old,
1968 (((u16)bus) << 8) | devfn,
1969 DMA_CCMD_MASK_NOBIT,
1970 DMA_CCMD_DEVICE_INVL);
f73a7eee
KA
1971 iommu->flush.flush_iotlb(iommu, did_old, 0, 0,
1972 DMA_TLB_DSI_FLUSH);
1973 }
0c5f6c0d
LB
1974
1975 clear_context_copied(iommu, bus, devfn);
aec0e861
XP
1976 }
1977
de24e553 1978 context_clear_entry(context);
ea6606b0 1979
7373a8cc
LB
1980 if (sm_supported(iommu)) {
1981 unsigned long pds;
4ed0d3e6 1982
7373a8cc
LB
1983 /* Setup the PASID DIR pointer: */
1984 pds = context_get_sm_pds(table);
1985 context->lo = (u64)virt_to_phys(table->table) |
1986 context_pdts(pds);
1987
1988 /* Setup the RID_PASID field: */
42987801 1989 context_set_sm_rid2pasid(context, IOMMU_NO_PASID);
de24e553 1990
de24e553 1991 /*
7373a8cc
LB
1992 * Setup the Device-TLB enable bit and Page request
1993 * Enable bit:
de24e553 1994 */
7373a8cc
LB
1995 if (info && info->ats_supported)
1996 context_set_sm_dte(context);
1997 if (info && info->pri_supported)
1998 context_set_sm_pre(context);
0faa19a1
LB
1999 if (info && info->pasid_supported)
2000 context_set_pasid(context);
7373a8cc
LB
2001 } else {
2002 struct dma_pte *pgd = domain->pgd;
2003 int agaw;
2004
2005 context_set_domain_id(context, did);
7373a8cc
LB
2006
2007 if (translation != CONTEXT_TT_PASS_THROUGH) {
2008 /*
2009 * Skip top levels of page tables for iommu which has
2010 * less agaw than default. Unnecessary for PT mode.
2011 */
2012 for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) {
2013 ret = -ENOMEM;
2014 pgd = phys_to_virt(dma_pte_addr(pgd));
2015 if (!dma_pte_present(pgd))
2016 goto out_unlock;
2017 }
2018
7373a8cc
LB
2019 if (info && info->ats_supported)
2020 translation = CONTEXT_TT_DEV_IOTLB;
2021 else
2022 translation = CONTEXT_TT_MULTI_LEVEL;
2023
2024 context_set_address_root(context, virt_to_phys(pgd));
2025 context_set_address_width(context, agaw);
2026 } else {
2027 /*
2028 * In pass-through mode, AW must be programmed to
2029 * indicate the largest AGAW value supported by
2030 * hardware, and ASR is ignored by hardware.
2031 */
2032 context_set_address_width(context, iommu->msagaw);
2033 }
41b80db2
LB
2034
2035 context_set_translation_type(context, translation);
93a23a72 2036 }
4ed0d3e6 2037
c07e7d21
MM
2038 context_set_fault_enable(context);
2039 context_set_present(context);
04c00956
LB
2040 if (!ecap_coherent(iommu->ecap))
2041 clflush_cache_range(context, sizeof(*context));
ba395927 2042
4c25a2c1
DW
2043 /*
2044 * It's a non-present to present mapping. If hardware doesn't cache
2045 * non-present entries we only need to flush the write-buffer. If it
2046 * _does_ cache non-present entries, then it does so in the special
2047 * domain #0, which we have to flush:
2048 */
2049 if (cap_caching_mode(iommu->cap)) {
2050 iommu->flush.flush_context(iommu, 0,
2051 (((u16)bus) << 8) | devfn,
2052 DMA_CCMD_MASK_NOBIT,
2053 DMA_CCMD_DEVICE_INVL);
c6c2cebd 2054 iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH);
4c25a2c1 2055 } else {
ba395927 2056 iommu_flush_write_buffer(iommu);
4c25a2c1 2057 }
c7151a8d 2058
55d94043
JR
2059 ret = 0;
2060
2061out_unlock:
2062 spin_unlock(&iommu->lock);
fb170fb4 2063
5c365d18 2064 return ret;
ba395927
KA
2065}
2066
0ce4a85f
LB
2067struct domain_context_mapping_data {
2068 struct dmar_domain *domain;
2069 struct intel_iommu *iommu;
2070 struct pasid_table *table;
2071};
2072
2073static int domain_context_mapping_cb(struct pci_dev *pdev,
2074 u16 alias, void *opaque)
2075{
2076 struct domain_context_mapping_data *data = opaque;
2077
2078 return domain_context_mapping_one(data->domain, data->iommu,
2079 data->table, PCI_BUS_NUM(alias),
2080 alias & 0xff);
2081}
2082
ba395927 2083static int
28ccce0d 2084domain_context_mapping(struct dmar_domain *domain, struct device *dev)
ba395927 2085{
0ce4a85f 2086 struct domain_context_mapping_data data;
ca6e322d 2087 struct pasid_table *table;
64ae892b 2088 struct intel_iommu *iommu;
156baca8 2089 u8 bus, devfn;
64ae892b 2090
e1f167f3 2091 iommu = device_to_iommu(dev, &bus, &devfn);
64ae892b
DW
2092 if (!iommu)
2093 return -ENODEV;
ba395927 2094
ca6e322d 2095 table = intel_pasid_get_table(dev);
0ce4a85f
LB
2096
2097 if (!dev_is_pci(dev))
2098 return domain_context_mapping_one(domain, iommu, table,
2099 bus, devfn);
2100
2101 data.domain = domain;
2102 data.iommu = iommu;
2103 data.table = table;
2104
2105 return pci_for_each_dma_alias(to_pci_dev(dev),
2106 &domain_context_mapping_cb, &data);
579305f7
AW
2107}
2108
f532959b
FY
2109/* Returns a number of VTD pages, but aligned to MM page size */
2110static inline unsigned long aligned_nrpages(unsigned long host_addr,
2111 size_t size)
2112{
2113 host_addr &= ~PAGE_MASK;
2114 return PAGE_ALIGN(host_addr + size) >> VTD_PAGE_SHIFT;
2115}
2116
6dd9a7c7
YS
2117/* Return largest possible superpage level for a given mapping */
2118static inline int hardware_largepage_caps(struct dmar_domain *domain,
2119 unsigned long iov_pfn,
2120 unsigned long phy_pfn,
2121 unsigned long pages)
2122{
2123 int support, level = 1;
2124 unsigned long pfnmerge;
2125
2126 support = domain->iommu_superpage;
2127
2128 /* To use a large page, the virtual *and* physical addresses
2129 must be aligned to 2MiB/1GiB/etc. Lower bits set in either
2130 of them will mean we have to use smaller pages. So just
2131 merge them and check both at once. */
2132 pfnmerge = iov_pfn | phy_pfn;
2133
2134 while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) {
2135 pages >>= VTD_STRIDE_SHIFT;
2136 if (!pages)
2137 break;
2138 pfnmerge >>= VTD_STRIDE_SHIFT;
2139 level++;
2140 support--;
2141 }
2142 return level;
2143}
2144
38c527ae
LM
2145/*
2146 * Ensure that old small page tables are removed to make room for superpage(s).
2147 * We're going to add new large pages, so make sure we don't remove their parent
2148 * tables. The IOTLB/devTLBs should be flushed if any PDE/PTEs are cleared.
2149 */
2150static void switch_to_super_page(struct dmar_domain *domain,
2151 unsigned long start_pfn,
2152 unsigned long end_pfn, int level)
2153{
2154 unsigned long lvl_pages = lvl_to_nr_pages(level);
ba949f4c 2155 struct iommu_domain_info *info;
38c527ae 2156 struct dma_pte *pte = NULL;
ba949f4c 2157 unsigned long i;
38c527ae
LM
2158
2159 while (start_pfn <= end_pfn) {
2160 if (!pte)
2d4d7676
JG
2161 pte = pfn_to_dma_pte(domain, start_pfn, &level,
2162 GFP_ATOMIC);
38c527ae
LM
2163
2164 if (dma_pte_present(pte)) {
2165 dma_pte_free_pagetable(domain, start_pfn,
2166 start_pfn + lvl_pages - 1,
2167 level + 1);
2168
ba949f4c
LB
2169 xa_for_each(&domain->iommu_array, i, info)
2170 iommu_flush_iotlb_psi(info->iommu, domain,
38c527ae
LM
2171 start_pfn, lvl_pages,
2172 0, 0);
2173 }
2174
2175 pte++;
2176 start_pfn += lvl_pages;
2177 if (first_pte_in_page(pte))
2178 pte = NULL;
2179 }
2180}
2181
58a8bb39
LB
2182static int
2183__domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2d4d7676
JG
2184 unsigned long phys_pfn, unsigned long nr_pages, int prot,
2185 gfp_t gfp)
e1605495 2186{
75cc1018 2187 struct dma_pte *first_pte = NULL, *pte = NULL;
6dd9a7c7
YS
2188 unsigned int largepage_lvl = 0;
2189 unsigned long lvl_pages = 0;
58a8bb39 2190 phys_addr_t pteval;
ddf09b6d 2191 u64 attr;
e1605495 2192
cbf2f9e8
TZ
2193 if (unlikely(!domain_pfn_supported(domain, iov_pfn + nr_pages - 1)))
2194 return -EINVAL;
e1605495
DW
2195
2196 if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
2197 return -EINVAL;
2198
ddf09b6d 2199 attr = prot & (DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP);
eea53c58 2200 attr |= DMA_FL_PTE_PRESENT;
e5b0feb4 2201 if (domain->use_first_level) {
289b3b00
LB
2202 attr |= DMA_FL_PTE_XD | DMA_FL_PTE_US | DMA_FL_PTE_ACCESS;
2203 if (prot & DMA_PTE_WRITE)
2204 attr |= DMA_FL_PTE_DIRTY;
a8ce9ebb
LB
2205 }
2206
58a8bb39 2207 pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | attr;
9051aa02 2208
6dd9a7c7 2209 while (nr_pages > 0) {
c85994e4
DW
2210 uint64_t tmp;
2211
e1605495 2212 if (!pte) {
58a8bb39
LB
2213 largepage_lvl = hardware_largepage_caps(domain, iov_pfn,
2214 phys_pfn, nr_pages);
6dd9a7c7 2215
2d4d7676
JG
2216 pte = pfn_to_dma_pte(domain, iov_pfn, &largepage_lvl,
2217 gfp);
e1605495
DW
2218 if (!pte)
2219 return -ENOMEM;
75cc1018
LB
2220 first_pte = pte;
2221
9906b935
LM
2222 lvl_pages = lvl_to_nr_pages(largepage_lvl);
2223
6dd9a7c7 2224 /* It is large page*/
6491d4d0 2225 if (largepage_lvl > 1) {
38c527ae 2226 unsigned long end_pfn;
9906b935 2227 unsigned long pages_to_remove;
ba2374fd 2228
6dd9a7c7 2229 pteval |= DMA_PTE_LARGE_PAGE;
9906b935
LM
2230 pages_to_remove = min_t(unsigned long, nr_pages,
2231 nr_pte_to_next_page(pte) * lvl_pages);
2232 end_pfn = iov_pfn + pages_to_remove - 1;
38c527ae 2233 switch_to_super_page(domain, iov_pfn, end_pfn, largepage_lvl);
6491d4d0 2234 } else {
6dd9a7c7 2235 pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE;
6491d4d0 2236 }
6dd9a7c7 2237
e1605495
DW
2238 }
2239 /* We don't need a lock here; nobody else
2240 * touches the IOVA range
2241 */
7766a3fb 2242 tmp = cmpxchg64_local(&pte->val, 0ULL, pteval);
c85994e4 2243 if (tmp) {
1bf20f0d 2244 static int dumps = 5;
9f10e5bf
JR
2245 pr_crit("ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n",
2246 iov_pfn, tmp, (unsigned long long)pteval);
1bf20f0d
DW
2247 if (dumps) {
2248 dumps--;
2249 debug_dma_dump_mappings(NULL);
2250 }
2251 WARN_ON(1);
2252 }
6dd9a7c7 2253
6dd9a7c7
YS
2254 nr_pages -= lvl_pages;
2255 iov_pfn += lvl_pages;
2256 phys_pfn += lvl_pages;
2257 pteval += lvl_pages * VTD_PAGE_SIZE;
6dd9a7c7
YS
2258
2259 /* If the next PTE would be the first in a new page, then we
58a8bb39
LB
2260 * need to flush the cache on the entries we've just written.
2261 * And then we'll need to recalculate 'pte', so clear it and
2262 * let it get set again in the if (!pte) block above.
2263 *
2264 * If we're done (!nr_pages) we need to flush the cache too.
2265 *
2266 * Also if we've been setting superpages, we may need to
2267 * recalculate 'pte' and switch back to smaller pages for the
2268 * end of the mapping, if the trailing size is not enough to
2269 * use another superpage (i.e. nr_pages < lvl_pages).
2270 */
e1605495 2271 pte++;
6dd9a7c7 2272 if (!nr_pages || first_pte_in_page(pte) ||
75cc1018
LB
2273 (largepage_lvl > 1 && nr_pages < lvl_pages)) {
2274 domain_flush_cache(domain, first_pte,
2275 (void *)pte - (void *)first_pte);
e1605495 2276 pte = NULL;
75cc1018 2277 }
095303e0
LB
2278 }
2279
2280 return 0;
87684fd9
PX
2281}
2282
37764b95 2283static void domain_context_clear_one(struct device_domain_info *info, u8 bus, u8 devfn)
ba395927 2284{
37764b95 2285 struct intel_iommu *iommu = info->iommu;
5082219b
FS
2286 struct context_entry *context;
2287 u16 did_old;
2288
c7151a8d
WH
2289 if (!iommu)
2290 return;
8c11e798 2291
ffd5869d 2292 spin_lock(&iommu->lock);
5082219b
FS
2293 context = iommu_context_addr(iommu, bus, devfn, 0);
2294 if (!context) {
ffd5869d 2295 spin_unlock(&iommu->lock);
5082219b
FS
2296 return;
2297 }
37764b95
SK
2298
2299 if (sm_supported(iommu)) {
2300 if (hw_pass_through && domain_type_is_si(info->domain))
2301 did_old = FLPT_DEFAULT_DID;
2302 else
ba949f4c 2303 did_old = domain_id_iommu(info->domain, iommu);
37764b95
SK
2304 } else {
2305 did_old = context_domain_id(context);
2306 }
2307
5082219b
FS
2308 context_clear_entry(context);
2309 __iommu_flush_cache(iommu, context, sizeof(*context));
ffd5869d 2310 spin_unlock(&iommu->lock);
5082219b
FS
2311 iommu->flush.flush_context(iommu,
2312 did_old,
2313 (((u16)bus) << 8) | devfn,
2314 DMA_CCMD_MASK_NOBIT,
2315 DMA_CCMD_DEVICE_INVL);
c0474a60
LB
2316
2317 if (sm_supported(iommu))
2318 qi_flush_pasid_cache(iommu, did_old, QI_PC_ALL_PASIDS, 0);
2319
5082219b
FS
2320 iommu->flush.flush_iotlb(iommu,
2321 did_old,
2322 0,
2323 0,
2324 DMA_TLB_DSI_FLUSH);
37764b95
SK
2325
2326 __iommu_flush_dev_iotlb(info, 0, MAX_AGAW_PFN_WIDTH);
ba395927
KA
2327}
2328
ddf09b6d
LB
2329static int domain_setup_first_level(struct intel_iommu *iommu,
2330 struct dmar_domain *domain,
2331 struct device *dev,
c7b6bac9 2332 u32 pasid)
ddf09b6d 2333{
ddf09b6d
LB
2334 struct dma_pte *pgd = domain->pgd;
2335 int agaw, level;
54c80d90 2336 int flags = 0;
ddf09b6d
LB
2337
2338 /*
2339 * Skip top levels of page tables for iommu which has
2340 * less agaw than default. Unnecessary for PT mode.
2341 */
2342 for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) {
2343 pgd = phys_to_virt(dma_pte_addr(pgd));
2344 if (!dma_pte_present(pgd))
2345 return -ENOMEM;
2346 }
2347
2348 level = agaw_to_level(agaw);
2349 if (level != 4 && level != 5)
2350 return -EINVAL;
2351
54c80d90
LB
2352 if (level == 5)
2353 flags |= PASID_FLAG_FL5LP;
ddf09b6d 2354
fc0051cb 2355 if (domain->force_snooping)
6c00612d
LB
2356 flags |= PASID_FLAG_PAGE_SNOOP;
2357
ddf09b6d 2358 return intel_pasid_setup_first_level(iommu, dev, (pgd_t *)pgd, pasid,
ba949f4c 2359 domain_id_iommu(domain, iommu),
ddf09b6d
LB
2360 flags);
2361}
2362
8038bdb8
JD
2363static bool dev_is_real_dma_subdevice(struct device *dev)
2364{
2365 return dev && dev_is_pci(dev) &&
2366 pci_real_dma_dev(to_pci_dev(dev)) != to_pci_dev(dev);
2367}
2368
b213203e 2369static int iommu_domain_identity_map(struct dmar_domain *domain,
e70b081c
TM
2370 unsigned long first_vpfn,
2371 unsigned long last_vpfn)
ba395927 2372{
ba395927
KA
2373 /*
2374 * The RMRR range might overlap with the physical memory range,
2375 * clear it first
2376 */
c5395d5c 2377 dma_pte_clear_range(domain, first_vpfn, last_vpfn);
ba395927 2378
58a8bb39 2379 return __domain_mapping(domain, first_vpfn,
87684fd9 2380 first_vpfn, last_vpfn - first_vpfn + 1,
4951eb26 2381 DMA_PTE_READ|DMA_PTE_WRITE, GFP_KERNEL);
b213203e
DW
2382}
2383
301e7ee1
JR
2384static int md_domain_init(struct dmar_domain *domain, int guest_width);
2385
071e1374 2386static int __init si_domain_init(int hw)
2c2e2c38 2387{
4de354ec
LB
2388 struct dmar_rmrr_unit *rmrr;
2389 struct device *dev;
2390 int i, nid, ret;
2c2e2c38 2391
b34380a6 2392 si_domain = alloc_domain(IOMMU_DOMAIN_IDENTITY);
2c2e2c38
FY
2393 if (!si_domain)
2394 return -EFAULT;
2395
301e7ee1 2396 if (md_domain_init(si_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
2c2e2c38 2397 domain_exit(si_domain);
620bf9f9 2398 si_domain = NULL;
2c2e2c38
FY
2399 return -EFAULT;
2400 }
2401
19943b0e
DW
2402 if (hw)
2403 return 0;
2404
c7ab48d2 2405 for_each_online_node(nid) {
5dfe8660
TH
2406 unsigned long start_pfn, end_pfn;
2407 int i;
2408
2409 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
2410 ret = iommu_domain_identity_map(si_domain,
fb5f50a4
YX
2411 mm_to_dma_pfn_start(start_pfn),
2412 mm_to_dma_pfn_end(end_pfn));
5dfe8660
TH
2413 if (ret)
2414 return ret;
2415 }
c7ab48d2
DW
2416 }
2417
4de354ec 2418 /*
9235cb13
LB
2419 * Identity map the RMRRs so that devices with RMRRs can also use
2420 * the si_domain.
4de354ec
LB
2421 */
2422 for_each_rmrr_units(rmrr) {
2423 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
2424 i, dev) {
2425 unsigned long long start = rmrr->base_address;
2426 unsigned long long end = rmrr->end_address;
2427
4de354ec
LB
2428 if (WARN_ON(end < start ||
2429 end >> agaw_to_width(si_domain->agaw)))
2430 continue;
2431
48f0bcfb 2432 ret = iommu_domain_identity_map(si_domain,
fb5f50a4
YX
2433 mm_to_dma_pfn_start(start >> PAGE_SHIFT),
2434 mm_to_dma_pfn_end(end >> PAGE_SHIFT));
4de354ec
LB
2435 if (ret)
2436 return ret;
2437 }
2438 }
2439
2c2e2c38
FY
2440 return 0;
2441}
2442
a8204479
LB
2443static int dmar_domain_attach_device(struct dmar_domain *domain,
2444 struct device *dev)
2c2e2c38 2445{
bac4e778 2446 struct device_domain_info *info = dev_iommu_priv_get(dev);
5a8f40e8 2447 struct intel_iommu *iommu;
a349ffcb 2448 unsigned long flags;
156baca8 2449 u8 bus, devfn;
bac4e778 2450 int ret;
2c2e2c38 2451
5913c9bf 2452 iommu = device_to_iommu(dev, &bus, &devfn);
5a8f40e8
DW
2453 if (!iommu)
2454 return -ENODEV;
2455
bac4e778 2456 ret = domain_attach_iommu(domain, iommu);
969aaefb 2457 if (ret)
bac4e778 2458 return ret;
969aaefb 2459 info->domain = domain;
a349ffcb 2460 spin_lock_irqsave(&domain->lock, flags);
bac4e778 2461 list_add(&info->link, &domain->devices);
a349ffcb 2462 spin_unlock_irqrestore(&domain->lock, flags);
bac4e778
LB
2463
2464 /* PASID table is mandatory for a PCI device in scalable mode. */
2465 if (sm_supported(iommu) && !dev_is_real_dma_subdevice(dev)) {
bac4e778 2466 /* Setup the PASID entry for requests without PASID: */
bac4e778
LB
2467 if (hw_pass_through && domain_type_is_si(domain))
2468 ret = intel_pasid_setup_pass_through(iommu, domain,
42987801 2469 dev, IOMMU_NO_PASID);
e5b0feb4 2470 else if (domain->use_first_level)
bac4e778 2471 ret = domain_setup_first_level(iommu, domain, dev,
42987801 2472 IOMMU_NO_PASID);
bac4e778
LB
2473 else
2474 ret = intel_pasid_setup_second_level(iommu, domain,
42987801 2475 dev, IOMMU_NO_PASID);
bac4e778
LB
2476 if (ret) {
2477 dev_err(dev, "Setup RID2PASID failed\n");
c7be17c2 2478 device_block_translation(dev);
bac4e778
LB
2479 return ret;
2480 }
2481 }
2482
2483 ret = domain_context_mapping(domain, dev);
2484 if (ret) {
2485 dev_err(dev, "Domain context map failed\n");
c7be17c2 2486 device_block_translation(dev);
bac4e778
LB
2487 return ret;
2488 }
2c2e2c38 2489
c7be17c2
LB
2490 iommu_enable_pci_caps(info);
2491
2c2e2c38
FY
2492 return 0;
2493}
2494
1c5c59fb
EA
2495/**
2496 * device_rmrr_is_relaxable - Test whether the RMRR of this device
2497 * is relaxable (ie. is allowed to be not enforced under some conditions)
2498 * @dev: device handle
2499 *
2500 * We assume that PCI USB devices with RMRRs have them largely
2501 * for historical reasons and that the RMRR space is not actively used post
2502 * boot. This exclusion may change if vendors begin to abuse it.
2503 *
2504 * The same exception is made for graphics devices, with the requirement that
2505 * any use of the RMRR regions will be torn down before assigning the device
2506 * to a guest.
2507 *
2508 * Return: true if the RMRR is relaxable, false otherwise
2509 */
2510static bool device_rmrr_is_relaxable(struct device *dev)
2511{
2512 struct pci_dev *pdev;
2513
2514 if (!dev_is_pci(dev))
2515 return false;
2516
2517 pdev = to_pci_dev(dev);
2518 if (IS_USB_DEVICE(pdev) || IS_GFX_DEVICE(pdev))
2519 return true;
2520 else
2521 return false;
2522}
2523
f273a453
LB
2524/*
2525 * Return the required default domain type for a specific device.
2526 *
2527 * @dev: the device in query
2528 * @startup: true if this is during early boot
2529 *
2530 * Returns:
2531 * - IOMMU_DOMAIN_DMA: device requires a dynamic mapping domain
2532 * - IOMMU_DOMAIN_IDENTITY: device requires an identical mapping domain
2533 * - 0: both identity and dynamic domains work for this device
2534 */
0e31a726 2535static int device_def_domain_type(struct device *dev)
6941af28 2536{
3bdb2591
DW
2537 if (dev_is_pci(dev)) {
2538 struct pci_dev *pdev = to_pci_dev(dev);
ea2447f7 2539
3bdb2591 2540 if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev))
f273a453 2541 return IOMMU_DOMAIN_IDENTITY;
e0fc7e0b 2542
3bdb2591 2543 if ((iommu_identity_mapping & IDENTMAP_GFX) && IS_GFX_DEVICE(pdev))
f273a453 2544 return IOMMU_DOMAIN_IDENTITY;
3bdb2591 2545 }
3dfc813d 2546
b89b6605 2547 return 0;
f273a453
LB
2548}
2549
ffebeb46
JL
2550static void intel_iommu_init_qi(struct intel_iommu *iommu)
2551{
2552 /*
2553 * Start from a sane IOMMU hardware state.
2554 * If queued invalidation was already initialized by us
2555 * (for example, while enabling interrupt remapping) then
2556 * things are already rolling from a sane state.
2557 */
2558 if (!iommu->qi) {
2559 /*
2560 * Clear any previous faults.
2561 */
2562 dmar_fault(-1, iommu);
2563 /*
2564 * Disable queued invalidation if supported and already enabled
2565 * before OS handover.
2566 */
2567 dmar_disable_qi(iommu);
2568 }
2569
2570 if (dmar_enable_qi(iommu)) {
2571 /*
2572 * Queued invalidation is not enabled, use register-based invalidation
2573 */
2574 iommu->flush.flush_context = __iommu_flush_context;
2575 iommu->flush.flush_iotlb = __iommu_flush_iotlb;
9f10e5bf 2576 pr_info("%s: Using Register based invalidation\n",
ffebeb46
JL
2577 iommu->name);
2578 } else {
2579 iommu->flush.flush_context = qi_flush_context;
2580 iommu->flush.flush_iotlb = qi_flush_iotlb;
9f10e5bf 2581 pr_info("%s: Using Queued invalidation\n", iommu->name);
ffebeb46
JL
2582 }
2583}
2584
091d42e4 2585static int copy_context_table(struct intel_iommu *iommu,
dfddb969 2586 struct root_entry *old_re,
091d42e4
JR
2587 struct context_entry **tbl,
2588 int bus, bool ext)
2589{
dbcd861f 2590 int tbl_idx, pos = 0, idx, devfn, ret = 0, did;
543c8dcf 2591 struct context_entry *new_ce = NULL, ce;
dfddb969 2592 struct context_entry *old_ce = NULL;
543c8dcf 2593 struct root_entry re;
091d42e4
JR
2594 phys_addr_t old_ce_phys;
2595
2596 tbl_idx = ext ? bus * 2 : bus;
dfddb969 2597 memcpy(&re, old_re, sizeof(re));
091d42e4
JR
2598
2599 for (devfn = 0; devfn < 256; devfn++) {
2600 /* First calculate the correct index */
2601 idx = (ext ? devfn * 2 : devfn) % 256;
2602
2603 if (idx == 0) {
2604 /* First save what we may have and clean up */
2605 if (new_ce) {
2606 tbl[tbl_idx] = new_ce;
2607 __iommu_flush_cache(iommu, new_ce,
2608 VTD_PAGE_SIZE);
2609 pos = 1;
2610 }
2611
2612 if (old_ce)
829383e1 2613 memunmap(old_ce);
091d42e4
JR
2614
2615 ret = 0;
2616 if (devfn < 0x80)
543c8dcf 2617 old_ce_phys = root_entry_lctp(&re);
091d42e4 2618 else
543c8dcf 2619 old_ce_phys = root_entry_uctp(&re);
091d42e4
JR
2620
2621 if (!old_ce_phys) {
2622 if (ext && devfn == 0) {
2623 /* No LCTP, try UCTP */
2624 devfn = 0x7f;
2625 continue;
2626 } else {
2627 goto out;
2628 }
2629 }
2630
2631 ret = -ENOMEM;
dfddb969
DW
2632 old_ce = memremap(old_ce_phys, PAGE_SIZE,
2633 MEMREMAP_WB);
091d42e4
JR
2634 if (!old_ce)
2635 goto out;
2636
4951eb26 2637 new_ce = alloc_pgtable_page(iommu->node, GFP_KERNEL);
091d42e4
JR
2638 if (!new_ce)
2639 goto out_unmap;
2640
2641 ret = 0;
2642 }
2643
2644 /* Now copy the context entry */
dfddb969 2645 memcpy(&ce, old_ce + idx, sizeof(ce));
091d42e4 2646
0c5f6c0d 2647 if (!context_present(&ce))
091d42e4
JR
2648 continue;
2649
dbcd861f
JR
2650 did = context_domain_id(&ce);
2651 if (did >= 0 && did < cap_ndoms(iommu->cap))
2652 set_bit(did, iommu->domain_ids);
2653
0c5f6c0d 2654 set_context_copied(iommu, bus, devfn);
091d42e4
JR
2655 new_ce[idx] = ce;
2656 }
2657
2658 tbl[tbl_idx + pos] = new_ce;
2659
2660 __iommu_flush_cache(iommu, new_ce, VTD_PAGE_SIZE);
2661
2662out_unmap:
dfddb969 2663 memunmap(old_ce);
091d42e4
JR
2664
2665out:
2666 return ret;
2667}
2668
2669static int copy_translation_tables(struct intel_iommu *iommu)
2670{
2671 struct context_entry **ctxt_tbls;
dfddb969 2672 struct root_entry *old_rt;
091d42e4
JR
2673 phys_addr_t old_rt_phys;
2674 int ctxt_table_entries;
091d42e4
JR
2675 u64 rtaddr_reg;
2676 int bus, ret;
c3361f2f 2677 bool new_ext, ext;
091d42e4
JR
2678
2679 rtaddr_reg = dmar_readq(iommu->reg + DMAR_RTADDR_REG);
0c5f6c0d
LB
2680 ext = !!(rtaddr_reg & DMA_RTADDR_SMT);
2681 new_ext = !!sm_supported(iommu);
c3361f2f
JR
2682
2683 /*
2684 * The RTT bit can only be changed when translation is disabled,
2685 * but disabling translation means to open a window for data
2686 * corruption. So bail out and don't copy anything if we would
2687 * have to change the bit.
2688 */
2689 if (new_ext != ext)
2690 return -EINVAL;
091d42e4 2691
0c5f6c0d
LB
2692 iommu->copied_tables = bitmap_zalloc(BIT_ULL(16), GFP_KERNEL);
2693 if (!iommu->copied_tables)
2694 return -ENOMEM;
2695
091d42e4
JR
2696 old_rt_phys = rtaddr_reg & VTD_PAGE_MASK;
2697 if (!old_rt_phys)
2698 return -EINVAL;
2699
dfddb969 2700 old_rt = memremap(old_rt_phys, PAGE_SIZE, MEMREMAP_WB);
091d42e4
JR
2701 if (!old_rt)
2702 return -ENOMEM;
2703
2704 /* This is too big for the stack - allocate it from slab */
2705 ctxt_table_entries = ext ? 512 : 256;
2706 ret = -ENOMEM;
6396bb22 2707 ctxt_tbls = kcalloc(ctxt_table_entries, sizeof(void *), GFP_KERNEL);
091d42e4
JR
2708 if (!ctxt_tbls)
2709 goto out_unmap;
2710
2711 for (bus = 0; bus < 256; bus++) {
2712 ret = copy_context_table(iommu, &old_rt[bus],
2713 ctxt_tbls, bus, ext);
2714 if (ret) {
2715 pr_err("%s: Failed to copy context table for bus %d\n",
2716 iommu->name, bus);
2717 continue;
2718 }
2719 }
2720
ffd5869d 2721 spin_lock(&iommu->lock);
091d42e4
JR
2722
2723 /* Context tables are copied, now write them to the root_entry table */
2724 for (bus = 0; bus < 256; bus++) {
2725 int idx = ext ? bus * 2 : bus;
2726 u64 val;
2727
2728 if (ctxt_tbls[idx]) {
2729 val = virt_to_phys(ctxt_tbls[idx]) | 1;
2730 iommu->root_entry[bus].lo = val;
2731 }
2732
2733 if (!ext || !ctxt_tbls[idx + 1])
2734 continue;
2735
2736 val = virt_to_phys(ctxt_tbls[idx + 1]) | 1;
2737 iommu->root_entry[bus].hi = val;
2738 }
2739
ffd5869d 2740 spin_unlock(&iommu->lock);
091d42e4
JR
2741
2742 kfree(ctxt_tbls);
2743
2744 __iommu_flush_cache(iommu, iommu->root_entry, PAGE_SIZE);
2745
2746 ret = 0;
2747
2748out_unmap:
dfddb969 2749 memunmap(old_rt);
091d42e4
JR
2750
2751 return ret;
2752}
2753
b779260b 2754static int __init init_dmars(void)
ba395927
KA
2755{
2756 struct dmar_drhd_unit *drhd;
ba395927 2757 struct intel_iommu *iommu;
df4f3c60 2758 int ret;
2c2e2c38 2759
ad3d1902
KMP
2760 ret = intel_cap_audit(CAP_AUDIT_STATIC_DMAR, NULL);
2761 if (ret)
2762 goto free_iommu;
2763
6a8c6748
LB
2764 for_each_iommu(iommu, drhd) {
2765 if (drhd->ignored) {
2766 iommu_disable_translation(iommu);
2767 continue;
2768 }
2769
56283174
LB
2770 /*
2771 * Find the max PASID size of all IOMMUs in the system.
2772 * We need to ensure the system PASID table is no bigger
2773 * than the smallest supported.
2774 */
765b6a98 2775 if (pasid_supported(iommu)) {
56283174
LB
2776 u32 temp = 2 << ecap_pss(iommu->ecap);
2777
2778 intel_pasid_max_id = min_t(u32, temp,
2779 intel_pasid_max_id);
2780 }
2781
b63d80d1
JR
2782 intel_iommu_init_qi(iommu);
2783
e61d98d8
SS
2784 ret = iommu_init_domains(iommu);
2785 if (ret)
989d51fc 2786 goto free_iommu;
e61d98d8 2787
4158c2ec
JR
2788 init_translation_status(iommu);
2789
091d42e4
JR
2790 if (translation_pre_enabled(iommu) && !is_kdump_kernel()) {
2791 iommu_disable_translation(iommu);
2792 clear_translation_pre_enabled(iommu);
2793 pr_warn("Translation was enabled for %s but we are not in kdump mode\n",
2794 iommu->name);
2795 }
4158c2ec 2796
ba395927
KA
2797 /*
2798 * TBD:
2799 * we could share the same root & context tables
25985edc 2800 * among all IOMMUs. Need to split it later.
ba395927
KA
2801 */
2802 ret = iommu_alloc_root_entry(iommu);
ffebeb46 2803 if (ret)
989d51fc 2804 goto free_iommu;
5f0a7f76 2805
091d42e4
JR
2806 if (translation_pre_enabled(iommu)) {
2807 pr_info("Translation already enabled - trying to copy translation structures\n");
2808
2809 ret = copy_translation_tables(iommu);
2810 if (ret) {
2811 /*
2812 * We found the IOMMU with translation
2813 * enabled - but failed to copy over the
2814 * old root-entry table. Try to proceed
2815 * by disabling translation now and
2816 * allocating a clean root-entry table.
2817 * This might cause DMAR faults, but
2818 * probably the dump will still succeed.
2819 */
2820 pr_err("Failed to copy translation tables from previous kernel for %s\n",
2821 iommu->name);
2822 iommu_disable_translation(iommu);
2823 clear_translation_pre_enabled(iommu);
2824 } else {
2825 pr_info("Copied translation tables from previous kernel for %s\n",
2826 iommu->name);
2827 }
2828 }
2829
4ed0d3e6 2830 if (!ecap_pass_through(iommu->ecap))
19943b0e 2831 hw_pass_through = 0;
ff3dc652 2832 intel_svm_check(iommu);
ba395927
KA
2833 }
2834
a4c34ff1
JR
2835 /*
2836 * Now that qi is enabled on all iommus, set the root entry and flush
2837 * caches. This is required on some Intel X58 chipsets, otherwise the
2838 * flush_context function will loop forever and the boot hangs.
2839 */
2840 for_each_active_iommu(iommu, drhd) {
2841 iommu_flush_write_buffer(iommu);
2842 iommu_set_root_entry(iommu);
a4c34ff1
JR
2843 }
2844
d3f13810 2845#ifdef CONFIG_INTEL_IOMMU_BROKEN_GFX_WA
5daab580 2846 dmar_map_gfx = 0;
19943b0e 2847#endif
e0fc7e0b 2848
5daab580
LB
2849 if (!dmar_map_gfx)
2850 iommu_identity_mapping |= IDENTMAP_GFX;
2851
21e722c4
AR
2852 check_tylersburg_isoch();
2853
4de354ec
LB
2854 ret = si_domain_init(hw_pass_through);
2855 if (ret)
2856 goto free_iommu;
86080ccc 2857
ba395927
KA
2858 /*
2859 * for each drhd
2860 * enable fault log
2861 * global invalidate context cache
2862 * global invalidate iotlb
2863 * enable translation
2864 */
7c919779 2865 for_each_iommu(iommu, drhd) {
51a63e67
JC
2866 if (drhd->ignored) {
2867 /*
2868 * we always have to disable PMRs or DMA may fail on
2869 * this device
2870 */
2871 if (force_on)
7c919779 2872 iommu_disable_protect_mem_regions(iommu);
ba395927 2873 continue;
51a63e67 2874 }
ba395927
KA
2875
2876 iommu_flush_write_buffer(iommu);
2877
a222a7f0 2878#ifdef CONFIG_INTEL_IOMMU_SVM
765b6a98 2879 if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
a7755c3c
LB
2880 /*
2881 * Calling dmar_alloc_hwirq() with dmar_global_lock held
2882 * could cause a lock race condition.
2883 */
2884 up_write(&dmar_global_lock);
a222a7f0 2885 ret = intel_svm_enable_prq(iommu);
a7755c3c 2886 down_write(&dmar_global_lock);
a222a7f0
DW
2887 if (ret)
2888 goto free_iommu;
2889 }
2890#endif
3460a6d9
KA
2891 ret = dmar_set_interrupt(iommu);
2892 if (ret)
989d51fc 2893 goto free_iommu;
ba395927
KA
2894 }
2895
2896 return 0;
989d51fc
JL
2897
2898free_iommu:
ffebeb46
JL
2899 for_each_active_iommu(iommu, drhd) {
2900 disable_dmar_iommu(iommu);
a868e6b7 2901 free_dmar_iommu(iommu);
ffebeb46 2902 }
620bf9f9
JS
2903 if (si_domain) {
2904 domain_exit(si_domain);
2905 si_domain = NULL;
2906 }
13cf0174 2907
ba395927
KA
2908 return ret;
2909}
2910
ba395927
KA
2911static void __init init_no_remapping_devices(void)
2912{
2913 struct dmar_drhd_unit *drhd;
832bd858 2914 struct device *dev;
b683b230 2915 int i;
ba395927
KA
2916
2917 for_each_drhd_unit(drhd) {
2918 if (!drhd->include_all) {
b683b230
JL
2919 for_each_active_dev_scope(drhd->devices,
2920 drhd->devices_cnt, i, dev)
2921 break;
832bd858 2922 /* ignore DMAR unit if no devices exist */
ba395927
KA
2923 if (i == drhd->devices_cnt)
2924 drhd->ignored = 1;
2925 }
2926 }
2927
7c919779 2928 for_each_active_drhd_unit(drhd) {
7c919779 2929 if (drhd->include_all)
ba395927
KA
2930 continue;
2931
b683b230
JL
2932 for_each_active_dev_scope(drhd->devices,
2933 drhd->devices_cnt, i, dev)
832bd858 2934 if (!dev_is_pci(dev) || !IS_GFX_DEVICE(to_pci_dev(dev)))
ba395927 2935 break;
ba395927
KA
2936 if (i < drhd->devices_cnt)
2937 continue;
2938
c0771df8
DW
2939 /* This IOMMU has *only* gfx devices. Either bypass it or
2940 set the gfx_dedicated flag, as appropriate */
b1012ca8 2941 drhd->gfx_dedicated = 1;
2d33b7d6 2942 if (!dmar_map_gfx)
c0771df8 2943 drhd->ignored = 1;
ba395927
KA
2944 }
2945}
2946
f59c7b69
FY
2947#ifdef CONFIG_SUSPEND
2948static int init_iommu_hw(void)
2949{
2950 struct dmar_drhd_unit *drhd;
2951 struct intel_iommu *iommu = NULL;
a0e9911a 2952 int ret;
f59c7b69 2953
a0e9911a
YX
2954 for_each_active_iommu(iommu, drhd) {
2955 if (iommu->qi) {
2956 ret = dmar_reenable_qi(iommu);
2957 if (ret)
2958 return ret;
2959 }
2960 }
f59c7b69 2961
b779260b
JC
2962 for_each_iommu(iommu, drhd) {
2963 if (drhd->ignored) {
2964 /*
2965 * we always have to disable PMRs or DMA may fail on
2966 * this device
2967 */
2968 if (force_on)
2969 iommu_disable_protect_mem_regions(iommu);
2970 continue;
2971 }
095303e0 2972
f59c7b69 2973 iommu_flush_write_buffer(iommu);
f59c7b69 2974 iommu_set_root_entry(iommu);
2a41ccee 2975 iommu_enable_translation(iommu);
b94996c9 2976 iommu_disable_protect_mem_regions(iommu);
f59c7b69
FY
2977 }
2978
2979 return 0;
2980}
2981
2982static void iommu_flush_all(void)
2983{
2984 struct dmar_drhd_unit *drhd;
2985 struct intel_iommu *iommu;
2986
2987 for_each_active_iommu(iommu, drhd) {
2988 iommu->flush.flush_context(iommu, 0, 0, 0,
1f0ef2aa 2989 DMA_CCMD_GLOBAL_INVL);
f59c7b69 2990 iommu->flush.flush_iotlb(iommu, 0, 0, 0,
1f0ef2aa 2991 DMA_TLB_GLOBAL_FLUSH);
f59c7b69
FY
2992 }
2993}
2994
134fac3f 2995static int iommu_suspend(void)
f59c7b69
FY
2996{
2997 struct dmar_drhd_unit *drhd;
2998 struct intel_iommu *iommu = NULL;
2999 unsigned long flag;
3000
f59c7b69
FY
3001 iommu_flush_all();
3002
3003 for_each_active_iommu(iommu, drhd) {
3004 iommu_disable_translation(iommu);
3005
1f5b3c3f 3006 raw_spin_lock_irqsave(&iommu->register_lock, flag);
f59c7b69
FY
3007
3008 iommu->iommu_state[SR_DMAR_FECTL_REG] =
3009 readl(iommu->reg + DMAR_FECTL_REG);
3010 iommu->iommu_state[SR_DMAR_FEDATA_REG] =
3011 readl(iommu->reg + DMAR_FEDATA_REG);
3012 iommu->iommu_state[SR_DMAR_FEADDR_REG] =
3013 readl(iommu->reg + DMAR_FEADDR_REG);
3014 iommu->iommu_state[SR_DMAR_FEUADDR_REG] =
3015 readl(iommu->reg + DMAR_FEUADDR_REG);
3016
1f5b3c3f 3017 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
f59c7b69
FY
3018 }
3019 return 0;
f59c7b69
FY
3020}
3021
134fac3f 3022static void iommu_resume(void)
f59c7b69
FY
3023{
3024 struct dmar_drhd_unit *drhd;
3025 struct intel_iommu *iommu = NULL;
3026 unsigned long flag;
3027
3028 if (init_iommu_hw()) {
b779260b
JC
3029 if (force_on)
3030 panic("tboot: IOMMU setup failed, DMAR can not resume!\n");
3031 else
3032 WARN(1, "IOMMU setup failed, DMAR can not resume!\n");
134fac3f 3033 return;
f59c7b69
FY
3034 }
3035
3036 for_each_active_iommu(iommu, drhd) {
3037
1f5b3c3f 3038 raw_spin_lock_irqsave(&iommu->register_lock, flag);
f59c7b69
FY
3039
3040 writel(iommu->iommu_state[SR_DMAR_FECTL_REG],
3041 iommu->reg + DMAR_FECTL_REG);
3042 writel(iommu->iommu_state[SR_DMAR_FEDATA_REG],
3043 iommu->reg + DMAR_FEDATA_REG);
3044 writel(iommu->iommu_state[SR_DMAR_FEADDR_REG],
3045 iommu->reg + DMAR_FEADDR_REG);
3046 writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG],
3047 iommu->reg + DMAR_FEUADDR_REG);
3048
1f5b3c3f 3049 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
f59c7b69 3050 }
f59c7b69
FY
3051}
3052
134fac3f 3053static struct syscore_ops iommu_syscore_ops = {
f59c7b69
FY
3054 .resume = iommu_resume,
3055 .suspend = iommu_suspend,
3056};
3057
134fac3f 3058static void __init init_iommu_pm_ops(void)
f59c7b69 3059{
134fac3f 3060 register_syscore_ops(&iommu_syscore_ops);
f59c7b69
FY
3061}
3062
3063#else
99592ba4 3064static inline void init_iommu_pm_ops(void) {}
f59c7b69
FY
3065#endif /* CONFIG_PM */
3066
45967ffb 3067static int __init rmrr_sanity_check(struct acpi_dmar_reserved_memory *rmrr)
ce4cc52b
BR
3068{
3069 if (!IS_ALIGNED(rmrr->base_address, PAGE_SIZE) ||
3070 !IS_ALIGNED(rmrr->end_address + 1, PAGE_SIZE) ||
3071 rmrr->end_address <= rmrr->base_address ||
3072 arch_rmrr_sanity_check(rmrr))
3073 return -EINVAL;
3074
3075 return 0;
3076}
3077
c2a0b538 3078int __init dmar_parse_one_rmrr(struct acpi_dmar_header *header, void *arg)
318fe7df
SS
3079{
3080 struct acpi_dmar_reserved_memory *rmrr;
3081 struct dmar_rmrr_unit *rmrru;
f036c7fa
YC
3082
3083 rmrr = (struct acpi_dmar_reserved_memory *)header;
96788c7a
HG
3084 if (rmrr_sanity_check(rmrr)) {
3085 pr_warn(FW_BUG
f5a68bb0
BR
3086 "Your BIOS is broken; bad RMRR [%#018Lx-%#018Lx]\n"
3087 "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
3088 rmrr->base_address, rmrr->end_address,
3089 dmi_get_system_info(DMI_BIOS_VENDOR),
3090 dmi_get_system_info(DMI_BIOS_VERSION),
3091 dmi_get_system_info(DMI_PRODUCT_VERSION));
96788c7a
HG
3092 add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
3093 }
318fe7df
SS
3094
3095 rmrru = kzalloc(sizeof(*rmrru), GFP_KERNEL);
3096 if (!rmrru)
0659b8dc 3097 goto out;
318fe7df
SS
3098
3099 rmrru->hdr = header;
f036c7fa 3100
318fe7df
SS
3101 rmrru->base_address = rmrr->base_address;
3102 rmrru->end_address = rmrr->end_address;
0659b8dc 3103
2e455289
JL
3104 rmrru->devices = dmar_alloc_dev_scope((void *)(rmrr + 1),
3105 ((void *)rmrr) + rmrr->header.length,
3106 &rmrru->devices_cnt);
0659b8dc 3107 if (rmrru->devices_cnt && rmrru->devices == NULL)
5f64ce54 3108 goto free_rmrru;
318fe7df 3109
2e455289 3110 list_add(&rmrru->list, &dmar_rmrr_units);
318fe7df 3111
2e455289 3112 return 0;
0659b8dc
EA
3113free_rmrru:
3114 kfree(rmrru);
3115out:
3116 return -ENOMEM;
318fe7df
SS
3117}
3118
6b197249
JL
3119static struct dmar_atsr_unit *dmar_find_atsr(struct acpi_dmar_atsr *atsr)
3120{
3121 struct dmar_atsr_unit *atsru;
3122 struct acpi_dmar_atsr *tmp;
3123
c6f4ebde
QC
3124 list_for_each_entry_rcu(atsru, &dmar_atsr_units, list,
3125 dmar_rcu_check()) {
6b197249
JL
3126 tmp = (struct acpi_dmar_atsr *)atsru->hdr;
3127 if (atsr->segment != tmp->segment)
3128 continue;
3129 if (atsr->header.length != tmp->header.length)
3130 continue;
3131 if (memcmp(atsr, tmp, atsr->header.length) == 0)
3132 return atsru;
3133 }
3134
3135 return NULL;
3136}
3137
3138int dmar_parse_one_atsr(struct acpi_dmar_header *hdr, void *arg)
318fe7df
SS
3139{
3140 struct acpi_dmar_atsr *atsr;
3141 struct dmar_atsr_unit *atsru;
3142
b608fe35 3143 if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled)
6b197249
JL
3144 return 0;
3145
318fe7df 3146 atsr = container_of(hdr, struct acpi_dmar_atsr, header);
6b197249
JL
3147 atsru = dmar_find_atsr(atsr);
3148 if (atsru)
3149 return 0;
3150
3151 atsru = kzalloc(sizeof(*atsru) + hdr->length, GFP_KERNEL);
318fe7df
SS
3152 if (!atsru)
3153 return -ENOMEM;
3154
6b197249
JL
3155 /*
3156 * If memory is allocated from slab by ACPI _DSM method, we need to
3157 * copy the memory content because the memory buffer will be freed
3158 * on return.
3159 */
3160 atsru->hdr = (void *)(atsru + 1);
3161 memcpy(atsru->hdr, hdr, hdr->length);
318fe7df 3162 atsru->include_all = atsr->flags & 0x1;
2e455289
JL
3163 if (!atsru->include_all) {
3164 atsru->devices = dmar_alloc_dev_scope((void *)(atsr + 1),
3165 (void *)atsr + atsr->header.length,
3166 &atsru->devices_cnt);
3167 if (atsru->devices_cnt && atsru->devices == NULL) {
3168 kfree(atsru);
3169 return -ENOMEM;
3170 }
3171 }
318fe7df 3172
0e242612 3173 list_add_rcu(&atsru->list, &dmar_atsr_units);
318fe7df
SS
3174
3175 return 0;
3176}
3177
9bdc531e
JL
3178static void intel_iommu_free_atsr(struct dmar_atsr_unit *atsru)
3179{
3180 dmar_free_dev_scope(&atsru->devices, &atsru->devices_cnt);
3181 kfree(atsru);
3182}
3183
6b197249
JL
3184int dmar_release_one_atsr(struct acpi_dmar_header *hdr, void *arg)
3185{
3186 struct acpi_dmar_atsr *atsr;
3187 struct dmar_atsr_unit *atsru;
3188
3189 atsr = container_of(hdr, struct acpi_dmar_atsr, header);
3190 atsru = dmar_find_atsr(atsr);
3191 if (atsru) {
3192 list_del_rcu(&atsru->list);
3193 synchronize_rcu();
3194 intel_iommu_free_atsr(atsru);
3195 }
3196
3197 return 0;
3198}
3199
3200int dmar_check_one_atsr(struct acpi_dmar_header *hdr, void *arg)
3201{
3202 int i;
3203 struct device *dev;
3204 struct acpi_dmar_atsr *atsr;
3205 struct dmar_atsr_unit *atsru;
3206
3207 atsr = container_of(hdr, struct acpi_dmar_atsr, header);
3208 atsru = dmar_find_atsr(atsr);
3209 if (!atsru)
3210 return 0;
3211
194dc870 3212 if (!atsru->include_all && atsru->devices && atsru->devices_cnt) {
6b197249
JL
3213 for_each_active_dev_scope(atsru->devices, atsru->devices_cnt,
3214 i, dev)
3215 return -EBUSY;
194dc870 3216 }
6b197249
JL
3217
3218 return 0;
3219}
3220
31a75cbb
YC
3221static struct dmar_satc_unit *dmar_find_satc(struct acpi_dmar_satc *satc)
3222{
3223 struct dmar_satc_unit *satcu;
3224 struct acpi_dmar_satc *tmp;
3225
3226 list_for_each_entry_rcu(satcu, &dmar_satc_units, list,
3227 dmar_rcu_check()) {
3228 tmp = (struct acpi_dmar_satc *)satcu->hdr;
3229 if (satc->segment != tmp->segment)
3230 continue;
3231 if (satc->header.length != tmp->header.length)
3232 continue;
3233 if (memcmp(satc, tmp, satc->header.length) == 0)
3234 return satcu;
3235 }
3236
3237 return NULL;
3238}
3239
3240int dmar_parse_one_satc(struct acpi_dmar_header *hdr, void *arg)
3241{
3242 struct acpi_dmar_satc *satc;
3243 struct dmar_satc_unit *satcu;
3244
3245 if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled)
3246 return 0;
3247
3248 satc = container_of(hdr, struct acpi_dmar_satc, header);
3249 satcu = dmar_find_satc(satc);
3250 if (satcu)
3251 return 0;
3252
3253 satcu = kzalloc(sizeof(*satcu) + hdr->length, GFP_KERNEL);
3254 if (!satcu)
3255 return -ENOMEM;
3256
3257 satcu->hdr = (void *)(satcu + 1);
3258 memcpy(satcu->hdr, hdr, hdr->length);
3259 satcu->atc_required = satc->flags & 0x1;
3260 satcu->devices = dmar_alloc_dev_scope((void *)(satc + 1),
3261 (void *)satc + satc->header.length,
3262 &satcu->devices_cnt);
3263 if (satcu->devices_cnt && !satcu->devices) {
3264 kfree(satcu);
3265 return -ENOMEM;
3266 }
3267 list_add_rcu(&satcu->list, &dmar_satc_units);
3268
3269 return 0;
3270}
3271
ffebeb46
JL
3272static int intel_iommu_add(struct dmar_drhd_unit *dmaru)
3273{
e083ea5b 3274 int sp, ret;
ffebeb46
JL
3275 struct intel_iommu *iommu = dmaru->iommu;
3276
ad3d1902
KMP
3277 ret = intel_cap_audit(CAP_AUDIT_HOTPLUG_DMAR, iommu);
3278 if (ret)
3279 goto out;
3280
ffebeb46 3281 if (hw_pass_through && !ecap_pass_through(iommu->ecap)) {
9f10e5bf 3282 pr_warn("%s: Doesn't support hardware pass through.\n",
ffebeb46
JL
3283 iommu->name);
3284 return -ENXIO;
3285 }
e8055226 3286
64229e8f 3287 sp = domain_update_iommu_superpage(NULL, iommu) - 1;
ffebeb46 3288 if (sp >= 0 && !(cap_super_page_val(iommu->cap) & (1 << sp))) {
9f10e5bf 3289 pr_warn("%s: Doesn't support large page.\n",
ffebeb46
JL
3290 iommu->name);
3291 return -ENXIO;
3292 }
3293
3294 /*
3295 * Disable translation if already enabled prior to OS handover.
3296 */
3297 if (iommu->gcmd & DMA_GCMD_TE)
3298 iommu_disable_translation(iommu);
3299
ffebeb46
JL
3300 ret = iommu_init_domains(iommu);
3301 if (ret == 0)
3302 ret = iommu_alloc_root_entry(iommu);
3303 if (ret)
3304 goto out;
3305
ff3dc652 3306 intel_svm_check(iommu);
8a94ade4 3307
ffebeb46
JL
3308 if (dmaru->ignored) {
3309 /*
3310 * we always have to disable PMRs or DMA may fail on this device
3311 */
3312 if (force_on)
3313 iommu_disable_protect_mem_regions(iommu);
3314 return 0;
3315 }
3316
3317 intel_iommu_init_qi(iommu);
3318 iommu_flush_write_buffer(iommu);
a222a7f0
DW
3319
3320#ifdef CONFIG_INTEL_IOMMU_SVM
765b6a98 3321 if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
a222a7f0
DW
3322 ret = intel_svm_enable_prq(iommu);
3323 if (ret)
3324 goto disable_iommu;
3325 }
3326#endif
ffebeb46
JL
3327 ret = dmar_set_interrupt(iommu);
3328 if (ret)
3329 goto disable_iommu;
3330
3331 iommu_set_root_entry(iommu);
ffebeb46
JL
3332 iommu_enable_translation(iommu);
3333
ffebeb46
JL
3334 iommu_disable_protect_mem_regions(iommu);
3335 return 0;
3336
3337disable_iommu:
3338 disable_dmar_iommu(iommu);
3339out:
3340 free_dmar_iommu(iommu);
3341 return ret;
3342}
3343
6b197249
JL
3344int dmar_iommu_hotplug(struct dmar_drhd_unit *dmaru, bool insert)
3345{
ffebeb46
JL
3346 int ret = 0;
3347 struct intel_iommu *iommu = dmaru->iommu;
3348
3349 if (!intel_iommu_enabled)
3350 return 0;
3351 if (iommu == NULL)
3352 return -EINVAL;
3353
3354 if (insert) {
3355 ret = intel_iommu_add(dmaru);
3356 } else {
3357 disable_dmar_iommu(iommu);
3358 free_dmar_iommu(iommu);
3359 }
3360
3361 return ret;
6b197249
JL
3362}
3363
9bdc531e
JL
3364static void intel_iommu_free_dmars(void)
3365{
3366 struct dmar_rmrr_unit *rmrru, *rmrr_n;
3367 struct dmar_atsr_unit *atsru, *atsr_n;
31a75cbb 3368 struct dmar_satc_unit *satcu, *satc_n;
9bdc531e
JL
3369
3370 list_for_each_entry_safe(rmrru, rmrr_n, &dmar_rmrr_units, list) {
3371 list_del(&rmrru->list);
3372 dmar_free_dev_scope(&rmrru->devices, &rmrru->devices_cnt);
3373 kfree(rmrru);
318fe7df
SS
3374 }
3375
9bdc531e
JL
3376 list_for_each_entry_safe(atsru, atsr_n, &dmar_atsr_units, list) {
3377 list_del(&atsru->list);
3378 intel_iommu_free_atsr(atsru);
3379 }
31a75cbb
YC
3380 list_for_each_entry_safe(satcu, satc_n, &dmar_satc_units, list) {
3381 list_del(&satcu->list);
3382 dmar_free_dev_scope(&satcu->devices, &satcu->devices_cnt);
3383 kfree(satcu);
3384 }
318fe7df
SS
3385}
3386
97f2f2c5
YC
3387static struct dmar_satc_unit *dmar_find_matched_satc_unit(struct pci_dev *dev)
3388{
3389 struct dmar_satc_unit *satcu;
3390 struct acpi_dmar_satc *satc;
3391 struct device *tmp;
3392 int i;
3393
3394 dev = pci_physfn(dev);
3395 rcu_read_lock();
3396
3397 list_for_each_entry_rcu(satcu, &dmar_satc_units, list) {
3398 satc = container_of(satcu->hdr, struct acpi_dmar_satc, header);
3399 if (satc->segment != pci_domain_nr(dev->bus))
3400 continue;
3401 for_each_dev_scope(satcu->devices, satcu->devices_cnt, i, tmp)
3402 if (to_pci_dev(tmp) == dev)
3403 goto out;
3404 }
3405 satcu = NULL;
3406out:
3407 rcu_read_unlock();
3408 return satcu;
3409}
3410
3411static int dmar_ats_supported(struct pci_dev *dev, struct intel_iommu *iommu)
318fe7df 3412{
b683b230 3413 int i, ret = 1;
318fe7df 3414 struct pci_bus *bus;
832bd858
DW
3415 struct pci_dev *bridge = NULL;
3416 struct device *tmp;
318fe7df
SS
3417 struct acpi_dmar_atsr *atsr;
3418 struct dmar_atsr_unit *atsru;
97f2f2c5 3419 struct dmar_satc_unit *satcu;
318fe7df
SS
3420
3421 dev = pci_physfn(dev);
97f2f2c5
YC
3422 satcu = dmar_find_matched_satc_unit(dev);
3423 if (satcu)
3424 /*
3425 * This device supports ATS as it is listed in the SATC table.
3426 * When the IOMMU is in legacy mode, enabling ATS is done
3427 * automatically by HW for the device that requires
3428 * ATS, hence the OS should not enable ATS for this device
3429 * to avoid duplicated TLB invalidation.
3430 */
3431 return !(satcu->atc_required && !sm_supported(iommu));
3432
318fe7df 3433 for (bus = dev->bus; bus; bus = bus->parent) {
b5f82ddf 3434 bridge = bus->self;
d14053b3
DW
3435 /* If it's an integrated device, allow ATS */
3436 if (!bridge)
3437 return 1;
3438 /* Connected via non-PCIe: no ATS */
3439 if (!pci_is_pcie(bridge) ||
62f87c0e 3440 pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE)
318fe7df 3441 return 0;
d14053b3 3442 /* If we found the root port, look it up in the ATSR */
b5f82ddf 3443 if (pci_pcie_type(bridge) == PCI_EXP_TYPE_ROOT_PORT)
318fe7df 3444 break;
318fe7df
SS
3445 }
3446
0e242612 3447 rcu_read_lock();
b5f82ddf
JL
3448 list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) {
3449 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
3450 if (atsr->segment != pci_domain_nr(dev->bus))
3451 continue;
3452
b683b230 3453 for_each_dev_scope(atsru->devices, atsru->devices_cnt, i, tmp)
832bd858 3454 if (tmp == &bridge->dev)
b683b230 3455 goto out;
b5f82ddf
JL
3456
3457 if (atsru->include_all)
b683b230 3458 goto out;
b5f82ddf 3459 }
b683b230
JL
3460 ret = 0;
3461out:
0e242612 3462 rcu_read_unlock();
318fe7df 3463
b683b230 3464 return ret;
318fe7df
SS
3465}
3466
59ce0515
JL
3467int dmar_iommu_notify_scope_dev(struct dmar_pci_notify_info *info)
3468{
e083ea5b 3469 int ret;
59ce0515
JL
3470 struct dmar_rmrr_unit *rmrru;
3471 struct dmar_atsr_unit *atsru;
31a75cbb 3472 struct dmar_satc_unit *satcu;
59ce0515
JL
3473 struct acpi_dmar_atsr *atsr;
3474 struct acpi_dmar_reserved_memory *rmrr;
31a75cbb 3475 struct acpi_dmar_satc *satc;
59ce0515 3476
b608fe35 3477 if (!intel_iommu_enabled && system_state >= SYSTEM_RUNNING)
59ce0515
JL
3478 return 0;
3479
3480 list_for_each_entry(rmrru, &dmar_rmrr_units, list) {
3481 rmrr = container_of(rmrru->hdr,
3482 struct acpi_dmar_reserved_memory, header);
3483 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
3484 ret = dmar_insert_dev_scope(info, (void *)(rmrr + 1),
3485 ((void *)rmrr) + rmrr->header.length,
3486 rmrr->segment, rmrru->devices,
3487 rmrru->devices_cnt);
e083ea5b 3488 if (ret < 0)
59ce0515 3489 return ret;
e6a8c9b3 3490 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
27e24950
JL
3491 dmar_remove_dev_scope(info, rmrr->segment,
3492 rmrru->devices, rmrru->devices_cnt);
59ce0515
JL
3493 }
3494 }
3495
3496 list_for_each_entry(atsru, &dmar_atsr_units, list) {
3497 if (atsru->include_all)
3498 continue;
3499
3500 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
3501 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
3502 ret = dmar_insert_dev_scope(info, (void *)(atsr + 1),
3503 (void *)atsr + atsr->header.length,
3504 atsr->segment, atsru->devices,
3505 atsru->devices_cnt);
3506 if (ret > 0)
3507 break;
e083ea5b 3508 else if (ret < 0)
59ce0515 3509 return ret;
e6a8c9b3 3510 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
59ce0515
JL
3511 if (dmar_remove_dev_scope(info, atsr->segment,
3512 atsru->devices, atsru->devices_cnt))
3513 break;
3514 }
3515 }
31a75cbb
YC
3516 list_for_each_entry(satcu, &dmar_satc_units, list) {
3517 satc = container_of(satcu->hdr, struct acpi_dmar_satc, header);
3518 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
3519 ret = dmar_insert_dev_scope(info, (void *)(satc + 1),
3520 (void *)satc + satc->header.length,
3521 satc->segment, satcu->devices,
3522 satcu->devices_cnt);
3523 if (ret > 0)
3524 break;
3525 else if (ret < 0)
3526 return ret;
3527 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
3528 if (dmar_remove_dev_scope(info, satc->segment,
3529 satcu->devices, satcu->devices_cnt))
3530 break;
3531 }
3532 }
59ce0515
JL
3533
3534 return 0;
3535}
3536
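/*
 * Memory hotplug notifier: keep the si_domain identity map in sync by
 * mapping newly onlined memory ranges and by unmapping (and flushing
 * the IOTLB for) ranges that go offline.
 */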
75f05569
JL
3537static int intel_iommu_memory_notifier(struct notifier_block *nb,
3538 unsigned long val, void *v)
3539{
3540 struct memory_notify *mhp = v;
fb5f50a4
YX
3541 unsigned long start_vpfn = mm_to_dma_pfn_start(mhp->start_pfn);
3542 unsigned long last_vpfn = mm_to_dma_pfn_end(mhp->start_pfn +
e70b081c 3543 mhp->nr_pages - 1);
75f05569
JL
3544
3545 switch (val) {
3546 case MEM_GOING_ONLINE:
e70b081c
TM
3547 if (iommu_domain_identity_map(si_domain,
3548 start_vpfn, last_vpfn)) {
3549 pr_warn("Failed to build identity map for [%lx-%lx]\n",
3550 start_vpfn, last_vpfn);
75f05569
JL
3551 return NOTIFY_BAD;
3552 }
3553 break;
3554
3555 case MEM_OFFLINE:
3556 case MEM_CANCEL_ONLINE:
e70b081c 3557 {
75f05569
JL
3558 struct dmar_drhd_unit *drhd;
3559 struct intel_iommu *iommu;
87f60cc6 3560 LIST_HEAD(freelist);
75f05569 3561
87f60cc6 3562 domain_unmap(si_domain, start_vpfn, last_vpfn, &freelist);
ea8ea460 3563
75f05569
JL
3564 rcu_read_lock();
3565 for_each_active_iommu(iommu, drhd)
a1ddcbe9 3566 iommu_flush_iotlb_psi(iommu, si_domain,
e70b081c 3567 start_vpfn, mhp->nr_pages,
87f60cc6 3568 list_empty(&freelist), 0);
75f05569 3569 rcu_read_unlock();
87f60cc6 3570 put_pages_list(&freelist);
75f05569
JL
3571 }
3572 break;
3573 }
3574
3575 return NOTIFY_OK;
3576}
3577
3578static struct notifier_block intel_iommu_memory_nb = {
3579 .notifier_call = intel_iommu_memory_notifier,
3580 .priority = 0
3581};
3582
161b28aa
JR
3583static void intel_disable_iommus(void)
3584{
3585 struct intel_iommu *iommu = NULL;
3586 struct dmar_drhd_unit *drhd;
3587
3588 for_each_iommu(iommu, drhd)
3589 iommu_disable_translation(iommu);
3590}
3591
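/*
 * Switch off DMA translation on all IOMMUs and disable PMRs at system
 * shutdown, so that a subsequently booted kernel (for example via
 * kexec) does not inherit live translations.
 */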
6c3a44ed
DD
3592void intel_iommu_shutdown(void)
3593{
3594 struct dmar_drhd_unit *drhd;
3595 struct intel_iommu *iommu = NULL;
3596
3597 if (no_iommu || dmar_disabled)
3598 return;
3599
3600 down_write(&dmar_global_lock);
3601
3602 /* Disable PMRs explicitly here. */
3603 for_each_iommu(iommu, drhd)
3604 iommu_disable_protect_mem_regions(iommu);
3605
3606 /* Make sure the IOMMUs are switched off */
3607 intel_disable_iommus();
3608
3609 up_write(&dmar_global_lock);
3610}
3611
a7fdb6e6
JR
3612static inline struct intel_iommu *dev_to_intel_iommu(struct device *dev)
3613{
2926a2aa
JR
3614 struct iommu_device *iommu_dev = dev_to_iommu_device(dev);
3615
3616 return container_of(iommu_dev, struct intel_iommu, iommu);
a7fdb6e6
JR
3617}
3618
3bc770b0
Y
3619static ssize_t version_show(struct device *dev,
3620 struct device_attribute *attr, char *buf)
a5459cfe 3621{
a7fdb6e6 3622 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
a5459cfe 3623 u32 ver = readl(iommu->reg + DMAR_VER_REG);
c33fcc13
LB
3624 return sysfs_emit(buf, "%d:%d\n",
3625 DMAR_VER_MAJOR(ver), DMAR_VER_MINOR(ver));
a5459cfe 3626}
3bc770b0 3627static DEVICE_ATTR_RO(version);
a5459cfe 3628
3bc770b0
Y
3629static ssize_t address_show(struct device *dev,
3630 struct device_attribute *attr, char *buf)
a5459cfe 3631{
a7fdb6e6 3632 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
c33fcc13 3633 return sysfs_emit(buf, "%llx\n", iommu->reg_phys);
a5459cfe 3634}
3bc770b0 3635static DEVICE_ATTR_RO(address);
a5459cfe 3636
3bc770b0
Y
3637static ssize_t cap_show(struct device *dev,
3638 struct device_attribute *attr, char *buf)
a5459cfe 3639{
a7fdb6e6 3640 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
c33fcc13 3641 return sysfs_emit(buf, "%llx\n", iommu->cap);
a5459cfe 3642}
3bc770b0 3643static DEVICE_ATTR_RO(cap);
a5459cfe 3644
3bc770b0
Y
3645static ssize_t ecap_show(struct device *dev,
3646 struct device_attribute *attr, char *buf)
a5459cfe 3647{
a7fdb6e6 3648 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
c33fcc13 3649 return sysfs_emit(buf, "%llx\n", iommu->ecap);
a5459cfe 3650}
3bc770b0 3651static DEVICE_ATTR_RO(ecap);
a5459cfe 3652
3bc770b0
Y
3653static ssize_t domains_supported_show(struct device *dev,
3654 struct device_attribute *attr, char *buf)
2238c082 3655{
a7fdb6e6 3656 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
c33fcc13 3657 return sysfs_emit(buf, "%ld\n", cap_ndoms(iommu->cap));
2238c082 3658}
3bc770b0 3659static DEVICE_ATTR_RO(domains_supported);
2238c082 3660
3bc770b0
Y
3661static ssize_t domains_used_show(struct device *dev,
3662 struct device_attribute *attr, char *buf)
2238c082 3663{
a7fdb6e6 3664 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
c33fcc13
LB
3665 return sysfs_emit(buf, "%d\n",
3666 bitmap_weight(iommu->domain_ids,
3667 cap_ndoms(iommu->cap)));
2238c082 3668}
3bc770b0 3669static DEVICE_ATTR_RO(domains_used);
2238c082 3670
a5459cfe
AW
3671static struct attribute *intel_iommu_attrs[] = {
3672 &dev_attr_version.attr,
3673 &dev_attr_address.attr,
3674 &dev_attr_cap.attr,
3675 &dev_attr_ecap.attr,
2238c082
AW
3676 &dev_attr_domains_supported.attr,
3677 &dev_attr_domains_used.attr,
a5459cfe
AW
3678 NULL,
3679};
3680
3681static struct attribute_group intel_iommu_group = {
3682 .name = "intel-iommu",
3683 .attrs = intel_iommu_attrs,
3684};
3685
3686const struct attribute_group *intel_iommu_groups[] = {
3687 &intel_iommu_group,
3688 NULL,
3689};
3690
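/*
 * These attributes appear under the IOMMU device's sysfs directory,
 * e.g. (assuming the unit is registered as dmar0):
 *
 *   # cat /sys/class/iommu/dmar0/intel-iommu/version
 *   1:0
 *
 * The unit name and the reported values depend on the platform.
 */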
99b50be9 3691static inline bool has_external_pci(void)
89a6079d
LB
3692{
3693 struct pci_dev *pdev = NULL;
89a6079d 3694
c5a5dc4c 3695 for_each_pci_dev(pdev)
afca9e19
XW
3696 if (pdev->external_facing) {
3697 pci_dev_put(pdev);
c5a5dc4c 3698 return true;
afca9e19 3699 }
89a6079d 3700
c5a5dc4c
LB
3701 return false;
3702}
89a6079d 3703
c5a5dc4c
LB
3704static int __init platform_optin_force_iommu(void)
3705{
99b50be9 3706 if (!dmar_platform_optin() || no_platform_optin || !has_external_pci())
89a6079d
LB
3707 return 0;
3708
3709 if (no_iommu || dmar_disabled)
3710 pr_info("Intel-IOMMU force enabled due to platform opt in\n");
3711
3712 /*
3713 * If Intel-IOMMU is disabled by default, we will apply identity
3714 * map for all devices except those marked as being untrusted.
3715 */
3716 if (dmar_disabled)
b89b6605 3717 iommu_set_default_passthrough(false);
89a6079d
LB
3718
3719 dmar_disabled = 0;
89a6079d
LB
3720 no_iommu = 0;
3721
3722 return 1;
3723}
3724
fa212a97
LB
3725static int __init probe_acpi_namespace_devices(void)
3726{
3727 struct dmar_drhd_unit *drhd;
af88ec39
QC
3728 /* To avoid a -Wunused-but-set-variable warning. */
3729 struct intel_iommu *iommu __maybe_unused;
fa212a97
LB
3730 struct device *dev;
3731 int i, ret = 0;
3732
3733 for_each_active_iommu(iommu, drhd) {
3734 for_each_active_dev_scope(drhd->devices,
3735 drhd->devices_cnt, i, dev) {
3736 struct acpi_device_physical_node *pn;
fa212a97
LB
3737 struct acpi_device *adev;
3738
3739 if (dev->bus != &acpi_bus_type)
3740 continue;
3741
3742 adev = to_acpi_device(dev);
3743 mutex_lock(&adev->physical_node_lock);
3744 list_for_each_entry(pn,
3745 &adev->physical_node_list, node) {
fa212a97
LB
3746 ret = iommu_probe_device(pn->dev);
3747 if (ret)
3748 break;
3749 }
3750 mutex_unlock(&adev->physical_node_lock);
3751
3752 if (ret)
3753 return ret;
3754 }
3755 }
3756
3757 return 0;
3758}
3759
853788b9
LB
3760static __init int tboot_force_iommu(void)
3761{
3762 if (!tboot_enabled())
3763 return 0;
3764
3765 if (no_iommu || dmar_disabled)
3766 pr_warn("Forcing Intel-IOMMU to enabled\n");
3767
3768 dmar_disabled = 0;
3769 no_iommu = 0;
3770
3771 return 1;
3772}
3773
ba395927
KA
3774int __init intel_iommu_init(void)
3775{
9bdc531e 3776 int ret = -ENODEV;
3a93c841 3777 struct dmar_drhd_unit *drhd;
7c919779 3778 struct intel_iommu *iommu;
ba395927 3779
89a6079d
LB
3780 /*
3781 * Intel IOMMU is required for a TXT/tboot launch or platform
3782 * opt in, so enforce that.
3783 */
4d213e76
ZD
3784 force_on = (!intel_iommu_tboot_noforce && tboot_force_iommu()) ||
3785 platform_optin_force_iommu();
a59b50e9 3786
3a5670e8 3787 down_write(&dmar_global_lock);
a59b50e9
JC
3788 if (dmar_table_init()) {
3789 if (force_on)
3790 panic("tboot: Failed to initialize DMAR table\n");
9bdc531e 3791 goto out_free_dmar;
a59b50e9 3792 }
ba395927 3793
c2c7286a 3794 if (dmar_dev_scope_init() < 0) {
a59b50e9
JC
3795 if (force_on)
3796 panic("tboot: Failed to initialize DMAR device scope\n");
9bdc531e 3797 goto out_free_dmar;
a59b50e9 3798 }
1886e8a9 3799
ec154bf5
JR
3800 up_write(&dmar_global_lock);
3801
3802 /*
3803 * The bus notifier takes the dmar_global_lock, so lockdep will
3804 * complain later when we register it under the lock.
3805 */
3806 dmar_register_bus_notifier();
3807
3808 down_write(&dmar_global_lock);
3809
1da8347d
MD
3810 if (!no_iommu)
3811 intel_iommu_debugfs_init();
3812
161b28aa 3813 if (no_iommu || dmar_disabled) {
bfd20f1c
SL
3814 /*
3815 * We exit the function here to ensure that the IOMMU's remapping
3816 * and mempool aren't set up, which means that the IOMMU's PMRs
3817 * won't be disabled via the call to init_dmars(). So disable
3818 * them explicitly here. The PMRs were set up by tboot prior to
3819 * calling SENTER, but the kernel is expected to reset/tear
3820 * down the PMRs.
3821 */
3822 if (intel_iommu_tboot_noforce) {
3823 for_each_iommu(iommu, drhd)
3824 iommu_disable_protect_mem_regions(iommu);
3825 }
3826
161b28aa
JR
3827 /*
3828 * Make sure the IOMMUs are switched off, even when we
3829 * boot into a kexec kernel and the previous kernel left
3830 * them enabled
3831 */
3832 intel_disable_iommus();
9bdc531e 3833 goto out_free_dmar;
161b28aa 3834 }
2ae21010 3835
318fe7df 3836 if (list_empty(&dmar_rmrr_units))
9f10e5bf 3837 pr_info("No RMRR found\n");
318fe7df
SS
3838
3839 if (list_empty(&dmar_atsr_units))
9f10e5bf 3840 pr_info("No ATSR found\n");
318fe7df 3841
31a75cbb
YC
3842 if (list_empty(&dmar_satc_units))
3843 pr_info("No SATC found\n");
3844
ba395927
KA
3845 init_no_remapping_devices();
3846
b779260b 3847 ret = init_dmars();
ba395927 3848 if (ret) {
a59b50e9
JC
3849 if (force_on)
3850 panic("tboot: Failed to initialize DMARs\n");
9f10e5bf 3851 pr_err("Initialization failed\n");
c588072b 3852 goto out_free_dmar;
ba395927 3853 }
3a5670e8 3854 up_write(&dmar_global_lock);
ba395927 3855
134fac3f 3856 init_iommu_pm_ops();
a8bcbb0d 3857
2d48ea0e 3858 down_read(&dmar_global_lock);
39ab9555 3859 for_each_active_iommu(iommu, drhd) {
a250c23f
RM
3860 /*
3861 * The flush queue implementation does not perform
3862 * page-selective invalidations that are required for efficient
3863 * TLB flushes in virtual environments. The benefit of batching
3864 * is likely to be much lower than the overhead of synchronizing
3865 * the virtual and physical IOMMU page-tables.
3866 */
257ec290
TZ
3867 if (cap_caching_mode(iommu->cap) &&
3868 !first_level_by_default(IOMMU_DOMAIN_DMA)) {
d0e108b8 3869 pr_info_once("IOMMU batching disallowed due to virtualization\n");
308723e3 3870 iommu_set_dma_strict();
a250c23f 3871 }
39ab9555
JR
3872 iommu_device_sysfs_add(&iommu->iommu, NULL,
3873 intel_iommu_groups,
3874 "%s", iommu->name);
2d471b20 3875 iommu_device_register(&iommu->iommu, &intel_iommu_ops, NULL);
d8a7c0cf
KL
3876
3877 iommu_pmu_register(iommu);
39ab9555 3878 }
2d48ea0e 3879 up_read(&dmar_global_lock);
a5459cfe 3880
75f05569
JL
3881 if (si_domain && !hw_pass_through)
3882 register_memory_notifier(&intel_iommu_memory_nb);
d8190dc6 3883
d5692d4a 3884 down_read(&dmar_global_lock);
fa212a97
LB
3885 if (probe_acpi_namespace_devices())
3886 pr_warn("ACPI name space devices didn't probe correctly\n");
3887
d8190dc6
LB
3888 /* Finally, we enable the DMA remapping hardware. */
3889 for_each_iommu(iommu, drhd) {
6a8c6748 3890 if (!drhd->ignored && !translation_pre_enabled(iommu))
d8190dc6
LB
3891 iommu_enable_translation(iommu);
3892
3893 iommu_disable_protect_mem_regions(iommu);
3894 }
2d48ea0e
QC
3895 up_read(&dmar_global_lock);
3896
d8190dc6
LB
3897 pr_info("Intel(R) Virtualization Technology for Directed I/O\n");
3898
8bc1f85c
ED
3899 intel_iommu_enabled = 1;
3900
ba395927 3901 return 0;
9bdc531e 3902
9bdc531e
JL
3903out_free_dmar:
3904 intel_iommu_free_dmars();
3a5670e8 3905 up_write(&dmar_global_lock);
9bdc531e 3906 return ret;
ba395927 3907}
e820482c 3908
0ce4a85f
LB
3909static int domain_context_clear_one_cb(struct pci_dev *pdev, u16 alias, void *opaque)
3910{
37764b95 3911 struct device_domain_info *info = opaque;
0ce4a85f 3912
37764b95 3913 domain_context_clear_one(info, PCI_BUS_NUM(alias), alias & 0xff);
0ce4a85f
LB
3914 return 0;
3915}
3916
3917/*
3918 * NB - intel-iommu lacks any sort of reference counting for the users of
3919 * dependent devices. If multiple endpoints have intersecting dependent
3920 * devices, unbinding the driver from any one of them will possibly leave
3921 * the others unable to operate.
3922 */
37764b95 3923static void domain_context_clear(struct device_domain_info *info)
0ce4a85f 3924{
37764b95 3925 if (!info->iommu || !info->dev || !dev_is_pci(info->dev))
0ce4a85f
LB
3926 return;
3927
37764b95
SK
3928 pci_for_each_dma_alias(to_pci_dev(info->dev),
3929 &domain_context_clear_one_cb, info);
0ce4a85f
LB
3930}
3931
db75c957 3932static void dmar_remove_one_dev_info(struct device *dev)
c7151a8d 3933{
db75c957 3934 struct device_domain_info *info = dev_iommu_priv_get(dev);
5eaafdf0 3935 struct dmar_domain *domain = info->domain;
db75c957 3936 struct intel_iommu *iommu = info->iommu;
a349ffcb 3937 unsigned long flags;
c7151a8d 3938
db75c957 3939 if (!dev_is_real_dma_subdevice(info->dev)) {
ef848b7e
LB
3940 if (dev_is_pci(info->dev) && sm_supported(iommu))
3941 intel_pasid_tear_down_entry(iommu, info->dev,
42987801 3942 IOMMU_NO_PASID, false);
ef848b7e 3943
ba502132 3944 iommu_disable_pci_caps(info);
474dd1c6 3945 domain_context_clear(info);
127c7615 3946 }
c7151a8d 3947
a349ffcb 3948 spin_lock_irqsave(&domain->lock, flags);
586081d3 3949 list_del(&info->link);
a349ffcb 3950 spin_unlock_irqrestore(&domain->lock, flags);
c7151a8d 3951
942067f1 3952 domain_detach_iommu(domain, iommu);
db75c957 3953 info->domain = NULL;
c7151a8d
WH
3954}
3955
c7be17c2
LB
3956/*
3957 * Clear the page table pointer in context or pasid table entries so that
3958 * all DMA requests without PASID from the device are blocked. If the page
3959 * table has been set, clean up the data structures.
3960 */
3961static void device_block_translation(struct device *dev)
3962{
3963 struct device_domain_info *info = dev_iommu_priv_get(dev);
3964 struct intel_iommu *iommu = info->iommu;
3965 unsigned long flags;
3966
ba502132 3967 iommu_disable_pci_caps(info);
c7be17c2
LB
3968 if (!dev_is_real_dma_subdevice(dev)) {
3969 if (sm_supported(iommu))
3970 intel_pasid_tear_down_entry(iommu, dev,
42987801 3971 IOMMU_NO_PASID, false);
c7be17c2
LB
3972 else
3973 domain_context_clear(info);
3974 }
3975
3976 if (!info->domain)
3977 return;
3978
3979 spin_lock_irqsave(&info->domain->lock, flags);
3980 list_del(&info->link);
3981 spin_unlock_irqrestore(&info->domain->lock, flags);
3982
3983 domain_detach_iommu(info->domain, iommu);
3984 info->domain = NULL;
3985}
3986
301e7ee1
JR
3987static int md_domain_init(struct dmar_domain *domain, int guest_width)
3988{
3989 int adjust_width;
3990
301e7ee1
JR
3991 /* calculate AGAW */
3992 domain->gaw = guest_width;
3993 adjust_width = guestwidth_to_adjustwidth(guest_width);
3994 domain->agaw = width_to_agaw(adjust_width);
3995
1f106ff0 3996 domain->iommu_coherency = false;
301e7ee1
JR
3997 domain->iommu_superpage = 0;
3998 domain->max_addr = 0;
3999
4000 /* always allocate the top pgd */
2552d3a2 4001 domain->pgd = alloc_pgtable_page(domain->nid, GFP_ATOMIC);
301e7ee1
JR
4002 if (!domain->pgd)
4003 return -ENOMEM;
4004 domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
4005 return 0;
4006}
4007
35a99c54
LB
4008static int blocking_domain_attach_dev(struct iommu_domain *domain,
4009 struct device *dev)
4010{
4011 device_block_translation(dev);
4012 return 0;
4013}
4014
4015static struct iommu_domain blocking_domain = {
4016 .ops = &(const struct iommu_domain_ops) {
4017 .attach_dev = blocking_domain_attach_dev,
4018 .free = intel_iommu_domain_free
4019 }
4020};
4021
00a77deb 4022static struct iommu_domain *intel_iommu_domain_alloc(unsigned type)
38717946 4023{
5d450806 4024 struct dmar_domain *dmar_domain;
00a77deb
JR
4025 struct iommu_domain *domain;
4026
4de354ec 4027 switch (type) {
35a99c54
LB
4028 case IOMMU_DOMAIN_BLOCKED:
4029 return &blocking_domain;
fa954e68 4030 case IOMMU_DOMAIN_DMA:
4de354ec 4031 case IOMMU_DOMAIN_UNMANAGED:
b34380a6 4032 dmar_domain = alloc_domain(type);
4de354ec
LB
4033 if (!dmar_domain) {
4034 pr_err("Can't allocate dmar_domain\n");
4035 return NULL;
4036 }
301e7ee1 4037 if (md_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
4de354ec
LB
4038 pr_err("Domain initialization failed\n");
4039 domain_exit(dmar_domain);
4040 return NULL;
4041 }
fa954e68 4042
4de354ec
LB
4043 domain = &dmar_domain->domain;
4044 domain->geometry.aperture_start = 0;
4045 domain->geometry.aperture_end =
4046 __DOMAIN_MAX_ADDR(dmar_domain->gaw);
4047 domain->geometry.force_aperture = true;
4048
4049 return domain;
4050 case IOMMU_DOMAIN_IDENTITY:
4051 return &si_domain->domain;
eaca8889
LB
4052 case IOMMU_DOMAIN_SVA:
4053 return intel_svm_domain_alloc();
4de354ec 4054 default:
00a77deb 4055 return NULL;
38717946 4056 }
8a0e715b 4057
4de354ec 4058 return NULL;
38717946 4059}
38717946 4060
00a77deb 4061static void intel_iommu_domain_free(struct iommu_domain *domain)
38717946 4062{
35a99c54 4063 if (domain != &si_domain->domain && domain != &blocking_domain)
4de354ec 4064 domain_exit(to_dmar_domain(domain));
38717946 4065}
38717946 4066
8cc3759a
LB
4067static int prepare_domain_attach_device(struct iommu_domain *domain,
4068 struct device *dev)
38717946 4069{
00a77deb 4070 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
fe40f1e0
WH
4071 struct intel_iommu *iommu;
4072 int addr_width;
faa3d6f5 4073
dd6692f1 4074 iommu = device_to_iommu(dev, NULL, NULL);
fe40f1e0
WH
4075 if (!iommu)
4076 return -ENODEV;
4077
9d6ab26a 4078 if (dmar_domain->force_snooping && !ecap_sc_support(iommu->ecap))
f4a14773 4079 return -EINVAL;
9d6ab26a 4080
fe40f1e0
WH
4081 /* check if this iommu agaw is sufficient for max mapped address */
4082 addr_width = agaw_to_width(iommu->agaw);
a99c47a2
TL
4083 if (addr_width > cap_mgaw(iommu->cap))
4084 addr_width = cap_mgaw(iommu->cap);
4085
f4a14773
NC
4086 if (dmar_domain->max_addr > (1LL << addr_width))
4087 return -EINVAL;
a99c47a2
TL
4088 dmar_domain->gaw = addr_width;
4089
4090 /*
4091 * Knock out extra levels of page tables if necessary
4092 */
4093 while (iommu->agaw < dmar_domain->agaw) {
4094 struct dma_pte *pte;
4095
4096 pte = dmar_domain->pgd;
4097 if (dma_pte_present(pte)) {
7a0f06c1 4098 dmar_domain->pgd = phys_to_virt(dma_pte_addr(pte));
7a661013 4099 free_pgtable_page(pte);
a99c47a2
TL
4100 }
4101 dmar_domain->agaw--;
4102 }
fe40f1e0 4103
8cc3759a
LB
4104 return 0;
4105}
4106
4107static int intel_iommu_attach_device(struct iommu_domain *domain,
4108 struct device *dev)
4109{
b1cf1563 4110 struct device_domain_info *info = dev_iommu_priv_get(dev);
8cc3759a
LB
4111 int ret;
4112
b1cf1563
LB
4113 if (info->domain)
4114 device_block_translation(dev);
8cc3759a
LB
4115
4116 ret = prepare_domain_attach_device(domain, dev);
4117 if (ret)
4118 return ret;
4119
a8204479 4120 return dmar_domain_attach_device(to_dmar_domain(domain), dev);
faa3d6f5 4121}
c7151a8d 4122
b146a1c9
JR
4123static int intel_iommu_map(struct iommu_domain *domain,
4124 unsigned long iova, phys_addr_t hpa,
781ca2de 4125 size_t size, int iommu_prot, gfp_t gfp)
faa3d6f5 4126{
00a77deb 4127 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
fe40f1e0 4128 u64 max_addr;
dde57a21 4129 int prot = 0;
fe40f1e0 4130
dde57a21
JR
4131 if (iommu_prot & IOMMU_READ)
4132 prot |= DMA_PTE_READ;
4133 if (iommu_prot & IOMMU_WRITE)
4134 prot |= DMA_PTE_WRITE;
fc0051cb 4135 if (dmar_domain->set_pte_snp)
9cf06697 4136 prot |= DMA_PTE_SNP;
dde57a21 4137
163cc52c 4138 max_addr = iova + size;
dde57a21 4139 if (dmar_domain->max_addr < max_addr) {
fe40f1e0
WH
4140 u64 end;
4141
4142 /* check if minimum agaw is sufficient for mapped address */
8954da1f 4143 end = __DOMAIN_MAX_ADDR(dmar_domain->gaw) + 1;
fe40f1e0 4144 if (end < max_addr) {
9f10e5bf 4145 pr_err("%s: iommu width (%d) is not "
fe40f1e0 4146 "sufficient for the mapped address (%llx)\n",
8954da1f 4147 __func__, dmar_domain->gaw, max_addr);
fe40f1e0
WH
4148 return -EFAULT;
4149 }
dde57a21 4150 dmar_domain->max_addr = max_addr;
fe40f1e0 4151 }
ad051221
DW
4152 /* Round up size to next multiple of PAGE_SIZE, if it and
4153 the low bits of hpa would take us onto the next page */
88cb6a74 4154 size = aligned_nrpages(hpa, size);
933fcd01 4155 return __domain_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT,
2d4d7676 4156 hpa >> VTD_PAGE_SHIFT, size, prot, gfp);
38717946 4157}
38717946 4158
3f34f125
LB
4159static int intel_iommu_map_pages(struct iommu_domain *domain,
4160 unsigned long iova, phys_addr_t paddr,
4161 size_t pgsize, size_t pgcount,
4162 int prot, gfp_t gfp, size_t *mapped)
4163{
4164 unsigned long pgshift = __ffs(pgsize);
4165 size_t size = pgcount << pgshift;
4166 int ret;
4167
4168 if (pgsize != SZ_4K && pgsize != SZ_2M && pgsize != SZ_1G)
4169 return -EINVAL;
4170
4171 if (!IS_ALIGNED(iova | paddr, pgsize))
4172 return -EINVAL;
4173
4174 ret = intel_iommu_map(domain, iova, paddr, size, prot, gfp);
4175 if (!ret && mapped)
4176 *mapped = size;
4177
4178 return ret;
4179}
4180
5009065d 4181static size_t intel_iommu_unmap(struct iommu_domain *domain,
56f8af5e
WD
4182 unsigned long iova, size_t size,
4183 struct iommu_iotlb_gather *gather)
38717946 4184{
00a77deb 4185 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
ea8ea460 4186 unsigned long start_pfn, last_pfn;
2a2b8eaa 4187 int level = 0;
5cf0a76f
DW
4188
4189 /* Cope with horrid API which requires us to unmap more than the
4190 size argument if it happens to be a large-page mapping. */
cbf2f9e8
TZ
4191 if (unlikely(!pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT,
4192 &level, GFP_ATOMIC)))
4193 return 0;
5cf0a76f
DW
4194
4195 if (size < VTD_PAGE_SIZE << level_to_offset_bits(level))
4196 size = VTD_PAGE_SIZE << level_to_offset_bits(level);
4b99d352 4197
ea8ea460
DW
4198 start_pfn = iova >> VTD_PAGE_SHIFT;
4199 last_pfn = (iova + size - 1) >> VTD_PAGE_SHIFT;
4200
87f60cc6 4201 domain_unmap(dmar_domain, start_pfn, last_pfn, &gather->freelist);
fe40f1e0 4202
163cc52c
DW
4203 if (dmar_domain->max_addr == iova + size)
4204 dmar_domain->max_addr = iova;
b146a1c9 4205
16a75bbe
JP
4206 /*
4207 * We do not use page-selective IOTLB invalidation in flush queue,
4208 * so there is no need to track page and sync iotlb.
4209 */
4210 if (!iommu_iotlb_gather_queued(gather))
4211 iommu_iotlb_gather_add_page(domain, gather, iova, size);
2a2b8eaa 4212
5cf0a76f 4213 return size;
38717946 4214}
38717946 4215
3f34f125
LB
4216static size_t intel_iommu_unmap_pages(struct iommu_domain *domain,
4217 unsigned long iova,
4218 size_t pgsize, size_t pgcount,
4219 struct iommu_iotlb_gather *gather)
4220{
4221 unsigned long pgshift = __ffs(pgsize);
4222 size_t size = pgcount << pgshift;
4223
4224 return intel_iommu_unmap(domain, iova, size, gather);
4225}
4226
2a2b8eaa
TM
4227static void intel_iommu_tlb_sync(struct iommu_domain *domain,
4228 struct iommu_iotlb_gather *gather)
4229{
4230 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4231 unsigned long iova_pfn = IOVA_PFN(gather->start);
4232 size_t size = gather->end - gather->start;
ba949f4c 4233 struct iommu_domain_info *info;
405a43cc 4234 unsigned long start_pfn;
2a2b8eaa 4235 unsigned long nrpages;
ba949f4c 4236 unsigned long i;
2a2b8eaa
TM
4237
4238 nrpages = aligned_nrpages(gather->start, size);
fb5f50a4 4239 start_pfn = mm_to_dma_pfn_start(iova_pfn);
2a2b8eaa 4240
ba949f4c
LB
4241 xa_for_each(&dmar_domain->iommu_array, i, info)
4242 iommu_flush_iotlb_psi(info->iommu, dmar_domain,
87f60cc6
MWO
4243 start_pfn, nrpages,
4244 list_empty(&gather->freelist), 0);
2a2b8eaa 4245
87f60cc6 4246 put_pages_list(&gather->freelist);
2a2b8eaa
TM
4247}
4248
d14d6577 4249static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
bb5547ac 4250 dma_addr_t iova)
38717946 4251{
00a77deb 4252 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
38717946 4253 struct dma_pte *pte;
5cf0a76f 4254 int level = 0;
faa3d6f5 4255 u64 phys = 0;
38717946 4256
2d4d7676
JG
4257 pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level,
4258 GFP_ATOMIC);
77a1bce8
YH
4259 if (pte && dma_pte_present(pte))
4260 phys = dma_pte_addr(pte) +
4261 (iova & (BIT_MASK(level_to_offset_bits(level) +
4262 VTD_PAGE_SHIFT) - 1));
38717946 4263
faa3d6f5 4264 return phys;
38717946 4265}
a8bcbb0d 4266
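/*
 * This backs the generic iommu_iova_to_phys() helper; a caller holding
 * an attached domain would typically do something like (illustrative
 * only):
 *
 *	phys_addr_t phys = iommu_iova_to_phys(domain, iova);
 *	if (!phys)
 *		;	// iova is not mapped in this domain
 */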
fc0051cb
LB
4267static bool domain_support_force_snooping(struct dmar_domain *domain)
4268{
4269 struct device_domain_info *info;
4270 bool support = true;
4271
5eaafdf0 4272 assert_spin_locked(&domain->lock);
fc0051cb
LB
4273 list_for_each_entry(info, &domain->devices, link) {
4274 if (!ecap_sc_support(info->iommu->ecap)) {
4275 support = false;
4276 break;
4277 }
4278 }
4279
4280 return support;
4281}
4282
4283static void domain_set_force_snooping(struct dmar_domain *domain)
4284{
4285 struct device_domain_info *info;
4286
5eaafdf0 4287 assert_spin_locked(&domain->lock);
fc0051cb
LB
4288 /*
4289 * Second level page table supports per-PTE snoop control. The
4290 * iommu_map() interface will handle this by setting SNP bit.
4291 */
e5b0feb4 4292 if (!domain->use_first_level) {
fc0051cb
LB
4293 domain->set_pte_snp = true;
4294 return;
4295 }
4296
4297 list_for_each_entry(info, &domain->devices, link)
4298 intel_pasid_setup_page_snoop_control(info->iommu, info->dev,
42987801 4299 IOMMU_NO_PASID);
fc0051cb
LB
4300}
4301
6043257b
JG
4302static bool intel_iommu_enforce_cache_coherency(struct iommu_domain *domain)
4303{
4304 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
a349ffcb 4305 unsigned long flags;
6043257b 4306
fc0051cb
LB
4307 if (dmar_domain->force_snooping)
4308 return true;
4309
a349ffcb 4310 spin_lock_irqsave(&dmar_domain->lock, flags);
fc0051cb 4311 if (!domain_support_force_snooping(dmar_domain)) {
a349ffcb 4312 spin_unlock_irqrestore(&dmar_domain->lock, flags);
6043257b 4313 return false;
fc0051cb
LB
4314 }
4315
4316 domain_set_force_snooping(dmar_domain);
6043257b 4317 dmar_domain->force_snooping = true;
a349ffcb 4318 spin_unlock_irqrestore(&dmar_domain->lock, flags);
fc0051cb 4319
6043257b
JG
4320 return true;
4321}
4322
359ad157 4323static bool intel_iommu_capable(struct device *dev, enum iommu_cap cap)
dbb9fd86 4324{
4989764d
JG
4325 struct device_domain_info *info = dev_iommu_priv_get(dev);
4326
4327 switch (cap) {
4328 case IOMMU_CAP_CACHE_COHERENCY:
4a20ce0f 4329 case IOMMU_CAP_DEFERRED_FLUSH:
f78dc1da 4330 return true;
4989764d 4331 case IOMMU_CAP_PRE_BOOT_PROTECTION:
d0be55fb 4332 return dmar_platform_optin();
4989764d
JG
4333 case IOMMU_CAP_ENFORCE_CACHE_COHERENCY:
4334 return ecap_sc_support(info->iommu->ecap);
4335 default:
4336 return false;
4337 }
dbb9fd86
SY
4338}
4339
e5d1841f 4340static struct iommu_device *intel_iommu_probe_device(struct device *dev)
abdfdde2 4341{
586081d3
LB
4342 struct pci_dev *pdev = dev_is_pci(dev) ? to_pci_dev(dev) : NULL;
4343 struct device_domain_info *info;
a5459cfe 4344 struct intel_iommu *iommu;
586081d3 4345 u8 bus, devfn;
ec62b442 4346 int ret;
70ae6f0d 4347
586081d3 4348 iommu = device_to_iommu(dev, &bus, &devfn);
c919739c 4349 if (!iommu || !iommu->iommu.ops)
e5d1841f 4350 return ERR_PTR(-ENODEV);
a4ff1fc2 4351
586081d3
LB
4352 info = kzalloc(sizeof(*info), GFP_KERNEL);
4353 if (!info)
4354 return ERR_PTR(-ENOMEM);
4355
4356 if (dev_is_real_dma_subdevice(dev)) {
4357 info->bus = pdev->bus->number;
4358 info->devfn = pdev->devfn;
4359 info->segment = pci_domain_nr(pdev->bus);
4360 } else {
4361 info->bus = bus;
4362 info->devfn = devfn;
4363 info->segment = iommu->segment;
4364 }
4365
4366 info->dev = dev;
4367 info->iommu = iommu;
4368 if (dev_is_pci(dev)) {
4369 if (ecap_dev_iotlb_support(iommu->ecap) &&
4370 pci_ats_supported(pdev) &&
e65a6897 4371 dmar_ats_supported(pdev, iommu)) {
586081d3 4372 info->ats_supported = 1;
e65a6897 4373 info->dtlb_extra_inval = dev_needs_extra_dtlb_flush(pdev);
5ae40080
LB
4374
4375 /*
4376 * For an IOMMU that supports device-IOTLB throttling
4377 * (DIT), we assign the PFSID in a VF's invalidation
4378 * descriptors so that the IOMMU hardware can gauge queue
4379 * depth at the PF level. If DIT is not supported, the
4380 * PFSID field is treated as reserved and must be set to 0.
4381 */
4382 if (ecap_dit(iommu->ecap))
4383 info->pfsid = pci_dev_id(pci_physfn(pdev));
4384 info->ats_qdep = pci_ats_queue_depth(pdev);
e65a6897 4385 }
586081d3
LB
4386 if (sm_supported(iommu)) {
4387 if (pasid_supported(iommu)) {
4388 int features = pci_pasid_features(pdev);
4389
4390 if (features >= 0)
4391 info->pasid_supported = features | 1;
4392 }
4393
4394 if (info->ats_supported && ecap_prs(iommu->ecap) &&
4395 pci_pri_supported(pdev))
4396 info->pri_supported = 1;
4397 }
4398 }
4399
586081d3 4400 dev_iommu_priv_set(dev, info);
8af46c78 4401
ec62b442
LB
4402 if (sm_supported(iommu) && !dev_is_real_dma_subdevice(dev)) {
4403 ret = intel_pasid_alloc_table(dev);
4404 if (ret) {
4405 dev_err(dev, "PASID table allocation failed\n");
4406 dev_iommu_priv_set(dev, NULL);
4407 kfree(info);
4408 return ERR_PTR(ret);
4409 }
4410 }
4411
e5d1841f 4412 return &iommu->iommu;
abdfdde2 4413}
70ae6f0d 4414
e5d1841f 4415static void intel_iommu_release_device(struct device *dev)
abdfdde2 4416{
586081d3 4417 struct device_domain_info *info = dev_iommu_priv_get(dev);
a5459cfe 4418
458b7c8e 4419 dmar_remove_one_dev_info(dev);
ec62b442 4420 intel_pasid_free_table(dev);
586081d3 4421 dev_iommu_priv_set(dev, NULL);
586081d3 4422 kfree(info);
6fc7020c
LB
4423 set_dma_ops(dev, NULL);
4424}
a5459cfe 4425
6fc7020c
LB
4426static void intel_iommu_probe_finalize(struct device *dev)
4427{
78ca0784
RM
4428 set_dma_ops(dev, NULL);
4429 iommu_setup_dma_ops(dev, 0, U64_MAX);
70ae6f0d
AW
4430}
4431
0659b8dc
EA
4432static void intel_iommu_get_resv_regions(struct device *device,
4433 struct list_head *head)
4434{
5f64ce54 4435 int prot = DMA_PTE_READ | DMA_PTE_WRITE;
0659b8dc
EA
4436 struct iommu_resv_region *reg;
4437 struct dmar_rmrr_unit *rmrr;
4438 struct device *i_dev;
4439 int i;
4440
bf638a65 4441 rcu_read_lock();
0659b8dc
EA
4442 for_each_rmrr_units(rmrr) {
4443 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
4444 i, i_dev) {
5f64ce54 4445 struct iommu_resv_region *resv;
1c5c59fb 4446 enum iommu_resv_type type;
5f64ce54
EA
4447 size_t length;
4448
3855ba2d
EA
4449 if (i_dev != device &&
4450 !is_downstream_to_pci_bridge(device, i_dev))
0659b8dc
EA
4451 continue;
4452
5f64ce54 4453 length = rmrr->end_address - rmrr->base_address + 1;
1c5c59fb
EA
4454
4455 type = device_rmrr_is_relaxable(device) ?
4456 IOMMU_RESV_DIRECT_RELAXABLE : IOMMU_RESV_DIRECT;
4457
5f64ce54 4458 resv = iommu_alloc_resv_region(rmrr->base_address,
0251d010 4459 length, prot, type,
bf638a65 4460 GFP_ATOMIC);
5f64ce54
EA
4461 if (!resv)
4462 break;
4463
4464 list_add_tail(&resv->list, head);
0659b8dc
EA
4465 }
4466 }
bf638a65 4467 rcu_read_unlock();
0659b8dc 4468
d850c2ee
LB
4469#ifdef CONFIG_INTEL_IOMMU_FLOPPY_WA
4470 if (dev_is_pci(device)) {
4471 struct pci_dev *pdev = to_pci_dev(device);
4472
4473 if ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA) {
cde9319e 4474 reg = iommu_alloc_resv_region(0, 1UL << 24, prot,
0251d010
LB
4475 IOMMU_RESV_DIRECT_RELAXABLE,
4476 GFP_KERNEL);
d850c2ee
LB
4477 if (reg)
4478 list_add_tail(&reg->list, head);
4479 }
4480 }
4481#endif /* CONFIG_INTEL_IOMMU_FLOPPY_WA */
4482
0659b8dc
EA
4483 reg = iommu_alloc_resv_region(IOAPIC_RANGE_START,
4484 IOAPIC_RANGE_END - IOAPIC_RANGE_START + 1,
0251d010 4485 0, IOMMU_RESV_MSI, GFP_KERNEL);
0659b8dc
EA
4486 if (!reg)
4487 return;
4488 list_add_tail(&reg->list, head);
4489}
4490
4a350a0e
PS
4491static struct iommu_group *intel_iommu_device_group(struct device *dev)
4492{
4493 if (dev_is_pci(dev))
4494 return pci_device_group(dev);
4495 return generic_device_group(dev);
4496}
4497
4c82b886
LB
4498static int intel_iommu_enable_sva(struct device *dev)
4499{
586081d3 4500 struct device_domain_info *info = dev_iommu_priv_get(dev);
934ed458 4501 struct intel_iommu *iommu;
4c82b886 4502
934ed458
CIK
4503 if (!info || dmar_disabled)
4504 return -EINVAL;
4505
4506 iommu = info->iommu;
4507 if (!iommu)
4c82b886
LB
4508 return -EINVAL;
4509
4510 if (!(iommu->flags & VTD_FLAG_SVM_CAPABLE))
4511 return -ENODEV;
4512
a86fb771 4513 if (!info->pasid_enabled || !info->ats_enabled)
4c82b886
LB
4514 return -EINVAL;
4515
a86fb771
LB
4516 /*
4517 * Devices with device-specific I/O fault handling should not
4518 * support PCI PRI. The IOMMU side has no means to check for
4519 * device-specific IOPF capability. Therefore, the IOMMU can only
4520 * assume that if the device driver enables SVA on a non-PRI
4521 * device, the device will handle IOPF in its own way.
4522 */
4523 if (!info->pri_supported)
4524 return 0;
60b1daa3 4525
a86fb771
LB
4526 /* Devices supporting PRI should have it enabled. */
4527 if (!info->pri_enabled)
4c82b886 4528 return -EINVAL;
d5b9e4bf 4529
3d4c7cc3 4530 return 0;
4c82b886
LB
4531}
4532
3d4c7cc3 4533static int intel_iommu_enable_iopf(struct device *dev)
4c82b886 4534{
fbcde5bb 4535 struct pci_dev *pdev = dev_is_pci(dev) ? to_pci_dev(dev) : NULL;
586081d3 4536 struct device_domain_info *info = dev_iommu_priv_get(dev);
3d4c7cc3 4537 struct intel_iommu *iommu;
d5b9e4bf
LB
4538 int ret;
4539
fbcde5bb 4540 if (!pdev || !info || !info->ats_enabled || !info->pri_supported)
3d4c7cc3 4541 return -ENODEV;
fbcde5bb
LB
4542
4543 if (info->pri_enabled)
4544 return -EBUSY;
4545
3d4c7cc3
LB
4546 iommu = info->iommu;
4547 if (!iommu)
4548 return -EINVAL;
4549
fbcde5bb
LB
4550 /* PASID is required in PRG Response Message. */
4551 if (info->pasid_enabled && !pci_prg_resp_pasid_required(pdev))
4552 return -EINVAL;
4553
4554 ret = pci_reset_pri(pdev);
4555 if (ret)
4556 return ret;
4557
d5b9e4bf 4558 ret = iopf_queue_add_device(iommu->iopf_queue, dev);
60b1daa3
LB
4559 if (ret)
4560 return ret;
4561
60b1daa3
LB
4562 ret = iommu_register_device_fault_handler(dev, iommu_queue_iopf, dev);
4563 if (ret)
fbcde5bb
LB
4564 goto iopf_remove_device;
4565
4566 ret = pci_enable_pri(pdev, PRQ_DEPTH);
60b1daa3 4567 if (ret)
fbcde5bb
LB
4568 goto iopf_unregister_handler;
4569 info->pri_enabled = 1;
4570
4571 return 0;
4572
4573iopf_unregister_handler:
4574 iommu_unregister_device_fault_handler(dev);
4575iopf_remove_device:
4576 iopf_queue_remove_device(iommu->iopf_queue, dev);
4c82b886 4577
d5b9e4bf 4578 return ret;
4c82b886
LB
4579}
4580
3d4c7cc3 4581static int intel_iommu_disable_iopf(struct device *dev)
95587a75 4582{
586081d3 4583 struct device_domain_info *info = dev_iommu_priv_get(dev);
4c82b886 4584 struct intel_iommu *iommu = info->iommu;
9003351c 4585
fbcde5bb
LB
4586 if (!info->pri_enabled)
4587 return -EINVAL;
4588
4589 /*
4590 * The PCIe spec states that by clearing the PRI enable bit, the
4591 * Page Request Interface will not issue new page requests, but may
4592 * still have outstanding page requests that have been transmitted
4593 * or are queued for transmission. This is supposed to be called after
4594 * the device driver has stopped DMA, all PASIDs have been
4595 * unbound and the outstanding PRQs have been drained.
4596 */
4597 pci_disable_pri(to_pci_dev(dev));
4598 info->pri_enabled = 0;
76fdd6c5 4599
7b8aa998
LB
4600 /*
4601 * With PRI disabled and outstanding PRQs drained, unregistering
4602 * fault handler and removing device from iopf queue should never
4603 * fail.
4604 */
4605 WARN_ON(iommu_unregister_device_fault_handler(dev));
4606 WARN_ON(iopf_queue_remove_device(iommu->iopf_queue, dev));
4c82b886 4607
7b8aa998 4608 return 0;
95587a75
LB
4609}
4610
4611static int
4612intel_iommu_dev_enable_feat(struct device *dev, enum iommu_dev_features feat)
4613{
4c82b886 4614 switch (feat) {
4c82b886 4615 case IOMMU_DEV_FEAT_IOPF:
94f797ad 4616 return intel_iommu_enable_iopf(dev);
9003351c 4617
4c82b886
LB
4618 case IOMMU_DEV_FEAT_SVA:
4619 return intel_iommu_enable_sva(dev);
76fdd6c5 4620
4c82b886
LB
4621 default:
4622 return -ENODEV;
76fdd6c5 4623 }
95587a75
LB
4624}
4625
4626static int
4627intel_iommu_dev_disable_feat(struct device *dev, enum iommu_dev_features feat)
4628{
4c82b886 4629 switch (feat) {
4c82b886 4630 case IOMMU_DEV_FEAT_IOPF:
3d4c7cc3 4631 return intel_iommu_disable_iopf(dev);
4c82b886
LB
4632
4633 case IOMMU_DEV_FEAT_SVA:
3d4c7cc3 4634 return 0;
4c82b886
LB
4635
4636 default:
4637 return -ENODEV;
4638 }
95587a75
LB
4639}
4640
41bb23e7 4641static bool intel_iommu_is_attach_deferred(struct device *dev)
0e8000f8 4642{
586081d3 4643 struct device_domain_info *info = dev_iommu_priv_get(dev);
0e8000f8 4644
586081d3 4645 return translation_pre_enabled(info->iommu) && !info->domain;
2cd1311a
LB
4646}
4647
67e8a5b1
RJ
4648/*
4649 * Check that the device does not live on an external facing PCI port that is
4650 * marked as untrusted. Such devices should not be able to apply quirks and
4651 * thus not be able to bypass the IOMMU restrictions.
4652 */
4653static bool risky_device(struct pci_dev *pdev)
4654{
4655 if (pdev->untrusted) {
4656 pci_info(pdev,
4657 "Skipping IOMMU quirk for dev [%04X:%04X] on untrusted PCI link\n",
4658 pdev->vendor, pdev->device);
4659 pci_info(pdev, "Please check with your BIOS/Platform vendor about this\n");
4660 return true;
4661 }
4662 return false;
4663}
4664
933fcd01
LB
4665static void intel_iommu_iotlb_sync_map(struct iommu_domain *domain,
4666 unsigned long iova, size_t size)
4667{
4668 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4669 unsigned long pages = aligned_nrpages(iova, size);
4670 unsigned long pfn = iova >> VTD_PAGE_SHIFT;
ba949f4c
LB
4671 struct iommu_domain_info *info;
4672 unsigned long i;
933fcd01 4673
ba949f4c
LB
4674 xa_for_each(&dmar_domain->iommu_array, i, info)
4675 __mapping_notify_one(info->iommu, dmar_domain, pfn, pages);
933fcd01
LB
4676}
4677
eaca8889
LB
4678static void intel_iommu_remove_dev_pasid(struct device *dev, ioasid_t pasid)
4679{
4680 struct intel_iommu *iommu = device_to_iommu(dev, NULL, NULL);
7d0c9da6
LB
4681 struct dev_pasid_info *curr, *dev_pasid = NULL;
4682 struct dmar_domain *dmar_domain;
eaca8889 4683 struct iommu_domain *domain;
7d0c9da6 4684 unsigned long flags;
eaca8889 4685
eaca8889 4686 domain = iommu_get_domain_for_dev_pasid(dev, pasid, 0);
15478623
LB
4687 if (WARN_ON_ONCE(!domain))
4688 goto out_tear_down;
4689
4690 /*
4691 * The SVA implementation needs to handle its own stuffs like the mm
4692 * notification. Before consolidating that code into iommu core, let
4693 * the intel sva code handle it.
4694 */
4695 if (domain->type == IOMMU_DOMAIN_SVA) {
4696 intel_svm_remove_dev_pasid(dev, pasid);
4697 goto out_tear_down;
eaca8889
LB
4698 }
4699
7d0c9da6
LB
4700 dmar_domain = to_dmar_domain(domain);
4701 spin_lock_irqsave(&dmar_domain->lock, flags);
4702 list_for_each_entry(curr, &dmar_domain->dev_pasids, link_domain) {
4703 if (curr->dev == dev && curr->pasid == pasid) {
4704 list_del(&curr->link_domain);
4705 dev_pasid = curr;
eaca8889
LB
4706 break;
4707 }
4708 }
7d0c9da6
LB
4709 WARN_ON_ONCE(!dev_pasid);
4710 spin_unlock_irqrestore(&dmar_domain->lock, flags);
eaca8889 4711
7d0c9da6
LB
4712 domain_detach_iommu(dmar_domain, iommu);
4713 kfree(dev_pasid);
15478623 4714out_tear_down:
eaca8889 4715 intel_pasid_tear_down_entry(iommu, dev, pasid, false);
15478623 4716 intel_drain_pasid_prq(dev, pasid);
eaca8889
LB
4717}
4718
7d0c9da6
LB
4719static int intel_iommu_set_dev_pasid(struct iommu_domain *domain,
4720 struct device *dev, ioasid_t pasid)
4721{
4722 struct device_domain_info *info = dev_iommu_priv_get(dev);
4723 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4724 struct intel_iommu *iommu = info->iommu;
4725 struct dev_pasid_info *dev_pasid;
4726 unsigned long flags;
4727 int ret;
4728
4729 if (!pasid_supported(iommu) || dev_is_real_dma_subdevice(dev))
4730 return -EOPNOTSUPP;
4731
4732 if (context_copied(iommu, info->bus, info->devfn))
4733 return -EBUSY;
4734
4735 ret = prepare_domain_attach_device(domain, dev);
4736 if (ret)
4737 return ret;
4738
4739 dev_pasid = kzalloc(sizeof(*dev_pasid), GFP_KERNEL);
4740 if (!dev_pasid)
4741 return -ENOMEM;
4742
4743 ret = domain_attach_iommu(dmar_domain, iommu);
4744 if (ret)
4745 goto out_free;
4746
4747 if (domain_type_is_si(dmar_domain))
4748 ret = intel_pasid_setup_pass_through(iommu, dmar_domain,
4749 dev, pasid);
4750 else if (dmar_domain->use_first_level)
4751 ret = domain_setup_first_level(iommu, dmar_domain,
4752 dev, pasid);
4753 else
4754 ret = intel_pasid_setup_second_level(iommu, dmar_domain,
4755 dev, pasid);
4756 if (ret)
4757 goto out_detach_iommu;
4758
4759 dev_pasid->dev = dev;
4760 dev_pasid->pasid = pasid;
4761 spin_lock_irqsave(&dmar_domain->lock, flags);
4762 list_add(&dev_pasid->link_domain, &dmar_domain->dev_pasids);
4763 spin_unlock_irqrestore(&dmar_domain->lock, flags);
4764
4765 return 0;
4766out_detach_iommu:
4767 domain_detach_iommu(dmar_domain, iommu);
4768out_free:
4769 kfree(dev_pasid);
4770 return ret;
eaca8889
LB
4771}
4772
55243393
YL
4773static void *intel_iommu_hw_info(struct device *dev, u32 *length, u32 *type)
4774{
4775 struct device_domain_info *info = dev_iommu_priv_get(dev);
4776 struct intel_iommu *iommu = info->iommu;
4777 struct iommu_hw_info_vtd *vtd;
4778
4779 vtd = kzalloc(sizeof(*vtd), GFP_KERNEL);
4780 if (!vtd)
4781 return ERR_PTR(-ENOMEM);
4782
4783 vtd->cap_reg = iommu->cap;
4784 vtd->ecap_reg = iommu->ecap;
4785 *length = sizeof(*vtd);
4786 *type = IOMMU_HW_INFO_TYPE_INTEL_VTD;
4787 return vtd;
4788}
4789
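/*
 * The hw_info op above is consumed by iommufd's IOMMU_GET_HW_INFO
 * path: userspace receives the raw CAP/ECAP register values tagged as
 * IOMMU_HW_INFO_TYPE_INTEL_VTD.
 */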
b0119e87 4790const struct iommu_ops intel_iommu_ops = {
0659b8dc 4791 .capable = intel_iommu_capable,
55243393 4792 .hw_info = intel_iommu_hw_info,
0659b8dc 4793 .domain_alloc = intel_iommu_domain_alloc,
e5d1841f 4794 .probe_device = intel_iommu_probe_device,
6fc7020c 4795 .probe_finalize = intel_iommu_probe_finalize,
e5d1841f 4796 .release_device = intel_iommu_release_device,
0659b8dc 4797 .get_resv_regions = intel_iommu_get_resv_regions,
4a350a0e 4798 .device_group = intel_iommu_device_group,
95587a75
LB
4799 .dev_enable_feat = intel_iommu_dev_enable_feat,
4800 .dev_disable_feat = intel_iommu_dev_disable_feat,
8af46c78 4801 .is_attach_deferred = intel_iommu_is_attach_deferred,
7039d11b 4802 .def_domain_type = device_def_domain_type,
eaca8889 4803 .remove_dev_pasid = intel_iommu_remove_dev_pasid,
a886d5a7 4804 .pgsize_bitmap = SZ_4K,
56722a43 4805#ifdef CONFIG_INTEL_IOMMU_SVM
8b737121 4806 .page_response = intel_svm_page_response,
56722a43 4807#endif
9a630a4b
LB
4808 .default_domain_ops = &(const struct iommu_domain_ops) {
4809 .attach_dev = intel_iommu_attach_device,
7d0c9da6 4810 .set_dev_pasid = intel_iommu_set_dev_pasid,
9a630a4b
LB
4811 .map_pages = intel_iommu_map_pages,
4812 .unmap_pages = intel_iommu_unmap_pages,
4813 .iotlb_sync_map = intel_iommu_iotlb_sync_map,
4814 .flush_iotlb_all = intel_flush_iotlb_all,
4815 .iotlb_sync = intel_iommu_tlb_sync,
4816 .iova_to_phys = intel_iommu_iova_to_phys,
4817 .free = intel_iommu_domain_free,
6043257b 4818 .enforce_cache_coherency = intel_iommu_enforce_cache_coherency,
9a630a4b 4819 }
a8bcbb0d 4820};
9af88143 4821
1f76249c 4822static void quirk_iommu_igfx(struct pci_dev *dev)
9452618e 4823{
67e8a5b1
RJ
4824 if (risky_device(dev))
4825 return;
4826
932a6523 4827 pci_info(dev, "Disabling IOMMU for graphics on this chipset\n");
9452618e
DV
4828 dmar_map_gfx = 0;
4829}
4830
1f76249c
CW
4831/* G4x/GM45 integrated gfx dmar support is totally busted. */
4832DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_igfx);
4833DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_igfx);
4834DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_igfx);
4835DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_igfx);
4836DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_igfx);
4837DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_igfx);
4838DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_igfx);
4839
4840/* Broadwell igfx malfunctions with dmar */
4841DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1606, quirk_iommu_igfx);
4842DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160B, quirk_iommu_igfx);
4843DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160E, quirk_iommu_igfx);
4844DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1602, quirk_iommu_igfx);
4845DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160A, quirk_iommu_igfx);
4846DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160D, quirk_iommu_igfx);
4847DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1616, quirk_iommu_igfx);
4848DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161B, quirk_iommu_igfx);
4849DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161E, quirk_iommu_igfx);
4850DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1612, quirk_iommu_igfx);
4851DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161A, quirk_iommu_igfx);
4852DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161D, quirk_iommu_igfx);
4853DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1626, quirk_iommu_igfx);
4854DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162B, quirk_iommu_igfx);
4855DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162E, quirk_iommu_igfx);
4856DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1622, quirk_iommu_igfx);
4857DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162A, quirk_iommu_igfx);
4858DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162D, quirk_iommu_igfx);
4859DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1636, quirk_iommu_igfx);
4860DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163B, quirk_iommu_igfx);
4861DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163E, quirk_iommu_igfx);
4862DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1632, quirk_iommu_igfx);
4863DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163A, quirk_iommu_igfx);
4864DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163D, quirk_iommu_igfx);
9452618e 4865
d34d6517 4866static void quirk_iommu_rwbf(struct pci_dev *dev)
9af88143 4867{
67e8a5b1
RJ
4868 if (risky_device(dev))
4869 return;
4870
9af88143
DW
4871 /*
4872 * Mobile 4 Series Chipset neglects to set RWBF capability,
210561ff 4873 * but needs it. Same seems to hold for the desktop versions.
9af88143 4874 */
932a6523 4875 pci_info(dev, "Forcing write-buffer flush capability\n");
9af88143
DW
4876 rwbf_quirk = 1;
4877}
4878
4879DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf);
210561ff
DV
4880DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_rwbf);
4881DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_rwbf);
4882DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_rwbf);
4883DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_rwbf);
4884DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_rwbf);
4885DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_rwbf);
e0fc7e0b 4886
eecfd57f
AJ
4887#define GGC 0x52
4888#define GGC_MEMORY_SIZE_MASK (0xf << 8)
4889#define GGC_MEMORY_SIZE_NONE (0x0 << 8)
4890#define GGC_MEMORY_SIZE_1M (0x1 << 8)
4891#define GGC_MEMORY_SIZE_2M (0x3 << 8)
4892#define GGC_MEMORY_VT_ENABLED (0x8 << 8)
4893#define GGC_MEMORY_SIZE_2M_VT (0x9 << 8)
4894#define GGC_MEMORY_SIZE_3M_VT (0xa << 8)
4895#define GGC_MEMORY_SIZE_4M_VT (0xb << 8)
4896
d34d6517 4897static void quirk_calpella_no_shadow_gtt(struct pci_dev *dev)
9eecabcb
DW
4898{
4899 unsigned short ggc;
4900
67e8a5b1
RJ
4901 if (risky_device(dev))
4902 return;
4903
eecfd57f 4904 if (pci_read_config_word(dev, GGC, &ggc))
9eecabcb
DW
4905 return;
4906
eecfd57f 4907 if (!(ggc & GGC_MEMORY_VT_ENABLED)) {
932a6523 4908 pci_info(dev, "BIOS has allocated no shadow GTT; disabling IOMMU for graphics\n");
9eecabcb 4909 dmar_map_gfx = 0;
6fbcfb3e
DW
4910 } else if (dmar_map_gfx) {
4911 /* we have to ensure the gfx device is idle before we flush */
932a6523 4912 pci_info(dev, "Disabling batched IOTLB flush on Ironlake\n");
308723e3 4913 iommu_set_dma_strict();
d0e108b8 4914 }
9eecabcb
DW
4915}
4916DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0040, quirk_calpella_no_shadow_gtt);
4917DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0044, quirk_calpella_no_shadow_gtt);
4918DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0062, quirk_calpella_no_shadow_gtt);
4919DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x006a, quirk_calpella_no_shadow_gtt);
4920
b1012ca8
LB
4921static void quirk_igfx_skip_te_disable(struct pci_dev *dev)
4922{
4923 unsigned short ver;
4924
4925 if (!IS_GFX_DEVICE(dev))
4926 return;
4927
4928 ver = (dev->device >> 8) & 0xff;
4929 if (ver != 0x45 && ver != 0x46 && ver != 0x4c &&
4930 ver != 0x4e && ver != 0x8a && ver != 0x98 &&
0a967f5b 4931 ver != 0x9a && ver != 0xa7)
b1012ca8
LB
4932 return;
4933
4934 if (risky_device(dev))
4935 return;
4936
4937 pci_info(dev, "Skip IOMMU disabling for graphics\n");
4938 iommu_skip_te_disable = 1;
4939}
4940DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_ANY_ID, quirk_igfx_skip_te_disable);
4941
e0fc7e0b
DW
4942/* On Tylersburg chipsets, some BIOSes have been known to enable the
 4943 * ISOCH DMAR unit for the Azalia sound device, but not give it any
 4944 * TLB entries, which causes it to deadlock. Check for that. We do
 4945 * this in a function called from init_dmars(), instead of in a PCI
 4946 * quirk, because we don't want to print the obnoxious "BIOS broken"
 4947 * message if VT-d is actually disabled.
 4948 */
4949static void __init check_tylersburg_isoch(void)
4950{
4951 struct pci_dev *pdev;
4952 uint32_t vtisochctrl;
4953
4954 /* If there's no Azalia in the system anyway, forget it. */
4955 pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3a3e, NULL);
4956 if (!pdev)
4957 return;
67e8a5b1
RJ
4958
4959 if (risky_device(pdev)) {
4960 pci_dev_put(pdev);
4961 return;
4962 }
4963
e0fc7e0b
DW
4964 pci_dev_put(pdev);
4965
4966 /* System Management Registers. Might be hidden, in which case
4967 we can't do the sanity check. But that's OK, because the
4968 known-broken BIOSes _don't_ actually hide it, so far. */
4969 pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x342e, NULL);
4970 if (!pdev)
4971 return;
4972
67e8a5b1
RJ
4973 if (risky_device(pdev)) {
4974 pci_dev_put(pdev);
4975 return;
4976 }
4977
e0fc7e0b
DW
4978 if (pci_read_config_dword(pdev, 0x188, &vtisochctrl)) {
4979 pci_dev_put(pdev);
4980 return;
4981 }
4982
4983 pci_dev_put(pdev);
4984
4985 /* If Azalia DMA is routed to the non-isoch DMAR unit, fine. */
4986 if (vtisochctrl & 1)
4987 return;
4988
4989 /* Drop all bits other than the number of TLB entries */
4990 vtisochctrl &= 0x1c;
4991
4992 /* If we have the recommended number of TLB entries (16), fine. */
4993 if (vtisochctrl == 0x10)
4994 return;
4995
4996 /* Zero TLB entries? You get to ride the short bus to school. */
4997 if (!vtisochctrl) {
4998 WARN(1, "Your BIOS is broken; DMA routed to ISOCH DMAR unit but no TLB space.\n"
4999 "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
5000 dmi_get_system_info(DMI_BIOS_VENDOR),
5001 dmi_get_system_info(DMI_BIOS_VERSION),
5002 dmi_get_system_info(DMI_PRODUCT_VERSION));
5003 iommu_identity_mapping |= IDENTMAP_AZALIA;
5004 return;
5005 }
9f10e5bf
JR
5006
5007 pr_warn("Recommended TLB entries for ISOCH unit is 16; your BIOS set %d\n",
e0fc7e0b
DW
5008 vtisochctrl);
5009}
e65a6897
JP
5010
5011/*
5012 * Here we deal with a device-TLB defect where the device may inadvertently
5013 * issue an ATS invalidation completion before posted writes initiated with a
5014 * translated address that used translations matching the invalidation address
5015 * range, violating the invalidation-completion ordering.
5016 * Therefore, any use case that cannot guarantee DMA is stopped before unmap is
5017 * vulnerable to this defect. In other words, any device-TLB invalidation that
5018 * is not initiated under the control of the trusted/privileged host device
5019 * driver must use this quirk.
5020 * Device TLBs are invalidated under the following six conditions:
5021 * 1. Device driver does DMA API unmap IOVA
5022 * 2. Device driver unbind a PASID from a process, sva_unbind_device()
5023 * 3. PASID is torn down, after PASID cache is flushed. e.g. process
5024 * exit_mmap() due to crash
5025 * 4. Under SVA usage, called by mmu_notifier.invalidate_range() where
5026 * VM has to free pages that were unmapped
5027 * 5. Userspace driver unmaps a DMA buffer
5028 * 6. Cache invalidation in vSVA usage (upcoming)
5029 *
5030 * For #1 and #2, device drivers are responsible for stopping DMA traffic
5031 * before unmap/unbind. For #3, iommu driver gets mmu_notifier to
5032 * invalidate TLB the same way as normal user unmap which will use this quirk.
5033 * The dTLB invalidation after PASID cache flush does not need this quirk.
5034 *
5035 * As a reminder, #6 will *NEED* this quirk as we enable nested translation.
5036 */
5037void quirk_extra_dev_tlb_flush(struct device_domain_info *info,
5038 unsigned long address, unsigned long mask,
5039 u32 pasid, u16 qdep)
5040{
5041 u16 sid;
5042
5043 if (likely(!info->dtlb_extra_inval))
5044 return;
5045
5046 sid = PCI_DEVID(info->bus, info->devfn);
42987801 5047 if (pasid == IOMMU_NO_PASID) {
e65a6897
JP
5048 qi_flush_dev_iotlb(info->iommu, sid, info->pfsid,
5049 qdep, address, mask);
5050 } else {
5051 qi_flush_dev_iotlb_pasid(info->iommu, sid, info->pfsid,
5052 pasid, qdep, address, mask);
5053 }
5054}
dc578758
KL
5055
5056#define ecmd_get_status_code(res) (((res) & 0xff) >> 1)
5057
5058/*
5059 * Function to submit a command to the enhanced command interface. The
5060 * valid enhanced command descriptions are defined in Table 47 of the
5061 * VT-d spec. The VT-d hardware implementation may support some but not
5062 * all commands, which can be determined by checking the Enhanced
5063 * Command Capability Register.
5064 *
5065 * Return values:
5066 * - 0: Command successful without any error;
5067 * - Negative: software error value;
5068 * - Nonzero positive: failure status code defined in Table 48.
5069 */
5070int ecmd_submit_sync(struct intel_iommu *iommu, u8 ecmd, u64 oa, u64 ob)
5071{
5072 unsigned long flags;
5073 u64 res;
5074 int ret;
5075
5076 if (!cap_ecmds(iommu->cap))
5077 return -ENODEV;
5078
5079 raw_spin_lock_irqsave(&iommu->register_lock, flags);
5080
5081 res = dmar_readq(iommu->reg + DMAR_ECRSP_REG);
5082 if (res & DMA_ECMD_ECRSP_IP) {
5083 ret = -EBUSY;
5084 goto err;
5085 }
5086
5087 /*
5088 * Unconditionally write the operand B, because
5089 * - There is no side effect if an ecmd doesn't require an
5090 * operand B, but we set the register to some value.
5091 * - It's not invoked in any critical path. The extra MMIO
5092 * write doesn't bring any performance concerns.
5093 */
5094 dmar_writeq(iommu->reg + DMAR_ECEO_REG, ob);
5095 dmar_writeq(iommu->reg + DMAR_ECMD_REG, ecmd | (oa << DMA_ECMD_OA_SHIFT));
5096
5097 IOMMU_WAIT_OP(iommu, DMAR_ECRSP_REG, dmar_readq,
5098 !(res & DMA_ECMD_ECRSP_IP), res);
5099
5100 if (res & DMA_ECMD_ECRSP_IP) {
5101 ret = -ETIMEDOUT;
5102 goto err;
5103 }
5104
5105 ret = ecmd_get_status_code(res);
5106err:
5107 raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
5108
5109 return ret;
5110}
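/*
 * Usage sketch (hypothetical caller; the ecmd opcode and operand are
 * illustrative only, real values come from the VT-d spec):
 *
 *	ret = ecmd_submit_sync(iommu, ecmd, oa, 0);
 *	if (ret < 0)
 *		pr_err("ecmd submission error: %d\n", ret);
 *	else if (ret)
 *		pr_err("ecmd failed, status code 0x%x\n", ret);
 */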