Commit | Line | Data |
---|---|---|
73fa0d10 AW |
1 | /* |
2 | * VFIO: IOMMU DMA mapping support for Type1 IOMMU | |
3 | * | |
4 | * Copyright (C) 2012 Red Hat, Inc. All rights reserved. | |
5 | * Author: Alex Williamson <alex.williamson@redhat.com> | |
6 | * | |
7 | * This program is free software; you can redistribute it and/or modify | |
8 | * it under the terms of the GNU General Public License version 2 as | |
9 | * published by the Free Software Foundation. | |
10 | * | |
11 | * Derived from original vfio: | |
12 | * Copyright 2010 Cisco Systems, Inc. All rights reserved. | |
13 | * Author: Tom Lyon, pugs@cisco.com | |
14 | * | |
15 | * We arbitrarily define a Type1 IOMMU as one matching the below code. | |
16 | * It could be called the x86 IOMMU as it's designed for AMD-Vi & Intel | |
17 | * VT-d, but that makes it harder to re-use as theoretically anyone | |
18 | * implementing a similar IOMMU could make use of this. We expect the | |
19 | * IOMMU to support the IOMMU API and have few to no restrictions around | |
20 | * the IOVA range that can be mapped. The Type1 IOMMU is currently | |
21 | * optimized for relatively static mappings of a userspace process with | |
22 | * userspace pages pinned into memory. We also assume devices and IOMMU | |
23 | * domains are PCI based as the IOMMU API is still centered around a | |
24 | * device/bus interface rather than a group interface. | |
25 | */ | |
26 | ||
27 | #include <linux/compat.h> | |
28 | #include <linux/device.h> | |
29 | #include <linux/fs.h> | |
30 | #include <linux/iommu.h> | |
31 | #include <linux/module.h> | |
32 | #include <linux/mm.h> | |
33 | #include <linux/pci.h> /* pci_bus_type */ | |
cd9b2268 | 34 | #include <linux/rbtree.h> |
73fa0d10 AW |
35 | #include <linux/sched.h> |
36 | #include <linux/slab.h> | |
37 | #include <linux/uaccess.h> | |
38 | #include <linux/vfio.h> | |
39 | #include <linux/workqueue.h> | |
40 | ||
41 | #define DRIVER_VERSION "0.2" | |
42 | #define DRIVER_AUTHOR "Alex Williamson <alex.williamson@redhat.com>" | |
43 | #define DRIVER_DESC "Type1 IOMMU driver for VFIO" | |
44 | ||
45 | static bool allow_unsafe_interrupts; | |
46 | module_param_named(allow_unsafe_interrupts, | |
47 | allow_unsafe_interrupts, bool, S_IRUGO | S_IWUSR); | |
48 | MODULE_PARM_DESC(allow_unsafe_interrupts, | |
49 | "Enable VFIO IOMMU support for on platforms without interrupt remapping support."); | |
50 | ||
5c6c2b21 AW |
51 | static bool disable_hugepages; |
52 | module_param_named(disable_hugepages, | |
53 | disable_hugepages, bool, S_IRUGO | S_IWUSR); | |
54 | MODULE_PARM_DESC(disable_hugepages, | |
55 | "Disable VFIO IOMMU support for IOMMU hugepages."); | |
56 | ||
73fa0d10 AW |
57 | struct vfio_iommu { |
58 | struct iommu_domain *domain; | |
59 | struct mutex lock; | |
cd9b2268 | 60 | struct rb_root dma_list; |
73fa0d10 AW |
61 | struct list_head group_list; |
62 | bool cache; | |
63 | }; | |
64 | ||
65 | struct vfio_dma { | |
cd9b2268 | 66 | struct rb_node node; |
73fa0d10 AW |
67 | dma_addr_t iova; /* Device address */ |
68 | unsigned long vaddr; /* Process virtual addr */ | |
166fd7d9 | 69 | size_t size; /* Map size (bytes) */ |
73fa0d10 AW |
70 | int prot; /* IOMMU_READ/WRITE */ |
71 | }; | |
72 | ||
73 | struct vfio_group { | |
74 | struct iommu_group *iommu_group; | |
75 | struct list_head next; | |
76 | }; | |
77 | ||
78 | /* | |
79 | * This code handles mapping and unmapping of user data buffers | |
80 | * into DMA'ble space using the IOMMU | |
81 | */ | |
82 | ||
cd9b2268 AW |
83 | static struct vfio_dma *vfio_find_dma(struct vfio_iommu *iommu, |
84 | dma_addr_t start, size_t size) | |
85 | { | |
86 | struct rb_node *node = iommu->dma_list.rb_node; | |
87 | ||
88 | while (node) { | |
89 | struct vfio_dma *dma = rb_entry(node, struct vfio_dma, node); | |
90 | ||
91 | if (start + size <= dma->iova) | |
92 | node = node->rb_left; | |
166fd7d9 | 93 | else if (start >= dma->iova + dma->size) |
cd9b2268 AW |
94 | node = node->rb_right; |
95 | else | |
96 | return dma; | |
97 | } | |
98 | ||
99 | return NULL; | |
100 | } | |
101 | ||
102 | static void vfio_insert_dma(struct vfio_iommu *iommu, struct vfio_dma *new) | |
103 | { | |
104 | struct rb_node **link = &iommu->dma_list.rb_node, *parent = NULL; | |
105 | struct vfio_dma *dma; | |
106 | ||
107 | while (*link) { | |
108 | parent = *link; | |
109 | dma = rb_entry(parent, struct vfio_dma, node); | |
110 | ||
166fd7d9 | 111 | if (new->iova + new->size <= dma->iova) |
cd9b2268 AW |
112 | link = &(*link)->rb_left; |
113 | else | |
114 | link = &(*link)->rb_right; | |
115 | } | |
116 | ||
117 | rb_link_node(&new->node, parent, link); | |
118 | rb_insert_color(&new->node, &iommu->dma_list); | |
119 | } | |
120 | ||
121 | static void vfio_remove_dma(struct vfio_iommu *iommu, struct vfio_dma *old) | |
122 | { | |
123 | rb_erase(&old->node, &iommu->dma_list); | |
124 | } | |
125 | ||
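
The rb-tree above is a lightweight interval store: nodes hold non-overlapping [iova, iova + size) ranges keyed by start address, and vfio_find_dma() only descends left or right when the query range lies entirely to one side of a node. A minimal standalone sketch of that three-way test follows (illustration only, not part of the driver; the function name and the unsigned long long types are simplifications):

```c
#include <stdio.h>

/*
 * Returns <0 if the query [start, start + size) lies entirely below the
 * node's [iova, iova + dma_size), >0 if it lies entirely above, and 0 if
 * they overlap.  vfio_find_dma() descends rb_left / rb_right in the first
 * two cases and returns the node in the third.
 */
static int range_cmp(unsigned long long start, unsigned long long size,
		     unsigned long long iova, unsigned long long dma_size)
{
	if (start + size <= iova)
		return -1;	/* query ends before the node begins */
	if (start >= iova + dma_size)
		return 1;	/* query begins after the node ends */
	return 0;		/* anything else is an overlap */
}

int main(void)
{
	/* node covering IOVA 0x2000..0x2fff, three page-sized queries around it */
	printf("%d %d %d\n",
	       range_cmp(0x1000, 0x1000, 0x2000, 0x1000),	/* -1: left of node */
	       range_cmp(0x3000, 0x1000, 0x2000, 0x1000),	/*  1: right of node */
	       range_cmp(0x2800, 0x1000, 0x2000, 0x1000));	/*  0: overlaps node */
	return 0;
}
```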
73fa0d10 AW |
126 | struct vwork { |
127 | struct mm_struct *mm; | |
128 | long npage; | |
129 | struct work_struct work; | |
130 | }; | |
131 | ||
132 | /* delayed decrement/increment for locked_vm */ | |
133 | static void vfio_lock_acct_bg(struct work_struct *work) | |
134 | { | |
135 | struct vwork *vwork = container_of(work, struct vwork, work); | |
136 | struct mm_struct *mm; | |
137 | ||
138 | mm = vwork->mm; | |
139 | down_write(&mm->mmap_sem); | |
140 | mm->locked_vm += vwork->npage; | |
141 | up_write(&mm->mmap_sem); | |
142 | mmput(mm); | |
143 | kfree(vwork); | |
144 | } | |
145 | ||
146 | static void vfio_lock_acct(long npage) | |
147 | { | |
148 | struct vwork *vwork; | |
149 | struct mm_struct *mm; | |
150 | ||
166fd7d9 AW |
151 | if (!current->mm || !npage) |
152 | return; /* process exited or nothing to do */ | |
73fa0d10 AW |
153 | |
154 | if (down_write_trylock(¤t->mm->mmap_sem)) { | |
155 | current->mm->locked_vm += npage; | |
156 | up_write(¤t->mm->mmap_sem); | |
157 | return; | |
158 | } | |
159 | ||
160 | /* | |
161 | * Couldn't get mmap_sem lock, so must set up to update | |
162 | * mm->locked_vm later. If locked_vm were atomic, we | |
163 | * wouldn't need this silliness | |
164 | */ | |
165 | vwork = kmalloc(sizeof(struct vwork), GFP_KERNEL); | |
166 | if (!vwork) | |
167 | return; | |
168 | mm = get_task_mm(current); | |
169 | if (!mm) { | |
170 | kfree(vwork); | |
171 | return; | |
172 | } | |
173 | INIT_WORK(&vwork->work, vfio_lock_acct_bg); | |
174 | vwork->mm = mm; | |
175 | vwork->npage = npage; | |
176 | schedule_work(&vwork->work); | |
177 | } | |
178 | ||
179 | /* | |
180 | * Some mappings aren't backed by a struct page, for example an mmap'd | |
181 | * MMIO range for our own or another device. These use a different | |
182 | * pfn conversion and shouldn't be tracked as locked pages. | |
183 | */ | |
184 | static bool is_invalid_reserved_pfn(unsigned long pfn) | |
185 | { | |
186 | if (pfn_valid(pfn)) { | |
187 | bool reserved; | |
188 | struct page *tail = pfn_to_page(pfn); | |
668f9abb | 189 | struct page *head = compound_head(tail); |
73fa0d10 AW |
190 | reserved = !!(PageReserved(head)); |
191 | if (head != tail) { | |
192 | /* | |
193 | * "head" is not a dangling pointer | |
668f9abb | 194 | * (compound_head takes care of that) |
73fa0d10 AW |
195 | * but the hugepage may have been split |
196 | * from under us (and we may not hold a | |
197 | * reference count on the head page so it can | |
198 | * be reused before we run PageReserved), so | |
199 | * we have to check PageTail before returning | |
200 | * what we just read. | |
201 | */ | |
202 | smp_rmb(); | |
203 | if (PageTail(tail)) | |
204 | return reserved; | |
205 | } | |
206 | return PageReserved(tail); | |
207 | } | |
208 | ||
209 | return true; | |
210 | } | |
211 | ||
212 | static int put_pfn(unsigned long pfn, int prot) | |
213 | { | |
214 | if (!is_invalid_reserved_pfn(pfn)) { | |
215 | struct page *page = pfn_to_page(pfn); | |
216 | if (prot & IOMMU_WRITE) | |
217 | SetPageDirty(page); | |
218 | put_page(page); | |
219 | return 1; | |
220 | } | |
221 | return 0; | |
222 | } | |
223 | ||
73fa0d10 AW |
224 | static int vaddr_get_pfn(unsigned long vaddr, int prot, unsigned long *pfn) |
225 | { | |
226 | struct page *page[1]; | |
227 | struct vm_area_struct *vma; | |
228 | int ret = -EFAULT; | |
229 | ||
230 | if (get_user_pages_fast(vaddr, 1, !!(prot & IOMMU_WRITE), page) == 1) { | |
231 | *pfn = page_to_pfn(page[0]); | |
232 | return 0; | |
233 | } | |
234 | ||
235 | down_read(¤t->mm->mmap_sem); | |
236 | ||
237 | vma = find_vma_intersection(current->mm, vaddr, vaddr + 1); | |
238 | ||
239 | if (vma && vma->vm_flags & VM_PFNMAP) { | |
240 | *pfn = ((vaddr - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; | |
241 | if (is_invalid_reserved_pfn(*pfn)) | |
242 | ret = 0; | |
243 | } | |
244 | ||
245 | up_read(¤t->mm->mmap_sem); | |
246 | ||
247 | return ret; | |
248 | } | |
249 | ||
166fd7d9 AW |
250 | /* |
251 | * Attempt to pin pages. We really don't want to track all the pfns and | |
252 | * the iommu can only map chunks of consecutive pfns anyway, so get the | |
253 | * first page and all consecutive pages with the same locking. | |
254 | */ | |
255 | static long vfio_pin_pages(unsigned long vaddr, long npage, | |
256 | int prot, unsigned long *pfn_base) | |
73fa0d10 | 257 | { |
166fd7d9 AW |
258 | unsigned long limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT; |
259 | bool lock_cap = capable(CAP_IPC_LOCK); | |
260 | long ret, i; | |
73fa0d10 | 261 | |
166fd7d9 AW |
262 | if (!current->mm) |
263 | return -ENODEV; | |
73fa0d10 | 264 | |
166fd7d9 AW |
265 | ret = vaddr_get_pfn(vaddr, prot, pfn_base); |
266 | if (ret) | |
267 | return ret; | |
73fa0d10 | 268 | |
166fd7d9 AW |
269 | if (is_invalid_reserved_pfn(*pfn_base)) |
270 | return 1; | |
73fa0d10 | 271 | |
166fd7d9 AW |
272 | if (!lock_cap && current->mm->locked_vm + 1 > limit) { |
273 | put_pfn(*pfn_base, prot); | |
274 | pr_warn("%s: RLIMIT_MEMLOCK (%ld) exceeded\n", __func__, | |
275 | limit << PAGE_SHIFT); | |
276 | return -ENOMEM; | |
277 | } | |
278 | ||
5c6c2b21 AW |
279 | if (unlikely(disable_hugepages)) { |
280 | vfio_lock_acct(1); | |
281 | return 1; | |
282 | } | |
283 | ||
166fd7d9 AW |
284 | /* Lock all the consecutive pages from pfn_base */ |
285 | for (i = 1, vaddr += PAGE_SIZE; i < npage; i++, vaddr += PAGE_SIZE) { | |
73fa0d10 AW |
286 | unsigned long pfn = 0; |
287 | ||
288 | ret = vaddr_get_pfn(vaddr, prot, &pfn); | |
166fd7d9 AW |
289 | if (ret) |
290 | break; | |
291 | ||
292 | if (pfn != *pfn_base + i || is_invalid_reserved_pfn(pfn)) { | |
293 | put_pfn(pfn, prot); | |
294 | break; | |
73fa0d10 AW |
295 | } |
296 | ||
166fd7d9 AW |
297 | if (!lock_cap && current->mm->locked_vm + i + 1 > limit) { |
298 | put_pfn(pfn, prot); | |
299 | pr_warn("%s: RLIMIT_MEMLOCK (%ld) exceeded\n", | |
300 | __func__, limit << PAGE_SHIFT); | |
301 | break; | |
302 | } | |
303 | } | |
304 | ||
305 | vfio_lock_acct(i); | |
306 | ||
307 | return i; | |
308 | } | |
309 | ||
310 | static long vfio_unpin_pages(unsigned long pfn, long npage, | |
311 | int prot, bool do_accounting) | |
312 | { | |
313 | unsigned long unlocked = 0; | |
314 | long i; | |
315 | ||
316 | for (i = 0; i < npage; i++) | |
317 | unlocked += put_pfn(pfn++, prot); | |
318 | ||
319 | if (do_accounting) | |
320 | vfio_lock_acct(-unlocked); | |
321 | ||
322 | return unlocked; | |
323 | } | |
324 | ||
325 | static int vfio_unmap_unpin(struct vfio_iommu *iommu, struct vfio_dma *dma, | |
326 | dma_addr_t iova, size_t *size) | |
327 | { | |
328 | dma_addr_t start = iova, end = iova + *size; | |
329 | long unlocked = 0; | |
330 | ||
331 | while (iova < end) { | |
332 | size_t unmapped; | |
333 | phys_addr_t phys; | |
334 | ||
73fa0d10 | 335 | /* |
166fd7d9 AW |
336 | * We use the IOMMU to track the physical address. This |
337 | * saves us from having a lot more entries in our mapping | |
338 | * tree. The downside is that we don't track the size | |
339 | * used to do the mapping. We request unmap of a single | |
340 | * page, but expect IOMMUs that support large pages to | |
341 | * unmap a larger chunk. | |
73fa0d10 | 342 | */ |
166fd7d9 AW |
343 | phys = iommu_iova_to_phys(iommu->domain, iova); |
344 | if (WARN_ON(!phys)) { | |
345 | iova += PAGE_SIZE; | |
346 | continue; | |
73fa0d10 | 347 | } |
166fd7d9 AW |
348 | |
349 | unmapped = iommu_unmap(iommu->domain, iova, PAGE_SIZE); | |
350 | if (!unmapped) | |
351 | break; | |
352 | ||
353 | unlocked += vfio_unpin_pages(phys >> PAGE_SHIFT, | |
354 | unmapped >> PAGE_SHIFT, | |
355 | dma->prot, false); | |
356 | iova += unmapped; | |
73fa0d10 | 357 | } |
166fd7d9 AW |
358 | |
359 | vfio_lock_acct(-unlocked); | |
360 | ||
361 | *size = iova - start; | |
362 | ||
73fa0d10 AW |
363 | return 0; |
364 | } | |
365 | ||
cd9b2268 | 366 | static int vfio_remove_dma_overlap(struct vfio_iommu *iommu, dma_addr_t start, |
166fd7d9 | 367 | size_t *size, struct vfio_dma *dma) |
73fa0d10 | 368 | { |
166fd7d9 | 369 | size_t offset, overlap, tmp; |
73fa0d10 | 370 | struct vfio_dma *split; |
166fd7d9 AW |
371 | int ret; |
372 | ||
f5bfdbf2 AW |
373 | if (!*size) |
374 | return 0; | |
375 | ||
166fd7d9 AW |
376 | /* |
377 | * Existing dma region is completely covered, unmap all. This is | |
378 | * the likely case since userspace tends to map and unmap buffers | |
379 | * in one shot rather than multiple mappings within a buffer. | |
380 | */ | |
381 | if (likely(start <= dma->iova && | |
382 | start + *size >= dma->iova + dma->size)) { | |
383 | *size = dma->size; | |
384 | ret = vfio_unmap_unpin(iommu, dma, dma->iova, size); | |
385 | if (ret) | |
386 | return ret; | |
387 | ||
388 | /* | |
389 | * Did we remove more than we have? Should never happen | |
390 | * since a vfio_dma is contiguous in iova and vaddr. | |
391 | */ | |
392 | WARN_ON(*size != dma->size); | |
73fa0d10 | 393 | |
cd9b2268 | 394 | vfio_remove_dma(iommu, dma); |
73fa0d10 | 395 | kfree(dma); |
cd9b2268 | 396 | return 0; |
73fa0d10 AW |
397 | } |
398 | ||
399 | /* Overlap low address of existing range */ | |
400 | if (start <= dma->iova) { | |
166fd7d9 AW |
401 | overlap = start + *size - dma->iova; |
402 | ret = vfio_unmap_unpin(iommu, dma, dma->iova, &overlap); | |
403 | if (ret) | |
404 | return ret; | |
73fa0d10 | 405 | |
166fd7d9 | 406 | vfio_remove_dma(iommu, dma); |
73fa0d10 | 407 | |
166fd7d9 AW |
408 | /* |
409 | * Check: we may have removed the whole vfio_dma. If not, | |
410 | * fix up and re-insert. | |
411 | */ | |
412 | if (overlap < dma->size) { | |
413 | dma->iova += overlap; | |
414 | dma->vaddr += overlap; | |
415 | dma->size -= overlap; | |
416 | vfio_insert_dma(iommu, dma); | |
f5bfdbf2 AW |
417 | } else |
418 | kfree(dma); | |
419 | ||
166fd7d9 | 420 | *size = overlap; |
cd9b2268 | 421 | return 0; |
73fa0d10 AW |
422 | } |
423 | ||
424 | /* Overlap high address of existing range */ | |
166fd7d9 AW |
425 | if (start + *size >= dma->iova + dma->size) { |
426 | offset = start - dma->iova; | |
427 | overlap = dma->size - offset; | |
73fa0d10 | 428 | |
166fd7d9 AW |
429 | ret = vfio_unmap_unpin(iommu, dma, start, &overlap); |
430 | if (ret) | |
431 | return ret; | |
432 | ||
f5bfdbf2 | 433 | dma->size -= overlap; |
166fd7d9 | 434 | *size = overlap; |
cd9b2268 | 435 | return 0; |
73fa0d10 AW |
436 | } |
437 | ||
438 | /* Split existing */ | |
8d38ef19 AW |
439 | |
440 | /* | |
441 | * Allocate our tracking structure early even though it may not | |
442 | * be used. An allocation failure later loses track of pages and | |
443 | * is more difficult to unwind. | |
444 | */ | |
f5bfdbf2 AW |
445 | split = kzalloc(sizeof(*split), GFP_KERNEL); |
446 | if (!split) | |
447 | return -ENOMEM; | |
448 | ||
166fd7d9 | 449 | offset = start - dma->iova; |
73fa0d10 | 450 | |
166fd7d9 | 451 | ret = vfio_unmap_unpin(iommu, dma, start, size); |
8d38ef19 | 452 | if (ret || !*size) { |
f5bfdbf2 | 453 | kfree(split); |
8d38ef19 | 454 | return ret; |
f5bfdbf2 AW |
455 | } |
456 | ||
166fd7d9 | 457 | tmp = dma->size; |
73fa0d10 | 458 | |
f5bfdbf2 | 459 | /* Resize the lower vfio_dma in place, before the below insert */ |
166fd7d9 AW |
460 | dma->size = offset; |
461 | ||
f5bfdbf2 AW |
462 | /* Insert new for remainder, assuming it didn't all get unmapped */ |
463 | if (likely(offset + *size < tmp)) { | |
166fd7d9 AW |
464 | split->size = tmp - offset - *size; |
465 | split->iova = dma->iova + offset + *size; | |
466 | split->vaddr = dma->vaddr + offset + *size; | |
467 | split->prot = dma->prot; | |
468 | vfio_insert_dma(iommu, split); | |
f5bfdbf2 AW |
469 | } else |
470 | kfree(split); | |
73fa0d10 | 471 | |
cd9b2268 | 472 | return 0; |
73fa0d10 AW |
473 | } |
474 | ||
475 | static int vfio_dma_do_unmap(struct vfio_iommu *iommu, | |
476 | struct vfio_iommu_type1_dma_unmap *unmap) | |
477 | { | |
73fa0d10 | 478 | uint64_t mask; |
cd9b2268 | 479 | struct vfio_dma *dma; |
166fd7d9 | 480 | size_t unmapped = 0, size; |
cd9b2268 | 481 | int ret = 0; |
73fa0d10 AW |
482 | |
483 | mask = ((uint64_t)1 << __ffs(iommu->domain->ops->pgsize_bitmap)) - 1; | |
484 | ||
485 | if (unmap->iova & mask) | |
486 | return -EINVAL; | |
f5bfdbf2 | 487 | if (!unmap->size || unmap->size & mask) |
73fa0d10 AW |
488 | return -EINVAL; |
489 | ||
73fa0d10 AW |
490 | WARN_ON(mask & PAGE_MASK); |
491 | ||
492 | mutex_lock(&iommu->lock); | |
493 | ||
166fd7d9 AW |
494 | while ((dma = vfio_find_dma(iommu, unmap->iova, unmap->size))) { |
495 | size = unmap->size; | |
496 | ret = vfio_remove_dma_overlap(iommu, unmap->iova, &size, dma); | |
f5bfdbf2 | 497 | if (ret || !size) |
166fd7d9 AW |
498 | break; |
499 | unmapped += size; | |
500 | } | |
cd9b2268 | 501 | |
73fa0d10 | 502 | mutex_unlock(&iommu->lock); |
166fd7d9 AW |
503 | |
504 | /* | |
505 | * We may unmap more than requested, update the unmap struct so | |
506 | * userspace can know. | |
507 | */ | |
508 | unmap->size = unmapped; | |
509 | ||
510 | return ret; | |
511 | } | |
512 | ||
513 | /* | |
514 | * Turns out AMD IOMMU has a page table bug where it won't map large pages | |
515 | * to a region that previously mapped smaller pages. This should be fixed | |
516 | * soon, so this is just a temporary workaround to break mappings down into | |
517 | * PAGE_SIZE. Better to map smaller pages than nothing. | |
518 | */ | |
519 | static int map_try_harder(struct vfio_iommu *iommu, dma_addr_t iova, | |
520 | unsigned long pfn, long npage, int prot) | |
521 | { | |
522 | long i; | |
523 | int ret; | |
524 | ||
525 | for (i = 0; i < npage; i++, pfn++, iova += PAGE_SIZE) { | |
526 | ret = iommu_map(iommu->domain, iova, | |
527 | (phys_addr_t)pfn << PAGE_SHIFT, | |
528 | PAGE_SIZE, prot); | |
529 | if (ret) | |
530 | break; | |
531 | } | |
532 | ||
533 | for (; i < npage && i > 0; i--, iova -= PAGE_SIZE) | |
534 | iommu_unmap(iommu->domain, iova - PAGE_SIZE, PAGE_SIZE); | |
535 | ||
cd9b2268 | 536 | return ret; |
73fa0d10 AW |
537 | } |
538 | ||
539 | static int vfio_dma_do_map(struct vfio_iommu *iommu, | |
540 | struct vfio_iommu_type1_dma_map *map) | |
541 | { | |
166fd7d9 AW |
542 | dma_addr_t end, iova; |
543 | unsigned long vaddr = map->vaddr; | |
73fa0d10 | 544 | size_t size = map->size; |
166fd7d9 | 545 | long npage; |
73fa0d10 AW |
546 | int ret = 0, prot = 0; |
547 | uint64_t mask; | |
d93b3ac0 AM |
548 | struct vfio_dma *dma = NULL; |
549 | unsigned long pfn; | |
166fd7d9 AW |
550 | |
551 | end = map->iova + map->size; | |
73fa0d10 AW |
552 | |
553 | mask = ((uint64_t)1 << __ffs(iommu->domain->ops->pgsize_bitmap)) - 1; | |
554 | ||
555 | /* READ/WRITE from device perspective */ | |
556 | if (map->flags & VFIO_DMA_MAP_FLAG_WRITE) | |
557 | prot |= IOMMU_WRITE; | |
558 | if (map->flags & VFIO_DMA_MAP_FLAG_READ) | |
559 | prot |= IOMMU_READ; | |
560 | ||
561 | if (!prot) | |
562 | return -EINVAL; /* No READ/WRITE? */ | |
563 | ||
166fd7d9 AW |
564 | if (iommu->cache) |
565 | prot |= IOMMU_CACHE; | |
566 | ||
73fa0d10 AW |
567 | if (vaddr & mask) |
568 | return -EINVAL; | |
166fd7d9 | 569 | if (map->iova & mask) |
73fa0d10 | 570 | return -EINVAL; |
166fd7d9 | 571 | if (!map->size || map->size & mask) |
73fa0d10 AW |
572 | return -EINVAL; |
573 | ||
73fa0d10 AW |
574 | WARN_ON(mask & PAGE_MASK); |
575 | ||
576 | /* Don't allow IOVA wrap */ | |
166fd7d9 | 577 | if (end && end < map->iova) |
73fa0d10 AW |
578 | return -EINVAL; |
579 | ||
580 | /* Don't allow virtual address wrap */ | |
166fd7d9 | 581 | if (vaddr + map->size && vaddr + map->size < vaddr) |
73fa0d10 AW |
582 | return -EINVAL; |
583 | ||
584 | mutex_lock(&iommu->lock); | |
585 | ||
166fd7d9 AW |
586 | if (vfio_find_dma(iommu, map->iova, map->size)) { |
587 | mutex_unlock(&iommu->lock); | |
588 | return -EEXIST; | |
73fa0d10 AW |
589 | } |
590 | ||
166fd7d9 | 591 | for (iova = map->iova; iova < end; iova += size, vaddr += size) { |
166fd7d9 AW |
592 | long i; |
593 | ||
594 | /* Pin a contiguous chunk of memory */ | |
595 | npage = vfio_pin_pages(vaddr, (end - iova) >> PAGE_SHIFT, | |
596 | prot, &pfn); | |
597 | if (npage <= 0) { | |
598 | WARN_ON(!npage); | |
599 | ret = (int)npage; | |
d93b3ac0 | 600 | goto out; |
166fd7d9 | 601 | } |
73fa0d10 | 602 | |
166fd7d9 AW |
603 | /* Verify pages are not already mapped */ |
604 | for (i = 0; i < npage; i++) { | |
605 | if (iommu_iova_to_phys(iommu->domain, | |
606 | iova + (i << PAGE_SHIFT))) { | |
166fd7d9 | 607 | ret = -EBUSY; |
d93b3ac0 | 608 | goto out_unpin; |
166fd7d9 AW |
609 | } |
610 | } | |
611 | ||
612 | ret = iommu_map(iommu->domain, iova, | |
613 | (phys_addr_t)pfn << PAGE_SHIFT, | |
614 | npage << PAGE_SHIFT, prot); | |
615 | if (ret) { | |
616 | if (ret != -EBUSY || | |
617 | map_try_harder(iommu, iova, pfn, npage, prot)) { | |
d93b3ac0 | 618 | goto out_unpin; |
166fd7d9 AW |
619 | } |
620 | } | |
621 | ||
622 | size = npage << PAGE_SHIFT; | |
623 | ||
624 | /* | |
625 | * Check if we abut a region below - nothing below 0. | |
626 | * This is the most likely case when mapping chunks of | |
627 | * physically contiguous regions within a virtual address | |
628 | * range. Update the abutting entry in place since iova | |
629 | * doesn't change. | |
630 | */ | |
631 | if (likely(iova)) { | |
632 | struct vfio_dma *tmp; | |
633 | tmp = vfio_find_dma(iommu, iova - 1, 1); | |
634 | if (tmp && tmp->prot == prot && | |
635 | tmp->vaddr + tmp->size == vaddr) { | |
636 | tmp->size += size; | |
166fd7d9 AW |
637 | iova = tmp->iova; |
638 | size = tmp->size; | |
639 | vaddr = tmp->vaddr; | |
640 | dma = tmp; | |
641 | } | |
642 | } | |
643 | ||
f5bfdbf2 AW |
644 | /* |
645 | * Check if we abut a region above - nothing above ~0 + 1. | |
646 | * If we abut above and below, remove and free. If only | |
647 | * abut above, remove, modify, reinsert. | |
648 | */ | |
166fd7d9 AW |
649 | if (likely(iova + size)) { |
650 | struct vfio_dma *tmp; | |
166fd7d9 AW |
651 | tmp = vfio_find_dma(iommu, iova + size, 1); |
652 | if (tmp && tmp->prot == prot && | |
653 | tmp->vaddr == vaddr + size) { | |
654 | vfio_remove_dma(iommu, tmp); | |
f5bfdbf2 | 655 | if (dma) { |
166fd7d9 | 656 | dma->size += tmp->size; |
f5bfdbf2 AW |
657 | kfree(tmp); |
658 | } else { | |
166fd7d9 | 659 | size += tmp->size; |
f5bfdbf2 AW |
660 | tmp->size = size; |
661 | tmp->iova = iova; | |
662 | tmp->vaddr = vaddr; | |
663 | vfio_insert_dma(iommu, tmp); | |
664 | dma = tmp; | |
665 | } | |
166fd7d9 | 666 | } |
73fa0d10 | 667 | } |
73fa0d10 | 668 | |
166fd7d9 AW |
669 | if (!dma) { |
670 | dma = kzalloc(sizeof(*dma), GFP_KERNEL); | |
671 | if (!dma) { | |
672 | iommu_unmap(iommu->domain, iova, size); | |
166fd7d9 | 673 | ret = -ENOMEM; |
d93b3ac0 | 674 | goto out_unpin; |
166fd7d9 AW |
675 | } |
676 | ||
677 | dma->size = size; | |
678 | dma->iova = iova; | |
679 | dma->vaddr = vaddr; | |
680 | dma->prot = prot; | |
681 | vfio_insert_dma(iommu, dma); | |
73fa0d10 AW |
682 | } |
683 | } | |
684 | ||
d93b3ac0 AM |
685 | WARN_ON(ret); |
686 | mutex_unlock(&iommu->lock); | |
687 | return ret; | |
688 | ||
689 | out_unpin: | |
690 | vfio_unpin_pages(pfn, npage, prot, true); | |
691 | ||
692 | out: | |
693 | iova = map->iova; | |
694 | size = map->size; | |
695 | while ((dma = vfio_find_dma(iommu, iova, size))) { | |
696 | int r = vfio_remove_dma_overlap(iommu, iova, | |
697 | &size, dma); | |
698 | if (WARN_ON(r || !size)) | |
699 | break; | |
166fd7d9 | 700 | } |
73fa0d10 | 701 | |
73fa0d10 AW |
702 | mutex_unlock(&iommu->lock); |
703 | return ret; | |
704 | } | |
705 | ||
706 | static int vfio_iommu_type1_attach_group(void *iommu_data, | |
707 | struct iommu_group *iommu_group) | |
708 | { | |
709 | struct vfio_iommu *iommu = iommu_data; | |
710 | struct vfio_group *group, *tmp; | |
711 | int ret; | |
712 | ||
713 | group = kzalloc(sizeof(*group), GFP_KERNEL); | |
714 | if (!group) | |
715 | return -ENOMEM; | |
716 | ||
717 | mutex_lock(&iommu->lock); | |
718 | ||
719 | list_for_each_entry(tmp, &iommu->group_list, next) { | |
720 | if (tmp->iommu_group == iommu_group) { | |
721 | mutex_unlock(&iommu->lock); | |
722 | kfree(group); | |
723 | return -EINVAL; | |
724 | } | |
725 | } | |
726 | ||
727 | /* | |
728 | * TODO: Domains have capabilities that might change as we add | |
729 | * groups (see iommu->cache, currently never set). Check for | |
730 | * them and potentially disallow groups to be attached when it | |
731 | * would change capabilities (ugh). | |
732 | */ | |
733 | ret = iommu_attach_group(iommu->domain, iommu_group); | |
734 | if (ret) { | |
735 | mutex_unlock(&iommu->lock); | |
736 | kfree(group); | |
737 | return ret; | |
738 | } | |
739 | ||
740 | group->iommu_group = iommu_group; | |
741 | list_add(&group->next, &iommu->group_list); | |
742 | ||
743 | mutex_unlock(&iommu->lock); | |
744 | ||
745 | return 0; | |
746 | } | |
747 | ||
748 | static void vfio_iommu_type1_detach_group(void *iommu_data, | |
749 | struct iommu_group *iommu_group) | |
750 | { | |
751 | struct vfio_iommu *iommu = iommu_data; | |
752 | struct vfio_group *group; | |
753 | ||
754 | mutex_lock(&iommu->lock); | |
755 | ||
756 | list_for_each_entry(group, &iommu->group_list, next) { | |
757 | if (group->iommu_group == iommu_group) { | |
758 | iommu_detach_group(iommu->domain, iommu_group); | |
759 | list_del(&group->next); | |
760 | kfree(group); | |
761 | break; | |
762 | } | |
763 | } | |
764 | ||
765 | mutex_unlock(&iommu->lock); | |
766 | } | |
767 | ||
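
For context on when attach_group/detach_group run: userspace binds a VFIO group to a container and then selects this backend with VFIO_SET_IOMMU, which is what ends up calling vfio_iommu_type1_open() below and vfio_iommu_type1_attach_group() above. A minimal userspace sketch of that flow follows (the group number 26, the setup_container() name, and the omitted error handling are illustrative; the ioctls and structures are the documented VFIO container/group API from <linux/vfio.h>):

```c
#include <fcntl.h>
#include <sys/ioctl.h>
#include <linux/vfio.h>

int setup_container(void)
{
	int container, group;
	struct vfio_group_status status = { .argsz = sizeof(status) };

	container = open("/dev/vfio/vfio", O_RDWR);
	if (ioctl(container, VFIO_GET_API_VERSION) != VFIO_API_VERSION)
		return -1;

	/* Answered by the VFIO_CHECK_EXTENSION case in vfio_iommu_type1_ioctl() */
	if (!ioctl(container, VFIO_CHECK_EXTENSION, VFIO_TYPE1_IOMMU))
		return -1;

	group = open("/dev/vfio/26", O_RDWR);	/* group number is an example */
	ioctl(group, VFIO_GROUP_GET_STATUS, &status);
	if (!(status.flags & VFIO_GROUP_FLAGS_VIABLE))
		return -1;			/* not all devices in the group are bound to vfio */

	/* Tie the group to the container... */
	ioctl(group, VFIO_GROUP_SET_CONTAINER, &container);

	/* ...and select this driver: vfio_iommu_type1_open() + attach_group() */
	ioctl(container, VFIO_SET_IOMMU, VFIO_TYPE1_IOMMU);

	return container;
}
```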
768 | static void *vfio_iommu_type1_open(unsigned long arg) | |
769 | { | |
770 | struct vfio_iommu *iommu; | |
771 | ||
772 | if (arg != VFIO_TYPE1_IOMMU) | |
773 | return ERR_PTR(-EINVAL); | |
774 | ||
775 | iommu = kzalloc(sizeof(*iommu), GFP_KERNEL); | |
776 | if (!iommu) | |
777 | return ERR_PTR(-ENOMEM); | |
778 | ||
779 | INIT_LIST_HEAD(&iommu->group_list); | |
cd9b2268 | 780 | iommu->dma_list = RB_ROOT; |
73fa0d10 AW |
781 | mutex_init(&iommu->lock); |
782 | ||
783 | /* | |
784 | * Wish we didn't have to know about bus_type here. | |
785 | */ | |
786 | iommu->domain = iommu_domain_alloc(&pci_bus_type); | |
787 | if (!iommu->domain) { | |
788 | kfree(iommu); | |
789 | return ERR_PTR(-EIO); | |
790 | } | |
791 | ||
792 | /* | |
793 | * Wish we could specify required capabilities rather than create | |
794 | * a domain, see what comes out and hope it doesn't change along | |
795 | * the way. Fortunately we know interrupt remapping is global for | |
796 | * our iommus. | |
797 | */ | |
798 | if (!allow_unsafe_interrupts && | |
799 | !iommu_domain_has_cap(iommu->domain, IOMMU_CAP_INTR_REMAP)) { | |
800 | pr_warn("%s: No interrupt remapping support. Use the module param \"allow_unsafe_interrupts\" to enable VFIO IOMMU support on this platform\n", | |
801 | __func__); | |
802 | iommu_domain_free(iommu->domain); | |
803 | kfree(iommu); | |
804 | return ERR_PTR(-EPERM); | |
805 | } | |
806 | ||
807 | return iommu; | |
808 | } | |
809 | ||
810 | static void vfio_iommu_type1_release(void *iommu_data) | |
811 | { | |
812 | struct vfio_iommu *iommu = iommu_data; | |
813 | struct vfio_group *group, *group_tmp; | |
cd9b2268 | 814 | struct rb_node *node; |
73fa0d10 AW |
815 | |
816 | list_for_each_entry_safe(group, group_tmp, &iommu->group_list, next) { | |
817 | iommu_detach_group(iommu->domain, group->iommu_group); | |
818 | list_del(&group->next); | |
819 | kfree(group); | |
820 | } | |
821 | ||
cd9b2268 AW |
822 | while ((node = rb_first(&iommu->dma_list))) { |
823 | struct vfio_dma *dma = rb_entry(node, struct vfio_dma, node); | |
166fd7d9 AW |
824 | size_t size = dma->size; |
825 | vfio_remove_dma_overlap(iommu, dma->iova, &size, dma); | |
f5bfdbf2 AW |
826 | if (WARN_ON(!size)) |
827 | break; | |
73fa0d10 AW |
828 | } |
829 | ||
830 | iommu_domain_free(iommu->domain); | |
831 | iommu->domain = NULL; | |
832 | kfree(iommu); | |
833 | } | |
834 | ||
835 | static long vfio_iommu_type1_ioctl(void *iommu_data, | |
836 | unsigned int cmd, unsigned long arg) | |
837 | { | |
838 | struct vfio_iommu *iommu = iommu_data; | |
839 | unsigned long minsz; | |
840 | ||
841 | if (cmd == VFIO_CHECK_EXTENSION) { | |
842 | switch (arg) { | |
843 | case VFIO_TYPE1_IOMMU: | |
844 | return 1; | |
845 | default: | |
846 | return 0; | |
847 | } | |
848 | } else if (cmd == VFIO_IOMMU_GET_INFO) { | |
849 | struct vfio_iommu_type1_info info; | |
850 | ||
851 | minsz = offsetofend(struct vfio_iommu_type1_info, iova_pgsizes); | |
852 | ||
853 | if (copy_from_user(&info, (void __user *)arg, minsz)) | |
854 | return -EFAULT; | |
855 | ||
856 | if (info.argsz < minsz) | |
857 | return -EINVAL; | |
858 | ||
859 | info.flags = 0; | |
860 | ||
861 | info.iova_pgsizes = iommu->domain->ops->pgsize_bitmap; | |
862 | ||
863 | return copy_to_user((void __user *)arg, &info, minsz); | |
864 | ||
865 | } else if (cmd == VFIO_IOMMU_MAP_DMA) { | |
866 | struct vfio_iommu_type1_dma_map map; | |
867 | uint32_t mask = VFIO_DMA_MAP_FLAG_READ | | |
868 | VFIO_DMA_MAP_FLAG_WRITE; | |
869 | ||
870 | minsz = offsetofend(struct vfio_iommu_type1_dma_map, size); | |
871 | ||
872 | if (copy_from_user(&map, (void __user *)arg, minsz)) | |
873 | return -EFAULT; | |
874 | ||
875 | if (map.argsz < minsz || map.flags & ~mask) | |
876 | return -EINVAL; | |
877 | ||
878 | return vfio_dma_do_map(iommu, &map); | |
879 | ||
880 | } else if (cmd == VFIO_IOMMU_UNMAP_DMA) { | |
881 | struct vfio_iommu_type1_dma_unmap unmap; | |
166fd7d9 | 882 | long ret; |
73fa0d10 AW |
883 | |
884 | minsz = offsetofend(struct vfio_iommu_type1_dma_unmap, size); | |
885 | ||
886 | if (copy_from_user(&unmap, (void __user *)arg, minsz)) | |
887 | return -EFAULT; | |
888 | ||
889 | if (unmap.argsz < minsz || unmap.flags) | |
890 | return -EINVAL; | |
891 | ||
166fd7d9 AW |
892 | ret = vfio_dma_do_unmap(iommu, &unmap); |
893 | if (ret) | |
894 | return ret; | |
895 | ||
896 | return copy_to_user((void __user *)arg, &unmap, minsz); | |
73fa0d10 AW |
897 | } |
898 | ||
899 | return -ENOTTY; | |
900 | } | |
901 | ||
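
From the other side of this ioctl handler, a minimal sketch of how userspace drives the three main commands once a container has been set up as in the earlier example (the map_and_unmap() name, buffer size, IOVA choice, and missing error handling are illustrative; the structures and flags come from <linux/vfio.h>):

```c
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <stdint.h>
#include <linux/vfio.h>

void map_and_unmap(int container)
{
	struct vfio_iommu_type1_info info = { .argsz = sizeof(info) };
	struct vfio_iommu_type1_dma_map map = { .argsz = sizeof(map) };
	struct vfio_iommu_type1_dma_unmap unmap = { .argsz = sizeof(unmap) };
	size_t len = 2 * 1024 * 1024;	/* example size, multiple of page size */
	void *buf;

	/* VFIO_IOMMU_GET_INFO: iova_pgsizes reports the domain's pgsize_bitmap */
	ioctl(container, VFIO_IOMMU_GET_INFO, &info);

	buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
		   MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	/* VFIO_IOMMU_MAP_DMA: pins the pages and maps them at the chosen IOVA */
	map.flags = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE;
	map.vaddr = (uintptr_t)buf;
	map.iova  = 0x100000;		/* example IOVA, page aligned */
	map.size  = len;
	ioctl(container, VFIO_IOMMU_MAP_DMA, &map);

	/*
	 * VFIO_IOMMU_UNMAP_DMA: on return, unmap.size reports how much was
	 * actually unmapped, which may exceed the request (see the comment
	 * at the end of vfio_dma_do_unmap() above).
	 */
	unmap.iova = 0x100000;
	unmap.size = len;
	ioctl(container, VFIO_IOMMU_UNMAP_DMA, &unmap);

	munmap(buf, len);
}
```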
902 | static const struct vfio_iommu_driver_ops vfio_iommu_driver_ops_type1 = { | |
903 | .name = "vfio-iommu-type1", | |
904 | .owner = THIS_MODULE, | |
905 | .open = vfio_iommu_type1_open, | |
906 | .release = vfio_iommu_type1_release, | |
907 | .ioctl = vfio_iommu_type1_ioctl, | |
908 | .attach_group = vfio_iommu_type1_attach_group, | |
909 | .detach_group = vfio_iommu_type1_detach_group, | |
910 | }; | |
911 | ||
912 | static int __init vfio_iommu_type1_init(void) | |
913 | { | |
914 | if (!iommu_present(&pci_bus_type)) | |
915 | return -ENODEV; | |
916 | ||
917 | return vfio_register_iommu_driver(&vfio_iommu_driver_ops_type1); | |
918 | } | |
919 | ||
920 | static void __exit vfio_iommu_type1_cleanup(void) | |
921 | { | |
922 | vfio_unregister_iommu_driver(&vfio_iommu_driver_ops_type1); | |
923 | } | |
924 | ||
925 | module_init(vfio_iommu_type1_init); | |
926 | module_exit(vfio_iommu_type1_cleanup); | |
927 | ||
928 | MODULE_VERSION(DRIVER_VERSION); | |
929 | MODULE_LICENSE("GPL v2"); | |
930 | MODULE_AUTHOR(DRIVER_AUTHOR); | |
931 | MODULE_DESCRIPTION(DRIVER_DESC); |