iommufd: Data structure to provide IOVA to PFN mapping
drivers/iommu/iommufd/io_pagetable.c
1// SPDX-License-Identifier: GPL-2.0
2/* Copyright (c) 2021-2022, NVIDIA CORPORATION & AFFILIATES.
3 *
4 * The io_pagetable is the top of the data structure that maps IOVAs to PFNs.
5 * The PFNs can be placed into an iommu_domain, or returned to the caller as a
6 * page list for access by an in-kernel user.
7 *
8 * The data structure uses iopt_pages to optimize the storage of the PFNs
9 * between the domains and the xarray.
10 */
11#include <linux/iommufd.h>
12#include <linux/lockdep.h>
13#include <linux/iommu.h>
14#include <linux/sched/mm.h>
15#include <linux/err.h>
16#include <linux/slab.h>
17#include <linux/errno.h>
18
19#include "io_pagetable.h"
20#include "double_span.h"
21
22struct iopt_pages_list {
23 struct iopt_pages *pages;
24 struct iopt_area *area;
25 struct list_head next;
26 unsigned long start_byte;
27 unsigned long length;
28};
29
30struct iopt_area *iopt_area_contig_init(struct iopt_area_contig_iter *iter,
31 struct io_pagetable *iopt,
32 unsigned long iova,
33 unsigned long last_iova)
34{
35 lockdep_assert_held(&iopt->iova_rwsem);
36
37 iter->cur_iova = iova;
38 iter->last_iova = last_iova;
39 iter->area = iopt_area_iter_first(iopt, iova, iova);
40 if (!iter->area)
41 return NULL;
42 if (!iter->area->pages) {
43 iter->area = NULL;
44 return NULL;
45 }
46 return iter->area;
47}
48
49struct iopt_area *iopt_area_contig_next(struct iopt_area_contig_iter *iter)
50{
51 unsigned long last_iova;
52
53 if (!iter->area)
54 return NULL;
55 last_iova = iopt_area_last_iova(iter->area);
56 if (iter->last_iova <= last_iova)
57 return NULL;
58
59 iter->cur_iova = last_iova + 1;
60 iter->area = iopt_area_iter_next(iter->area, iter->cur_iova,
61 iter->last_iova);
62 if (!iter->area)
63 return NULL;
64 if (iter->cur_iova != iopt_area_iova(iter->area) ||
65 !iter->area->pages) {
66 iter->area = NULL;
67 return NULL;
68 }
69 return iter->area;
70}
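/*
 * As a rough example of the iteration above, suppose two areas exist, one
 * covering IOVA [0x0, 0xfff] and one covering [0x2000, 0x2fff], and a caller
 * walks iopt_for_each_contig_area() over [0x0, 0x2fff]:
 *
 *  - iopt_area_contig_init() returns the first area with cur_iova = 0x0
 *  - iopt_area_contig_next() advances cur_iova to 0x1000, but the next area
 *    starts at 0x2000, so cur_iova != iopt_area_iova(area) and the iterator
 *    stops with iter->area == NULL
 *  - iopt_area_contig_done() is then false, which callers such as
 *    iopt_get_pages() report as -ENOENT because the range has a hole
 */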
71
72static bool __alloc_iova_check_hole(struct interval_tree_double_span_iter *span,
73 unsigned long length,
74 unsigned long iova_alignment,
75 unsigned long page_offset)
76{
77 if (span->is_used || span->last_hole - span->start_hole < length - 1)
78 return false;
79
80 span->start_hole = ALIGN(span->start_hole, iova_alignment) |
81 page_offset;
82 if (span->start_hole > span->last_hole ||
83 span->last_hole - span->start_hole < length - 1)
84 return false;
85 return true;
86}
87
88static bool __alloc_iova_check_used(struct interval_tree_span_iter *span,
89 unsigned long length,
90 unsigned long iova_alignment,
91 unsigned long page_offset)
92{
93 if (span->is_hole || span->last_used - span->start_used < length - 1)
94 return false;
95
96 span->start_used = ALIGN(span->start_used, iova_alignment) |
97 page_offset;
98 if (span->start_used > span->last_used ||
99 span->last_used - span->start_used < length - 1)
100 return false;
101 return true;
102}
103
104/*
105 * Automatically find a block of IOVA that is not being used and not reserved.
106 * Does not return a 0 IOVA even if it is valid.
107 */
108static int iopt_alloc_iova(struct io_pagetable *iopt, unsigned long *iova,
109 unsigned long uptr, unsigned long length)
110{
111 unsigned long page_offset = uptr % PAGE_SIZE;
112 struct interval_tree_double_span_iter used_span;
113 struct interval_tree_span_iter allowed_span;
114 unsigned long iova_alignment;
115
116 lockdep_assert_held(&iopt->iova_rwsem);
117
118 /* Protect roundup_pow_of_two() from overflow */
119 if (length == 0 || length >= ULONG_MAX / 2)
120 return -EOVERFLOW;
121
122 /*
123 * Keep alignment present in the uptr when building the IOVA, this
124 * increases the chance we can map a THP.
125 */
126 if (!uptr)
127 iova_alignment = roundup_pow_of_two(length);
128 else
129 iova_alignment = min_t(unsigned long,
130 roundup_pow_of_two(length),
131 1UL << __ffs64(uptr));
132
133 if (iova_alignment < iopt->iova_alignment)
134 return -EINVAL;
135
136 interval_tree_for_each_span(&allowed_span, &iopt->allowed_itree,
137 PAGE_SIZE, ULONG_MAX - PAGE_SIZE) {
138 if (RB_EMPTY_ROOT(&iopt->allowed_itree.rb_root)) {
139 allowed_span.start_used = PAGE_SIZE;
140 allowed_span.last_used = ULONG_MAX - PAGE_SIZE;
141 allowed_span.is_hole = false;
142 }
143
144 if (!__alloc_iova_check_used(&allowed_span, length,
145 iova_alignment, page_offset))
146 continue;
147
148 interval_tree_for_each_double_span(
149 &used_span, &iopt->reserved_itree, &iopt->area_itree,
150 allowed_span.start_used, allowed_span.last_used) {
151 if (!__alloc_iova_check_hole(&used_span, length,
152 iova_alignment,
153 page_offset))
154 continue;
155
156 *iova = used_span.start_hole;
157 return 0;
158 }
159 }
160 return -ENOSPC;
161}
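/*
 * A worked example of the alignment logic above, using hypothetical values:
 * uptr = 0x7f0080200000 and length = 0x10000 give
 * roundup_pow_of_two(length) = 0x10000 and 1UL << __ffs64(uptr) = 0x200000,
 * so iova_alignment = 0x10000 and page_offset = 0. The allocator then scans
 * the allowed spans for a 64KiB-aligned hole of at least 0x10000 bytes that
 * is not covered by the reserved_itree or the area_itree, keeping the IOVA
 * aligned with the user buffer so larger IOMMU pages or THPs stay possible.
 */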
162
163static int iopt_check_iova(struct io_pagetable *iopt, unsigned long iova,
164 unsigned long length)
165{
166 unsigned long last;
167
168 lockdep_assert_held(&iopt->iova_rwsem);
169
170 if ((iova & (iopt->iova_alignment - 1)))
171 return -EINVAL;
172
173 if (check_add_overflow(iova, length - 1, &last))
174 return -EOVERFLOW;
175
176 /* No reserved IOVA intersects the range */
177 if (iopt_reserved_iter_first(iopt, iova, last))
178 return -EINVAL;
179
180 /* Check that there is not already a mapping in the range */
181 if (iopt_area_iter_first(iopt, iova, last))
182 return -EEXIST;
183 return 0;
184}
185
186/*
187 * The area takes a slice of the pages from start_byte to start_byte + length
188 */
189static int iopt_insert_area(struct io_pagetable *iopt, struct iopt_area *area,
190 struct iopt_pages *pages, unsigned long iova,
191 unsigned long start_byte, unsigned long length,
192 int iommu_prot)
193{
194 lockdep_assert_held_write(&iopt->iova_rwsem);
195
196 if ((iommu_prot & IOMMU_WRITE) && !pages->writable)
197 return -EPERM;
198
199 area->iommu_prot = iommu_prot;
200 area->page_offset = start_byte % PAGE_SIZE;
201 if (area->page_offset & (iopt->iova_alignment - 1))
202 return -EINVAL;
203
204 area->node.start = iova;
205 if (check_add_overflow(iova, length - 1, &area->node.last))
206 return -EOVERFLOW;
207
208 area->pages_node.start = start_byte / PAGE_SIZE;
209 if (check_add_overflow(start_byte, length - 1, &area->pages_node.last))
210 return -EOVERFLOW;
211 area->pages_node.last = area->pages_node.last / PAGE_SIZE;
212 if (WARN_ON(area->pages_node.last >= pages->npages))
213 return -EOVERFLOW;
214
215 /*
216 * The area is inserted with a NULL pages pointer indicating it is not
217 * fully initialized yet.
218 */
219 area->iopt = iopt;
220 interval_tree_insert(&area->node, &iopt->area_itree);
221 return 0;
222}
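/*
 * A small worked example for the index math above, using hypothetical
 * values: start_byte = 0x3000 and length = 0x5000 with 4KiB pages give
 * pages_node.start = 3 and pages_node.last = 0x7fff / 0x1000 = 7, i.e. the
 * area claims pages 3..7 of the iopt_pages (the interval later inserted into
 * pages->domains_itree), while node.start/node.last describe the same slice
 * in IOVA units.
 */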
223
224static int iopt_alloc_area_pages(struct io_pagetable *iopt,
225 struct list_head *pages_list,
226 unsigned long length, unsigned long *dst_iova,
227 int iommu_prot, unsigned int flags)
228{
229 struct iopt_pages_list *elm;
230 unsigned long iova;
231 int rc = 0;
232
233 list_for_each_entry(elm, pages_list, next) {
234 elm->area = kzalloc(sizeof(*elm->area), GFP_KERNEL_ACCOUNT);
235 if (!elm->area)
236 return -ENOMEM;
237 }
238
239 down_write(&iopt->iova_rwsem);
240 if ((length & (iopt->iova_alignment - 1)) || !length) {
241 rc = -EINVAL;
242 goto out_unlock;
243 }
244
245 if (flags & IOPT_ALLOC_IOVA) {
246 /* Use the first entry to guess the ideal IOVA alignment */
247 elm = list_first_entry(pages_list, struct iopt_pages_list,
248 next);
249 rc = iopt_alloc_iova(
250 iopt, dst_iova,
251 (uintptr_t)elm->pages->uptr + elm->start_byte, length);
252 if (rc)
253 goto out_unlock;
254 } else {
255 rc = iopt_check_iova(iopt, *dst_iova, length);
256 if (rc)
257 goto out_unlock;
258 }
259
260 /*
261 * Areas are created with a NULL pages pointer so that the IOVA space is
262 * reserved and we can unlock the iova_rwsem.
263 */
264 iova = *dst_iova;
265 list_for_each_entry(elm, pages_list, next) {
266 rc = iopt_insert_area(iopt, elm->area, elm->pages, iova,
267 elm->start_byte, elm->length, iommu_prot);
268 if (rc)
269 goto out_unlock;
270 iova += elm->length;
271 }
272
273out_unlock:
274 up_write(&iopt->iova_rwsem);
275 return rc;
276}
277
278static void iopt_abort_area(struct iopt_area *area)
279{
280 if (area->iopt) {
281 down_write(&area->iopt->iova_rwsem);
282 interval_tree_remove(&area->node, &area->iopt->area_itree);
283 up_write(&area->iopt->iova_rwsem);
284 }
285 kfree(area);
286}
287
288void iopt_free_pages_list(struct list_head *pages_list)
289{
290 struct iopt_pages_list *elm;
291
292 while ((elm = list_first_entry_or_null(pages_list,
293 struct iopt_pages_list, next))) {
294 if (elm->area)
295 iopt_abort_area(elm->area);
296 if (elm->pages)
297 iopt_put_pages(elm->pages);
298 list_del(&elm->next);
299 kfree(elm);
300 }
301}
302
303static int iopt_fill_domains_pages(struct list_head *pages_list)
304{
305 struct iopt_pages_list *undo_elm;
306 struct iopt_pages_list *elm;
307 int rc;
308
309 list_for_each_entry(elm, pages_list, next) {
310 rc = iopt_area_fill_domains(elm->area, elm->pages);
311 if (rc)
312 goto err_undo;
313 }
314 return 0;
315
316err_undo:
317 list_for_each_entry(undo_elm, pages_list, next) {
318 if (undo_elm == elm)
319 break;
320 iopt_area_unfill_domains(undo_elm->area, undo_elm->pages);
321 }
322 return rc;
323}
324
325int iopt_map_pages(struct io_pagetable *iopt, struct list_head *pages_list,
326 unsigned long length, unsigned long *dst_iova,
327 int iommu_prot, unsigned int flags)
328{
329 struct iopt_pages_list *elm;
330 int rc;
331
332 rc = iopt_alloc_area_pages(iopt, pages_list, length, dst_iova,
333 iommu_prot, flags);
334 if (rc)
335 return rc;
336
337 down_read(&iopt->domains_rwsem);
338 rc = iopt_fill_domains_pages(pages_list);
339 if (rc)
340 goto out_unlock_domains;
341
342 down_write(&iopt->iova_rwsem);
343 list_for_each_entry(elm, pages_list, next) {
344 /*
345 * area->pages must be set inside the domains_rwsem to ensure
346 * any newly added domains will get filled. Moves the reference
347 * in from the list.
348 */
349 elm->area->pages = elm->pages;
350 elm->pages = NULL;
351 elm->area = NULL;
352 }
353 up_write(&iopt->iova_rwsem);
354out_unlock_domains:
355 up_read(&iopt->domains_rwsem);
356 return rc;
357}
358
359/**
360 * iopt_map_user_pages() - Map a user VA to an iova in the io page table
361 * @ictx: iommufd_ctx the iopt is part of
362 * @iopt: io_pagetable to act on
363 * @iova: If IOPT_ALLOC_IOVA is set this is unused on input and contains
364 * the chosen iova on output. Otherwise it is the iova to map to on input
365 * @uptr: User VA to map
366 * @length: Number of bytes to map
367 * @iommu_prot: Combination of IOMMU_READ/WRITE/etc bits for the mapping
368 * @flags: IOPT_ALLOC_IOVA or zero
369 *
370 * iova, uptr, and length must be aligned to iova_alignment. For domain backed
371 * page tables this will pin the pages and load them into the domain at iova.
372 * For non-domain page tables this will only set up a lazy reference and the
373 * caller must use iopt_access_pages() to touch them.
374 *
375 * iopt_unmap_iova() must be called to undo this before the io_pagetable can be
376 * destroyed.
377 */
378int iopt_map_user_pages(struct iommufd_ctx *ictx, struct io_pagetable *iopt,
379 unsigned long *iova, void __user *uptr,
380 unsigned long length, int iommu_prot,
381 unsigned int flags)
382{
383 struct iopt_pages_list elm = {};
384 LIST_HEAD(pages_list);
385 int rc;
386
387 elm.pages = iopt_alloc_pages(uptr, length, iommu_prot & IOMMU_WRITE);
388 if (IS_ERR(elm.pages))
389 return PTR_ERR(elm.pages);
390 if (ictx->account_mode == IOPT_PAGES_ACCOUNT_MM &&
391 elm.pages->account_mode == IOPT_PAGES_ACCOUNT_USER)
392 elm.pages->account_mode = IOPT_PAGES_ACCOUNT_MM;
393 elm.start_byte = uptr - elm.pages->uptr;
394 elm.length = length;
395 list_add(&elm.next, &pages_list);
396
397 rc = iopt_map_pages(iopt, &pages_list, length, iova, iommu_prot, flags);
398 if (rc) {
399 if (elm.area)
400 iopt_abort_area(elm.area);
401 if (elm.pages)
402 iopt_put_pages(elm.pages);
403 return rc;
404 }
405 return 0;
406}
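/*
 * A minimal usage sketch (hypothetical caller, error handling trimmed)
 * showing how a buffer could be mapped with an allocated IOVA and torn down
 * again:
 *
 *	unsigned long iova;
 *	int rc;
 *
 *	rc = iopt_map_user_pages(ictx, iopt, &iova, ubuf, size,
 *				 IOMMU_READ | IOMMU_WRITE, IOPT_ALLOC_IOVA);
 *	if (rc)
 *		return rc;
 *	...
 *	rc = iopt_unmap_iova(iopt, iova, size, NULL);
 *
 * With IOPT_ALLOC_IOVA the chosen iova is returned through *iova; without it
 * the caller supplies a fixed, suitably aligned iova instead.
 */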
407
408int iopt_get_pages(struct io_pagetable *iopt, unsigned long iova,
409 unsigned long length, struct list_head *pages_list)
410{
411 struct iopt_area_contig_iter iter;
412 unsigned long last_iova;
413 struct iopt_area *area;
414 int rc;
415
416 if (!length)
417 return -EINVAL;
418 if (check_add_overflow(iova, length - 1, &last_iova))
419 return -EOVERFLOW;
420
421 down_read(&iopt->iova_rwsem);
422 iopt_for_each_contig_area(&iter, area, iopt, iova, last_iova) {
423 struct iopt_pages_list *elm;
424 unsigned long last = min(last_iova, iopt_area_last_iova(area));
425
426 elm = kzalloc(sizeof(*elm), GFP_KERNEL_ACCOUNT);
427 if (!elm) {
428 rc = -ENOMEM;
429 goto err_free;
430 }
431 elm->start_byte = iopt_area_start_byte(area, iter.cur_iova);
432 elm->pages = area->pages;
433 elm->length = (last - iter.cur_iova) + 1;
434 kref_get(&elm->pages->kref);
435 list_add_tail(&elm->next, pages_list);
436 }
437 if (!iopt_area_contig_done(&iter)) {
438 rc = -ENOENT;
439 goto err_free;
440 }
441 up_read(&iopt->iova_rwsem);
442 return 0;
443err_free:
444 up_read(&iopt->iova_rwsem);
445 iopt_free_pages_list(pages_list);
446 return rc;
447}
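/*
 * A minimal sketch of using the helper above (hypothetical caller): each
 * list element takes a kref on its area's iopt_pages, so the caller must
 * release the list with iopt_free_pages_list() when it is done:
 *
 *	LIST_HEAD(pages_list);
 *	int rc;
 *
 *	rc = iopt_get_pages(iopt, iova, length, &pages_list);
 *	if (rc)
 *		return rc;
 *	...
 *	iopt_free_pages_list(&pages_list);
 */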
448
449static int iopt_unmap_iova_range(struct io_pagetable *iopt, unsigned long start,
450 unsigned long last, unsigned long *unmapped)
451{
452 struct iopt_area *area;
453 unsigned long unmapped_bytes = 0;
454 int rc = -ENOENT;
455
456 /*
457 * The domains_rwsem must be held in read mode any time any area->pages
458 * is NULL. This prevents domain attach/detach from running
459 * concurrently with cleaning up the area.
460 */
461 down_read(&iopt->domains_rwsem);
462 down_write(&iopt->iova_rwsem);
463 while ((area = iopt_area_iter_first(iopt, start, last))) {
464 unsigned long area_last = iopt_area_last_iova(area);
465 unsigned long area_first = iopt_area_iova(area);
466 struct iopt_pages *pages;
467
468 /* Userspace should not race map/unmap of the same area */
469 if (!area->pages) {
470 rc = -EBUSY;
471 goto out_unlock_iova;
472 }
473
474 if (area_first < start || area_last > last) {
475 rc = -ENOENT;
476 goto out_unlock_iova;
477 }
478
479 /*
480 * num_accesses writers must hold the iova_rwsem too, so we can
481 * safely read it under the write side of the iova_rwsem
482 * without the pages->mutex.
483 */
484 if (area->num_accesses) {
485 start = area_first;
486 area->prevent_access = true;
487 up_write(&iopt->iova_rwsem);
488 up_read(&iopt->domains_rwsem);
489 /* Later patch calls back to drivers to unmap */
490 return -EBUSY;
491 }
492
493 pages = area->pages;
494 area->pages = NULL;
495 up_write(&iopt->iova_rwsem);
496
497 iopt_area_unfill_domains(area, pages);
498 iopt_abort_area(area);
499 iopt_put_pages(pages);
500
501 unmapped_bytes += area_last - area_first + 1;
502
503 down_write(&iopt->iova_rwsem);
504 }
505 if (unmapped_bytes)
506 rc = 0;
507
508out_unlock_iova:
509 up_write(&iopt->iova_rwsem);
510 up_read(&iopt->domains_rwsem);
511 if (unmapped)
512 *unmapped = unmapped_bytes;
513 return rc;
514}
515
516/**
517 * iopt_unmap_iova() - Remove a range of iova
518 * @iopt: io_pagetable to act on
519 * @iova: Starting iova to unmap
520 * @length: Number of bytes to unmap
521 * @unmapped: Return number of bytes unmapped
522 *
523 * The requested range must be a superset of existing ranges.
524 * Splitting/truncating IOVA mappings is not allowed.
525 */
526int iopt_unmap_iova(struct io_pagetable *iopt, unsigned long iova,
527 unsigned long length, unsigned long *unmapped)
528{
529 unsigned long iova_last;
530
531 if (!length)
532 return -EINVAL;
533
534 if (check_add_overflow(iova, length - 1, &iova_last))
535 return -EOVERFLOW;
536
537 return iopt_unmap_iova_range(iopt, iova, iova_last, unmapped);
538}
539
540int iopt_unmap_all(struct io_pagetable *iopt, unsigned long *unmapped)
541{
542 int rc;
543
544 rc = iopt_unmap_iova_range(iopt, 0, ULONG_MAX, unmapped);
545 /* If the IOVAs are empty then unmap all succeeds */
546 if (rc == -ENOENT)
547 return 0;
548 return rc;
549}
550
551/* The caller must always free all the nodes in the allowed_iova rb_root. */
552int iopt_set_allow_iova(struct io_pagetable *iopt,
553 struct rb_root_cached *allowed_iova)
554{
555 struct iopt_allowed *allowed;
556
557 down_write(&iopt->iova_rwsem);
558 swap(*allowed_iova, iopt->allowed_itree);
559
560 for (allowed = iopt_allowed_iter_first(iopt, 0, ULONG_MAX); allowed;
561 allowed = iopt_allowed_iter_next(allowed, 0, ULONG_MAX)) {
562 if (iopt_reserved_iter_first(iopt, allowed->node.start,
563 allowed->node.last)) {
564 swap(*allowed_iova, iopt->allowed_itree);
565 up_write(&iopt->iova_rwsem);
566 return -EADDRINUSE;
567 }
568 }
569 up_write(&iopt->iova_rwsem);
570 return 0;
571}
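/*
 * A sketch of how a caller could build the allowed_iova tree consumed above,
 * assuming struct iopt_allowed simply wraps an interval_tree_node (only
 * node.start/node.last are used in this file); the values are illustrative:
 *
 *	struct rb_root_cached allowed_iova = RB_ROOT_CACHED;
 *	struct iopt_allowed *allowed;
 *	int rc;
 *
 *	allowed = kzalloc(sizeof(*allowed), GFP_KERNEL_ACCOUNT);
 *	if (!allowed)
 *		return -ENOMEM;
 *	allowed->node.start = 0x100000;
 *	allowed->node.last = 0x1fffff;
 *	interval_tree_insert(&allowed->node, &allowed_iova);
 *
 *	rc = iopt_set_allow_iova(iopt, &allowed_iova);
 *
 * On success the ranges are swapped into iopt->allowed_itree; either way the
 * caller still frees whatever nodes remain in allowed_iova.
 */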
572
573int iopt_reserve_iova(struct io_pagetable *iopt, unsigned long start,
574 unsigned long last, void *owner)
575{
576 struct iopt_reserved *reserved;
577
578 lockdep_assert_held_write(&iopt->iova_rwsem);
579
580 if (iopt_area_iter_first(iopt, start, last) ||
581 iopt_allowed_iter_first(iopt, start, last))
582 return -EADDRINUSE;
583
584 reserved = kzalloc(sizeof(*reserved), GFP_KERNEL_ACCOUNT);
585 if (!reserved)
586 return -ENOMEM;
587 reserved->node.start = start;
588 reserved->node.last = last;
589 reserved->owner = owner;
590 interval_tree_insert(&reserved->node, &iopt->reserved_itree);
591 return 0;
592}
593
594static void __iopt_remove_reserved_iova(struct io_pagetable *iopt, void *owner)
595{
596 struct iopt_reserved *reserved, *next;
597
598 lockdep_assert_held_write(&iopt->iova_rwsem);
599
600 for (reserved = iopt_reserved_iter_first(iopt, 0, ULONG_MAX); reserved;
601 reserved = next) {
602 next = iopt_reserved_iter_next(reserved, 0, ULONG_MAX);
603
604 if (reserved->owner == owner) {
605 interval_tree_remove(&reserved->node,
606 &iopt->reserved_itree);
607 kfree(reserved);
608 }
609 }
610}
611
612void iopt_remove_reserved_iova(struct io_pagetable *iopt, void *owner)
613{
614 down_write(&iopt->iova_rwsem);
615 __iopt_remove_reserved_iova(iopt, owner);
616 up_write(&iopt->iova_rwsem);
617}
618
619void iopt_init_table(struct io_pagetable *iopt)
620{
621 init_rwsem(&iopt->iova_rwsem);
622 init_rwsem(&iopt->domains_rwsem);
623 iopt->area_itree = RB_ROOT_CACHED;
624 iopt->allowed_itree = RB_ROOT_CACHED;
625 iopt->reserved_itree = RB_ROOT_CACHED;
626 xa_init_flags(&iopt->domains, XA_FLAGS_ACCOUNT);
627 xa_init_flags(&iopt->access_list, XA_FLAGS_ALLOC);
628
629 /*
630 * iopts start as SW tables that can use the entire size_t IOVA space
631 * due to the use of size_t in the APIs. They have no alignment
632 * restriction.
633 */
634 iopt->iova_alignment = 1;
635}
636
637void iopt_destroy_table(struct io_pagetable *iopt)
638{
639 struct interval_tree_node *node;
640
641 while ((node = interval_tree_iter_first(&iopt->allowed_itree, 0,
642 ULONG_MAX))) {
643 interval_tree_remove(node, &iopt->allowed_itree);
644 kfree(container_of(node, struct iopt_allowed, node));
645 }
646
647 WARN_ON(!RB_EMPTY_ROOT(&iopt->reserved_itree.rb_root));
648 WARN_ON(!xa_empty(&iopt->domains));
649 WARN_ON(!xa_empty(&iopt->access_list));
650 WARN_ON(!RB_EMPTY_ROOT(&iopt->area_itree.rb_root));
651}
652
653/**
654 * iopt_unfill_domain() - Unfill a domain with PFNs
655 * @iopt: io_pagetable to act on
656 * @domain: domain to unfill
657 *
658 * This is used when removing a domain from the iopt. Every area in the iopt
659 * will be unmapped from the domain. The domain must already be removed from the
660 * domains xarray.
661 */
662static void iopt_unfill_domain(struct io_pagetable *iopt,
663 struct iommu_domain *domain)
664{
665 struct iopt_area *area;
666
667 lockdep_assert_held(&iopt->iova_rwsem);
668 lockdep_assert_held_write(&iopt->domains_rwsem);
669
670 /*
671 * Some other domain is still holding all the pfns, so rapidly unmap this
672 * domain.
673 */
674 if (iopt->next_domain_id != 0) {
675 /* Pick an arbitrary remaining domain to act as storage */
676 struct iommu_domain *storage_domain =
677 xa_load(&iopt->domains, 0);
678
679 for (area = iopt_area_iter_first(iopt, 0, ULONG_MAX); area;
680 area = iopt_area_iter_next(area, 0, ULONG_MAX)) {
681 struct iopt_pages *pages = area->pages;
682
683 if (!pages)
684 continue;
685
686 mutex_lock(&pages->mutex);
687 if (area->storage_domain == domain)
688 area->storage_domain = storage_domain;
689 mutex_unlock(&pages->mutex);
690
691 iopt_area_unmap_domain(area, domain);
692 }
693 return;
694 }
695
696 for (area = iopt_area_iter_first(iopt, 0, ULONG_MAX); area;
697 area = iopt_area_iter_next(area, 0, ULONG_MAX)) {
698 struct iopt_pages *pages = area->pages;
699
700 if (!pages)
701 continue;
702
703 mutex_lock(&pages->mutex);
704 interval_tree_remove(&area->pages_node, &pages->domains_itree);
705 WARN_ON(area->storage_domain != domain);
706 area->storage_domain = NULL;
707 iopt_area_unfill_domain(area, pages, domain);
708 mutex_unlock(&pages->mutex);
709 }
710}
711
712/**
713 * iopt_fill_domain() - Fill a domain with PFNs
714 * @iopt: io_pagetable to act on
715 * @domain: domain to fill
716 *
717 * Fill the domain with PFNs from every area in the iopt. On failure the domain
718 * is left unchanged.
719 */
720static int iopt_fill_domain(struct io_pagetable *iopt,
721 struct iommu_domain *domain)
722{
723 struct iopt_area *end_area;
724 struct iopt_area *area;
725 int rc;
726
727 lockdep_assert_held(&iopt->iova_rwsem);
728 lockdep_assert_held_write(&iopt->domains_rwsem);
729
730 for (area = iopt_area_iter_first(iopt, 0, ULONG_MAX); area;
731 area = iopt_area_iter_next(area, 0, ULONG_MAX)) {
732 struct iopt_pages *pages = area->pages;
733
734 if (!pages)
735 continue;
736
737 mutex_lock(&pages->mutex);
738 rc = iopt_area_fill_domain(area, domain);
739 if (rc) {
740 mutex_unlock(&pages->mutex);
741 goto out_unfill;
742 }
743 if (!area->storage_domain) {
744 WARN_ON(iopt->next_domain_id != 0);
745 area->storage_domain = domain;
746 interval_tree_insert(&area->pages_node,
747 &pages->domains_itree);
748 }
749 mutex_unlock(&pages->mutex);
750 }
751 return 0;
752
753out_unfill:
754 end_area = area;
755 for (area = iopt_area_iter_first(iopt, 0, ULONG_MAX); area;
756 area = iopt_area_iter_next(area, 0, ULONG_MAX)) {
757 struct iopt_pages *pages = area->pages;
758
759 if (area == end_area)
760 break;
761 if (!pages)
762 continue;
763 mutex_lock(&pages->mutex);
764 if (iopt->next_domain_id == 0) {
765 interval_tree_remove(&area->pages_node,
766 &pages->domains_itree);
767 area->storage_domain = NULL;
768 }
769 iopt_area_unfill_domain(area, pages, domain);
770 mutex_unlock(&pages->mutex);
771 }
772 return rc;
773}
774
775/* All existing areas must conform to an increased page size */
776static int iopt_check_iova_alignment(struct io_pagetable *iopt,
777 unsigned long new_iova_alignment)
778{
779 unsigned long align_mask = new_iova_alignment - 1;
780 struct iopt_area *area;
781
782 lockdep_assert_held(&iopt->iova_rwsem);
783 lockdep_assert_held(&iopt->domains_rwsem);
784
785 for (area = iopt_area_iter_first(iopt, 0, ULONG_MAX); area;
786 area = iopt_area_iter_next(area, 0, ULONG_MAX))
787 if ((iopt_area_iova(area) & align_mask) ||
788 (iopt_area_length(area) & align_mask) ||
789 (area->page_offset & align_mask))
790 return -EADDRINUSE;
791 return 0;
792}
793
794int iopt_table_add_domain(struct io_pagetable *iopt,
795 struct iommu_domain *domain)
796{
797 const struct iommu_domain_geometry *geometry = &domain->geometry;
798 struct iommu_domain *iter_domain;
799 unsigned int new_iova_alignment;
800 unsigned long index;
801 int rc;
802
803 down_write(&iopt->domains_rwsem);
804 down_write(&iopt->iova_rwsem);
805
806 xa_for_each(&iopt->domains, index, iter_domain) {
807 if (WARN_ON(iter_domain == domain)) {
808 rc = -EEXIST;
809 goto out_unlock;
810 }
811 }
812
813 /*
814 * The io page size drives the iova_alignment. Internally the iopt_pages
815 * works in PAGE_SIZE units and we adjust when mapping sub-PAGE_SIZE
816 * objects into the iommu_domain.
817 *
818 * An iommu_domain must always be able to accept PAGE_SIZE to be
819 * compatible as we can't guarantee higher contiguity.
820 */
821 new_iova_alignment = max_t(unsigned long,
822 1UL << __ffs(domain->pgsize_bitmap),
823 iopt->iova_alignment);
824 if (new_iova_alignment > PAGE_SIZE) {
825 rc = -EINVAL;
826 goto out_unlock;
827 }
828 if (new_iova_alignment != iopt->iova_alignment) {
829 rc = iopt_check_iova_alignment(iopt, new_iova_alignment);
830 if (rc)
831 goto out_unlock;
832 }
833
834 /* No area exists that is outside the allowed domain aperture */
835 if (geometry->aperture_start != 0) {
836 rc = iopt_reserve_iova(iopt, 0, geometry->aperture_start - 1,
837 domain);
838 if (rc)
839 goto out_reserved;
840 }
841 if (geometry->aperture_end != ULONG_MAX) {
842 rc = iopt_reserve_iova(iopt, geometry->aperture_end + 1,
843 ULONG_MAX, domain);
844 if (rc)
845 goto out_reserved;
846 }
847
848 rc = xa_reserve(&iopt->domains, iopt->next_domain_id, GFP_KERNEL);
849 if (rc)
850 goto out_reserved;
851
852 rc = iopt_fill_domain(iopt, domain);
853 if (rc)
854 goto out_release;
855
856 iopt->iova_alignment = new_iova_alignment;
857 xa_store(&iopt->domains, iopt->next_domain_id, domain, GFP_KERNEL);
858 iopt->next_domain_id++;
859 up_write(&iopt->iova_rwsem);
860 up_write(&iopt->domains_rwsem);
861 return 0;
862out_release:
863 xa_release(&iopt->domains, iopt->next_domain_id);
864out_reserved:
865 __iopt_remove_reserved_iova(iopt, domain);
866out_unlock:
867 up_write(&iopt->iova_rwsem);
868 up_write(&iopt->domains_rwsem);
869 return rc;
870}
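/*
 * A rough example of the aperture handling above, with hypothetical numbers:
 * attaching a domain whose geometry is [0x1000, 0xffffffff] reserves
 * [0, 0xfff] and [0x100000000, ULONG_MAX] with the domain as owner, so no
 * area can ever be mapped outside what the domain can translate. If an
 * existing area already overlaps one of those ranges, iopt_reserve_iova()
 * returns -EADDRINUSE and the attach is rejected.
 */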
871
872static int iopt_calculate_iova_alignment(struct io_pagetable *iopt)
873{
874 unsigned long new_iova_alignment;
875 struct iommufd_access *access;
876 struct iommu_domain *domain;
877 unsigned long index;
878
879 lockdep_assert_held_write(&iopt->iova_rwsem);
880 lockdep_assert_held(&iopt->domains_rwsem);
881
882 /* See batch_iommu_map_small() */
883 if (iopt->disable_large_pages)
884 new_iova_alignment = PAGE_SIZE;
885 else
886 new_iova_alignment = 1;
887
888 xa_for_each(&iopt->domains, index, domain)
889 new_iova_alignment = max_t(unsigned long,
890 1UL << __ffs(domain->pgsize_bitmap),
891 new_iova_alignment);
892 xa_for_each(&iopt->access_list, index, access)
893 new_iova_alignment = max_t(unsigned long,
894 access->iova_alignment,
895 new_iova_alignment);
896
897 if (new_iova_alignment > iopt->iova_alignment) {
898 int rc;
899
900 rc = iopt_check_iova_alignment(iopt, new_iova_alignment);
901 if (rc)
902 return rc;
903 }
904 iopt->iova_alignment = new_iova_alignment;
905 return 0;
906}
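/*
 * A worked example for the recalculation above, using hypothetical values:
 * with large pages enabled, one attached domain whose pgsize_bitmap is
 * 0x40201000 (4KiB smallest page) and another whose bitmap is 0x20010000
 * (64KiB smallest page) yield 1UL << __ffs() of 0x1000 and 0x10000, so
 * new_iova_alignment becomes 0x10000. Growing the alignment only succeeds if
 * every existing area's iova, length and page_offset already honour it.
 */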
907
908void iopt_table_remove_domain(struct io_pagetable *iopt,
909 struct iommu_domain *domain)
910{
911 struct iommu_domain *iter_domain = NULL;
912 unsigned long index;
913
914 down_write(&iopt->domains_rwsem);
915 down_write(&iopt->iova_rwsem);
916
917 xa_for_each(&iopt->domains, index, iter_domain)
918 if (iter_domain == domain)
919 break;
920 if (WARN_ON(iter_domain != domain) || index >= iopt->next_domain_id)
921 goto out_unlock;
922
923 /*
924 * Compress the xarray to keep it linear by swapping the entry to erase
925 * with the tail entry and shrinking the tail.
926 */
927 iopt->next_domain_id--;
928 iter_domain = xa_erase(&iopt->domains, iopt->next_domain_id);
929 if (index != iopt->next_domain_id)
930 xa_store(&iopt->domains, index, iter_domain, GFP_KERNEL);
931
932 iopt_unfill_domain(iopt, domain);
933 __iopt_remove_reserved_iova(iopt, domain);
934
935 WARN_ON(iopt_calculate_iova_alignment(iopt));
936out_unlock:
937 up_write(&iopt->iova_rwsem);
938 up_write(&iopt->domains_rwsem);
939}
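/*
 * A small example of the xarray compression above, with hypothetical
 * contents: if the domains xarray holds {0: A, 1: B, 2: C} and B is removed,
 * next_domain_id drops to 2, C is erased from index 2 and stored at index 1,
 * leaving {0: A, 1: C} so the indices stay densely packed from 0.
 */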
940
941/**
942 * iopt_area_split - Split an area into two parts at iova
943 * @area: The area to split
944 * @iova: Becomes the last iova of the left-hand new area
945 *
946 * This splits an area into two. It is part of the VFIO compatibility to allow
947 * poking a hole in the mapping. The two areas continue to point at the same
948 * iopt_pages, just with different starting bytes.
949 */
950static int iopt_area_split(struct iopt_area *area, unsigned long iova)
951{
952 unsigned long alignment = area->iopt->iova_alignment;
953 unsigned long last_iova = iopt_area_last_iova(area);
954 unsigned long start_iova = iopt_area_iova(area);
955 unsigned long new_start = iova + 1;
956 struct io_pagetable *iopt = area->iopt;
957 struct iopt_pages *pages = area->pages;
958 struct iopt_area *lhs;
959 struct iopt_area *rhs;
960 int rc;
961
962 lockdep_assert_held_write(&iopt->iova_rwsem);
963
964 if (iova == start_iova || iova == last_iova)
965 return 0;
966
967 if (!pages || area->prevent_access)
968 return -EBUSY;
969
970 if (new_start & (alignment - 1) ||
971 iopt_area_start_byte(area, new_start) & (alignment - 1))
972 return -EINVAL;
973
974 lhs = kzalloc(sizeof(*area), GFP_KERNEL_ACCOUNT);
975 if (!lhs)
976 return -ENOMEM;
977
978 rhs = kzalloc(sizeof(*area), GFP_KERNEL_ACCOUNT);
979 if (!rhs) {
980 rc = -ENOMEM;
981 goto err_free_lhs;
982 }
983
984 mutex_lock(&pages->mutex);
985 /*
986 * Splitting is not permitted if an access exists; we don't track enough
987 * information to split existing accesses.
988 */
989 if (area->num_accesses) {
990 rc = -EINVAL;
991 goto err_unlock;
992 }
993
994 /*
995 * Splitting is not permitted if a domain could have been mapped with
996 * huge pages.
997 */
998 if (area->storage_domain && !iopt->disable_large_pages) {
999 rc = -EINVAL;
1000 goto err_unlock;
1001 }
1002
1003 interval_tree_remove(&area->node, &iopt->area_itree);
1004 rc = iopt_insert_area(iopt, lhs, area->pages, start_iova,
1005 iopt_area_start_byte(area, start_iova),
1006 (new_start - 1) - start_iova + 1,
1007 area->iommu_prot);
1008 if (WARN_ON(rc))
1009 goto err_insert;
1010
1011 rc = iopt_insert_area(iopt, rhs, area->pages, new_start,
1012 iopt_area_start_byte(area, new_start),
1013 last_iova - new_start + 1, area->iommu_prot);
1014 if (WARN_ON(rc))
1015 goto err_remove_lhs;
1016
1017 lhs->storage_domain = area->storage_domain;
1018 lhs->pages = area->pages;
1019 rhs->storage_domain = area->storage_domain;
1020 rhs->pages = area->pages;
1021 kref_get(&rhs->pages->kref);
1022 kfree(area);
1023 mutex_unlock(&pages->mutex);
1024
1025 /*
1026 * No change to domains or accesses because the pages haven't been
1027 * changed.
1028 */
1029 return 0;
1030
1031err_remove_lhs:
1032 interval_tree_remove(&lhs->node, &iopt->area_itree);
1033err_insert:
1034 interval_tree_insert(&area->node, &iopt->area_itree);
1035err_unlock:
1036 mutex_unlock(&pages->mutex);
1037 kfree(rhs);
1038err_free_lhs:
1039 kfree(lhs);
1040 return rc;
1041}
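/*
 * A worked example of the split above, using hypothetical values: an area
 * covering IOVA [0x0, 0x1ffff] cut at iova = 0xffff becomes a left area
 * [0x0, 0xffff] and a right area [0x10000, 0x1ffff]. Both point at the same
 * iopt_pages, with start bytes 0x10000 apart, and the pages kref is taken
 * once more for the extra area. The domains are untouched because the
 * underlying page mappings do not change.
 */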
1042
1043int iopt_cut_iova(struct io_pagetable *iopt, unsigned long *iovas,
1044 size_t num_iovas)
1045{
1046 int rc = 0;
1047 int i;
1048
1049 down_write(&iopt->iova_rwsem);
1050 for (i = 0; i < num_iovas; i++) {
1051 struct iopt_area *area;
1052
1053 area = iopt_area_iter_first(iopt, iovas[i], iovas[i]);
1054 if (!area)
1055 continue;
1056 rc = iopt_area_split(area, iovas[i]);
1057 if (rc)
1058 break;
1059 }
1060 up_write(&iopt->iova_rwsem);
1061 return rc;
1062}
1063
1064void iopt_enable_large_pages(struct io_pagetable *iopt)
1065{
1066 int rc;
1067
1068 down_write(&iopt->domains_rwsem);
1069 down_write(&iopt->iova_rwsem);
1070 WRITE_ONCE(iopt->disable_large_pages, false);
1071 rc = iopt_calculate_iova_alignment(iopt);
1072 WARN_ON(rc);
1073 up_write(&iopt->iova_rwsem);
1074 up_write(&iopt->domains_rwsem);
1075}
1076
1077int iopt_disable_large_pages(struct io_pagetable *iopt)
1078{
1079 int rc = 0;
1080
1081 down_write(&iopt->domains_rwsem);
1082 down_write(&iopt->iova_rwsem);
1083 if (iopt->disable_large_pages)
1084 goto out_unlock;
1085
1086 /* Won't do it if domains already have pages mapped in them */
1087 if (!xa_empty(&iopt->domains) &&
1088 !RB_EMPTY_ROOT(&iopt->area_itree.rb_root)) {
1089 rc = -EINVAL;
1090 goto out_unlock;
1091 }
1092
1093 WRITE_ONCE(iopt->disable_large_pages, true);
1094 rc = iopt_calculate_iova_alignment(iopt);
1095 if (rc)
1096 WRITE_ONCE(iopt->disable_large_pages, false);
1097out_unlock:
1098 up_write(&iopt->iova_rwsem);
1099 up_write(&iopt->domains_rwsem);
1100 return rc;
1101}
1102
1103int iopt_add_access(struct io_pagetable *iopt, struct iommufd_access *access)
1104{
1105 int rc;
1106
1107 down_write(&iopt->domains_rwsem);
1108 down_write(&iopt->iova_rwsem);
1109 rc = xa_alloc(&iopt->access_list, &access->iopt_access_list_id, access,
1110 xa_limit_16b, GFP_KERNEL_ACCOUNT);
1111 if (rc)
1112 goto out_unlock;
1113
1114 rc = iopt_calculate_iova_alignment(iopt);
1115 if (rc) {
1116 xa_erase(&iopt->access_list, access->iopt_access_list_id);
1117 goto out_unlock;
1118 }
1119
1120out_unlock:
1121 up_write(&iopt->iova_rwsem);
1122 up_write(&iopt->domains_rwsem);
1123 return rc;
1124}
1125
1126void iopt_remove_access(struct io_pagetable *iopt,
1127 struct iommufd_access *access)
1128{
1129 down_write(&iopt->domains_rwsem);
1130 down_write(&iopt->iova_rwsem);
1131 WARN_ON(xa_erase(&iopt->access_list, access->iopt_access_list_id) !=
1132 access);
1133 WARN_ON(iopt_calculate_iova_alignment(iopt));
1134 up_write(&iopt->iova_rwsem);
1135 up_write(&iopt->domains_rwsem);
1136}
1137
1138/* Narrow the usable IOVA space by reserving a group's reserved ranges. */
1139int iopt_table_enforce_group_resv_regions(struct io_pagetable *iopt,
1140 struct device *device,
1141 struct iommu_group *group,
1142 phys_addr_t *sw_msi_start)
1143{
1144 struct iommu_resv_region *resv;
1145 struct iommu_resv_region *tmp;
1146 LIST_HEAD(group_resv_regions);
1147 int rc;
1148
1149 down_write(&iopt->iova_rwsem);
1150 rc = iommu_get_group_resv_regions(group, &group_resv_regions);
1151 if (rc)
1152 goto out_unlock;
1153
1154 list_for_each_entry(resv, &group_resv_regions, list) {
1155 if (resv->type == IOMMU_RESV_DIRECT_RELAXABLE)
1156 continue;
1157
1158 /*
1159 * The presence of any 'real' MSI regions should take precedence
1160 * over the software-managed one if the IOMMU driver happens to
1161 * advertise both types.
1162 */
1163 if (sw_msi_start && resv->type == IOMMU_RESV_MSI) {
1164 *sw_msi_start = 0;
1165 sw_msi_start = NULL;
1166 }
1167 if (sw_msi_start && resv->type == IOMMU_RESV_SW_MSI)
1168 *sw_msi_start = resv->start;
1169
1170 rc = iopt_reserve_iova(iopt, resv->start,
1171 resv->length - 1 + resv->start, device);
1172 if (rc)
1173 goto out_reserved;
1174 }
1175 rc = 0;
1176 goto out_free_resv;
1177
1178out_reserved:
1179 __iopt_remove_reserved_iova(iopt, device);
1180out_free_resv:
1181 list_for_each_entry_safe(resv, tmp, &group_resv_regions, list)
1182 kfree(resv);
1183out_unlock:
1184 up_write(&iopt->iova_rwsem);
1185 return rc;
1186}
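/*
 * A rough example of the behaviour above, with hypothetical regions: if the
 * group reports an IOMMU_RESV_SW_MSI region at 0xfee00000 and a direct
 * region at 0xa0000, the SW MSI base is returned through *sw_msi_start and
 * both ranges become reserved IOVA owned by the device. If a 'real'
 * IOMMU_RESV_MSI region is also reported, *sw_msi_start is cleared instead
 * and only the hardware-provided ranges matter. IOMMU_RESV_DIRECT_RELAXABLE
 * regions are skipped entirely.
 */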