drivers/iommu/iommufd/io_pagetable.c
1 // SPDX-License-Identifier: GPL-2.0
2 /* Copyright (c) 2021-2022, NVIDIA CORPORATION & AFFILIATES.
3  *
4  * The io_pagetable is the top of the data structure that maps IOVAs to PFNs. The
5  * PFNs can be placed into an iommu_domain, or returned to the caller as a page
6  * list for access by an in-kernel user.
7  *
8  * The data structure uses iopt_pages to optimize the storage of the PFNs
9  * between the domains and the xarray.
10  */
11 #include <linux/iommufd.h>
12 #include <linux/lockdep.h>
13 #include <linux/iommu.h>
14 #include <linux/sched/mm.h>
15 #include <linux/err.h>
16 #include <linux/slab.h>
17 #include <linux/errno.h>
18 #include <uapi/linux/iommufd.h>
19
20 #include "io_pagetable.h"
21 #include "double_span.h"
22
23 struct iopt_pages_list {
24         struct iopt_pages *pages;
25         struct iopt_area *area;
26         struct list_head next;
27         unsigned long start_byte;
28         unsigned long length;
29 };
30
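/*
 * Begin iterating over the areas that cover the range [iova, last_iova].
 * Returns NULL if there is no area at iova or the area is not fully
 * initialized (no pages yet). The caller must hold the iova_rwsem.
 */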
31 struct iopt_area *iopt_area_contig_init(struct iopt_area_contig_iter *iter,
32                                         struct io_pagetable *iopt,
33                                         unsigned long iova,
34                                         unsigned long last_iova)
35 {
36         lockdep_assert_held(&iopt->iova_rwsem);
37
38         iter->cur_iova = iova;
39         iter->last_iova = last_iova;
40         iter->area = iopt_area_iter_first(iopt, iova, iova);
41         if (!iter->area)
42                 return NULL;
43         if (!iter->area->pages) {
44                 iter->area = NULL;
45                 return NULL;
46         }
47         return iter->area;
48 }
49
50 struct iopt_area *iopt_area_contig_next(struct iopt_area_contig_iter *iter)
51 {
52         unsigned long last_iova;
53
54         if (!iter->area)
55                 return NULL;
56         last_iova = iopt_area_last_iova(iter->area);
57         if (iter->last_iova <= last_iova)
58                 return NULL;
59
60         iter->cur_iova = last_iova + 1;
61         iter->area = iopt_area_iter_next(iter->area, iter->cur_iova,
62                                          iter->last_iova);
63         if (!iter->area)
64                 return NULL;
65         if (iter->cur_iova != iopt_area_iova(iter->area) ||
66             !iter->area->pages) {
67                 iter->area = NULL;
68                 return NULL;
69         }
70         return iter->area;
71 }
72
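/*
 * Check whether this hole span can fit an allocation of @length bytes once the
 * start is rounded up to iova_alignment and offset by page_offset. On success
 * start_hole is adjusted to the usable starting IOVA.
 */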
73 static bool __alloc_iova_check_hole(struct interval_tree_double_span_iter *span,
74                                     unsigned long length,
75                                     unsigned long iova_alignment,
76                                     unsigned long page_offset)
77 {
78         if (span->is_used || span->last_hole - span->start_hole < length - 1)
79                 return false;
80
81         span->start_hole = ALIGN(span->start_hole, iova_alignment) |
82                            page_offset;
83         if (span->start_hole > span->last_hole ||
84             span->last_hole - span->start_hole < length - 1)
85                 return false;
86         return true;
87 }
88
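/* As above, but checks a used span of the allowed_itree (an allowed range) */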
89 static bool __alloc_iova_check_used(struct interval_tree_span_iter *span,
90                                     unsigned long length,
91                                     unsigned long iova_alignment,
92                                     unsigned long page_offset)
93 {
94         if (span->is_hole || span->last_used - span->start_used < length - 1)
95                 return false;
96
97         span->start_used = ALIGN(span->start_used, iova_alignment) |
98                            page_offset;
99         if (span->start_used > span->last_used ||
100             span->last_used - span->start_used < length - 1)
101                 return false;
102         return true;
103 }
104
105 /*
106  * Automatically find a block of IOVA that is not being used and not reserved.
107  * Does not return a 0 IOVA even if it is valid.
108  */
109 static int iopt_alloc_iova(struct io_pagetable *iopt, unsigned long *iova,
110                            unsigned long uptr, unsigned long length)
111 {
112         unsigned long page_offset = uptr % PAGE_SIZE;
113         struct interval_tree_double_span_iter used_span;
114         struct interval_tree_span_iter allowed_span;
115         unsigned long iova_alignment;
116
117         lockdep_assert_held(&iopt->iova_rwsem);
118
119         /* Protect roundup_pow_of_two() from overflow */
120         if (length == 0 || length >= ULONG_MAX / 2)
121                 return -EOVERFLOW;
122
123         /*
124          * Keep alignment present in the uptr when building the IOVA; this
125          * increases the chance we can map a THP.
126          */
127         if (!uptr)
128                 iova_alignment = roundup_pow_of_two(length);
129         else
130                 iova_alignment = min_t(unsigned long,
131                                        roundup_pow_of_two(length),
132                                        1UL << __ffs64(uptr));
133
134         if (iova_alignment < iopt->iova_alignment)
135                 return -EINVAL;
136
137         interval_tree_for_each_span(&allowed_span, &iopt->allowed_itree,
138                                     PAGE_SIZE, ULONG_MAX - PAGE_SIZE) {
139                 if (RB_EMPTY_ROOT(&iopt->allowed_itree.rb_root)) {
140                         allowed_span.start_used = PAGE_SIZE;
141                         allowed_span.last_used = ULONG_MAX - PAGE_SIZE;
142                         allowed_span.is_hole = false;
143                 }
144
145                 if (!__alloc_iova_check_used(&allowed_span, length,
146                                              iova_alignment, page_offset))
147                         continue;
148
149                 interval_tree_for_each_double_span(
150                         &used_span, &iopt->reserved_itree, &iopt->area_itree,
151                         allowed_span.start_used, allowed_span.last_used) {
152                         if (!__alloc_iova_check_hole(&used_span, length,
153                                                      iova_alignment,
154                                                      page_offset))
155                                 continue;
156
157                         *iova = used_span.start_hole;
158                         return 0;
159                 }
160         }
161         return -ENOSPC;
162 }
163
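/*
 * Validate a caller-chosen IOVA range: it must be aligned, must not overflow,
 * must not intersect reserved IOVA and must not already be mapped.
 */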
164 static int iopt_check_iova(struct io_pagetable *iopt, unsigned long iova,
165                            unsigned long length)
166 {
167         unsigned long last;
168
169         lockdep_assert_held(&iopt->iova_rwsem);
170
171         if ((iova & (iopt->iova_alignment - 1)))
172                 return -EINVAL;
173
174         if (check_add_overflow(iova, length - 1, &last))
175                 return -EOVERFLOW;
176
177         /* No reserved IOVA intersects the range */
178         if (iopt_reserved_iter_first(iopt, iova, last))
179                 return -EINVAL;
180
181         /* Check that there is not already a mapping in the range */
182         if (iopt_area_iter_first(iopt, iova, last))
183                 return -EEXIST;
184         return 0;
185 }
186
187 /*
188  * The area takes a slice of the pages from start_byte to start_byte + length
189  */
190 static int iopt_insert_area(struct io_pagetable *iopt, struct iopt_area *area,
191                             struct iopt_pages *pages, unsigned long iova,
192                             unsigned long start_byte, unsigned long length,
193                             int iommu_prot)
194 {
195         lockdep_assert_held_write(&iopt->iova_rwsem);
196
197         if ((iommu_prot & IOMMU_WRITE) && !pages->writable)
198                 return -EPERM;
199
200         area->iommu_prot = iommu_prot;
201         area->page_offset = start_byte % PAGE_SIZE;
202         if (area->page_offset & (iopt->iova_alignment - 1))
203                 return -EINVAL;
204
205         area->node.start = iova;
206         if (check_add_overflow(iova, length - 1, &area->node.last))
207                 return -EOVERFLOW;
208
209         area->pages_node.start = start_byte / PAGE_SIZE;
210         if (check_add_overflow(start_byte, length - 1, &area->pages_node.last))
211                 return -EOVERFLOW;
212         area->pages_node.last = area->pages_node.last / PAGE_SIZE;
213         if (WARN_ON(area->pages_node.last >= pages->npages))
214                 return -EOVERFLOW;
215
216         /*
217          * The area is inserted with a NULL pages indicating it is not fully
218          * initialized yet.
219          */
220         area->iopt = iopt;
221         interval_tree_insert(&area->node, &iopt->area_itree);
222         return 0;
223 }
224
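/* Allocate an area with both interval tree nodes marked as not inserted */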
225 static struct iopt_area *iopt_area_alloc(void)
226 {
227         struct iopt_area *area;
228
229         area = kzalloc(sizeof(*area), GFP_KERNEL_ACCOUNT);
230         if (!area)
231                 return NULL;
232         RB_CLEAR_NODE(&area->node.rb);
233         RB_CLEAR_NODE(&area->pages_node.rb);
234         return area;
235 }
236
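/*
 * Allocate an iopt_area for each entry in the pages_list and insert it into
 * the area_itree to reserve the IOVA range. With IOPT_ALLOC_IOVA a free range
 * is chosen and returned in dst_iova, otherwise *dst_iova is validated. The
 * areas are left with a NULL pages pointer.
 */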
237 static int iopt_alloc_area_pages(struct io_pagetable *iopt,
238                                  struct list_head *pages_list,
239                                  unsigned long length, unsigned long *dst_iova,
240                                  int iommu_prot, unsigned int flags)
241 {
242         struct iopt_pages_list *elm;
243         unsigned long iova;
244         int rc = 0;
245
246         list_for_each_entry(elm, pages_list, next) {
247                 elm->area = iopt_area_alloc();
248                 if (!elm->area)
249                         return -ENOMEM;
250         }
251
252         down_write(&iopt->iova_rwsem);
253         if ((length & (iopt->iova_alignment - 1)) || !length) {
254                 rc = -EINVAL;
255                 goto out_unlock;
256         }
257
258         if (flags & IOPT_ALLOC_IOVA) {
259                 /* Use the first entry to guess the ideal IOVA alignment */
260                 elm = list_first_entry(pages_list, struct iopt_pages_list,
261                                        next);
262                 rc = iopt_alloc_iova(
263                         iopt, dst_iova,
264                         (uintptr_t)elm->pages->uptr + elm->start_byte, length);
265                 if (rc)
266                         goto out_unlock;
267                 if (IS_ENABLED(CONFIG_IOMMUFD_TEST) &&
268                     WARN_ON(iopt_check_iova(iopt, *dst_iova, length))) {
269                         rc = -EINVAL;
270                         goto out_unlock;
271                 }
272         } else {
273                 rc = iopt_check_iova(iopt, *dst_iova, length);
274                 if (rc)
275                         goto out_unlock;
276         }
277
278         /*
279          * Areas are created with a NULL pages so that the IOVA space is
280          * reserved and we can unlock the iova_rwsem.
281          */
282         iova = *dst_iova;
283         list_for_each_entry(elm, pages_list, next) {
284                 rc = iopt_insert_area(iopt, elm->area, elm->pages, iova,
285                                       elm->start_byte, elm->length, iommu_prot);
286                 if (rc)
287                         goto out_unlock;
288                 iova += elm->length;
289         }
290
291 out_unlock:
292         up_write(&iopt->iova_rwsem);
293         return rc;
294 }
295
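/* Undo iopt_insert_area() for an area whose pages were never attached */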
296 static void iopt_abort_area(struct iopt_area *area)
297 {
298         if (IS_ENABLED(CONFIG_IOMMUFD_TEST))
299                 WARN_ON(area->pages);
300         if (area->iopt) {
301                 down_write(&area->iopt->iova_rwsem);
302                 interval_tree_remove(&area->node, &area->iopt->area_itree);
303                 up_write(&area->iopt->iova_rwsem);
304         }
305         kfree(area);
306 }
307
308 void iopt_free_pages_list(struct list_head *pages_list)
309 {
310         struct iopt_pages_list *elm;
311
312         while ((elm = list_first_entry_or_null(pages_list,
313                                                struct iopt_pages_list, next))) {
314                 if (elm->area)
315                         iopt_abort_area(elm->area);
316                 if (elm->pages)
317                         iopt_put_pages(elm->pages);
318                 list_del(&elm->next);
319                 kfree(elm);
320         }
321 }
322
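/*
 * Fill every attached domain with the PFNs of each area in the list. On
 * failure the areas that were already filled are unfilled again.
 */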
323 static int iopt_fill_domains_pages(struct list_head *pages_list)
324 {
325         struct iopt_pages_list *undo_elm;
326         struct iopt_pages_list *elm;
327         int rc;
328
329         list_for_each_entry(elm, pages_list, next) {
330                 rc = iopt_area_fill_domains(elm->area, elm->pages);
331                 if (rc)
332                         goto err_undo;
333         }
334         return 0;
335
336 err_undo:
337         list_for_each_entry(undo_elm, pages_list, next) {
338                 if (undo_elm == elm)
339                         break;
340                 iopt_area_unfill_domains(undo_elm->area, undo_elm->pages);
341         }
342         return rc;
343 }
344
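/*
 * Map a list of iopt_pages into the io_pagetable at *dst_iova. On success the
 * areas take over the page references held by the list entries.
 */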
345 int iopt_map_pages(struct io_pagetable *iopt, struct list_head *pages_list,
346                    unsigned long length, unsigned long *dst_iova,
347                    int iommu_prot, unsigned int flags)
348 {
349         struct iopt_pages_list *elm;
350         int rc;
351
352         rc = iopt_alloc_area_pages(iopt, pages_list, length, dst_iova,
353                                    iommu_prot, flags);
354         if (rc)
355                 return rc;
356
357         down_read(&iopt->domains_rwsem);
358         rc = iopt_fill_domains_pages(pages_list);
359         if (rc)
360                 goto out_unlock_domains;
361
362         down_write(&iopt->iova_rwsem);
363         list_for_each_entry(elm, pages_list, next) {
364                 /*
365                  * area->pages must be set inside the domains_rwsem to ensure
366                  * any newly added domains will get filled. Moves the reference
367                  * in from the list.
368                  */
369                 elm->area->pages = elm->pages;
370                 elm->pages = NULL;
371                 elm->area = NULL;
372         }
373         up_write(&iopt->iova_rwsem);
374 out_unlock_domains:
375         up_read(&iopt->domains_rwsem);
376         return rc;
377 }
378
379 /**
380  * iopt_map_user_pages() - Map a user VA to an iova in the io page table
381  * @ictx: iommufd_ctx the iopt is part of
382  * @iopt: io_pagetable to act on
383  * @iova: If IOPT_ALLOC_IOVA is set this is unused on input and contains
384  *        the chosen iova on output. Otherwise it is the iova to map to on input
385  * @uptr: User VA to map
386  * @length: Number of bytes to map
387  * @iommu_prot: Combination of IOMMU_READ/WRITE/etc bits for the mapping
388  * @flags: IOPT_ALLOC_IOVA or zero
389  *
390  * iova, uptr, and length must be aligned to iova_alignment. For domain backed
391  * page tables this will pin the pages and load them into the domain at iova.
392  * For non-domain page tables this will only setup a lazy reference and the
393  * caller must use iopt_access_pages() to touch them.
394  *
395  * iopt_unmap_iova() must be called to undo this before the io_pagetable can be
396  * destroyed.
397  */
398 int iopt_map_user_pages(struct iommufd_ctx *ictx, struct io_pagetable *iopt,
399                         unsigned long *iova, void __user *uptr,
400                         unsigned long length, int iommu_prot,
401                         unsigned int flags)
402 {
403         struct iopt_pages_list elm = {};
404         LIST_HEAD(pages_list);
405         int rc;
406
407         elm.pages = iopt_alloc_pages(uptr, length, iommu_prot & IOMMU_WRITE);
408         if (IS_ERR(elm.pages))
409                 return PTR_ERR(elm.pages);
410         if (ictx->account_mode == IOPT_PAGES_ACCOUNT_MM &&
411             elm.pages->account_mode == IOPT_PAGES_ACCOUNT_USER)
412                 elm.pages->account_mode = IOPT_PAGES_ACCOUNT_MM;
413         elm.start_byte = uptr - elm.pages->uptr;
414         elm.length = length;
415         list_add(&elm.next, &pages_list);
416
417         rc = iopt_map_pages(iopt, &pages_list, length, iova, iommu_prot, flags);
418         if (rc) {
419                 if (elm.area)
420                         iopt_abort_area(elm.area);
421                 if (elm.pages)
422                         iopt_put_pages(elm.pages);
423                 return rc;
424         }
425         return 0;
426 }
427
428 struct iova_bitmap_fn_arg {
429         unsigned long flags;
430         struct io_pagetable *iopt;
431         struct iommu_domain *domain;
432         struct iommu_dirty_bitmap *dirty;
433 };
434
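/*
 * Callback for iova_bitmap_for_each(): record the dirty bits of every
 * contiguous mapped area inside the requested range. Fails if the range is
 * not fully covered by areas.
 */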
435 static int __iommu_read_and_clear_dirty(struct iova_bitmap *bitmap,
436                                         unsigned long iova, size_t length,
437                                         void *opaque)
438 {
439         struct iopt_area *area;
440         struct iopt_area_contig_iter iter;
441         struct iova_bitmap_fn_arg *arg = opaque;
442         struct iommu_domain *domain = arg->domain;
443         struct iommu_dirty_bitmap *dirty = arg->dirty;
444         const struct iommu_dirty_ops *ops = domain->dirty_ops;
445         unsigned long last_iova = iova + length - 1;
446         unsigned long flags = arg->flags;
447         int ret;
448
449         iopt_for_each_contig_area(&iter, area, arg->iopt, iova, last_iova) {
450                 unsigned long last = min(last_iova, iopt_area_last_iova(area));
451
452                 ret = ops->read_and_clear_dirty(domain, iter.cur_iova,
453                                                 last - iter.cur_iova + 1, flags,
454                                                 dirty);
455                 if (ret)
456                         return ret;
457         }
458
459         if (!iopt_area_contig_done(&iter))
460                 return -EINVAL;
461         return 0;
462 }
463
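/*
 * Report the domain's dirty IOVAs into the user supplied bitmap. Unless
 * IOMMU_DIRTY_NO_CLEAR is given the dirty bits are cleared and the IOTLB is
 * flushed at the end.
 */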
464 static int
465 iommu_read_and_clear_dirty(struct iommu_domain *domain,
466                            struct io_pagetable *iopt, unsigned long flags,
467                            struct iommu_hwpt_get_dirty_bitmap *bitmap)
468 {
469         const struct iommu_dirty_ops *ops = domain->dirty_ops;
470         struct iommu_iotlb_gather gather;
471         struct iommu_dirty_bitmap dirty;
472         struct iova_bitmap_fn_arg arg;
473         struct iova_bitmap *iter;
474         int ret = 0;
475
476         if (!ops || !ops->read_and_clear_dirty)
477                 return -EOPNOTSUPP;
478
479         iter = iova_bitmap_alloc(bitmap->iova, bitmap->length,
480                                  bitmap->page_size,
481                                  u64_to_user_ptr(bitmap->data));
482         if (IS_ERR(iter))
483                 return -ENOMEM;
484
485         iommu_dirty_bitmap_init(&dirty, iter, &gather);
486
487         arg.flags = flags;
488         arg.iopt = iopt;
489         arg.domain = domain;
490         arg.dirty = &dirty;
491         iova_bitmap_for_each(iter, &arg, __iommu_read_and_clear_dirty);
492
493         if (!(flags & IOMMU_DIRTY_NO_CLEAR))
494                 iommu_iotlb_sync(domain, &gather);
495
496         iova_bitmap_free(iter);
497
498         return ret;
499 }
500
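/* Sanity check the iova, length and page_size of a dirty bitmap request */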
501 int iommufd_check_iova_range(struct io_pagetable *iopt,
502                              struct iommu_hwpt_get_dirty_bitmap *bitmap)
503 {
504         size_t iommu_pgsize = iopt->iova_alignment;
505         u64 last_iova;
506
507         if (check_add_overflow(bitmap->iova, bitmap->length - 1, &last_iova))
508                 return -EOVERFLOW;
509
510         if (bitmap->iova > ULONG_MAX || last_iova > ULONG_MAX)
511                 return -EOVERFLOW;
512
513         if ((bitmap->iova & (iommu_pgsize - 1)) ||
514             ((last_iova + 1) & (iommu_pgsize - 1)))
515                 return -EINVAL;
516
517         if (!bitmap->page_size)
518                 return -EINVAL;
519
520         if ((bitmap->iova & (bitmap->page_size - 1)) ||
521             ((last_iova + 1) & (bitmap->page_size - 1)))
522                 return -EINVAL;
523
524         return 0;
525 }
526
527 int iopt_read_and_clear_dirty_data(struct io_pagetable *iopt,
528                                    struct iommu_domain *domain,
529                                    unsigned long flags,
530                                    struct iommu_hwpt_get_dirty_bitmap *bitmap)
531 {
532         int ret;
533
534         ret = iommufd_check_iova_range(iopt, bitmap);
535         if (ret)
536                 return ret;
537
538         down_read(&iopt->iova_rwsem);
539         ret = iommu_read_and_clear_dirty(domain, iopt, flags, bitmap);
540         up_read(&iopt->iova_rwsem);
541
542         return ret;
543 }
544
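/*
 * Clear the dirty bits of every mapped area without reporting them, so dirty
 * tracking starts from a clean state.
 */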
545 static int iopt_clear_dirty_data(struct io_pagetable *iopt,
546                                  struct iommu_domain *domain)
547 {
548         const struct iommu_dirty_ops *ops = domain->dirty_ops;
549         struct iommu_iotlb_gather gather;
550         struct iommu_dirty_bitmap dirty;
551         struct iopt_area *area;
552         int ret = 0;
553
554         lockdep_assert_held_read(&iopt->iova_rwsem);
555
556         iommu_dirty_bitmap_init(&dirty, NULL, &gather);
557
558         for (area = iopt_area_iter_first(iopt, 0, ULONG_MAX); area;
559              area = iopt_area_iter_next(area, 0, ULONG_MAX)) {
560                 if (!area->pages)
561                         continue;
562
563                 ret = ops->read_and_clear_dirty(domain, iopt_area_iova(area),
564                                                 iopt_area_length(area), 0,
565                                                 &dirty);
566                 if (ret)
567                         break;
568         }
569
570         iommu_iotlb_sync(domain, &gather);
571         return ret;
572 }
573
574 int iopt_set_dirty_tracking(struct io_pagetable *iopt,
575                             struct iommu_domain *domain, bool enable)
576 {
577         const struct iommu_dirty_ops *ops = domain->dirty_ops;
578         int ret = 0;
579
580         if (!ops)
581                 return -EOPNOTSUPP;
582
583         down_read(&iopt->iova_rwsem);
584
585         /* Clear dirty bits from PTEs to ensure a clean snapshot */
586         if (enable) {
587                 ret = iopt_clear_dirty_data(iopt, domain);
588                 if (ret)
589                         goto out_unlock;
590         }
591
592         ret = ops->set_dirty_tracking(domain, enable);
593
594 out_unlock:
595         up_read(&iopt->iova_rwsem);
596         return ret;
597 }
598
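/*
 * Build a pages_list describing the iopt_pages slices that back the IOVA
 * range, taking a reference on each iopt_pages. Fails with -ENOENT if the
 * range is not fully mapped.
 */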
599 int iopt_get_pages(struct io_pagetable *iopt, unsigned long iova,
600                    unsigned long length, struct list_head *pages_list)
601 {
602         struct iopt_area_contig_iter iter;
603         unsigned long last_iova;
604         struct iopt_area *area;
605         int rc;
606
607         if (!length)
608                 return -EINVAL;
609         if (check_add_overflow(iova, length - 1, &last_iova))
610                 return -EOVERFLOW;
611
612         down_read(&iopt->iova_rwsem);
613         iopt_for_each_contig_area(&iter, area, iopt, iova, last_iova) {
614                 struct iopt_pages_list *elm;
615                 unsigned long last = min(last_iova, iopt_area_last_iova(area));
616
617                 elm = kzalloc(sizeof(*elm), GFP_KERNEL_ACCOUNT);
618                 if (!elm) {
619                         rc = -ENOMEM;
620                         goto err_free;
621                 }
622                 elm->start_byte = iopt_area_start_byte(area, iter.cur_iova);
623                 elm->pages = area->pages;
624                 elm->length = (last - iter.cur_iova) + 1;
625                 kref_get(&elm->pages->kref);
626                 list_add_tail(&elm->next, pages_list);
627         }
628         if (!iopt_area_contig_done(&iter)) {
629                 rc = -ENOENT;
630                 goto err_free;
631         }
632         up_read(&iopt->iova_rwsem);
633         return 0;
634 err_free:
635         up_read(&iopt->iova_rwsem);
636         iopt_free_pages_list(pages_list);
637         return rc;
638 }
639
640 static int iopt_unmap_iova_range(struct io_pagetable *iopt, unsigned long start,
641                                  unsigned long last, unsigned long *unmapped)
642 {
643         struct iopt_area *area;
644         unsigned long unmapped_bytes = 0;
645         unsigned int tries = 0;
646         int rc = -ENOENT;
647
648         /*
649          * The domains_rwsem must be held in read mode any time any area->pages
650          * is NULL. This prevents domain attach/detach from running
651          * concurrently with cleaning up the area.
652          */
653 again:
654         down_read(&iopt->domains_rwsem);
655         down_write(&iopt->iova_rwsem);
656         while ((area = iopt_area_iter_first(iopt, start, last))) {
657                 unsigned long area_last = iopt_area_last_iova(area);
658                 unsigned long area_first = iopt_area_iova(area);
659                 struct iopt_pages *pages;
660
661                 /* Userspace should not race map/unmaps of the same area */
662                 if (!area->pages) {
663                         rc = -EBUSY;
664                         goto out_unlock_iova;
665                 }
666
667                 if (area_first < start || area_last > last) {
668                         rc = -ENOENT;
669                         goto out_unlock_iova;
670                 }
671
672                 if (area_first != start)
673                         tries = 0;
674
675                 /*
676                  * num_accesses writers must hold the iova_rwsem too, so we can
677                  * safely read it under the write side of the iova_rwsem
678                  * without the pages->mutex.
679                  */
680                 if (area->num_accesses) {
681                         size_t length = iopt_area_length(area);
682
683                         start = area_first;
684                         area->prevent_access = true;
685                         up_write(&iopt->iova_rwsem);
686                         up_read(&iopt->domains_rwsem);
687
688                         iommufd_access_notify_unmap(iopt, area_first, length);
689                         /* Something is not responding to unmap requests. */
690                         tries++;
691                         if (WARN_ON(tries > 100))
692                                 return -EDEADLOCK;
693                         goto again;
694                 }
695
696                 pages = area->pages;
697                 area->pages = NULL;
698                 up_write(&iopt->iova_rwsem);
699
700                 iopt_area_unfill_domains(area, pages);
701                 iopt_abort_area(area);
702                 iopt_put_pages(pages);
703
704                 unmapped_bytes += area_last - area_first + 1;
705
706                 down_write(&iopt->iova_rwsem);
707         }
708         if (unmapped_bytes)
709                 rc = 0;
710
711 out_unlock_iova:
712         up_write(&iopt->iova_rwsem);
713         up_read(&iopt->domains_rwsem);
714         if (unmapped)
715                 *unmapped = unmapped_bytes;
716         return rc;
717 }
718
719 /**
720  * iopt_unmap_iova() - Remove a range of iova
721  * @iopt: io_pagetable to act on
722  * @iova: Starting iova to unmap
723  * @length: Number of bytes to unmap
724  * @unmapped: Return number of bytes unmapped
725  *
726  * The requested range must be a superset of existing ranges.
727  * Splitting/truncating IOVA mappings is not allowed.
728  */
729 int iopt_unmap_iova(struct io_pagetable *iopt, unsigned long iova,
730                     unsigned long length, unsigned long *unmapped)
731 {
732         unsigned long iova_last;
733
734         if (!length)
735                 return -EINVAL;
736
737         if (check_add_overflow(iova, length - 1, &iova_last))
738                 return -EOVERFLOW;
739
740         return iopt_unmap_iova_range(iopt, iova, iova_last, unmapped);
741 }
742
743 int iopt_unmap_all(struct io_pagetable *iopt, unsigned long *unmapped)
744 {
745         int rc;
746
747         rc = iopt_unmap_iova_range(iopt, 0, ULONG_MAX, unmapped);
748         /* If the IOVAs are empty then unmap all succeeds */
749         if (rc == -ENOENT)
750                 return 0;
751         return rc;
752 }
753
754 /* The caller must always free all the nodes in the allowed_iova rb_root. */
755 int iopt_set_allow_iova(struct io_pagetable *iopt,
756                         struct rb_root_cached *allowed_iova)
757 {
758         struct iopt_allowed *allowed;
759
760         down_write(&iopt->iova_rwsem);
761         swap(*allowed_iova, iopt->allowed_itree);
762
763         for (allowed = iopt_allowed_iter_first(iopt, 0, ULONG_MAX); allowed;
764              allowed = iopt_allowed_iter_next(allowed, 0, ULONG_MAX)) {
765                 if (iopt_reserved_iter_first(iopt, allowed->node.start,
766                                              allowed->node.last)) {
767                         swap(*allowed_iova, iopt->allowed_itree);
768                         up_write(&iopt->iova_rwsem);
769                         return -EADDRINUSE;
770                 }
771         }
772         up_write(&iopt->iova_rwsem);
773         return 0;
774 }
775
776 int iopt_reserve_iova(struct io_pagetable *iopt, unsigned long start,
777                       unsigned long last, void *owner)
778 {
779         struct iopt_reserved *reserved;
780
781         lockdep_assert_held_write(&iopt->iova_rwsem);
782
783         if (iopt_area_iter_first(iopt, start, last) ||
784             iopt_allowed_iter_first(iopt, start, last))
785                 return -EADDRINUSE;
786
787         reserved = kzalloc(sizeof(*reserved), GFP_KERNEL_ACCOUNT);
788         if (!reserved)
789                 return -ENOMEM;
790         reserved->node.start = start;
791         reserved->node.last = last;
792         reserved->owner = owner;
793         interval_tree_insert(&reserved->node, &iopt->reserved_itree);
794         return 0;
795 }
796
797 static void __iopt_remove_reserved_iova(struct io_pagetable *iopt, void *owner)
798 {
799         struct iopt_reserved *reserved, *next;
800
801         lockdep_assert_held_write(&iopt->iova_rwsem);
802
803         for (reserved = iopt_reserved_iter_first(iopt, 0, ULONG_MAX); reserved;
804              reserved = next) {
805                 next = iopt_reserved_iter_next(reserved, 0, ULONG_MAX);
806
807                 if (reserved->owner == owner) {
808                         interval_tree_remove(&reserved->node,
809                                              &iopt->reserved_itree);
810                         kfree(reserved);
811                 }
812         }
813 }
814
815 void iopt_remove_reserved_iova(struct io_pagetable *iopt, void *owner)
816 {
817         down_write(&iopt->iova_rwsem);
818         __iopt_remove_reserved_iova(iopt, owner);
819         up_write(&iopt->iova_rwsem);
820 }
821
822 void iopt_init_table(struct io_pagetable *iopt)
823 {
824         init_rwsem(&iopt->iova_rwsem);
825         init_rwsem(&iopt->domains_rwsem);
826         iopt->area_itree = RB_ROOT_CACHED;
827         iopt->allowed_itree = RB_ROOT_CACHED;
828         iopt->reserved_itree = RB_ROOT_CACHED;
829         xa_init_flags(&iopt->domains, XA_FLAGS_ACCOUNT);
830         xa_init_flags(&iopt->access_list, XA_FLAGS_ALLOC);
831
832         /*
833          * iopts start as SW tables that can use the entire size_t IOVA space
834          * due to the use of size_t in the APIs. They have no alignment
835          * restriction.
836          */
837         iopt->iova_alignment = 1;
838 }
839
840 void iopt_destroy_table(struct io_pagetable *iopt)
841 {
842         struct interval_tree_node *node;
843
844         if (IS_ENABLED(CONFIG_IOMMUFD_TEST))
845                 iopt_remove_reserved_iova(iopt, NULL);
846
847         while ((node = interval_tree_iter_first(&iopt->allowed_itree, 0,
848                                                 ULONG_MAX))) {
849                 interval_tree_remove(node, &iopt->allowed_itree);
850                 kfree(container_of(node, struct iopt_allowed, node));
851         }
852
853         WARN_ON(!RB_EMPTY_ROOT(&iopt->reserved_itree.rb_root));
854         WARN_ON(!xa_empty(&iopt->domains));
855         WARN_ON(!xa_empty(&iopt->access_list));
856         WARN_ON(!RB_EMPTY_ROOT(&iopt->area_itree.rb_root));
857 }
858
859 /**
860  * iopt_unfill_domain() - Unfill a domain with PFNs
861  * @iopt: io_pagetable to act on
862  * @domain: domain to unfill
863  *
864  * This is used when removing a domain from the iopt. Every area in the iopt
865  * will be unmapped from the domain. The domain must already be removed from the
866  * domains xarray.
867  */
868 static void iopt_unfill_domain(struct io_pagetable *iopt,
869                                struct iommu_domain *domain)
870 {
871         struct iopt_area *area;
872
873         lockdep_assert_held(&iopt->iova_rwsem);
874         lockdep_assert_held_write(&iopt->domains_rwsem);
875
876         /*
877          * Some other domain is holding all the pfns still; rapidly unmap this
878          * domain.
879          */
880         if (iopt->next_domain_id != 0) {
881                 /* Pick an arbitrary remaining domain to act as storage */
882                 struct iommu_domain *storage_domain =
883                         xa_load(&iopt->domains, 0);
884
885                 for (area = iopt_area_iter_first(iopt, 0, ULONG_MAX); area;
886                      area = iopt_area_iter_next(area, 0, ULONG_MAX)) {
887                         struct iopt_pages *pages = area->pages;
888
889                         if (!pages)
890                                 continue;
891
892                         mutex_lock(&pages->mutex);
893                         if (IS_ENABLED(CONFIG_IOMMUFD_TEST))
894                                 WARN_ON(!area->storage_domain);
895                         if (area->storage_domain == domain)
896                                 area->storage_domain = storage_domain;
897                         mutex_unlock(&pages->mutex);
898
899                         iopt_area_unmap_domain(area, domain);
900                 }
901                 return;
902         }
903
904         for (area = iopt_area_iter_first(iopt, 0, ULONG_MAX); area;
905              area = iopt_area_iter_next(area, 0, ULONG_MAX)) {
906                 struct iopt_pages *pages = area->pages;
907
908                 if (!pages)
909                         continue;
910
911                 mutex_lock(&pages->mutex);
912                 interval_tree_remove(&area->pages_node, &pages->domains_itree);
913                 WARN_ON(area->storage_domain != domain);
914                 area->storage_domain = NULL;
915                 iopt_area_unfill_domain(area, pages, domain);
916                 mutex_unlock(&pages->mutex);
917         }
918 }
919
920 /**
921  * iopt_fill_domain() - Fill a domain with PFNs
922  * @iopt: io_pagetable to act on
923  * @domain: domain to fill
924  *
925  * Fill the domain with PFNs from every area in the iopt. On failure the domain
926  * is left unchanged.
927  */
928 static int iopt_fill_domain(struct io_pagetable *iopt,
929                             struct iommu_domain *domain)
930 {
931         struct iopt_area *end_area;
932         struct iopt_area *area;
933         int rc;
934
935         lockdep_assert_held(&iopt->iova_rwsem);
936         lockdep_assert_held_write(&iopt->domains_rwsem);
937
938         for (area = iopt_area_iter_first(iopt, 0, ULONG_MAX); area;
939              area = iopt_area_iter_next(area, 0, ULONG_MAX)) {
940                 struct iopt_pages *pages = area->pages;
941
942                 if (!pages)
943                         continue;
944
945                 mutex_lock(&pages->mutex);
946                 rc = iopt_area_fill_domain(area, domain);
947                 if (rc) {
948                         mutex_unlock(&pages->mutex);
949                         goto out_unfill;
950                 }
951                 if (!area->storage_domain) {
952                         WARN_ON(iopt->next_domain_id != 0);
953                         area->storage_domain = domain;
954                         interval_tree_insert(&area->pages_node,
955                                              &pages->domains_itree);
956                 }
957                 mutex_unlock(&pages->mutex);
958         }
959         return 0;
960
961 out_unfill:
962         end_area = area;
963         for (area = iopt_area_iter_first(iopt, 0, ULONG_MAX); area;
964              area = iopt_area_iter_next(area, 0, ULONG_MAX)) {
965                 struct iopt_pages *pages = area->pages;
966
967                 if (area == end_area)
968                         break;
969                 if (!pages)
970                         continue;
971                 mutex_lock(&pages->mutex);
972                 if (iopt->next_domain_id == 0) {
973                         interval_tree_remove(&area->pages_node,
974                                              &pages->domains_itree);
975                         area->storage_domain = NULL;
976                 }
977                 iopt_area_unfill_domain(area, pages, domain);
978                 mutex_unlock(&pages->mutex);
979         }
980         return rc;
981 }
982
983 /* Check that all existing areas conform to an increased page size */
984 static int iopt_check_iova_alignment(struct io_pagetable *iopt,
985                                      unsigned long new_iova_alignment)
986 {
987         unsigned long align_mask = new_iova_alignment - 1;
988         struct iopt_area *area;
989
990         lockdep_assert_held(&iopt->iova_rwsem);
991         lockdep_assert_held(&iopt->domains_rwsem);
992
993         for (area = iopt_area_iter_first(iopt, 0, ULONG_MAX); area;
994              area = iopt_area_iter_next(area, 0, ULONG_MAX))
995                 if ((iopt_area_iova(area) & align_mask) ||
996                     (iopt_area_length(area) & align_mask) ||
997                     (area->page_offset & align_mask))
998                         return -EADDRINUSE;
999
1000         if (IS_ENABLED(CONFIG_IOMMUFD_TEST)) {
1001                 struct iommufd_access *access;
1002                 unsigned long index;
1003
1004                 xa_for_each(&iopt->access_list, index, access)
1005                         if (WARN_ON(access->iova_alignment >
1006                                     new_iova_alignment))
1007                                 return -EADDRINUSE;
1008         }
1009         return 0;
1010 }
1011
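/*
 * Attach a new iommu_domain to the io_pagetable: the IOVA outside the domain's
 * aperture is reserved, the iova_alignment is raised if required and every
 * existing area is filled into the new domain.
 */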
1012 int iopt_table_add_domain(struct io_pagetable *iopt,
1013                           struct iommu_domain *domain)
1014 {
1015         const struct iommu_domain_geometry *geometry = &domain->geometry;
1016         struct iommu_domain *iter_domain;
1017         unsigned int new_iova_alignment;
1018         unsigned long index;
1019         int rc;
1020
1021         down_write(&iopt->domains_rwsem);
1022         down_write(&iopt->iova_rwsem);
1023
1024         xa_for_each(&iopt->domains, index, iter_domain) {
1025                 if (WARN_ON(iter_domain == domain)) {
1026                         rc = -EEXIST;
1027                         goto out_unlock;
1028                 }
1029         }
1030
1031         /*
1032          * The io page size drives the iova_alignment. Internally the iopt_pages
1033          * works in PAGE_SIZE units and we adjust when mapping sub-PAGE_SIZE
1034          * objects into the iommu_domain.
1035          *
1036          * An iommu_domain must always be able to accept PAGE_SIZE to be
1037          * compatible as we can't guarantee higher contiguity.
1038          */
1039         new_iova_alignment = max_t(unsigned long,
1040                                    1UL << __ffs(domain->pgsize_bitmap),
1041                                    iopt->iova_alignment);
1042         if (new_iova_alignment > PAGE_SIZE) {
1043                 rc = -EINVAL;
1044                 goto out_unlock;
1045         }
1046         if (new_iova_alignment != iopt->iova_alignment) {
1047                 rc = iopt_check_iova_alignment(iopt, new_iova_alignment);
1048                 if (rc)
1049                         goto out_unlock;
1050         }
1051
1052         /* No area exists that is outside the allowed domain aperture */
1053         if (geometry->aperture_start != 0) {
1054                 rc = iopt_reserve_iova(iopt, 0, geometry->aperture_start - 1,
1055                                        domain);
1056                 if (rc)
1057                         goto out_reserved;
1058         }
1059         if (geometry->aperture_end != ULONG_MAX) {
1060                 rc = iopt_reserve_iova(iopt, geometry->aperture_end + 1,
1061                                        ULONG_MAX, domain);
1062                 if (rc)
1063                         goto out_reserved;
1064         }
1065
1066         rc = xa_reserve(&iopt->domains, iopt->next_domain_id, GFP_KERNEL);
1067         if (rc)
1068                 goto out_reserved;
1069
1070         rc = iopt_fill_domain(iopt, domain);
1071         if (rc)
1072                 goto out_release;
1073
1074         iopt->iova_alignment = new_iova_alignment;
1075         xa_store(&iopt->domains, iopt->next_domain_id, domain, GFP_KERNEL);
1076         iopt->next_domain_id++;
1077         up_write(&iopt->iova_rwsem);
1078         up_write(&iopt->domains_rwsem);
1079         return 0;
1080 out_release:
1081         xa_release(&iopt->domains, iopt->next_domain_id);
1082 out_reserved:
1083         __iopt_remove_reserved_iova(iopt, domain);
1084 out_unlock:
1085         up_write(&iopt->iova_rwsem);
1086         up_write(&iopt->domains_rwsem);
1087         return rc;
1088 }
1089
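/*
 * Recompute the minimum IOVA alignment required by all attached domains and
 * accesses. The alignment can only be raised if every existing area already
 * conforms to it.
 */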
1090 static int iopt_calculate_iova_alignment(struct io_pagetable *iopt)
1091 {
1092         unsigned long new_iova_alignment;
1093         struct iommufd_access *access;
1094         struct iommu_domain *domain;
1095         unsigned long index;
1096
1097         lockdep_assert_held_write(&iopt->iova_rwsem);
1098         lockdep_assert_held(&iopt->domains_rwsem);
1099
1100         /* See batch_iommu_map_small() */
1101         if (iopt->disable_large_pages)
1102                 new_iova_alignment = PAGE_SIZE;
1103         else
1104                 new_iova_alignment = 1;
1105
1106         xa_for_each(&iopt->domains, index, domain)
1107                 new_iova_alignment = max_t(unsigned long,
1108                                            1UL << __ffs(domain->pgsize_bitmap),
1109                                            new_iova_alignment);
1110         xa_for_each(&iopt->access_list, index, access)
1111                 new_iova_alignment = max_t(unsigned long,
1112                                            access->iova_alignment,
1113                                            new_iova_alignment);
1114
1115         if (new_iova_alignment > iopt->iova_alignment) {
1116                 int rc;
1117
1118                 rc = iopt_check_iova_alignment(iopt, new_iova_alignment);
1119                 if (rc)
1120                         return rc;
1121         }
1122         iopt->iova_alignment = new_iova_alignment;
1123         return 0;
1124 }
1125
1126 void iopt_table_remove_domain(struct io_pagetable *iopt,
1127                               struct iommu_domain *domain)
1128 {
1129         struct iommu_domain *iter_domain = NULL;
1130         unsigned long index;
1131
1132         down_write(&iopt->domains_rwsem);
1133         down_write(&iopt->iova_rwsem);
1134
1135         xa_for_each(&iopt->domains, index, iter_domain)
1136                 if (iter_domain == domain)
1137                         break;
1138         if (WARN_ON(iter_domain != domain) || index >= iopt->next_domain_id)
1139                 goto out_unlock;
1140
1141         /*
1142          * Compress the xarray to keep it linear by swapping the entry to erase
1143          * with the tail entry and shrinking the tail.
1144          */
1145         iopt->next_domain_id--;
1146         iter_domain = xa_erase(&iopt->domains, iopt->next_domain_id);
1147         if (index != iopt->next_domain_id)
1148                 xa_store(&iopt->domains, index, iter_domain, GFP_KERNEL);
1149
1150         iopt_unfill_domain(iopt, domain);
1151         __iopt_remove_reserved_iova(iopt, domain);
1152
1153         WARN_ON(iopt_calculate_iova_alignment(iopt));
1154 out_unlock:
1155         up_write(&iopt->iova_rwsem);
1156         up_write(&iopt->domains_rwsem);
1157 }
1158
1159 /**
1160  * iopt_area_split() - Split an area into two parts at iova
1161  * @area: The area to split
1162  * @iova: Becomes the last iova of the first new area
1163  *
1164  * This splits an area into two. It is part of the VFIO compatibility to allow
1165  * poking a hole in the mapping. The two areas continue to point at the same
1166  * iopt_pages, just with different starting bytes.
1167  */
1168 static int iopt_area_split(struct iopt_area *area, unsigned long iova)
1169 {
1170         unsigned long alignment = area->iopt->iova_alignment;
1171         unsigned long last_iova = iopt_area_last_iova(area);
1172         unsigned long start_iova = iopt_area_iova(area);
1173         unsigned long new_start = iova + 1;
1174         struct io_pagetable *iopt = area->iopt;
1175         struct iopt_pages *pages = area->pages;
1176         struct iopt_area *lhs;
1177         struct iopt_area *rhs;
1178         int rc;
1179
1180         lockdep_assert_held_write(&iopt->iova_rwsem);
1181
1182         if (iova == start_iova || iova == last_iova)
1183                 return 0;
1184
1185         if (!pages || area->prevent_access)
1186                 return -EBUSY;
1187
1188         if (new_start & (alignment - 1) ||
1189             iopt_area_start_byte(area, new_start) & (alignment - 1))
1190                 return -EINVAL;
1191
1192         lhs = iopt_area_alloc();
1193         if (!lhs)
1194                 return -ENOMEM;
1195
1196         rhs = iopt_area_alloc();
1197         if (!rhs) {
1198                 rc = -ENOMEM;
1199                 goto err_free_lhs;
1200         }
1201
1202         mutex_lock(&pages->mutex);
1203         /*
1204          * Splitting is not permitted if an access exists; we don't track enough
1205          * information to split existing accesses.
1206          */
1207         if (area->num_accesses) {
1208                 rc = -EINVAL;
1209                 goto err_unlock;
1210         }
1211
1212         /*
1213          * Splitting is not permitted if a domain could have been mapped with
1214          * huge pages.
1215          */
1216         if (area->storage_domain && !iopt->disable_large_pages) {
1217                 rc = -EINVAL;
1218                 goto err_unlock;
1219         }
1220
1221         interval_tree_remove(&area->node, &iopt->area_itree);
1222         rc = iopt_insert_area(iopt, lhs, area->pages, start_iova,
1223                               iopt_area_start_byte(area, start_iova),
1224                               (new_start - 1) - start_iova + 1,
1225                               area->iommu_prot);
1226         if (WARN_ON(rc))
1227                 goto err_insert;
1228
1229         rc = iopt_insert_area(iopt, rhs, area->pages, new_start,
1230                               iopt_area_start_byte(area, new_start),
1231                               last_iova - new_start + 1, area->iommu_prot);
1232         if (WARN_ON(rc))
1233                 goto err_remove_lhs;
1234
1235         /*
1236          * If the original area has filled a domain, domains_itree has to be
1237          * updated.
1238          */
1239         if (area->storage_domain) {
1240                 interval_tree_remove(&area->pages_node, &pages->domains_itree);
1241                 interval_tree_insert(&lhs->pages_node, &pages->domains_itree);
1242                 interval_tree_insert(&rhs->pages_node, &pages->domains_itree);
1243         }
1244
1245         lhs->storage_domain = area->storage_domain;
1246         lhs->pages = area->pages;
1247         rhs->storage_domain = area->storage_domain;
1248         rhs->pages = area->pages;
1249         kref_get(&rhs->pages->kref);
1250         kfree(area);
1251         mutex_unlock(&pages->mutex);
1252
1253         /*
1254          * No change to domains or accesses because the pages haven't been
1255          * changed.
1256          */
1257         return 0;
1258
1259 err_remove_lhs:
1260         interval_tree_remove(&lhs->node, &iopt->area_itree);
1261 err_insert:
1262         interval_tree_insert(&area->node, &iopt->area_itree);
1263 err_unlock:
1264         mutex_unlock(&pages->mutex);
1265         kfree(rhs);
1266 err_free_lhs:
1267         kfree(lhs);
1268         return rc;
1269 }
1270
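/*
 * Split areas so that each iova in the list becomes the end of an area. Used
 * by the VFIO compatibility layer to allow unmapping part of a larger mapping.
 */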
1271 int iopt_cut_iova(struct io_pagetable *iopt, unsigned long *iovas,
1272                   size_t num_iovas)
1273 {
1274         int rc = 0;
1275         int i;
1276
1277         down_write(&iopt->iova_rwsem);
1278         for (i = 0; i < num_iovas; i++) {
1279                 struct iopt_area *area;
1280
1281                 area = iopt_area_iter_first(iopt, iovas[i], iovas[i]);
1282                 if (!area)
1283                         continue;
1284                 rc = iopt_area_split(area, iovas[i]);
1285                 if (rc)
1286                         break;
1287         }
1288         up_write(&iopt->iova_rwsem);
1289         return rc;
1290 }
1291
1292 void iopt_enable_large_pages(struct io_pagetable *iopt)
1293 {
1294         int rc;
1295
1296         down_write(&iopt->domains_rwsem);
1297         down_write(&iopt->iova_rwsem);
1298         WRITE_ONCE(iopt->disable_large_pages, false);
1299         rc = iopt_calculate_iova_alignment(iopt);
1300         WARN_ON(rc);
1301         up_write(&iopt->iova_rwsem);
1302         up_write(&iopt->domains_rwsem);
1303 }
1304
1305 int iopt_disable_large_pages(struct io_pagetable *iopt)
1306 {
1307         int rc = 0;
1308
1309         down_write(&iopt->domains_rwsem);
1310         down_write(&iopt->iova_rwsem);
1311         if (iopt->disable_large_pages)
1312                 goto out_unlock;
1313
1314         /* Won't do it if domains already have pages mapped in them */
1315         if (!xa_empty(&iopt->domains) &&
1316             !RB_EMPTY_ROOT(&iopt->area_itree.rb_root)) {
1317                 rc = -EINVAL;
1318                 goto out_unlock;
1319         }
1320
1321         WRITE_ONCE(iopt->disable_large_pages, true);
1322         rc = iopt_calculate_iova_alignment(iopt);
1323         if (rc)
1324                 WRITE_ONCE(iopt->disable_large_pages, false);
1325 out_unlock:
1326         up_write(&iopt->iova_rwsem);
1327         up_write(&iopt->domains_rwsem);
1328         return rc;
1329 }
1330
1331 int iopt_add_access(struct io_pagetable *iopt, struct iommufd_access *access)
1332 {
1333         u32 new_id;
1334         int rc;
1335
1336         down_write(&iopt->domains_rwsem);
1337         down_write(&iopt->iova_rwsem);
1338         rc = xa_alloc(&iopt->access_list, &new_id, access, xa_limit_16b,
1339                       GFP_KERNEL_ACCOUNT);
1340
1341         if (rc)
1342                 goto out_unlock;
1343
1344         rc = iopt_calculate_iova_alignment(iopt);
1345         if (rc) {
1346                 xa_erase(&iopt->access_list, new_id);
1347                 goto out_unlock;
1348         }
1349         access->iopt_access_list_id = new_id;
1350
1351 out_unlock:
1352         up_write(&iopt->iova_rwsem);
1353         up_write(&iopt->domains_rwsem);
1354         return rc;
1355 }
1356
1357 void iopt_remove_access(struct io_pagetable *iopt,
1358                         struct iommufd_access *access,
1359                         u32 iopt_access_list_id)
1360 {
1361         down_write(&iopt->domains_rwsem);
1362         down_write(&iopt->iova_rwsem);
1363         WARN_ON(xa_erase(&iopt->access_list, iopt_access_list_id) != access);
1364         WARN_ON(iopt_calculate_iova_alignment(iopt));
1365         up_write(&iopt->iova_rwsem);
1366         up_write(&iopt->domains_rwsem);
1367 }
1368
1369 /* Narrow the usable IOVA space to exclude a device's reserved regions. */
1370 int iopt_table_enforce_dev_resv_regions(struct io_pagetable *iopt,
1371                                         struct device *dev,
1372                                         phys_addr_t *sw_msi_start)
1373 {
1374         struct iommu_resv_region *resv;
1375         LIST_HEAD(resv_regions);
1376         unsigned int num_hw_msi = 0;
1377         unsigned int num_sw_msi = 0;
1378         int rc;
1379
1380         if (iommufd_should_fail())
1381                 return -EINVAL;
1382
1383         down_write(&iopt->iova_rwsem);
1384         /* FIXME: drivers allocate memory but there is no failure propagated */
1385         iommu_get_resv_regions(dev, &resv_regions);
1386
1387         list_for_each_entry(resv, &resv_regions, list) {
1388                 if (resv->type == IOMMU_RESV_DIRECT_RELAXABLE)
1389                         continue;
1390
1391                 if (sw_msi_start && resv->type == IOMMU_RESV_MSI)
1392                         num_hw_msi++;
1393                 if (sw_msi_start && resv->type == IOMMU_RESV_SW_MSI) {
1394                         *sw_msi_start = resv->start;
1395                         num_sw_msi++;
1396                 }
1397
1398                 rc = iopt_reserve_iova(iopt, resv->start,
1399                                        resv->length - 1 + resv->start, dev);
1400                 if (rc)
1401                         goto out_reserved;
1402         }
1403
1404         /* Drivers must offer sane combinations of regions */
1405         if (WARN_ON(num_sw_msi && num_hw_msi) || WARN_ON(num_sw_msi > 1)) {
1406                 rc = -EINVAL;
1407                 goto out_reserved;
1408         }
1409
1410         rc = 0;
1411         goto out_free_resv;
1412
1413 out_reserved:
1414         __iopt_remove_reserved_iova(iopt, dev);
1415 out_free_resv:
1416         iommu_put_resv_regions(dev, &resv_regions);
1417         up_write(&iopt->iova_rwsem);
1418         return rc;
1419 }