drivers/vfio/vfio_main.c (linux-block.git, commit 5046cae052224e25cef601753b4eab9da9571702)
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * VFIO core
4  *
5  * Copyright (C) 2012 Red Hat, Inc.  All rights reserved.
6  *     Author: Alex Williamson <alex.williamson@redhat.com>
7  *
8  * Derived from original vfio:
9  * Copyright 2010 Cisco Systems, Inc.  All rights reserved.
10  * Author: Tom Lyon, pugs@cisco.com
11  */
12
13 #include <linux/cdev.h>
14 #include <linux/compat.h>
15 #include <linux/device.h>
16 #include <linux/fs.h>
17 #include <linux/idr.h>
18 #include <linux/iommu.h>
19 #if IS_ENABLED(CONFIG_KVM)
20 #include <linux/kvm_host.h>
21 #endif
22 #include <linux/list.h>
23 #include <linux/miscdevice.h>
24 #include <linux/module.h>
25 #include <linux/mount.h>
26 #include <linux/mutex.h>
27 #include <linux/pci.h>
28 #include <linux/pseudo_fs.h>
29 #include <linux/rwsem.h>
30 #include <linux/sched.h>
31 #include <linux/slab.h>
32 #include <linux/stat.h>
33 #include <linux/string.h>
34 #include <linux/uaccess.h>
35 #include <linux/vfio.h>
36 #include <linux/wait.h>
37 #include <linux/sched/signal.h>
38 #include <linux/pm_runtime.h>
39 #include <linux/interval_tree.h>
40 #include <linux/iova_bitmap.h>
41 #include <linux/iommufd.h>
42 #include "vfio.h"
43
44 #define DRIVER_VERSION  "0.3"
45 #define DRIVER_AUTHOR   "Alex Williamson <alex.williamson@redhat.com>"
46 #define DRIVER_DESC     "VFIO - User Level meta-driver"
47
48 #define VFIO_MAGIC 0x5646494f /* "VFIO" */
49
50 static struct vfio {
51         struct class                    *device_class;
52         struct ida                      device_ida;
53         struct vfsmount                 *vfs_mount;
54         int                             fs_count;
55 } vfio;
56
57 #ifdef CONFIG_VFIO_NOIOMMU
58 bool vfio_noiommu __read_mostly;
59 module_param_named(enable_unsafe_noiommu_mode,
60                    vfio_noiommu, bool, S_IRUGO | S_IWUSR);
61 MODULE_PARM_DESC(enable_unsafe_noiommu_mode, "Enable UNSAFE, no-IOMMU mode.  This mode provides no device isolation, no DMA translation, no host kernel protection, cannot be used for device assignment to virtual machines, requires RAWIO permissions, and will taint the kernel.  If you do not know what this is for, step away. (default: false)");
62 #endif
63
64 static DEFINE_XARRAY(vfio_device_set_xa);
65
66 int vfio_assign_device_set(struct vfio_device *device, void *set_id)
67 {
68         unsigned long idx = (unsigned long)set_id;
69         struct vfio_device_set *new_dev_set;
70         struct vfio_device_set *dev_set;
71
72         if (WARN_ON(!set_id))
73                 return -EINVAL;
74
75         /*
76          * Atomically acquire a singleton object in the xarray for this set_id
77          */
78         xa_lock(&vfio_device_set_xa);
79         dev_set = xa_load(&vfio_device_set_xa, idx);
80         if (dev_set)
81                 goto found_get_ref;
82         xa_unlock(&vfio_device_set_xa);
83
84         new_dev_set = kzalloc(sizeof(*new_dev_set), GFP_KERNEL);
85         if (!new_dev_set)
86                 return -ENOMEM;
87         mutex_init(&new_dev_set->lock);
88         INIT_LIST_HEAD(&new_dev_set->device_list);
89         new_dev_set->set_id = set_id;
90
91         xa_lock(&vfio_device_set_xa);
92         dev_set = __xa_cmpxchg(&vfio_device_set_xa, idx, NULL, new_dev_set,
93                                GFP_KERNEL);
94         if (!dev_set) {
95                 dev_set = new_dev_set;
96                 goto found_get_ref;
97         }
98
99         kfree(new_dev_set);
100         if (xa_is_err(dev_set)) {
101                 xa_unlock(&vfio_device_set_xa);
102                 return xa_err(dev_set);
103         }
104
105 found_get_ref:
106         dev_set->device_count++;
107         xa_unlock(&vfio_device_set_xa);
108         mutex_lock(&dev_set->lock);
109         device->dev_set = dev_set;
110         list_add_tail(&device->dev_set_list, &dev_set->device_list);
111         mutex_unlock(&dev_set->lock);
112         return 0;
113 }
114 EXPORT_SYMBOL_GPL(vfio_assign_device_set);
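/*
 * Illustrative usage sketch (not part of the original file): a driver that
 * needs several devices handled as one reset group passes the same opaque
 * set_id for each of them, typically from its ->init callback.  vfio-pci,
 * for instance, keys the set on the PCI slot or bus so that every function
 * affected by a slot/bus reset lands in the same dev_set.  The function
 * name my_vfio_init() below is hypothetical.
 *
 *	static int my_vfio_init(struct vfio_device *vdev)
 *	{
 *		struct pci_dev *pdev = to_pci_dev(vdev->dev);
 *
 *		if (!pci_probe_reset_slot(pdev->slot))
 *			return vfio_assign_device_set(vdev, pdev->slot);
 *		return vfio_assign_device_set(vdev, pdev->bus);
 *	}
 */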
115
116 static void vfio_release_device_set(struct vfio_device *device)
117 {
118         struct vfio_device_set *dev_set = device->dev_set;
119
120         if (!dev_set)
121                 return;
122
123         mutex_lock(&dev_set->lock);
124         list_del(&device->dev_set_list);
125         mutex_unlock(&dev_set->lock);
126
127         xa_lock(&vfio_device_set_xa);
128         if (!--dev_set->device_count) {
129                 __xa_erase(&vfio_device_set_xa,
130                            (unsigned long)dev_set->set_id);
131                 mutex_destroy(&dev_set->lock);
132                 kfree(dev_set);
133         }
134         xa_unlock(&vfio_device_set_xa);
135 }
136
137 unsigned int vfio_device_set_open_count(struct vfio_device_set *dev_set)
138 {
139         struct vfio_device *cur;
140         unsigned int open_count = 0;
141
142         lockdep_assert_held(&dev_set->lock);
143
144         list_for_each_entry(cur, &dev_set->device_list, dev_set_list)
145                 open_count += cur->open_count;
146         return open_count;
147 }
148 EXPORT_SYMBOL_GPL(vfio_device_set_open_count);
149
150 struct vfio_device *
151 vfio_find_device_in_devset(struct vfio_device_set *dev_set,
152                            struct device *dev)
153 {
154         struct vfio_device *cur;
155
156         lockdep_assert_held(&dev_set->lock);
157
158         list_for_each_entry(cur, &dev_set->device_list, dev_set_list)
159                 if (cur->dev == dev)
160                         return cur;
161         return NULL;
162 }
163 EXPORT_SYMBOL_GPL(vfio_find_device_in_devset);
164
165 /*
166  * Device objects - create, release, get, put, search
167  */
168 /* Device reference always implies a group reference */
169 void vfio_device_put_registration(struct vfio_device *device)
170 {
171         if (refcount_dec_and_test(&device->refcount))
172                 complete(&device->comp);
173 }
174
175 bool vfio_device_try_get_registration(struct vfio_device *device)
176 {
177         return refcount_inc_not_zero(&device->refcount);
178 }
179
180 /*
181  * VFIO driver API
182  */
183 /* Release helper called by vfio_put_device() */
184 static void vfio_device_release(struct device *dev)
185 {
186         struct vfio_device *device =
187                         container_of(dev, struct vfio_device, device);
188
189         vfio_release_device_set(device);
190         ida_free(&vfio.device_ida, device->index);
191
192         if (device->ops->release)
193                 device->ops->release(device);
194
195         iput(device->inode);
196         simple_release_fs(&vfio.vfs_mount, &vfio.fs_count);
197         kvfree(device);
198 }
199
200 static int vfio_init_device(struct vfio_device *device, struct device *dev,
201                             const struct vfio_device_ops *ops);
202
203 /*
204  * Allocate and initialize a vfio_device so it can be registered with the
205  * vfio core.
206  *
207  * Drivers should use the wrapper vfio_alloc_device() for allocation.
208  * @size is the size of the structure to be allocated, including any
209  * private data used by the driver.
210  *
211  * The driver may provide an @init callback to initialize its private data.
212  *
213  * Use vfio_put_device() to release the structure after a successful return.
214  */
215 struct vfio_device *_vfio_alloc_device(size_t size, struct device *dev,
216                                        const struct vfio_device_ops *ops)
217 {
218         struct vfio_device *device;
219         int ret;
220
221         if (WARN_ON(size < sizeof(struct vfio_device)))
222                 return ERR_PTR(-EINVAL);
223
224         device = kvzalloc(size, GFP_KERNEL);
225         if (!device)
226                 return ERR_PTR(-ENOMEM);
227
228         ret = vfio_init_device(device, dev, ops);
229         if (ret)
230                 goto out_free;
231         return device;
232
233 out_free:
234         kvfree(device);
235         return ERR_PTR(ret);
236 }
237 EXPORT_SYMBOL_GPL(_vfio_alloc_device);
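/*
 * Illustrative usage sketch (not part of the original file) of the call
 * pattern described above.  struct my_device and my_vfio_ops are
 * hypothetical; dev is the parent struct device.  The vfio_alloc_device()
 * wrapper requires the embedded vfio_device to be the first member so the
 * core can size and locate it.
 *
 *	struct my_device {
 *		struct vfio_device vdev;	// must be at offset 0
 *		void *priv;
 *	};
 *
 *	struct my_device *my;
 *
 *	my = vfio_alloc_device(my_device, vdev, dev, &my_vfio_ops);
 *	if (IS_ERR(my))
 *		return PTR_ERR(my);
 *	...
 *	vfio_put_device(&my->vdev);	// drops the reference again
 */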
238
239 static int vfio_fs_init_fs_context(struct fs_context *fc)
240 {
241         return init_pseudo(fc, VFIO_MAGIC) ? 0 : -ENOMEM;
242 }
243
244 static struct file_system_type vfio_fs_type = {
245         .name = "vfio",
246         .owner = THIS_MODULE,
247         .init_fs_context = vfio_fs_init_fs_context,
248         .kill_sb = kill_anon_super,
249 };
250
251 static struct inode *vfio_fs_inode_new(void)
252 {
253         struct inode *inode;
254         int ret;
255
256         ret = simple_pin_fs(&vfio_fs_type, &vfio.vfs_mount, &vfio.fs_count);
257         if (ret)
258                 return ERR_PTR(ret);
259
260         inode = alloc_anon_inode(vfio.vfs_mount->mnt_sb);
261         if (IS_ERR(inode))
262                 simple_release_fs(&vfio.vfs_mount, &vfio.fs_count);
263
264         return inode;
265 }
266
267 /*
268  * Initialize a vfio_device so it can be registered with the vfio core.
269  */
270 static int vfio_init_device(struct vfio_device *device, struct device *dev,
271                             const struct vfio_device_ops *ops)
272 {
273         int ret;
274
275         ret = ida_alloc_max(&vfio.device_ida, MINORMASK, GFP_KERNEL);
276         if (ret < 0) {
277                 dev_dbg(dev, "Failed to allocate device index\n");
278                 return ret;
279         }
280
281         device->index = ret;
282         init_completion(&device->comp);
283         device->dev = dev;
284         device->ops = ops;
285         device->inode = vfio_fs_inode_new();
286         if (IS_ERR(device->inode)) {
287                 ret = PTR_ERR(device->inode);
288                 goto out_inode;
289         }
290
291         if (ops->init) {
292                 ret = ops->init(device);
293                 if (ret)
294                         goto out_uninit;
295         }
296
297         device_initialize(&device->device);
298         device->device.release = vfio_device_release;
299         device->device.class = vfio.device_class;
300         device->device.parent = device->dev;
301         return 0;
302
303 out_uninit:
304         iput(device->inode);
305         simple_release_fs(&vfio.vfs_mount, &vfio.fs_count);
306 out_inode:
307         vfio_release_device_set(device);
308         ida_free(&vfio.device_ida, device->index);
309         return ret;
310 }
311
312 static int __vfio_register_dev(struct vfio_device *device,
313                                enum vfio_group_type type)
314 {
315         int ret;
316
317         if (WARN_ON(IS_ENABLED(CONFIG_IOMMUFD) &&
318                     (!device->ops->bind_iommufd ||
319                      !device->ops->unbind_iommufd ||
320                      !device->ops->attach_ioas ||
321                      !device->ops->detach_ioas)))
322                 return -EINVAL;
323
324         /*
325          * If the driver doesn't specify a set then the device is added to a
326          * singleton set just for itself.
327          */
328         if (!device->dev_set)
329                 vfio_assign_device_set(device, device);
330
331         ret = dev_set_name(&device->device, "vfio%d", device->index);
332         if (ret)
333                 return ret;
334
335         ret = vfio_device_set_group(device, type);
336         if (ret)
337                 return ret;
338
339         /*
340          * VFIO always sets IOMMU_CACHE because we offer no way for userspace to
341          * restore cache coherency. It has to be checked here because it is only
342          * valid for cases where we are using iommu groups.
343          */
344         if (type == VFIO_IOMMU && !vfio_device_is_noiommu(device) &&
345             !device_iommu_capable(device->dev, IOMMU_CAP_CACHE_COHERENCY)) {
346                 ret = -EINVAL;
347                 goto err_out;
348         }
349
350         ret = vfio_device_add(device);
351         if (ret)
352                 goto err_out;
353
354         /* Refcounting can't start until the driver calls register */
355         refcount_set(&device->refcount, 1);
356
357         vfio_device_group_register(device);
358         vfio_device_debugfs_init(device);
359
360         return 0;
361 err_out:
362         vfio_device_remove_group(device);
363         return ret;
364 }
365
366 int vfio_register_group_dev(struct vfio_device *device)
367 {
368         return __vfio_register_dev(device, VFIO_IOMMU);
369 }
370 EXPORT_SYMBOL_GPL(vfio_register_group_dev);
371
372 /*
373  * Register a virtual device without IOMMU backing.  The user of this
374  * device must not be able to directly trigger unmediated DMA.
375  */
376 int vfio_register_emulated_iommu_dev(struct vfio_device *device)
377 {
378         return __vfio_register_dev(device, VFIO_EMULATED_IOMMU);
379 }
380 EXPORT_SYMBOL_GPL(vfio_register_emulated_iommu_dev);
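/*
 * Typical probe-side flow (illustrative sketch, not part of the original
 * file), continuing the hypothetical my_device example above: allocate,
 * register, and stash the device for remove().
 *
 *	ret = vfio_register_group_dev(&my->vdev);
 *	if (ret) {
 *		vfio_put_device(&my->vdev);
 *		return ret;
 *	}
 *	dev_set_drvdata(dev, my);
 *	return 0;
 */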
381
382 /*
383  * Decrement the device reference count and wait for the device to be removed.
384  * Open file descriptors for the device persist until userspace releases them. */
385 void vfio_unregister_group_dev(struct vfio_device *device)
386 {
387         unsigned int i = 0;
388         bool interrupted = false;
389         long rc;
390
391         /*
392          * Prevent the device from being newly opened by userspace via
393          * VFIO_GROUP_GET_DEVICE_FD in the group path.
394          */
395         vfio_device_group_unregister(device);
396
397         /*
398          * Balances vfio_device_add() in the register path and also prevents
399          * the device from being newly opened by userspace in the cdev path.
400          */
401         vfio_device_del(device);
402
403         vfio_device_put_registration(device);
404         rc = try_wait_for_completion(&device->comp);
405         while (rc <= 0) {
406                 if (device->ops->request)
407                         device->ops->request(device, i++);
408
409                 if (interrupted) {
410                         rc = wait_for_completion_timeout(&device->comp,
411                                                          HZ * 10);
412                 } else {
413                         rc = wait_for_completion_interruptible_timeout(
414                                 &device->comp, HZ * 10);
415                         if (rc < 0) {
416                                 interrupted = true;
417                                 dev_warn(device->dev,
418                                          "Device is currently in use, task \"%s\" (%d) blocked until device is released",
419                                          current->comm, task_pid_nr(current));
422                         }
423                 }
424         }
425
426         vfio_device_debugfs_exit(device);
427         /* Balances vfio_device_set_group in register path */
428         vfio_device_remove_group(device);
429 }
430 EXPORT_SYMBOL_GPL(vfio_unregister_group_dev);
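/*
 * Matching remove-side flow (illustrative sketch, not part of the original
 * file) for the hypothetical my_device example above:
 *
 *	struct my_device *my = dev_get_drvdata(dev);
 *
 *	vfio_unregister_group_dev(&my->vdev);
 *	vfio_put_device(&my->vdev);
 */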
431
432 #if IS_ENABLED(CONFIG_KVM)
433 void vfio_device_get_kvm_safe(struct vfio_device *device, struct kvm *kvm)
434 {
435         void (*pfn)(struct kvm *kvm);
436         bool (*fn)(struct kvm *kvm);
437         bool ret;
438
439         lockdep_assert_held(&device->dev_set->lock);
440
441         if (!kvm)
442                 return;
443
444         pfn = symbol_get(kvm_put_kvm);
445         if (WARN_ON(!pfn))
446                 return;
447
448         fn = symbol_get(kvm_get_kvm_safe);
449         if (WARN_ON(!fn)) {
450                 symbol_put(kvm_put_kvm);
451                 return;
452         }
453
454         ret = fn(kvm);
455         symbol_put(kvm_get_kvm_safe);
456         if (!ret) {
457                 symbol_put(kvm_put_kvm);
458                 return;
459         }
460
461         device->put_kvm = pfn;
462         device->kvm = kvm;
463 }
464
465 void vfio_device_put_kvm(struct vfio_device *device)
466 {
467         lockdep_assert_held(&device->dev_set->lock);
468
469         if (!device->kvm)
470                 return;
471
472         if (WARN_ON(!device->put_kvm))
473                 goto clear;
474
475         device->put_kvm(device->kvm);
476         device->put_kvm = NULL;
477         symbol_put(kvm_put_kvm);
478
479 clear:
480         device->kvm = NULL;
481 }
482 #endif
483
484 /* true if the vfio_device has open_device() called but not close_device() */
485 static bool vfio_assert_device_open(struct vfio_device *device)
486 {
487         return !WARN_ON_ONCE(!READ_ONCE(device->open_count));
488 }
489
490 struct vfio_device_file *
491 vfio_allocate_device_file(struct vfio_device *device)
492 {
493         struct vfio_device_file *df;
494
495         df = kzalloc(sizeof(*df), GFP_KERNEL_ACCOUNT);
496         if (!df)
497                 return ERR_PTR(-ENOMEM);
498
499         df->device = device;
500         spin_lock_init(&df->kvm_ref_lock);
501
502         return df;
503 }
504
505 static int vfio_df_device_first_open(struct vfio_device_file *df)
506 {
507         struct vfio_device *device = df->device;
508         struct iommufd_ctx *iommufd = df->iommufd;
509         int ret;
510
511         lockdep_assert_held(&device->dev_set->lock);
512
513         if (!try_module_get(device->dev->driver->owner))
514                 return -ENODEV;
515
516         if (iommufd)
517                 ret = vfio_df_iommufd_bind(df);
518         else
519                 ret = vfio_device_group_use_iommu(device);
520         if (ret)
521                 goto err_module_put;
522
523         if (device->ops->open_device) {
524                 ret = device->ops->open_device(device);
525                 if (ret)
526                         goto err_unuse_iommu;
527         }
528         return 0;
529
530 err_unuse_iommu:
531         if (iommufd)
532                 vfio_df_iommufd_unbind(df);
533         else
534                 vfio_device_group_unuse_iommu(device);
535 err_module_put:
536         module_put(device->dev->driver->owner);
537         return ret;
538 }
539
540 static void vfio_df_device_last_close(struct vfio_device_file *df)
541 {
542         struct vfio_device *device = df->device;
543         struct iommufd_ctx *iommufd = df->iommufd;
544
545         lockdep_assert_held(&device->dev_set->lock);
546
547         if (device->ops->close_device)
548                 device->ops->close_device(device);
549         if (iommufd)
550                 vfio_df_iommufd_unbind(df);
551         else
552                 vfio_device_group_unuse_iommu(device);
553         module_put(device->dev->driver->owner);
554 }
555
556 int vfio_df_open(struct vfio_device_file *df)
557 {
558         struct vfio_device *device = df->device;
559         int ret = 0;
560
561         lockdep_assert_held(&device->dev_set->lock);
562
563         /*
564          * Only the group path allows the device to be opened multiple
565          * times.  The device cdev path has no secure way to support that.
566          */
567         if (device->open_count != 0 && !df->group)
568                 return -EINVAL;
569
570         device->open_count++;
571         if (device->open_count == 1) {
572                 ret = vfio_df_device_first_open(df);
573                 if (ret)
574                         device->open_count--;
575         }
576
577         return ret;
578 }
579
580 void vfio_df_close(struct vfio_device_file *df)
581 {
582         struct vfio_device *device = df->device;
583
584         lockdep_assert_held(&device->dev_set->lock);
585
586         if (!vfio_assert_device_open(device))
587                 return;
588         if (device->open_count == 1)
589                 vfio_df_device_last_close(df);
590         device->open_count--;
591 }
592
593 /*
594  * Wrapper around pm_runtime_resume_and_get().
595  * Return error code on failure or 0 on success.
596  */
597 static inline int vfio_device_pm_runtime_get(struct vfio_device *device)
598 {
599         struct device *dev = device->dev;
600
601         if (dev->driver && dev->driver->pm) {
602                 int ret;
603
604                 ret = pm_runtime_resume_and_get(dev);
605                 if (ret) {
606                         dev_info_ratelimited(dev,
607                                 "vfio: runtime resume failed %d\n", ret);
608                         return -EIO;
609                 }
610         }
611
612         return 0;
613 }
614
615 /*
616  * Wrapper around pm_runtime_put().
617  */
618 static inline void vfio_device_pm_runtime_put(struct vfio_device *device)
619 {
620         struct device *dev = device->dev;
621
622         if (dev->driver && dev->driver->pm)
623                 pm_runtime_put(dev);
624 }
625
626 /*
627  * VFIO Device fd
628  */
629 static int vfio_device_fops_release(struct inode *inode, struct file *filep)
630 {
631         struct vfio_device_file *df = filep->private_data;
632         struct vfio_device *device = df->device;
633
634         if (df->group)
635                 vfio_df_group_close(df);
636         else
637                 vfio_df_unbind_iommufd(df);
638
639         vfio_device_put_registration(device);
640
641         kfree(df);
642
643         return 0;
644 }
645
646 /*
647  * vfio_mig_get_next_state - Compute the next step in the FSM
648  * @cur_fsm - The current state the device is in
649  * @new_fsm - The target state to reach
650  * @next_fsm - Pointer to the next step to get to new_fsm
651  *
652  * Return 0 upon success, otherwise -errno
653  * Upon success the next step in the state progression between cur_fsm and
654  * new_fsm will be set in next_fsm.
655  *
656  * This breaks down requests for combination transitions into smaller steps and
657  * returns the next step to get to new_fsm. The function may need to be called
658  * multiple times before reaching new_fsm.
659  *
660  */
661 int vfio_mig_get_next_state(struct vfio_device *device,
662                             enum vfio_device_mig_state cur_fsm,
663                             enum vfio_device_mig_state new_fsm,
664                             enum vfio_device_mig_state *next_fsm)
665 {
666         enum { VFIO_DEVICE_NUM_STATES = VFIO_DEVICE_STATE_PRE_COPY_P2P + 1 };
667         /*
668          * The coding in this table requires the driver to implement the
669          * following FSM arcs:
670          *         RESUMING -> STOP
671          *         STOP -> RESUMING
672          *         STOP -> STOP_COPY
673          *         STOP_COPY -> STOP
674          *
675          * If P2P is supported then the driver must also implement these FSM
676          * arcs:
677          *         RUNNING -> RUNNING_P2P
678          *         RUNNING_P2P -> RUNNING
679          *         RUNNING_P2P -> STOP
680          *         STOP -> RUNNING_P2P
681          *
682          * If precopy is supported then the driver must support these additional
683          * FSM arcs:
684          *         RUNNING -> PRE_COPY
685          *         PRE_COPY -> RUNNING
686          *         PRE_COPY -> STOP_COPY
687          * However, if precopy and P2P are supported together then the driver
688          * must support these additional arcs beyond the P2P arcs above:
689          *         PRE_COPY -> RUNNING
690          *         PRE_COPY -> PRE_COPY_P2P
691          *         PRE_COPY_P2P -> PRE_COPY
692          *         PRE_COPY_P2P -> RUNNING_P2P
693          *         PRE_COPY_P2P -> STOP_COPY
694          *         RUNNING -> PRE_COPY
695          *         RUNNING_P2P -> PRE_COPY_P2P
696          *
697          * Without P2P and precopy the driver must implement:
698          *         RUNNING -> STOP
699          *         STOP -> RUNNING
700          *
701          * The coding will step through multiple states for some combination
702          * transitions; if all optional features are supported, this means the
703          * following ones:
704          *         PRE_COPY -> PRE_COPY_P2P -> STOP_COPY
705          *         PRE_COPY -> RUNNING -> RUNNING_P2P
706          *         PRE_COPY -> RUNNING -> RUNNING_P2P -> STOP
707          *         PRE_COPY -> RUNNING -> RUNNING_P2P -> STOP -> RESUMING
708          *         PRE_COPY_P2P -> RUNNING_P2P -> RUNNING
709          *         PRE_COPY_P2P -> RUNNING_P2P -> STOP
710          *         PRE_COPY_P2P -> RUNNING_P2P -> STOP -> RESUMING
711          *         RESUMING -> STOP -> RUNNING_P2P
712          *         RESUMING -> STOP -> RUNNING_P2P -> PRE_COPY_P2P
713          *         RESUMING -> STOP -> RUNNING_P2P -> RUNNING
714          *         RESUMING -> STOP -> RUNNING_P2P -> RUNNING -> PRE_COPY
715          *         RESUMING -> STOP -> STOP_COPY
716          *         RUNNING -> RUNNING_P2P -> PRE_COPY_P2P
717          *         RUNNING -> RUNNING_P2P -> STOP
718          *         RUNNING -> RUNNING_P2P -> STOP -> RESUMING
719          *         RUNNING -> RUNNING_P2P -> STOP -> STOP_COPY
720          *         RUNNING_P2P -> RUNNING -> PRE_COPY
721          *         RUNNING_P2P -> STOP -> RESUMING
722          *         RUNNING_P2P -> STOP -> STOP_COPY
723          *         STOP -> RUNNING_P2P -> PRE_COPY_P2P
724          *         STOP -> RUNNING_P2P -> RUNNING
725          *         STOP -> RUNNING_P2P -> RUNNING -> PRE_COPY
726          *         STOP_COPY -> STOP -> RESUMING
727          *         STOP_COPY -> STOP -> RUNNING_P2P
728          *         STOP_COPY -> STOP -> RUNNING_P2P -> RUNNING
729          *
730          *  The following transitions are blocked:
731          *         STOP_COPY -> PRE_COPY
732          *         STOP_COPY -> PRE_COPY_P2P
733          */
734         static const u8 vfio_from_fsm_table[VFIO_DEVICE_NUM_STATES][VFIO_DEVICE_NUM_STATES] = {
735                 [VFIO_DEVICE_STATE_STOP] = {
736                         [VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_STOP,
737                         [VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_RUNNING_P2P,
738                         [VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_RUNNING_P2P,
739                         [VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P,
740                         [VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_STOP_COPY,
741                         [VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_RESUMING,
742                         [VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P,
743                         [VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
744                 },
745                 [VFIO_DEVICE_STATE_RUNNING] = {
746                         [VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_RUNNING_P2P,
747                         [VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_RUNNING,
748                         [VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_PRE_COPY,
749                         [VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P,
750                         [VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_RUNNING_P2P,
751                         [VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_RUNNING_P2P,
752                         [VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P,
753                         [VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
754                 },
755                 [VFIO_DEVICE_STATE_PRE_COPY] = {
756                         [VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_RUNNING,
757                         [VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_RUNNING,
758                         [VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_PRE_COPY,
759                         [VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_PRE_COPY_P2P,
760                         [VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_PRE_COPY_P2P,
761                         [VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_RUNNING,
762                         [VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_RUNNING,
763                         [VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
764                 },
765                 [VFIO_DEVICE_STATE_PRE_COPY_P2P] = {
766                         [VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_RUNNING_P2P,
767                         [VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_RUNNING_P2P,
768                         [VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_PRE_COPY,
769                         [VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_PRE_COPY_P2P,
770                         [VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_STOP_COPY,
771                         [VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_RUNNING_P2P,
772                         [VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P,
773                         [VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
774                 },
775                 [VFIO_DEVICE_STATE_STOP_COPY] = {
776                         [VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_STOP,
777                         [VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_STOP,
778                         [VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_ERROR,
779                         [VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_ERROR,
780                         [VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_STOP_COPY,
781                         [VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_STOP,
782                         [VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_STOP,
783                         [VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
784                 },
785                 [VFIO_DEVICE_STATE_RESUMING] = {
786                         [VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_STOP,
787                         [VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_STOP,
788                         [VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_STOP,
789                         [VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_STOP,
790                         [VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_STOP,
791                         [VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_RESUMING,
792                         [VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_STOP,
793                         [VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
794                 },
795                 [VFIO_DEVICE_STATE_RUNNING_P2P] = {
796                         [VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_STOP,
797                         [VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_RUNNING,
798                         [VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_RUNNING,
799                         [VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_PRE_COPY_P2P,
800                         [VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_STOP,
801                         [VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_STOP,
802                         [VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P,
803                         [VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
804                 },
805                 [VFIO_DEVICE_STATE_ERROR] = {
806                         [VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_ERROR,
807                         [VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_ERROR,
808                         [VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_ERROR,
809                         [VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_ERROR,
810                         [VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_ERROR,
811                         [VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_ERROR,
812                         [VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_ERROR,
813                         [VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
814                 },
815         };
816
817         static const unsigned int state_flags_table[VFIO_DEVICE_NUM_STATES] = {
818                 [VFIO_DEVICE_STATE_STOP] = VFIO_MIGRATION_STOP_COPY,
819                 [VFIO_DEVICE_STATE_RUNNING] = VFIO_MIGRATION_STOP_COPY,
820                 [VFIO_DEVICE_STATE_PRE_COPY] =
821                         VFIO_MIGRATION_STOP_COPY | VFIO_MIGRATION_PRE_COPY,
822                 [VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_MIGRATION_STOP_COPY |
823                                                    VFIO_MIGRATION_P2P |
824                                                    VFIO_MIGRATION_PRE_COPY,
825                 [VFIO_DEVICE_STATE_STOP_COPY] = VFIO_MIGRATION_STOP_COPY,
826                 [VFIO_DEVICE_STATE_RESUMING] = VFIO_MIGRATION_STOP_COPY,
827                 [VFIO_DEVICE_STATE_RUNNING_P2P] =
828                         VFIO_MIGRATION_STOP_COPY | VFIO_MIGRATION_P2P,
829                 [VFIO_DEVICE_STATE_ERROR] = ~0U,
830         };
831
832         if (WARN_ON(cur_fsm >= ARRAY_SIZE(vfio_from_fsm_table) ||
833                     (state_flags_table[cur_fsm] & device->migration_flags) !=
834                         state_flags_table[cur_fsm]))
835                 return -EINVAL;
836
837         if (new_fsm >= ARRAY_SIZE(vfio_from_fsm_table) ||
838            (state_flags_table[new_fsm] & device->migration_flags) !=
839                         state_flags_table[new_fsm])
840                 return -EINVAL;
841
842         /*
843          * Arcs touching optional and unsupported states are skipped over. The
844          * driver will instead see an arc from the original state to the next
845          * logical state, as per the above comment.
846          */
847         *next_fsm = vfio_from_fsm_table[cur_fsm][new_fsm];
848         while ((state_flags_table[*next_fsm] & device->migration_flags) !=
849                         state_flags_table[*next_fsm])
850                 *next_fsm = vfio_from_fsm_table[*next_fsm][new_fsm];
851
852         return (*next_fsm != VFIO_DEVICE_STATE_ERROR) ? 0 : -EINVAL;
853 }
854 EXPORT_SYMBOL_GPL(vfio_mig_get_next_state);
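/*
 * Illustrative sketch (not part of the original file) of how a driver's
 * migration_set_state() op typically consumes this helper: loop one
 * supported arc at a time until the target state is reached.
 * my_step_device_state() is a hypothetical driver function that performs a
 * single arc and returns a data FD filp (or NULL) where the uAPI needs one.
 *
 *	enum vfio_device_mig_state next;
 *	struct file *filp = NULL;
 *	int ret;
 *
 *	while (cur != new) {
 *		ret = vfio_mig_get_next_state(vdev, cur, new, &next);
 *		if (ret)
 *			return ERR_PTR(ret);
 *
 *		filp = my_step_device_state(vdev, next);
 *		if (IS_ERR(filp))
 *			return filp;
 *
 *		cur = next;
 *	}
 *	return filp;
 */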
855
856 /*
857  * Convert the driver's struct file into an FD number and return it to userspace
858  */
859 static int vfio_ioct_mig_return_fd(struct file *filp, void __user *arg,
860                                    struct vfio_device_feature_mig_state *mig)
861 {
862         int ret;
863         int fd;
864
865         fd = get_unused_fd_flags(O_CLOEXEC);
866         if (fd < 0) {
867                 ret = fd;
868                 goto out_fput;
869         }
870
871         mig->data_fd = fd;
872         if (copy_to_user(arg, mig, sizeof(*mig))) {
873                 ret = -EFAULT;
874                 goto out_put_unused;
875         }
876         fd_install(fd, filp);
877         return 0;
878
879 out_put_unused:
880         put_unused_fd(fd);
881 out_fput:
882         fput(filp);
883         return ret;
884 }
885
886 static int
887 vfio_ioctl_device_feature_mig_device_state(struct vfio_device *device,
888                                            u32 flags, void __user *arg,
889                                            size_t argsz)
890 {
891         size_t minsz =
892                 offsetofend(struct vfio_device_feature_mig_state, data_fd);
893         struct vfio_device_feature_mig_state mig;
894         struct file *filp = NULL;
895         int ret;
896
897         if (!device->mig_ops)
898                 return -ENOTTY;
899
900         ret = vfio_check_feature(flags, argsz,
901                                  VFIO_DEVICE_FEATURE_SET |
902                                  VFIO_DEVICE_FEATURE_GET,
903                                  sizeof(mig));
904         if (ret != 1)
905                 return ret;
906
907         if (copy_from_user(&mig, arg, minsz))
908                 return -EFAULT;
909
910         if (flags & VFIO_DEVICE_FEATURE_GET) {
911                 enum vfio_device_mig_state curr_state;
912
913                 ret = device->mig_ops->migration_get_state(device,
914                                                            &curr_state);
915                 if (ret)
916                         return ret;
917                 mig.device_state = curr_state;
918                 goto out_copy;
919         }
920
921         /* Handle the VFIO_DEVICE_FEATURE_SET */
922         filp = device->mig_ops->migration_set_state(device, mig.device_state);
923         if (IS_ERR(filp) || !filp)
924                 goto out_copy;
925
926         return vfio_ioct_mig_return_fd(filp, arg, &mig);
927 out_copy:
928         mig.data_fd = -1;
929         if (copy_to_user(arg, &mig, sizeof(mig)))
930                 return -EFAULT;
931         if (IS_ERR(filp))
932                 return PTR_ERR(filp);
933         return 0;
934 }
935
936 static int
937 vfio_ioctl_device_feature_migration_data_size(struct vfio_device *device,
938                                               u32 flags, void __user *arg,
939                                               size_t argsz)
940 {
941         struct vfio_device_feature_mig_data_size data_size = {};
942         unsigned long stop_copy_length;
943         int ret;
944
945         if (!device->mig_ops)
946                 return -ENOTTY;
947
948         ret = vfio_check_feature(flags, argsz, VFIO_DEVICE_FEATURE_GET,
949                                  sizeof(data_size));
950         if (ret != 1)
951                 return ret;
952
953         ret = device->mig_ops->migration_get_data_size(device, &stop_copy_length);
954         if (ret)
955                 return ret;
956
957         data_size.stop_copy_length = stop_copy_length;
958         if (copy_to_user(arg, &data_size, sizeof(data_size)))
959                 return -EFAULT;
960
961         return 0;
962 }
963
964 static int vfio_ioctl_device_feature_migration(struct vfio_device *device,
965                                                u32 flags, void __user *arg,
966                                                size_t argsz)
967 {
968         struct vfio_device_feature_migration mig = {
969                 .flags = device->migration_flags,
970         };
971         int ret;
972
973         if (!device->mig_ops)
974                 return -ENOTTY;
975
976         ret = vfio_check_feature(flags, argsz, VFIO_DEVICE_FEATURE_GET,
977                                  sizeof(mig));
978         if (ret != 1)
979                 return ret;
980         if (copy_to_user(arg, &mig, sizeof(mig)))
981                 return -EFAULT;
982         return 0;
983 }
984
985 void vfio_combine_iova_ranges(struct rb_root_cached *root, u32 cur_nodes,
986                               u32 req_nodes)
987 {
988         struct interval_tree_node *prev, *curr, *comb_start, *comb_end;
989         unsigned long min_gap, curr_gap;
990
991         /* Special shortcut when a single range is required */
992         if (req_nodes == 1) {
993                 unsigned long last;
994
995                 comb_start = interval_tree_iter_first(root, 0, ULONG_MAX);
996
997                 /* Empty list */
998                 if (WARN_ON_ONCE(!comb_start))
999                         return;
1000
1001                 curr = comb_start;
1002                 while (curr) {
1003                         last = curr->last;
1004                         prev = curr;
1005                         curr = interval_tree_iter_next(curr, 0, ULONG_MAX);
1006                         if (prev != comb_start)
1007                                 interval_tree_remove(prev, root);
1008                 }
1009                 comb_start->last = last;
1010                 return;
1011         }
1012
1013         /* Combine ranges which have the smallest gap */
1014         while (cur_nodes > req_nodes) {
1015                 prev = NULL;
1016                 min_gap = ULONG_MAX;
1017                 curr = interval_tree_iter_first(root, 0, ULONG_MAX);
1018                 while (curr) {
1019                         if (prev) {
1020                                 curr_gap = curr->start - prev->last;
1021                                 if (curr_gap < min_gap) {
1022                                         min_gap = curr_gap;
1023                                         comb_start = prev;
1024                                         comb_end = curr;
1025                                 }
1026                         }
1027                         prev = curr;
1028                         curr = interval_tree_iter_next(curr, 0, ULONG_MAX);
1029                 }
1030
1031                 /* Empty list or no nodes to combine */
1032                 if (WARN_ON_ONCE(min_gap == ULONG_MAX))
1033                         break;
1034
1035                 comb_start->last = comb_end->last;
1036                 interval_tree_remove(comb_end, root);
1037                 cur_nodes--;
1038         }
1039 }
1040 EXPORT_SYMBOL_GPL(vfio_combine_iova_ranges);
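/*
 * Illustrative sketch (not part of the original file): drivers whose
 * hardware can only track a limited number of ranges typically call this
 * from their ->log_start() to merge the interval tree built by
 * vfio_ioctl_device_feature_logging_start() down to what the device
 * supports.  MY_HW_MAX_RANGES is a hypothetical limit.
 *
 *	if (nnodes > MY_HW_MAX_RANGES) {
 *		vfio_combine_iova_ranges(ranges, nnodes, MY_HW_MAX_RANGES);
 *		nnodes = MY_HW_MAX_RANGES;
 *	}
 */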
1041
1042 /* Ranges should fit into a single kernel page */
1043 #define LOG_MAX_RANGES \
1044         (PAGE_SIZE / sizeof(struct vfio_device_feature_dma_logging_range))
1045
1046 static int
1047 vfio_ioctl_device_feature_logging_start(struct vfio_device *device,
1048                                         u32 flags, void __user *arg,
1049                                         size_t argsz)
1050 {
1051         size_t minsz =
1052                 offsetofend(struct vfio_device_feature_dma_logging_control,
1053                             ranges);
1054         struct vfio_device_feature_dma_logging_range __user *ranges;
1055         struct vfio_device_feature_dma_logging_control control;
1056         struct vfio_device_feature_dma_logging_range range;
1057         struct rb_root_cached root = RB_ROOT_CACHED;
1058         struct interval_tree_node *nodes;
1059         u64 iova_end;
1060         u32 nnodes;
1061         int i, ret;
1062
1063         if (!device->log_ops)
1064                 return -ENOTTY;
1065
1066         ret = vfio_check_feature(flags, argsz,
1067                                  VFIO_DEVICE_FEATURE_SET,
1068                                  sizeof(control));
1069         if (ret != 1)
1070                 return ret;
1071
1072         if (copy_from_user(&control, arg, minsz))
1073                 return -EFAULT;
1074
1075         nnodes = control.num_ranges;
1076         if (!nnodes)
1077                 return -EINVAL;
1078
1079         if (nnodes > LOG_MAX_RANGES)
1080                 return -E2BIG;
1081
1082         ranges = u64_to_user_ptr(control.ranges);
1083         nodes = kmalloc_array(nnodes, sizeof(struct interval_tree_node),
1084                               GFP_KERNEL);
1085         if (!nodes)
1086                 return -ENOMEM;
1087
1088         for (i = 0; i < nnodes; i++) {
1089                 if (copy_from_user(&range, &ranges[i], sizeof(range))) {
1090                         ret = -EFAULT;
1091                         goto end;
1092                 }
1093                 if (!IS_ALIGNED(range.iova, control.page_size) ||
1094                     !IS_ALIGNED(range.length, control.page_size)) {
1095                         ret = -EINVAL;
1096                         goto end;
1097                 }
1098
1099                 if (check_add_overflow(range.iova, range.length, &iova_end) ||
1100                     iova_end > ULONG_MAX) {
1101                         ret = -EOVERFLOW;
1102                         goto end;
1103                 }
1104
1105                 nodes[i].start = range.iova;
1106                 nodes[i].last = range.iova + range.length - 1;
1107                 if (interval_tree_iter_first(&root, nodes[i].start,
1108                                              nodes[i].last)) {
1109                         /* Overlapping ranges are not allowed */
1110                         ret = -EINVAL;
1111                         goto end;
1112                 }
1113                 interval_tree_insert(nodes + i, &root);
1114         }
1115
1116         ret = device->log_ops->log_start(device, &root, nnodes,
1117                                          &control.page_size);
1118         if (ret)
1119                 goto end;
1120
1121         if (copy_to_user(arg, &control, sizeof(control))) {
1122                 ret = -EFAULT;
1123                 device->log_ops->log_stop(device);
1124         }
1125
1126 end:
1127         kfree(nodes);
1128         return ret;
1129 }
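/*
 * Userspace view (illustrative sketch, not part of the original file) of
 * starting DMA dirty tracking over a single range; device_fd and ram_size
 * are assumed to exist in the caller:
 *
 *	__u8 buf[sizeof(struct vfio_device_feature) +
 *		 sizeof(struct vfio_device_feature_dma_logging_control)] = {};
 *	struct vfio_device_feature *feat = (struct vfio_device_feature *)buf;
 *	struct vfio_device_feature_dma_logging_control *ctrl = (void *)feat->data;
 *	struct vfio_device_feature_dma_logging_range range = {
 *		.iova = 0,
 *		.length = ram_size,
 *	};
 *
 *	feat->argsz = sizeof(buf);
 *	feat->flags = VFIO_DEVICE_FEATURE_SET |
 *		      VFIO_DEVICE_FEATURE_DMA_LOGGING_START;
 *	ctrl->page_size = 4096;
 *	ctrl->num_ranges = 1;
 *	ctrl->ranges = (__u64)(uintptr_t)&range;
 *
 *	if (ioctl(device_fd, VFIO_DEVICE_FEATURE, feat))
 *		return -errno;
 *	// The kernel copies the control struct back with the page_size it chose.
 */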
1130
1131 static int
1132 vfio_ioctl_device_feature_logging_stop(struct vfio_device *device,
1133                                        u32 flags, void __user *arg,
1134                                        size_t argsz)
1135 {
1136         int ret;
1137
1138         if (!device->log_ops)
1139                 return -ENOTTY;
1140
1141         ret = vfio_check_feature(flags, argsz,
1142                                  VFIO_DEVICE_FEATURE_SET, 0);
1143         if (ret != 1)
1144                 return ret;
1145
1146         return device->log_ops->log_stop(device);
1147 }
1148
1149 static int vfio_device_log_read_and_clear(struct iova_bitmap *iter,
1150                                           unsigned long iova, size_t length,
1151                                           void *opaque)
1152 {
1153         struct vfio_device *device = opaque;
1154
1155         return device->log_ops->log_read_and_clear(device, iova, length, iter);
1156 }
1157
1158 static int
1159 vfio_ioctl_device_feature_logging_report(struct vfio_device *device,
1160                                          u32 flags, void __user *arg,
1161                                          size_t argsz)
1162 {
1163         size_t minsz =
1164                 offsetofend(struct vfio_device_feature_dma_logging_report,
1165                             bitmap);
1166         struct vfio_device_feature_dma_logging_report report;
1167         struct iova_bitmap *iter;
1168         u64 iova_end;
1169         int ret;
1170
1171         if (!device->log_ops)
1172                 return -ENOTTY;
1173
1174         ret = vfio_check_feature(flags, argsz,
1175                                  VFIO_DEVICE_FEATURE_GET,
1176                                  sizeof(report));
1177         if (ret != 1)
1178                 return ret;
1179
1180         if (copy_from_user(&report, arg, minsz))
1181                 return -EFAULT;
1182
1183         if (report.page_size < SZ_4K || !is_power_of_2(report.page_size))
1184                 return -EINVAL;
1185
1186         if (check_add_overflow(report.iova, report.length, &iova_end) ||
1187             iova_end > ULONG_MAX)
1188                 return -EOVERFLOW;
1189
1190         iter = iova_bitmap_alloc(report.iova, report.length,
1191                                  report.page_size,
1192                                  u64_to_user_ptr(report.bitmap));
1193         if (IS_ERR(iter))
1194                 return PTR_ERR(iter);
1195
1196         ret = iova_bitmap_for_each(iter, device,
1197                                    vfio_device_log_read_and_clear);
1198
1199         iova_bitmap_free(iter);
1200         return ret;
1201 }
1202
1203 static int vfio_ioctl_device_feature(struct vfio_device *device,
1204                                      struct vfio_device_feature __user *arg)
1205 {
1206         size_t minsz = offsetofend(struct vfio_device_feature, flags);
1207         struct vfio_device_feature feature;
1208
1209         if (copy_from_user(&feature, arg, minsz))
1210                 return -EFAULT;
1211
1212         if (feature.argsz < minsz)
1213                 return -EINVAL;
1214
1215         /* Check unknown flags */
1216         if (feature.flags &
1217             ~(VFIO_DEVICE_FEATURE_MASK | VFIO_DEVICE_FEATURE_SET |
1218               VFIO_DEVICE_FEATURE_GET | VFIO_DEVICE_FEATURE_PROBE))
1219                 return -EINVAL;
1220
1221         /* GET & SET are mutually exclusive except with PROBE */
1222         if (!(feature.flags & VFIO_DEVICE_FEATURE_PROBE) &&
1223             (feature.flags & VFIO_DEVICE_FEATURE_SET) &&
1224             (feature.flags & VFIO_DEVICE_FEATURE_GET))
1225                 return -EINVAL;
1226
1227         switch (feature.flags & VFIO_DEVICE_FEATURE_MASK) {
1228         case VFIO_DEVICE_FEATURE_MIGRATION:
1229                 return vfio_ioctl_device_feature_migration(
1230                         device, feature.flags, arg->data,
1231                         feature.argsz - minsz);
1232         case VFIO_DEVICE_FEATURE_MIG_DEVICE_STATE:
1233                 return vfio_ioctl_device_feature_mig_device_state(
1234                         device, feature.flags, arg->data,
1235                         feature.argsz - minsz);
1236         case VFIO_DEVICE_FEATURE_DMA_LOGGING_START:
1237                 return vfio_ioctl_device_feature_logging_start(
1238                         device, feature.flags, arg->data,
1239                         feature.argsz - minsz);
1240         case VFIO_DEVICE_FEATURE_DMA_LOGGING_STOP:
1241                 return vfio_ioctl_device_feature_logging_stop(
1242                         device, feature.flags, arg->data,
1243                         feature.argsz - minsz);
1244         case VFIO_DEVICE_FEATURE_DMA_LOGGING_REPORT:
1245                 return vfio_ioctl_device_feature_logging_report(
1246                         device, feature.flags, arg->data,
1247                         feature.argsz - minsz);
1248         case VFIO_DEVICE_FEATURE_MIG_DATA_SIZE:
1249                 return vfio_ioctl_device_feature_migration_data_size(
1250                         device, feature.flags, arg->data,
1251                         feature.argsz - minsz);
1252         default:
1253                 if (unlikely(!device->ops->device_feature))
1254                         return -EINVAL;
1255                 return device->ops->device_feature(device, feature.flags,
1256                                                    arg->data,
1257                                                    feature.argsz - minsz);
1258         }
1259 }
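/*
 * Userspace view (illustrative sketch, not part of the original file) of
 * the VFIO_DEVICE_FEATURE calling convention handled above, here moving a
 * device to STOP_COPY; device_fd is assumed to be an open device FD:
 *
 *	__u8 buf[sizeof(struct vfio_device_feature) +
 *		 sizeof(struct vfio_device_feature_mig_state)] = {};
 *	struct vfio_device_feature *feat = (struct vfio_device_feature *)buf;
 *	struct vfio_device_feature_mig_state *mig = (void *)feat->data;
 *
 *	feat->argsz = sizeof(buf);
 *	feat->flags = VFIO_DEVICE_FEATURE_SET |
 *		      VFIO_DEVICE_FEATURE_MIG_DEVICE_STATE;
 *	mig->device_state = VFIO_DEVICE_STATE_STOP_COPY;
 *
 *	if (ioctl(device_fd, VFIO_DEVICE_FEATURE, feat))
 *		return -errno;
 *	// On success mig->data_fd holds the FD for the migration data stream.
 */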
1260
1261 static long vfio_device_fops_unl_ioctl(struct file *filep,
1262                                        unsigned int cmd, unsigned long arg)
1263 {
1264         struct vfio_device_file *df = filep->private_data;
1265         struct vfio_device *device = df->device;
1266         void __user *uptr = (void __user *)arg;
1267         int ret;
1268
1269         if (cmd == VFIO_DEVICE_BIND_IOMMUFD)
1270                 return vfio_df_ioctl_bind_iommufd(df, uptr);
1271
1272         /* Paired with smp_store_release() following vfio_df_open() */
1273         if (!smp_load_acquire(&df->access_granted))
1274                 return -EINVAL;
1275
1276         ret = vfio_device_pm_runtime_get(device);
1277         if (ret)
1278                 return ret;
1279
1280         /* cdev only ioctls */
1281         if (IS_ENABLED(CONFIG_VFIO_DEVICE_CDEV) && !df->group) {
1282                 switch (cmd) {
1283                 case VFIO_DEVICE_ATTACH_IOMMUFD_PT:
1284                         ret = vfio_df_ioctl_attach_pt(df, uptr);
1285                         goto out;
1286
1287                 case VFIO_DEVICE_DETACH_IOMMUFD_PT:
1288                         ret = vfio_df_ioctl_detach_pt(df, uptr);
1289                         goto out;
1290                 }
1291         }
1292
1293         switch (cmd) {
1294         case VFIO_DEVICE_FEATURE:
1295                 ret = vfio_ioctl_device_feature(device, uptr);
1296                 break;
1297
1298         default:
1299                 if (unlikely(!device->ops->ioctl))
1300                         ret = -EINVAL;
1301                 else
1302                         ret = device->ops->ioctl(device, cmd, arg);
1303                 break;
1304         }
1305 out:
1306         vfio_device_pm_runtime_put(device);
1307         return ret;
1308 }
1309
1310 static ssize_t vfio_device_fops_read(struct file *filep, char __user *buf,
1311                                      size_t count, loff_t *ppos)
1312 {
1313         struct vfio_device_file *df = filep->private_data;
1314         struct vfio_device *device = df->device;
1315
1316         /* Paired with smp_store_release() following vfio_df_open() */
1317         if (!smp_load_acquire(&df->access_granted))
1318                 return -EINVAL;
1319
1320         if (unlikely(!device->ops->read))
1321                 return -EINVAL;
1322
1323         return device->ops->read(device, buf, count, ppos);
1324 }
1325
1326 static ssize_t vfio_device_fops_write(struct file *filep,
1327                                       const char __user *buf,
1328                                       size_t count, loff_t *ppos)
1329 {
1330         struct vfio_device_file *df = filep->private_data;
1331         struct vfio_device *device = df->device;
1332
1333         /* Paired with smp_store_release() following vfio_df_open() */
1334         if (!smp_load_acquire(&df->access_granted))
1335                 return -EINVAL;
1336
1337         if (unlikely(!device->ops->write))
1338                 return -EINVAL;
1339
1340         return device->ops->write(device, buf, count, ppos);
1341 }
1342
1343 static int vfio_device_fops_mmap(struct file *filep, struct vm_area_struct *vma)
1344 {
1345         struct vfio_device_file *df = filep->private_data;
1346         struct vfio_device *device = df->device;
1347
1348         /* Paired with smp_store_release() following vfio_df_open() */
1349         if (!smp_load_acquire(&df->access_granted))
1350                 return -EINVAL;
1351
1352         if (unlikely(!device->ops->mmap))
1353                 return -EINVAL;
1354
1355         return device->ops->mmap(device, vma);
1356 }
1357
1358 const struct file_operations vfio_device_fops = {
1359         .owner          = THIS_MODULE,
1360         .open           = vfio_device_fops_cdev_open,
1361         .release        = vfio_device_fops_release,
1362         .read           = vfio_device_fops_read,
1363         .write          = vfio_device_fops_write,
1364         .unlocked_ioctl = vfio_device_fops_unl_ioctl,
1365         .compat_ioctl   = compat_ptr_ioctl,
1366         .mmap           = vfio_device_fops_mmap,
1367 };
1368
1369 static struct vfio_device *vfio_device_from_file(struct file *file)
1370 {
1371         struct vfio_device_file *df = file->private_data;
1372
1373         if (file->f_op != &vfio_device_fops)
1374                 return NULL;
1375         return df->device;
1376 }
1377
1378 /**
1379  * vfio_file_is_valid - True if the file is a valid vfio file
1380  * @file: VFIO group file or VFIO device file
1381  */
1382 bool vfio_file_is_valid(struct file *file)
1383 {
1384         return vfio_group_from_file(file) ||
1385                vfio_device_from_file(file);
1386 }
1387 EXPORT_SYMBOL_GPL(vfio_file_is_valid);
1388
1389 /**
1390  * vfio_file_enforced_coherent - True if the DMA associated with the VFIO file
1391  *        is always CPU cache coherent
1392  * @file: VFIO group file or VFIO device file
1393  *
1394  * Enforced coherency means that the IOMMU ignores things like the PCIe no-snoop
1395  * bit in DMA transactions. A return of false indicates that the user has
1396  * rights to access additional instructions such as wbinvd on x86.
1397  */
1398 bool vfio_file_enforced_coherent(struct file *file)
1399 {
1400         struct vfio_device *device;
1401         struct vfio_group *group;
1402
1403         group = vfio_group_from_file(file);
1404         if (group)
1405                 return vfio_group_enforced_coherent(group);
1406
1407         device = vfio_device_from_file(file);
1408         if (device)
1409                 return device_iommu_capable(device->dev,
1410                                             IOMMU_CAP_ENFORCE_CACHE_COHERENCY);
1411
1412         return true;
1413 }
1414 EXPORT_SYMBOL_GPL(vfio_file_enforced_coherent);
1415
1416 static void vfio_device_file_set_kvm(struct file *file, struct kvm *kvm)
1417 {
1418         struct vfio_device_file *df = file->private_data;
1419
1420         /*
1421          * The kvm is first recorded in the vfio_device_file, and will
1422          * be propagated to vfio_device::kvm when the file is bound to
1423          * iommufd successfully in the vfio device cdev path.
1424          */
1425         spin_lock(&df->kvm_ref_lock);
1426         df->kvm = kvm;
1427         spin_unlock(&df->kvm_ref_lock);
1428 }
1429
1430 /**
1431  * vfio_file_set_kvm - Link a kvm with VFIO drivers
1432  * @file: VFIO group file or VFIO device file
1433  * @kvm: KVM to link
1434  *
1435  * When a VFIO device is first opened the KVM will be available in
1436  * device->kvm if one was associated with the file.
1437  */
1438 void vfio_file_set_kvm(struct file *file, struct kvm *kvm)
1439 {
1440         struct vfio_group *group;
1441
1442         group = vfio_group_from_file(file);
1443         if (group)
1444                 vfio_group_set_kvm(group, kvm);
1445
1446         if (vfio_device_from_file(file))
1447                 vfio_device_file_set_kvm(file, kvm);
1448 }
1449 EXPORT_SYMBOL_GPL(vfio_file_set_kvm);
1450
1451 /*
1452  * Sub-module support
1453  */
1454 /*
1455  * Helper for managing a buffer of info chain capabilities: allocate or
1456  * reallocate the buffer with additional @size, filling in @id and @version
1457  * of the new capability.  A pointer to the new capability is returned.
1458  *
1459  * NB. The chain is based at the head of the buffer, so new entries are
1460  * added to the tail, vfio_info_cap_shift() should be called to fixup the
1461  * next offsets prior to copying to the user buffer.
1462  */
1463 struct vfio_info_cap_header *vfio_info_cap_add(struct vfio_info_cap *caps,
1464                                                size_t size, u16 id, u16 version)
1465 {
1466         void *buf;
1467         struct vfio_info_cap_header *header, *tmp;
1468
1469         /* Ensure that the next capability struct will be aligned */
1470         size = ALIGN(size, sizeof(u64));
1471
1472         buf = krealloc(caps->buf, caps->size + size, GFP_KERNEL);
1473         if (!buf) {
1474                 kfree(caps->buf);
1475                 caps->buf = NULL;
1476                 caps->size = 0;
1477                 return ERR_PTR(-ENOMEM);
1478         }
1479
1480         caps->buf = buf;
1481         header = buf + caps->size;
1482
1483         /* Eventually copied to user buffer, zero */
1484         memset(header, 0, size);
1485
1486         header->id = id;
1487         header->version = version;
1488
1489         /* Add to the end of the capability chain */
1490         for (tmp = buf; tmp->next; tmp = buf + tmp->next)
1491                 ; /* nothing */
1492
1493         tmp->next = caps->size;
1494         caps->size += size;
1495
1496         return header;
1497 }
1498 EXPORT_SYMBOL_GPL(vfio_info_cap_add);
1499
1500 void vfio_info_cap_shift(struct vfio_info_cap *caps, size_t offset)
1501 {
1502         struct vfio_info_cap_header *tmp;
1503         void *buf = (void *)caps->buf;
1504
1505         /* Capability structs should start with proper alignment */
1506         WARN_ON(!IS_ALIGNED(offset, sizeof(u64)));
1507
1508         for (tmp = buf; tmp->next; tmp = buf + tmp->next - offset)
1509                 tmp->next += offset;
1510 }
1511 EXPORT_SYMBOL(vfio_info_cap_shift);
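
/*
 * Illustrative sketch, not part of this file: the usual driver-side pattern
 * for a *_GET_*_INFO ioctl, loosely modeled on vfio-pci.  "info", "arg",
 * struct my_fancy_cap and MY_FANCY_CAP_ID are hypothetical; only the helper
 * calls and the shift-before-copy ordering are the point.
 *
 *        struct vfio_info_cap caps = { .buf = NULL, .size = 0 };
 *        struct vfio_info_cap_header *header;
 *        struct my_fancy_cap *cap;
 *
 *        header = vfio_info_cap_add(&caps, sizeof(*cap), MY_FANCY_CAP_ID, 1);
 *        if (IS_ERR(header))
 *                return PTR_ERR(header);
 *        cap = container_of(header, struct my_fancy_cap, header);
 *        // fill in the capability body through cap here
 *
 *        if (caps.size) {
 *                info.flags |= VFIO_REGION_INFO_FLAG_CAPS;
 *                if (info.argsz < sizeof(info) + caps.size) {
 *                        info.argsz = sizeof(info) + caps.size;
 *                } else {
 *                        vfio_info_cap_shift(&caps, sizeof(info));
 *                        if (copy_to_user((void __user *)arg + sizeof(info),
 *                                         caps.buf, caps.size)) {
 *                                kfree(caps.buf);
 *                                return -EFAULT;
 *                        }
 *                        info.cap_offset = sizeof(info);
 *                }
 *                kfree(caps.buf);
 *        }
 */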
1512
1513 int vfio_info_add_capability(struct vfio_info_cap *caps,
1514                              struct vfio_info_cap_header *cap, size_t size)
1515 {
1516         struct vfio_info_cap_header *header;
1517
1518         header = vfio_info_cap_add(caps, size, cap->id, cap->version);
1519         if (IS_ERR(header))
1520                 return PTR_ERR(header);
1521
1522         memcpy(header + 1, cap + 1, size - sizeof(*header));
1523
1524         return 0;
1525 }
1526 EXPORT_SYMBOL(vfio_info_add_capability);
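
/*
 * Illustrative sketch, not part of this file: vfio_info_add_capability() is
 * the convenience form for a fixed-size capability already filled in on the
 * stack, as vfio-pci does for vendor region types.  The type/subtype values
 * are hypothetical examples, and "caps"/"ret" come from a surrounding handler
 * like the one sketched above.
 *
 *        struct vfio_region_info_cap_type cap_type = {
 *                .header.id = VFIO_REGION_INFO_CAP_TYPE,
 *                .header.version = 1,
 *                .type = VFIO_REGION_TYPE_PCI_VENDOR_TYPE | PCI_VENDOR_ID_INTEL,
 *                .subtype = 1,
 *        };
 *
 *        ret = vfio_info_add_capability(&caps, &cap_type.header,
 *                                       sizeof(cap_type));
 */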
1527
1528 int vfio_set_irqs_validate_and_prepare(struct vfio_irq_set *hdr, int num_irqs,
1529                                        int max_irq_type, size_t *data_size)
1530 {
1531         unsigned long minsz;
1532         size_t size;
1533
1534         minsz = offsetofend(struct vfio_irq_set, count);
1535
1536         if ((hdr->argsz < minsz) || (hdr->index >= max_irq_type) ||
1537             (hdr->count >= (U32_MAX - hdr->start)) ||
1538             (hdr->flags & ~(VFIO_IRQ_SET_DATA_TYPE_MASK |
1539                                 VFIO_IRQ_SET_ACTION_TYPE_MASK)))
1540                 return -EINVAL;
1541
1542         if (data_size)
1543                 *data_size = 0;
1544
1545         if (hdr->start >= num_irqs || hdr->start + hdr->count > num_irqs)
1546                 return -EINVAL;
1547
1548         switch (hdr->flags & VFIO_IRQ_SET_DATA_TYPE_MASK) {
1549         case VFIO_IRQ_SET_DATA_NONE:
1550                 size = 0;
1551                 break;
1552         case VFIO_IRQ_SET_DATA_BOOL:
1553                 size = sizeof(uint8_t);
1554                 break;
1555         case VFIO_IRQ_SET_DATA_EVENTFD:
1556                 size = sizeof(int32_t);
1557                 break;
1558         default:
1559                 return -EINVAL;
1560         }
1561
1562         if (size) {
1563                 if (hdr->argsz - minsz < hdr->count * size)
1564                         return -EINVAL;
1565
1566                 if (!data_size)
1567                         return -EINVAL;
1568
1569                 *data_size = hdr->count * size;
1570         }
1571
1572         return 0;
1573 }
1574 EXPORT_SYMBOL(vfio_set_irqs_validate_and_prepare);
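
/*
 * Illustrative sketch, not part of this file: how a driver's
 * VFIO_DEVICE_SET_IRQS handler typically uses this helper, loosely following
 * the classic vfio-pci flow.  "num_irqs" and the later dispatch to an IRQ
 * backend are assumptions.
 *
 *        struct vfio_irq_set hdr;
 *        unsigned long minsz = offsetofend(struct vfio_irq_set, count);
 *        size_t data_size = 0;
 *        u8 *data = NULL;
 *        int ret;
 *
 *        if (copy_from_user(&hdr, (void __user *)arg, minsz))
 *                return -EFAULT;
 *
 *        ret = vfio_set_irqs_validate_and_prepare(&hdr, num_irqs,
 *                                                 VFIO_PCI_NUM_IRQS, &data_size);
 *        if (ret)
 *                return ret;
 *
 *        if (data_size) {
 *                data = memdup_user((void __user *)(arg + minsz), data_size);
 *                if (IS_ERR(data))
 *                        return PTR_ERR(data);
 *        }
 *        // hand hdr.index/start/count/flags and data to the IRQ backend
 *        kfree(data);
 */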
1575
1576 /*
1577  * Pin contiguous user pages and return their associated host pages for local
1578  * domain only.
1579  * @device [in]  : device
1580  * @iova [in]    : starting IOVA of user pages to be pinned.
1581  * @npage [in]   : count of pages to be pinned.  This count should not
1582  *                 be greater than VFIO_PIN_PAGES_MAX_ENTRIES.
1583  * @prot [in]    : protection flags
1584  * @pages[out]   : array of host pages
1585  * Return error or number of pages pinned.
1586  *
1587  * A driver may only call this function if the vfio_device was created by
1588  * vfio_register_emulated_iommu_dev(), as required by vfio_device_container_pin_pages().
1589  */
1590 int vfio_pin_pages(struct vfio_device *device, dma_addr_t iova,
1591                    int npage, int prot, struct page **pages)
1592 {
1593         /* group->container cannot change while a vfio device is open */
1594         if (!pages || !npage || WARN_ON(!vfio_assert_device_open(device)))
1595                 return -EINVAL;
1596         if (!device->ops->dma_unmap)
1597                 return -EINVAL;
1598         if (vfio_device_has_container(device))
1599                 return vfio_device_container_pin_pages(device, iova,
1600                                                        npage, prot, pages);
1601         if (device->iommufd_access) {
1602                 int ret;
1603
1604                 if (iova > ULONG_MAX)
1605                         return -EINVAL;
1606                 /*
1607                  * VFIO ignores the sub-page offset; npage counts whole pages from the
1608                  * start of a PAGE_SIZE-aligned chunk of IOVA. The caller is expected to
1609                  * recover the sub-page offset by doing:
1610                  *     pages[0] + (iova % PAGE_SIZE)
1611                  */
1612                 ret = iommufd_access_pin_pages(
1613                         device->iommufd_access, ALIGN_DOWN(iova, PAGE_SIZE),
1614                         npage * PAGE_SIZE, pages,
1615                         (prot & IOMMU_WRITE) ? IOMMUFD_ACCESS_RW_WRITE : 0);
1616                 if (ret)
1617                         return ret;
1618                 return npage;
1619         }
1620         return -EINVAL;
1621 }
1622 EXPORT_SYMBOL(vfio_pin_pages);
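
/*
 * Illustrative sketch, not part of this file: an emulated-IOMMU (mdev-style)
 * driver pinning a single guest page for CPU access and recovering the
 * sub-page offset as described above.  "vdev" is the driver's vfio_device;
 * the access pattern is a simplified assumption.
 *
 *        struct page *page;
 *        void *va;
 *        int ret;
 *
 *        ret = vfio_pin_pages(vdev, iova, 1, IOMMU_READ | IOMMU_WRITE, &page);
 *        if (ret != 1)
 *                return ret < 0 ? ret : -EFAULT;
 *
 *        va = kmap_local_page(page) + (iova % PAGE_SIZE);
 *        // ... access the guest page through va ...
 *        kunmap_local(va);
 *
 *        vfio_unpin_pages(vdev, iova, 1);
 */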
1623
1624 /*
1625  * Unpin contiguous host pages for local domain only.
1626  * @device [in]  : device
1627  * @iova [in]    : starting address of user pages to be unpinned.
1628  * @npage [in]   : count of pages to be unpinned.  This count should not
1629  *                 be greater than VFIO_PIN_PAGES_MAX_ENTRIES.
1630  */
1631 void vfio_unpin_pages(struct vfio_device *device, dma_addr_t iova, int npage)
1632 {
1633         if (WARN_ON(!vfio_assert_device_open(device)))
1634                 return;
1635         if (WARN_ON(!device->ops->dma_unmap))
1636                 return;
1637
1638         if (vfio_device_has_container(device)) {
1639                 vfio_device_container_unpin_pages(device, iova, npage);
1640                 return;
1641         }
1642         if (device->iommufd_access) {
1643                 if (WARN_ON(iova > ULONG_MAX))
1644                         return;
1645                 iommufd_access_unpin_pages(device->iommufd_access,
1646                                            ALIGN_DOWN(iova, PAGE_SIZE),
1647                                            npage * PAGE_SIZE);
1648                 return;
1649         }
1650 }
1651 EXPORT_SYMBOL(vfio_unpin_pages);
1652
1653 /*
1654  * This interface allows the CPU to perform virtual DMA on behalf of the
1655  * device.
1656  *
1657  * The CPU copies data between a kernel buffer and a range of IOVAs that map
1658  * user space memory, in either direction.
1659  *
1660  * As the read/write of user space memory is conducted via the CPUs and is
1661  * not a real device DMA, it is not necessary to pin the user space memory.
1662  *
1663  * @device [in]         : VFIO device
1664  * @iova [in]           : base IOVA of a user space buffer
1665  * @data [in]           : pointer to kernel buffer
1666  * @len [in]            : kernel buffer length
1667  * @write [in]          : true writes @data to the IOVA range, false reads it
1668  * Return error code on failure or 0 on success.
1669  */
1670 int vfio_dma_rw(struct vfio_device *device, dma_addr_t iova, void *data,
1671                 size_t len, bool write)
1672 {
1673         if (!data || !len || !vfio_assert_device_open(device))
1674                 return -EINVAL;
1675
1676         if (vfio_device_has_container(device))
1677                 return vfio_device_container_dma_rw(device, iova,
1678                                                     data, len, write);
1679
1680         if (device->iommufd_access) {
1681                 unsigned int flags = 0;
1682
1683                 if (iova > ULONG_MAX)
1684                         return -EINVAL;
1685
1686                 /* VFIO historically tries to auto-detect a kthread */
1687                 if (!current->mm)
1688                         flags |= IOMMUFD_ACCESS_RW_KTHREAD;
1689                 if (write)
1690                         flags |= IOMMUFD_ACCESS_RW_WRITE;
1691                 return iommufd_access_rw(device->iommufd_access, iova, data,
1692                                          len, flags);
1693         }
1694         return -EINVAL;
1695 }
1696 EXPORT_SYMBOL(vfio_dma_rw);
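
/*
 * Illustrative sketch, not part of this file: a driver (GVT-style emulation,
 * for instance) copying guest memory by IOVA without pinning it.  Buffer and
 * length handling are simplified assumptions.
 *
 *        ret = vfio_dma_rw(vdev, gpa, buf, len, false);  // read guest memory
 *        if (ret)
 *                return ret;
 *        // ... emulate the access ...
 *        ret = vfio_dma_rw(vdev, gpa, buf, len, true);   // write the result back
 */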
1697
1698 /*
1699  * Module/class support
1700  */
1701 static int __init vfio_init(void)
1702 {
1703         int ret;
1704
1705         ida_init(&vfio.device_ida);
1706
1707         ret = vfio_group_init();
1708         if (ret)
1709                 return ret;
1710
1711         ret = vfio_virqfd_init();
1712         if (ret)
1713                 goto err_virqfd;
1714
1715         /* /sys/class/vfio-dev/vfioX */
1716         vfio.device_class = class_create("vfio-dev");
1717         if (IS_ERR(vfio.device_class)) {
1718                 ret = PTR_ERR(vfio.device_class);
1719                 goto err_dev_class;
1720         }
1721
1722         ret = vfio_cdev_init(vfio.device_class);
1723         if (ret)
1724                 goto err_alloc_dev_chrdev;
1725
1726         vfio_debugfs_create_root();
1727         pr_info(DRIVER_DESC " version: " DRIVER_VERSION "\n");
1728         return 0;
1729
1730 err_alloc_dev_chrdev:
1731         class_destroy(vfio.device_class);
1732         vfio.device_class = NULL;
1733 err_dev_class:
1734         vfio_virqfd_exit();
1735 err_virqfd:
1736         vfio_group_cleanup();
1737         return ret;
1738 }
1739
1740 static void __exit vfio_cleanup(void)
1741 {
1742         vfio_debugfs_remove_root();
1743         ida_destroy(&vfio.device_ida);
1744         vfio_cdev_cleanup();
1745         class_destroy(vfio.device_class);
1746         vfio.device_class = NULL;
1747         vfio_virqfd_exit();
1748         vfio_group_cleanup();
1749         xa_destroy(&vfio_device_set_xa);
1750 }
1751
1752 module_init(vfio_init);
1753 module_exit(vfio_cleanup);
1754
1755 MODULE_IMPORT_NS("IOMMUFD");
1756 MODULE_VERSION(DRIVER_VERSION);
1757 MODULE_LICENSE("GPL v2");
1758 MODULE_AUTHOR(DRIVER_AUTHOR);
1759 MODULE_DESCRIPTION(DRIVER_DESC);
1760 MODULE_SOFTDEP("post: vfio_iommu_type1 vfio_iommu_spapr_tce");