vfio: Rename vfio_device_put() and vfio_device_try_get()
drivers/vfio/vfio_main.c
// SPDX-License-Identifier: GPL-2.0-only
/*
 * VFIO core
 *
 * Copyright (C) 2012 Red Hat, Inc.  All rights reserved.
 *     Author: Alex Williamson <alex.williamson@redhat.com>
 *
 * Derived from original vfio:
 * Copyright 2010 Cisco Systems, Inc.  All rights reserved.
 * Author: Tom Lyon, pugs@cisco.com
 */

#include <linux/cdev.h>
#include <linux/compat.h>
#include <linux/device.h>
#include <linux/file.h>
#include <linux/anon_inodes.h>
#include <linux/fs.h>
#include <linux/idr.h>
#include <linux/iommu.h>
#include <linux/list.h>
#include <linux/miscdevice.h>
#include <linux/module.h>
#include <linux/mutex.h>
#include <linux/pci.h>
#include <linux/rwsem.h>
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/stat.h>
#include <linux/string.h>
#include <linux/uaccess.h>
#include <linux/vfio.h>
#include <linux/wait.h>
#include <linux/sched/signal.h>
#include <linux/pm_runtime.h>
#include <linux/interval_tree.h>
#include <linux/iova_bitmap.h>
#include "vfio.h"

#define DRIVER_VERSION	"0.3"
#define DRIVER_AUTHOR	"Alex Williamson <alex.williamson@redhat.com>"
#define DRIVER_DESC	"VFIO - User Level meta-driver"

static struct vfio {
	struct class			*class;
	struct list_head		iommu_drivers_list;
	struct mutex			iommu_drivers_lock;
	struct list_head		group_list;
	struct mutex			group_lock; /* locks group_list */
	struct ida			group_ida;
	dev_t				group_devt;
} vfio;

struct vfio_iommu_driver {
	const struct vfio_iommu_driver_ops	*ops;
	struct list_head			vfio_next;
};

struct vfio_container {
	struct kref			kref;
	struct list_head		group_list;
	struct rw_semaphore		group_lock;
	struct vfio_iommu_driver	*iommu_driver;
	void				*iommu_data;
	bool				noiommu;
};

struct vfio_group {
	struct device			dev;
	struct cdev			cdev;
	refcount_t			users;
	unsigned int			container_users;
	struct iommu_group		*iommu_group;
	struct vfio_container		*container;
	struct list_head		device_list;
	struct mutex			device_lock;
	struct list_head		vfio_next;
	struct list_head		container_next;
	enum vfio_group_type		type;
	struct rw_semaphore		group_rwsem;
	struct kvm			*kvm;
	struct file			*opened_file;
	struct blocking_notifier_head	notifier;
};

#ifdef CONFIG_VFIO_NOIOMMU
static bool noiommu __read_mostly;
module_param_named(enable_unsafe_noiommu_mode,
		   noiommu, bool, S_IRUGO | S_IWUSR);
MODULE_PARM_DESC(enable_unsafe_noiommu_mode, "Enable UNSAFE, no-IOMMU mode.  This mode provides no device isolation, no DMA translation, no host kernel protection, cannot be used for device assignment to virtual machines, requires RAWIO permissions, and will taint the kernel.  If you do not know what this is for, step away. (default: false)");
#endif

static DEFINE_XARRAY(vfio_device_set_xa);
static const struct file_operations vfio_group_fops;

int vfio_assign_device_set(struct vfio_device *device, void *set_id)
{
	unsigned long idx = (unsigned long)set_id;
	struct vfio_device_set *new_dev_set;
	struct vfio_device_set *dev_set;

	if (WARN_ON(!set_id))
		return -EINVAL;

	/*
	 * Atomically acquire a singleton object in the xarray for this set_id
	 */
	xa_lock(&vfio_device_set_xa);
	dev_set = xa_load(&vfio_device_set_xa, idx);
	if (dev_set)
		goto found_get_ref;
	xa_unlock(&vfio_device_set_xa);

	new_dev_set = kzalloc(sizeof(*new_dev_set), GFP_KERNEL);
	if (!new_dev_set)
		return -ENOMEM;
	mutex_init(&new_dev_set->lock);
	INIT_LIST_HEAD(&new_dev_set->device_list);
	new_dev_set->set_id = set_id;

	xa_lock(&vfio_device_set_xa);
	dev_set = __xa_cmpxchg(&vfio_device_set_xa, idx, NULL, new_dev_set,
			       GFP_KERNEL);
	if (!dev_set) {
		dev_set = new_dev_set;
		goto found_get_ref;
	}

	kfree(new_dev_set);
	if (xa_is_err(dev_set)) {
		xa_unlock(&vfio_device_set_xa);
		return xa_err(dev_set);
	}

found_get_ref:
	dev_set->device_count++;
	xa_unlock(&vfio_device_set_xa);
	mutex_lock(&dev_set->lock);
	device->dev_set = dev_set;
	list_add_tail(&device->dev_set_list, &dev_set->device_list);
	mutex_unlock(&dev_set->lock);
	return 0;
}
EXPORT_SYMBOL_GPL(vfio_assign_device_set);
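
/*
 * Illustrative sketch (not part of the original file): a driver that wants
 * all functions of one physical device to share a reset domain could assign
 * them to the same dev_set, keyed by a stable pointer such as the PF. The
 * "my_*" names below are hypothetical placeholders.
 *
 *	struct my_dev *my = ...;
 *	int ret;
 *
 *	// All devices passing the same set_id share one vfio_device_set
 *	ret = vfio_assign_device_set(&my->vdev, pci_physfn(pdev));
 *	if (ret)
 *		return ret;
 */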

static void vfio_release_device_set(struct vfio_device *device)
{
	struct vfio_device_set *dev_set = device->dev_set;

	if (!dev_set)
		return;

	mutex_lock(&dev_set->lock);
	list_del(&device->dev_set_list);
	mutex_unlock(&dev_set->lock);

	xa_lock(&vfio_device_set_xa);
	if (!--dev_set->device_count) {
		__xa_erase(&vfio_device_set_xa,
			   (unsigned long)dev_set->set_id);
		mutex_destroy(&dev_set->lock);
		kfree(dev_set);
	}
	xa_unlock(&vfio_device_set_xa);
}

#ifdef CONFIG_VFIO_NOIOMMU
static void *vfio_noiommu_open(unsigned long arg)
{
	if (arg != VFIO_NOIOMMU_IOMMU)
		return ERR_PTR(-EINVAL);
	if (!capable(CAP_SYS_RAWIO))
		return ERR_PTR(-EPERM);

	return NULL;
}

static void vfio_noiommu_release(void *iommu_data)
{
}

static long vfio_noiommu_ioctl(void *iommu_data,
			       unsigned int cmd, unsigned long arg)
{
	if (cmd == VFIO_CHECK_EXTENSION)
		return noiommu && (arg == VFIO_NOIOMMU_IOMMU) ? 1 : 0;

	return -ENOTTY;
}

static int vfio_noiommu_attach_group(void *iommu_data,
		struct iommu_group *iommu_group, enum vfio_group_type type)
{
	return 0;
}

static void vfio_noiommu_detach_group(void *iommu_data,
				      struct iommu_group *iommu_group)
{
}

static const struct vfio_iommu_driver_ops vfio_noiommu_ops = {
	.name = "vfio-noiommu",
	.owner = THIS_MODULE,
	.open = vfio_noiommu_open,
	.release = vfio_noiommu_release,
	.ioctl = vfio_noiommu_ioctl,
	.attach_group = vfio_noiommu_attach_group,
	.detach_group = vfio_noiommu_detach_group,
};

/*
 * Only noiommu containers can use vfio-noiommu and noiommu containers can only
 * use vfio-noiommu.
 */
static inline bool vfio_iommu_driver_allowed(struct vfio_container *container,
					     const struct vfio_iommu_driver *driver)
{
	return container->noiommu == (driver->ops == &vfio_noiommu_ops);
}
#else
static inline bool vfio_iommu_driver_allowed(struct vfio_container *container,
					     const struct vfio_iommu_driver *driver)
{
	return true;
}
#endif /* CONFIG_VFIO_NOIOMMU */

/*
 * IOMMU driver registration
 */
int vfio_register_iommu_driver(const struct vfio_iommu_driver_ops *ops)
{
	struct vfio_iommu_driver *driver, *tmp;

	if (WARN_ON(!ops->register_device != !ops->unregister_device))
		return -EINVAL;

	driver = kzalloc(sizeof(*driver), GFP_KERNEL);
	if (!driver)
		return -ENOMEM;

	driver->ops = ops;

	mutex_lock(&vfio.iommu_drivers_lock);

	/* Check for duplicates */
	list_for_each_entry(tmp, &vfio.iommu_drivers_list, vfio_next) {
		if (tmp->ops == ops) {
			mutex_unlock(&vfio.iommu_drivers_lock);
			kfree(driver);
			return -EINVAL;
		}
	}

	list_add(&driver->vfio_next, &vfio.iommu_drivers_list);

	mutex_unlock(&vfio.iommu_drivers_lock);

	return 0;
}
EXPORT_SYMBOL_GPL(vfio_register_iommu_driver);
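
/*
 * Illustrative sketch (not in the original file): an IOMMU backend such as
 * vfio_iommu_type1 registers its ops table from module init and drops it on
 * exit. The "my_*" names stand in for a real backend's symbols.
 *
 *	static const struct vfio_iommu_driver_ops my_iommu_ops = {
 *		.name		= "my-iommu",
 *		.owner		= THIS_MODULE,
 *		.open		= my_iommu_open,
 *		.release	= my_iommu_release,
 *		.ioctl		= my_iommu_ioctl,
 *		.attach_group	= my_iommu_attach_group,
 *		.detach_group	= my_iommu_detach_group,
 *	};
 *
 *	static int __init my_iommu_init(void)
 *	{
 *		return vfio_register_iommu_driver(&my_iommu_ops);
 *	}
 *
 *	static void __exit my_iommu_exit(void)
 *	{
 *		vfio_unregister_iommu_driver(&my_iommu_ops);
 *	}
 */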

void vfio_unregister_iommu_driver(const struct vfio_iommu_driver_ops *ops)
{
	struct vfio_iommu_driver *driver;

	mutex_lock(&vfio.iommu_drivers_lock);
	list_for_each_entry(driver, &vfio.iommu_drivers_list, vfio_next) {
		if (driver->ops == ops) {
			list_del(&driver->vfio_next);
			mutex_unlock(&vfio.iommu_drivers_lock);
			kfree(driver);
			return;
		}
	}
	mutex_unlock(&vfio.iommu_drivers_lock);
}
EXPORT_SYMBOL_GPL(vfio_unregister_iommu_driver);

static void vfio_group_get(struct vfio_group *group);

/*
 * Container objects - containers are created when /dev/vfio/vfio is
 * opened, but their lifecycle extends until the last user is done, so
 * it's freed via kref.  Must support container/group/device being
 * closed in any order.
 */
static void vfio_container_get(struct vfio_container *container)
{
	kref_get(&container->kref);
}

static void vfio_container_release(struct kref *kref)
{
	struct vfio_container *container;
	container = container_of(kref, struct vfio_container, kref);

	kfree(container);
}

static void vfio_container_put(struct vfio_container *container)
{
	kref_put(&container->kref, vfio_container_release);
}

/*
 * Group objects - create, release, get, put, search
 */
static struct vfio_group *
__vfio_group_get_from_iommu(struct iommu_group *iommu_group)
{
	struct vfio_group *group;

	list_for_each_entry(group, &vfio.group_list, vfio_next) {
		if (group->iommu_group == iommu_group) {
			vfio_group_get(group);
			return group;
		}
	}
	return NULL;
}

static struct vfio_group *
vfio_group_get_from_iommu(struct iommu_group *iommu_group)
{
	struct vfio_group *group;

	mutex_lock(&vfio.group_lock);
	group = __vfio_group_get_from_iommu(iommu_group);
	mutex_unlock(&vfio.group_lock);
	return group;
}

static void vfio_group_release(struct device *dev)
{
	struct vfio_group *group = container_of(dev, struct vfio_group, dev);

	mutex_destroy(&group->device_lock);
	iommu_group_put(group->iommu_group);
	ida_free(&vfio.group_ida, MINOR(group->dev.devt));
	kfree(group);
}

static struct vfio_group *vfio_group_alloc(struct iommu_group *iommu_group,
					   enum vfio_group_type type)
{
	struct vfio_group *group;
	int minor;

	group = kzalloc(sizeof(*group), GFP_KERNEL);
	if (!group)
		return ERR_PTR(-ENOMEM);

	minor = ida_alloc_max(&vfio.group_ida, MINORMASK, GFP_KERNEL);
	if (minor < 0) {
		kfree(group);
		return ERR_PTR(minor);
	}

	device_initialize(&group->dev);
	group->dev.devt = MKDEV(MAJOR(vfio.group_devt), minor);
	group->dev.class = vfio.class;
	group->dev.release = vfio_group_release;
	cdev_init(&group->cdev, &vfio_group_fops);
	group->cdev.owner = THIS_MODULE;

	refcount_set(&group->users, 1);
	init_rwsem(&group->group_rwsem);
	INIT_LIST_HEAD(&group->device_list);
	mutex_init(&group->device_lock);
	group->iommu_group = iommu_group;
	/* put in vfio_group_release() */
	iommu_group_ref_get(iommu_group);
	group->type = type;
	BLOCKING_INIT_NOTIFIER_HEAD(&group->notifier);

	return group;
}

static struct vfio_group *vfio_create_group(struct iommu_group *iommu_group,
		enum vfio_group_type type)
{
	struct vfio_group *group;
	struct vfio_group *ret;
	int err;

	group = vfio_group_alloc(iommu_group, type);
	if (IS_ERR(group))
		return group;

	err = dev_set_name(&group->dev, "%s%d",
			   group->type == VFIO_NO_IOMMU ? "noiommu-" : "",
			   iommu_group_id(iommu_group));
	if (err) {
		ret = ERR_PTR(err);
		goto err_put;
	}

	mutex_lock(&vfio.group_lock);

	/* Did we race creating this group? */
	ret = __vfio_group_get_from_iommu(iommu_group);
	if (ret)
		goto err_unlock;

	err = cdev_device_add(&group->cdev, &group->dev);
	if (err) {
		ret = ERR_PTR(err);
		goto err_unlock;
	}

	list_add(&group->vfio_next, &vfio.group_list);

	mutex_unlock(&vfio.group_lock);
	return group;

err_unlock:
	mutex_unlock(&vfio.group_lock);
err_put:
	put_device(&group->dev);
	return ret;
}

static void vfio_group_put(struct vfio_group *group)
{
	if (!refcount_dec_and_mutex_lock(&group->users, &vfio.group_lock))
		return;

	/*
	 * These data structures all have paired operations that can only be
	 * undone when the caller holds a live reference on the group. Since all
	 * pairs must be undone these WARN_ON's indicate some caller did not
	 * properly hold the group reference.
	 */
	WARN_ON(!list_empty(&group->device_list));
	WARN_ON(group->container || group->container_users);
	WARN_ON(group->notifier.head);

	list_del(&group->vfio_next);
	cdev_device_del(&group->cdev, &group->dev);
	mutex_unlock(&vfio.group_lock);

	put_device(&group->dev);
}

static void vfio_group_get(struct vfio_group *group)
{
	refcount_inc(&group->users);
}

/*
 * Device objects - create, release, get, put, search
 */
/* Device reference always implies a group reference */
static void vfio_device_put_registration(struct vfio_device *device)
{
	if (refcount_dec_and_test(&device->refcount))
		complete(&device->comp);
}

static bool vfio_device_try_get_registration(struct vfio_device *device)
{
	return refcount_inc_not_zero(&device->refcount);
}

static struct vfio_device *vfio_group_get_device(struct vfio_group *group,
						 struct device *dev)
{
	struct vfio_device *device;

	mutex_lock(&group->device_lock);
	list_for_each_entry(device, &group->device_list, group_next) {
		if (device->dev == dev &&
		    vfio_device_try_get_registration(device)) {
			mutex_unlock(&group->device_lock);
			return device;
		}
	}
	mutex_unlock(&group->device_lock);
	return NULL;
}

/*
 * VFIO driver API
 */
/* Release helper called by vfio_put_device() */
void vfio_device_release(struct kref *kref)
{
	struct vfio_device *device =
			container_of(kref, struct vfio_device, kref);

	vfio_release_device_set(device);

	/*
	 * kvfree() cannot be done here due to a life cycle mess in
	 * vfio-ccw. Before the ccw part is fixed all drivers are
	 * required to support @release and call vfio_free_device()
	 * from there.
	 */
	device->ops->release(device);
}
EXPORT_SYMBOL_GPL(vfio_device_release);

/*
 * Allocate and initialize vfio_device so it can be registered to vfio
 * core.
 *
 * Drivers should use the wrapper vfio_alloc_device() for allocation.
 * @size is the size of the structure to be allocated, including any
 * private data used by the driver.
 *
 * Driver may provide an @init callback to cover device private data.
 *
 * Use vfio_put_device() to release the structure after success return.
 */
struct vfio_device *_vfio_alloc_device(size_t size, struct device *dev,
				       const struct vfio_device_ops *ops)
{
	struct vfio_device *device;
	int ret;

	if (WARN_ON(size < sizeof(struct vfio_device)))
		return ERR_PTR(-EINVAL);

	device = kvzalloc(size, GFP_KERNEL);
	if (!device)
		return ERR_PTR(-ENOMEM);

	ret = vfio_init_device(device, dev, ops);
	if (ret)
		goto out_free;
	return device;

out_free:
	kvfree(device);
	return ERR_PTR(ret);
}
EXPORT_SYMBOL_GPL(_vfio_alloc_device);
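
/*
 * Illustrative sketch (not part of the original file): a typical driver wraps
 * struct vfio_device in its own state and allocates it through the
 * vfio_alloc_device() wrapper, dropping its reference with vfio_put_device()
 * on error paths. "my_dev" and "my_ops" are hypothetical names.
 *
 *	struct my_dev {
 *		struct vfio_device vdev;
 *		void *priv;
 *	};
 *
 *	struct my_dev *my;
 *
 *	my = vfio_alloc_device(my_dev, vdev, dev, &my_ops);
 *	if (IS_ERR(my))
 *		return PTR_ERR(my);
 *	...
 *	vfio_put_device(&my->vdev);	// undo the allocation on failure
 */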

/*
 * Initialize a vfio_device so it can be registered to vfio core.
 *
 * Only vfio-ccw driver should call this interface.
 */
int vfio_init_device(struct vfio_device *device, struct device *dev,
		     const struct vfio_device_ops *ops)
{
	int ret;

	init_completion(&device->comp);
	device->dev = dev;
	device->ops = ops;

	if (ops->init) {
		ret = ops->init(device);
		if (ret)
			goto out_uninit;
	}

	kref_init(&device->kref);
	return 0;

out_uninit:
	vfio_release_device_set(device);
	return ret;
}
EXPORT_SYMBOL_GPL(vfio_init_device);

/*
 * The helper called by driver @release callback to free the device
 * structure. Drivers which don't have private data to clean can
 * simply use this helper as its @release.
 */
void vfio_free_device(struct vfio_device *device)
{
	kvfree(device);
}
EXPORT_SYMBOL_GPL(vfio_free_device);
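
/*
 * Illustrative sketch (not in the original file): a driver without extra
 * cleanup can point its vfio_device_ops @release at vfio_free_device(), or
 * wrap it when private state must be torn down first. "my_release" and the
 * priv member are hypothetical.
 *
 *	static void my_release(struct vfio_device *vdev)
 *	{
 *		struct my_dev *my = container_of(vdev, struct my_dev, vdev);
 *
 *		kfree(my->priv);	// driver-private teardown
 *		vfio_free_device(vdev);	// frees the vfio_device allocation
 *	}
 */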
580
c68ea0d0
CH
581static struct vfio_group *vfio_noiommu_group_alloc(struct device *dev,
582 enum vfio_group_type type)
1362591f
CH
583{
584 struct iommu_group *iommu_group;
585 struct vfio_group *group;
3af91771
CH
586 int ret;
587
588 iommu_group = iommu_group_alloc();
589 if (IS_ERR(iommu_group))
590 return ERR_CAST(iommu_group);
591
1c61d51e
LN
592 ret = iommu_group_set_name(iommu_group, "vfio-noiommu");
593 if (ret)
594 goto out_put_group;
3af91771
CH
595 ret = iommu_group_add_device(iommu_group, dev);
596 if (ret)
597 goto out_put_group;
1362591f 598
c68ea0d0 599 group = vfio_create_group(iommu_group, type);
3af91771
CH
600 if (IS_ERR(group)) {
601 ret = PTR_ERR(group);
602 goto out_remove_device;
603 }
325a31c9 604 iommu_group_put(iommu_group);
3af91771
CH
605 return group;
606
607out_remove_device:
608 iommu_group_remove_device(dev);
609out_put_group:
610 iommu_group_put(iommu_group);
611 return ERR_PTR(ret);
612}
3af91771
CH
613
614static struct vfio_group *vfio_group_find_or_alloc(struct device *dev)
615{
616 struct iommu_group *iommu_group;
617 struct vfio_group *group;
618
619 iommu_group = iommu_group_get(dev);
620#ifdef CONFIG_VFIO_NOIOMMU
a77109ff 621 if (!iommu_group && noiommu) {
3af91771
CH
622 /*
623 * With noiommu enabled, create an IOMMU group for devices that
a77109ff
RM
624 * don't already have one, implying no IOMMU hardware/driver
625 * exists. Taint the kernel because we're about to give a DMA
3af91771
CH
626 * capable device to a user without IOMMU protection.
627 */
c68ea0d0 628 group = vfio_noiommu_group_alloc(dev, VFIO_NO_IOMMU);
3af91771
CH
629 if (!IS_ERR(group)) {
630 add_taint(TAINT_USER, LOCKDEP_STILL_OK);
631 dev_warn(dev, "Adding kernel taint for vfio-noiommu group on device\n");
632 }
633 return group;
634 }
635#endif
1362591f
CH
636 if (!iommu_group)
637 return ERR_PTR(-EINVAL);
638
afe4e376
JG
639 /*
640 * VFIO always sets IOMMU_CACHE because we offer no way for userspace to
641 * restore cache coherency. It has to be checked here because it is only
642 * valid for cases where we are using iommu groups.
643 */
a9cf69d0 644 if (!device_iommu_capable(dev, IOMMU_CAP_CACHE_COHERENCY)) {
afe4e376
JG
645 iommu_group_put(iommu_group);
646 return ERR_PTR(-EINVAL);
647 }
648
1362591f 649 group = vfio_group_get_from_iommu(iommu_group);
325a31c9
JG
650 if (!group)
651 group = vfio_create_group(iommu_group, VFIO_IOMMU);
1362591f 652
325a31c9 653 /* The vfio_group holds a reference to the iommu_group */
1362591f
CH
654 iommu_group_put(iommu_group);
655 return group;
656}
657
c68ea0d0
CH
658static int __vfio_register_dev(struct vfio_device *device,
659 struct vfio_group *group)
0bfc6a4e
JG
660{
661 struct vfio_device *existing_device;
c68ea0d0
CH
662
663 if (IS_ERR(group))
664 return PTR_ERR(group);
cba3345c 665
2fd585f4
JG
666 /*
667 * If the driver doesn't specify a set then the device is added to a
668 * singleton set just for itself.
669 */
670 if (!device->dev_set)
671 vfio_assign_device_set(device, device);
672
0bfc6a4e
JG
673 existing_device = vfio_group_get_device(group, device->dev);
674 if (existing_device) {
675 dev_WARN(device->dev, "Device already exists on group %d\n",
1362591f 676 iommu_group_id(group->iommu_group));
4a725b8d 677 vfio_device_put_registration(existing_device);
c68ea0d0
CH
678 if (group->type == VFIO_NO_IOMMU ||
679 group->type == VFIO_EMULATED_IOMMU)
38a68934 680 iommu_group_remove_device(device->dev);
cba3345c 681 vfio_group_put(group);
cba3345c
AW
682 return -EBUSY;
683 }
684
0bfc6a4e
JG
685 /* Our reference on group is moved to the device */
686 device->group = group;
687
688 /* Refcounting can't start until the driver calls register */
689 refcount_set(&device->refcount, 1);
690
691 mutex_lock(&group->device_lock);
692 list_add(&device->group_next, &group->device_list);
0bfc6a4e
JG
693 mutex_unlock(&group->device_lock);
694
695 return 0;
696}
c68ea0d0
CH
697
698int vfio_register_group_dev(struct vfio_device *device)
699{
700 return __vfio_register_dev(device,
701 vfio_group_find_or_alloc(device->dev));
702}
0bfc6a4e
JG
703EXPORT_SYMBOL_GPL(vfio_register_group_dev);
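
/*
 * Illustrative sketch (not in the original file): the usual probe-time flow
 * is allocate, fill driver state, then register; the remove path mirrors it
 * with vfio_unregister_group_dev() and vfio_put_device(). Names other than
 * the vfio_* calls are hypothetical.
 *
 *	my = vfio_alloc_device(my_dev, vdev, &pdev->dev, &my_ops);
 *	if (IS_ERR(my))
 *		return PTR_ERR(my);
 *
 *	ret = vfio_register_group_dev(&my->vdev);
 *	if (ret) {
 *		vfio_put_device(&my->vdev);
 *		return ret;
 *	}
 *	dev_set_drvdata(&pdev->dev, my);
 */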
704
c68ea0d0
CH
705/*
706 * Register a virtual device without IOMMU backing. The user of this
707 * device must not be able to directly trigger unmediated DMA.
708 */
709int vfio_register_emulated_iommu_dev(struct vfio_device *device)
710{
711 return __vfio_register_dev(device,
712 vfio_noiommu_group_alloc(device->dev, VFIO_EMULATED_IOMMU));
713}
714EXPORT_SYMBOL_GPL(vfio_register_emulated_iommu_dev);
715
4bc94d5d
AW
716static struct vfio_device *vfio_device_get_from_name(struct vfio_group *group,
717 char *buf)
718{
5f3874c2 719 struct vfio_device *it, *device = ERR_PTR(-ENODEV);
4bc94d5d
AW
720
721 mutex_lock(&group->device_lock);
e324fc82 722 list_for_each_entry(it, &group->device_list, group_next) {
5f3874c2
AW
723 int ret;
724
725 if (it->ops->match) {
6df62c5b 726 ret = it->ops->match(it, buf);
5f3874c2
AW
727 if (ret < 0) {
728 device = ERR_PTR(ret);
729 break;
730 }
731 } else {
732 ret = !strcmp(dev_name(it->dev), buf);
733 }
734
4a725b8d 735 if (ret && vfio_device_try_get_registration(it)) {
e324fc82 736 device = it;
4bc94d5d
AW
737 break;
738 }
739 }
740 mutex_unlock(&group->device_lock);
741
742 return device;
743}
744
cba3345c
AW
745/*
746 * Decrement the device reference count and wait for the device to be
747 * removed. Open file descriptors for the device... */
0bfc6a4e 748void vfio_unregister_group_dev(struct vfio_device *device)
cba3345c 749{
cba3345c 750 struct vfio_group *group = device->group;
13060b64 751 unsigned int i = 0;
db7d4d7f 752 bool interrupted = false;
5e42c999 753 long rc;
cba3345c 754
4a725b8d 755 vfio_device_put_registration(device);
5e42c999
JG
756 rc = try_wait_for_completion(&device->comp);
757 while (rc <= 0) {
13060b64 758 if (device->ops->request)
6df62c5b 759 device->ops->request(device, i++);
13060b64 760
db7d4d7f 761 if (interrupted) {
5e42c999
JG
762 rc = wait_for_completion_timeout(&device->comp,
763 HZ * 10);
db7d4d7f 764 } else {
5e42c999
JG
765 rc = wait_for_completion_interruptible_timeout(
766 &device->comp, HZ * 10);
767 if (rc < 0) {
db7d4d7f 768 interrupted = true;
0bfc6a4e 769 dev_warn(device->dev,
db7d4d7f
AW
770 "Device is currently in use, task"
771 " \"%s\" (%d) "
772 "blocked until device is released",
773 current->comm, task_pid_nr(current));
774 }
775 }
5e42c999 776 }
e014e944 777
5e42c999
JG
778 mutex_lock(&group->device_lock);
779 list_del(&device->group_next);
5e42c999 780 mutex_unlock(&group->device_lock);
41be3e26 781
c68ea0d0 782 if (group->type == VFIO_NO_IOMMU || group->type == VFIO_EMULATED_IOMMU)
38a68934 783 iommu_group_remove_device(device->dev);
c04ac340 784
0bfc6a4e 785 /* Matches the get in vfio_register_group_dev() */
e014e944 786 vfio_group_put(group);
0bfc6a4e
JG
787}
788EXPORT_SYMBOL_GPL(vfio_unregister_group_dev);
789
3b9a2d57 790/*
cba3345c
AW
791 * VFIO base fd, /dev/vfio/vfio
792 */
793static long vfio_ioctl_check_extension(struct vfio_container *container,
794 unsigned long arg)
795{
0b43c082 796 struct vfio_iommu_driver *driver;
cba3345c
AW
797 long ret = 0;
798
0b43c082
AW
799 down_read(&container->group_lock);
800
801 driver = container->iommu_driver;
802
cba3345c
AW
803 switch (arg) {
804 /* No base extensions yet */
805 default:
806 /*
807 * If no driver is set, poll all registered drivers for
808 * extensions and return the first positive result. If
809 * a driver is already set, further queries will be passed
810 * only to that driver.
811 */
812 if (!driver) {
813 mutex_lock(&vfio.iommu_drivers_lock);
ae5515d6
AW
814 list_for_each_entry(driver, &vfio.iommu_drivers_list,
815 vfio_next) {
03a76b60 816
03a76b60 817 if (!list_empty(&container->group_list) &&
b0062160
CH
818 !vfio_iommu_driver_allowed(container,
819 driver))
03a76b60 820 continue;
cba3345c
AW
821 if (!try_module_get(driver->ops->owner))
822 continue;
823
824 ret = driver->ops->ioctl(NULL,
825 VFIO_CHECK_EXTENSION,
826 arg);
827 module_put(driver->ops->owner);
828 if (ret > 0)
829 break;
830 }
831 mutex_unlock(&vfio.iommu_drivers_lock);
832 } else
833 ret = driver->ops->ioctl(container->iommu_data,
834 VFIO_CHECK_EXTENSION, arg);
835 }
836
0b43c082
AW
837 up_read(&container->group_lock);
838
cba3345c
AW
839 return ret;
840}
841
9587f44a 842/* hold write lock on container->group_lock */
cba3345c
AW
843static int __vfio_container_attach_groups(struct vfio_container *container,
844 struct vfio_iommu_driver *driver,
845 void *data)
846{
847 struct vfio_group *group;
848 int ret = -ENODEV;
849
850 list_for_each_entry(group, &container->group_list, container_next) {
c3c0fa9d
CH
851 ret = driver->ops->attach_group(data, group->iommu_group,
852 group->type);
cba3345c
AW
853 if (ret)
854 goto unwind;
855 }
856
857 return ret;
858
859unwind:
860 list_for_each_entry_continue_reverse(group, &container->group_list,
861 container_next) {
862 driver->ops->detach_group(data, group->iommu_group);
863 }
864
865 return ret;
866}
867
868static long vfio_ioctl_set_iommu(struct vfio_container *container,
869 unsigned long arg)
870{
871 struct vfio_iommu_driver *driver;
872 long ret = -ENODEV;
873
9587f44a 874 down_write(&container->group_lock);
cba3345c
AW
875
876 /*
877 * The container is designed to be an unprivileged interface while
878 * the group can be assigned to specific users. Therefore, only by
879 * adding a group to a container does the user get the privilege of
880 * enabling the iommu, which may allocate finite resources. There
881 * is no unset_iommu, but by removing all the groups from a container,
882 * the container is deprivileged and returns to an unset state.
883 */
884 if (list_empty(&container->group_list) || container->iommu_driver) {
9587f44a 885 up_write(&container->group_lock);
cba3345c
AW
886 return -EINVAL;
887 }
888
889 mutex_lock(&vfio.iommu_drivers_lock);
ae5515d6 890 list_for_each_entry(driver, &vfio.iommu_drivers_list, vfio_next) {
cba3345c
AW
891 void *data;
892
b0062160 893 if (!vfio_iommu_driver_allowed(container, driver))
03a76b60 894 continue;
cba3345c
AW
895 if (!try_module_get(driver->ops->owner))
896 continue;
897
898 /*
899 * The arg magic for SET_IOMMU is the same as CHECK_EXTENSION,
900 * so test which iommu driver reported support for this
901 * extension and call open on them. We also pass them the
902 * magic, allowing a single driver to support multiple
903 * interfaces if they'd like.
904 */
905 if (driver->ops->ioctl(NULL, VFIO_CHECK_EXTENSION, arg) <= 0) {
906 module_put(driver->ops->owner);
907 continue;
908 }
909
cba3345c
AW
910 data = driver->ops->open(arg);
911 if (IS_ERR(data)) {
912 ret = PTR_ERR(data);
913 module_put(driver->ops->owner);
7c435b46 914 continue;
cba3345c
AW
915 }
916
917 ret = __vfio_container_attach_groups(container, driver, data);
7c435b46 918 if (ret) {
cba3345c
AW
919 driver->ops->release(data);
920 module_put(driver->ops->owner);
7c435b46 921 continue;
cba3345c
AW
922 }
923
7c435b46
AW
924 container->iommu_driver = driver;
925 container->iommu_data = data;
926 break;
cba3345c
AW
927 }
928
929 mutex_unlock(&vfio.iommu_drivers_lock);
9587f44a 930 up_write(&container->group_lock);
cba3345c
AW
931
932 return ret;
933}
934
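
/*
 * Illustrative userspace flow (not part of this file) for the container
 * ioctls handled above, following Documentation/driver-api/vfio.rst; the
 * group number 26 is just an example:
 *
 *	int container = open("/dev/vfio/vfio", O_RDWR);
 *	int group = open("/dev/vfio/26", O_RDWR);
 *
 *	if (ioctl(container, VFIO_GET_API_VERSION) != VFIO_API_VERSION)
 *		return -1;	// unknown API version
 *	if (!ioctl(container, VFIO_CHECK_EXTENSION, VFIO_TYPE1_IOMMU))
 *		return -1;	// type1 backend not available
 *
 *	ioctl(group, VFIO_GROUP_SET_CONTAINER, &container);
 *	ioctl(container, VFIO_SET_IOMMU, VFIO_TYPE1_IOMMU);
 */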
935static long vfio_fops_unl_ioctl(struct file *filep,
936 unsigned int cmd, unsigned long arg)
937{
938 struct vfio_container *container = filep->private_data;
939 struct vfio_iommu_driver *driver;
940 void *data;
941 long ret = -EINVAL;
942
943 if (!container)
944 return ret;
945
cba3345c
AW
946 switch (cmd) {
947 case VFIO_GET_API_VERSION:
948 ret = VFIO_API_VERSION;
949 break;
950 case VFIO_CHECK_EXTENSION:
951 ret = vfio_ioctl_check_extension(container, arg);
952 break;
953 case VFIO_SET_IOMMU:
954 ret = vfio_ioctl_set_iommu(container, arg);
955 break;
956 default:
0b43c082
AW
957 driver = container->iommu_driver;
958 data = container->iommu_data;
959
cba3345c
AW
960 if (driver) /* passthrough all unrecognized ioctls */
961 ret = driver->ops->ioctl(data, cmd, arg);
962 }
963
964 return ret;
965}
966
cba3345c
AW
967static int vfio_fops_open(struct inode *inode, struct file *filep)
968{
969 struct vfio_container *container;
970
971 container = kzalloc(sizeof(*container), GFP_KERNEL);
972 if (!container)
973 return -ENOMEM;
974
975 INIT_LIST_HEAD(&container->group_list);
9587f44a 976 init_rwsem(&container->group_lock);
cba3345c
AW
977 kref_init(&container->kref);
978
979 filep->private_data = container;
980
981 return 0;
982}
983
984static int vfio_fops_release(struct inode *inode, struct file *filep)
985{
986 struct vfio_container *container = filep->private_data;
ec5e3294
SS
987 struct vfio_iommu_driver *driver = container->iommu_driver;
988
989 if (driver && driver->ops->notify)
990 driver->ops->notify(container->iommu_data,
991 VFIO_IOMMU_CONTAINER_CLOSE);
cba3345c
AW
992
993 filep->private_data = NULL;
994
995 vfio_container_put(container);
996
997 return 0;
998}
999
cba3345c
AW
1000static const struct file_operations vfio_fops = {
1001 .owner = THIS_MODULE,
1002 .open = vfio_fops_open,
1003 .release = vfio_fops_release,
cba3345c 1004 .unlocked_ioctl = vfio_fops_unl_ioctl,
407e9ef7 1005 .compat_ioctl = compat_ptr_ioctl,
cba3345c
AW
1006};
1007
3b9a2d57 1008/*
cba3345c
AW
1009 * VFIO Group fd, /dev/vfio/$GROUP
1010 */
1011static void __vfio_group_unset_container(struct vfio_group *group)
1012{
1013 struct vfio_container *container = group->container;
1014 struct vfio_iommu_driver *driver;
1015
e0e29bdb
JG
1016 lockdep_assert_held_write(&group->group_rwsem);
1017
9587f44a 1018 down_write(&container->group_lock);
cba3345c
AW
1019
1020 driver = container->iommu_driver;
1021 if (driver)
1022 driver->ops->detach_group(container->iommu_data,
1023 group->iommu_group);
1024
a3da1ab6
JG
1025 if (group->type == VFIO_IOMMU)
1026 iommu_group_release_dma_owner(group->iommu_group);
70693f47 1027
cba3345c 1028 group->container = NULL;
3ca54708 1029 group->container_users = 0;
cba3345c
AW
1030 list_del(&group->container_next);
1031
1032 /* Detaching the last group deprivileges a container, remove iommu */
1033 if (driver && list_empty(&container->group_list)) {
1034 driver->ops->release(container->iommu_data);
1035 module_put(driver->ops->owner);
1036 container->iommu_driver = NULL;
1037 container->iommu_data = NULL;
1038 }
1039
9587f44a 1040 up_write(&container->group_lock);
cba3345c
AW
1041
1042 vfio_container_put(container);
1043}
1044
1045/*
1046 * VFIO_GROUP_UNSET_CONTAINER should fail if there are other users or
1047 * if there was no container to unset. Since the ioctl is called on
1048 * the group, we know that still exists, therefore the only valid
1049 * transition here is 1->0.
1050 */
b3b43590 1051static int vfio_group_ioctl_unset_container(struct vfio_group *group)
cba3345c 1052{
b3b43590 1053 int ret = 0;
cba3345c 1054
b3b43590
JG
1055 down_write(&group->group_rwsem);
1056 if (!group->container) {
1057 ret = -EINVAL;
1058 goto out_unlock;
1059 }
1060 if (group->container_users != 1) {
1061 ret = -EBUSY;
1062 goto out_unlock;
1063 }
cba3345c 1064 __vfio_group_unset_container(group);
b3b43590
JG
1065
1066out_unlock:
1067 up_write(&group->group_rwsem);
1068 return ret;
cba3345c
AW
1069}
1070
67671f15
JG
1071static int vfio_group_ioctl_set_container(struct vfio_group *group,
1072 int __user *arg)
cba3345c 1073{
2903ff01 1074 struct fd f;
cba3345c
AW
1075 struct vfio_container *container;
1076 struct vfio_iommu_driver *driver;
67671f15 1077 int container_fd;
2903ff01 1078 int ret = 0;
cba3345c 1079
c68ea0d0 1080 if (group->type == VFIO_NO_IOMMU && !capable(CAP_SYS_RAWIO))
03a76b60
AW
1081 return -EPERM;
1082
67671f15
JG
1083 if (get_user(container_fd, arg))
1084 return -EFAULT;
1085 if (container_fd < 0)
1086 return -EINVAL;
2903ff01
AV
1087 f = fdget(container_fd);
1088 if (!f.file)
cba3345c
AW
1089 return -EBADF;
1090
1091 /* Sanity check, is this really our fd? */
2903ff01 1092 if (f.file->f_op != &vfio_fops) {
67671f15
JG
1093 ret = -EINVAL;
1094 goto out_fdput;
cba3345c 1095 }
2903ff01 1096 container = f.file->private_data;
cba3345c
AW
1097 WARN_ON(!container); /* fget ensures we don't race vfio_release */
1098
67671f15
JG
1099 down_write(&group->group_rwsem);
1100
1101 if (group->container || WARN_ON(group->container_users)) {
1102 ret = -EINVAL;
1103 goto out_unlock_group;
1104 }
1105
9587f44a 1106 down_write(&container->group_lock);
cba3345c 1107
03a76b60
AW
1108 /* Real groups and fake groups cannot mix */
1109 if (!list_empty(&container->group_list) &&
c68ea0d0 1110 container->noiommu != (group->type == VFIO_NO_IOMMU)) {
03a76b60 1111 ret = -EPERM;
67671f15 1112 goto out_unlock_container;
03a76b60
AW
1113 }
1114
a3da1ab6
JG
1115 if (group->type == VFIO_IOMMU) {
1116 ret = iommu_group_claim_dma_owner(group->iommu_group, f.file);
1117 if (ret)
67671f15 1118 goto out_unlock_container;
a3da1ab6 1119 }
70693f47 1120
cba3345c
AW
1121 driver = container->iommu_driver;
1122 if (driver) {
1123 ret = driver->ops->attach_group(container->iommu_data,
c3c0fa9d
CH
1124 group->iommu_group,
1125 group->type);
70693f47 1126 if (ret) {
a3da1ab6
JG
1127 if (group->type == VFIO_IOMMU)
1128 iommu_group_release_dma_owner(
1129 group->iommu_group);
67671f15 1130 goto out_unlock_container;
70693f47 1131 }
cba3345c
AW
1132 }
1133
1134 group->container = container;
3ca54708 1135 group->container_users = 1;
c68ea0d0 1136 container->noiommu = (group->type == VFIO_NO_IOMMU);
cba3345c
AW
1137 list_add(&group->container_next, &container->group_list);
1138
1139 /* Get a reference on the container and mark a user within the group */
1140 vfio_container_get(container);
cba3345c 1141
67671f15 1142out_unlock_container:
9587f44a 1143 up_write(&container->group_lock);
67671f15
JG
1144out_unlock_group:
1145 up_write(&group->group_rwsem);
1146out_fdput:
2903ff01 1147 fdput(f);
cba3345c
AW
1148 return ret;
1149}
1150
cba3345c
AW
1151static const struct file_operations vfio_device_fops;
1152
eadd86f8
JG
1153/* true if the vfio_device has open_device() called but not close_device() */
1154static bool vfio_assert_device_open(struct vfio_device *device)
32f55d83 1155{
eadd86f8
JG
1156 return !WARN_ON_ONCE(!READ_ONCE(device->open_count));
1157}
1158
805bb6c1 1159static int vfio_device_assign_container(struct vfio_device *device)
cba3345c 1160{
805bb6c1 1161 struct vfio_group *group = device->group;
cba3345c 1162
e0e29bdb
JG
1163 lockdep_assert_held_write(&group->group_rwsem);
1164
3ca54708
JG
1165 if (!group->container || !group->container->iommu_driver ||
1166 WARN_ON(!group->container_users))
32f55d83
KW
1167 return -EINVAL;
1168
c68ea0d0 1169 if (group->type == VFIO_NO_IOMMU && !capable(CAP_SYS_RAWIO))
32f55d83 1170 return -EPERM;
32f55d83 1171
b76c0eed 1172 get_file(group->opened_file);
3ca54708 1173 group->container_users++;
32f55d83
KW
1174 return 0;
1175}
1176
b76c0eed
JG
1177static void vfio_device_unassign_container(struct vfio_device *device)
1178{
1179 down_write(&device->group->group_rwsem);
3ca54708
JG
1180 WARN_ON(device->group->container_users <= 1);
1181 device->group->container_users--;
b76c0eed
JG
1182 fput(device->group->opened_file);
1183 up_write(&device->group->group_rwsem);
1184}
cba3345c 1185
805bb6c1 1186static struct file *vfio_device_open(struct vfio_device *device)
cba3345c 1187{
ce4b4657 1188 struct vfio_iommu_driver *iommu_driver;
cba3345c 1189 struct file *filep;
805bb6c1 1190 int ret;
03a76b60 1191
e0e29bdb 1192 down_write(&device->group->group_rwsem);
805bb6c1 1193 ret = vfio_device_assign_container(device);
e0e29bdb 1194 up_write(&device->group->group_rwsem);
805bb6c1
JG
1195 if (ret)
1196 return ERR_PTR(ret);
cba3345c 1197
9dcf01d9 1198 if (!try_module_get(device->dev->driver->owner)) {
2fd585f4 1199 ret = -ENODEV;
805bb6c1 1200 goto err_unassign_container;
9dcf01d9
MG
1201 }
1202
2fd585f4
JG
1203 mutex_lock(&device->dev_set->lock);
1204 device->open_count++;
421cfe65
MR
1205 if (device->open_count == 1) {
1206 /*
1207 * Here we pass the KVM pointer with the group under the read
1208 * lock. If the device driver will use it, it must obtain a
1209 * reference and release it during close_device.
1210 */
1211 down_read(&device->group->group_rwsem);
1212 device->kvm = device->group->kvm;
1213
1214 if (device->ops->open_device) {
1215 ret = device->ops->open_device(device);
1216 if (ret)
1217 goto err_undo_count;
1218 }
ce4b4657
JG
1219
1220 iommu_driver = device->group->container->iommu_driver;
8cfc5b60
JG
1221 if (iommu_driver && iommu_driver->ops->register_device)
1222 iommu_driver->ops->register_device(
1223 device->group->container->iommu_data, device);
ce4b4657 1224
421cfe65 1225 up_read(&device->group->group_rwsem);
2fd585f4
JG
1226 }
1227 mutex_unlock(&device->dev_set->lock);
1228
4bc94d5d
AW
1229 /*
1230 * We can't use anon_inode_getfd() because we need to modify
1231 * the f_mode flags directly to allow more than just ioctls
1232 */
4bc94d5d
AW
1233 filep = anon_inode_getfile("[vfio-device]", &vfio_device_fops,
1234 device, O_RDWR);
1235 if (IS_ERR(filep)) {
4bc94d5d 1236 ret = PTR_ERR(filep);
805bb6c1 1237 goto err_close_device;
4bc94d5d
AW
1238 }
1239
1240 /*
1241 * TODO: add an anon_inode interface to do this.
1242 * Appears to be missing by lack of need rather than
1243 * explicitly prevented. Now there's need.
1244 */
54ef7a47 1245 filep->f_mode |= (FMODE_PREAD | FMODE_PWRITE);
cba3345c 1246
805bb6c1 1247 if (device->group->type == VFIO_NO_IOMMU)
03a76b60
AW
1248 dev_warn(device->dev, "vfio-noiommu device opened by user "
1249 "(%s:%d)\n", current->comm, task_pid_nr(current));
805bb6c1
JG
1250 /*
1251 * On success the ref of device is moved to the file and
1252 * put in vfio_device_fops_release()
1253 */
1254 return filep;
03a76b60 1255
2fd585f4
JG
1256err_close_device:
1257 mutex_lock(&device->dev_set->lock);
421cfe65 1258 down_read(&device->group->group_rwsem);
ce4b4657 1259 if (device->open_count == 1 && device->ops->close_device) {
2fd585f4 1260 device->ops->close_device(device);
ce4b4657
JG
1261
1262 iommu_driver = device->group->container->iommu_driver;
8cfc5b60
JG
1263 if (iommu_driver && iommu_driver->ops->unregister_device)
1264 iommu_driver->ops->unregister_device(
1265 device->group->container->iommu_data, device);
ce4b4657 1266 }
2fd585f4 1267err_undo_count:
330c1799 1268 up_read(&device->group->group_rwsem);
2fd585f4 1269 device->open_count--;
421cfe65
MR
1270 if (device->open_count == 0 && device->kvm)
1271 device->kvm = NULL;
2fd585f4
JG
1272 mutex_unlock(&device->dev_set->lock);
1273 module_put(device->dev->driver->owner);
805bb6c1 1274err_unassign_container:
b76c0eed 1275 vfio_device_unassign_container(device);
805bb6c1
JG
1276 return ERR_PTR(ret);
1277}
1278
150ee2f9
JG
1279static int vfio_group_ioctl_get_device_fd(struct vfio_group *group,
1280 char __user *arg)
805bb6c1
JG
1281{
1282 struct vfio_device *device;
1283 struct file *filep;
150ee2f9 1284 char *buf;
805bb6c1
JG
1285 int fdno;
1286 int ret;
1287
150ee2f9
JG
1288 buf = strndup_user(arg, PAGE_SIZE);
1289 if (IS_ERR(buf))
1290 return PTR_ERR(buf);
1291
805bb6c1 1292 device = vfio_device_get_from_name(group, buf);
150ee2f9 1293 kfree(buf);
805bb6c1
JG
1294 if (IS_ERR(device))
1295 return PTR_ERR(device);
1296
1297 fdno = get_unused_fd_flags(O_CLOEXEC);
1298 if (fdno < 0) {
1299 ret = fdno;
1300 goto err_put_device;
1301 }
1302
1303 filep = vfio_device_open(device);
1304 if (IS_ERR(filep)) {
1305 ret = PTR_ERR(filep);
1306 goto err_put_fdno;
1307 }
1308
1309 fd_install(fdno, filep);
1310 return fdno;
1311
1312err_put_fdno:
1313 put_unused_fd(fdno);
1314err_put_device:
4a725b8d 1315 vfio_device_put_registration(device);
cba3345c
AW
1316 return ret;
1317}
1318
99a27c08
JG
1319static int vfio_group_ioctl_get_status(struct vfio_group *group,
1320 struct vfio_group_status __user *arg)
1321{
1322 unsigned long minsz = offsetofend(struct vfio_group_status, flags);
1323 struct vfio_group_status status;
1324
1325 if (copy_from_user(&status, arg, minsz))
1326 return -EFAULT;
1327
1328 if (status.argsz < minsz)
1329 return -EINVAL;
1330
1331 status.flags = 0;
1332
1333 down_read(&group->group_rwsem);
1334 if (group->container)
1335 status.flags |= VFIO_GROUP_FLAGS_CONTAINER_SET |
1336 VFIO_GROUP_FLAGS_VIABLE;
1337 else if (!iommu_group_dma_owner_claimed(group->iommu_group))
1338 status.flags |= VFIO_GROUP_FLAGS_VIABLE;
1339 up_read(&group->group_rwsem);
1340
1341 if (copy_to_user(arg, &status, minsz))
1342 return -EFAULT;
1343 return 0;
1344}
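
/*
 * Illustrative userspace flow (not part of this file) for the group ioctls
 * dispatched below: check viability, then fetch a device fd by name. The
 * device name is an example PCI address.
 *
 *	struct vfio_group_status status = { .argsz = sizeof(status) };
 *
 *	ioctl(group, VFIO_GROUP_GET_STATUS, &status);
 *	if (!(status.flags & VFIO_GROUP_FLAGS_VIABLE))
 *		return -1;	// some devices in the group are not bound to vfio
 *
 *	int device = ioctl(group, VFIO_GROUP_GET_DEVICE_FD, "0000:06:0d.0");
 */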
1345
cba3345c
AW
1346static long vfio_group_fops_unl_ioctl(struct file *filep,
1347 unsigned int cmd, unsigned long arg)
1348{
1349 struct vfio_group *group = filep->private_data;
150ee2f9 1350 void __user *uarg = (void __user *)arg;
cba3345c
AW
1351
1352 switch (cmd) {
150ee2f9
JG
1353 case VFIO_GROUP_GET_DEVICE_FD:
1354 return vfio_group_ioctl_get_device_fd(group, uarg);
cba3345c 1355 case VFIO_GROUP_GET_STATUS:
99a27c08 1356 return vfio_group_ioctl_get_status(group, uarg);
cba3345c 1357 case VFIO_GROUP_SET_CONTAINER:
67671f15 1358 return vfio_group_ioctl_set_container(group, uarg);
cba3345c 1359 case VFIO_GROUP_UNSET_CONTAINER:
b3b43590 1360 return vfio_group_ioctl_unset_container(group);
99a27c08
JG
1361 default:
1362 return -ENOTTY;
cba3345c 1363 }
cba3345c
AW
1364}
1365
cba3345c
AW
1366static int vfio_group_fops_open(struct inode *inode, struct file *filep)
1367{
9cef7391
JG
1368 struct vfio_group *group =
1369 container_of(inode->i_cdev, struct vfio_group, cdev);
c6f4860e 1370 int ret;
cba3345c 1371
c6f4860e 1372 down_write(&group->group_rwsem);
cba3345c 1373
c6f4860e
JG
1374 /* users can be zero if this races with vfio_group_put() */
1375 if (!refcount_inc_not_zero(&group->users)) {
1376 ret = -ENODEV;
1377 goto err_unlock;
03a76b60
AW
1378 }
1379
c6f4860e
JG
1380 if (group->type == VFIO_NO_IOMMU && !capable(CAP_SYS_RAWIO)) {
1381 ret = -EPERM;
1382 goto err_put;
6d6768c6
AW
1383 }
1384
c6f4860e
JG
1385 /*
1386 * Do we need multiple instances of the group open? Seems not.
c6f4860e 1387 */
b76c0eed 1388 if (group->opened_file) {
c6f4860e
JG
1389 ret = -EBUSY;
1390 goto err_put;
cba3345c 1391 }
b76c0eed 1392 group->opened_file = filep;
cba3345c
AW
1393 filep->private_data = group;
1394
c6f4860e 1395 up_write(&group->group_rwsem);
cba3345c 1396 return 0;
c6f4860e
JG
1397err_put:
1398 vfio_group_put(group);
1399err_unlock:
1400 up_write(&group->group_rwsem);
1401 return ret;
cba3345c
AW
1402}
1403
1404static int vfio_group_fops_release(struct inode *inode, struct file *filep)
1405{
1406 struct vfio_group *group = filep->private_data;
1407
1408 filep->private_data = NULL;
1409
c6f4860e 1410 down_write(&group->group_rwsem);
b76c0eed
JG
1411 /*
1412 * Device FDs hold a group file reference, therefore the group release
1413 * is only called when there are no open devices.
1414 */
1415 WARN_ON(group->notifier.head);
1416 if (group->container) {
3ca54708 1417 WARN_ON(group->container_users != 1);
b76c0eed
JG
1418 __vfio_group_unset_container(group);
1419 }
1420 group->opened_file = NULL;
c6f4860e 1421 up_write(&group->group_rwsem);
6d6768c6 1422
cba3345c
AW
1423 vfio_group_put(group);
1424
1425 return 0;
1426}
1427
1428static const struct file_operations vfio_group_fops = {
1429 .owner = THIS_MODULE,
1430 .unlocked_ioctl = vfio_group_fops_unl_ioctl,
407e9ef7 1431 .compat_ioctl = compat_ptr_ioctl,
cba3345c
AW
1432 .open = vfio_group_fops_open,
1433 .release = vfio_group_fops_release,
1434};
1435
8e5c6995
AS
1436/*
1437 * Wrapper around pm_runtime_resume_and_get().
1438 * Return error code on failure or 0 on success.
1439 */
1440static inline int vfio_device_pm_runtime_get(struct vfio_device *device)
1441{
1442 struct device *dev = device->dev;
1443
1444 if (dev->driver && dev->driver->pm) {
1445 int ret;
1446
1447 ret = pm_runtime_resume_and_get(dev);
1448 if (ret) {
1449 dev_info_ratelimited(dev,
1450 "vfio: runtime resume failed %d\n", ret);
1451 return -EIO;
1452 }
1453 }
1454
1455 return 0;
1456}
1457
1458/*
1459 * Wrapper around pm_runtime_put().
1460 */
1461static inline void vfio_device_pm_runtime_put(struct vfio_device *device)
1462{
1463 struct device *dev = device->dev;
1464
1465 if (dev->driver && dev->driver->pm)
1466 pm_runtime_put(dev);
1467}
1468
3b9a2d57 1469/*
cba3345c
AW
1470 * VFIO Device fd
1471 */
1472static int vfio_device_fops_release(struct inode *inode, struct file *filep)
1473{
1474 struct vfio_device *device = filep->private_data;
ce4b4657 1475 struct vfio_iommu_driver *iommu_driver;
cba3345c 1476
2fd585f4 1477 mutex_lock(&device->dev_set->lock);
eadd86f8 1478 vfio_assert_device_open(device);
421cfe65 1479 down_read(&device->group->group_rwsem);
eadd86f8 1480 if (device->open_count == 1 && device->ops->close_device)
2fd585f4 1481 device->ops->close_device(device);
ce4b4657
JG
1482
1483 iommu_driver = device->group->container->iommu_driver;
8cfc5b60
JG
1484 if (iommu_driver && iommu_driver->ops->unregister_device)
1485 iommu_driver->ops->unregister_device(
1486 device->group->container->iommu_data, device);
421cfe65 1487 up_read(&device->group->group_rwsem);
eadd86f8 1488 device->open_count--;
421cfe65
MR
1489 if (device->open_count == 0)
1490 device->kvm = NULL;
2fd585f4 1491 mutex_unlock(&device->dev_set->lock);
cba3345c 1492
9dcf01d9
MG
1493 module_put(device->dev->driver->owner);
1494
b76c0eed 1495 vfio_device_unassign_container(device);
cba3345c 1496
4a725b8d 1497 vfio_device_put_registration(device);
cba3345c
AW
1498
1499 return 0;
1500}
1501
115dcec6
JG
1502/*
1503 * vfio_mig_get_next_state - Compute the next step in the FSM
1504 * @cur_fsm - The current state the device is in
1505 * @new_fsm - The target state to reach
1506 * @next_fsm - Pointer to the next step to get to new_fsm
1507 *
1508 * Return 0 upon success, otherwise -errno
1509 * Upon success the next step in the state progression between cur_fsm and
1510 * new_fsm will be set in next_fsm.
1511 *
1512 * This breaks down requests for combination transitions into smaller steps and
1513 * returns the next step to get to new_fsm. The function may need to be called
1514 * multiple times before reaching new_fsm.
1515 *
1516 */
1517int vfio_mig_get_next_state(struct vfio_device *device,
1518 enum vfio_device_mig_state cur_fsm,
1519 enum vfio_device_mig_state new_fsm,
1520 enum vfio_device_mig_state *next_fsm)
1521{
8cb3d83b 1522 enum { VFIO_DEVICE_NUM_STATES = VFIO_DEVICE_STATE_RUNNING_P2P + 1 };
115dcec6 1523 /*
8cb3d83b
JG
1524 * The coding in this table requires the driver to implement the
1525 * following FSM arcs:
115dcec6 1526 * RESUMING -> STOP
115dcec6 1527 * STOP -> RESUMING
115dcec6
JG
1528 * STOP -> STOP_COPY
1529 * STOP_COPY -> STOP
1530 *
8cb3d83b
JG
1531 * If P2P is supported then the driver must also implement these FSM
1532 * arcs:
1533 * RUNNING -> RUNNING_P2P
1534 * RUNNING_P2P -> RUNNING
1535 * RUNNING_P2P -> STOP
1536 * STOP -> RUNNING_P2P
1537 * Without P2P the driver must implement:
1538 * RUNNING -> STOP
1539 * STOP -> RUNNING
1540 *
1541 * The coding will step through multiple states for some combination
1542 * transitions; if all optional features are supported, this means the
1543 * following ones:
1544 * RESUMING -> STOP -> RUNNING_P2P
1545 * RESUMING -> STOP -> RUNNING_P2P -> RUNNING
115dcec6 1546 * RESUMING -> STOP -> STOP_COPY
8cb3d83b
JG
1547 * RUNNING -> RUNNING_P2P -> STOP
1548 * RUNNING -> RUNNING_P2P -> STOP -> RESUMING
1549 * RUNNING -> RUNNING_P2P -> STOP -> STOP_COPY
1550 * RUNNING_P2P -> STOP -> RESUMING
1551 * RUNNING_P2P -> STOP -> STOP_COPY
1552 * STOP -> RUNNING_P2P -> RUNNING
115dcec6 1553 * STOP_COPY -> STOP -> RESUMING
8cb3d83b
JG
1554 * STOP_COPY -> STOP -> RUNNING_P2P
1555 * STOP_COPY -> STOP -> RUNNING_P2P -> RUNNING
115dcec6
JG
1556 */
1557 static const u8 vfio_from_fsm_table[VFIO_DEVICE_NUM_STATES][VFIO_DEVICE_NUM_STATES] = {
1558 [VFIO_DEVICE_STATE_STOP] = {
1559 [VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_STOP,
8cb3d83b 1560 [VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_RUNNING_P2P,
115dcec6
JG
1561 [VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_STOP_COPY,
1562 [VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_RESUMING,
8cb3d83b 1563 [VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P,
115dcec6
JG
1564 [VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
1565 },
1566 [VFIO_DEVICE_STATE_RUNNING] = {
8cb3d83b 1567 [VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_RUNNING_P2P,
115dcec6 1568 [VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_RUNNING,
8cb3d83b
JG
1569 [VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_RUNNING_P2P,
1570 [VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_RUNNING_P2P,
1571 [VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P,
115dcec6
JG
1572 [VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
1573 },
1574 [VFIO_DEVICE_STATE_STOP_COPY] = {
1575 [VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_STOP,
1576 [VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_STOP,
1577 [VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_STOP_COPY,
1578 [VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_STOP,
8cb3d83b 1579 [VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_STOP,
115dcec6
JG
1580 [VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
1581 },
1582 [VFIO_DEVICE_STATE_RESUMING] = {
1583 [VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_STOP,
1584 [VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_STOP,
1585 [VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_STOP,
1586 [VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_RESUMING,
8cb3d83b
JG
1587 [VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_STOP,
1588 [VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
1589 },
1590 [VFIO_DEVICE_STATE_RUNNING_P2P] = {
1591 [VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_STOP,
1592 [VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_RUNNING,
1593 [VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_STOP,
1594 [VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_STOP,
1595 [VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P,
115dcec6
JG
1596 [VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
1597 },
1598 [VFIO_DEVICE_STATE_ERROR] = {
1599 [VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_ERROR,
1600 [VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_ERROR,
1601 [VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_ERROR,
1602 [VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_ERROR,
8cb3d83b 1603 [VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_ERROR,
115dcec6
JG
1604 [VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
1605 },
1606 };
1607
8cb3d83b
JG
1608 static const unsigned int state_flags_table[VFIO_DEVICE_NUM_STATES] = {
1609 [VFIO_DEVICE_STATE_STOP] = VFIO_MIGRATION_STOP_COPY,
1610 [VFIO_DEVICE_STATE_RUNNING] = VFIO_MIGRATION_STOP_COPY,
1611 [VFIO_DEVICE_STATE_STOP_COPY] = VFIO_MIGRATION_STOP_COPY,
1612 [VFIO_DEVICE_STATE_RESUMING] = VFIO_MIGRATION_STOP_COPY,
1613 [VFIO_DEVICE_STATE_RUNNING_P2P] =
1614 VFIO_MIGRATION_STOP_COPY | VFIO_MIGRATION_P2P,
1615 [VFIO_DEVICE_STATE_ERROR] = ~0U,
1616 };
1617
1618 if (WARN_ON(cur_fsm >= ARRAY_SIZE(vfio_from_fsm_table) ||
1619 (state_flags_table[cur_fsm] & device->migration_flags) !=
1620 state_flags_table[cur_fsm]))
115dcec6
JG
1621 return -EINVAL;
1622
8cb3d83b
JG
1623 if (new_fsm >= ARRAY_SIZE(vfio_from_fsm_table) ||
1624 (state_flags_table[new_fsm] & device->migration_flags) !=
1625 state_flags_table[new_fsm])
115dcec6
JG
1626 return -EINVAL;
1627
8cb3d83b
JG
1628 /*
1629 * Arcs touching optional and unsupported states are skipped over. The
1630 * driver will instead see an arc from the original state to the next
1631 * logical state, as per the above comment.
1632 */
115dcec6 1633 *next_fsm = vfio_from_fsm_table[cur_fsm][new_fsm];
8cb3d83b
JG
1634 while ((state_flags_table[*next_fsm] & device->migration_flags) !=
1635 state_flags_table[*next_fsm])
1636 *next_fsm = vfio_from_fsm_table[*next_fsm][new_fsm];
1637
115dcec6
JG
1638 return (*next_fsm != VFIO_DEVICE_STATE_ERROR) ? 0 : -EINVAL;
1639}
1640EXPORT_SYMBOL_GPL(vfio_mig_get_next_state);
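
/*
 * Illustrative sketch (not in the original file): a migration driver's
 * set_state handler typically loops with vfio_mig_get_next_state() so that a
 * combination transition is applied one supported arc at a time. The "my_*"
 * helpers are hypothetical.
 *
 *	while (my->mig_state != new_state) {
 *		enum vfio_device_mig_state next;
 *
 *		ret = vfio_mig_get_next_state(vdev, my->mig_state,
 *					      new_state, &next);
 *		if (ret)
 *			break;
 *		ret = my_apply_one_arc(my, next);	// driver-specific step
 *		if (ret)
 *			break;
 *		my->mig_state = next;
 *	}
 */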
1641
1642/*
1643 * Convert the drivers's struct file into a FD number and return it to userspace
1644 */
1645static int vfio_ioct_mig_return_fd(struct file *filp, void __user *arg,
1646 struct vfio_device_feature_mig_state *mig)
1647{
1648 int ret;
1649 int fd;
1650
1651 fd = get_unused_fd_flags(O_CLOEXEC);
1652 if (fd < 0) {
1653 ret = fd;
1654 goto out_fput;
1655 }
1656
1657 mig->data_fd = fd;
1658 if (copy_to_user(arg, mig, sizeof(*mig))) {
1659 ret = -EFAULT;
1660 goto out_put_unused;
1661 }
1662 fd_install(fd, filp);
1663 return 0;
1664
1665out_put_unused:
1666 put_unused_fd(fd);
1667out_fput:
1668 fput(filp);
1669 return ret;
1670}
1671
1672static int
1673vfio_ioctl_device_feature_mig_device_state(struct vfio_device *device,
1674 u32 flags, void __user *arg,
1675 size_t argsz)
1676{
1677 size_t minsz =
1678 offsetofend(struct vfio_device_feature_mig_state, data_fd);
1679 struct vfio_device_feature_mig_state mig;
1680 struct file *filp = NULL;
1681 int ret;
1682
6e97eba8 1683 if (!device->mig_ops)
115dcec6
JG
1684 return -ENOTTY;
1685
1686 ret = vfio_check_feature(flags, argsz,
1687 VFIO_DEVICE_FEATURE_SET |
1688 VFIO_DEVICE_FEATURE_GET,
1689 sizeof(mig));
1690 if (ret != 1)
1691 return ret;
1692
1693 if (copy_from_user(&mig, arg, minsz))
1694 return -EFAULT;
1695
1696 if (flags & VFIO_DEVICE_FEATURE_GET) {
1697 enum vfio_device_mig_state curr_state;
1698
6e97eba8
YH
1699 ret = device->mig_ops->migration_get_state(device,
1700 &curr_state);
115dcec6
JG
1701 if (ret)
1702 return ret;
1703 mig.device_state = curr_state;
1704 goto out_copy;
1705 }
1706
1707 /* Handle the VFIO_DEVICE_FEATURE_SET */
6e97eba8 1708 filp = device->mig_ops->migration_set_state(device, mig.device_state);
115dcec6
JG
1709 if (IS_ERR(filp) || !filp)
1710 goto out_copy;
1711
1712 return vfio_ioct_mig_return_fd(filp, arg, &mig);
1713out_copy:
1714 mig.data_fd = -1;
1715 if (copy_to_user(arg, &mig, sizeof(mig)))
1716 return -EFAULT;
1717 if (IS_ERR(filp))
1718 return PTR_ERR(filp);
1719 return 0;
1720}
1721
1722static int vfio_ioctl_device_feature_migration(struct vfio_device *device,
1723 u32 flags, void __user *arg,
1724 size_t argsz)
1725{
1726 struct vfio_device_feature_migration mig = {
8cb3d83b 1727 .flags = device->migration_flags,
115dcec6
JG
1728 };
1729 int ret;
1730
6e97eba8 1731 if (!device->mig_ops)
115dcec6
JG
1732 return -ENOTTY;
1733
1734 ret = vfio_check_feature(flags, argsz, VFIO_DEVICE_FEATURE_GET,
1735 sizeof(mig));
1736 if (ret != 1)
1737 return ret;
1738 if (copy_to_user(arg, &mig, sizeof(mig)))
1739 return -EFAULT;
1740 return 0;
1741}
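
/*
 * Illustrative userspace probe (not part of this file) for the feature
 * handled above: a VFIO_DEVICE_FEATURE_GET of VFIO_DEVICE_FEATURE_MIGRATION
 * reports which migration flags the bound driver supports.
 *
 *	struct {
 *		struct vfio_device_feature hdr;
 *		struct vfio_device_feature_migration mig;
 *	} probe = {
 *		.hdr.argsz = sizeof(probe),
 *		.hdr.flags = VFIO_DEVICE_FEATURE_GET |
 *			     VFIO_DEVICE_FEATURE_MIGRATION,
 *	};
 *
 *	if (!ioctl(device, VFIO_DEVICE_FEATURE, &probe) &&
 *	    (probe.mig.flags & VFIO_MIGRATION_STOP_COPY))
 *		;	// device supports the basic stop-copy states
 */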
1742
80c4b92a
YH
1743/* Ranges should fit into a single kernel page */
1744#define LOG_MAX_RANGES \
1745 (PAGE_SIZE / sizeof(struct vfio_device_feature_dma_logging_range))
1746
1747static int
1748vfio_ioctl_device_feature_logging_start(struct vfio_device *device,
1749 u32 flags, void __user *arg,
1750 size_t argsz)
1751{
1752 size_t minsz =
1753 offsetofend(struct vfio_device_feature_dma_logging_control,
1754 ranges);
1755 struct vfio_device_feature_dma_logging_range __user *ranges;
1756 struct vfio_device_feature_dma_logging_control control;
1757 struct vfio_device_feature_dma_logging_range range;
1758 struct rb_root_cached root = RB_ROOT_CACHED;
1759 struct interval_tree_node *nodes;
1760 u64 iova_end;
1761 u32 nnodes;
1762 int i, ret;
1763
1764 if (!device->log_ops)
1765 return -ENOTTY;
1766
1767 ret = vfio_check_feature(flags, argsz,
1768 VFIO_DEVICE_FEATURE_SET,
1769 sizeof(control));
1770 if (ret != 1)
1771 return ret;
1772
1773 if (copy_from_user(&control, arg, minsz))
1774 return -EFAULT;
1775
1776 nnodes = control.num_ranges;
1777 if (!nnodes)
1778 return -EINVAL;
1779
1780 if (nnodes > LOG_MAX_RANGES)
1781 return -E2BIG;
1782
1783 ranges = u64_to_user_ptr(control.ranges);
1784 nodes = kmalloc_array(nnodes, sizeof(struct interval_tree_node),
1785 GFP_KERNEL);
1786 if (!nodes)
1787 return -ENOMEM;
1788
1789 for (i = 0; i < nnodes; i++) {
1790 if (copy_from_user(&range, &ranges[i], sizeof(range))) {
1791 ret = -EFAULT;
1792 goto end;
1793 }
1794 if (!IS_ALIGNED(range.iova, control.page_size) ||
1795 !IS_ALIGNED(range.length, control.page_size)) {
1796 ret = -EINVAL;
1797 goto end;
1798 }
1799
1800 if (check_add_overflow(range.iova, range.length, &iova_end) ||
1801 iova_end > ULONG_MAX) {
1802 ret = -EOVERFLOW;
1803 goto end;
1804 }
1805
1806 nodes[i].start = range.iova;
1807 nodes[i].last = range.iova + range.length - 1;
1808 if (interval_tree_iter_first(&root, nodes[i].start,
1809 nodes[i].last)) {
1810 /* Range overlapping */
1811 ret = -EINVAL;
1812 goto end;
1813 }
1814 interval_tree_insert(nodes + i, &root);
1815 }
1816
1817 ret = device->log_ops->log_start(device, &root, nnodes,
1818 &control.page_size);
1819 if (ret)
1820 goto end;
1821
1822 if (copy_to_user(arg, &control, sizeof(control))) {
1823 ret = -EFAULT;
1824 device->log_ops->log_stop(device);
1825 }
1826
1827end:
1828 kfree(nodes);
1829 return ret;
1830}
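
/*
 * Editorial sketch, not part of vfio_main.c: how userspace might start DMA
 * dirty logging over a single IOVA range through the handler above. The
 * device fd, the range and the 4K page size are assumptions; iova and length
 * must be multiples of page_size. Assumes <sys/ioctl.h>, <string.h>,
 * <stdint.h> and <linux/vfio.h>.
 */
static int example_dma_logging_start(int device_fd, __u64 iova, __u64 length)
{
	struct vfio_device_feature_dma_logging_range range = {
		.iova = iova,
		.length = length,
	};
	struct vfio_device_feature_dma_logging_control control = {
		.page_size = 4096,
		.num_ranges = 1,
		.ranges = (uintptr_t)&range,
	};
	__u64 buf[(sizeof(struct vfio_device_feature) +
		   sizeof(control) + 7) / 8] = {};
	struct vfio_device_feature *feature = (void *)buf;

	feature->argsz = sizeof(struct vfio_device_feature) + sizeof(control);
	feature->flags = VFIO_DEVICE_FEATURE_SET |
			 VFIO_DEVICE_FEATURE_DMA_LOGGING_START;
	memcpy(feature->data, &control, sizeof(control));

	/* On success the driver may have rounded control.page_size up. */
	return ioctl(device_fd, VFIO_DEVICE_FEATURE, feature);
}
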
1831
1832static int
1833vfio_ioctl_device_feature_logging_stop(struct vfio_device *device,
1834 u32 flags, void __user *arg,
1835 size_t argsz)
1836{
1837 int ret;
1838
1839 if (!device->log_ops)
1840 return -ENOTTY;
1841
1842 ret = vfio_check_feature(flags, argsz,
1843 VFIO_DEVICE_FEATURE_SET, 0);
1844 if (ret != 1)
1845 return ret;
1846
1847 return device->log_ops->log_stop(device);
1848}
1849
1850static int vfio_device_log_read_and_clear(struct iova_bitmap *iter,
1851 unsigned long iova, size_t length,
1852 void *opaque)
1853{
1854 struct vfio_device *device = opaque;
1855
1856 return device->log_ops->log_read_and_clear(device, iova, length, iter);
1857}
1858
1859static int
1860vfio_ioctl_device_feature_logging_report(struct vfio_device *device,
1861 u32 flags, void __user *arg,
1862 size_t argsz)
1863{
1864 size_t minsz =
1865 offsetofend(struct vfio_device_feature_dma_logging_report,
1866 bitmap);
1867 struct vfio_device_feature_dma_logging_report report;
1868 struct iova_bitmap *iter;
1869 u64 iova_end;
1870 int ret;
1871
1872 if (!device->log_ops)
1873 return -ENOTTY;
1874
1875 ret = vfio_check_feature(flags, argsz,
1876 VFIO_DEVICE_FEATURE_GET,
1877 sizeof(report));
1878 if (ret != 1)
1879 return ret;
1880
1881 if (copy_from_user(&report, arg, minsz))
1882 return -EFAULT;
1883
1884 if (report.page_size < SZ_4K || !is_power_of_2(report.page_size))
1885 return -EINVAL;
1886
1887 if (check_add_overflow(report.iova, report.length, &iova_end) ||
1888 iova_end > ULONG_MAX)
1889 return -EOVERFLOW;
1890
1891 iter = iova_bitmap_alloc(report.iova, report.length,
1892 report.page_size,
1893 u64_to_user_ptr(report.bitmap));
1894 if (IS_ERR(iter))
1895 return PTR_ERR(iter);
1896
1897 ret = iova_bitmap_for_each(iter, device,
1898 vfio_device_log_read_and_clear);
1899
1900 iova_bitmap_free(iter);
1901 return ret;
1902}
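
/*
 * Editorial sketch, not part of vfio_main.c: reading a dirty bitmap back
 * through the handler above. The caller-allocated bitmap holds one bit per
 * 4K page of the range, i.e. at least DIV_ROUND_UP(length / 4096, 64) __u64
 * words. Names are assumptions; assumes <sys/ioctl.h>, <string.h>,
 * <stdint.h> and <linux/vfio.h>.
 */
static int example_dma_logging_report(int device_fd, __u64 iova, __u64 length,
				      __u64 *bitmap)
{
	struct vfio_device_feature_dma_logging_report report = {
		.iova = iova,
		.length = length,
		.page_size = 4096,
		.bitmap = (uintptr_t)bitmap,
	};
	__u64 buf[(sizeof(struct vfio_device_feature) +
		   sizeof(report) + 7) / 8] = {};
	struct vfio_device_feature *feature = (void *)buf;

	feature->argsz = sizeof(struct vfio_device_feature) + sizeof(report);
	feature->flags = VFIO_DEVICE_FEATURE_GET |
			 VFIO_DEVICE_FEATURE_DMA_LOGGING_REPORT;
	memcpy(feature->data, &report, sizeof(report));

	return ioctl(device_fd, VFIO_DEVICE_FEATURE, feature);
}
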
1903
445ad495
JG
1904static int vfio_ioctl_device_feature(struct vfio_device *device,
1905 struct vfio_device_feature __user *arg)
1906{
1907 size_t minsz = offsetofend(struct vfio_device_feature, flags);
1908 struct vfio_device_feature feature;
1909
1910 if (copy_from_user(&feature, arg, minsz))
1911 return -EFAULT;
1912
1913 if (feature.argsz < minsz)
1914 return -EINVAL;
1915
1916 /* Check unknown flags */
1917 if (feature.flags &
1918 ~(VFIO_DEVICE_FEATURE_MASK | VFIO_DEVICE_FEATURE_SET |
1919 VFIO_DEVICE_FEATURE_GET | VFIO_DEVICE_FEATURE_PROBE))
1920 return -EINVAL;
1921
1922 /* GET & SET are mutually exclusive except with PROBE */
1923 if (!(feature.flags & VFIO_DEVICE_FEATURE_PROBE) &&
1924 (feature.flags & VFIO_DEVICE_FEATURE_SET) &&
1925 (feature.flags & VFIO_DEVICE_FEATURE_GET))
1926 return -EINVAL;
1927
1928 switch (feature.flags & VFIO_DEVICE_FEATURE_MASK) {
115dcec6
JG
1929 case VFIO_DEVICE_FEATURE_MIGRATION:
1930 return vfio_ioctl_device_feature_migration(
1931 device, feature.flags, arg->data,
1932 feature.argsz - minsz);
1933 case VFIO_DEVICE_FEATURE_MIG_DEVICE_STATE:
1934 return vfio_ioctl_device_feature_mig_device_state(
1935 device, feature.flags, arg->data,
1936 feature.argsz - minsz);
80c4b92a
YH
1937 case VFIO_DEVICE_FEATURE_DMA_LOGGING_START:
1938 return vfio_ioctl_device_feature_logging_start(
1939 device, feature.flags, arg->data,
1940 feature.argsz - minsz);
1941 case VFIO_DEVICE_FEATURE_DMA_LOGGING_STOP:
1942 return vfio_ioctl_device_feature_logging_stop(
1943 device, feature.flags, arg->data,
1944 feature.argsz - minsz);
1945 case VFIO_DEVICE_FEATURE_DMA_LOGGING_REPORT:
1946 return vfio_ioctl_device_feature_logging_report(
1947 device, feature.flags, arg->data,
1948 feature.argsz - minsz);
445ad495
JG
1949 default:
1950 if (unlikely(!device->ops->device_feature))
1951 return -EINVAL;
1952 return device->ops->device_feature(device, feature.flags,
1953 arg->data,
1954 feature.argsz - minsz);
1955 }
1956}
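
/*
 * Editorial sketch, not part of vfio_main.c: probing for a feature from
 * userspace under the flag rules enforced above. PROBE may be combined with
 * GET and/or SET to ask whether that access is supported without performing
 * it. "device_fd" is an assumed open VFIO device fd; assumes <sys/ioctl.h>
 * and <linux/vfio.h>.
 */
static bool example_device_supports_migration(int device_fd)
{
	struct vfio_device_feature feature = {
		.argsz = sizeof(feature),
		.flags = VFIO_DEVICE_FEATURE_PROBE | VFIO_DEVICE_FEATURE_GET |
			 VFIO_DEVICE_FEATURE_MIGRATION,
	};

	return ioctl(device_fd, VFIO_DEVICE_FEATURE, &feature) == 0;
}
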
1957
cba3345c
AW
1958static long vfio_device_fops_unl_ioctl(struct file *filep,
1959 unsigned int cmd, unsigned long arg)
1960{
1961 struct vfio_device *device = filep->private_data;
8e5c6995
AS
1962 int ret;
1963
1964 ret = vfio_device_pm_runtime_get(device);
1965 if (ret)
1966 return ret;
cba3345c 1967
445ad495
JG
1968 switch (cmd) {
1969 case VFIO_DEVICE_FEATURE:
8e5c6995
AS
1970 ret = vfio_ioctl_device_feature(device, (void __user *)arg);
1971 break;
1972
445ad495
JG
1973 default:
1974 if (unlikely(!device->ops->ioctl))
8e5c6995
AS
1975 ret = -EINVAL;
1976 else
1977 ret = device->ops->ioctl(device, cmd, arg);
1978 break;
445ad495 1979 }
8e5c6995
AS
1980
1981 vfio_device_pm_runtime_put(device);
1982 return ret;
cba3345c
AW
1983}
1984
1985static ssize_t vfio_device_fops_read(struct file *filep, char __user *buf,
1986 size_t count, loff_t *ppos)
1987{
1988 struct vfio_device *device = filep->private_data;
1989
1990 if (unlikely(!device->ops->read))
1991 return -EINVAL;
1992
6df62c5b 1993 return device->ops->read(device, buf, count, ppos);
cba3345c
AW
1994}
1995
1996static ssize_t vfio_device_fops_write(struct file *filep,
1997 const char __user *buf,
1998 size_t count, loff_t *ppos)
1999{
2000 struct vfio_device *device = filep->private_data;
2001
2002 if (unlikely(!device->ops->write))
2003 return -EINVAL;
2004
6df62c5b 2005 return device->ops->write(device, buf, count, ppos);
cba3345c
AW
2006}
2007
2008static int vfio_device_fops_mmap(struct file *filep, struct vm_area_struct *vma)
2009{
2010 struct vfio_device *device = filep->private_data;
2011
2012 if (unlikely(!device->ops->mmap))
2013 return -EINVAL;
2014
6df62c5b 2015 return device->ops->mmap(device, vma);
cba3345c
AW
2016}
2017
cba3345c
AW
2018static const struct file_operations vfio_device_fops = {
2019 .owner = THIS_MODULE,
2020 .release = vfio_device_fops_release,
2021 .read = vfio_device_fops_read,
2022 .write = vfio_device_fops_write,
2023 .unlocked_ioctl = vfio_device_fops_unl_ioctl,
407e9ef7 2024 .compat_ioctl = compat_ptr_ioctl,
cba3345c
AW
2025 .mmap = vfio_device_fops_mmap,
2026};
2027
50d63b5b
JG
2028/**
2029 * vfio_file_iommu_group - Return the struct iommu_group for the vfio group file
2030 * @file: VFIO group file
6cdd9782 2031 *
50d63b5b 2032 * The returned iommu_group is valid as long as a ref is held on the file.
6cdd9782 2033 */
50d63b5b 2034struct iommu_group *vfio_file_iommu_group(struct file *file)
6cdd9782 2035{
50d63b5b 2036 struct vfio_group *group = file->private_data;
6cdd9782 2037
50d63b5b
JG
2038 if (file->f_op != &vfio_group_fops)
2039 return NULL;
2040 return group->iommu_group;
6cdd9782 2041}
50d63b5b 2042EXPORT_SYMBOL_GPL(vfio_file_iommu_group);
6cdd9782 2043
a905ad04
JG
2044/**
2045 * vfio_file_enforced_coherent - True if the DMA associated with the VFIO file
2046 * is always CPU cache coherent
2047 * @file: VFIO group file
c0560f51 2048 *
a905ad04
JG
2049 * Enforced coherency means that the IOMMU ignores things like the PCIe no-snoop
2050 * bit in DMA transactions. A return of false indicates that the user has
2051 * rights to access additional instructions such as wbinvd on x86.
c0560f51 2052 */
a905ad04 2053bool vfio_file_enforced_coherent(struct file *file)
c0560f51 2054{
a905ad04
JG
2055 struct vfio_group *group = file->private_data;
2056 bool ret;
c0560f51 2057
a905ad04
JG
2058 if (file->f_op != &vfio_group_fops)
2059 return true;
c0560f51 2060
e0e29bdb
JG
2061 down_read(&group->group_rwsem);
2062 if (group->container) {
2063 ret = vfio_ioctl_check_extension(group->container,
2064 VFIO_DMA_CC_IOMMU);
2065 } else {
2066 /*
 2067 * Since the coherency state is determined only once a container
 2068 * is attached, the user must attach a container before they can
 2069 * prove they have permission.
2070 */
2071 ret = true;
c0560f51 2072 }
e0e29bdb 2073 up_read(&group->group_rwsem);
a905ad04 2074 return ret;
c0560f51 2075}
a905ad04 2076EXPORT_SYMBOL_GPL(vfio_file_enforced_coherent);
c0560f51 2077
ba70a89f
JG
2078/**
2079 * vfio_file_set_kvm - Link a kvm with VFIO drivers
2080 * @file: VFIO group file
2081 * @kvm: KVM to link
2082 *
421cfe65
MR
2083 * When a VFIO device is first opened the KVM will be available in
2084 * device->kvm if one was associated with the group.
ba70a89f
JG
2085 */
2086void vfio_file_set_kvm(struct file *file, struct kvm *kvm)
6cdd9782 2087{
ba70a89f 2088 struct vfio_group *group = file->private_data;
6cdd9782 2089
ba70a89f
JG
2090 if (file->f_op != &vfio_group_fops)
2091 return;
5d6dee80 2092
be8d3ada 2093 down_write(&group->group_rwsem);
ba70a89f 2094 group->kvm = kvm;
be8d3ada 2095 up_write(&group->group_rwsem);
5d6dee80 2096}
ba70a89f 2097EXPORT_SYMBOL_GPL(vfio_file_set_kvm);
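
/*
 * Editorial sketch, not part of vfio_main.c: a KVM-style kernel consumer of
 * the helpers above. It resolves a user-supplied group fd, records whether
 * DMA is enforced cache coherent, and links its kvm so devices opened
 * through the group see it in device->kvm. Names are hypothetical; assumes
 * <linux/file.h> and <linux/vfio.h>.
 */
static int example_consume_group_fd(struct kvm *kvm, int group_fd,
				    bool *noncoherent_dma)
{
	struct file *filp = fget(group_fd);

	if (!filp)
		return -EBADF;

	if (!vfio_file_iommu_group(filp)) {
		/* Not a VFIO group file */
		fput(filp);
		return -EINVAL;
	}

	*noncoherent_dma = !vfio_file_enforced_coherent(filp);
	vfio_file_set_kvm(filp, kvm);

	/* A real consumer would hold the file reference until unlink time. */
	fput(filp);
	return 0;
}
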
5d6dee80 2098
6a985ae8
JG
2099/**
 2100 * vfio_file_has_dev - True if the VFIO file is a handle for the device
2101 * @file: VFIO file to check
2102 * @device: Device that must be part of the file
2103 *
2104 * Returns true if given file has permission to manipulate the given device.
2105 */
2106bool vfio_file_has_dev(struct file *file, struct vfio_device *device)
6cdd9782 2107{
6a985ae8 2108 struct vfio_group *group = file->private_data;
6cdd9782 2109
6a985ae8
JG
2110 if (file->f_op != &vfio_group_fops)
2111 return false;
2112
2113 return group == device->group;
88d7ab89 2114}
6a985ae8 2115EXPORT_SYMBOL_GPL(vfio_file_has_dev);
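
/*
 * Editorial sketch, not part of vfio_main.c: how a driver might validate a
 * user-supplied group fd before acting on one of its devices, in the spirit
 * of the vfio-pci hot-reset path. Names are hypothetical; assumes
 * <linux/file.h> and <linux/vfio.h>.
 */
static bool example_user_owns_device(int group_fd, struct vfio_device *vdev)
{
	struct file *filp = fget(group_fd);
	bool owned;

	if (!filp)
		return false;

	owned = vfio_file_has_dev(filp, vdev);
	fput(filp);
	return owned;
}
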
88d7ab89 2116
3b9a2d57 2117/*
d7a8d5ed
AW
2118 * Sub-module support
2119 */
2120/*
2121 * Helper for managing a buffer of info chain capabilities, allocate or
2122 * reallocate a buffer with additional @size, filling in @id and @version
2123 * of the capability. A pointer to the new capability is returned.
2124 *
 2125 * NB. The chain is based at the head of the buffer, so new entries are
 2126 * added to the tail; vfio_info_cap_shift() should be called to fix up the
 2127 * next offsets prior to copying to the user buffer.
2128 */
2129struct vfio_info_cap_header *vfio_info_cap_add(struct vfio_info_cap *caps,
2130 size_t size, u16 id, u16 version)
2131{
2132 void *buf;
2133 struct vfio_info_cap_header *header, *tmp;
2134
2135 buf = krealloc(caps->buf, caps->size + size, GFP_KERNEL);
2136 if (!buf) {
2137 kfree(caps->buf);
6641085e 2138 caps->buf = NULL;
d7a8d5ed
AW
2139 caps->size = 0;
2140 return ERR_PTR(-ENOMEM);
2141 }
2142
2143 caps->buf = buf;
2144 header = buf + caps->size;
2145
2146 /* Eventually copied to user buffer, zero */
2147 memset(header, 0, size);
2148
2149 header->id = id;
2150 header->version = version;
2151
2152 /* Add to the end of the capability chain */
5ba6de98 2153 for (tmp = buf; tmp->next; tmp = buf + tmp->next)
d7a8d5ed
AW
2154 ; /* nothing */
2155
2156 tmp->next = caps->size;
2157 caps->size += size;
2158
2159 return header;
2160}
2161EXPORT_SYMBOL_GPL(vfio_info_cap_add);
2162
2163void vfio_info_cap_shift(struct vfio_info_cap *caps, size_t offset)
2164{
2165 struct vfio_info_cap_header *tmp;
5ba6de98 2166 void *buf = (void *)caps->buf;
d7a8d5ed 2167
5ba6de98 2168 for (tmp = buf; tmp->next; tmp = buf + tmp->next - offset)
d7a8d5ed
AW
2169 tmp->next += offset;
2170}
b3c0a866 2171EXPORT_SYMBOL(vfio_info_cap_shift);
d7a8d5ed 2172
dda01f78
AW
2173int vfio_info_add_capability(struct vfio_info_cap *caps,
2174 struct vfio_info_cap_header *cap, size_t size)
b3c0a866
KW
2175{
2176 struct vfio_info_cap_header *header;
b3c0a866 2177
dda01f78 2178 header = vfio_info_cap_add(caps, size, cap->id, cap->version);
b3c0a866
KW
2179 if (IS_ERR(header))
2180 return PTR_ERR(header);
2181
dda01f78 2182 memcpy(header + 1, cap + 1, size - sizeof(*header));
b3c0a866 2183
b3c0a866
KW
2184 return 0;
2185}
b3c0a866 2186EXPORT_SYMBOL(vfio_info_add_capability);
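
/*
 * Editorial sketch, not part of vfio_main.c: the intended calling pattern
 * for the capability-chain helpers above, loosely modelled on how vfio-pci
 * reports region info capabilities. The capability contents and .type/.subtype
 * values are placeholders; assumes <linux/slab.h>, <linux/uaccess.h> and
 * <linux/vfio.h>.
 */
static int example_report_region_caps(struct vfio_region_info *info,
				      void __user *arg)
{
	struct vfio_info_cap caps = { .buf = NULL, .size = 0 };
	struct vfio_region_info_cap_type cap_type = {
		.header.id = VFIO_REGION_INFO_CAP_TYPE,
		.header.version = 1,
		.type = 0x1, .subtype = 0x1,	/* placeholder values */
	};
	int ret;

	ret = vfio_info_add_capability(&caps, &cap_type.header,
				       sizeof(cap_type));
	if (ret)
		return ret;

	info->flags |= VFIO_REGION_INFO_FLAG_CAPS;
	if (info->argsz < sizeof(*info) + caps.size) {
		/* Tell userspace how much room the chain needs. */
		info->argsz = sizeof(*info) + caps.size;
		info->cap_offset = 0;
	} else {
		/* The chain follows the fixed struct; rebase the next offsets. */
		vfio_info_cap_shift(&caps, sizeof(*info));
		if (copy_to_user(arg + sizeof(*info), caps.buf, caps.size)) {
			kfree(caps.buf);
			return -EFAULT;
		}
		info->cap_offset = sizeof(*info);
	}
	kfree(caps.buf);
	return 0;
}
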
2169037d 2187
c747f08a
KW
2188int vfio_set_irqs_validate_and_prepare(struct vfio_irq_set *hdr, int num_irqs,
2189 int max_irq_type, size_t *data_size)
2190{
2191 unsigned long minsz;
2192 size_t size;
2193
2194 minsz = offsetofend(struct vfio_irq_set, count);
2195
2196 if ((hdr->argsz < minsz) || (hdr->index >= max_irq_type) ||
2197 (hdr->count >= (U32_MAX - hdr->start)) ||
2198 (hdr->flags & ~(VFIO_IRQ_SET_DATA_TYPE_MASK |
2199 VFIO_IRQ_SET_ACTION_TYPE_MASK)))
2200 return -EINVAL;
2201
2202 if (data_size)
2203 *data_size = 0;
2204
2205 if (hdr->start >= num_irqs || hdr->start + hdr->count > num_irqs)
2206 return -EINVAL;
2207
2208 switch (hdr->flags & VFIO_IRQ_SET_DATA_TYPE_MASK) {
2209 case VFIO_IRQ_SET_DATA_NONE:
2210 size = 0;
2211 break;
2212 case VFIO_IRQ_SET_DATA_BOOL:
2213 size = sizeof(uint8_t);
2214 break;
2215 case VFIO_IRQ_SET_DATA_EVENTFD:
2216 size = sizeof(int32_t);
2217 break;
2218 default:
2219 return -EINVAL;
2220 }
2221
2222 if (size) {
2223 if (hdr->argsz - minsz < hdr->count * size)
2224 return -EINVAL;
2225
2226 if (!data_size)
2227 return -EINVAL;
2228
2229 *data_size = hdr->count * size;
2230 }
2231
2232 return 0;
2233}
2234EXPORT_SYMBOL(vfio_set_irqs_validate_and_prepare);
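
/*
 * Editorial sketch, not part of vfio_main.c: typical use of the helper
 * above inside a driver's VFIO_DEVICE_SET_IRQS handler, loosely following
 * vfio-pci. "my_num_irqs" is hypothetical; assumes <linux/slab.h>,
 * <linux/uaccess.h> and <linux/vfio.h>.
 */
static long example_ioctl_set_irqs(void __user *arg, int my_num_irqs)
{
	struct vfio_irq_set hdr;
	size_t data_size = 0;
	u8 *data = NULL;
	int ret;

	if (copy_from_user(&hdr, arg, sizeof(hdr)))
		return -EFAULT;

	ret = vfio_set_irqs_validate_and_prepare(&hdr, my_num_irqs,
						 VFIO_PCI_NUM_IRQS, &data_size);
	if (ret)
		return ret;

	if (data_size) {
		/* Eventfds/bools trail the fixed header in the user buffer. */
		data = memdup_user(arg + sizeof(hdr), data_size);
		if (IS_ERR(data))
			return PTR_ERR(data);
	}

	/* A real driver would now wire hdr.index/start/count to its IRQ code. */

	kfree(data);
	return 0;
}
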
2235
2169037d 2236/*
44abdd16 2237 * Pin contiguous user pages and return their associated host pages for local
2169037d 2238 * domain only.
8e432bb0 2239 * @device [in] : device
44abdd16
NC
2240 * @iova [in] : starting IOVA of user pages to be pinned.
2241 * @npage [in] : count of pages to be pinned. This count should not
2242 * be greater than VFIO_PIN_PAGES_MAX_ENTRIES.
2169037d 2243 * @prot [in] : protection flags
34a255e6 2244 * @pages[out] : array of host pages
2169037d 2245 * Return error or number of pages pinned.
21c13829
JG
2246 *
2247 * A driver may only call this function if the vfio_device was created
2248 * by vfio_register_emulated_iommu_dev().
2169037d 2249 */
44abdd16 2250int vfio_pin_pages(struct vfio_device *device, dma_addr_t iova,
34a255e6 2251 int npage, int prot, struct page **pages)
2169037d
KW
2252{
2253 struct vfio_container *container;
8e432bb0 2254 struct vfio_group *group = device->group;
2169037d
KW
2255 struct vfio_iommu_driver *driver;
2256 int ret;
2257
34a255e6 2258 if (!pages || !npage || !vfio_assert_device_open(device))
2169037d
KW
2259 return -EINVAL;
2260
2261 if (npage > VFIO_PIN_PAGES_MAX_ENTRIES)
2262 return -E2BIG;
2263
e0e29bdb 2264 /* group->container cannot change while a vfio device is open */
2169037d 2265 container = group->container;
2169037d
KW
2266 driver = container->iommu_driver;
2267 if (likely(driver && driver->ops->pin_pages))
95fc87b4 2268 ret = driver->ops->pin_pages(container->iommu_data,
44abdd16 2269 group->iommu_group, iova,
34a255e6 2270 npage, prot, pages);
2169037d
KW
2271 else
2272 ret = -ENOTTY;
2273
2169037d
KW
2274 return ret;
2275}
2276EXPORT_SYMBOL(vfio_pin_pages);
2277
2278/*
44abdd16 2279 * Unpin contiguous host pages for local domain only.
8e432bb0 2280 * @device [in] : device
44abdd16
NC
2281 * @iova [in] : starting address of user pages to be unpinned.
2282 * @npage [in] : count of pages to be unpinned. This count should not
2169037d 2283 * be greater than VFIO_PIN_PAGES_MAX_ENTRIES.
2169037d 2284 */
44abdd16 2285void vfio_unpin_pages(struct vfio_device *device, dma_addr_t iova, int npage)
2169037d
KW
2286{
2287 struct vfio_container *container;
2169037d 2288 struct vfio_iommu_driver *driver;
2169037d 2289
e8f90717
NC
2290 if (WARN_ON(npage <= 0 || npage > VFIO_PIN_PAGES_MAX_ENTRIES))
2291 return;
2169037d 2292
e8f90717
NC
2293 if (WARN_ON(!vfio_assert_device_open(device)))
2294 return;
2169037d 2295
e0e29bdb 2296 /* group->container cannot change while a vfio device is open */
8e432bb0 2297 container = device->group->container;
2169037d 2298 driver = container->iommu_driver;
2169037d 2299
44abdd16 2300 driver->ops->unpin_pages(container->iommu_data, iova, npage);
2169037d
KW
2301}
2302EXPORT_SYMBOL(vfio_unpin_pages);
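
/*
 * Editorial sketch, not part of vfio_main.c: how an emulated-IOMMU (mdev
 * style) driver registered with vfio_register_emulated_iommu_dev() might
 * pin one guest page, read it through the returned struct page, and unpin
 * it again. Names are hypothetical; assumes <linux/highmem.h>,
 * <linux/iommu.h> and <linux/vfio.h>.
 */
static int example_copy_from_guest_page(struct vfio_device *vdev,
					dma_addr_t iova, void *dst, size_t len)
{
	dma_addr_t base = iova & PAGE_MASK;
	struct page *page;
	void *va;
	int ret;

	if (offset_in_page(iova) + len > PAGE_SIZE)
		return -EINVAL;

	ret = vfio_pin_pages(vdev, base, 1, IOMMU_READ, &page);
	if (ret != 1)
		return ret < 0 ? ret : -EFAULT;

	va = kmap_local_page(page);
	memcpy(dst, va + offset_in_page(iova), len);
	kunmap_local(va);

	vfio_unpin_pages(vdev, base, 1);
	return 0;
}
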
2303
8d46c0cc
YZ
2304/*
 2305 * This interface allows the CPUs to perform a form of virtual DMA on
 2306 * behalf of the device.
2307 *
2308 * CPUs read/write from/into a range of IOVAs pointing to user space memory
2309 * into/from a kernel buffer.
2310 *
2311 * As the read/write of user space memory is conducted via the CPUs and is
2312 * not a real device DMA, it is not necessary to pin the user space memory.
2313 *
c6250ffb 2314 * @device [in] : VFIO device
8561aa4f 2315 * @iova [in] : base IOVA of a user space buffer
8d46c0cc
YZ
2316 * @data [in] : pointer to kernel buffer
2317 * @len [in] : kernel buffer length
2318 * @write : indicate read or write
2319 * Return error code on failure or 0 on success.
2320 */
8561aa4f 2321int vfio_dma_rw(struct vfio_device *device, dma_addr_t iova, void *data,
c6250ffb 2322 size_t len, bool write)
8d46c0cc
YZ
2323{
2324 struct vfio_container *container;
2325 struct vfio_iommu_driver *driver;
2326 int ret = 0;
2327
eadd86f8 2328 if (!data || len <= 0 || !vfio_assert_device_open(device))
8d46c0cc
YZ
2329 return -EINVAL;
2330
e0e29bdb 2331 /* group->container cannot change while a vfio device is open */
c6250ffb 2332 container = device->group->container;
8d46c0cc
YZ
2333 driver = container->iommu_driver;
2334
2335 if (likely(driver && driver->ops->dma_rw))
2336 ret = driver->ops->dma_rw(container->iommu_data,
8561aa4f 2337 iova, data, len, write);
8d46c0cc
YZ
2338 else
2339 ret = -ENOTTY;
8d46c0cc
YZ
2340 return ret;
2341}
2342EXPORT_SYMBOL(vfio_dma_rw);
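
/*
 * Editorial sketch, not part of vfio_main.c: using the helper above to let
 * an emulated device write a completion record into guest memory without
 * pinning it first. The ring layout and names are hypothetical; assumes
 * <linux/vfio.h>.
 */
static int example_write_completion(struct vfio_device *vdev,
				    dma_addr_t ring_iova, u32 tail, u64 status)
{
	/* One 8-byte record per ring slot, at ring_iova + tail * 8. */
	return vfio_dma_rw(vdev,
			   ring_iova + (dma_addr_t)tail * sizeof(status),
			   &status, sizeof(status), true);
}
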
2343
3b9a2d57 2344/*
cba3345c
AW
2345 * Module/class support
2346 */
2347static char *vfio_devnode(struct device *dev, umode_t *mode)
2348{
2349 return kasprintf(GFP_KERNEL, "vfio/%s", dev_name(dev));
2350}
2351
d1099901
AW
2352static struct miscdevice vfio_dev = {
2353 .minor = VFIO_MINOR,
2354 .name = "vfio",
2355 .fops = &vfio_fops,
2356 .nodename = "vfio/vfio",
2357 .mode = S_IRUGO | S_IWUGO,
2358};
2359
cba3345c
AW
2360static int __init vfio_init(void)
2361{
2362 int ret;
2363
9cef7391 2364 ida_init(&vfio.group_ida);
cba3345c
AW
2365 mutex_init(&vfio.group_lock);
2366 mutex_init(&vfio.iommu_drivers_lock);
2367 INIT_LIST_HEAD(&vfio.group_list);
2368 INIT_LIST_HEAD(&vfio.iommu_drivers_list);
cba3345c 2369
d1099901
AW
2370 ret = misc_register(&vfio_dev);
2371 if (ret) {
2372 pr_err("vfio: misc device register failed\n");
2373 return ret;
2374 }
2375
2376 /* /dev/vfio/$GROUP */
cba3345c
AW
2377 vfio.class = class_create(THIS_MODULE, "vfio");
2378 if (IS_ERR(vfio.class)) {
2379 ret = PTR_ERR(vfio.class);
2380 goto err_class;
2381 }
2382
2383 vfio.class->devnode = vfio_devnode;
2384
8bcb64a5 2385 ret = alloc_chrdev_region(&vfio.group_devt, 0, MINORMASK + 1, "vfio");
cba3345c 2386 if (ret)
d1099901 2387 goto err_alloc_chrdev;
cba3345c 2388
03a76b60 2389#ifdef CONFIG_VFIO_NOIOMMU
a13b1e47 2390 ret = vfio_register_iommu_driver(&vfio_noiommu_ops);
03a76b60 2391#endif
a13b1e47
BL
2392 if (ret)
2393 goto err_driver_register;
2394
2395 pr_info(DRIVER_DESC " version: " DRIVER_VERSION "\n");
cba3345c
AW
2396 return 0;
2397
a13b1e47
BL
2398err_driver_register:
2399 unregister_chrdev_region(vfio.group_devt, MINORMASK + 1);
d1099901 2400err_alloc_chrdev:
cba3345c
AW
2401 class_destroy(vfio.class);
2402 vfio.class = NULL;
2403err_class:
d1099901 2404 misc_deregister(&vfio_dev);
cba3345c
AW
2405 return ret;
2406}
2407
2408static void __exit vfio_cleanup(void)
2409{
2410 WARN_ON(!list_empty(&vfio.group_list));
2411
03a76b60
AW
2412#ifdef CONFIG_VFIO_NOIOMMU
2413 vfio_unregister_iommu_driver(&vfio_noiommu_ops);
2414#endif
9cef7391 2415 ida_destroy(&vfio.group_ida);
8bcb64a5 2416 unregister_chrdev_region(vfio.group_devt, MINORMASK + 1);
cba3345c
AW
2417 class_destroy(vfio.class);
2418 vfio.class = NULL;
d1099901 2419 misc_deregister(&vfio_dev);
2fd585f4 2420 xa_destroy(&vfio_device_set_xa);
cba3345c
AW
2421}
2422
2423module_init(vfio_init);
2424module_exit(vfio_cleanup);
2425
2426MODULE_VERSION(DRIVER_VERSION);
2427MODULE_LICENSE("GPL v2");
2428MODULE_AUTHOR(DRIVER_AUTHOR);
2429MODULE_DESCRIPTION(DRIVER_DESC);
d1099901
AW
2430MODULE_ALIAS_MISCDEV(VFIO_MINOR);
2431MODULE_ALIAS("devname:vfio/vfio");
0ca582fd 2432MODULE_SOFTDEP("post: vfio_iommu_type1 vfio_iommu_spapr_tce");