// SPDX-License-Identifier: GPL-2.0-only
/*
 * VFIO core
 *
 * Copyright (C) 2012 Red Hat, Inc.  All rights reserved.
 *     Author: Alex Williamson <alex.williamson@redhat.com>
 *
 * Derived from original vfio:
 * Copyright 2010 Cisco Systems, Inc.  All rights reserved.
 * Author: Tom Lyon, pugs@cisco.com
 */

#include <linux/cdev.h>
#include <linux/compat.h>
#include <linux/device.h>
#include <linux/file.h>
#include <linux/anon_inodes.h>
#include <linux/fs.h>
#include <linux/idr.h>
#include <linux/iommu.h>
#include <linux/list.h>
#include <linux/miscdevice.h>
#include <linux/module.h>
#include <linux/mutex.h>
#include <linux/pci.h>
#include <linux/rwsem.h>
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/stat.h>
#include <linux/string.h>
#include <linux/uaccess.h>
#include <linux/vfio.h>
#include <linux/wait.h>
#include <linux/sched/signal.h>
#include "vfio.h"

#define DRIVER_VERSION	"0.3"
#define DRIVER_AUTHOR	"Alex Williamson <alex.williamson@redhat.com>"
#define DRIVER_DESC	"VFIO - User Level meta-driver"

static struct vfio {
	struct class *class;
	struct list_head iommu_drivers_list;
	struct mutex iommu_drivers_lock;
	struct list_head group_list;
	struct mutex group_lock; /* locks group_list */
	struct ida group_ida;
	dev_t group_devt;
} vfio;

struct vfio_iommu_driver {
	const struct vfio_iommu_driver_ops *ops;
	struct list_head vfio_next;
};

struct vfio_container {
	struct kref kref;
	struct list_head group_list;
	struct rw_semaphore group_lock;
	struct vfio_iommu_driver *iommu_driver;
	void *iommu_data;
	bool noiommu;
};

struct vfio_group {
	struct device dev;
	struct cdev cdev;
	refcount_t users;
	unsigned int container_users;
	struct iommu_group *iommu_group;
	struct vfio_container *container;
	struct list_head device_list;
	struct mutex device_lock;
	struct list_head vfio_next;
	struct list_head container_next;
	enum vfio_group_type type;
	unsigned int dev_counter;
	struct rw_semaphore group_rwsem;
	struct kvm *kvm;
	struct file *opened_file;
	struct blocking_notifier_head notifier;
};

#ifdef CONFIG_VFIO_NOIOMMU
static bool noiommu __read_mostly;
module_param_named(enable_unsafe_noiommu_mode,
		   noiommu, bool, S_IRUGO | S_IWUSR);
MODULE_PARM_DESC(enable_unsafe_noiommu_mode, "Enable UNSAFE, no-IOMMU mode.  This mode provides no device isolation, no DMA translation, no host kernel protection, cannot be used for device assignment to virtual machines, requires RAWIO permissions, and will taint the kernel.  If you do not know what this is for, step away. (default: false)");
#endif

static DEFINE_XARRAY(vfio_device_set_xa);
static const struct file_operations vfio_group_fops;

int vfio_assign_device_set(struct vfio_device *device, void *set_id)
{
	unsigned long idx = (unsigned long)set_id;
	struct vfio_device_set *new_dev_set;
	struct vfio_device_set *dev_set;

	if (WARN_ON(!set_id))
		return -EINVAL;

	/*
	 * Atomically acquire a singleton object in the xarray for this set_id
	 */
	xa_lock(&vfio_device_set_xa);
	dev_set = xa_load(&vfio_device_set_xa, idx);
	if (dev_set)
		goto found_get_ref;
	xa_unlock(&vfio_device_set_xa);

	new_dev_set = kzalloc(sizeof(*new_dev_set), GFP_KERNEL);
	if (!new_dev_set)
		return -ENOMEM;
	mutex_init(&new_dev_set->lock);
	INIT_LIST_HEAD(&new_dev_set->device_list);
	new_dev_set->set_id = set_id;

	xa_lock(&vfio_device_set_xa);
	dev_set = __xa_cmpxchg(&vfio_device_set_xa, idx, NULL, new_dev_set,
			       GFP_KERNEL);
	if (!dev_set) {
		dev_set = new_dev_set;
		goto found_get_ref;
	}

	kfree(new_dev_set);
	if (xa_is_err(dev_set)) {
		xa_unlock(&vfio_device_set_xa);
		return xa_err(dev_set);
	}

found_get_ref:
	dev_set->device_count++;
	xa_unlock(&vfio_device_set_xa);
	mutex_lock(&dev_set->lock);
	device->dev_set = dev_set;
	list_add_tail(&device->dev_set_list, &dev_set->device_list);
	mutex_unlock(&dev_set->lock);
	return 0;
}
EXPORT_SYMBOL_GPL(vfio_assign_device_set);
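/*
 * Usage sketch (illustrative only, not part of this file): a driver whose
 * devices must be reset together can share one vfio_device_set by passing a
 * common set_id from probe; the names below are hypothetical, e.g. a driver
 * keying the set on its parent PCI slot:
 *
 *	static int my_vfio_probe(struct pci_dev *pdev)
 *	{
 *		...
 *		ret = vfio_assign_device_set(&mydev->vdev, pdev->slot);
 *		if (ret)
 *			return ret;
 *		...
 *	}
 *
 * All devices passing the same pointer land in one set; drivers that skip
 * this call get a singleton set in __vfio_register_dev() below.
 */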

static void vfio_release_device_set(struct vfio_device *device)
{
	struct vfio_device_set *dev_set = device->dev_set;

	if (!dev_set)
		return;

	mutex_lock(&dev_set->lock);
	list_del(&device->dev_set_list);
	mutex_unlock(&dev_set->lock);

	xa_lock(&vfio_device_set_xa);
	if (!--dev_set->device_count) {
		__xa_erase(&vfio_device_set_xa,
			   (unsigned long)dev_set->set_id);
		mutex_destroy(&dev_set->lock);
		kfree(dev_set);
	}
	xa_unlock(&vfio_device_set_xa);
}

#ifdef CONFIG_VFIO_NOIOMMU
static void *vfio_noiommu_open(unsigned long arg)
{
	if (arg != VFIO_NOIOMMU_IOMMU)
		return ERR_PTR(-EINVAL);
	if (!capable(CAP_SYS_RAWIO))
		return ERR_PTR(-EPERM);

	return NULL;
}

static void vfio_noiommu_release(void *iommu_data)
{
}

static long vfio_noiommu_ioctl(void *iommu_data,
			       unsigned int cmd, unsigned long arg)
{
	if (cmd == VFIO_CHECK_EXTENSION)
		return noiommu && (arg == VFIO_NOIOMMU_IOMMU) ? 1 : 0;

	return -ENOTTY;
}

static int vfio_noiommu_attach_group(void *iommu_data,
		struct iommu_group *iommu_group, enum vfio_group_type type)
{
	return 0;
}

static void vfio_noiommu_detach_group(void *iommu_data,
				      struct iommu_group *iommu_group)
{
}

static const struct vfio_iommu_driver_ops vfio_noiommu_ops = {
	.name = "vfio-noiommu",
	.owner = THIS_MODULE,
	.open = vfio_noiommu_open,
	.release = vfio_noiommu_release,
	.ioctl = vfio_noiommu_ioctl,
	.attach_group = vfio_noiommu_attach_group,
	.detach_group = vfio_noiommu_detach_group,
};

/*
 * Only noiommu containers can use vfio-noiommu and noiommu containers can only
 * use vfio-noiommu.
 */
static inline bool vfio_iommu_driver_allowed(struct vfio_container *container,
		const struct vfio_iommu_driver *driver)
{
	return container->noiommu == (driver->ops == &vfio_noiommu_ops);
}
#else
static inline bool vfio_iommu_driver_allowed(struct vfio_container *container,
		const struct vfio_iommu_driver *driver)
{
	return true;
}
#endif /* CONFIG_VFIO_NOIOMMU */

/*
 * IOMMU driver registration
 */
int vfio_register_iommu_driver(const struct vfio_iommu_driver_ops *ops)
{
	struct vfio_iommu_driver *driver, *tmp;

	driver = kzalloc(sizeof(*driver), GFP_KERNEL);
	if (!driver)
		return -ENOMEM;

	driver->ops = ops;

	mutex_lock(&vfio.iommu_drivers_lock);

	/* Check for duplicates */
	list_for_each_entry(tmp, &vfio.iommu_drivers_list, vfio_next) {
		if (tmp->ops == ops) {
			mutex_unlock(&vfio.iommu_drivers_lock);
			kfree(driver);
			return -EINVAL;
		}
	}

	list_add(&driver->vfio_next, &vfio.iommu_drivers_list);

	mutex_unlock(&vfio.iommu_drivers_lock);

	return 0;
}
EXPORT_SYMBOL_GPL(vfio_register_iommu_driver);
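/*
 * Usage sketch (illustrative, not from this file): an IOMMU backend such as
 * vfio_iommu_type1 registers its ops table from module init; the duplicate
 * check above makes a double registration fail with -EINVAL.  The names here
 * are hypothetical:
 *
 *	static const struct vfio_iommu_driver_ops my_iommu_ops = {
 *		.name		= "my-iommu",
 *		.owner		= THIS_MODULE,
 *		.open		= my_iommu_open,
 *		.release	= my_iommu_release,
 *		.ioctl		= my_iommu_ioctl,
 *		.attach_group	= my_iommu_attach_group,
 *		.detach_group	= my_iommu_detach_group,
 *	};
 *
 *	return vfio_register_iommu_driver(&my_iommu_ops);
 */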

void vfio_unregister_iommu_driver(const struct vfio_iommu_driver_ops *ops)
{
	struct vfio_iommu_driver *driver;

	mutex_lock(&vfio.iommu_drivers_lock);
	list_for_each_entry(driver, &vfio.iommu_drivers_list, vfio_next) {
		if (driver->ops == ops) {
			list_del(&driver->vfio_next);
			mutex_unlock(&vfio.iommu_drivers_lock);
			kfree(driver);
			return;
		}
	}
	mutex_unlock(&vfio.iommu_drivers_lock);
}
EXPORT_SYMBOL_GPL(vfio_unregister_iommu_driver);

static void vfio_group_get(struct vfio_group *group);

/*
 * Container objects - containers are created when /dev/vfio/vfio is
 * opened, but their lifecycle extends until the last user is done, so
 * it's freed via kref.  Must support container/group/device being
 * closed in any order.
 */
static void vfio_container_get(struct vfio_container *container)
{
	kref_get(&container->kref);
}

static void vfio_container_release(struct kref *kref)
{
	struct vfio_container *container;
	container = container_of(kref, struct vfio_container, kref);

	kfree(container);
}

static void vfio_container_put(struct vfio_container *container)
{
	kref_put(&container->kref, vfio_container_release);
}

/*
 * Group objects - create, release, get, put, search
 */
static struct vfio_group *
__vfio_group_get_from_iommu(struct iommu_group *iommu_group)
{
	struct vfio_group *group;

	list_for_each_entry(group, &vfio.group_list, vfio_next) {
		if (group->iommu_group == iommu_group) {
			vfio_group_get(group);
			return group;
		}
	}
	return NULL;
}

static struct vfio_group *
vfio_group_get_from_iommu(struct iommu_group *iommu_group)
{
	struct vfio_group *group;

	mutex_lock(&vfio.group_lock);
	group = __vfio_group_get_from_iommu(iommu_group);
	mutex_unlock(&vfio.group_lock);
	return group;
}

static void vfio_group_release(struct device *dev)
{
	struct vfio_group *group = container_of(dev, struct vfio_group, dev);

	mutex_destroy(&group->device_lock);
	iommu_group_put(group->iommu_group);
	ida_free(&vfio.group_ida, MINOR(group->dev.devt));
	kfree(group);
}

static struct vfio_group *vfio_group_alloc(struct iommu_group *iommu_group,
					   enum vfio_group_type type)
{
	struct vfio_group *group;
	int minor;

	group = kzalloc(sizeof(*group), GFP_KERNEL);
	if (!group)
		return ERR_PTR(-ENOMEM);

	minor = ida_alloc_max(&vfio.group_ida, MINORMASK, GFP_KERNEL);
	if (minor < 0) {
		kfree(group);
		return ERR_PTR(minor);
	}

	device_initialize(&group->dev);
	group->dev.devt = MKDEV(MAJOR(vfio.group_devt), minor);
	group->dev.class = vfio.class;
	group->dev.release = vfio_group_release;
	cdev_init(&group->cdev, &vfio_group_fops);
	group->cdev.owner = THIS_MODULE;

	refcount_set(&group->users, 1);
	init_rwsem(&group->group_rwsem);
	INIT_LIST_HEAD(&group->device_list);
	mutex_init(&group->device_lock);
	group->iommu_group = iommu_group;
	/* put in vfio_group_release() */
	iommu_group_ref_get(iommu_group);
	group->type = type;
	BLOCKING_INIT_NOTIFIER_HEAD(&group->notifier);

	return group;
}

static struct vfio_group *vfio_create_group(struct iommu_group *iommu_group,
		enum vfio_group_type type)
{
	struct vfio_group *group;
	struct vfio_group *ret;
	int err;

	group = vfio_group_alloc(iommu_group, type);
	if (IS_ERR(group))
		return group;

	err = dev_set_name(&group->dev, "%s%d",
			   group->type == VFIO_NO_IOMMU ? "noiommu-" : "",
			   iommu_group_id(iommu_group));
	if (err) {
		ret = ERR_PTR(err);
		goto err_put;
	}

	mutex_lock(&vfio.group_lock);

	/* Did we race creating this group? */
	ret = __vfio_group_get_from_iommu(iommu_group);
	if (ret)
		goto err_unlock;

	err = cdev_device_add(&group->cdev, &group->dev);
	if (err) {
		ret = ERR_PTR(err);
		goto err_unlock;
	}

	list_add(&group->vfio_next, &vfio.group_list);

	mutex_unlock(&vfio.group_lock);
	return group;

err_unlock:
	mutex_unlock(&vfio.group_lock);
err_put:
	put_device(&group->dev);
	return ret;
}

static void vfio_group_put(struct vfio_group *group)
{
	if (!refcount_dec_and_mutex_lock(&group->users, &vfio.group_lock))
		return;

	/*
	 * These data structures all have paired operations that can only be
	 * undone when the caller holds a live reference on the group. Since all
	 * pairs must be undone these WARN_ON's indicate some caller did not
	 * properly hold the group reference.
	 */
	WARN_ON(!list_empty(&group->device_list));
	WARN_ON(group->container || group->container_users);
	WARN_ON(group->notifier.head);

	list_del(&group->vfio_next);
	cdev_device_del(&group->cdev, &group->dev);
	mutex_unlock(&vfio.group_lock);

	put_device(&group->dev);
}

static void vfio_group_get(struct vfio_group *group)
{
	refcount_inc(&group->users);
}

/*
 * Device objects - create, release, get, put, search
 */
/* Device reference always implies a group reference */
static void vfio_device_put(struct vfio_device *device)
{
	if (refcount_dec_and_test(&device->refcount))
		complete(&device->comp);
}

static bool vfio_device_try_get(struct vfio_device *device)
{
	return refcount_inc_not_zero(&device->refcount);
}

static struct vfio_device *vfio_group_get_device(struct vfio_group *group,
						 struct device *dev)
{
	struct vfio_device *device;

	mutex_lock(&group->device_lock);
	list_for_each_entry(device, &group->device_list, group_next) {
		if (device->dev == dev && vfio_device_try_get(device)) {
			mutex_unlock(&group->device_lock);
			return device;
		}
	}
	mutex_unlock(&group->device_lock);
	return NULL;
}

/*
 * VFIO driver API
 */
void vfio_init_group_dev(struct vfio_device *device, struct device *dev,
			 const struct vfio_device_ops *ops)
{
	init_completion(&device->comp);
	device->dev = dev;
	device->ops = ops;
}
EXPORT_SYMBOL_GPL(vfio_init_group_dev);

void vfio_uninit_group_dev(struct vfio_device *device)
{
	vfio_release_device_set(device);
}
EXPORT_SYMBOL_GPL(vfio_uninit_group_dev);

static struct vfio_group *vfio_noiommu_group_alloc(struct device *dev,
		enum vfio_group_type type)
{
	struct iommu_group *iommu_group;
	struct vfio_group *group;
	int ret;

	iommu_group = iommu_group_alloc();
	if (IS_ERR(iommu_group))
		return ERR_CAST(iommu_group);

	iommu_group_set_name(iommu_group, "vfio-noiommu");
	ret = iommu_group_add_device(iommu_group, dev);
	if (ret)
		goto out_put_group;

	group = vfio_create_group(iommu_group, type);
	if (IS_ERR(group)) {
		ret = PTR_ERR(group);
		goto out_remove_device;
	}
	iommu_group_put(iommu_group);
	return group;

out_remove_device:
	iommu_group_remove_device(dev);
out_put_group:
	iommu_group_put(iommu_group);
	return ERR_PTR(ret);
}

static struct vfio_group *vfio_group_find_or_alloc(struct device *dev)
{
	struct iommu_group *iommu_group;
	struct vfio_group *group;

	iommu_group = iommu_group_get(dev);
#ifdef CONFIG_VFIO_NOIOMMU
	if (!iommu_group && noiommu) {
		/*
		 * With noiommu enabled, create an IOMMU group for devices that
		 * don't already have one, implying no IOMMU hardware/driver
		 * exists.  Taint the kernel because we're about to give a DMA
		 * capable device to a user without IOMMU protection.
		 */
		group = vfio_noiommu_group_alloc(dev, VFIO_NO_IOMMU);
		if (!IS_ERR(group)) {
			add_taint(TAINT_USER, LOCKDEP_STILL_OK);
			dev_warn(dev, "Adding kernel taint for vfio-noiommu group on device\n");
		}
		return group;
	}
#endif
	if (!iommu_group)
		return ERR_PTR(-EINVAL);

	group = vfio_group_get_from_iommu(iommu_group);
	if (!group)
		group = vfio_create_group(iommu_group, VFIO_IOMMU);

	/* The vfio_group holds a reference to the iommu_group */
	iommu_group_put(iommu_group);
	return group;
}

static int __vfio_register_dev(struct vfio_device *device,
		struct vfio_group *group)
{
	struct vfio_device *existing_device;

	if (IS_ERR(group))
		return PTR_ERR(group);

	/*
	 * If the driver doesn't specify a set then the device is added to a
	 * singleton set just for itself.
	 */
	if (!device->dev_set)
		vfio_assign_device_set(device, device);

	existing_device = vfio_group_get_device(group, device->dev);
	if (existing_device) {
		dev_WARN(device->dev, "Device already exists on group %d\n",
			 iommu_group_id(group->iommu_group));
		vfio_device_put(existing_device);
		if (group->type == VFIO_NO_IOMMU ||
		    group->type == VFIO_EMULATED_IOMMU)
			iommu_group_remove_device(device->dev);
		vfio_group_put(group);
		return -EBUSY;
	}

	/* Our reference on group is moved to the device */
	device->group = group;

	/* Refcounting can't start until the driver calls register */
	refcount_set(&device->refcount, 1);

	mutex_lock(&group->device_lock);
	list_add(&device->group_next, &group->device_list);
	group->dev_counter++;
	mutex_unlock(&group->device_lock);

	return 0;
}

int vfio_register_group_dev(struct vfio_device *device)
{
	/*
	 * VFIO always sets IOMMU_CACHE because we offer no way for userspace to
	 * restore cache coherency.
	 */
	if (!iommu_capable(device->dev->bus, IOMMU_CAP_CACHE_COHERENCY))
		return -EINVAL;

	return __vfio_register_dev(device,
		vfio_group_find_or_alloc(device->dev));
}
EXPORT_SYMBOL_GPL(vfio_register_group_dev);
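/*
 * Usage sketch (illustrative, not from this file): a bus driver typically
 * embeds a struct vfio_device, initializes it, then registers it from its
 * probe callback.  All names below are hypothetical:
 *
 *	static int my_probe(struct pci_dev *pdev, const struct pci_device_id *id)
 *	{
 *		struct my_device *mydev;
 *		int ret;
 *
 *		mydev = kzalloc(sizeof(*mydev), GFP_KERNEL);
 *		if (!mydev)
 *			return -ENOMEM;
 *		vfio_init_group_dev(&mydev->vdev, &pdev->dev, &my_vfio_ops);
 *		ret = vfio_register_group_dev(&mydev->vdev);
 *		if (ret) {
 *			vfio_uninit_group_dev(&mydev->vdev);
 *			kfree(mydev);
 *		}
 *		return ret;
 *	}
 */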

/*
 * Register a virtual device without IOMMU backing.  The user of this
 * device must not be able to directly trigger unmediated DMA.
 */
int vfio_register_emulated_iommu_dev(struct vfio_device *device)
{
	return __vfio_register_dev(device,
		vfio_noiommu_group_alloc(device->dev, VFIO_EMULATED_IOMMU));
}
EXPORT_SYMBOL_GPL(vfio_register_emulated_iommu_dev);

static struct vfio_device *vfio_device_get_from_name(struct vfio_group *group,
						     char *buf)
{
	struct vfio_device *it, *device = ERR_PTR(-ENODEV);

	mutex_lock(&group->device_lock);
	list_for_each_entry(it, &group->device_list, group_next) {
		int ret;

		if (it->ops->match) {
			ret = it->ops->match(it, buf);
			if (ret < 0) {
				device = ERR_PTR(ret);
				break;
			}
		} else {
			ret = !strcmp(dev_name(it->dev), buf);
		}

		if (ret && vfio_device_try_get(it)) {
			device = it;
			break;
		}
	}
	mutex_unlock(&group->device_lock);

	return device;
}

/*
 * Decrement the device reference count and wait for the device to be
 * removed.  Open file descriptors for the device... */
void vfio_unregister_group_dev(struct vfio_device *device)
{
	struct vfio_group *group = device->group;
	unsigned int i = 0;
	bool interrupted = false;
	long rc;

	vfio_device_put(device);
	rc = try_wait_for_completion(&device->comp);
	while (rc <= 0) {
		if (device->ops->request)
			device->ops->request(device, i++);

		if (interrupted) {
			rc = wait_for_completion_timeout(&device->comp,
							 HZ * 10);
		} else {
			rc = wait_for_completion_interruptible_timeout(
				&device->comp, HZ * 10);
			if (rc < 0) {
				interrupted = true;
				dev_warn(device->dev,
					 "Device is currently in use, task"
					 " \"%s\" (%d) "
					 "blocked until device is released",
					 current->comm, task_pid_nr(current));
			}
		}
	}

	mutex_lock(&group->device_lock);
	list_del(&device->group_next);
	group->dev_counter--;
	mutex_unlock(&group->device_lock);

	if (group->type == VFIO_NO_IOMMU || group->type == VFIO_EMULATED_IOMMU)
		iommu_group_remove_device(device->dev);

	/* Matches the get in vfio_register_group_dev() */
	vfio_group_put(group);
}
EXPORT_SYMBOL_GPL(vfio_unregister_group_dev);
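/*
 * Usage sketch (illustrative): the mirror of the probe sequence above.  A
 * remove callback blocks in vfio_unregister_group_dev() until all users drop
 * their references, then tears the device down (names hypothetical):
 *
 *	static void my_remove(struct pci_dev *pdev)
 *	{
 *		struct my_device *mydev = pci_get_drvdata(pdev);
 *
 *		vfio_unregister_group_dev(&mydev->vdev);	// may sleep
 *		vfio_uninit_group_dev(&mydev->vdev);
 *		kfree(mydev);
 *	}
 */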

/*
 * VFIO base fd, /dev/vfio/vfio
 */
static long vfio_ioctl_check_extension(struct vfio_container *container,
				       unsigned long arg)
{
	struct vfio_iommu_driver *driver;
	long ret = 0;

	down_read(&container->group_lock);

	driver = container->iommu_driver;

	switch (arg) {
		/* No base extensions yet */
	default:
		/*
		 * If no driver is set, poll all registered drivers for
		 * extensions and return the first positive result.  If
		 * a driver is already set, further queries will be passed
		 * only to that driver.
		 */
		if (!driver) {
			mutex_lock(&vfio.iommu_drivers_lock);
			list_for_each_entry(driver, &vfio.iommu_drivers_list,
					    vfio_next) {

				if (!list_empty(&container->group_list) &&
				    !vfio_iommu_driver_allowed(container,
							       driver))
					continue;
				if (!try_module_get(driver->ops->owner))
					continue;

				ret = driver->ops->ioctl(NULL,
							 VFIO_CHECK_EXTENSION,
							 arg);
				module_put(driver->ops->owner);
				if (ret > 0)
					break;
			}
			mutex_unlock(&vfio.iommu_drivers_lock);
		} else
			ret = driver->ops->ioctl(container->iommu_data,
						 VFIO_CHECK_EXTENSION, arg);
	}

	up_read(&container->group_lock);

	return ret;
}

/* hold write lock on container->group_lock */
static int __vfio_container_attach_groups(struct vfio_container *container,
					  struct vfio_iommu_driver *driver,
					  void *data)
{
	struct vfio_group *group;
	int ret = -ENODEV;

	list_for_each_entry(group, &container->group_list, container_next) {
		ret = driver->ops->attach_group(data, group->iommu_group,
						group->type);
		if (ret)
			goto unwind;
	}

	return ret;

unwind:
	list_for_each_entry_continue_reverse(group, &container->group_list,
					     container_next) {
		driver->ops->detach_group(data, group->iommu_group);
	}

	return ret;
}

static long vfio_ioctl_set_iommu(struct vfio_container *container,
				 unsigned long arg)
{
	struct vfio_iommu_driver *driver;
	long ret = -ENODEV;

	down_write(&container->group_lock);

	/*
	 * The container is designed to be an unprivileged interface while
	 * the group can be assigned to specific users.  Therefore, only by
	 * adding a group to a container does the user get the privilege of
	 * enabling the iommu, which may allocate finite resources.  There
	 * is no unset_iommu, but by removing all the groups from a container,
	 * the container is deprivileged and returns to an unset state.
	 */
	if (list_empty(&container->group_list) || container->iommu_driver) {
		up_write(&container->group_lock);
		return -EINVAL;
	}

	mutex_lock(&vfio.iommu_drivers_lock);
	list_for_each_entry(driver, &vfio.iommu_drivers_list, vfio_next) {
		void *data;

		if (!vfio_iommu_driver_allowed(container, driver))
			continue;
		if (!try_module_get(driver->ops->owner))
			continue;

		/*
		 * The arg magic for SET_IOMMU is the same as CHECK_EXTENSION,
		 * so test which iommu driver reported support for this
		 * extension and call open on them.  We also pass them the
		 * magic, allowing a single driver to support multiple
		 * interfaces if they'd like.
		 */
		if (driver->ops->ioctl(NULL, VFIO_CHECK_EXTENSION, arg) <= 0) {
			module_put(driver->ops->owner);
			continue;
		}

		data = driver->ops->open(arg);
		if (IS_ERR(data)) {
			ret = PTR_ERR(data);
			module_put(driver->ops->owner);
			continue;
		}

		ret = __vfio_container_attach_groups(container, driver, data);
		if (ret) {
			driver->ops->release(data);
			module_put(driver->ops->owner);
			continue;
		}

		container->iommu_driver = driver;
		container->iommu_data = data;
		break;
	}

	mutex_unlock(&vfio.iommu_drivers_lock);
	up_write(&container->group_lock);

	return ret;
}
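/*
 * Userspace usage sketch (illustrative, assuming the type1 backend is
 * present; see Documentation/driver-api/vfio.rst for the full flow):
 *
 *	int container = open("/dev/vfio/vfio", O_RDWR);
 *
 *	if (ioctl(container, VFIO_GET_API_VERSION) != VFIO_API_VERSION)
 *		// unknown API version
 *	if (!ioctl(container, VFIO_CHECK_EXTENSION, VFIO_TYPE1_IOMMU))
 *		// type1 backend not supported
 *
 *	// SET_IOMMU only succeeds once a group has been attached below
 *	ioctl(container, VFIO_SET_IOMMU, VFIO_TYPE1_IOMMU);
 */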

static long vfio_fops_unl_ioctl(struct file *filep,
				unsigned int cmd, unsigned long arg)
{
	struct vfio_container *container = filep->private_data;
	struct vfio_iommu_driver *driver;
	void *data;
	long ret = -EINVAL;

	if (!container)
		return ret;

	switch (cmd) {
	case VFIO_GET_API_VERSION:
		ret = VFIO_API_VERSION;
		break;
	case VFIO_CHECK_EXTENSION:
		ret = vfio_ioctl_check_extension(container, arg);
		break;
	case VFIO_SET_IOMMU:
		ret = vfio_ioctl_set_iommu(container, arg);
		break;
	default:
		driver = container->iommu_driver;
		data = container->iommu_data;

		if (driver) /* passthrough all unrecognized ioctls */
			ret = driver->ops->ioctl(data, cmd, arg);
	}

	return ret;
}

static int vfio_fops_open(struct inode *inode, struct file *filep)
{
	struct vfio_container *container;

	container = kzalloc(sizeof(*container), GFP_KERNEL);
	if (!container)
		return -ENOMEM;

	INIT_LIST_HEAD(&container->group_list);
	init_rwsem(&container->group_lock);
	kref_init(&container->kref);

	filep->private_data = container;

	return 0;
}

static int vfio_fops_release(struct inode *inode, struct file *filep)
{
	struct vfio_container *container = filep->private_data;
	struct vfio_iommu_driver *driver = container->iommu_driver;

	if (driver && driver->ops->notify)
		driver->ops->notify(container->iommu_data,
				    VFIO_IOMMU_CONTAINER_CLOSE);

	filep->private_data = NULL;

	vfio_container_put(container);

	return 0;
}

static const struct file_operations vfio_fops = {
	.owner		= THIS_MODULE,
	.open		= vfio_fops_open,
	.release	= vfio_fops_release,
	.unlocked_ioctl	= vfio_fops_unl_ioctl,
	.compat_ioctl	= compat_ptr_ioctl,
};

/*
 * VFIO Group fd, /dev/vfio/$GROUP
 */
static void __vfio_group_unset_container(struct vfio_group *group)
{
	struct vfio_container *container = group->container;
	struct vfio_iommu_driver *driver;

	lockdep_assert_held_write(&group->group_rwsem);

	down_write(&container->group_lock);

	driver = container->iommu_driver;
	if (driver)
		driver->ops->detach_group(container->iommu_data,
					  group->iommu_group);

	if (group->type == VFIO_IOMMU)
		iommu_group_release_dma_owner(group->iommu_group);

	group->container = NULL;
	group->container_users = 0;
	list_del(&group->container_next);

	/* Detaching the last group deprivileges a container, remove iommu */
	if (driver && list_empty(&container->group_list)) {
		driver->ops->release(container->iommu_data);
		module_put(driver->ops->owner);
		container->iommu_driver = NULL;
		container->iommu_data = NULL;
	}

	up_write(&container->group_lock);

	vfio_container_put(container);
}

/*
 * VFIO_GROUP_UNSET_CONTAINER should fail if there are other users or
 * if there was no container to unset.  Since the ioctl is called on
 * the group, we know that still exists, therefore the only valid
 * transition here is 1->0.
 */
static int vfio_group_unset_container(struct vfio_group *group)
{
	lockdep_assert_held_write(&group->group_rwsem);

	if (!group->container)
		return -EINVAL;
	if (group->container_users != 1)
		return -EBUSY;
	__vfio_group_unset_container(group);
	return 0;
}

static int vfio_group_set_container(struct vfio_group *group, int container_fd)
{
	struct fd f;
	struct vfio_container *container;
	struct vfio_iommu_driver *driver;
	int ret = 0;

	lockdep_assert_held_write(&group->group_rwsem);

	if (group->container || WARN_ON(group->container_users))
		return -EINVAL;

	if (group->type == VFIO_NO_IOMMU && !capable(CAP_SYS_RAWIO))
		return -EPERM;

	f = fdget(container_fd);
	if (!f.file)
		return -EBADF;

	/* Sanity check, is this really our fd? */
	if (f.file->f_op != &vfio_fops) {
		fdput(f);
		return -EINVAL;
	}

	container = f.file->private_data;
	WARN_ON(!container); /* fget ensures we don't race vfio_release */

	down_write(&container->group_lock);

	/* Real groups and fake groups cannot mix */
	if (!list_empty(&container->group_list) &&
	    container->noiommu != (group->type == VFIO_NO_IOMMU)) {
		ret = -EPERM;
		goto unlock_out;
	}

	if (group->type == VFIO_IOMMU) {
		ret = iommu_group_claim_dma_owner(group->iommu_group, f.file);
		if (ret)
			goto unlock_out;
	}

	driver = container->iommu_driver;
	if (driver) {
		ret = driver->ops->attach_group(container->iommu_data,
						group->iommu_group,
						group->type);
		if (ret) {
			if (group->type == VFIO_IOMMU)
				iommu_group_release_dma_owner(
					group->iommu_group);
			goto unlock_out;
		}
	}

	group->container = container;
	group->container_users = 1;
	container->noiommu = (group->type == VFIO_NO_IOMMU);
	list_add(&group->container_next, &container->group_list);

	/* Get a reference on the container and mark a user within the group */
	vfio_container_get(container);

unlock_out:
	up_write(&container->group_lock);
	fdput(f);
	return ret;
}

static const struct file_operations vfio_device_fops;

/* true if the vfio_device has open_device() called but not close_device() */
static bool vfio_assert_device_open(struct vfio_device *device)
{
	return !WARN_ON_ONCE(!READ_ONCE(device->open_count));
}

static int vfio_device_assign_container(struct vfio_device *device)
{
	struct vfio_group *group = device->group;

	lockdep_assert_held_write(&group->group_rwsem);

	if (!group->container || !group->container->iommu_driver ||
	    WARN_ON(!group->container_users))
		return -EINVAL;

	if (group->type == VFIO_NO_IOMMU && !capable(CAP_SYS_RAWIO))
		return -EPERM;

	get_file(group->opened_file);
	group->container_users++;
	return 0;
}

static void vfio_device_unassign_container(struct vfio_device *device)
{
	down_write(&device->group->group_rwsem);
	WARN_ON(device->group->container_users <= 1);
	device->group->container_users--;
	fput(device->group->opened_file);
	up_write(&device->group->group_rwsem);
}

static struct file *vfio_device_open(struct vfio_device *device)
{
	struct file *filep;
	int ret;

	down_write(&device->group->group_rwsem);
	ret = vfio_device_assign_container(device);
	up_write(&device->group->group_rwsem);
	if (ret)
		return ERR_PTR(ret);

	if (!try_module_get(device->dev->driver->owner)) {
		ret = -ENODEV;
		goto err_unassign_container;
	}

	mutex_lock(&device->dev_set->lock);
	device->open_count++;
	if (device->open_count == 1) {
		/*
		 * Here we pass the KVM pointer with the group under the read
		 * lock.  If the device driver will use it, it must obtain a
		 * reference and release it during close_device.
		 */
		down_read(&device->group->group_rwsem);
		device->kvm = device->group->kvm;

		if (device->ops->open_device) {
			ret = device->ops->open_device(device);
			if (ret)
				goto err_undo_count;
		}
		up_read(&device->group->group_rwsem);
	}
	mutex_unlock(&device->dev_set->lock);

	/*
	 * We can't use anon_inode_getfd() because we need to modify
	 * the f_mode flags directly to allow more than just ioctls
	 */
	filep = anon_inode_getfile("[vfio-device]", &vfio_device_fops,
				   device, O_RDWR);
	if (IS_ERR(filep)) {
		ret = PTR_ERR(filep);
		goto err_close_device;
	}

	/*
	 * TODO: add an anon_inode interface to do this.
	 * Appears to be missing by lack of need rather than
	 * explicitly prevented.  Now there's need.
	 */
	filep->f_mode |= (FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE);

	if (device->group->type == VFIO_NO_IOMMU)
		dev_warn(device->dev, "vfio-noiommu device opened by user "
			 "(%s:%d)\n", current->comm, task_pid_nr(current));
	/*
	 * On success the ref of device is moved to the file and
	 * put in vfio_device_fops_release()
	 */
	return filep;

err_close_device:
	mutex_lock(&device->dev_set->lock);
	down_read(&device->group->group_rwsem);
	if (device->open_count == 1 && device->ops->close_device)
		device->ops->close_device(device);
err_undo_count:
	device->open_count--;
	if (device->open_count == 0 && device->kvm)
		device->kvm = NULL;
	up_read(&device->group->group_rwsem);
	mutex_unlock(&device->dev_set->lock);
	module_put(device->dev->driver->owner);
err_unassign_container:
	vfio_device_unassign_container(device);
	return ERR_PTR(ret);
}

static int vfio_group_get_device_fd(struct vfio_group *group, char *buf)
{
	struct vfio_device *device;
	struct file *filep;
	int fdno;
	int ret;

	device = vfio_device_get_from_name(group, buf);
	if (IS_ERR(device))
		return PTR_ERR(device);

	fdno = get_unused_fd_flags(O_CLOEXEC);
	if (fdno < 0) {
		ret = fdno;
		goto err_put_device;
	}

	filep = vfio_device_open(device);
	if (IS_ERR(filep)) {
		ret = PTR_ERR(filep);
		goto err_put_fdno;
	}

	fd_install(fdno, filep);
	return fdno;

err_put_fdno:
	put_unused_fd(fdno);
err_put_device:
	vfio_device_put(device);
	return ret;
}
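/*
 * Userspace usage sketch (illustrative; group number and device name vary):
 * the group fd ties the pieces together, and the name passed to
 * VFIO_GROUP_GET_DEVICE_FD is matched by vfio_device_get_from_name() above:
 *
 *	int group = open("/dev/vfio/26", O_RDWR);
 *
 *	struct vfio_group_status status = { .argsz = sizeof(status) };
 *	ioctl(group, VFIO_GROUP_GET_STATUS, &status);
 *	if (!(status.flags & VFIO_GROUP_FLAGS_VIABLE))
 *		// not all devices in the group are bound to vfio drivers
 *
 *	ioctl(group, VFIO_GROUP_SET_CONTAINER, &container);
 *	ioctl(container, VFIO_SET_IOMMU, VFIO_TYPE1_IOMMU);
 *	int device = ioctl(group, VFIO_GROUP_GET_DEVICE_FD, "0000:06:0d.0");
 */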

static long vfio_group_fops_unl_ioctl(struct file *filep,
				      unsigned int cmd, unsigned long arg)
{
	struct vfio_group *group = filep->private_data;
	long ret = -ENOTTY;

	switch (cmd) {
	case VFIO_GROUP_GET_STATUS:
	{
		struct vfio_group_status status;
		unsigned long minsz;

		minsz = offsetofend(struct vfio_group_status, flags);

		if (copy_from_user(&status, (void __user *)arg, minsz))
			return -EFAULT;

		if (status.argsz < minsz)
			return -EINVAL;

		status.flags = 0;

		down_read(&group->group_rwsem);
		if (group->container)
			status.flags |= VFIO_GROUP_FLAGS_CONTAINER_SET |
					VFIO_GROUP_FLAGS_VIABLE;
		else if (!iommu_group_dma_owner_claimed(group->iommu_group))
			status.flags |= VFIO_GROUP_FLAGS_VIABLE;
		up_read(&group->group_rwsem);

		if (copy_to_user((void __user *)arg, &status, minsz))
			return -EFAULT;

		ret = 0;
		break;
	}
	case VFIO_GROUP_SET_CONTAINER:
	{
		int fd;

		if (get_user(fd, (int __user *)arg))
			return -EFAULT;

		if (fd < 0)
			return -EINVAL;

		down_write(&group->group_rwsem);
		ret = vfio_group_set_container(group, fd);
		up_write(&group->group_rwsem);
		break;
	}
	case VFIO_GROUP_UNSET_CONTAINER:
		down_write(&group->group_rwsem);
		ret = vfio_group_unset_container(group);
		up_write(&group->group_rwsem);
		break;
	case VFIO_GROUP_GET_DEVICE_FD:
	{
		char *buf;

		buf = strndup_user((const char __user *)arg, PAGE_SIZE);
		if (IS_ERR(buf))
			return PTR_ERR(buf);

		ret = vfio_group_get_device_fd(group, buf);
		kfree(buf);
		break;
	}
	}

	return ret;
}

static int vfio_group_fops_open(struct inode *inode, struct file *filep)
{
	struct vfio_group *group =
		container_of(inode->i_cdev, struct vfio_group, cdev);
	int ret;

	down_write(&group->group_rwsem);

	/* users can be zero if this races with vfio_group_put() */
	if (!refcount_inc_not_zero(&group->users)) {
		ret = -ENODEV;
		goto err_unlock;
	}

	if (group->type == VFIO_NO_IOMMU && !capable(CAP_SYS_RAWIO)) {
		ret = -EPERM;
		goto err_put;
	}

	/*
	 * Do we need multiple instances of the group open?  Seems not.
	 */
	if (group->opened_file) {
		ret = -EBUSY;
		goto err_put;
	}
	group->opened_file = filep;
	filep->private_data = group;

	up_write(&group->group_rwsem);
	return 0;
err_put:
	vfio_group_put(group);
err_unlock:
	up_write(&group->group_rwsem);
	return ret;
}

static int vfio_group_fops_release(struct inode *inode, struct file *filep)
{
	struct vfio_group *group = filep->private_data;

	filep->private_data = NULL;

	down_write(&group->group_rwsem);
	/*
	 * Device FDs hold a group file reference, therefore the group release
	 * is only called when there are no open devices.
	 */
	WARN_ON(group->notifier.head);
	if (group->container) {
		WARN_ON(group->container_users != 1);
		__vfio_group_unset_container(group);
	}
	group->opened_file = NULL;
	up_write(&group->group_rwsem);

	vfio_group_put(group);

	return 0;
}

static const struct file_operations vfio_group_fops = {
	.owner		= THIS_MODULE,
	.unlocked_ioctl	= vfio_group_fops_unl_ioctl,
	.compat_ioctl	= compat_ptr_ioctl,
	.open		= vfio_group_fops_open,
	.release	= vfio_group_fops_release,
};

/*
 * VFIO Device fd
 */
static int vfio_device_fops_release(struct inode *inode, struct file *filep)
{
	struct vfio_device *device = filep->private_data;

	mutex_lock(&device->dev_set->lock);
	vfio_assert_device_open(device);
	down_read(&device->group->group_rwsem);
	if (device->open_count == 1 && device->ops->close_device)
		device->ops->close_device(device);
	up_read(&device->group->group_rwsem);
	device->open_count--;
	if (device->open_count == 0)
		device->kvm = NULL;
	mutex_unlock(&device->dev_set->lock);

	module_put(device->dev->driver->owner);

	vfio_device_unassign_container(device);

	vfio_device_put(device);

	return 0;
}

/*
 * vfio_mig_get_next_state - Compute the next step in the FSM
 * @cur_fsm - The current state the device is in
 * @new_fsm - The target state to reach
 * @next_fsm - Pointer to the next step to get to new_fsm
 *
 * Return 0 upon success, otherwise -errno
 * Upon success the next step in the state progression between cur_fsm and
 * new_fsm will be set in next_fsm.
 *
 * This breaks down requests for combination transitions into smaller steps and
 * returns the next step to get to new_fsm. The function may need to be called
 * multiple times before reaching new_fsm.
 *
 */
int vfio_mig_get_next_state(struct vfio_device *device,
			    enum vfio_device_mig_state cur_fsm,
			    enum vfio_device_mig_state new_fsm,
			    enum vfio_device_mig_state *next_fsm)
{
	enum { VFIO_DEVICE_NUM_STATES = VFIO_DEVICE_STATE_RUNNING_P2P + 1 };
	/*
	 * The coding in this table requires the driver to implement the
	 * following FSM arcs:
	 *         RESUMING -> STOP
	 *         STOP -> RESUMING
	 *         STOP -> STOP_COPY
	 *         STOP_COPY -> STOP
	 *
	 * If P2P is supported then the driver must also implement these FSM
	 * arcs:
	 *         RUNNING -> RUNNING_P2P
	 *         RUNNING_P2P -> RUNNING
	 *         RUNNING_P2P -> STOP
	 *         STOP -> RUNNING_P2P
	 * Without P2P the driver must implement:
	 *         RUNNING -> STOP
	 *         STOP -> RUNNING
	 *
	 * The coding will step through multiple states for some combination
	 * transitions; if all optional features are supported, this means the
	 * following ones:
	 *         RESUMING -> STOP -> RUNNING_P2P
	 *         RESUMING -> STOP -> RUNNING_P2P -> RUNNING
	 *         RESUMING -> STOP -> STOP_COPY
	 *         RUNNING -> RUNNING_P2P -> STOP
	 *         RUNNING -> RUNNING_P2P -> STOP -> RESUMING
	 *         RUNNING -> RUNNING_P2P -> STOP -> STOP_COPY
	 *         RUNNING_P2P -> STOP -> RESUMING
	 *         RUNNING_P2P -> STOP -> STOP_COPY
	 *         STOP -> RUNNING_P2P -> RUNNING
	 *         STOP_COPY -> STOP -> RESUMING
	 *         STOP_COPY -> STOP -> RUNNING_P2P
	 *         STOP_COPY -> STOP -> RUNNING_P2P -> RUNNING
	 */
	static const u8 vfio_from_fsm_table[VFIO_DEVICE_NUM_STATES][VFIO_DEVICE_NUM_STATES] = {
		[VFIO_DEVICE_STATE_STOP] = {
			[VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_STOP,
			[VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_RUNNING_P2P,
			[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_STOP_COPY,
			[VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_RESUMING,
			[VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P,
			[VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
		},
		[VFIO_DEVICE_STATE_RUNNING] = {
			[VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_RUNNING_P2P,
			[VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_RUNNING,
			[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_RUNNING_P2P,
			[VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_RUNNING_P2P,
			[VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P,
			[VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
		},
		[VFIO_DEVICE_STATE_STOP_COPY] = {
			[VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_STOP,
			[VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_STOP,
			[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_STOP_COPY,
			[VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_STOP,
			[VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_STOP,
			[VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
		},
		[VFIO_DEVICE_STATE_RESUMING] = {
			[VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_STOP,
			[VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_STOP,
			[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_STOP,
			[VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_RESUMING,
			[VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_STOP,
			[VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
		},
		[VFIO_DEVICE_STATE_RUNNING_P2P] = {
			[VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_STOP,
			[VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_RUNNING,
			[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_STOP,
			[VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_STOP,
			[VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P,
			[VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
		},
		[VFIO_DEVICE_STATE_ERROR] = {
			[VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_ERROR,
			[VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_ERROR,
			[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_ERROR,
			[VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_ERROR,
			[VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_ERROR,
			[VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
		},
	};

	static const unsigned int state_flags_table[VFIO_DEVICE_NUM_STATES] = {
		[VFIO_DEVICE_STATE_STOP] = VFIO_MIGRATION_STOP_COPY,
		[VFIO_DEVICE_STATE_RUNNING] = VFIO_MIGRATION_STOP_COPY,
		[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_MIGRATION_STOP_COPY,
		[VFIO_DEVICE_STATE_RESUMING] = VFIO_MIGRATION_STOP_COPY,
		[VFIO_DEVICE_STATE_RUNNING_P2P] =
			VFIO_MIGRATION_STOP_COPY | VFIO_MIGRATION_P2P,
		[VFIO_DEVICE_STATE_ERROR] = ~0U,
	};

	if (WARN_ON(cur_fsm >= ARRAY_SIZE(vfio_from_fsm_table) ||
		    (state_flags_table[cur_fsm] & device->migration_flags) !=
			state_flags_table[cur_fsm]))
		return -EINVAL;

	if (new_fsm >= ARRAY_SIZE(vfio_from_fsm_table) ||
	    (state_flags_table[new_fsm] & device->migration_flags) !=
			state_flags_table[new_fsm])
		return -EINVAL;

	/*
	 * Arcs touching optional and unsupported states are skipped over. The
	 * driver will instead see an arc from the original state to the next
	 * logical state, as per the above comment.
	 */
	*next_fsm = vfio_from_fsm_table[cur_fsm][new_fsm];
	while ((state_flags_table[*next_fsm] & device->migration_flags) !=
			state_flags_table[*next_fsm])
		*next_fsm = vfio_from_fsm_table[*next_fsm][new_fsm];

	return (*next_fsm != VFIO_DEVICE_STATE_ERROR) ? 0 : -EINVAL;
}
EXPORT_SYMBOL_GPL(vfio_mig_get_next_state);
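/*
 * Usage sketch (illustrative): a driver's migration_set_state() typically
 * loops on this helper so that a userspace request for a combination arc is
 * executed one supported step at a time (my_hw_step is hypothetical):
 *
 *	while (cur != new) {
 *		enum vfio_device_mig_state next;
 *
 *		ret = vfio_mig_get_next_state(vdev, cur, new, &next);
 *		if (ret)
 *			break;			// invalid transition
 *		ret = my_hw_step(vdev, next);	// device-specific arc
 *		if (ret)
 *			break;
 *		cur = next;
 *	}
 */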

/*
 * Convert the driver's struct file into a FD number and return it to userspace
 */
static int vfio_ioct_mig_return_fd(struct file *filp, void __user *arg,
				   struct vfio_device_feature_mig_state *mig)
{
	int ret;
	int fd;

	fd = get_unused_fd_flags(O_CLOEXEC);
	if (fd < 0) {
		ret = fd;
		goto out_fput;
	}

	mig->data_fd = fd;
	if (copy_to_user(arg, mig, sizeof(*mig))) {
		ret = -EFAULT;
		goto out_put_unused;
	}
	fd_install(fd, filp);
	return 0;

out_put_unused:
	put_unused_fd(fd);
out_fput:
	fput(filp);
	return ret;
}

static int
vfio_ioctl_device_feature_mig_device_state(struct vfio_device *device,
					   u32 flags, void __user *arg,
					   size_t argsz)
{
	size_t minsz =
		offsetofend(struct vfio_device_feature_mig_state, data_fd);
	struct vfio_device_feature_mig_state mig;
	struct file *filp = NULL;
	int ret;

	if (!device->ops->migration_set_state ||
	    !device->ops->migration_get_state)
		return -ENOTTY;

	ret = vfio_check_feature(flags, argsz,
				 VFIO_DEVICE_FEATURE_SET |
				 VFIO_DEVICE_FEATURE_GET,
				 sizeof(mig));
	if (ret != 1)
		return ret;

	if (copy_from_user(&mig, arg, minsz))
		return -EFAULT;

	if (flags & VFIO_DEVICE_FEATURE_GET) {
		enum vfio_device_mig_state curr_state;

		ret = device->ops->migration_get_state(device, &curr_state);
		if (ret)
			return ret;
		mig.device_state = curr_state;
		goto out_copy;
	}

	/* Handle the VFIO_DEVICE_FEATURE_SET */
	filp = device->ops->migration_set_state(device, mig.device_state);
	if (IS_ERR(filp) || !filp)
		goto out_copy;

	return vfio_ioct_mig_return_fd(filp, arg, &mig);
out_copy:
	mig.data_fd = -1;
	if (copy_to_user(arg, &mig, sizeof(mig)))
		return -EFAULT;
	if (IS_ERR(filp))
		return PTR_ERR(filp);
	return 0;
}

static int vfio_ioctl_device_feature_migration(struct vfio_device *device,
					       u32 flags, void __user *arg,
					       size_t argsz)
{
	struct vfio_device_feature_migration mig = {
		.flags = device->migration_flags,
	};
	int ret;

	if (!device->ops->migration_set_state ||
	    !device->ops->migration_get_state)
		return -ENOTTY;

	ret = vfio_check_feature(flags, argsz, VFIO_DEVICE_FEATURE_GET,
				 sizeof(mig));
	if (ret != 1)
		return ret;
	if (copy_to_user(arg, &mig, sizeof(mig)))
		return -EFAULT;
	return 0;
}

static int vfio_ioctl_device_feature(struct vfio_device *device,
				     struct vfio_device_feature __user *arg)
{
	size_t minsz = offsetofend(struct vfio_device_feature, flags);
	struct vfio_device_feature feature;

	if (copy_from_user(&feature, arg, minsz))
		return -EFAULT;

	if (feature.argsz < minsz)
		return -EINVAL;

	/* Check unknown flags */
	if (feature.flags &
	    ~(VFIO_DEVICE_FEATURE_MASK | VFIO_DEVICE_FEATURE_SET |
	      VFIO_DEVICE_FEATURE_GET | VFIO_DEVICE_FEATURE_PROBE))
		return -EINVAL;

	/* GET & SET are mutually exclusive except with PROBE */
	if (!(feature.flags & VFIO_DEVICE_FEATURE_PROBE) &&
	    (feature.flags & VFIO_DEVICE_FEATURE_SET) &&
	    (feature.flags & VFIO_DEVICE_FEATURE_GET))
		return -EINVAL;

	switch (feature.flags & VFIO_DEVICE_FEATURE_MASK) {
	case VFIO_DEVICE_FEATURE_MIGRATION:
		return vfio_ioctl_device_feature_migration(
			device, feature.flags, arg->data,
			feature.argsz - minsz);
	case VFIO_DEVICE_FEATURE_MIG_DEVICE_STATE:
		return vfio_ioctl_device_feature_mig_device_state(
			device, feature.flags, arg->data,
			feature.argsz - minsz);
	default:
		if (unlikely(!device->ops->device_feature))
			return -EINVAL;
		return device->ops->device_feature(device, feature.flags,
						   arg->data,
						   feature.argsz - minsz);
	}
}

static long vfio_device_fops_unl_ioctl(struct file *filep,
				       unsigned int cmd, unsigned long arg)
{
	struct vfio_device *device = filep->private_data;

	switch (cmd) {
	case VFIO_DEVICE_FEATURE:
		return vfio_ioctl_device_feature(device, (void __user *)arg);
	default:
		if (unlikely(!device->ops->ioctl))
			return -EINVAL;
		return device->ops->ioctl(device, cmd, arg);
	}
}

static ssize_t vfio_device_fops_read(struct file *filep, char __user *buf,
				     size_t count, loff_t *ppos)
{
	struct vfio_device *device = filep->private_data;

	if (unlikely(!device->ops->read))
		return -EINVAL;

	return device->ops->read(device, buf, count, ppos);
}

static ssize_t vfio_device_fops_write(struct file *filep,
				      const char __user *buf,
				      size_t count, loff_t *ppos)
{
	struct vfio_device *device = filep->private_data;

	if (unlikely(!device->ops->write))
		return -EINVAL;

	return device->ops->write(device, buf, count, ppos);
}

static int vfio_device_fops_mmap(struct file *filep, struct vm_area_struct *vma)
{
	struct vfio_device *device = filep->private_data;

	if (unlikely(!device->ops->mmap))
		return -EINVAL;

	return device->ops->mmap(device, vma);
}

static const struct file_operations vfio_device_fops = {
	.owner		= THIS_MODULE,
	.release	= vfio_device_fops_release,
	.read		= vfio_device_fops_read,
	.write		= vfio_device_fops_write,
	.unlocked_ioctl	= vfio_device_fops_unl_ioctl,
	.compat_ioctl	= compat_ptr_ioctl,
	.mmap		= vfio_device_fops_mmap,
};
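/*
 * Userspace usage sketch (illustrative): everything past open() on the
 * device fd funnels through the ops above, so region access is plain
 * read()/write() or mmap() at offsets the bus driver describes via
 * VFIO_DEVICE_GET_REGION_INFO:
 *
 *	struct vfio_region_info reg = { .argsz = sizeof(reg), .index = 0 };
 *	ioctl(device, VFIO_DEVICE_GET_REGION_INFO, &reg);
 *	pread(device, buf, reg.size, reg.offset);	// dispatches to ops->read
 */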

/**
 * vfio_file_iommu_group - Return the struct iommu_group for the vfio group file
 * @file: VFIO group file
 *
 * The returned iommu_group is valid as long as a ref is held on the file.
 */
struct iommu_group *vfio_file_iommu_group(struct file *file)
{
	struct vfio_group *group = file->private_data;

	if (file->f_op != &vfio_group_fops)
		return NULL;
	return group->iommu_group;
}
EXPORT_SYMBOL_GPL(vfio_file_iommu_group);

/**
 * vfio_file_enforced_coherent - True if the DMA associated with the VFIO file
 *        is always CPU cache coherent
 * @file: VFIO group file
 *
 * Enforced coherency means that the IOMMU ignores things like the PCIe no-snoop
 * bit in DMA transactions.  A return of false indicates that the user has
 * rights to access additional instructions such as wbinvd on x86.
 */
bool vfio_file_enforced_coherent(struct file *file)
{
	struct vfio_group *group = file->private_data;
	bool ret;

	if (file->f_op != &vfio_group_fops)
		return true;

	down_read(&group->group_rwsem);
	if (group->container) {
		ret = vfio_ioctl_check_extension(group->container,
						 VFIO_DMA_CC_IOMMU);
	} else {
		/*
		 * Since the coherency state is determined only once a container
		 * is attached the user must do so before they can prove they
		 * have permission.
		 */
		ret = true;
	}
	up_read(&group->group_rwsem);
	return ret;
}
EXPORT_SYMBOL_GPL(vfio_file_enforced_coherent);

/**
 * vfio_file_set_kvm - Link a kvm with VFIO drivers
 * @file: VFIO group file
 * @kvm: KVM to link
 *
 * When a VFIO device is first opened the KVM will be available in
 * device->kvm if one was associated with the group.
 */
void vfio_file_set_kvm(struct file *file, struct kvm *kvm)
{
	struct vfio_group *group = file->private_data;

	if (file->f_op != &vfio_group_fops)
		return;

	down_write(&group->group_rwsem);
	group->kvm = kvm;
	up_write(&group->group_rwsem);
}
EXPORT_SYMBOL_GPL(vfio_file_set_kvm);

/**
 * vfio_file_has_dev - True if the VFIO file is a handle for device
 * @file: VFIO file to check
 * @device: Device that must be part of the file
 *
 * Returns true if given file has permission to manipulate the given device.
 */
bool vfio_file_has_dev(struct file *file, struct vfio_device *device)
{
	struct vfio_group *group = file->private_data;

	if (file->f_op != &vfio_group_fops)
		return false;

	return group == device->group;
}
EXPORT_SYMBOL_GPL(vfio_file_has_dev);
88d7ab89 1793
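/*
 * Illustrative sketch, NOT part of this file: how a hypervisor-side
 * consumer might use the vfio_file_* helpers above once userspace has
 * handed it a VFIO group fd (mirroring the general shape of KVM's VFIO
 * pseudo-device).  The function name, the @fd parameter, and dropping
 * the file reference immediately are assumptions made for the example;
 * a real consumer would keep the file reference for as long as it uses
 * the group.
 */
#include <linux/file.h>
#include <linux/vfio.h>

static int example_attach_vfio_group(struct kvm *kvm, int fd)
{
        struct fd f = fdget(fd);
        struct iommu_group *iommu_group;
        int ret = 0;

        if (!f.file)
                return -EBADF;

        /* Reject anything that is not a VFIO group file */
        iommu_group = vfio_file_iommu_group(f.file);
        if (!iommu_group) {
                ret = -EINVAL;
                goto out;
        }

        /*
         * If coherency is not enforced by the IOMMU, the guest may need
         * access to extra instructions such as wbinvd on x86.
         */
        if (!vfio_file_enforced_coherent(f.file))
                pr_info("example: noncoherent DMA, guest may need wbinvd\n");

        /* Make the kvm available to devices opened through this group */
        vfio_file_set_kvm(f.file, kvm);
out:
        fdput(f);
        return ret;
}
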
/*
 * Sub-module support
 */
/*
 * Helper for managing a buffer of info chain capabilities, allocate or
 * reallocate a buffer with additional @size, filling in @id and @version
 * of the capability.  A pointer to the new capability is returned.
 *
 * NB. The chain is based at the head of the buffer, so new entries are
 * added to the tail, vfio_info_cap_shift() should be called to fixup the
 * next offsets prior to copying to the user buffer.
 */
struct vfio_info_cap_header *vfio_info_cap_add(struct vfio_info_cap *caps,
                                               size_t size, u16 id, u16 version)
{
        void *buf;
        struct vfio_info_cap_header *header, *tmp;

        buf = krealloc(caps->buf, caps->size + size, GFP_KERNEL);
        if (!buf) {
                kfree(caps->buf);
                caps->size = 0;
                return ERR_PTR(-ENOMEM);
        }

        caps->buf = buf;
        header = buf + caps->size;

        /* Eventually copied to user buffer, zero */
        memset(header, 0, size);

        header->id = id;
        header->version = version;

        /* Add to the end of the capability chain */
        for (tmp = buf; tmp->next; tmp = buf + tmp->next)
                ; /* nothing */

        tmp->next = caps->size;
        caps->size += size;

        return header;
}
EXPORT_SYMBOL_GPL(vfio_info_cap_add);

void vfio_info_cap_shift(struct vfio_info_cap *caps, size_t offset)
{
        struct vfio_info_cap_header *tmp;
        void *buf = (void *)caps->buf;

        for (tmp = buf; tmp->next; tmp = buf + tmp->next - offset)
                tmp->next += offset;
}
EXPORT_SYMBOL(vfio_info_cap_shift);

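/*
 * Illustrative sketch, NOT part of this file: the canonical pattern a
 * driver's *_GET_INFO ioctl follows with the capability helpers above.
 * The chain is built buffer-relative with vfio_info_cap_add() or
 * vfio_info_add_capability(), then the next offsets are shifted past the
 * fixed-size info struct before the chain is copied out after it.  The
 * function name and taking @info/@caps/@arg from the ioctl handler are
 * assumptions for the example.
 */
#include <linux/slab.h>
#include <linux/uaccess.h>
#include <linux/vfio.h>

static int example_copy_caps(struct vfio_region_info *info,
                             struct vfio_info_cap *caps,
                             void __user *arg)
{
        int ret = 0;

        if (caps->size) {
                info->flags |= VFIO_REGION_INFO_FLAG_CAPS;
                if (info->argsz < sizeof(*info) + caps->size) {
                        /* Tell userspace how much room the chain needs */
                        info->argsz = sizeof(*info) + caps->size;
                        info->cap_offset = 0;
                } else {
                        /* Offsets were buffer-relative until this point */
                        vfio_info_cap_shift(caps, sizeof(*info));
                        if (copy_to_user(arg + sizeof(*info), caps->buf,
                                         caps->size))
                                ret = -EFAULT;
                        else
                                info->cap_offset = sizeof(*info);
                }
                kfree(caps->buf);
        }
        return ret;
}
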
int vfio_info_add_capability(struct vfio_info_cap *caps,
                             struct vfio_info_cap_header *cap, size_t size)
{
        struct vfio_info_cap_header *header;

        header = vfio_info_cap_add(caps, size, cap->id, cap->version);
        if (IS_ERR(header))
                return PTR_ERR(header);

        memcpy(header + 1, cap + 1, size - sizeof(*header));

        return 0;
}
EXPORT_SYMBOL(vfio_info_add_capability);

int vfio_set_irqs_validate_and_prepare(struct vfio_irq_set *hdr, int num_irqs,
                                       int max_irq_type, size_t *data_size)
{
        unsigned long minsz;
        size_t size;

        minsz = offsetofend(struct vfio_irq_set, count);

        if ((hdr->argsz < minsz) || (hdr->index >= max_irq_type) ||
            (hdr->count >= (U32_MAX - hdr->start)) ||
            (hdr->flags & ~(VFIO_IRQ_SET_DATA_TYPE_MASK |
                            VFIO_IRQ_SET_ACTION_TYPE_MASK)))
                return -EINVAL;

        if (data_size)
                *data_size = 0;

        if (hdr->start >= num_irqs || hdr->start + hdr->count > num_irqs)
                return -EINVAL;

        switch (hdr->flags & VFIO_IRQ_SET_DATA_TYPE_MASK) {
        case VFIO_IRQ_SET_DATA_NONE:
                size = 0;
                break;
        case VFIO_IRQ_SET_DATA_BOOL:
                size = sizeof(uint8_t);
                break;
        case VFIO_IRQ_SET_DATA_EVENTFD:
                size = sizeof(int32_t);
                break;
        default:
                return -EINVAL;
        }

        if (size) {
                if (hdr->argsz - minsz < hdr->count * size)
                        return -EINVAL;

                if (!data_size)
                        return -EINVAL;

                *data_size = hdr->count * size;
        }

        return 0;
}
EXPORT_SYMBOL(vfio_set_irqs_validate_and_prepare);

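/*
 * Illustrative sketch, NOT part of this file: how a bus driver's
 * VFIO_DEVICE_SET_IRQS ioctl path typically uses the helper above,
 * loosely mirroring the vfio-pci pattern.  The EXAMPLE_* bounds and
 * example_set_irqs() stub are placeholders for driver-specific values.
 */
#include <linux/string.h>
#include <linux/uaccess.h>
#include <linux/vfio.h>

#define EXAMPLE_NUM_IRQS        4       /* placeholder IRQ count */
#define EXAMPLE_MAX_IRQ_TYPE    1       /* placeholder index bound */

static int example_set_irqs(struct vfio_irq_set *hdr, void *data)
{
        return 0;       /* driver-specific IRQ wiring elided */
}

static int example_ioctl_set_irqs(void __user *arg)
{
        unsigned long minsz = offsetofend(struct vfio_irq_set, count);
        struct vfio_irq_set hdr;
        size_t data_size = 0;
        void *data = NULL;
        int ret;

        if (copy_from_user(&hdr, arg, minsz))
                return -EFAULT;

        ret = vfio_set_irqs_validate_and_prepare(&hdr, EXAMPLE_NUM_IRQS,
                                                 EXAMPLE_MAX_IRQ_TYPE,
                                                 &data_size);
        if (ret)
                return ret;

        if (data_size) {
                /* The payload starts right after the fixed-size header */
                data = memdup_user(arg + minsz, data_size);
                if (IS_ERR(data))
                        return PTR_ERR(data);
        }

        ret = example_set_irqs(&hdr, data);
        kfree(data);
        return ret;
}
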
/*
 * Pin a set of guest PFNs and return their associated host PFNs for local
 * domain only.
 * @device [in]  : device
 * @user_pfn [in]: array of user/guest PFNs to be pinned.
 * @npage [in]   : count of elements in user_pfn array.  This count should not
 *                 be greater than VFIO_PIN_PAGES_MAX_ENTRIES.
 * @prot [in]    : protection flags
 * @phys_pfn[out]: array of host PFNs
 * Return error or number of pages pinned.
 */
int vfio_pin_pages(struct vfio_device *device, unsigned long *user_pfn,
                   int npage, int prot, unsigned long *phys_pfn)
{
        struct vfio_container *container;
        struct vfio_group *group = device->group;
        struct vfio_iommu_driver *driver;
        int ret;

        if (!user_pfn || !phys_pfn || !npage ||
            !vfio_assert_device_open(device))
                return -EINVAL;

        if (npage > VFIO_PIN_PAGES_MAX_ENTRIES)
                return -E2BIG;

        /* Pinning is not supported for groups with more than one device */
        if (group->dev_counter > 1)
                return -EINVAL;

        /* group->container cannot change while a vfio device is open */
        container = group->container;
        driver = container->iommu_driver;
        if (likely(driver && driver->ops->pin_pages))
                ret = driver->ops->pin_pages(container->iommu_data,
                                             group->iommu_group, user_pfn,
                                             npage, prot, phys_pfn);
        else
                ret = -ENOTTY;

        return ret;
}
EXPORT_SYMBOL(vfio_pin_pages);

1955/*
1956 * Unpin set of host PFNs for local domain only.
8e432bb0 1957 * @device [in] : device
2169037d
KW
1958 * @user_pfn [in]: array of user/guest PFNs to be unpinned. Number of user/guest
1959 * PFNs should not be greater than VFIO_PIN_PAGES_MAX_ENTRIES.
1960 * @npage [in] : count of elements in user_pfn array. This count should not
1961 * be greater than VFIO_PIN_PAGES_MAX_ENTRIES.
1962 * Return error or number of pages unpinned.
1963 */
8e432bb0
JG
1964int vfio_unpin_pages(struct vfio_device *device, unsigned long *user_pfn,
1965 int npage)
2169037d
KW
1966{
1967 struct vfio_container *container;
2169037d
KW
1968 struct vfio_iommu_driver *driver;
1969 int ret;
1970
eadd86f8 1971 if (!user_pfn || !npage || !vfio_assert_device_open(device))
2169037d
KW
1972 return -EINVAL;
1973
1974 if (npage > VFIO_PIN_PAGES_MAX_ENTRIES)
1975 return -E2BIG;
1976
e0e29bdb 1977 /* group->container cannot change while a vfio device is open */
8e432bb0 1978 container = device->group->container;
2169037d
KW
1979 driver = container->iommu_driver;
1980 if (likely(driver && driver->ops->unpin_pages))
1981 ret = driver->ops->unpin_pages(container->iommu_data, user_pfn,
1982 npage);
1983 else
1984 ret = -ENOTTY;
1985
2169037d
KW
1986 return ret;
1987}
1988EXPORT_SYMBOL(vfio_unpin_pages);
1989
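/*
 * Illustrative sketch, NOT part of this file: a vendor driver (e.g. an
 * mdev driver) pinning a single guest page so its hardware can DMA to
 * the backing host page, then unpinning on teardown.  The one-page
 * usage and the example_program_hw() stub are assumptions.
 */
#include <linux/iommu.h>
#include <linux/vfio.h>

static void example_program_hw(unsigned long host_pfn)
{
        /* driver-specific hardware programming elided */
}

static int example_map_guest_page(struct vfio_device *device,
                                  unsigned long guest_pfn)
{
        unsigned long host_pfn;
        int ret;

        /* On success the return value is the number of pages pinned */
        ret = vfio_pin_pages(device, &guest_pfn, 1,
                             IOMMU_READ | IOMMU_WRITE, &host_pfn);
        if (ret != 1)
                return ret < 0 ? ret : -EFAULT;

        example_program_hw(host_pfn);

        /* Later, when the mapping is torn down: */
        ret = vfio_unpin_pages(device, &guest_pfn, 1);
        return ret == 1 ? 0 : -EIO;
}
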
/*
 * This interface allows the CPUs to perform some sort of virtual DMA on
 * behalf of the device.
 *
 * CPUs read/write from/into a range of IOVAs pointing to user space memory
 * into/from a kernel buffer.
 *
 * As the read/write of user space memory is conducted via the CPUs and is
 * not a real device DMA, it is not necessary to pin the user space memory.
 *
 * @device [in]   : VFIO device
 * @user_iova [in]: base IOVA of a user space buffer
 * @data [in]    : pointer to kernel buffer
 * @len [in]     : kernel buffer length
 * @write        : indicate read or write
 * Return error code on failure or 0 on success.
 */
int vfio_dma_rw(struct vfio_device *device, dma_addr_t user_iova, void *data,
                size_t len, bool write)
{
        struct vfio_container *container;
        struct vfio_iommu_driver *driver;
        int ret = 0;

        if (!data || len <= 0 || !vfio_assert_device_open(device))
                return -EINVAL;

        /* group->container cannot change while a vfio device is open */
        container = device->group->container;
        driver = container->iommu_driver;

        if (likely(driver && driver->ops->dma_rw))
                ret = driver->ops->dma_rw(container->iommu_data,
                                          user_iova, data, len, write);
        else
                ret = -ENOTTY;
        return ret;
}
EXPORT_SYMBOL(vfio_dma_rw);

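/*
 * Illustrative sketch, NOT part of this file: reading a small
 * guest-written descriptor out of IOVA space with vfio_dma_rw(), with
 * no pinning required since the CPU does the copy.  The descriptor
 * layout and function name are assumptions for the example.
 */
#include <linux/types.h>
#include <linux/vfio.h>

struct example_desc {
        __le64 addr;
        __le32 len;
        __le32 flags;
};

static int example_fetch_desc(struct vfio_device *device, dma_addr_t iova,
                              struct example_desc *desc)
{
        /* write == false: copy from the IOVA range into the kernel buffer */
        return vfio_dma_rw(device, iova, desc, sizeof(*desc), false);
}
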
static int vfio_register_iommu_notifier(struct vfio_group *group,
                                        unsigned long *events,
                                        struct notifier_block *nb)
{
        struct vfio_container *container;
        struct vfio_iommu_driver *driver;
        int ret;

        lockdep_assert_held_read(&group->group_rwsem);

        container = group->container;
        driver = container->iommu_driver;
        if (likely(driver && driver->ops->register_notifier))
                ret = driver->ops->register_notifier(container->iommu_data,
                                                     events, nb);
        else
                ret = -ENOTTY;

        return ret;
}

static int vfio_unregister_iommu_notifier(struct vfio_group *group,
                                          struct notifier_block *nb)
{
        struct vfio_container *container;
        struct vfio_iommu_driver *driver;
        int ret;

        lockdep_assert_held_read(&group->group_rwsem);

        container = group->container;
        driver = container->iommu_driver;
        if (likely(driver && driver->ops->unregister_notifier))
                ret = driver->ops->unregister_notifier(container->iommu_data,
                                                       nb);
        else
                ret = -ENOTTY;

        return ret;
}

int vfio_register_notifier(struct vfio_device *device,
                           enum vfio_notify_type type, unsigned long *events,
                           struct notifier_block *nb)
{
        struct vfio_group *group = device->group;
        int ret;

        if (!nb || !events || (*events == 0) ||
            !vfio_assert_device_open(device))
                return -EINVAL;

        switch (type) {
        case VFIO_IOMMU_NOTIFY:
                ret = vfio_register_iommu_notifier(group, events, nb);
                break;
        default:
                ret = -EINVAL;
        }
        return ret;
}
EXPORT_SYMBOL(vfio_register_notifier);

int vfio_unregister_notifier(struct vfio_device *device,
                             enum vfio_notify_type type,
                             struct notifier_block *nb)
{
        struct vfio_group *group = device->group;
        int ret;

        if (!nb || !vfio_assert_device_open(device))
                return -EINVAL;

        switch (type) {
        case VFIO_IOMMU_NOTIFY:
                ret = vfio_unregister_iommu_notifier(group, nb);
                break;
        default:
                ret = -EINVAL;
        }
        return ret;
}
EXPORT_SYMBOL(vfio_unregister_notifier);

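/*
 * Illustrative sketch, NOT part of this file: subscribing to DMA unmap
 * events from the IOMMU backend, as drivers that pin pages must do so
 * they can unpin when the user unmaps the range.  The notifier body and
 * the example_dma_unmap() stub are assumptions for the example; the
 * event mask and callback data type follow the VFIO_IOMMU_NOTIFY
 * contract used by existing in-tree consumers.
 */
#include <linux/notifier.h>
#include <linux/vfio.h>

static void example_dma_unmap(u64 iova, u64 size)
{
        /* driver-specific unpin/invalidate elided */
}

static int example_notifier(struct notifier_block *nb, unsigned long action,
                            void *data)
{
        if (action == VFIO_IOMMU_NOTIFY_DMA_UNMAP) {
                struct vfio_iommu_type1_dma_unmap *unmap = data;

                example_dma_unmap(unmap->iova, unmap->size);
        }
        return NOTIFY_OK;
}

static int example_subscribe(struct vfio_device *device,
                             struct notifier_block *nb)
{
        unsigned long events = VFIO_IOMMU_NOTIFY_DMA_UNMAP;

        nb->notifier_call = example_notifier;
        /* The device must be open; events must be non-zero */
        return vfio_register_notifier(device, VFIO_IOMMU_NOTIFY, &events, nb);
}
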
/*
 * Module/class support
 */
static char *vfio_devnode(struct device *dev, umode_t *mode)
{
        return kasprintf(GFP_KERNEL, "vfio/%s", dev_name(dev));
}

static struct miscdevice vfio_dev = {
        .minor = VFIO_MINOR,
        .name = "vfio",
        .fops = &vfio_fops,
        .nodename = "vfio/vfio",
        .mode = S_IRUGO | S_IWUGO,
};

static int __init vfio_init(void)
{
        int ret;

        ida_init(&vfio.group_ida);
        mutex_init(&vfio.group_lock);
        mutex_init(&vfio.iommu_drivers_lock);
        INIT_LIST_HEAD(&vfio.group_list);
        INIT_LIST_HEAD(&vfio.iommu_drivers_list);

        ret = misc_register(&vfio_dev);
        if (ret) {
                pr_err("vfio: misc device register failed\n");
                return ret;
        }

        /* /dev/vfio/$GROUP */
        vfio.class = class_create(THIS_MODULE, "vfio");
        if (IS_ERR(vfio.class)) {
                ret = PTR_ERR(vfio.class);
                goto err_class;
        }

        vfio.class->devnode = vfio_devnode;

        ret = alloc_chrdev_region(&vfio.group_devt, 0, MINORMASK + 1, "vfio");
        if (ret)
                goto err_alloc_chrdev;

        pr_info(DRIVER_DESC " version: " DRIVER_VERSION "\n");

#ifdef CONFIG_VFIO_NOIOMMU
        vfio_register_iommu_driver(&vfio_noiommu_ops);
#endif
        return 0;

err_alloc_chrdev:
        class_destroy(vfio.class);
        vfio.class = NULL;
err_class:
        misc_deregister(&vfio_dev);
        return ret;
}

static void __exit vfio_cleanup(void)
{
        WARN_ON(!list_empty(&vfio.group_list));

#ifdef CONFIG_VFIO_NOIOMMU
        vfio_unregister_iommu_driver(&vfio_noiommu_ops);
#endif
        ida_destroy(&vfio.group_ida);
        unregister_chrdev_region(vfio.group_devt, MINORMASK + 1);
        class_destroy(vfio.class);
        vfio.class = NULL;
        misc_deregister(&vfio_dev);
        xa_destroy(&vfio_device_set_xa);
}

module_init(vfio_init);
module_exit(vfio_cleanup);

MODULE_VERSION(DRIVER_VERSION);
MODULE_LICENSE("GPL v2");
MODULE_AUTHOR(DRIVER_AUTHOR);
MODULE_DESCRIPTION(DRIVER_DESC);
MODULE_ALIAS_MISCDEV(VFIO_MINOR);
MODULE_ALIAS("devname:vfio/vfio");
MODULE_SOFTDEP("post: vfio_iommu_type1 vfio_iommu_spapr_tce");