vfio: fix deadlock between group lock and kvm lock
drivers/vfio/vfio_main.c
// SPDX-License-Identifier: GPL-2.0-only
/*
 * VFIO core
 *
 * Copyright (C) 2012 Red Hat, Inc.  All rights reserved.
 *     Author: Alex Williamson <alex.williamson@redhat.com>
 *
 * Derived from original vfio:
 * Copyright 2010 Cisco Systems, Inc.  All rights reserved.
 * Author: Tom Lyon, pugs@cisco.com
 */

#include <linux/cdev.h>
#include <linux/compat.h>
#include <linux/device.h>
#include <linux/fs.h>
#include <linux/idr.h>
#include <linux/iommu.h>
#ifdef CONFIG_HAVE_KVM
#include <linux/kvm_host.h>
#endif
#include <linux/list.h>
#include <linux/miscdevice.h>
#include <linux/module.h>
#include <linux/mutex.h>
#include <linux/pci.h>
#include <linux/rwsem.h>
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/stat.h>
#include <linux/string.h>
#include <linux/uaccess.h>
#include <linux/vfio.h>
#include <linux/wait.h>
#include <linux/sched/signal.h>
#include <linux/pm_runtime.h>
#include <linux/interval_tree.h>
#include <linux/iova_bitmap.h>
#include <linux/iommufd.h>
#include "vfio.h"

#define DRIVER_VERSION	"0.3"
#define DRIVER_AUTHOR	"Alex Williamson <alex.williamson@redhat.com>"
#define DRIVER_DESC	"VFIO - User Level meta-driver"

static struct vfio {
	struct class			*device_class;
	struct ida			device_ida;
} vfio;

static DEFINE_XARRAY(vfio_device_set_xa);

int vfio_assign_device_set(struct vfio_device *device, void *set_id)
{
	unsigned long idx = (unsigned long)set_id;
	struct vfio_device_set *new_dev_set;
	struct vfio_device_set *dev_set;

	if (WARN_ON(!set_id))
		return -EINVAL;

	/*
	 * Atomically acquire a singleton object in the xarray for this set_id
	 */
	xa_lock(&vfio_device_set_xa);
	dev_set = xa_load(&vfio_device_set_xa, idx);
	if (dev_set)
		goto found_get_ref;
	xa_unlock(&vfio_device_set_xa);

	new_dev_set = kzalloc(sizeof(*new_dev_set), GFP_KERNEL);
	if (!new_dev_set)
		return -ENOMEM;
	mutex_init(&new_dev_set->lock);
	INIT_LIST_HEAD(&new_dev_set->device_list);
	new_dev_set->set_id = set_id;

	xa_lock(&vfio_device_set_xa);
	dev_set = __xa_cmpxchg(&vfio_device_set_xa, idx, NULL, new_dev_set,
			       GFP_KERNEL);
	if (!dev_set) {
		dev_set = new_dev_set;
		goto found_get_ref;
	}

	kfree(new_dev_set);
	if (xa_is_err(dev_set)) {
		xa_unlock(&vfio_device_set_xa);
		return xa_err(dev_set);
	}

found_get_ref:
	dev_set->device_count++;
	xa_unlock(&vfio_device_set_xa);
	mutex_lock(&dev_set->lock);
	device->dev_set = dev_set;
	list_add_tail(&device->dev_set_list, &dev_set->device_list);
	mutex_unlock(&dev_set->lock);
	return 0;
}
EXPORT_SYMBOL_GPL(vfio_assign_device_set);
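/*
 * Illustrative sketch (not part of this file): a driver passes a shared
 * set_id so that devices which must be reset together land in the same
 * vfio_device_set; vfio_pci_core, for instance, uses the PCI slot or bus
 * as the set_id.  The callback name below is hypothetical.
 */
#if 0
static int example_init(struct vfio_device *vdev)
{
	struct pci_dev *pdev = to_pci_dev(vdev->dev);

	/* Every function in the same resettable slot shares one dev_set. */
	return vfio_assign_device_set(vdev, pdev->slot);
}
#endif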

static void vfio_release_device_set(struct vfio_device *device)
{
	struct vfio_device_set *dev_set = device->dev_set;

	if (!dev_set)
		return;

	mutex_lock(&dev_set->lock);
	list_del(&device->dev_set_list);
	mutex_unlock(&dev_set->lock);

	xa_lock(&vfio_device_set_xa);
	if (!--dev_set->device_count) {
		__xa_erase(&vfio_device_set_xa,
			   (unsigned long)dev_set->set_id);
		mutex_destroy(&dev_set->lock);
		kfree(dev_set);
	}
	xa_unlock(&vfio_device_set_xa);
}

unsigned int vfio_device_set_open_count(struct vfio_device_set *dev_set)
{
	struct vfio_device *cur;
	unsigned int open_count = 0;

	lockdep_assert_held(&dev_set->lock);

	list_for_each_entry(cur, &dev_set->device_list, dev_set_list)
		open_count += cur->open_count;
	return open_count;
}
EXPORT_SYMBOL_GPL(vfio_device_set_open_count);
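/*
 * Illustrative sketch (not part of this file): drivers take dev_set->lock
 * and consult vfio_device_set_open_count() to decide whether a set-wide
 * reset is safe, e.g. only when the calling device is the sole open one.
 * The helper name is hypothetical and simplifies the real checks.
 */
#if 0
static bool example_reset_is_safe(struct vfio_device *vdev)
{
	bool safe;

	mutex_lock(&vdev->dev_set->lock);
	safe = vfio_device_set_open_count(vdev->dev_set) == 1;
	mutex_unlock(&vdev->dev_set->lock);
	return safe;
}
#endif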

/*
 * Device objects - create, release, get, put, search
 */
/* Device reference always implies a group reference */
void vfio_device_put_registration(struct vfio_device *device)
{
	if (refcount_dec_and_test(&device->refcount))
		complete(&device->comp);
}

bool vfio_device_try_get_registration(struct vfio_device *device)
{
	return refcount_inc_not_zero(&device->refcount);
}

/*
 * VFIO driver API
 */
/* Release helper called by vfio_put_device() */
static void vfio_device_release(struct device *dev)
{
	struct vfio_device *device =
			container_of(dev, struct vfio_device, device);

	vfio_release_device_set(device);
	ida_free(&vfio.device_ida, device->index);

	if (device->ops->release)
		device->ops->release(device);

	kvfree(device);
}

static int vfio_init_device(struct vfio_device *device, struct device *dev,
			    const struct vfio_device_ops *ops);

/*
 * Allocate and initialize a vfio_device so it can be registered to vfio
 * core.
 *
 * Drivers should use the wrapper vfio_alloc_device() for allocation.
 * @size is the size of the structure to be allocated, including any
 * private data used by the driver.
 *
 * The driver may provide an @init callback to initialize its private data.
 *
 * Use vfio_put_device() to release the structure after a successful return.
 */
struct vfio_device *_vfio_alloc_device(size_t size, struct device *dev,
				       const struct vfio_device_ops *ops)
{
	struct vfio_device *device;
	int ret;

	if (WARN_ON(size < sizeof(struct vfio_device)))
		return ERR_PTR(-EINVAL);

	device = kvzalloc(size, GFP_KERNEL);
	if (!device)
		return ERR_PTR(-ENOMEM);

	ret = vfio_init_device(device, dev, ops);
	if (ret)
		goto out_free;
	return device;

out_free:
	kvfree(device);
	return ERR_PTR(ret);
}
EXPORT_SYMBOL_GPL(_vfio_alloc_device);
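/*
 * Illustrative sketch (not part of this file): a driver embeds struct
 * vfio_device as the first member of its private structure and allocates
 * both through the vfio_alloc_device() wrapper.  The struct, ops and
 * callback names are hypothetical.
 */
#if 0
struct example_device {
	struct vfio_device vdev;	/* must be at offset 0 */
	void __iomem *regs;		/* driver-private state */
};

static int example_probe(struct device *dev)
{
	struct example_device *edev;

	edev = vfio_alloc_device(example_device, vdev, dev, &example_ops);
	if (IS_ERR(edev))
		return PTR_ERR(edev);
	/* ...register, and on teardown: vfio_put_device(&edev->vdev); */
	return 0;
}
#endif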

/*
 * Initialize a vfio_device so it can be registered to vfio core.
 */
static int vfio_init_device(struct vfio_device *device, struct device *dev,
			    const struct vfio_device_ops *ops)
{
	int ret;

	ret = ida_alloc_max(&vfio.device_ida, MINORMASK, GFP_KERNEL);
	if (ret < 0) {
		dev_dbg(dev, "Error allocating device index\n");
		return ret;
	}

	device->index = ret;
	init_completion(&device->comp);
	device->dev = dev;
	device->ops = ops;

	if (ops->init) {
		ret = ops->init(device);
		if (ret)
			goto out_uninit;
	}

	device_initialize(&device->device);
	device->device.release = vfio_device_release;
	device->device.class = vfio.device_class;
	device->device.parent = device->dev;
	return 0;

out_uninit:
	vfio_release_device_set(device);
	ida_free(&vfio.device_ida, device->index);
	return ret;
}

static int __vfio_register_dev(struct vfio_device *device,
			       enum vfio_group_type type)
{
	int ret;

	if (WARN_ON(device->ops->bind_iommufd &&
		    (!device->ops->unbind_iommufd ||
		     !device->ops->attach_ioas)))
		return -EINVAL;

	/*
	 * If the driver doesn't specify a set then the device is added to a
	 * singleton set just for itself.
	 */
	if (!device->dev_set)
		vfio_assign_device_set(device, device);

	ret = dev_set_name(&device->device, "vfio%d", device->index);
	if (ret)
		return ret;

	ret = vfio_device_set_group(device, type);
	if (ret)
		return ret;

	ret = device_add(&device->device);
	if (ret)
		goto err_out;

	/* Refcounting can't start until the driver calls register */
	refcount_set(&device->refcount, 1);

	vfio_device_group_register(device);

	return 0;
err_out:
	vfio_device_remove_group(device);
	return ret;
}

int vfio_register_group_dev(struct vfio_device *device)
{
	return __vfio_register_dev(device, VFIO_IOMMU);
}
EXPORT_SYMBOL_GPL(vfio_register_group_dev);

/*
 * Register a virtual device without IOMMU backing.  The user of this
 * device must not be able to directly trigger unmediated DMA.
 */
int vfio_register_emulated_iommu_dev(struct vfio_device *device)
{
	return __vfio_register_dev(device, VFIO_EMULATED_IOMMU);
}
EXPORT_SYMBOL_GPL(vfio_register_emulated_iommu_dev);
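/*
 * Illustrative sketch (not part of this file): the usual pairing of
 * registration and teardown calls as seen from a driver.  Names are
 * hypothetical; an emulated-IOMMU (mdev-style) driver would call
 * vfio_register_emulated_iommu_dev() instead.
 */
#if 0
static void example_remove(struct example_device *edev)
{
	vfio_unregister_group_dev(&edev->vdev);	/* waits for users to go */
	vfio_put_device(&edev->vdev);		/* drops the last reference */
}
#endif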

/*
 * Decrement the device reference count and wait for the device to be
 * removed.  Open file descriptors for the device...
 */
void vfio_unregister_group_dev(struct vfio_device *device)
{
	unsigned int i = 0;
	bool interrupted = false;
	long rc;

	vfio_device_put_registration(device);
	rc = try_wait_for_completion(&device->comp);
	while (rc <= 0) {
		if (device->ops->request)
			device->ops->request(device, i++);

		if (interrupted) {
			rc = wait_for_completion_timeout(&device->comp,
							 HZ * 10);
		} else {
			rc = wait_for_completion_interruptible_timeout(
				&device->comp, HZ * 10);
			if (rc < 0) {
				interrupted = true;
				dev_warn(device->dev,
					 "Device is currently in use, task"
					 " \"%s\" (%d) "
					 "blocked until device is released",
					 current->comm, task_pid_nr(current));
			}
		}
	}

	vfio_device_group_unregister(device);

	/* Balances device_add in register path */
	device_del(&device->device);

	/* Balances vfio_device_set_group in register path */
	vfio_device_remove_group(device);
}
EXPORT_SYMBOL_GPL(vfio_unregister_group_dev);

#ifdef CONFIG_HAVE_KVM
/*
 * Take a reference to the KVM instance for this device.  symbol_get() is
 * used so that vfio does not carry a hard module dependency on kvm, and
 * the reference is taken and dropped under dev_set->lock (never a group
 * lock) to avoid ordering problems against kvm->lock.
 */
void _vfio_device_get_kvm_safe(struct vfio_device *device, struct kvm *kvm)
{
	void (*pfn)(struct kvm *kvm);
	bool (*fn)(struct kvm *kvm);
	bool ret;

	lockdep_assert_held(&device->dev_set->lock);

	pfn = symbol_get(kvm_put_kvm);
	if (WARN_ON(!pfn))
		return;

	fn = symbol_get(kvm_get_kvm_safe);
	if (WARN_ON(!fn)) {
		symbol_put(kvm_put_kvm);
		return;
	}

	ret = fn(kvm);
	symbol_put(kvm_get_kvm_safe);
	if (!ret) {
		symbol_put(kvm_put_kvm);
		return;
	}

	device->put_kvm = pfn;
	device->kvm = kvm;
}

void vfio_device_put_kvm(struct vfio_device *device)
{
	lockdep_assert_held(&device->dev_set->lock);

	if (!device->kvm)
		return;

	if (WARN_ON(!device->put_kvm))
		goto clear;

	device->put_kvm(device->kvm);
	device->put_kvm = NULL;
	symbol_put(kvm_put_kvm);

clear:
	device->kvm = NULL;
}
#endif

/* true if the vfio_device has open_device() called but not close_device() */
static bool vfio_assert_device_open(struct vfio_device *device)
{
	return !WARN_ON_ONCE(!READ_ONCE(device->open_count));
}

static int vfio_device_first_open(struct vfio_device *device,
				  struct iommufd_ctx *iommufd, struct kvm *kvm)
{
	int ret;

	lockdep_assert_held(&device->dev_set->lock);

	if (!try_module_get(device->dev->driver->owner))
		return -ENODEV;

	if (iommufd)
		ret = vfio_iommufd_bind(device, iommufd);
	else
		ret = vfio_device_group_use_iommu(device);
	if (ret)
		goto err_module_put;

	if (device->ops->open_device) {
		ret = device->ops->open_device(device);
		if (ret)
			goto err_unuse_iommu;
	}
	return 0;

err_unuse_iommu:
	if (iommufd)
		vfio_iommufd_unbind(device);
	else
		vfio_device_group_unuse_iommu(device);
err_module_put:
	module_put(device->dev->driver->owner);
	return ret;
}

static void vfio_device_last_close(struct vfio_device *device,
				   struct iommufd_ctx *iommufd)
{
	lockdep_assert_held(&device->dev_set->lock);

	if (device->ops->close_device)
		device->ops->close_device(device);
	if (iommufd)
		vfio_iommufd_unbind(device);
	else
		vfio_device_group_unuse_iommu(device);
	module_put(device->dev->driver->owner);
}

int vfio_device_open(struct vfio_device *device,
		     struct iommufd_ctx *iommufd, struct kvm *kvm)
{
	int ret = 0;

	lockdep_assert_held(&device->dev_set->lock);

	device->open_count++;
	if (device->open_count == 1) {
		ret = vfio_device_first_open(device, iommufd, kvm);
		if (ret)
			device->open_count--;
	}

	return ret;
}

void vfio_device_close(struct vfio_device *device,
		       struct iommufd_ctx *iommufd)
{
	lockdep_assert_held(&device->dev_set->lock);

	vfio_assert_device_open(device);
	if (device->open_count == 1)
		vfio_device_last_close(device, iommufd);
	device->open_count--;
}

/*
 * Wrapper around pm_runtime_resume_and_get().
 * Return error code on failure or 0 on success.
 */
static inline int vfio_device_pm_runtime_get(struct vfio_device *device)
{
	struct device *dev = device->dev;

	if (dev->driver && dev->driver->pm) {
		int ret;

		ret = pm_runtime_resume_and_get(dev);
		if (ret) {
			dev_info_ratelimited(dev,
				"vfio: runtime resume failed %d\n", ret);
			return -EIO;
		}
	}

	return 0;
}

/*
 * Wrapper around pm_runtime_put().
 */
static inline void vfio_device_pm_runtime_put(struct vfio_device *device)
{
	struct device *dev = device->dev;

	if (dev->driver && dev->driver->pm)
		pm_runtime_put(dev);
}

/*
 * VFIO Device fd
 */
static int vfio_device_fops_release(struct inode *inode, struct file *filep)
{
	struct vfio_device *device = filep->private_data;

	vfio_device_group_close(device);

	vfio_device_put_registration(device);

	return 0;
}

/*
 * vfio_mig_get_next_state - Compute the next step in the FSM
 * @cur_fsm - The current state the device is in
 * @new_fsm - The target state to reach
 * @next_fsm - Pointer to the next step to get to new_fsm
 *
 * Return 0 upon success, otherwise -errno
 * Upon success the next step in the state progression between cur_fsm and
 * new_fsm will be set in next_fsm.
 *
 * This breaks down requests for combination transitions into smaller steps and
 * returns the next step to get to new_fsm. The function may need to be called
 * multiple times before reaching new_fsm.
 */
int vfio_mig_get_next_state(struct vfio_device *device,
			    enum vfio_device_mig_state cur_fsm,
			    enum vfio_device_mig_state new_fsm,
			    enum vfio_device_mig_state *next_fsm)
{
	enum { VFIO_DEVICE_NUM_STATES = VFIO_DEVICE_STATE_PRE_COPY_P2P + 1 };
	/*
	 * The coding in this table requires the driver to implement the
	 * following FSM arcs:
	 *         RESUMING -> STOP
	 *         STOP -> RESUMING
	 *         STOP -> STOP_COPY
	 *         STOP_COPY -> STOP
	 *
	 * If P2P is supported then the driver must also implement these FSM
	 * arcs:
	 *         RUNNING -> RUNNING_P2P
	 *         RUNNING_P2P -> RUNNING
	 *         RUNNING_P2P -> STOP
	 *         STOP -> RUNNING_P2P
	 *
	 * If precopy is supported then the driver must support these additional
	 * FSM arcs:
	 *         RUNNING -> PRE_COPY
	 *         PRE_COPY -> RUNNING
	 *         PRE_COPY -> STOP_COPY
	 * However, if precopy and P2P are supported together then the driver
	 * must support these additional arcs beyond the P2P arcs above:
	 *         PRE_COPY -> RUNNING
	 *         PRE_COPY -> PRE_COPY_P2P
	 *         PRE_COPY_P2P -> PRE_COPY
	 *         PRE_COPY_P2P -> RUNNING_P2P
	 *         PRE_COPY_P2P -> STOP_COPY
	 *         RUNNING -> PRE_COPY
	 *         RUNNING_P2P -> PRE_COPY_P2P
	 *
	 * Without P2P and precopy the driver must implement:
	 *         RUNNING -> STOP
	 *         STOP -> RUNNING
	 *
	 * The coding will step through multiple states for some combination
	 * transitions; if all optional features are supported, this means the
	 * following ones:
	 *         PRE_COPY -> PRE_COPY_P2P -> STOP_COPY
	 *         PRE_COPY -> RUNNING -> RUNNING_P2P
	 *         PRE_COPY -> RUNNING -> RUNNING_P2P -> STOP
	 *         PRE_COPY -> RUNNING -> RUNNING_P2P -> STOP -> RESUMING
	 *         PRE_COPY_P2P -> RUNNING_P2P -> RUNNING
	 *         PRE_COPY_P2P -> RUNNING_P2P -> STOP
	 *         PRE_COPY_P2P -> RUNNING_P2P -> STOP -> RESUMING
	 *         RESUMING -> STOP -> RUNNING_P2P
	 *         RESUMING -> STOP -> RUNNING_P2P -> PRE_COPY_P2P
	 *         RESUMING -> STOP -> RUNNING_P2P -> RUNNING
	 *         RESUMING -> STOP -> RUNNING_P2P -> RUNNING -> PRE_COPY
	 *         RESUMING -> STOP -> STOP_COPY
	 *         RUNNING -> RUNNING_P2P -> PRE_COPY_P2P
	 *         RUNNING -> RUNNING_P2P -> STOP
	 *         RUNNING -> RUNNING_P2P -> STOP -> RESUMING
	 *         RUNNING -> RUNNING_P2P -> STOP -> STOP_COPY
	 *         RUNNING_P2P -> RUNNING -> PRE_COPY
	 *         RUNNING_P2P -> STOP -> RESUMING
	 *         RUNNING_P2P -> STOP -> STOP_COPY
	 *         STOP -> RUNNING_P2P -> PRE_COPY_P2P
	 *         STOP -> RUNNING_P2P -> RUNNING
	 *         STOP -> RUNNING_P2P -> RUNNING -> PRE_COPY
	 *         STOP_COPY -> STOP -> RESUMING
	 *         STOP_COPY -> STOP -> RUNNING_P2P
	 *         STOP_COPY -> STOP -> RUNNING_P2P -> RUNNING
	 *
	 * The following transitions are blocked:
	 *         STOP_COPY -> PRE_COPY
	 *         STOP_COPY -> PRE_COPY_P2P
	 */
	static const u8 vfio_from_fsm_table[VFIO_DEVICE_NUM_STATES][VFIO_DEVICE_NUM_STATES] = {
		[VFIO_DEVICE_STATE_STOP] = {
			[VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_STOP,
			[VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_RUNNING_P2P,
			[VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_RUNNING_P2P,
			[VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P,
			[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_STOP_COPY,
			[VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_RESUMING,
			[VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P,
			[VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
		},
		[VFIO_DEVICE_STATE_RUNNING] = {
			[VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_RUNNING_P2P,
			[VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_RUNNING,
			[VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_PRE_COPY,
			[VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P,
			[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_RUNNING_P2P,
			[VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_RUNNING_P2P,
			[VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P,
			[VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
		},
		[VFIO_DEVICE_STATE_PRE_COPY] = {
			[VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_RUNNING,
			[VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_RUNNING,
			[VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_PRE_COPY,
			[VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_PRE_COPY_P2P,
			[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_PRE_COPY_P2P,
			[VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_RUNNING,
			[VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_RUNNING,
			[VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
		},
		[VFIO_DEVICE_STATE_PRE_COPY_P2P] = {
			[VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_RUNNING_P2P,
			[VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_RUNNING_P2P,
			[VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_PRE_COPY,
			[VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_PRE_COPY_P2P,
			[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_STOP_COPY,
			[VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_RUNNING_P2P,
			[VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P,
			[VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
		},
		[VFIO_DEVICE_STATE_STOP_COPY] = {
			[VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_STOP,
			[VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_STOP,
			[VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_ERROR,
			[VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_ERROR,
			[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_STOP_COPY,
			[VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_STOP,
			[VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_STOP,
			[VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
		},
		[VFIO_DEVICE_STATE_RESUMING] = {
			[VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_STOP,
			[VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_STOP,
			[VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_STOP,
			[VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_STOP,
			[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_STOP,
			[VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_RESUMING,
			[VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_STOP,
			[VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
		},
		[VFIO_DEVICE_STATE_RUNNING_P2P] = {
			[VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_STOP,
			[VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_RUNNING,
			[VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_RUNNING,
			[VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_PRE_COPY_P2P,
			[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_STOP,
			[VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_STOP,
			[VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P,
			[VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
		},
		[VFIO_DEVICE_STATE_ERROR] = {
			[VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_ERROR,
			[VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_ERROR,
			[VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_ERROR,
			[VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_ERROR,
			[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_ERROR,
			[VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_ERROR,
			[VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_ERROR,
			[VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
		},
	};

	static const unsigned int state_flags_table[VFIO_DEVICE_NUM_STATES] = {
		[VFIO_DEVICE_STATE_STOP] = VFIO_MIGRATION_STOP_COPY,
		[VFIO_DEVICE_STATE_RUNNING] = VFIO_MIGRATION_STOP_COPY,
		[VFIO_DEVICE_STATE_PRE_COPY] =
			VFIO_MIGRATION_STOP_COPY | VFIO_MIGRATION_PRE_COPY,
		[VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_MIGRATION_STOP_COPY |
						   VFIO_MIGRATION_P2P |
						   VFIO_MIGRATION_PRE_COPY,
		[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_MIGRATION_STOP_COPY,
		[VFIO_DEVICE_STATE_RESUMING] = VFIO_MIGRATION_STOP_COPY,
		[VFIO_DEVICE_STATE_RUNNING_P2P] =
			VFIO_MIGRATION_STOP_COPY | VFIO_MIGRATION_P2P,
		[VFIO_DEVICE_STATE_ERROR] = ~0U,
	};

	if (WARN_ON(cur_fsm >= ARRAY_SIZE(vfio_from_fsm_table) ||
		    (state_flags_table[cur_fsm] & device->migration_flags) !=
			state_flags_table[cur_fsm]))
		return -EINVAL;

	if (new_fsm >= ARRAY_SIZE(vfio_from_fsm_table) ||
	    (state_flags_table[new_fsm] & device->migration_flags) !=
			state_flags_table[new_fsm])
		return -EINVAL;

	/*
	 * Arcs touching optional and unsupported states are skipped over. The
	 * driver will instead see an arc from the original state to the next
	 * logical state, as per the above comment.
	 */
	*next_fsm = vfio_from_fsm_table[cur_fsm][new_fsm];
	while ((state_flags_table[*next_fsm] & device->migration_flags) !=
			state_flags_table[*next_fsm])
		*next_fsm = vfio_from_fsm_table[*next_fsm][new_fsm];

	return (*next_fsm != VFIO_DEVICE_STATE_ERROR) ? 0 : -EINVAL;
}
EXPORT_SYMBOL_GPL(vfio_mig_get_next_state);
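/*
 * Illustrative sketch (not part of this file): a migration driver walks
 * the FSM one arc at a time, asking vfio_mig_get_next_state() for each
 * intermediate step and applying it on the device.  The function name and
 * the locally tracked "cur" state are hypothetical.
 */
#if 0
static int example_step_to(struct vfio_device *vdev,
			   enum vfio_device_mig_state cur,
			   enum vfio_device_mig_state target)
{
	enum vfio_device_mig_state next;
	int ret;

	while (cur != target) {
		ret = vfio_mig_get_next_state(vdev, cur, target, &next);
		if (ret)
			return ret;
		/* ...perform the single device-level arc cur -> next... */
		cur = next;
	}
	return 0;
}
#endif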

/*
 * Convert the driver's struct file into a FD number and return it to
 * userspace.
 */
static int vfio_ioct_mig_return_fd(struct file *filp, void __user *arg,
				   struct vfio_device_feature_mig_state *mig)
{
	int ret;
	int fd;

	fd = get_unused_fd_flags(O_CLOEXEC);
	if (fd < 0) {
		ret = fd;
		goto out_fput;
	}

	mig->data_fd = fd;
	if (copy_to_user(arg, mig, sizeof(*mig))) {
		ret = -EFAULT;
		goto out_put_unused;
	}
	fd_install(fd, filp);
	return 0;

out_put_unused:
	put_unused_fd(fd);
out_fput:
	fput(filp);
	return ret;
}

static int
vfio_ioctl_device_feature_mig_device_state(struct vfio_device *device,
					   u32 flags, void __user *arg,
					   size_t argsz)
{
	size_t minsz =
		offsetofend(struct vfio_device_feature_mig_state, data_fd);
	struct vfio_device_feature_mig_state mig;
	struct file *filp = NULL;
	int ret;

	if (!device->mig_ops)
		return -ENOTTY;

	ret = vfio_check_feature(flags, argsz,
				 VFIO_DEVICE_FEATURE_SET |
				 VFIO_DEVICE_FEATURE_GET,
				 sizeof(mig));
	if (ret != 1)
		return ret;

	if (copy_from_user(&mig, arg, minsz))
		return -EFAULT;

	if (flags & VFIO_DEVICE_FEATURE_GET) {
		enum vfio_device_mig_state curr_state;

		ret = device->mig_ops->migration_get_state(device,
							   &curr_state);
		if (ret)
			return ret;
		mig.device_state = curr_state;
		goto out_copy;
	}

	/* Handle the VFIO_DEVICE_FEATURE_SET */
	filp = device->mig_ops->migration_set_state(device, mig.device_state);
	if (IS_ERR(filp) || !filp)
		goto out_copy;

	return vfio_ioct_mig_return_fd(filp, arg, &mig);
out_copy:
	mig.data_fd = -1;
	if (copy_to_user(arg, &mig, sizeof(mig)))
		return -EFAULT;
	if (IS_ERR(filp))
		return PTR_ERR(filp);
	return 0;
}

static int
vfio_ioctl_device_feature_migration_data_size(struct vfio_device *device,
					      u32 flags, void __user *arg,
					      size_t argsz)
{
	struct vfio_device_feature_mig_data_size data_size = {};
	unsigned long stop_copy_length;
	int ret;

	if (!device->mig_ops)
		return -ENOTTY;

	ret = vfio_check_feature(flags, argsz, VFIO_DEVICE_FEATURE_GET,
				 sizeof(data_size));
	if (ret != 1)
		return ret;

	ret = device->mig_ops->migration_get_data_size(device, &stop_copy_length);
	if (ret)
		return ret;

	data_size.stop_copy_length = stop_copy_length;
	if (copy_to_user(arg, &data_size, sizeof(data_size)))
		return -EFAULT;

	return 0;
}

static int vfio_ioctl_device_feature_migration(struct vfio_device *device,
					       u32 flags, void __user *arg,
					       size_t argsz)
{
	struct vfio_device_feature_migration mig = {
		.flags = device->migration_flags,
	};
	int ret;

	if (!device->mig_ops)
		return -ENOTTY;

	ret = vfio_check_feature(flags, argsz, VFIO_DEVICE_FEATURE_GET,
				 sizeof(mig));
	if (ret != 1)
		return ret;
	if (copy_to_user(arg, &mig, sizeof(mig)))
		return -EFAULT;
	return 0;
}

/* Ranges should fit into a single kernel page */
#define LOG_MAX_RANGES \
	(PAGE_SIZE / sizeof(struct vfio_device_feature_dma_logging_range))

static int
vfio_ioctl_device_feature_logging_start(struct vfio_device *device,
					u32 flags, void __user *arg,
					size_t argsz)
{
	size_t minsz =
		offsetofend(struct vfio_device_feature_dma_logging_control,
			    ranges);
	struct vfio_device_feature_dma_logging_range __user *ranges;
	struct vfio_device_feature_dma_logging_control control;
	struct vfio_device_feature_dma_logging_range range;
	struct rb_root_cached root = RB_ROOT_CACHED;
	struct interval_tree_node *nodes;
	u64 iova_end;
	u32 nnodes;
	int i, ret;

	if (!device->log_ops)
		return -ENOTTY;

	ret = vfio_check_feature(flags, argsz,
				 VFIO_DEVICE_FEATURE_SET,
				 sizeof(control));
	if (ret != 1)
		return ret;

	if (copy_from_user(&control, arg, minsz))
		return -EFAULT;

	nnodes = control.num_ranges;
	if (!nnodes)
		return -EINVAL;

	if (nnodes > LOG_MAX_RANGES)
		return -E2BIG;

	ranges = u64_to_user_ptr(control.ranges);
	nodes = kmalloc_array(nnodes, sizeof(struct interval_tree_node),
			      GFP_KERNEL);
	if (!nodes)
		return -ENOMEM;

	for (i = 0; i < nnodes; i++) {
		if (copy_from_user(&range, &ranges[i], sizeof(range))) {
			ret = -EFAULT;
			goto end;
		}
		if (!IS_ALIGNED(range.iova, control.page_size) ||
		    !IS_ALIGNED(range.length, control.page_size)) {
			ret = -EINVAL;
			goto end;
		}

		if (check_add_overflow(range.iova, range.length, &iova_end) ||
		    iova_end > ULONG_MAX) {
			ret = -EOVERFLOW;
			goto end;
		}

		nodes[i].start = range.iova;
		nodes[i].last = range.iova + range.length - 1;
		if (interval_tree_iter_first(&root, nodes[i].start,
					     nodes[i].last)) {
			/* Range overlapping */
			ret = -EINVAL;
			goto end;
		}
		interval_tree_insert(nodes + i, &root);
	}

	ret = device->log_ops->log_start(device, &root, nnodes,
					 &control.page_size);
	if (ret)
		goto end;

	if (copy_to_user(arg, &control, sizeof(control))) {
		ret = -EFAULT;
		device->log_ops->log_stop(device);
	}

end:
	kfree(nodes);
	return ret;
}

static int
vfio_ioctl_device_feature_logging_stop(struct vfio_device *device,
				       u32 flags, void __user *arg,
				       size_t argsz)
{
	int ret;

	if (!device->log_ops)
		return -ENOTTY;

	ret = vfio_check_feature(flags, argsz,
				 VFIO_DEVICE_FEATURE_SET, 0);
	if (ret != 1)
		return ret;

	return device->log_ops->log_stop(device);
}

static int vfio_device_log_read_and_clear(struct iova_bitmap *iter,
					  unsigned long iova, size_t length,
					  void *opaque)
{
	struct vfio_device *device = opaque;

	return device->log_ops->log_read_and_clear(device, iova, length, iter);
}

static int
vfio_ioctl_device_feature_logging_report(struct vfio_device *device,
					 u32 flags, void __user *arg,
					 size_t argsz)
{
	size_t minsz =
		offsetofend(struct vfio_device_feature_dma_logging_report,
			    bitmap);
	struct vfio_device_feature_dma_logging_report report;
	struct iova_bitmap *iter;
	u64 iova_end;
	int ret;

	if (!device->log_ops)
		return -ENOTTY;

	ret = vfio_check_feature(flags, argsz,
				 VFIO_DEVICE_FEATURE_GET,
				 sizeof(report));
	if (ret != 1)
		return ret;

	if (copy_from_user(&report, arg, minsz))
		return -EFAULT;

	if (report.page_size < SZ_4K || !is_power_of_2(report.page_size))
		return -EINVAL;

	if (check_add_overflow(report.iova, report.length, &iova_end) ||
	    iova_end > ULONG_MAX)
		return -EOVERFLOW;

	iter = iova_bitmap_alloc(report.iova, report.length,
				 report.page_size,
				 u64_to_user_ptr(report.bitmap));
	if (IS_ERR(iter))
		return PTR_ERR(iter);

	ret = iova_bitmap_for_each(iter, device,
				   vfio_device_log_read_and_clear);

	iova_bitmap_free(iter);
	return ret;
}

static int vfio_ioctl_device_feature(struct vfio_device *device,
				     struct vfio_device_feature __user *arg)
{
	size_t minsz = offsetofend(struct vfio_device_feature, flags);
	struct vfio_device_feature feature;

	if (copy_from_user(&feature, arg, minsz))
		return -EFAULT;

	if (feature.argsz < minsz)
		return -EINVAL;

	/* Check unknown flags */
	if (feature.flags &
	    ~(VFIO_DEVICE_FEATURE_MASK | VFIO_DEVICE_FEATURE_SET |
	      VFIO_DEVICE_FEATURE_GET | VFIO_DEVICE_FEATURE_PROBE))
		return -EINVAL;

	/* GET & SET are mutually exclusive except with PROBE */
	if (!(feature.flags & VFIO_DEVICE_FEATURE_PROBE) &&
	    (feature.flags & VFIO_DEVICE_FEATURE_SET) &&
	    (feature.flags & VFIO_DEVICE_FEATURE_GET))
		return -EINVAL;

	switch (feature.flags & VFIO_DEVICE_FEATURE_MASK) {
	case VFIO_DEVICE_FEATURE_MIGRATION:
		return vfio_ioctl_device_feature_migration(
			device, feature.flags, arg->data,
			feature.argsz - minsz);
	case VFIO_DEVICE_FEATURE_MIG_DEVICE_STATE:
		return vfio_ioctl_device_feature_mig_device_state(
			device, feature.flags, arg->data,
			feature.argsz - minsz);
	case VFIO_DEVICE_FEATURE_DMA_LOGGING_START:
		return vfio_ioctl_device_feature_logging_start(
			device, feature.flags, arg->data,
			feature.argsz - minsz);
	case VFIO_DEVICE_FEATURE_DMA_LOGGING_STOP:
		return vfio_ioctl_device_feature_logging_stop(
			device, feature.flags, arg->data,
			feature.argsz - minsz);
	case VFIO_DEVICE_FEATURE_DMA_LOGGING_REPORT:
		return vfio_ioctl_device_feature_logging_report(
			device, feature.flags, arg->data,
			feature.argsz - minsz);
	case VFIO_DEVICE_FEATURE_MIG_DATA_SIZE:
		return vfio_ioctl_device_feature_migration_data_size(
			device, feature.flags, arg->data,
			feature.argsz - minsz);
	default:
		if (unlikely(!device->ops->device_feature))
			return -EINVAL;
		return device->ops->device_feature(device, feature.flags,
						   arg->data,
						   feature.argsz - minsz);
	}
}
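/*
 * Illustrative userspace sketch (not part of this file): querying the
 * migration capability through VFIO_DEVICE_FEATURE.  The variable-length
 * payload follows the header, so argsz covers both; a userspace program
 * would need <sys/ioctl.h>, <errno.h> and <linux/vfio.h>.  The function
 * name is hypothetical.
 */
#if 0
static int example_query_migration(int device_fd, __u64 *mig_flags)
{
	__u8 buf[sizeof(struct vfio_device_feature) +
		 sizeof(struct vfio_device_feature_migration)] = {};
	struct vfio_device_feature *feature = (void *)buf;
	struct vfio_device_feature_migration *mig = (void *)feature->data;

	feature->argsz = sizeof(buf);
	feature->flags = VFIO_DEVICE_FEATURE_GET |
			 VFIO_DEVICE_FEATURE_MIGRATION;
	if (ioctl(device_fd, VFIO_DEVICE_FEATURE, feature))
		return -errno;
	*mig_flags = mig->flags;
	return 0;
}
#endif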

static long vfio_device_fops_unl_ioctl(struct file *filep,
				       unsigned int cmd, unsigned long arg)
{
	struct vfio_device *device = filep->private_data;
	int ret;

	ret = vfio_device_pm_runtime_get(device);
	if (ret)
		return ret;

	switch (cmd) {
	case VFIO_DEVICE_FEATURE:
		ret = vfio_ioctl_device_feature(device, (void __user *)arg);
		break;

	default:
		if (unlikely(!device->ops->ioctl))
			ret = -EINVAL;
		else
			ret = device->ops->ioctl(device, cmd, arg);
		break;
	}

	vfio_device_pm_runtime_put(device);
	return ret;
}

static ssize_t vfio_device_fops_read(struct file *filep, char __user *buf,
				     size_t count, loff_t *ppos)
{
	struct vfio_device *device = filep->private_data;

	if (unlikely(!device->ops->read))
		return -EINVAL;

	return device->ops->read(device, buf, count, ppos);
}

static ssize_t vfio_device_fops_write(struct file *filep,
				      const char __user *buf,
				      size_t count, loff_t *ppos)
{
	struct vfio_device *device = filep->private_data;

	if (unlikely(!device->ops->write))
		return -EINVAL;

	return device->ops->write(device, buf, count, ppos);
}

static int vfio_device_fops_mmap(struct file *filep, struct vm_area_struct *vma)
{
	struct vfio_device *device = filep->private_data;

	if (unlikely(!device->ops->mmap))
		return -EINVAL;

	return device->ops->mmap(device, vma);
}

const struct file_operations vfio_device_fops = {
	.owner = THIS_MODULE,
	.release = vfio_device_fops_release,
	.read = vfio_device_fops_read,
	.write = vfio_device_fops_write,
	.unlocked_ioctl = vfio_device_fops_unl_ioctl,
	.compat_ioctl = compat_ptr_ioctl,
	.mmap = vfio_device_fops_mmap,
};

/*
 * Sub-module support
 */
/*
 * Helper for managing a buffer of info chain capabilities: allocates or
 * reallocates a buffer with additional @size, filling in @id and @version
 * of the capability.  A pointer to the new capability is returned.
 *
 * NB. The chain is based at the head of the buffer, so new entries are
 * added to the tail; vfio_info_cap_shift() should be called to fix up the
 * next offsets prior to copying to the user buffer.
 */
struct vfio_info_cap_header *vfio_info_cap_add(struct vfio_info_cap *caps,
					       size_t size, u16 id, u16 version)
{
	void *buf;
	struct vfio_info_cap_header *header, *tmp;

	buf = krealloc(caps->buf, caps->size + size, GFP_KERNEL);
	if (!buf) {
		kfree(caps->buf);
		caps->buf = NULL;
		caps->size = 0;
		return ERR_PTR(-ENOMEM);
	}

	caps->buf = buf;
	header = buf + caps->size;

	/* Eventually copied to user buffer, zero */
	memset(header, 0, size);

	header->id = id;
	header->version = version;

	/* Add to the end of the capability chain */
	for (tmp = buf; tmp->next; tmp = buf + tmp->next)
		; /* nothing */

	tmp->next = caps->size;
	caps->size += size;

	return header;
}
EXPORT_SYMBOL_GPL(vfio_info_cap_add);

void vfio_info_cap_shift(struct vfio_info_cap *caps, size_t offset)
{
	struct vfio_info_cap_header *tmp;
	void *buf = (void *)caps->buf;

	for (tmp = buf; tmp->next; tmp = buf + tmp->next - offset)
		tmp->next += offset;
}
EXPORT_SYMBOL(vfio_info_cap_shift);

int vfio_info_add_capability(struct vfio_info_cap *caps,
			     struct vfio_info_cap_header *cap, size_t size)
{
	struct vfio_info_cap_header *header;

	header = vfio_info_cap_add(caps, size, cap->id, cap->version);
	if (IS_ERR(header))
		return PTR_ERR(header);

	memcpy(header + 1, cap + 1, size - sizeof(*header));

	return 0;
}
EXPORT_SYMBOL(vfio_info_add_capability);
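/*
 * Illustrative sketch (not part of this file): how *_INFO ioctl handlers
 * typically consume this API - build the chain, then shift the embedded
 * next offsets so they are relative to the start of the user buffer before
 * copying out.  The helper below and its parameters are hypothetical and
 * omit the argsz handling a real handler would do.
 */
#if 0
static int example_fill_info_caps(struct vfio_region_info *info,
				  struct vfio_info_cap *caps,
				  void __user *arg)
{
	int ret = 0;

	if (caps->size) {
		info->flags |= VFIO_REGION_INFO_FLAG_CAPS;
		info->cap_offset = sizeof(*info);
		vfio_info_cap_shift(caps, sizeof(*info));
		if (copy_to_user(arg + sizeof(*info), caps->buf, caps->size))
			ret = -EFAULT;
		kfree(caps->buf);
	}
	return ret;
}
#endif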

int vfio_set_irqs_validate_and_prepare(struct vfio_irq_set *hdr, int num_irqs,
				       int max_irq_type, size_t *data_size)
{
	unsigned long minsz;
	size_t size;

	minsz = offsetofend(struct vfio_irq_set, count);

	if ((hdr->argsz < minsz) || (hdr->index >= max_irq_type) ||
	    (hdr->count >= (U32_MAX - hdr->start)) ||
	    (hdr->flags & ~(VFIO_IRQ_SET_DATA_TYPE_MASK |
			    VFIO_IRQ_SET_ACTION_TYPE_MASK)))
		return -EINVAL;

	if (data_size)
		*data_size = 0;

	if (hdr->start >= num_irqs || hdr->start + hdr->count > num_irqs)
		return -EINVAL;

	switch (hdr->flags & VFIO_IRQ_SET_DATA_TYPE_MASK) {
	case VFIO_IRQ_SET_DATA_NONE:
		size = 0;
		break;
	case VFIO_IRQ_SET_DATA_BOOL:
		size = sizeof(uint8_t);
		break;
	case VFIO_IRQ_SET_DATA_EVENTFD:
		size = sizeof(int32_t);
		break;
	default:
		return -EINVAL;
	}

	if (size) {
		if (hdr->argsz - minsz < hdr->count * size)
			return -EINVAL;

		if (!data_size)
			return -EINVAL;

		*data_size = hdr->count * size;
	}

	return 0;
}
EXPORT_SYMBOL(vfio_set_irqs_validate_and_prepare);
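/*
 * Illustrative sketch (not part of this file): a driver's
 * VFIO_DEVICE_SET_IRQS path validating the header before copying in the
 * variable-length payload, in the style of vfio-pci.  The function and
 * parameter names are hypothetical.
 */
#if 0
static int example_set_irqs(struct vfio_device *vdev, unsigned int num_irqs,
			    struct vfio_irq_set *hdr, void __user *udata)
{
	size_t data_size = 0;
	u8 *data = NULL;
	int ret;

	ret = vfio_set_irqs_validate_and_prepare(hdr, num_irqs,
						 VFIO_PCI_NUM_IRQS,
						 &data_size);
	if (ret)
		return ret;

	if (data_size) {
		data = memdup_user(udata, data_size);
		if (IS_ERR(data))
			return PTR_ERR(data);
	}
	/* ...program the device IRQs from hdr and data... */
	kfree(data);
	return 0;
}
#endif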

/*
 * Pin contiguous user pages and return their associated host pages for local
 * domain only.
 * @device [in]  : device
 * @iova [in]    : starting IOVA of user pages to be pinned.
 * @npage [in]   : count of pages to be pinned.  This count should not
 *		   be greater than VFIO_PIN_PAGES_MAX_ENTRIES.
 * @prot [in]    : protection flags
 * @pages[out]   : array of host pages
 * Return error or number of pages pinned.
 *
 * A driver may only call this function if the vfio_device was created
 * by vfio_register_emulated_iommu_dev() due to vfio_device_container_pin_pages().
 */
int vfio_pin_pages(struct vfio_device *device, dma_addr_t iova,
		   int npage, int prot, struct page **pages)
{
	/* group->container cannot change while a vfio device is open */
	if (!pages || !npage || WARN_ON(!vfio_assert_device_open(device)))
		return -EINVAL;
	if (vfio_device_has_container(device))
		return vfio_device_container_pin_pages(device, iova,
						       npage, prot, pages);
	if (device->iommufd_access) {
		int ret;

		if (iova > ULONG_MAX)
			return -EINVAL;
		/*
		 * VFIO ignores the sub page offset, npages is from the start of
		 * a PAGE_SIZE chunk of IOVA. The caller is expected to recover
		 * the sub page offset by doing:
		 *     pages[0] + (iova % PAGE_SIZE)
		 */
		ret = iommufd_access_pin_pages(
			device->iommufd_access, ALIGN_DOWN(iova, PAGE_SIZE),
			npage * PAGE_SIZE, pages,
			(prot & IOMMU_WRITE) ? IOMMUFD_ACCESS_RW_WRITE : 0);
		if (ret)
			return ret;
		return npage;
	}
	return -EINVAL;
}
EXPORT_SYMBOL(vfio_pin_pages);
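/*
 * Illustrative sketch (not part of this file): an emulated-IOMMU (mdev
 * style) driver pinning a single guest page for CPU access and unpinning
 * it afterwards.  The function name is hypothetical.
 */
#if 0
static int example_touch_guest_page(struct vfio_device *vdev,
				    dma_addr_t iova)
{
	struct page *page;
	int ret;

	ret = vfio_pin_pages(vdev, iova, 1, IOMMU_READ | IOMMU_WRITE, &page);
	if (ret != 1)
		return ret < 0 ? ret : -EFAULT;

	/* ...access page, recovering the iova % PAGE_SIZE sub-page offset... */

	vfio_unpin_pages(vdev, iova, 1);
	return 0;
}
#endif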

/*
 * Unpin contiguous host pages for local domain only.
 * @device [in]  : device
 * @iova [in]    : starting address of user pages to be unpinned.
 * @npage [in]   : count of pages to be unpinned.  This count should not
 *		   be greater than VFIO_PIN_PAGES_MAX_ENTRIES.
 */
void vfio_unpin_pages(struct vfio_device *device, dma_addr_t iova, int npage)
{
	if (WARN_ON(!vfio_assert_device_open(device)))
		return;

	if (vfio_device_has_container(device)) {
		vfio_device_container_unpin_pages(device, iova, npage);
		return;
	}
	if (device->iommufd_access) {
		if (WARN_ON(iova > ULONG_MAX))
			return;
		iommufd_access_unpin_pages(device->iommufd_access,
					   ALIGN_DOWN(iova, PAGE_SIZE),
					   npage * PAGE_SIZE);
		return;
	}
}
EXPORT_SYMBOL(vfio_unpin_pages);

/*
 * This interface allows the CPUs to perform some sort of virtual DMA on
 * behalf of the device.
 *
 * CPUs read/write from/into a range of IOVAs pointing to user space memory
 * into/from a kernel buffer.
 *
 * As the read/write of user space memory is conducted via the CPUs and is
 * not a real device DMA, it is not necessary to pin the user space memory.
 *
 * @device [in]  : VFIO device
 * @iova [in]    : base IOVA of a user space buffer
 * @data [in]    : pointer to kernel buffer
 * @len [in]     : kernel buffer length
 * @write        : indicate read or write
 * Return error code on failure or 0 on success.
 */
int vfio_dma_rw(struct vfio_device *device, dma_addr_t iova, void *data,
		size_t len, bool write)
{
	if (!data || len <= 0 || !vfio_assert_device_open(device))
		return -EINVAL;

	if (vfio_device_has_container(device))
		return vfio_device_container_dma_rw(device, iova,
						    data, len, write);

	if (device->iommufd_access) {
		unsigned int flags = 0;

		if (iova > ULONG_MAX)
			return -EINVAL;

		/* VFIO historically tries to auto-detect a kthread */
		if (!current->mm)
			flags |= IOMMUFD_ACCESS_RW_KTHREAD;
		if (write)
			flags |= IOMMUFD_ACCESS_RW_WRITE;
		return iommufd_access_rw(device->iommufd_access, iova, data,
					 len, flags);
	}
	return -EINVAL;
}
EXPORT_SYMBOL(vfio_dma_rw);
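/*
 * Illustrative sketch (not part of this file): reading a small descriptor
 * from the device's IOVA space into a kernel buffer, as mediated drivers
 * do in place of real DMA.  The function name is hypothetical.
 */
#if 0
static int example_read_descriptor(struct vfio_device *vdev, dma_addr_t iova,
				   void *desc, size_t len)
{
	/* write=false: copy from the IOVA range into the kernel buffer */
	return vfio_dma_rw(vdev, iova, desc, len, false);
}
#endif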

/*
 * Module/class support
 */
static int __init vfio_init(void)
{
	int ret;

	ida_init(&vfio.device_ida);

	ret = vfio_group_init();
	if (ret)
		return ret;

	ret = vfio_virqfd_init();
	if (ret)
		goto err_virqfd;

	/* /sys/class/vfio-dev/vfioX */
	vfio.device_class = class_create(THIS_MODULE, "vfio-dev");
	if (IS_ERR(vfio.device_class)) {
		ret = PTR_ERR(vfio.device_class);
		goto err_dev_class;
	}

	pr_info(DRIVER_DESC " version: " DRIVER_VERSION "\n");
	return 0;

err_dev_class:
	vfio_virqfd_exit();
err_virqfd:
	vfio_group_cleanup();
	return ret;
}

static void __exit vfio_cleanup(void)
{
	ida_destroy(&vfio.device_ida);
	class_destroy(vfio.device_class);
	vfio.device_class = NULL;
	vfio_virqfd_exit();
	vfio_group_cleanup();
	xa_destroy(&vfio_device_set_xa);
}

module_init(vfio_init);
module_exit(vfio_cleanup);

MODULE_VERSION(DRIVER_VERSION);
MODULE_LICENSE("GPL v2");
MODULE_AUTHOR(DRIVER_AUTHOR);
MODULE_DESCRIPTION(DRIVER_DESC);
MODULE_SOFTDEP("post: vfio_iommu_type1 vfio_iommu_spapr_tce");