drivers/vfio/vfio_main.c
1// SPDX-License-Identifier: GPL-2.0-only
2/*
3 * VFIO core
4 *
5 * Copyright (C) 2012 Red Hat, Inc. All rights reserved.
6 * Author: Alex Williamson <alex.williamson@redhat.com>
7 *
8 * Derived from original vfio:
9 * Copyright 2010 Cisco Systems, Inc. All rights reserved.
10 * Author: Tom Lyon, pugs@cisco.com
11 */
12
13#include <linux/cdev.h>
14#include <linux/compat.h>
15#include <linux/device.h>
16#include <linux/fs.h>
17#include <linux/idr.h>
18#include <linux/iommu.h>
19#ifdef CONFIG_HAVE_KVM
20#include <linux/kvm_host.h>
21#endif
22#include <linux/list.h>
23#include <linux/miscdevice.h>
24#include <linux/module.h>
25#include <linux/mutex.h>
26#include <linux/pci.h>
27#include <linux/rwsem.h>
28#include <linux/sched.h>
29#include <linux/slab.h>
30#include <linux/stat.h>
31#include <linux/string.h>
32#include <linux/uaccess.h>
33#include <linux/vfio.h>
34#include <linux/wait.h>
35#include <linux/sched/signal.h>
36#include <linux/pm_runtime.h>
37#include <linux/interval_tree.h>
38#include <linux/iova_bitmap.h>
39#include <linux/iommufd.h>
40#include "vfio.h"
41
42#define DRIVER_VERSION "0.3"
43#define DRIVER_AUTHOR "Alex Williamson <alex.williamson@redhat.com>"
44#define DRIVER_DESC "VFIO - User Level meta-driver"
45
46static struct vfio {
47 struct class *device_class;
48 struct ida device_ida;
49} vfio;
50
51#ifdef CONFIG_VFIO_NOIOMMU
52bool vfio_noiommu __read_mostly;
53module_param_named(enable_unsafe_noiommu_mode,
54 vfio_noiommu, bool, S_IRUGO | S_IWUSR);
55MODULE_PARM_DESC(enable_unsafe_noiommu_mode, "Enable UNSAFE, no-IOMMU mode. This mode provides no device isolation, no DMA translation, no host kernel protection, cannot be used for device assignment to virtual machines, requires RAWIO permissions, and will taint the kernel. If you do not know what this is for, step away. (default: false)");
56#endif
57
58static DEFINE_XARRAY(vfio_device_set_xa);
59
60int vfio_assign_device_set(struct vfio_device *device, void *set_id)
61{
62 unsigned long idx = (unsigned long)set_id;
63 struct vfio_device_set *new_dev_set;
64 struct vfio_device_set *dev_set;
65
66 if (WARN_ON(!set_id))
67 return -EINVAL;
68
69 /*
70 * Atomically acquire a singleton object in the xarray for this set_id
71 */
72 xa_lock(&vfio_device_set_xa);
73 dev_set = xa_load(&vfio_device_set_xa, idx);
74 if (dev_set)
75 goto found_get_ref;
76 xa_unlock(&vfio_device_set_xa);
77
78 new_dev_set = kzalloc(sizeof(*new_dev_set), GFP_KERNEL);
79 if (!new_dev_set)
80 return -ENOMEM;
81 mutex_init(&new_dev_set->lock);
82 INIT_LIST_HEAD(&new_dev_set->device_list);
83 new_dev_set->set_id = set_id;
84
85 xa_lock(&vfio_device_set_xa);
86 dev_set = __xa_cmpxchg(&vfio_device_set_xa, idx, NULL, new_dev_set,
87 GFP_KERNEL);
88 if (!dev_set) {
89 dev_set = new_dev_set;
90 goto found_get_ref;
91 }
92
93 kfree(new_dev_set);
94 if (xa_is_err(dev_set)) {
95 xa_unlock(&vfio_device_set_xa);
96 return xa_err(dev_set);
97 }
98
99found_get_ref:
100 dev_set->device_count++;
101 xa_unlock(&vfio_device_set_xa);
102 mutex_lock(&dev_set->lock);
103 device->dev_set = dev_set;
104 list_add_tail(&device->dev_set_list, &dev_set->device_list);
105 mutex_unlock(&dev_set->lock);
106 return 0;
107}
108EXPORT_SYMBOL_GPL(vfio_assign_device_set);
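/*
 * Illustrative sketch, not from the original file: a driver whose
 * functions must be handled as a single reset group can share one dev_set
 * by passing a common pointer as set_id.  "struct my_vdev", "my" and the
 * use of pci_physfn() as the shared key are assumptions for this example.
 */
static int my_register_example(struct my_vdev *my, struct pci_dev *pdev)
{
	int ret;

	/* All functions of the same PF land in one vfio_device_set */
	ret = vfio_assign_device_set(&my->vdev, pci_physfn(pdev));
	if (ret)
		return ret;
	return vfio_register_group_dev(&my->vdev);
}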
109
110static void vfio_release_device_set(struct vfio_device *device)
111{
112 struct vfio_device_set *dev_set = device->dev_set;
113
114 if (!dev_set)
115 return;
116
117 mutex_lock(&dev_set->lock);
118 list_del(&device->dev_set_list);
119 mutex_unlock(&dev_set->lock);
120
121 xa_lock(&vfio_device_set_xa);
122 if (!--dev_set->device_count) {
123 __xa_erase(&vfio_device_set_xa,
124 (unsigned long)dev_set->set_id);
125 mutex_destroy(&dev_set->lock);
126 kfree(dev_set);
127 }
128 xa_unlock(&vfio_device_set_xa);
129}
130
131unsigned int vfio_device_set_open_count(struct vfio_device_set *dev_set)
132{
133 struct vfio_device *cur;
134 unsigned int open_count = 0;
135
136 lockdep_assert_held(&dev_set->lock);
137
138 list_for_each_entry(cur, &dev_set->device_list, dev_set_list)
139 open_count += cur->open_count;
140 return open_count;
141}
142EXPORT_SYMBOL_GPL(vfio_device_set_open_count);
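/*
 * Illustrative sketch, not from the original file: a typical caller is a
 * dev_set-wide teardown or reset path that must not act while a sibling
 * device is still open.  The caller below is assumed to hold one open
 * count itself; "my_do_reset" is hypothetical.
 */
static int my_try_reset_example(struct vfio_device *device)
{
	struct vfio_device_set *dev_set = device->dev_set;

	lockdep_assert_held(&dev_set->lock);

	/* More than our own open count means another device is in use */
	if (vfio_device_set_open_count(dev_set) > 1)
		return -EBUSY;
	return my_do_reset(dev_set);
}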
143
144/*
145 * Device objects - create, release, get, put, search
146 */
147/* Device reference always implies a group reference */
148void vfio_device_put_registration(struct vfio_device *device)
149{
150 if (refcount_dec_and_test(&device->refcount))
151 complete(&device->comp);
152}
153
154bool vfio_device_try_get_registration(struct vfio_device *device)
155{
156 return refcount_inc_not_zero(&device->refcount);
157}
158
159/*
160 * VFIO driver API
161 */
162/* Release helper called by vfio_put_device() */
163static void vfio_device_release(struct device *dev)
164{
165 struct vfio_device *device =
166 container_of(dev, struct vfio_device, device);
167
168 vfio_release_device_set(device);
169 ida_free(&vfio.device_ida, device->index);
170
171 if (device->ops->release)
172 device->ops->release(device);
173
174 kvfree(device);
175}
176
177static int vfio_init_device(struct vfio_device *device, struct device *dev,
178 const struct vfio_device_ops *ops);
179
180/*
181 * Allocate and initialize vfio_device so it can be registered to vfio
182 * core.
183 *
184 * Drivers should use the wrapper vfio_alloc_device() for allocation.
185 * @size is the size of the structure to be allocated, including any
186 * private data used by the driver.
187 *
188 * Driver may provide an @init callback to cover device private data.
189 *
190 * Use vfio_put_device() to release the structure after success return.
191 */
192struct vfio_device *_vfio_alloc_device(size_t size, struct device *dev,
193 const struct vfio_device_ops *ops)
194{
195 struct vfio_device *device;
196 int ret;
197
198 if (WARN_ON(size < sizeof(struct vfio_device)))
199 return ERR_PTR(-EINVAL);
200
201 device = kvzalloc(size, GFP_KERNEL);
202 if (!device)
203 return ERR_PTR(-ENOMEM);
204
205 ret = vfio_init_device(device, dev, ops);
206 if (ret)
207 goto out_free;
208 return device;
209
210out_free:
211 kvfree(device);
212 return ERR_PTR(ret);
213}
214EXPORT_SYMBOL_GPL(_vfio_alloc_device);
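/*
 * Illustrative sketch, not from the original file: drivers normally use
 * the vfio_alloc_device() wrapper so their private state and the embedded
 * struct vfio_device are allocated and initialized together.  "struct
 * my_state" and "my_ops" are hypothetical.
 */
struct my_state {
	struct vfio_device vdev;	/* embedded vfio_device */
	void __iomem *regs;		/* driver private data follows */
};

static struct my_state *my_alloc_example(struct device *dev)
{
	/* Returns the containing my_state or an ERR_PTR on failure */
	return vfio_alloc_device(my_state, vdev, dev, &my_ops);
}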
215
216/*
217 * Initialize a vfio_device so it can be registered to vfio core.
218 */
219static int vfio_init_device(struct vfio_device *device, struct device *dev,
220 const struct vfio_device_ops *ops)
221{
222 int ret;
223
224 ret = ida_alloc_max(&vfio.device_ida, MINORMASK, GFP_KERNEL);
225 if (ret < 0) {
226 dev_dbg(dev, "Error to alloc index\n");
227 return ret;
228 }
229
230 device->index = ret;
231 init_completion(&device->comp);
232 device->dev = dev;
233 device->ops = ops;
234
235 if (ops->init) {
236 ret = ops->init(device);
237 if (ret)
238 goto out_uninit;
239 }
240
241 device_initialize(&device->device);
242 device->device.release = vfio_device_release;
243 device->device.class = vfio.device_class;
244 device->device.parent = device->dev;
245 return 0;
246
247out_uninit:
248 vfio_release_device_set(device);
249 ida_free(&vfio.device_ida, device->index);
250 return ret;
251}
252
253static int __vfio_register_dev(struct vfio_device *device,
254 enum vfio_group_type type)
255{
256 int ret;
257
258 if (WARN_ON(device->ops->bind_iommufd &&
259 (!device->ops->unbind_iommufd ||
260 !device->ops->attach_ioas)))
261 return -EINVAL;
262
263 /*
264 * If the driver doesn't specify a set then the device is added to a
265 * singleton set just for itself.
266 */
267 if (!device->dev_set)
268 vfio_assign_device_set(device, device);
269
270 ret = dev_set_name(&device->device, "vfio%d", device->index);
271 if (ret)
272 return ret;
273
274 ret = vfio_device_set_group(device, type);
275 if (ret)
276 return ret;
277
278 ret = device_add(&device->device);
279 if (ret)
280 goto err_out;
281
282 /* Refcounting can't start until the driver calls register */
283 refcount_set(&device->refcount, 1);
284
285 vfio_device_group_register(device);
286
287 return 0;
288err_out:
289 vfio_device_remove_group(device);
290 return ret;
291}
292
293int vfio_register_group_dev(struct vfio_device *device)
294{
295 return __vfio_register_dev(device, VFIO_IOMMU);
296}
297EXPORT_SYMBOL_GPL(vfio_register_group_dev);
298
299/*
300 * Register a virtual device without IOMMU backing. The user of this
301 * device must not be able to directly trigger unmediated DMA.
302 */
303int vfio_register_emulated_iommu_dev(struct vfio_device *device)
304{
305 return __vfio_register_dev(device, VFIO_EMULATED_IOMMU);
306}
307EXPORT_SYMBOL_GPL(vfio_register_emulated_iommu_dev);
308
309/*
310 * Decrement the device reference count and wait for the device to be
311 * removed. Open file descriptors for the device... */
312void vfio_unregister_group_dev(struct vfio_device *device)
313{
314 unsigned int i = 0;
315 bool interrupted = false;
316 long rc;
317
318 vfio_device_put_registration(device);
319 rc = try_wait_for_completion(&device->comp);
320 while (rc <= 0) {
321 if (device->ops->request)
322 device->ops->request(device, i++);
323
324 if (interrupted) {
325 rc = wait_for_completion_timeout(&device->comp,
326 HZ * 10);
327 } else {
328 rc = wait_for_completion_interruptible_timeout(
329 &device->comp, HZ * 10);
330 if (rc < 0) {
331 interrupted = true;
332 dev_warn(device->dev,
333 "Device is currently in use, task"
334 " \"%s\" (%d) "
335 "blocked until device is released",
336 current->comm, task_pid_nr(current));
337 }
338 }
339 }
340
341 vfio_device_group_unregister(device);
342
343 /* Balances device_add in register path */
344 device_del(&device->device);
345
346 /* Balances vfio_device_set_group in register path */
347 vfio_device_remove_group(device);
348}
349EXPORT_SYMBOL_GPL(vfio_unregister_group_dev);
350
351#ifdef CONFIG_HAVE_KVM
352void _vfio_device_get_kvm_safe(struct vfio_device *device, struct kvm *kvm)
353{
354 void (*pfn)(struct kvm *kvm);
355 bool (*fn)(struct kvm *kvm);
356 bool ret;
357
358 lockdep_assert_held(&device->dev_set->lock);
359
360 pfn = symbol_get(kvm_put_kvm);
361 if (WARN_ON(!pfn))
362 return;
363
364 fn = symbol_get(kvm_get_kvm_safe);
365 if (WARN_ON(!fn)) {
366 symbol_put(kvm_put_kvm);
367 return;
368 }
369
370 ret = fn(kvm);
371 symbol_put(kvm_get_kvm_safe);
372 if (!ret) {
373 symbol_put(kvm_put_kvm);
374 return;
375 }
376
377 device->put_kvm = pfn;
378 device->kvm = kvm;
379}
380
381void vfio_device_put_kvm(struct vfio_device *device)
382{
383 lockdep_assert_held(&device->dev_set->lock);
384
385 if (!device->kvm)
386 return;
387
388 if (WARN_ON(!device->put_kvm))
389 goto clear;
390
391 device->put_kvm(device->kvm);
392 device->put_kvm = NULL;
393 symbol_put(kvm_put_kvm);
394
395clear:
396 device->kvm = NULL;
397}
398#endif
399
400/* true if the vfio_device has open_device() called but not close_device() */
401static bool vfio_assert_device_open(struct vfio_device *device)
402{
403 return !WARN_ON_ONCE(!READ_ONCE(device->open_count));
404}
405
406static int vfio_device_first_open(struct vfio_device *device,
407 struct iommufd_ctx *iommufd)
408{
409 int ret;
410
411 lockdep_assert_held(&device->dev_set->lock);
412
413 if (!try_module_get(device->dev->driver->owner))
414 return -ENODEV;
415
416 if (iommufd)
417 ret = vfio_iommufd_bind(device, iommufd);
418 else
419 ret = vfio_device_group_use_iommu(device);
420 if (ret)
421 goto err_module_put;
422
423 if (device->ops->open_device) {
424 ret = device->ops->open_device(device);
425 if (ret)
426 goto err_unuse_iommu;
427 }
428 return 0;
429
430err_unuse_iommu:
431 if (iommufd)
432 vfio_iommufd_unbind(device);
433 else
434 vfio_device_group_unuse_iommu(device);
435err_module_put:
436 module_put(device->dev->driver->owner);
437 return ret;
438}
439
440static void vfio_device_last_close(struct vfio_device *device,
441 struct iommufd_ctx *iommufd)
442{
443 lockdep_assert_held(&device->dev_set->lock);
444
445 if (device->ops->close_device)
446 device->ops->close_device(device);
447 if (iommufd)
448 vfio_iommufd_unbind(device);
449 else
450 vfio_device_group_unuse_iommu(device);
451 module_put(device->dev->driver->owner);
452}
453
454int vfio_device_open(struct vfio_device *device, struct iommufd_ctx *iommufd)
455{
456 int ret = 0;
457
458 lockdep_assert_held(&device->dev_set->lock);
459
460 device->open_count++;
461 if (device->open_count == 1) {
462 ret = vfio_device_first_open(device, iommufd);
463 if (ret)
464 device->open_count--;
465 }
466
467 return ret;
468}
469
470void vfio_device_close(struct vfio_device *device,
471 struct iommufd_ctx *iommufd)
472{
473 lockdep_assert_held(&device->dev_set->lock);
474
475 vfio_assert_device_open(device);
476 if (device->open_count == 1)
477 vfio_device_last_close(device, iommufd);
478 device->open_count--;
479}
480
481/*
482 * Wrapper around pm_runtime_resume_and_get().
483 * Return error code on failure or 0 on success.
484 */
485static inline int vfio_device_pm_runtime_get(struct vfio_device *device)
486{
487 struct device *dev = device->dev;
488
489 if (dev->driver && dev->driver->pm) {
490 int ret;
491
492 ret = pm_runtime_resume_and_get(dev);
493 if (ret) {
494 dev_info_ratelimited(dev,
495 "vfio: runtime resume failed %d\n", ret);
496 return -EIO;
497 }
498 }
499
500 return 0;
501}
502
503/*
504 * Wrapper around pm_runtime_put().
505 */
506static inline void vfio_device_pm_runtime_put(struct vfio_device *device)
507{
508 struct device *dev = device->dev;
509
510 if (dev->driver && dev->driver->pm)
511 pm_runtime_put(dev);
512}
513
514/*
515 * VFIO Device fd
516 */
517static int vfio_device_fops_release(struct inode *inode, struct file *filep)
518{
519 struct vfio_device *device = filep->private_data;
520
521 vfio_device_group_close(device);
522
523 vfio_device_put_registration(device);
524
525 return 0;
526}
527
528/*
529 * vfio_mig_get_next_state - Compute the next step in the FSM
530 * @cur_fsm - The current state the device is in
531 * @new_fsm - The target state to reach
532 * @next_fsm - Pointer to the next step to get to new_fsm
533 *
534 * Return 0 upon success, otherwise -errno
535 * Upon success the next step in the state progression between cur_fsm and
536 * new_fsm will be set in next_fsm.
537 *
538 * This breaks down requests for combination transitions into smaller steps and
539 * returns the next step to get to new_fsm. The function may need to be called
540 * multiple times before reaching new_fsm.
541 *
542 */
543int vfio_mig_get_next_state(struct vfio_device *device,
544 enum vfio_device_mig_state cur_fsm,
545 enum vfio_device_mig_state new_fsm,
546 enum vfio_device_mig_state *next_fsm)
547{
548 enum { VFIO_DEVICE_NUM_STATES = VFIO_DEVICE_STATE_PRE_COPY_P2P + 1 };
549 /*
550 * The coding in this table requires the driver to implement the
551 * following FSM arcs:
115dcec6 552 * RESUMING -> STOP
553 * STOP -> RESUMING
554 * STOP -> STOP_COPY
555 * STOP_COPY -> STOP
556 *
557 * If P2P is supported then the driver must also implement these FSM
558 * arcs:
559 * RUNNING -> RUNNING_P2P
560 * RUNNING_P2P -> RUNNING
561 * RUNNING_P2P -> STOP
562 * STOP -> RUNNING_P2P
563 *
564 * If precopy is supported then the driver must support these additional
565 * FSM arcs:
566 * RUNNING -> PRE_COPY
567 * PRE_COPY -> RUNNING
568 * PRE_COPY -> STOP_COPY
569 * However, if precopy and P2P are supported together then the driver
570 * must support these additional arcs beyond the P2P arcs above:
571 * PRE_COPY -> RUNNING
572 * PRE_COPY -> PRE_COPY_P2P
573 * PRE_COPY_P2P -> PRE_COPY
574 * PRE_COPY_P2P -> RUNNING_P2P
575 * PRE_COPY_P2P -> STOP_COPY
576 * RUNNING -> PRE_COPY
577 * RUNNING_P2P -> PRE_COPY_P2P
578 *
579 * Without P2P and precopy the driver must implement:
580 * RUNNING -> STOP
581 * STOP -> RUNNING
582 *
583 * The coding will step through multiple states for some combination
584 * transitions; if all optional features are supported, this means the
585 * following ones:
586 * PRE_COPY -> PRE_COPY_P2P -> STOP_COPY
587 * PRE_COPY -> RUNNING -> RUNNING_P2P
588 * PRE_COPY -> RUNNING -> RUNNING_P2P -> STOP
589 * PRE_COPY -> RUNNING -> RUNNING_P2P -> STOP -> RESUMING
590 * PRE_COPY_P2P -> RUNNING_P2P -> RUNNING
591 * PRE_COPY_P2P -> RUNNING_P2P -> STOP
592 * PRE_COPY_P2P -> RUNNING_P2P -> STOP -> RESUMING
593 * RESUMING -> STOP -> RUNNING_P2P
594 * RESUMING -> STOP -> RUNNING_P2P -> PRE_COPY_P2P
595 * RESUMING -> STOP -> RUNNING_P2P -> RUNNING
596 * RESUMING -> STOP -> RUNNING_P2P -> RUNNING -> PRE_COPY
597 * RESUMING -> STOP -> STOP_COPY
598 * RUNNING -> RUNNING_P2P -> PRE_COPY_P2P
599 * RUNNING -> RUNNING_P2P -> STOP
600 * RUNNING -> RUNNING_P2P -> STOP -> RESUMING
601 * RUNNING -> RUNNING_P2P -> STOP -> STOP_COPY
602 * RUNNING_P2P -> RUNNING -> PRE_COPY
603 * RUNNING_P2P -> STOP -> RESUMING
604 * RUNNING_P2P -> STOP -> STOP_COPY
605 * STOP -> RUNNING_P2P -> PRE_COPY_P2P
606 * STOP -> RUNNING_P2P -> RUNNING
607 * STOP -> RUNNING_P2P -> RUNNING -> PRE_COPY
608 * STOP_COPY -> STOP -> RESUMING
609 * STOP_COPY -> STOP -> RUNNING_P2P
610 * STOP_COPY -> STOP -> RUNNING_P2P -> RUNNING
611 *
612 * The following transitions are blocked:
613 * STOP_COPY -> PRE_COPY
614 * STOP_COPY -> PRE_COPY_P2P
615 */
616 static const u8 vfio_from_fsm_table[VFIO_DEVICE_NUM_STATES][VFIO_DEVICE_NUM_STATES] = {
617 [VFIO_DEVICE_STATE_STOP] = {
618 [VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_STOP,
619 [VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_RUNNING_P2P,
620 [VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_RUNNING_P2P,
621 [VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P,
622 [VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_STOP_COPY,
623 [VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_RESUMING,
624 [VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P,
625 [VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
626 },
627 [VFIO_DEVICE_STATE_RUNNING] = {
628 [VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_RUNNING_P2P,
629 [VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_RUNNING,
630 [VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_PRE_COPY,
631 [VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P,
632 [VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_RUNNING_P2P,
633 [VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_RUNNING_P2P,
634 [VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P,
635 [VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
636 },
637 [VFIO_DEVICE_STATE_PRE_COPY] = {
638 [VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_RUNNING,
639 [VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_RUNNING,
640 [VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_PRE_COPY,
641 [VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_PRE_COPY_P2P,
642 [VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_PRE_COPY_P2P,
643 [VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_RUNNING,
644 [VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_RUNNING,
645 [VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
646 },
647 [VFIO_DEVICE_STATE_PRE_COPY_P2P] = {
648 [VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_RUNNING_P2P,
649 [VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_RUNNING_P2P,
650 [VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_PRE_COPY,
651 [VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_PRE_COPY_P2P,
652 [VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_STOP_COPY,
653 [VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_RUNNING_P2P,
654 [VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P,
655 [VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
656 },
657 [VFIO_DEVICE_STATE_STOP_COPY] = {
658 [VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_STOP,
659 [VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_STOP,
660 [VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_ERROR,
661 [VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_ERROR,
662 [VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_STOP_COPY,
663 [VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_STOP,
664 [VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_STOP,
665 [VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
666 },
667 [VFIO_DEVICE_STATE_RESUMING] = {
668 [VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_STOP,
669 [VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_STOP,
670 [VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_STOP,
671 [VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_STOP,
672 [VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_STOP,
673 [VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_RESUMING,
674 [VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_STOP,
675 [VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
676 },
677 [VFIO_DEVICE_STATE_RUNNING_P2P] = {
678 [VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_STOP,
679 [VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_RUNNING,
680 [VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_RUNNING,
681 [VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_PRE_COPY_P2P,
682 [VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_STOP,
683 [VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_STOP,
684 [VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P,
685 [VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
686 },
687 [VFIO_DEVICE_STATE_ERROR] = {
688 [VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_ERROR,
689 [VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_ERROR,
690 [VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_ERROR,
691 [VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_ERROR,
692 [VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_ERROR,
693 [VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_ERROR,
694 [VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_ERROR,
695 [VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
696 },
697 };
698
699 static const unsigned int state_flags_table[VFIO_DEVICE_NUM_STATES] = {
700 [VFIO_DEVICE_STATE_STOP] = VFIO_MIGRATION_STOP_COPY,
701 [VFIO_DEVICE_STATE_RUNNING] = VFIO_MIGRATION_STOP_COPY,
702 [VFIO_DEVICE_STATE_PRE_COPY] =
703 VFIO_MIGRATION_STOP_COPY | VFIO_MIGRATION_PRE_COPY,
704 [VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_MIGRATION_STOP_COPY |
705 VFIO_MIGRATION_P2P |
706 VFIO_MIGRATION_PRE_COPY,
707 [VFIO_DEVICE_STATE_STOP_COPY] = VFIO_MIGRATION_STOP_COPY,
708 [VFIO_DEVICE_STATE_RESUMING] = VFIO_MIGRATION_STOP_COPY,
709 [VFIO_DEVICE_STATE_RUNNING_P2P] =
710 VFIO_MIGRATION_STOP_COPY | VFIO_MIGRATION_P2P,
711 [VFIO_DEVICE_STATE_ERROR] = ~0U,
712 };
713
714 if (WARN_ON(cur_fsm >= ARRAY_SIZE(vfio_from_fsm_table) ||
715 (state_flags_table[cur_fsm] & device->migration_flags) !=
716 state_flags_table[cur_fsm]))
717 return -EINVAL;
718
719 if (new_fsm >= ARRAY_SIZE(vfio_from_fsm_table) ||
720 (state_flags_table[new_fsm] & device->migration_flags) !=
721 state_flags_table[new_fsm])
722 return -EINVAL;
723
724 /*
725 * Arcs touching optional and unsupported states are skipped over. The
726 * driver will instead see an arc from the original state to the next
727 * logical state, as per the above comment.
728 */
729 *next_fsm = vfio_from_fsm_table[cur_fsm][new_fsm];
730 while ((state_flags_table[*next_fsm] & device->migration_flags) !=
731 state_flags_table[*next_fsm])
732 *next_fsm = vfio_from_fsm_table[*next_fsm][new_fsm];
733
734 return (*next_fsm != VFIO_DEVICE_STATE_ERROR) ? 0 : -EINVAL;
735}
736EXPORT_SYMBOL_GPL(vfio_mig_get_next_state);
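/*
 * Illustrative sketch, not from the original file: a migration driver's
 * set_state handler usually walks the intermediate arcs produced by the
 * table above, one step at a time.  "my_hw_step" is a hypothetical
 * handler that performs exactly one FSM arc in hardware.
 */
static int my_set_state_example(struct vfio_device *vdev,
				enum vfio_device_mig_state cur,
				enum vfio_device_mig_state new_state)
{
	enum vfio_device_mig_state next;
	int ret;

	while (cur != new_state) {
		ret = vfio_mig_get_next_state(vdev, cur, new_state, &next);
		if (ret)
			return ret;
		ret = my_hw_step(vdev, next);
		if (ret)
			return ret;
		cur = next;
	}
	return 0;
}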
737
738/*
739 * Convert the drivers's struct file into a FD number and return it to userspace
740 */
741static int vfio_ioct_mig_return_fd(struct file *filp, void __user *arg,
742 struct vfio_device_feature_mig_state *mig)
743{
744 int ret;
745 int fd;
746
747 fd = get_unused_fd_flags(O_CLOEXEC);
748 if (fd < 0) {
749 ret = fd;
750 goto out_fput;
751 }
752
753 mig->data_fd = fd;
754 if (copy_to_user(arg, mig, sizeof(*mig))) {
755 ret = -EFAULT;
756 goto out_put_unused;
757 }
758 fd_install(fd, filp);
759 return 0;
760
761out_put_unused:
762 put_unused_fd(fd);
763out_fput:
764 fput(filp);
765 return ret;
766}
767
768static int
769vfio_ioctl_device_feature_mig_device_state(struct vfio_device *device,
770 u32 flags, void __user *arg,
771 size_t argsz)
772{
773 size_t minsz =
774 offsetofend(struct vfio_device_feature_mig_state, data_fd);
775 struct vfio_device_feature_mig_state mig;
776 struct file *filp = NULL;
777 int ret;
778
779 if (!device->mig_ops)
780 return -ENOTTY;
781
782 ret = vfio_check_feature(flags, argsz,
783 VFIO_DEVICE_FEATURE_SET |
784 VFIO_DEVICE_FEATURE_GET,
785 sizeof(mig));
786 if (ret != 1)
787 return ret;
788
789 if (copy_from_user(&mig, arg, minsz))
790 return -EFAULT;
791
792 if (flags & VFIO_DEVICE_FEATURE_GET) {
793 enum vfio_device_mig_state curr_state;
794
795 ret = device->mig_ops->migration_get_state(device,
796 &curr_state);
797 if (ret)
798 return ret;
799 mig.device_state = curr_state;
800 goto out_copy;
801 }
802
803 /* Handle the VFIO_DEVICE_FEATURE_SET */
804 filp = device->mig_ops->migration_set_state(device, mig.device_state);
805 if (IS_ERR(filp) || !filp)
806 goto out_copy;
807
808 return vfio_ioct_mig_return_fd(filp, arg, &mig);
809out_copy:
810 mig.data_fd = -1;
811 if (copy_to_user(arg, &mig, sizeof(mig)))
812 return -EFAULT;
813 if (IS_ERR(filp))
814 return PTR_ERR(filp);
815 return 0;
816}
817
818static int
819vfio_ioctl_device_feature_migration_data_size(struct vfio_device *device,
820 u32 flags, void __user *arg,
821 size_t argsz)
822{
823 struct vfio_device_feature_mig_data_size data_size = {};
824 unsigned long stop_copy_length;
825 int ret;
826
827 if (!device->mig_ops)
828 return -ENOTTY;
829
830 ret = vfio_check_feature(flags, argsz, VFIO_DEVICE_FEATURE_GET,
831 sizeof(data_size));
832 if (ret != 1)
833 return ret;
834
835 ret = device->mig_ops->migration_get_data_size(device, &stop_copy_length);
836 if (ret)
837 return ret;
838
839 data_size.stop_copy_length = stop_copy_length;
840 if (copy_to_user(arg, &data_size, sizeof(data_size)))
841 return -EFAULT;
842
843 return 0;
844}
845
846static int vfio_ioctl_device_feature_migration(struct vfio_device *device,
847 u32 flags, void __user *arg,
848 size_t argsz)
849{
850 struct vfio_device_feature_migration mig = {
851 .flags = device->migration_flags,
852 };
853 int ret;
854
855 if (!device->mig_ops)
856 return -ENOTTY;
857
858 ret = vfio_check_feature(flags, argsz, VFIO_DEVICE_FEATURE_GET,
859 sizeof(mig));
860 if (ret != 1)
861 return ret;
862 if (copy_to_user(arg, &mig, sizeof(mig)))
863 return -EFAULT;
864 return 0;
865}
866
867/* Ranges should fit into a single kernel page */
868#define LOG_MAX_RANGES \
869 (PAGE_SIZE / sizeof(struct vfio_device_feature_dma_logging_range))
870
871static int
872vfio_ioctl_device_feature_logging_start(struct vfio_device *device,
873 u32 flags, void __user *arg,
874 size_t argsz)
875{
876 size_t minsz =
877 offsetofend(struct vfio_device_feature_dma_logging_control,
878 ranges);
879 struct vfio_device_feature_dma_logging_range __user *ranges;
880 struct vfio_device_feature_dma_logging_control control;
881 struct vfio_device_feature_dma_logging_range range;
882 struct rb_root_cached root = RB_ROOT_CACHED;
883 struct interval_tree_node *nodes;
884 u64 iova_end;
885 u32 nnodes;
886 int i, ret;
887
888 if (!device->log_ops)
889 return -ENOTTY;
890
891 ret = vfio_check_feature(flags, argsz,
892 VFIO_DEVICE_FEATURE_SET,
893 sizeof(control));
894 if (ret != 1)
895 return ret;
896
897 if (copy_from_user(&control, arg, minsz))
898 return -EFAULT;
899
900 nnodes = control.num_ranges;
901 if (!nnodes)
902 return -EINVAL;
903
904 if (nnodes > LOG_MAX_RANGES)
905 return -E2BIG;
906
907 ranges = u64_to_user_ptr(control.ranges);
908 nodes = kmalloc_array(nnodes, sizeof(struct interval_tree_node),
909 GFP_KERNEL);
910 if (!nodes)
911 return -ENOMEM;
912
913 for (i = 0; i < nnodes; i++) {
914 if (copy_from_user(&range, &ranges[i], sizeof(range))) {
915 ret = -EFAULT;
916 goto end;
917 }
918 if (!IS_ALIGNED(range.iova, control.page_size) ||
919 !IS_ALIGNED(range.length, control.page_size)) {
920 ret = -EINVAL;
921 goto end;
922 }
923
924 if (check_add_overflow(range.iova, range.length, &iova_end) ||
925 iova_end > ULONG_MAX) {
926 ret = -EOVERFLOW;
927 goto end;
928 }
929
930 nodes[i].start = range.iova;
931 nodes[i].last = range.iova + range.length - 1;
932 if (interval_tree_iter_first(&root, nodes[i].start,
933 nodes[i].last)) {
934 /* Range overlapping */
935 ret = -EINVAL;
936 goto end;
937 }
938 interval_tree_insert(nodes + i, &root);
939 }
940
941 ret = device->log_ops->log_start(device, &root, nnodes,
942 &control.page_size);
943 if (ret)
944 goto end;
945
946 if (copy_to_user(arg, &control, sizeof(control))) {
947 ret = -EFAULT;
948 device->log_ops->log_stop(device);
949 }
950
951end:
952 kfree(nodes);
953 return ret;
954}
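/*
 * Illustrative userspace sketch, not from the original file: starting
 * dirty tracking for one IOVA range using the structures consumed above.
 * "device_fd" is assumed to be an open VFIO device file descriptor.
 *
 *	struct vfio_device_feature_dma_logging_range range = {
 *		.iova = 0,
 *		.length = 1ULL << 30,
 *	};
 *	struct {
 *		struct vfio_device_feature feat;
 *		struct vfio_device_feature_dma_logging_control ctrl;
 *	} start = {
 *		.feat.argsz = sizeof(start),
 *		.feat.flags = VFIO_DEVICE_FEATURE_SET |
 *			      VFIO_DEVICE_FEATURE_DMA_LOGGING_START,
 *		.ctrl.page_size = 4096,
 *		.ctrl.num_ranges = 1,
 *		.ctrl.ranges = (uintptr_t)&range,
 *	};
 *
 *	ioctl(device_fd, VFIO_DEVICE_FEATURE, &start);
 */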
955
956static int
957vfio_ioctl_device_feature_logging_stop(struct vfio_device *device,
958 u32 flags, void __user *arg,
959 size_t argsz)
960{
961 int ret;
962
963 if (!device->log_ops)
964 return -ENOTTY;
965
966 ret = vfio_check_feature(flags, argsz,
967 VFIO_DEVICE_FEATURE_SET, 0);
968 if (ret != 1)
969 return ret;
970
971 return device->log_ops->log_stop(device);
972}
973
974static int vfio_device_log_read_and_clear(struct iova_bitmap *iter,
975 unsigned long iova, size_t length,
976 void *opaque)
977{
978 struct vfio_device *device = opaque;
979
980 return device->log_ops->log_read_and_clear(device, iova, length, iter);
981}
982
983static int
984vfio_ioctl_device_feature_logging_report(struct vfio_device *device,
985 u32 flags, void __user *arg,
986 size_t argsz)
987{
988 size_t minsz =
989 offsetofend(struct vfio_device_feature_dma_logging_report,
990 bitmap);
991 struct vfio_device_feature_dma_logging_report report;
992 struct iova_bitmap *iter;
993 u64 iova_end;
994 int ret;
995
996 if (!device->log_ops)
997 return -ENOTTY;
998
999 ret = vfio_check_feature(flags, argsz,
1000 VFIO_DEVICE_FEATURE_GET,
1001 sizeof(report));
1002 if (ret != 1)
1003 return ret;
1004
1005 if (copy_from_user(&report, arg, minsz))
1006 return -EFAULT;
1007
1008 if (report.page_size < SZ_4K || !is_power_of_2(report.page_size))
1009 return -EINVAL;
1010
1011 if (check_add_overflow(report.iova, report.length, &iova_end) ||
1012 iova_end > ULONG_MAX)
1013 return -EOVERFLOW;
1014
1015 iter = iova_bitmap_alloc(report.iova, report.length,
1016 report.page_size,
1017 u64_to_user_ptr(report.bitmap));
1018 if (IS_ERR(iter))
1019 return PTR_ERR(iter);
1020
1021 ret = iova_bitmap_for_each(iter, device,
1022 vfio_device_log_read_and_clear);
1023
1024 iova_bitmap_free(iter);
1025 return ret;
1026}
1027
1028static int vfio_ioctl_device_feature(struct vfio_device *device,
1029 struct vfio_device_feature __user *arg)
1030{
1031 size_t minsz = offsetofend(struct vfio_device_feature, flags);
1032 struct vfio_device_feature feature;
1033
1034 if (copy_from_user(&feature, arg, minsz))
1035 return -EFAULT;
1036
1037 if (feature.argsz < minsz)
1038 return -EINVAL;
1039
1040 /* Check unknown flags */
1041 if (feature.flags &
1042 ~(VFIO_DEVICE_FEATURE_MASK | VFIO_DEVICE_FEATURE_SET |
1043 VFIO_DEVICE_FEATURE_GET | VFIO_DEVICE_FEATURE_PROBE))
1044 return -EINVAL;
1045
1046 /* GET & SET are mutually exclusive except with PROBE */
1047 if (!(feature.flags & VFIO_DEVICE_FEATURE_PROBE) &&
1048 (feature.flags & VFIO_DEVICE_FEATURE_SET) &&
1049 (feature.flags & VFIO_DEVICE_FEATURE_GET))
1050 return -EINVAL;
1051
1052 switch (feature.flags & VFIO_DEVICE_FEATURE_MASK) {
1053 case VFIO_DEVICE_FEATURE_MIGRATION:
1054 return vfio_ioctl_device_feature_migration(
1055 device, feature.flags, arg->data,
1056 feature.argsz - minsz);
1057 case VFIO_DEVICE_FEATURE_MIG_DEVICE_STATE:
1058 return vfio_ioctl_device_feature_mig_device_state(
1059 device, feature.flags, arg->data,
1060 feature.argsz - minsz);
1061 case VFIO_DEVICE_FEATURE_DMA_LOGGING_START:
1062 return vfio_ioctl_device_feature_logging_start(
1063 device, feature.flags, arg->data,
1064 feature.argsz - minsz);
1065 case VFIO_DEVICE_FEATURE_DMA_LOGGING_STOP:
1066 return vfio_ioctl_device_feature_logging_stop(
1067 device, feature.flags, arg->data,
1068 feature.argsz - minsz);
1069 case VFIO_DEVICE_FEATURE_DMA_LOGGING_REPORT:
1070 return vfio_ioctl_device_feature_logging_report(
1071 device, feature.flags, arg->data,
1072 feature.argsz - minsz);
1073 case VFIO_DEVICE_FEATURE_MIG_DATA_SIZE:
1074 return vfio_ioctl_device_feature_migration_data_size(
1075 device, feature.flags, arg->data,
1076 feature.argsz - minsz);
1077 default:
1078 if (unlikely(!device->ops->device_feature))
1079 return -EINVAL;
1080 return device->ops->device_feature(device, feature.flags,
1081 arg->data,
1082 feature.argsz - minsz);
1083 }
1084}
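/*
 * Illustrative userspace sketch, not from the original file: querying the
 * migration capability through VFIO_DEVICE_FEATURE.  "device_fd" is
 * assumed to be an open VFIO device file descriptor.
 *
 *	struct {
 *		struct vfio_device_feature feat;
 *		struct vfio_device_feature_migration mig;
 *	} get = {
 *		.feat.argsz = sizeof(get),
 *		.feat.flags = VFIO_DEVICE_FEATURE_GET |
 *			      VFIO_DEVICE_FEATURE_MIGRATION,
 *	};
 *
 *	if (!ioctl(device_fd, VFIO_DEVICE_FEATURE, &get))
 *		printf("migration flags: 0x%llx\n", get.mig.flags);
 */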
1085
1086static long vfio_device_fops_unl_ioctl(struct file *filep,
1087 unsigned int cmd, unsigned long arg)
1088{
1089 struct vfio_device *device = filep->private_data;
1090 int ret;
1091
1092 ret = vfio_device_pm_runtime_get(device);
1093 if (ret)
1094 return ret;
1095
1096 switch (cmd) {
1097 case VFIO_DEVICE_FEATURE:
1098 ret = vfio_ioctl_device_feature(device, (void __user *)arg);
1099 break;
1100
1101 default:
1102 if (unlikely(!device->ops->ioctl))
1103 ret = -EINVAL;
1104 else
1105 ret = device->ops->ioctl(device, cmd, arg);
1106 break;
1107 }
1108
1109 vfio_device_pm_runtime_put(device);
1110 return ret;
1111}
1112
1113static ssize_t vfio_device_fops_read(struct file *filep, char __user *buf,
1114 size_t count, loff_t *ppos)
1115{
1116 struct vfio_device *device = filep->private_data;
1117
1118 if (unlikely(!device->ops->read))
1119 return -EINVAL;
1120
1121 return device->ops->read(device, buf, count, ppos);
1122}
1123
1124static ssize_t vfio_device_fops_write(struct file *filep,
1125 const char __user *buf,
1126 size_t count, loff_t *ppos)
1127{
1128 struct vfio_device *device = filep->private_data;
1129
1130 if (unlikely(!device->ops->write))
1131 return -EINVAL;
1132
1133 return device->ops->write(device, buf, count, ppos);
1134}
1135
1136static int vfio_device_fops_mmap(struct file *filep, struct vm_area_struct *vma)
1137{
1138 struct vfio_device *device = filep->private_data;
1139
1140 if (unlikely(!device->ops->mmap))
1141 return -EINVAL;
1142
1143 return device->ops->mmap(device, vma);
1144}
1145
1146const struct file_operations vfio_device_fops = {
1147 .owner = THIS_MODULE,
1148 .release = vfio_device_fops_release,
1149 .read = vfio_device_fops_read,
1150 .write = vfio_device_fops_write,
1151 .unlocked_ioctl = vfio_device_fops_unl_ioctl,
1152 .compat_ioctl = compat_ptr_ioctl,
1153 .mmap = vfio_device_fops_mmap,
1154};
1155
1156/*
1157 * Sub-module support
1158 */
1159/*
1160 * Helper for managing a buffer of info chain capabilities, allocate or
1161 * reallocate a buffer with additional @size, filling in @id and @version
1162 * of the capability. A pointer to the new capability is returned.
1163 *
1164 * NB. The chain is based at the head of the buffer, so new entries are
1165 * added to the tail, vfio_info_cap_shift() should be called to fixup the
1166 * next offsets prior to copying to the user buffer.
1167 */
1168struct vfio_info_cap_header *vfio_info_cap_add(struct vfio_info_cap *caps,
1169 size_t size, u16 id, u16 version)
1170{
1171 void *buf;
1172 struct vfio_info_cap_header *header, *tmp;
1173
1174 buf = krealloc(caps->buf, caps->size + size, GFP_KERNEL);
1175 if (!buf) {
1176 kfree(caps->buf);
1177 caps->buf = NULL;
1178 caps->size = 0;
1179 return ERR_PTR(-ENOMEM);
1180 }
1181
1182 caps->buf = buf;
1183 header = buf + caps->size;
1184
1185 /* Eventually copied to user buffer, zero */
1186 memset(header, 0, size);
1187
1188 header->id = id;
1189 header->version = version;
1190
1191 /* Add to the end of the capability chain */
1192 for (tmp = buf; tmp->next; tmp = buf + tmp->next)
1193 ; /* nothing */
1194
1195 tmp->next = caps->size;
1196 caps->size += size;
1197
1198 return header;
1199}
1200EXPORT_SYMBOL_GPL(vfio_info_cap_add);
1201
1202void vfio_info_cap_shift(struct vfio_info_cap *caps, size_t offset)
1203{
1204 struct vfio_info_cap_header *tmp;
1205 void *buf = (void *)caps->buf;
1206
1207 for (tmp = buf; tmp->next; tmp = buf + tmp->next - offset)
1208 tmp->next += offset;
1209}
1210EXPORT_SYMBOL(vfio_info_cap_shift);
1211
1212int vfio_info_add_capability(struct vfio_info_cap *caps,
1213 struct vfio_info_cap_header *cap, size_t size)
1214{
1215 struct vfio_info_cap_header *header;
1216
1217 header = vfio_info_cap_add(caps, size, cap->id, cap->version);
1218 if (IS_ERR(header))
1219 return PTR_ERR(header);
1220
1221 memcpy(header + 1, cap + 1, size - sizeof(*header));
1222
1223 return 0;
1224}
1225EXPORT_SYMBOL(vfio_info_add_capability);
1226
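/*
 * Illustrative sketch, not from the original file: the usual pattern in a
 * driver's *_GET_INFO ioctl is to build the chain with the helpers above,
 * then shift the chain offsets past the fixed-size info struct before
 * copying both to userspace.  Using vfio_region_info here is only an
 * example of a caps-carrying info structure.
 */
static int my_return_caps_example(struct vfio_info_cap *caps,
				  struct vfio_region_info *info,
				  void __user *arg)
{
	int ret = 0;

	if (!caps->size)
		return 0;

	info->flags |= VFIO_REGION_INFO_FLAG_CAPS;
	if (info->argsz < sizeof(*info) + caps->size) {
		/* Report how much room the chain needs */
		info->argsz = sizeof(*info) + caps->size;
	} else {
		vfio_info_cap_shift(caps, sizeof(*info));
		if (copy_to_user(arg + sizeof(*info), caps->buf, caps->size))
			ret = -EFAULT;
		info->cap_offset = sizeof(*info);
	}
	kfree(caps->buf);
	return ret;
}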
1227int vfio_set_irqs_validate_and_prepare(struct vfio_irq_set *hdr, int num_irqs,
1228 int max_irq_type, size_t *data_size)
1229{
1230 unsigned long minsz;
1231 size_t size;
1232
1233 minsz = offsetofend(struct vfio_irq_set, count);
1234
1235 if ((hdr->argsz < minsz) || (hdr->index >= max_irq_type) ||
1236 (hdr->count >= (U32_MAX - hdr->start)) ||
1237 (hdr->flags & ~(VFIO_IRQ_SET_DATA_TYPE_MASK |
1238 VFIO_IRQ_SET_ACTION_TYPE_MASK)))
1239 return -EINVAL;
1240
1241 if (data_size)
1242 *data_size = 0;
1243
1244 if (hdr->start >= num_irqs || hdr->start + hdr->count > num_irqs)
1245 return -EINVAL;
1246
1247 switch (hdr->flags & VFIO_IRQ_SET_DATA_TYPE_MASK) {
1248 case VFIO_IRQ_SET_DATA_NONE:
1249 size = 0;
1250 break;
1251 case VFIO_IRQ_SET_DATA_BOOL:
1252 size = sizeof(uint8_t);
1253 break;
1254 case VFIO_IRQ_SET_DATA_EVENTFD:
1255 size = sizeof(int32_t);
1256 break;
1257 default:
1258 return -EINVAL;
1259 }
1260
1261 if (size) {
1262 if (hdr->argsz - minsz < hdr->count * size)
1263 return -EINVAL;
1264
1265 if (!data_size)
1266 return -EINVAL;
1267
1268 *data_size = hdr->count * size;
1269 }
1270
1271 return 0;
1272}
1273EXPORT_SYMBOL(vfio_set_irqs_validate_and_prepare);
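/*
 * Illustrative sketch, not from the original file: a driver's
 * VFIO_DEVICE_SET_IRQS path typically validates the header with the helper
 * above and then copies in data_size bytes of eventfds/bools.
 * "MY_NUM_IRQ_TYPES", "my_irq_count" and "my_set_irqs" are hypothetical.
 */
static int my_set_irqs_example(struct vfio_device *vdev,
			       struct vfio_irq_set __user *uarg)
{
	struct vfio_irq_set hdr;
	size_t data_size = 0;
	u8 *data = NULL;
	int ret;

	if (copy_from_user(&hdr, uarg, sizeof(hdr)))
		return -EFAULT;

	ret = vfio_set_irqs_validate_and_prepare(&hdr, my_irq_count(vdev, hdr.index),
						 MY_NUM_IRQ_TYPES, &data_size);
	if (ret)
		return ret;

	if (data_size) {
		data = memdup_user(uarg->data, data_size);
		if (IS_ERR(data))
			return PTR_ERR(data);
	}

	ret = my_set_irqs(vdev, &hdr, data);
	kfree(data);
	return ret;
}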
1274
1275/*
1276 * Pin contiguous user pages and return their associated host pages for local
1277 * domain only.
1278 * @device [in] : device
1279 * @iova [in] : starting IOVA of user pages to be pinned.
1280 * @npage [in] : count of pages to be pinned. This count should not
1281 * be greater than VFIO_PIN_PAGES_MAX_ENTRIES.
1282 * @prot [in] : protection flags
1283 * @pages[out] : array of host pages
1284 * Return error or number of pages pinned.
1285 *
1286 * A driver may only call this function if the vfio_device was created
1287 * by vfio_register_emulated_iommu_dev() due to vfio_device_container_pin_pages().
1288 */
1289int vfio_pin_pages(struct vfio_device *device, dma_addr_t iova,
1290 int npage, int prot, struct page **pages)
1291{
1292 /* group->container cannot change while a vfio device is open */
1293 if (!pages || !npage || WARN_ON(!vfio_assert_device_open(device)))
1294 return -EINVAL;
1295 if (vfio_device_has_container(device))
1296 return vfio_device_container_pin_pages(device, iova,
1297 npage, prot, pages);
1298 if (device->iommufd_access) {
1299 int ret;
1300
1301 if (iova > ULONG_MAX)
1302 return -EINVAL;
1303 /*
1304 * VFIO ignores the sub page offset, npages is from the start of
1305 * a PAGE_SIZE chunk of IOVA. The caller is expected to recover
1306 * the sub page offset by doing:
1307 * pages[0] + (iova % PAGE_SIZE)
1308 */
1309 ret = iommufd_access_pin_pages(
1310 device->iommufd_access, ALIGN_DOWN(iova, PAGE_SIZE),
1311 npage * PAGE_SIZE, pages,
1312 (prot & IOMMU_WRITE) ? IOMMUFD_ACCESS_RW_WRITE : 0);
1313 if (ret)
1314 return ret;
1315 return npage;
1316 }
1317 return -EINVAL;
1318}
1319EXPORT_SYMBOL(vfio_pin_pages);
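/*
 * Illustrative sketch, not from the original file: an emulated-IOMMU
 * (mdev-style) driver pinning a single guest page and recovering the
 * sub-page offset as described in the comment above.
 */
static void *my_map_one_page_example(struct vfio_device *vdev, dma_addr_t iova)
{
	struct page *pg;
	int ret;

	ret = vfio_pin_pages(vdev, iova, 1, IOMMU_READ | IOMMU_WRITE, &pg);
	if (ret != 1)
		return NULL;

	/* pages[0] covers the whole PAGE_SIZE chunk containing iova */
	return kmap_local_page(pg) + (iova & ~PAGE_MASK);
}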
1320
1321/*
1322 * Unpin contiguous host pages for local domain only.
1323 * @device [in] : device
1324 * @iova [in] : starting address of user pages to be unpinned.
1325 * @npage [in] : count of pages to be unpinned. This count should not
1326 * be greater than VFIO_PIN_PAGES_MAX_ENTRIES.
1327 */
1328void vfio_unpin_pages(struct vfio_device *device, dma_addr_t iova, int npage)
1329{
1330 if (WARN_ON(!vfio_assert_device_open(device)))
1331 return;
1332
1333 if (vfio_device_has_container(device)) {
1334 vfio_device_container_unpin_pages(device, iova, npage);
1335 return;
1336 }
1337 if (device->iommufd_access) {
1338 if (WARN_ON(iova > ULONG_MAX))
1339 return;
1340 iommufd_access_unpin_pages(device->iommufd_access,
1341 ALIGN_DOWN(iova, PAGE_SIZE),
1342 npage * PAGE_SIZE);
1343 return;
1344 }
1345}
1346EXPORT_SYMBOL(vfio_unpin_pages);
1347
1348/*
1349 * This interface allows the CPUs to perform some sort of virtual DMA on
1350 * behalf of the device.
1351 *
1352 * CPUs read/write from/into a range of IOVAs pointing to user space memory
1353 * into/from a kernel buffer.
1354 *
1355 * As the read/write of user space memory is conducted via the CPUs and is
1356 * not a real device DMA, it is not necessary to pin the user space memory.
1357 *
1358 * @device [in] : VFIO device
1359 * @iova [in] : base IOVA of a user space buffer
1360 * @data [in] : pointer to kernel buffer
1361 * @len [in] : kernel buffer length
1362 * @write : indicate read or write
1363 * Return error code on failure or 0 on success.
1364 */
1365int vfio_dma_rw(struct vfio_device *device, dma_addr_t iova, void *data,
1366 size_t len, bool write)
1367{
1368 if (!data || len <= 0 || !vfio_assert_device_open(device))
1369 return -EINVAL;
1370
1371 if (vfio_device_has_container(device))
1372 return vfio_device_container_dma_rw(device, iova,
1373 data, len, write);
1374
1375 if (device->iommufd_access) {
1376 unsigned int flags = 0;
1377
1378 if (iova > ULONG_MAX)
1379 return -EINVAL;
1380
1381 /* VFIO historically tries to auto-detect a kthread */
1382 if (!current->mm)
1383 flags |= IOMMUFD_ACCESS_RW_KTHREAD;
1384 if (write)
1385 flags |= IOMMUFD_ACCESS_RW_WRITE;
1386 return iommufd_access_rw(device->iommufd_access, iova, data,
1387 len, flags);
1388 }
1389 return -EINVAL;
1390}
1391EXPORT_SYMBOL(vfio_dma_rw);
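/*
 * Illustrative sketch, not from the original file: reading a small
 * guest-resident descriptor through the device's IOVA space.  "struct
 * my_desc" is hypothetical.
 */
static int my_read_desc_example(struct vfio_device *vdev, dma_addr_t iova,
				struct my_desc *desc)
{
	/* write == false: copy from the IOVA range into the kernel buffer */
	return vfio_dma_rw(vdev, iova, desc, sizeof(*desc), false);
}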
1392
1393/*
1394 * Module/class support
1395 */
1396static int __init vfio_init(void)
1397{
1398 int ret;
1399
1400 ida_init(&vfio.device_ida);
1401
1402 ret = vfio_group_init();
1403 if (ret)
1404 return ret;
1405
1406 ret = vfio_virqfd_init();
1407 if (ret)
1408 goto err_virqfd;
1409
1410 /* /sys/class/vfio-dev/vfioX */
1411 vfio.device_class = class_create(THIS_MODULE, "vfio-dev");
1412 if (IS_ERR(vfio.device_class)) {
1413 ret = PTR_ERR(vfio.device_class);
1414 goto err_dev_class;
1415 }
1416
1417 pr_info(DRIVER_DESC " version: " DRIVER_VERSION "\n");
1418 return 0;
1419
1420err_dev_class:
1421 vfio_virqfd_exit();
1422err_virqfd:
1423 vfio_group_cleanup();
1424 return ret;
1425}
1426
1427static void __exit vfio_cleanup(void)
1428{
1429 ida_destroy(&vfio.device_ida);
1430 class_destroy(vfio.device_class);
1431 vfio.device_class = NULL;
1432 vfio_virqfd_exit();
1433 vfio_group_cleanup();
1434 xa_destroy(&vfio_device_set_xa);
1435}
1436
1437module_init(vfio_init);
1438module_exit(vfio_cleanup);
1439
1440MODULE_VERSION(DRIVER_VERSION);
1441MODULE_LICENSE("GPL v2");
1442MODULE_AUTHOR(DRIVER_AUTHOR);
1443MODULE_DESCRIPTION(DRIVER_DESC);
1444MODULE_SOFTDEP("post: vfio_iommu_type1 vfio_iommu_spapr_tce");