drivers/vfio/vfio_main.c
// SPDX-License-Identifier: GPL-2.0-only
/*
 * VFIO core
 *
 * Copyright (C) 2012 Red Hat, Inc. All rights reserved.
 * Author: Alex Williamson <alex.williamson@redhat.com>
 *
 * Derived from original vfio:
 * Copyright 2010 Cisco Systems, Inc. All rights reserved.
 * Author: Tom Lyon, pugs@cisco.com
 */

#include <linux/cdev.h>
#include <linux/compat.h>
#include <linux/device.h>
#include <linux/fs.h>
#include <linux/idr.h>
#include <linux/iommu.h>
#ifdef CONFIG_HAVE_KVM
#include <linux/kvm_host.h>
#endif
#include <linux/list.h>
#include <linux/miscdevice.h>
#include <linux/module.h>
#include <linux/mutex.h>
#include <linux/pci.h>
#include <linux/rwsem.h>
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/stat.h>
#include <linux/string.h>
#include <linux/uaccess.h>
#include <linux/vfio.h>
#include <linux/wait.h>
#include <linux/sched/signal.h>
#include <linux/pm_runtime.h>
#include <linux/interval_tree.h>
#include <linux/iova_bitmap.h>
#include <linux/iommufd.h>
#include "vfio.h"

#define DRIVER_VERSION	"0.3"
#define DRIVER_AUTHOR	"Alex Williamson <alex.williamson@redhat.com>"
#define DRIVER_DESC	"VFIO - User Level meta-driver"

static struct vfio {
	struct class			*device_class;
	struct ida			device_ida;
} vfio;

#ifdef CONFIG_VFIO_NOIOMMU
bool vfio_noiommu __read_mostly;
module_param_named(enable_unsafe_noiommu_mode,
		   vfio_noiommu, bool, S_IRUGO | S_IWUSR);
MODULE_PARM_DESC(enable_unsafe_noiommu_mode, "Enable UNSAFE, no-IOMMU mode. This mode provides no device isolation, no DMA translation, no host kernel protection, cannot be used for device assignment to virtual machines, requires RAWIO permissions, and will taint the kernel. If you do not know what this is for, step away. (default: false)");
#endif

static DEFINE_XARRAY(vfio_device_set_xa);

int vfio_assign_device_set(struct vfio_device *device, void *set_id)
{
	unsigned long idx = (unsigned long)set_id;
	struct vfio_device_set *new_dev_set;
	struct vfio_device_set *dev_set;

	if (WARN_ON(!set_id))
		return -EINVAL;

	/*
	 * Atomically acquire a singleton object in the xarray for this set_id
	 */
	xa_lock(&vfio_device_set_xa);
	dev_set = xa_load(&vfio_device_set_xa, idx);
	if (dev_set)
		goto found_get_ref;
	xa_unlock(&vfio_device_set_xa);

	new_dev_set = kzalloc(sizeof(*new_dev_set), GFP_KERNEL);
	if (!new_dev_set)
		return -ENOMEM;
	mutex_init(&new_dev_set->lock);
	INIT_LIST_HEAD(&new_dev_set->device_list);
	new_dev_set->set_id = set_id;

	xa_lock(&vfio_device_set_xa);
	dev_set = __xa_cmpxchg(&vfio_device_set_xa, idx, NULL, new_dev_set,
			       GFP_KERNEL);
	if (!dev_set) {
		dev_set = new_dev_set;
		goto found_get_ref;
	}

	kfree(new_dev_set);
	if (xa_is_err(dev_set)) {
		xa_unlock(&vfio_device_set_xa);
		return xa_err(dev_set);
	}

found_get_ref:
	dev_set->device_count++;
	xa_unlock(&vfio_device_set_xa);
	mutex_lock(&dev_set->lock);
	device->dev_set = dev_set;
	list_add_tail(&device->dev_set_list, &dev_set->device_list);
	mutex_unlock(&dev_set->lock);
	return 0;
}
EXPORT_SYMBOL_GPL(vfio_assign_device_set);
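
/*
 * Example (illustrative sketch, not part of the upstream file): devices
 * that can only be reset together should share one set by passing the
 * same @set_id. A PCI driver might key the set on the slot and fall
 * back to a singleton set keyed on the device itself; the "example_"
 * names are hypothetical.
 */
static int __maybe_unused example_assign_set(struct vfio_device *vdev,
					     struct pci_dev *pdev)
{
	/* All functions in one slot land in the same vfio_device_set. */
	if (pdev->slot)
		return vfio_assign_device_set(vdev, pdev->slot);

	/* No slot: the device gets a set keyed on itself. */
	return vfio_assign_device_set(vdev, vdev);
}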

static void vfio_release_device_set(struct vfio_device *device)
{
	struct vfio_device_set *dev_set = device->dev_set;

	if (!dev_set)
		return;

	mutex_lock(&dev_set->lock);
	list_del(&device->dev_set_list);
	mutex_unlock(&dev_set->lock);

	xa_lock(&vfio_device_set_xa);
	if (!--dev_set->device_count) {
		__xa_erase(&vfio_device_set_xa,
			   (unsigned long)dev_set->set_id);
		mutex_destroy(&dev_set->lock);
		kfree(dev_set);
	}
	xa_unlock(&vfio_device_set_xa);
}

unsigned int vfio_device_set_open_count(struct vfio_device_set *dev_set)
{
	struct vfio_device *cur;
	unsigned int open_count = 0;

	lockdep_assert_held(&dev_set->lock);

	list_for_each_entry(cur, &dev_set->device_list, dev_set_list)
		open_count += cur->open_count;
	return open_count;
}
EXPORT_SYMBOL_GPL(vfio_device_set_open_count);

/*
 * Device objects - create, release, get, put, search
 */
/* Device reference always implies a group reference */
void vfio_device_put_registration(struct vfio_device *device)
{
	if (refcount_dec_and_test(&device->refcount))
		complete(&device->comp);
}

bool vfio_device_try_get_registration(struct vfio_device *device)
{
	return refcount_inc_not_zero(&device->refcount);
}

/*
 * VFIO driver API
 */
/* Release helper called by vfio_put_device() */
static void vfio_device_release(struct device *dev)
{
	struct vfio_device *device =
			container_of(dev, struct vfio_device, device);

	vfio_release_device_set(device);
	ida_free(&vfio.device_ida, device->index);

	if (device->ops->release)
		device->ops->release(device);

	kvfree(device);
}

static int vfio_init_device(struct vfio_device *device, struct device *dev,
			    const struct vfio_device_ops *ops);

/*
 * Allocate and initialize vfio_device so it can be registered to vfio
 * core.
 *
 * Drivers should use the wrapper vfio_alloc_device() for allocation.
 * @size is the size of the structure to be allocated, including any
 * private data used by the driver.
 *
 * Driver may provide an @init callback to cover device private data.
 *
 * Use vfio_put_device() to release the structure after success return.
 */
struct vfio_device *_vfio_alloc_device(size_t size, struct device *dev,
				       const struct vfio_device_ops *ops)
{
	struct vfio_device *device;
	int ret;

	if (WARN_ON(size < sizeof(struct vfio_device)))
		return ERR_PTR(-EINVAL);

	device = kvzalloc(size, GFP_KERNEL);
	if (!device)
		return ERR_PTR(-ENOMEM);

	ret = vfio_init_device(device, dev, ops);
	if (ret)
		goto out_free;
	return device;

out_free:
	kvfree(device);
	return ERR_PTR(ret);
}
EXPORT_SYMBOL_GPL(_vfio_alloc_device);
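
/*
 * Example (illustrative sketch, not part of the upstream file): a driver
 * embeds struct vfio_device in its own state structure and lets the
 * vfio_alloc_device() wrapper from <linux/vfio.h> size the allocation.
 * The embedded member must sit at offset zero. All "example_" names are
 * hypothetical; a real driver fills in example_ops.
 */
struct example_device {
	struct vfio_device vdev;	/* must be the first member */
	void __iomem *bar0;		/* driver-private state follows */
};

static const struct vfio_device_ops example_ops;	/* filled in by a real driver */

static __maybe_unused struct vfio_device *example_alloc(struct device *dev)
{
	struct example_device *ed;

	ed = vfio_alloc_device(example_device, vdev, dev, &example_ops);
	if (IS_ERR(ed))
		return ERR_CAST(ed);

	/* Drop with vfio_put_device(&ed->vdev) on any later error path. */
	return &ed->vdev;
}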

/*
 * Initialize a vfio_device so it can be registered to vfio core.
 */
static int vfio_init_device(struct vfio_device *device, struct device *dev,
			    const struct vfio_device_ops *ops)
{
	int ret;

	ret = ida_alloc_max(&vfio.device_ida, MINORMASK, GFP_KERNEL);
	if (ret < 0) {
		dev_dbg(dev, "Error to alloc index\n");
		return ret;
	}

	device->index = ret;
	init_completion(&device->comp);
	device->dev = dev;
	device->ops = ops;

	if (ops->init) {
		ret = ops->init(device);
		if (ret)
			goto out_uninit;
	}

	device_initialize(&device->device);
	device->device.release = vfio_device_release;
	device->device.class = vfio.device_class;
	device->device.parent = device->dev;
	return 0;

out_uninit:
	vfio_release_device_set(device);
	ida_free(&vfio.device_ida, device->index);
	return ret;
}

static int __vfio_register_dev(struct vfio_device *device,
			       enum vfio_group_type type)
{
	int ret;

	if (WARN_ON(IS_ENABLED(CONFIG_IOMMUFD) &&
		    (!device->ops->bind_iommufd ||
		     !device->ops->unbind_iommufd ||
		     !device->ops->attach_ioas)))
		return -EINVAL;

	/*
	 * If the driver doesn't specify a set then the device is added to a
	 * singleton set just for itself.
	 */
	if (!device->dev_set)
		vfio_assign_device_set(device, device);

	ret = dev_set_name(&device->device, "vfio%d", device->index);
	if (ret)
		return ret;

	ret = vfio_device_set_group(device, type);
	if (ret)
		return ret;

	ret = device_add(&device->device);
	if (ret)
		goto err_out;

	/* Refcounting can't start until the driver calls register */
	refcount_set(&device->refcount, 1);

	vfio_device_group_register(device);

	return 0;
err_out:
	vfio_device_remove_group(device);
	return ret;
}

int vfio_register_group_dev(struct vfio_device *device)
{
	return __vfio_register_dev(device, VFIO_IOMMU);
}
EXPORT_SYMBOL_GPL(vfio_register_group_dev);

/*
 * Register a virtual device without IOMMU backing. The user of this
 * device must not be able to directly trigger unmediated DMA.
 */
int vfio_register_emulated_iommu_dev(struct vfio_device *device)
{
	return __vfio_register_dev(device, VFIO_EMULATED_IOMMU);
}
EXPORT_SYMBOL_GPL(vfio_register_emulated_iommu_dev);
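
/*
 * Example (illustrative sketch, not part of the upstream file): a typical
 * bus probe/remove pair around the registration API, reusing the
 * hypothetical example_alloc() above. Physical devices behind an IOMMU
 * use vfio_register_group_dev(); mediated/emulated devices would call
 * vfio_register_emulated_iommu_dev() instead. Note that a real
 * example_ops must provide the iommufd callbacks when CONFIG_IOMMUFD is
 * enabled, as checked in __vfio_register_dev().
 */
static int __maybe_unused example_probe(struct device *dev)
{
	struct vfio_device *vdev;
	int ret;

	vdev = example_alloc(dev);
	if (IS_ERR(vdev))
		return PTR_ERR(vdev);

	ret = vfio_register_group_dev(vdev);
	if (ret) {
		vfio_put_device(vdev);	/* balances example_alloc() */
		return ret;
	}
	dev_set_drvdata(dev, vdev);
	return 0;
}

static void __maybe_unused example_remove(struct device *dev)
{
	struct vfio_device *vdev = dev_get_drvdata(dev);

	/* Waits until all open file descriptors are released. */
	vfio_unregister_group_dev(vdev);
	vfio_put_device(vdev);
}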

/*
 * Decrement the device reference count and wait for the device to be
 * removed. Open file descriptors for the device... */
void vfio_unregister_group_dev(struct vfio_device *device)
{
	unsigned int i = 0;
	bool interrupted = false;
	long rc;

	vfio_device_put_registration(device);
	rc = try_wait_for_completion(&device->comp);
	while (rc <= 0) {
		if (device->ops->request)
			device->ops->request(device, i++);

		if (interrupted) {
			rc = wait_for_completion_timeout(&device->comp,
							 HZ * 10);
		} else {
			rc = wait_for_completion_interruptible_timeout(
				&device->comp, HZ * 10);
			if (rc < 0) {
				interrupted = true;
				dev_warn(device->dev,
					 "Device is currently in use, task"
					 " \"%s\" (%d) "
					 "blocked until device is released",
					 current->comm, task_pid_nr(current));
			}
		}
	}

	vfio_device_group_unregister(device);

	/* Balances device_add in register path */
	device_del(&device->device);

	/* Balances vfio_device_set_group in register path */
	vfio_device_remove_group(device);
}
EXPORT_SYMBOL_GPL(vfio_unregister_group_dev);

#ifdef CONFIG_HAVE_KVM
void _vfio_device_get_kvm_safe(struct vfio_device *device, struct kvm *kvm)
{
	void (*pfn)(struct kvm *kvm);
	bool (*fn)(struct kvm *kvm);
	bool ret;

	lockdep_assert_held(&device->dev_set->lock);

	pfn = symbol_get(kvm_put_kvm);
	if (WARN_ON(!pfn))
		return;

	fn = symbol_get(kvm_get_kvm_safe);
	if (WARN_ON(!fn)) {
		symbol_put(kvm_put_kvm);
		return;
	}

	ret = fn(kvm);
	symbol_put(kvm_get_kvm_safe);
	if (!ret) {
		symbol_put(kvm_put_kvm);
		return;
	}

	device->put_kvm = pfn;
	device->kvm = kvm;
}

void vfio_device_put_kvm(struct vfio_device *device)
{
	lockdep_assert_held(&device->dev_set->lock);

	if (!device->kvm)
		return;

	if (WARN_ON(!device->put_kvm))
		goto clear;

	device->put_kvm(device->kvm);
	device->put_kvm = NULL;
	symbol_put(kvm_put_kvm);

clear:
	device->kvm = NULL;
}
#endif

/* true if the vfio_device has open_device() called but not close_device() */
static bool vfio_assert_device_open(struct vfio_device *device)
{
	return !WARN_ON_ONCE(!READ_ONCE(device->open_count));
}

static int vfio_device_first_open(struct vfio_device *device,
				  struct iommufd_ctx *iommufd)
{
	int ret;

	lockdep_assert_held(&device->dev_set->lock);

	if (!try_module_get(device->dev->driver->owner))
		return -ENODEV;

	if (iommufd)
		ret = vfio_iommufd_bind(device, iommufd);
	else
		ret = vfio_device_group_use_iommu(device);
	if (ret)
		goto err_module_put;

	if (device->ops->open_device) {
		ret = device->ops->open_device(device);
		if (ret)
			goto err_unuse_iommu;
	}
	return 0;

err_unuse_iommu:
	if (iommufd)
		vfio_iommufd_unbind(device);
	else
		vfio_device_group_unuse_iommu(device);
err_module_put:
	module_put(device->dev->driver->owner);
	return ret;
}

static void vfio_device_last_close(struct vfio_device *device,
				   struct iommufd_ctx *iommufd)
{
	lockdep_assert_held(&device->dev_set->lock);

	if (device->ops->close_device)
		device->ops->close_device(device);
	if (iommufd)
		vfio_iommufd_unbind(device);
	else
		vfio_device_group_unuse_iommu(device);
	module_put(device->dev->driver->owner);
}

int vfio_device_open(struct vfio_device *device, struct iommufd_ctx *iommufd)
{
	int ret = 0;

	lockdep_assert_held(&device->dev_set->lock);

	device->open_count++;
	if (device->open_count == 1) {
		ret = vfio_device_first_open(device, iommufd);
		if (ret)
			device->open_count--;
	}

	return ret;
}

void vfio_device_close(struct vfio_device *device,
		       struct iommufd_ctx *iommufd)
{
	lockdep_assert_held(&device->dev_set->lock);

	vfio_assert_device_open(device);
	if (device->open_count == 1)
		vfio_device_last_close(device, iommufd);
	device->open_count--;
}

/*
 * Wrapper around pm_runtime_resume_and_get().
 * Return error code on failure or 0 on success.
 */
static inline int vfio_device_pm_runtime_get(struct vfio_device *device)
{
	struct device *dev = device->dev;

	if (dev->driver && dev->driver->pm) {
		int ret;

		ret = pm_runtime_resume_and_get(dev);
		if (ret) {
			dev_info_ratelimited(dev,
				"vfio: runtime resume failed %d\n", ret);
			return -EIO;
		}
	}

	return 0;
}

/*
 * Wrapper around pm_runtime_put().
 */
static inline void vfio_device_pm_runtime_put(struct vfio_device *device)
{
	struct device *dev = device->dev;

	if (dev->driver && dev->driver->pm)
		pm_runtime_put(dev);
}

/*
 * VFIO Device fd
 */
static int vfio_device_fops_release(struct inode *inode, struct file *filep)
{
	struct vfio_device *device = filep->private_data;

	vfio_device_group_close(device);

	vfio_device_put_registration(device);

	return 0;
}

/*
 * vfio_mig_get_next_state - Compute the next step in the FSM
 * @cur_fsm - The current state the device is in
 * @new_fsm - The target state to reach
 * @next_fsm - Pointer to the next step to get to new_fsm
 *
 * Return 0 upon success, otherwise -errno
 * Upon success the next step in the state progression between cur_fsm and
 * new_fsm will be set in next_fsm.
 *
 * This breaks down requests for combination transitions into smaller steps and
 * returns the next step to get to new_fsm. The function may need to be called
 * multiple times before reaching new_fsm.
 *
 */
int vfio_mig_get_next_state(struct vfio_device *device,
			    enum vfio_device_mig_state cur_fsm,
			    enum vfio_device_mig_state new_fsm,
			    enum vfio_device_mig_state *next_fsm)
{
	enum { VFIO_DEVICE_NUM_STATES = VFIO_DEVICE_STATE_PRE_COPY_P2P + 1 };
	/*
	 * The coding in this table requires the driver to implement the
	 * following FSM arcs:
	 *         RESUMING -> STOP
	 *         STOP -> RESUMING
	 *         STOP -> STOP_COPY
	 *         STOP_COPY -> STOP
	 *
	 * If P2P is supported then the driver must also implement these FSM
	 * arcs:
	 *         RUNNING -> RUNNING_P2P
	 *         RUNNING_P2P -> RUNNING
	 *         RUNNING_P2P -> STOP
	 *         STOP -> RUNNING_P2P
	 *
	 * If precopy is supported then the driver must support these additional
	 * FSM arcs:
	 *         RUNNING -> PRE_COPY
	 *         PRE_COPY -> RUNNING
	 *         PRE_COPY -> STOP_COPY
	 * However, if precopy and P2P are supported together then the driver
	 * must support these additional arcs beyond the P2P arcs above:
	 *         PRE_COPY -> RUNNING
	 *         PRE_COPY -> PRE_COPY_P2P
	 *         PRE_COPY_P2P -> PRE_COPY
	 *         PRE_COPY_P2P -> RUNNING_P2P
	 *         PRE_COPY_P2P -> STOP_COPY
	 *         RUNNING -> PRE_COPY
	 *         RUNNING_P2P -> PRE_COPY_P2P
	 *
	 * Without P2P and precopy the driver must implement:
	 *         RUNNING -> STOP
	 *         STOP -> RUNNING
	 *
	 * The coding will step through multiple states for some combination
	 * transitions; if all optional features are supported, this means the
	 * following ones:
	 *         PRE_COPY -> PRE_COPY_P2P -> STOP_COPY
	 *         PRE_COPY -> RUNNING -> RUNNING_P2P
	 *         PRE_COPY -> RUNNING -> RUNNING_P2P -> STOP
	 *         PRE_COPY -> RUNNING -> RUNNING_P2P -> STOP -> RESUMING
	 *         PRE_COPY_P2P -> RUNNING_P2P -> RUNNING
	 *         PRE_COPY_P2P -> RUNNING_P2P -> STOP
	 *         PRE_COPY_P2P -> RUNNING_P2P -> STOP -> RESUMING
	 *         RESUMING -> STOP -> RUNNING_P2P
	 *         RESUMING -> STOP -> RUNNING_P2P -> PRE_COPY_P2P
	 *         RESUMING -> STOP -> RUNNING_P2P -> RUNNING
	 *         RESUMING -> STOP -> RUNNING_P2P -> RUNNING -> PRE_COPY
	 *         RESUMING -> STOP -> STOP_COPY
	 *         RUNNING -> RUNNING_P2P -> PRE_COPY_P2P
	 *         RUNNING -> RUNNING_P2P -> STOP
	 *         RUNNING -> RUNNING_P2P -> STOP -> RESUMING
	 *         RUNNING -> RUNNING_P2P -> STOP -> STOP_COPY
	 *         RUNNING_P2P -> RUNNING -> PRE_COPY
	 *         RUNNING_P2P -> STOP -> RESUMING
	 *         RUNNING_P2P -> STOP -> STOP_COPY
	 *         STOP -> RUNNING_P2P -> PRE_COPY_P2P
	 *         STOP -> RUNNING_P2P -> RUNNING
	 *         STOP -> RUNNING_P2P -> RUNNING -> PRE_COPY
	 *         STOP_COPY -> STOP -> RESUMING
	 *         STOP_COPY -> STOP -> RUNNING_P2P
	 *         STOP_COPY -> STOP -> RUNNING_P2P -> RUNNING
	 *
	 * The following transitions are blocked:
	 *         STOP_COPY -> PRE_COPY
	 *         STOP_COPY -> PRE_COPY_P2P
	 */
	static const u8 vfio_from_fsm_table[VFIO_DEVICE_NUM_STATES][VFIO_DEVICE_NUM_STATES] = {
		[VFIO_DEVICE_STATE_STOP] = {
			[VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_STOP,
			[VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_RUNNING_P2P,
			[VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_RUNNING_P2P,
			[VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P,
			[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_STOP_COPY,
			[VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_RESUMING,
			[VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P,
			[VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
		},
		[VFIO_DEVICE_STATE_RUNNING] = {
			[VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_RUNNING_P2P,
			[VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_RUNNING,
			[VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_PRE_COPY,
			[VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P,
			[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_RUNNING_P2P,
			[VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_RUNNING_P2P,
			[VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P,
			[VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
		},
		[VFIO_DEVICE_STATE_PRE_COPY] = {
			[VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_RUNNING,
			[VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_RUNNING,
			[VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_PRE_COPY,
			[VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_PRE_COPY_P2P,
			[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_PRE_COPY_P2P,
			[VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_RUNNING,
			[VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_RUNNING,
			[VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
		},
		[VFIO_DEVICE_STATE_PRE_COPY_P2P] = {
			[VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_RUNNING_P2P,
			[VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_RUNNING_P2P,
			[VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_PRE_COPY,
			[VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_PRE_COPY_P2P,
			[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_STOP_COPY,
			[VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_RUNNING_P2P,
			[VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P,
			[VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
		},
		[VFIO_DEVICE_STATE_STOP_COPY] = {
			[VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_STOP,
			[VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_STOP,
			[VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_ERROR,
			[VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_ERROR,
			[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_STOP_COPY,
			[VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_STOP,
			[VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_STOP,
			[VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
		},
		[VFIO_DEVICE_STATE_RESUMING] = {
			[VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_STOP,
			[VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_STOP,
			[VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_STOP,
			[VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_STOP,
			[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_STOP,
			[VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_RESUMING,
			[VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_STOP,
			[VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
		},
		[VFIO_DEVICE_STATE_RUNNING_P2P] = {
			[VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_STOP,
			[VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_RUNNING,
			[VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_RUNNING,
			[VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_PRE_COPY_P2P,
			[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_STOP,
			[VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_STOP,
			[VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P,
			[VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
		},
		[VFIO_DEVICE_STATE_ERROR] = {
			[VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_ERROR,
			[VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_ERROR,
			[VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_ERROR,
			[VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_ERROR,
			[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_ERROR,
			[VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_ERROR,
			[VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_ERROR,
			[VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
		},
	};

	static const unsigned int state_flags_table[VFIO_DEVICE_NUM_STATES] = {
		[VFIO_DEVICE_STATE_STOP] = VFIO_MIGRATION_STOP_COPY,
		[VFIO_DEVICE_STATE_RUNNING] = VFIO_MIGRATION_STOP_COPY,
		[VFIO_DEVICE_STATE_PRE_COPY] =
			VFIO_MIGRATION_STOP_COPY | VFIO_MIGRATION_PRE_COPY,
		[VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_MIGRATION_STOP_COPY |
						   VFIO_MIGRATION_P2P |
						   VFIO_MIGRATION_PRE_COPY,
		[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_MIGRATION_STOP_COPY,
		[VFIO_DEVICE_STATE_RESUMING] = VFIO_MIGRATION_STOP_COPY,
		[VFIO_DEVICE_STATE_RUNNING_P2P] =
			VFIO_MIGRATION_STOP_COPY | VFIO_MIGRATION_P2P,
		[VFIO_DEVICE_STATE_ERROR] = ~0U,
	};

	if (WARN_ON(cur_fsm >= ARRAY_SIZE(vfio_from_fsm_table) ||
		    (state_flags_table[cur_fsm] & device->migration_flags) !=
			state_flags_table[cur_fsm]))
		return -EINVAL;

	if (new_fsm >= ARRAY_SIZE(vfio_from_fsm_table) ||
	    (state_flags_table[new_fsm] & device->migration_flags) !=
			state_flags_table[new_fsm])
		return -EINVAL;

	/*
	 * Arcs touching optional and unsupported states are skipped over. The
	 * driver will instead see an arc from the original state to the next
	 * logical state, as per the above comment.
	 */
	*next_fsm = vfio_from_fsm_table[cur_fsm][new_fsm];
	while ((state_flags_table[*next_fsm] & device->migration_flags) !=
			state_flags_table[*next_fsm])
		*next_fsm = vfio_from_fsm_table[*next_fsm][new_fsm];

	return (*next_fsm != VFIO_DEVICE_STATE_ERROR) ? 0 : -EINVAL;
}
EXPORT_SYMBOL_GPL(vfio_mig_get_next_state);
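
/*
 * Example (illustrative sketch, not part of the upstream file): a
 * migration driver typically walks the FSM one arc at a time from its
 * ->migration_set_state() op, issuing one device command per step until
 * the requested state is reached. example_issue_arc() stands in for the
 * device-specific command.
 */
static int example_issue_arc(struct vfio_device *vdev,
			     enum vfio_device_mig_state next)
{
	/* Program the device for this single, driver-supported arc. */
	return 0;
}

static int __maybe_unused
example_set_state(struct vfio_device *vdev,
		  enum vfio_device_mig_state cur,
		  enum vfio_device_mig_state new_state)
{
	enum vfio_device_mig_state next;
	int ret;

	while (cur != new_state) {
		ret = vfio_mig_get_next_state(vdev, cur, new_state, &next);
		if (ret)
			return ret;
		ret = example_issue_arc(vdev, next);
		if (ret)
			return ret;
		cur = next;
	}
	return 0;
}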

/*
 * Convert the driver's struct file into a FD number and return it to userspace
 */
static int vfio_ioct_mig_return_fd(struct file *filp, void __user *arg,
				   struct vfio_device_feature_mig_state *mig)
{
	int ret;
	int fd;

	fd = get_unused_fd_flags(O_CLOEXEC);
	if (fd < 0) {
		ret = fd;
		goto out_fput;
	}

	mig->data_fd = fd;
	if (copy_to_user(arg, mig, sizeof(*mig))) {
		ret = -EFAULT;
		goto out_put_unused;
	}
	fd_install(fd, filp);
	return 0;

out_put_unused:
	put_unused_fd(fd);
out_fput:
	fput(filp);
	return ret;
}

static int
vfio_ioctl_device_feature_mig_device_state(struct vfio_device *device,
					   u32 flags, void __user *arg,
					   size_t argsz)
{
	size_t minsz =
		offsetofend(struct vfio_device_feature_mig_state, data_fd);
	struct vfio_device_feature_mig_state mig;
	struct file *filp = NULL;
	int ret;

	if (!device->mig_ops)
		return -ENOTTY;

	ret = vfio_check_feature(flags, argsz,
				 VFIO_DEVICE_FEATURE_SET |
				 VFIO_DEVICE_FEATURE_GET,
				 sizeof(mig));
	if (ret != 1)
		return ret;

	if (copy_from_user(&mig, arg, minsz))
		return -EFAULT;

	if (flags & VFIO_DEVICE_FEATURE_GET) {
		enum vfio_device_mig_state curr_state;

		ret = device->mig_ops->migration_get_state(device,
							   &curr_state);
		if (ret)
			return ret;
		mig.device_state = curr_state;
		goto out_copy;
	}

	/* Handle the VFIO_DEVICE_FEATURE_SET */
	filp = device->mig_ops->migration_set_state(device, mig.device_state);
	if (IS_ERR(filp) || !filp)
		goto out_copy;

	return vfio_ioct_mig_return_fd(filp, arg, &mig);
out_copy:
	mig.data_fd = -1;
	if (copy_to_user(arg, &mig, sizeof(mig)))
		return -EFAULT;
	if (IS_ERR(filp))
		return PTR_ERR(filp);
	return 0;
}

static int
vfio_ioctl_device_feature_migration_data_size(struct vfio_device *device,
					      u32 flags, void __user *arg,
					      size_t argsz)
{
	struct vfio_device_feature_mig_data_size data_size = {};
	unsigned long stop_copy_length;
	int ret;

	if (!device->mig_ops)
		return -ENOTTY;

	ret = vfio_check_feature(flags, argsz, VFIO_DEVICE_FEATURE_GET,
				 sizeof(data_size));
	if (ret != 1)
		return ret;

	ret = device->mig_ops->migration_get_data_size(device, &stop_copy_length);
	if (ret)
		return ret;

	data_size.stop_copy_length = stop_copy_length;
	if (copy_to_user(arg, &data_size, sizeof(data_size)))
		return -EFAULT;

	return 0;
}

static int vfio_ioctl_device_feature_migration(struct vfio_device *device,
					       u32 flags, void __user *arg,
					       size_t argsz)
{
	struct vfio_device_feature_migration mig = {
		.flags = device->migration_flags,
	};
	int ret;

	if (!device->mig_ops)
		return -ENOTTY;

	ret = vfio_check_feature(flags, argsz, VFIO_DEVICE_FEATURE_GET,
				 sizeof(mig));
	if (ret != 1)
		return ret;
	if (copy_to_user(arg, &mig, sizeof(mig)))
		return -EFAULT;
	return 0;
}

/* Ranges should fit into a single kernel page */
#define LOG_MAX_RANGES \
	(PAGE_SIZE / sizeof(struct vfio_device_feature_dma_logging_range))

static int
vfio_ioctl_device_feature_logging_start(struct vfio_device *device,
					u32 flags, void __user *arg,
					size_t argsz)
{
	size_t minsz =
		offsetofend(struct vfio_device_feature_dma_logging_control,
			    ranges);
	struct vfio_device_feature_dma_logging_range __user *ranges;
	struct vfio_device_feature_dma_logging_control control;
	struct vfio_device_feature_dma_logging_range range;
	struct rb_root_cached root = RB_ROOT_CACHED;
	struct interval_tree_node *nodes;
	u64 iova_end;
	u32 nnodes;
	int i, ret;

	if (!device->log_ops)
		return -ENOTTY;

	ret = vfio_check_feature(flags, argsz,
				 VFIO_DEVICE_FEATURE_SET,
				 sizeof(control));
	if (ret != 1)
		return ret;

	if (copy_from_user(&control, arg, minsz))
		return -EFAULT;

	nnodes = control.num_ranges;
	if (!nnodes)
		return -EINVAL;

	if (nnodes > LOG_MAX_RANGES)
		return -E2BIG;

	ranges = u64_to_user_ptr(control.ranges);
	nodes = kmalloc_array(nnodes, sizeof(struct interval_tree_node),
			      GFP_KERNEL);
	if (!nodes)
		return -ENOMEM;

	for (i = 0; i < nnodes; i++) {
		if (copy_from_user(&range, &ranges[i], sizeof(range))) {
			ret = -EFAULT;
			goto end;
		}
		if (!IS_ALIGNED(range.iova, control.page_size) ||
		    !IS_ALIGNED(range.length, control.page_size)) {
			ret = -EINVAL;
			goto end;
		}

		if (check_add_overflow(range.iova, range.length, &iova_end) ||
		    iova_end > ULONG_MAX) {
			ret = -EOVERFLOW;
			goto end;
		}

		nodes[i].start = range.iova;
		nodes[i].last = range.iova + range.length - 1;
		if (interval_tree_iter_first(&root, nodes[i].start,
					     nodes[i].last)) {
			/* Range overlapping */
			ret = -EINVAL;
			goto end;
		}
		interval_tree_insert(nodes + i, &root);
	}

	ret = device->log_ops->log_start(device, &root, nnodes,
					 &control.page_size);
	if (ret)
		goto end;

	if (copy_to_user(arg, &control, sizeof(control))) {
		ret = -EFAULT;
		device->log_ops->log_stop(device);
	}

end:
	kfree(nodes);
	return ret;
}

static int
vfio_ioctl_device_feature_logging_stop(struct vfio_device *device,
				       u32 flags, void __user *arg,
				       size_t argsz)
{
	int ret;

	if (!device->log_ops)
		return -ENOTTY;

	ret = vfio_check_feature(flags, argsz,
				 VFIO_DEVICE_FEATURE_SET, 0);
	if (ret != 1)
		return ret;

	return device->log_ops->log_stop(device);
}

static int vfio_device_log_read_and_clear(struct iova_bitmap *iter,
					  unsigned long iova, size_t length,
					  void *opaque)
{
	struct vfio_device *device = opaque;

	return device->log_ops->log_read_and_clear(device, iova, length, iter);
}

static int
vfio_ioctl_device_feature_logging_report(struct vfio_device *device,
					 u32 flags, void __user *arg,
					 size_t argsz)
{
	size_t minsz =
		offsetofend(struct vfio_device_feature_dma_logging_report,
			    bitmap);
	struct vfio_device_feature_dma_logging_report report;
	struct iova_bitmap *iter;
	u64 iova_end;
	int ret;

	if (!device->log_ops)
		return -ENOTTY;

	ret = vfio_check_feature(flags, argsz,
				 VFIO_DEVICE_FEATURE_GET,
				 sizeof(report));
	if (ret != 1)
		return ret;

	if (copy_from_user(&report, arg, minsz))
		return -EFAULT;

	if (report.page_size < SZ_4K || !is_power_of_2(report.page_size))
		return -EINVAL;

	if (check_add_overflow(report.iova, report.length, &iova_end) ||
	    iova_end > ULONG_MAX)
		return -EOVERFLOW;

	iter = iova_bitmap_alloc(report.iova, report.length,
				 report.page_size,
				 u64_to_user_ptr(report.bitmap));
	if (IS_ERR(iter))
		return PTR_ERR(iter);

	ret = iova_bitmap_for_each(iter, device,
				   vfio_device_log_read_and_clear);

	iova_bitmap_free(iter);
	return ret;
}

static int vfio_ioctl_device_feature(struct vfio_device *device,
				     struct vfio_device_feature __user *arg)
{
	size_t minsz = offsetofend(struct vfio_device_feature, flags);
	struct vfio_device_feature feature;

	if (copy_from_user(&feature, arg, minsz))
		return -EFAULT;

	if (feature.argsz < minsz)
		return -EINVAL;

	/* Check unknown flags */
	if (feature.flags &
	    ~(VFIO_DEVICE_FEATURE_MASK | VFIO_DEVICE_FEATURE_SET |
	      VFIO_DEVICE_FEATURE_GET | VFIO_DEVICE_FEATURE_PROBE))
		return -EINVAL;

	/* GET & SET are mutually exclusive except with PROBE */
	if (!(feature.flags & VFIO_DEVICE_FEATURE_PROBE) &&
	    (feature.flags & VFIO_DEVICE_FEATURE_SET) &&
	    (feature.flags & VFIO_DEVICE_FEATURE_GET))
		return -EINVAL;

	switch (feature.flags & VFIO_DEVICE_FEATURE_MASK) {
	case VFIO_DEVICE_FEATURE_MIGRATION:
		return vfio_ioctl_device_feature_migration(
			device, feature.flags, arg->data,
			feature.argsz - minsz);
	case VFIO_DEVICE_FEATURE_MIG_DEVICE_STATE:
		return vfio_ioctl_device_feature_mig_device_state(
			device, feature.flags, arg->data,
			feature.argsz - minsz);
	case VFIO_DEVICE_FEATURE_DMA_LOGGING_START:
		return vfio_ioctl_device_feature_logging_start(
			device, feature.flags, arg->data,
			feature.argsz - minsz);
	case VFIO_DEVICE_FEATURE_DMA_LOGGING_STOP:
		return vfio_ioctl_device_feature_logging_stop(
			device, feature.flags, arg->data,
			feature.argsz - minsz);
	case VFIO_DEVICE_FEATURE_DMA_LOGGING_REPORT:
		return vfio_ioctl_device_feature_logging_report(
			device, feature.flags, arg->data,
			feature.argsz - minsz);
	case VFIO_DEVICE_FEATURE_MIG_DATA_SIZE:
		return vfio_ioctl_device_feature_migration_data_size(
			device, feature.flags, arg->data,
			feature.argsz - minsz);
	default:
		if (unlikely(!device->ops->device_feature))
			return -EINVAL;
		return device->ops->device_feature(device, feature.flags,
						   arg->data,
						   feature.argsz - minsz);
	}
}
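
/*
 * Example (illustrative sketch, not part of the upstream file): the
 * default case above forwards unknown features to the driver's
 * ->device_feature op, which validates flags and size with
 * vfio_check_feature() exactly like the core handlers do. The feature
 * number and one-byte payload are hypothetical.
 */
#define EXAMPLE_FEATURE_DUMMY 64	/* hypothetical feature number */

static int __maybe_unused
example_device_feature(struct vfio_device *device, u32 flags,
		       void __user *arg, size_t argsz)
{
	u8 val = 0;
	int ret;

	if ((flags & VFIO_DEVICE_FEATURE_MASK) != EXAMPLE_FEATURE_DUMMY)
		return -ENOTTY;

	/* Returns 1 to proceed, 0 for a probe-only call, -errno on error. */
	ret = vfio_check_feature(flags, argsz, VFIO_DEVICE_FEATURE_GET,
				 sizeof(val));
	if (ret != 1)
		return ret;

	if (copy_to_user(arg, &val, sizeof(val)))
		return -EFAULT;
	return 0;
}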

static long vfio_device_fops_unl_ioctl(struct file *filep,
				       unsigned int cmd, unsigned long arg)
{
	struct vfio_device *device = filep->private_data;
	int ret;

	ret = vfio_device_pm_runtime_get(device);
	if (ret)
		return ret;

	switch (cmd) {
	case VFIO_DEVICE_FEATURE:
		ret = vfio_ioctl_device_feature(device, (void __user *)arg);
		break;

	default:
		if (unlikely(!device->ops->ioctl))
			ret = -EINVAL;
		else
			ret = device->ops->ioctl(device, cmd, arg);
		break;
	}

	vfio_device_pm_runtime_put(device);
	return ret;
}

static ssize_t vfio_device_fops_read(struct file *filep, char __user *buf,
				     size_t count, loff_t *ppos)
{
	struct vfio_device *device = filep->private_data;

	if (unlikely(!device->ops->read))
		return -EINVAL;

	return device->ops->read(device, buf, count, ppos);
}

static ssize_t vfio_device_fops_write(struct file *filep,
				      const char __user *buf,
				      size_t count, loff_t *ppos)
{
	struct vfio_device *device = filep->private_data;

	if (unlikely(!device->ops->write))
		return -EINVAL;

	return device->ops->write(device, buf, count, ppos);
}

static int vfio_device_fops_mmap(struct file *filep, struct vm_area_struct *vma)
{
	struct vfio_device *device = filep->private_data;

	if (unlikely(!device->ops->mmap))
		return -EINVAL;

	return device->ops->mmap(device, vma);
}

const struct file_operations vfio_device_fops = {
	.owner = THIS_MODULE,
	.release = vfio_device_fops_release,
	.read = vfio_device_fops_read,
	.write = vfio_device_fops_write,
	.unlocked_ioctl = vfio_device_fops_unl_ioctl,
	.compat_ioctl = compat_ptr_ioctl,
	.mmap = vfio_device_fops_mmap,
};

/*
 * Sub-module support
 */
/*
 * Helper for managing a buffer of info chain capabilities, allocate or
 * reallocate a buffer with additional @size, filling in @id and @version
 * of the capability. A pointer to the new capability is returned.
 *
 * NB. The chain is based at the head of the buffer, so new entries are
 * added to the tail, vfio_info_cap_shift() should be called to fixup the
 * next offsets prior to copying to the user buffer.
 */
struct vfio_info_cap_header *vfio_info_cap_add(struct vfio_info_cap *caps,
					       size_t size, u16 id, u16 version)
{
	void *buf;
	struct vfio_info_cap_header *header, *tmp;

	buf = krealloc(caps->buf, caps->size + size, GFP_KERNEL);
	if (!buf) {
		kfree(caps->buf);
		caps->buf = NULL;
		caps->size = 0;
		return ERR_PTR(-ENOMEM);
	}

	caps->buf = buf;
	header = buf + caps->size;

	/* Eventually copied to user buffer, zero */
	memset(header, 0, size);

	header->id = id;
	header->version = version;

	/* Add to the end of the capability chain */
	for (tmp = buf; tmp->next; tmp = buf + tmp->next)
		; /* nothing */

	tmp->next = caps->size;
	caps->size += size;

	return header;
}
EXPORT_SYMBOL_GPL(vfio_info_cap_add);

void vfio_info_cap_shift(struct vfio_info_cap *caps, size_t offset)
{
	struct vfio_info_cap_header *tmp;
	void *buf = (void *)caps->buf;

	for (tmp = buf; tmp->next; tmp = buf + tmp->next - offset)
		tmp->next += offset;
}
EXPORT_SYMBOL(vfio_info_cap_shift);
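
/*
 * Example (illustrative sketch, not part of the upstream file): building
 * a capability chain for a *_INFO ioctl. After all capabilities are
 * added, the ->next offsets are shifted by the size of the fixed info
 * struct so they are relative to the start of the user buffer; the
 * caller then copies caps->buf out behind that struct. The IOVA-range
 * capability is borrowed here purely for its familiar shape.
 */
static int __maybe_unused example_fill_caps(struct vfio_info_cap *caps,
					    size_t info_size)
{
	struct vfio_iommu_type1_info_cap_iova_range *cap;
	size_t size = sizeof(*cap) + sizeof(cap->iova_ranges[0]);

	cap = (void *)vfio_info_cap_add(caps, size,
					VFIO_IOMMU_TYPE1_INFO_CAP_IOVA_RANGE, 1);
	if (IS_ERR(cap))
		return PTR_ERR(cap);

	cap->nr_iovas = 1;
	cap->iova_ranges[0].start = 0;
	cap->iova_ranges[0].end = ~0ULL;

	/* Make the chain offsets relative to the user buffer. */
	vfio_info_cap_shift(caps, info_size);
	return 0;
}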

int vfio_info_add_capability(struct vfio_info_cap *caps,
			     struct vfio_info_cap_header *cap, size_t size)
{
	struct vfio_info_cap_header *header;

	header = vfio_info_cap_add(caps, size, cap->id, cap->version);
	if (IS_ERR(header))
		return PTR_ERR(header);

	memcpy(header + 1, cap + 1, size - sizeof(*header));

	return 0;
}
EXPORT_SYMBOL(vfio_info_add_capability);

int vfio_set_irqs_validate_and_prepare(struct vfio_irq_set *hdr, int num_irqs,
				       int max_irq_type, size_t *data_size)
{
	unsigned long minsz;
	size_t size;

	minsz = offsetofend(struct vfio_irq_set, count);

	if ((hdr->argsz < minsz) || (hdr->index >= max_irq_type) ||
	    (hdr->count >= (U32_MAX - hdr->start)) ||
	    (hdr->flags & ~(VFIO_IRQ_SET_DATA_TYPE_MASK |
			    VFIO_IRQ_SET_ACTION_TYPE_MASK)))
		return -EINVAL;

	if (data_size)
		*data_size = 0;

	if (hdr->start >= num_irqs || hdr->start + hdr->count > num_irqs)
		return -EINVAL;

	switch (hdr->flags & VFIO_IRQ_SET_DATA_TYPE_MASK) {
	case VFIO_IRQ_SET_DATA_NONE:
		size = 0;
		break;
	case VFIO_IRQ_SET_DATA_BOOL:
		size = sizeof(uint8_t);
		break;
	case VFIO_IRQ_SET_DATA_EVENTFD:
		size = sizeof(int32_t);
		break;
	default:
		return -EINVAL;
	}

	if (size) {
		if (hdr->argsz - minsz < hdr->count * size)
			return -EINVAL;

		if (!data_size)
			return -EINVAL;

		*data_size = hdr->count * size;
	}

	return 0;
}
EXPORT_SYMBOL(vfio_set_irqs_validate_and_prepare);
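
/*
 * Example (illustrative sketch, not part of the upstream file): a
 * driver's VFIO_DEVICE_SET_IRQS path validates the header first, then
 * copies the trailing payload when @data_size comes back non-zero.
 * EXAMPLE_NUM_IRQS and the elided programming step are hypothetical;
 * VFIO_PCI_NUM_IRQS is the index count a vfio-pci style driver would
 * pass as @max_irq_type.
 */
#define EXAMPLE_NUM_IRQS 1	/* IRQs available at the chosen index */

static int __maybe_unused
example_set_irqs(struct vfio_device *device, struct vfio_irq_set *hdr,
		 void __user *udata)
{
	size_t data_size = 0;
	u8 *data = NULL;
	int ret;

	ret = vfio_set_irqs_validate_and_prepare(hdr, EXAMPLE_NUM_IRQS,
						 VFIO_PCI_NUM_IRQS, &data_size);
	if (ret)
		return ret;

	if (data_size) {
		/* Eventfds (s32) or bools (u8) follow the header. */
		data = memdup_user(udata, data_size);
		if (IS_ERR(data))
			return PTR_ERR(data);
	}

	/* ... program triggers/masks according to hdr->flags here ... */

	kfree(data);
	return 0;
}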

/*
 * Pin contiguous user pages and return their associated host pages for local
 * domain only.
 * @device [in]  : device
 * @iova [in]    : starting IOVA of user pages to be pinned.
 * @npage [in]   : count of pages to be pinned. This count should not
 *		   be greater than VFIO_PIN_PAGES_MAX_ENTRIES.
 * @prot [in]    : protection flags
 * @pages[out]   : array of host pages
 * Return error or number of pages pinned.
 *
 * A driver may only call this function if the vfio_device was created
 * by vfio_register_emulated_iommu_dev() due to vfio_device_container_pin_pages().
 */
int vfio_pin_pages(struct vfio_device *device, dma_addr_t iova,
		   int npage, int prot, struct page **pages)
{
	/* group->container cannot change while a vfio device is open */
	if (!pages || !npage || WARN_ON(!vfio_assert_device_open(device)))
		return -EINVAL;
	if (vfio_device_has_container(device))
		return vfio_device_container_pin_pages(device, iova,
						       npage, prot, pages);
	if (device->iommufd_access) {
		int ret;

		if (iova > ULONG_MAX)
			return -EINVAL;
		/*
		 * VFIO ignores the sub page offset, npages is from the start of
		 * a PAGE_SIZE chunk of IOVA. The caller is expected to recover
		 * the sub page offset by doing:
		 * pages[0] + (iova % PAGE_SIZE)
		 */
		ret = iommufd_access_pin_pages(
			device->iommufd_access, ALIGN_DOWN(iova, PAGE_SIZE),
			npage * PAGE_SIZE, pages,
			(prot & IOMMU_WRITE) ? IOMMUFD_ACCESS_RW_WRITE : 0);
		if (ret)
			return ret;
		return npage;
	}
	return -EINVAL;
}
EXPORT_SYMBOL(vfio_pin_pages);
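
/*
 * Example (illustrative sketch, not part of the upstream file): an
 * emulated-IOMMU (mdev-style) driver pins the PAGE_SIZE chunk holding
 * @iova while the device works on it. As noted above, the sub-page
 * offset is the caller's to re-apply.
 */
static int __maybe_unused example_pin_one(struct vfio_device *device,
					  dma_addr_t iova, struct page **page)
{
	int ret;

	ret = vfio_pin_pages(device, iova, 1, IOMMU_READ | IOMMU_WRITE, page);
	if (ret != 1)
		return ret < 0 ? ret : -EFAULT;

	/*
	 * Data for @iova starts at offset (iova % PAGE_SIZE) into *page;
	 * drop the pin with vfio_unpin_pages(device, iova, 1) when done.
	 */
	return 0;
}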

/*
 * Unpin contiguous host pages for local domain only.
 * @device [in]  : device
 * @iova [in]    : starting address of user pages to be unpinned.
 * @npage [in]   : count of pages to be unpinned. This count should not
 *		   be greater than VFIO_PIN_PAGES_MAX_ENTRIES.
 */
void vfio_unpin_pages(struct vfio_device *device, dma_addr_t iova, int npage)
{
	if (WARN_ON(!vfio_assert_device_open(device)))
		return;

	if (vfio_device_has_container(device)) {
		vfio_device_container_unpin_pages(device, iova, npage);
		return;
	}
	if (device->iommufd_access) {
		if (WARN_ON(iova > ULONG_MAX))
			return;
		iommufd_access_unpin_pages(device->iommufd_access,
					   ALIGN_DOWN(iova, PAGE_SIZE),
					   npage * PAGE_SIZE);
		return;
	}
}
EXPORT_SYMBOL(vfio_unpin_pages);

/*
 * This interface allows the CPUs to perform some sort of virtual DMA on
 * behalf of the device.
 *
 * CPUs read/write from/into a range of IOVAs pointing to user space memory
 * into/from a kernel buffer.
 *
 * As the read/write of user space memory is conducted via the CPUs and is
 * not a real device DMA, it is not necessary to pin the user space memory.
 *
 * @device [in]		: VFIO device
 * @iova [in]		: base IOVA of a user space buffer
 * @data [in]		: pointer to kernel buffer
 * @len [in]		: kernel buffer length
 * @write		: indicate read or write
 * Return error code on failure or 0 on success.
 */
int vfio_dma_rw(struct vfio_device *device, dma_addr_t iova, void *data,
		size_t len, bool write)
{
	if (!data || len <= 0 || !vfio_assert_device_open(device))
		return -EINVAL;

	if (vfio_device_has_container(device))
		return vfio_device_container_dma_rw(device, iova,
						    data, len, write);

	if (device->iommufd_access) {
		unsigned int flags = 0;

		if (iova > ULONG_MAX)
			return -EINVAL;

		/* VFIO historically tries to auto-detect a kthread */
		if (!current->mm)
			flags |= IOMMUFD_ACCESS_RW_KTHREAD;
		if (write)
			flags |= IOMMUFD_ACCESS_RW_WRITE;
		return iommufd_access_rw(device->iommufd_access, iova, data,
					 len, flags);
	}
	return -EINVAL;
}
EXPORT_SYMBOL(vfio_dma_rw);
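
/*
 * Example (illustrative sketch, not part of the upstream file): CPU-side
 * "DMA" into guest memory, e.g. an emulated device writing a completion
 * record at an IOVA taken from a ring descriptor. No pinning is needed;
 * the core resolves and touches the user mapping directly.
 */
static int __maybe_unused example_post_completion(struct vfio_device *device,
						  dma_addr_t iova, u32 status)
{
	__le32 rec = cpu_to_le32(status);

	return vfio_dma_rw(device, iova, &rec, sizeof(rec), true);
}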

/*
 * Module/class support
 */
static int __init vfio_init(void)
{
	int ret;

	ida_init(&vfio.device_ida);

	ret = vfio_group_init();
	if (ret)
		return ret;

	ret = vfio_virqfd_init();
	if (ret)
		goto err_virqfd;

	/* /sys/class/vfio-dev/vfioX */
	vfio.device_class = class_create("vfio-dev");
	if (IS_ERR(vfio.device_class)) {
		ret = PTR_ERR(vfio.device_class);
		goto err_dev_class;
	}

	pr_info(DRIVER_DESC " version: " DRIVER_VERSION "\n");
	return 0;

err_dev_class:
	vfio_virqfd_exit();
err_virqfd:
	vfio_group_cleanup();
	return ret;
}

static void __exit vfio_cleanup(void)
{
	ida_destroy(&vfio.device_ida);
	class_destroy(vfio.device_class);
	vfio.device_class = NULL;
	vfio_virqfd_exit();
	vfio_group_cleanup();
	xa_destroy(&vfio_device_set_xa);
}

module_init(vfio_init);
module_exit(vfio_cleanup);

MODULE_VERSION(DRIVER_VERSION);
MODULE_LICENSE("GPL v2");
MODULE_AUTHOR(DRIVER_AUTHOR);
MODULE_DESCRIPTION(DRIVER_DESC);
MODULE_SOFTDEP("post: vfio_iommu_type1 vfio_iommu_spapr_tce");