drivers/vfio/pci/vfio_pci.c

/*
 * Copyright (C) 2012 Red Hat, Inc.  All rights reserved.
 *     Author: Alex Williamson <alex.williamson@redhat.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as
 * published by the Free Software Foundation.
 *
 * Derived from original vfio:
 * Copyright 2010 Cisco Systems, Inc.  All rights reserved.
 * Author: Tom Lyon, pugs@cisco.com
 */

#include <linux/device.h>
#include <linux/eventfd.h>
#include <linux/file.h>
#include <linux/interrupt.h>
#include <linux/iommu.h>
#include <linux/module.h>
#include <linux/mutex.h>
#include <linux/notifier.h>
#include <linux/pci.h>
#include <linux/pm_runtime.h>
#include <linux/slab.h>
#include <linux/types.h>
#include <linux/uaccess.h>
#include <linux/vfio.h>

#include "vfio_pci_private.h"

#define DRIVER_VERSION	"0.2"
#define DRIVER_AUTHOR	"Alex Williamson <alex.williamson@redhat.com>"
#define DRIVER_DESC	"VFIO PCI - User Level meta-driver"

static bool nointxmask;
module_param_named(nointxmask, nointxmask, bool, S_IRUGO | S_IWUSR);
MODULE_PARM_DESC(nointxmask,
		 "Disable support for PCI 2.3 style INTx masking.  If this resolves problems for specific devices, report lspci -vvvxxx to linux-pci@vger.kernel.org so the device can be fixed automatically via the broken_intx_masking flag.");

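/*
 * Example (illustrative, not part of the driver): since the parameter is
 * declared S_IRUGO | S_IWUSR, nointxmask can be set at load time or
 * toggled at runtime through sysfs:
 *
 *	modprobe vfio-pci nointxmask=1
 *	echo 1 > /sys/module/vfio_pci/parameters/nointxmask
 */
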
static int vfio_pci_enable(struct vfio_pci_device *vdev)
{
	struct pci_dev *pdev = vdev->pdev;
	int ret;
	u16 cmd;
	u8 msix_pos;

	ret = pci_enable_device(pdev);
	if (ret)
		return ret;

	vdev->reset_works = (pci_reset_function(pdev) == 0);
	pci_save_state(pdev);
	vdev->pci_saved_state = pci_store_saved_state(pdev);
	if (!vdev->pci_saved_state)
		pr_debug("%s: Couldn't store %s saved state\n",
			 __func__, dev_name(&pdev->dev));

	ret = vfio_config_init(vdev);
	if (ret) {
		kfree(vdev->pci_saved_state);
		vdev->pci_saved_state = NULL;
		pci_disable_device(pdev);
		return ret;
	}

	if (likely(!nointxmask))
		vdev->pci_2_3 = pci_intx_mask_supported(pdev);

	pci_read_config_word(pdev, PCI_COMMAND, &cmd);
	if (vdev->pci_2_3 && (cmd & PCI_COMMAND_INTX_DISABLE)) {
		cmd &= ~PCI_COMMAND_INTX_DISABLE;
		pci_write_config_word(pdev, PCI_COMMAND, cmd);
	}

	msix_pos = pdev->msix_cap;
	if (msix_pos) {
		u16 flags;
		u32 table;

		pci_read_config_word(pdev, msix_pos + PCI_MSIX_FLAGS, &flags);
		pci_read_config_dword(pdev, msix_pos + PCI_MSIX_TABLE, &table);

		vdev->msix_bar = table & PCI_MSIX_TABLE_BIR;
		vdev->msix_offset = table & PCI_MSIX_TABLE_OFFSET;
		vdev->msix_size = ((flags & PCI_MSIX_FLAGS_QSIZE) + 1) * 16;
	} else
		vdev->msix_bar = 0xFF;

#ifdef CONFIG_VFIO_PCI_VGA
	if ((pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA)
		vdev->has_vga = true;
#endif

	return 0;
}
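
/*
 * Worked example for the msix_size computation above (illustrative):
 * PCI_MSIX_FLAGS_QSIZE encodes the table size minus one, and each MSI-X
 * table entry is 16 bytes, so a device reporting QSIZE = 7 has 8 vectors
 * and an msix_size of 8 * 16 = 128 bytes.
 */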

static void vfio_pci_disable(struct vfio_pci_device *vdev)
{
	struct pci_dev *pdev = vdev->pdev;
	int bar;

	pci_disable_device(pdev);

	vfio_pci_set_irqs_ioctl(vdev, VFIO_IRQ_SET_DATA_NONE |
				VFIO_IRQ_SET_ACTION_TRIGGER,
				vdev->irq_type, 0, 0, NULL);

	vdev->virq_disabled = false;

	vfio_config_free(vdev);

	for (bar = PCI_STD_RESOURCES; bar <= PCI_STD_RESOURCE_END; bar++) {
		if (!vdev->barmap[bar])
			continue;
		pci_iounmap(pdev, vdev->barmap[bar]);
		pci_release_selected_regions(pdev, 1 << bar);
		vdev->barmap[bar] = NULL;
	}

	/*
	 * If we have saved state, restore it.  If we can reset the device,
	 * even better.  Resetting with current state seems better than
	 * nothing, but saving and restoring current state without reset
	 * is just busy work.
	 */
	if (pci_load_and_free_saved_state(pdev, &vdev->pci_saved_state)) {
		pr_info("%s: Couldn't reload %s saved state\n",
			__func__, dev_name(&pdev->dev));

		if (!vdev->reset_works)
			return;

		pci_save_state(pdev);
	}

	/*
	 * Disable INTx and MSI, presumably to avoid spurious interrupts
	 * during reset.  Stolen from pci_reset_function()
	 */
	pci_write_config_word(pdev, PCI_COMMAND, PCI_COMMAND_INTX_DISABLE);

	/*
	 * Try to reset the device.  The success of this is dependent on
	 * being able to lock the device, which is not always possible.
	 */
	if (vdev->reset_works) {
		int ret = pci_try_reset_function(pdev);
		if (ret)
			pr_warn("%s: Failed to reset device %s (%d)\n",
				__func__, dev_name(&pdev->dev), ret);
	}

	pci_restore_state(pdev);
}

static void vfio_pci_release(void *device_data)
{
	struct vfio_pci_device *vdev = device_data;

	if (atomic_dec_and_test(&vdev->refcnt)) {
		vfio_spapr_pci_eeh_release(vdev->pdev);
		vfio_pci_disable(vdev);
	}

	module_put(THIS_MODULE);
}

static int vfio_pci_open(void *device_data)
{
	struct vfio_pci_device *vdev = device_data;
	int ret;

	if (!try_module_get(THIS_MODULE))
		return -ENODEV;

	if (atomic_inc_return(&vdev->refcnt) == 1) {
		ret = vfio_pci_enable(vdev);
		if (ret)
			goto error;

		ret = vfio_spapr_pci_eeh_open(vdev->pdev);
		if (ret) {
			vfio_pci_disable(vdev);
			goto error;
		}
	}

	return 0;
error:
	module_put(THIS_MODULE);
	return ret;
}
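
/*
 * Example (illustrative userspace sketch, not part of the driver): the
 * open path above runs when userspace requests a device fd from its VFIO
 * group.  The group number and device address below are assumptions for
 * the example.
 */
#if 0
	int container = open("/dev/vfio/vfio", O_RDWR);
	int group = open("/dev/vfio/26", O_RDWR);	/* assumed group 26 */

	ioctl(group, VFIO_GROUP_SET_CONTAINER, &container);
	ioctl(container, VFIO_SET_IOMMU, VFIO_TYPE1_IOMMU);

	/* Triggers vfio_pci_open()/vfio_pci_enable() for the device */
	int device = ioctl(group, VFIO_GROUP_GET_DEVICE_FD, "0000:06:0d.0");
#endif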

static int vfio_pci_get_irq_count(struct vfio_pci_device *vdev, int irq_type)
{
	if (irq_type == VFIO_PCI_INTX_IRQ_INDEX) {
		u8 pin;
		pci_read_config_byte(vdev->pdev, PCI_INTERRUPT_PIN, &pin);
		if (pin)
			return 1;

	} else if (irq_type == VFIO_PCI_MSI_IRQ_INDEX) {
		u8 pos;
		u16 flags;

		pos = vdev->pdev->msi_cap;
		if (pos) {
			pci_read_config_word(vdev->pdev,
					     pos + PCI_MSI_FLAGS, &flags);
			return 1 << ((flags & PCI_MSI_FLAGS_QMASK) >> 1);
		}
	} else if (irq_type == VFIO_PCI_MSIX_IRQ_INDEX) {
		u8 pos;
		u16 flags;

		pos = vdev->pdev->msix_cap;
		if (pos) {
			pci_read_config_word(vdev->pdev,
					     pos + PCI_MSIX_FLAGS, &flags);

			return (flags & PCI_MSIX_FLAGS_QSIZE) + 1;
		}
	} else if (irq_type == VFIO_PCI_ERR_IRQ_INDEX)
		if (pci_is_pcie(vdev->pdev))
			return 1;

	return 0;
}
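
/*
 * Worked example for the MSI count above (illustrative): the Multiple
 * Message Capable field (PCI_MSI_FLAGS_QMASK, bits 3:1) holds log2 of the
 * vector count, so a raw field value of 3 decodes to 1 << 3 = 8
 * requestable MSI vectors.  MSI-X instead encodes the table size minus
 * one directly in PCI_MSIX_FLAGS_QSIZE.
 */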

static int vfio_pci_count_devs(struct pci_dev *pdev, void *data)
{
	(*(int *)data)++;
	return 0;
}

struct vfio_pci_fill_info {
	int max;
	int cur;
	struct vfio_pci_dependent_device *devices;
};

static int vfio_pci_fill_devs(struct pci_dev *pdev, void *data)
{
	struct vfio_pci_fill_info *fill = data;
	struct iommu_group *iommu_group;

	if (fill->cur == fill->max)
		return -EAGAIN; /* Something changed, try again */

	iommu_group = iommu_group_get(&pdev->dev);
	if (!iommu_group)
		return -EPERM; /* Cannot reset non-isolated devices */

	fill->devices[fill->cur].group_id = iommu_group_id(iommu_group);
	fill->devices[fill->cur].segment = pci_domain_nr(pdev->bus);
	fill->devices[fill->cur].bus = pdev->bus->number;
	fill->devices[fill->cur].devfn = pdev->devfn;
	fill->cur++;
	iommu_group_put(iommu_group);
	return 0;
}

struct vfio_pci_group_entry {
	struct vfio_group *group;
	int id;
};

struct vfio_pci_group_info {
	int count;
	struct vfio_pci_group_entry *groups;
};

static int vfio_pci_validate_devs(struct pci_dev *pdev, void *data)
{
	struct vfio_pci_group_info *info = data;
	struct iommu_group *group;
	int id, i;

	group = iommu_group_get(&pdev->dev);
	if (!group)
		return -EPERM;

	id = iommu_group_id(group);

	for (i = 0; i < info->count; i++)
		if (info->groups[i].id == id)
			break;

	iommu_group_put(group);

	return (i == info->count) ? -EINVAL : 0;
}

static bool vfio_pci_dev_below_slot(struct pci_dev *pdev, struct pci_slot *slot)
{
	for (; pdev; pdev = pdev->bus->self)
		if (pdev->bus == slot->bus)
			return (pdev->slot == slot);
	return false;
}

struct vfio_pci_walk_info {
	int (*fn)(struct pci_dev *, void *data);
	void *data;
	struct pci_dev *pdev;
	bool slot;
	int ret;
};

static int vfio_pci_walk_wrapper(struct pci_dev *pdev, void *data)
{
	struct vfio_pci_walk_info *walk = data;

	if (!walk->slot || vfio_pci_dev_below_slot(pdev, walk->pdev->slot))
		walk->ret = walk->fn(pdev, walk->data);

	return walk->ret;
}

static int vfio_pci_for_each_slot_or_bus(struct pci_dev *pdev,
					 int (*fn)(struct pci_dev *,
						   void *data), void *data,
					 bool slot)
{
	struct vfio_pci_walk_info walk = {
		.fn = fn, .data = data, .pdev = pdev, .slot = slot, .ret = 0,
	};

	pci_walk_bus(pdev->bus, vfio_pci_walk_wrapper, &walk);

	return walk.ret;
}

static long vfio_pci_ioctl(void *device_data,
			   unsigned int cmd, unsigned long arg)
{
	struct vfio_pci_device *vdev = device_data;
	unsigned long minsz;

	if (cmd == VFIO_DEVICE_GET_INFO) {
		struct vfio_device_info info;

		minsz = offsetofend(struct vfio_device_info, num_irqs);

		if (copy_from_user(&info, (void __user *)arg, minsz))
			return -EFAULT;

		if (info.argsz < minsz)
			return -EINVAL;

		info.flags = VFIO_DEVICE_FLAGS_PCI;

		if (vdev->reset_works)
			info.flags |= VFIO_DEVICE_FLAGS_RESET;

		info.num_regions = VFIO_PCI_NUM_REGIONS;
		info.num_irqs = VFIO_PCI_NUM_IRQS;

		/*
		 * copy_to_user() returns the number of bytes not copied,
		 * not an errno; don't leak that to userspace.
		 */
		return copy_to_user((void __user *)arg, &info, minsz) ?
			-EFAULT : 0;

	} else if (cmd == VFIO_DEVICE_GET_REGION_INFO) {
		struct pci_dev *pdev = vdev->pdev;
		struct vfio_region_info info;

		minsz = offsetofend(struct vfio_region_info, offset);

		if (copy_from_user(&info, (void __user *)arg, minsz))
			return -EFAULT;

		if (info.argsz < minsz)
			return -EINVAL;

		switch (info.index) {
		case VFIO_PCI_CONFIG_REGION_INDEX:
			info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index);
			info.size = pdev->cfg_size;
			info.flags = VFIO_REGION_INFO_FLAG_READ |
				     VFIO_REGION_INFO_FLAG_WRITE;
			break;
		case VFIO_PCI_BAR0_REGION_INDEX ... VFIO_PCI_BAR5_REGION_INDEX:
			info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index);
			info.size = pci_resource_len(pdev, info.index);
			if (!info.size) {
				info.flags = 0;
				break;
			}

			info.flags = VFIO_REGION_INFO_FLAG_READ |
				     VFIO_REGION_INFO_FLAG_WRITE;
			if (pci_resource_flags(pdev, info.index) &
			    IORESOURCE_MEM && info.size >= PAGE_SIZE)
				info.flags |= VFIO_REGION_INFO_FLAG_MMAP;
			break;
		case VFIO_PCI_ROM_REGION_INDEX:
		{
			void __iomem *io;
			size_t size;

			info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index);
			info.flags = 0;

			/* Report the BAR size, not the ROM size */
			info.size = pci_resource_len(pdev, info.index);
			if (!info.size)
				break;

			/* Is it really there? */
			io = pci_map_rom(pdev, &size);
			if (!io || !size) {
				info.size = 0;
				break;
			}
			pci_unmap_rom(pdev, io);

			info.flags = VFIO_REGION_INFO_FLAG_READ;
			break;
		}
		case VFIO_PCI_VGA_REGION_INDEX:
			if (!vdev->has_vga)
				return -EINVAL;

			info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index);
			info.size = 0xc0000;
			info.flags = VFIO_REGION_INFO_FLAG_READ |
				     VFIO_REGION_INFO_FLAG_WRITE;

			break;
		default:
			return -EINVAL;
		}

		return copy_to_user((void __user *)arg, &info, minsz) ?
			-EFAULT : 0;

	} else if (cmd == VFIO_DEVICE_GET_IRQ_INFO) {
		struct vfio_irq_info info;

		minsz = offsetofend(struct vfio_irq_info, count);

		if (copy_from_user(&info, (void __user *)arg, minsz))
			return -EFAULT;

		if (info.argsz < minsz || info.index >= VFIO_PCI_NUM_IRQS)
			return -EINVAL;

		switch (info.index) {
		case VFIO_PCI_INTX_IRQ_INDEX ... VFIO_PCI_MSIX_IRQ_INDEX:
			break;
		case VFIO_PCI_ERR_IRQ_INDEX:
			if (pci_is_pcie(vdev->pdev))
				break;
			/* pass thru to return error */
		default:
			return -EINVAL;
		}

		info.flags = VFIO_IRQ_INFO_EVENTFD;

		info.count = vfio_pci_get_irq_count(vdev, info.index);

		if (info.index == VFIO_PCI_INTX_IRQ_INDEX)
			info.flags |= (VFIO_IRQ_INFO_MASKABLE |
				       VFIO_IRQ_INFO_AUTOMASKED);
		else
			info.flags |= VFIO_IRQ_INFO_NORESIZE;

		return copy_to_user((void __user *)arg, &info, minsz) ?
			-EFAULT : 0;

	} else if (cmd == VFIO_DEVICE_SET_IRQS) {
		struct vfio_irq_set hdr;
		u8 *data = NULL;
		int ret = 0;

		minsz = offsetofend(struct vfio_irq_set, count);

		if (copy_from_user(&hdr, (void __user *)arg, minsz))
			return -EFAULT;

		if (hdr.argsz < minsz || hdr.index >= VFIO_PCI_NUM_IRQS ||
		    hdr.flags & ~(VFIO_IRQ_SET_DATA_TYPE_MASK |
				  VFIO_IRQ_SET_ACTION_TYPE_MASK))
			return -EINVAL;

		if (!(hdr.flags & VFIO_IRQ_SET_DATA_NONE)) {
			size_t size;
			int max = vfio_pci_get_irq_count(vdev, hdr.index);

			if (hdr.flags & VFIO_IRQ_SET_DATA_BOOL)
				size = sizeof(uint8_t);
			else if (hdr.flags & VFIO_IRQ_SET_DATA_EVENTFD)
				size = sizeof(int32_t);
			else
				return -EINVAL;

			if (hdr.argsz - minsz < hdr.count * size ||
			    hdr.start >= max || hdr.start + hdr.count > max)
				return -EINVAL;

			data = memdup_user((void __user *)(arg + minsz),
					   hdr.count * size);
			if (IS_ERR(data))
				return PTR_ERR(data);
		}

		mutex_lock(&vdev->igate);

		ret = vfio_pci_set_irqs_ioctl(vdev, hdr.flags, hdr.index,
					      hdr.start, hdr.count, data);

		mutex_unlock(&vdev->igate);
		kfree(data);

		return ret;

	} else if (cmd == VFIO_DEVICE_RESET) {
		return vdev->reset_works ?
			pci_try_reset_function(vdev->pdev) : -EINVAL;

	} else if (cmd == VFIO_DEVICE_GET_PCI_HOT_RESET_INFO) {
		struct vfio_pci_hot_reset_info hdr;
		struct vfio_pci_fill_info fill = { 0 };
		struct vfio_pci_dependent_device *devices = NULL;
		bool slot = false;
		int ret = 0;

		minsz = offsetofend(struct vfio_pci_hot_reset_info, count);

		if (copy_from_user(&hdr, (void __user *)arg, minsz))
			return -EFAULT;

		if (hdr.argsz < minsz)
			return -EINVAL;

		hdr.flags = 0;

		/* Can we do a slot or bus reset or neither? */
		if (!pci_probe_reset_slot(vdev->pdev->slot))
			slot = true;
		else if (pci_probe_reset_bus(vdev->pdev->bus))
			return -ENODEV;

		/* How many devices are affected? */
		ret = vfio_pci_for_each_slot_or_bus(vdev->pdev,
						    vfio_pci_count_devs,
						    &fill.max, slot);
		if (ret)
			return ret;

		WARN_ON(!fill.max); /* Should always be at least one */

		/*
		 * If there's enough space, fill it now, otherwise return
		 * -ENOSPC and the number of devices affected.
		 */
		if (hdr.argsz < sizeof(hdr) + (fill.max * sizeof(*devices))) {
			ret = -ENOSPC;
			hdr.count = fill.max;
			goto reset_info_exit;
		}

		devices = kcalloc(fill.max, sizeof(*devices), GFP_KERNEL);
		if (!devices)
			return -ENOMEM;

		fill.devices = devices;

		ret = vfio_pci_for_each_slot_or_bus(vdev->pdev,
						    vfio_pci_fill_devs,
						    &fill, slot);

		/*
		 * If a device was removed between counting and filling,
		 * we may come up short of fill.max.  If a device was
		 * added, we'll have a return of -EAGAIN above.
		 */
		if (!ret)
			hdr.count = fill.cur;

reset_info_exit:
		if (copy_to_user((void __user *)arg, &hdr, minsz))
			ret = -EFAULT;

		if (!ret) {
			if (copy_to_user((void __user *)(arg + minsz), devices,
					 hdr.count * sizeof(*devices)))
				ret = -EFAULT;
		}

		kfree(devices);
		return ret;

	} else if (cmd == VFIO_DEVICE_PCI_HOT_RESET) {
		struct vfio_pci_hot_reset hdr;
		int32_t *group_fds;
		struct vfio_pci_group_entry *groups;
		struct vfio_pci_group_info info;
		bool slot = false;
		int i, count = 0, ret = 0;

		minsz = offsetofend(struct vfio_pci_hot_reset, count);

		if (copy_from_user(&hdr, (void __user *)arg, minsz))
			return -EFAULT;

		if (hdr.argsz < minsz || hdr.flags)
			return -EINVAL;

		/* Can we do a slot or bus reset or neither? */
		if (!pci_probe_reset_slot(vdev->pdev->slot))
			slot = true;
		else if (pci_probe_reset_bus(vdev->pdev->bus))
			return -ENODEV;

		/*
		 * We can't let userspace give us an arbitrarily large
		 * buffer to copy, so verify how many we think there
		 * could be.  Note groups can have multiple devices so
		 * one group per device is the max.
		 */
		ret = vfio_pci_for_each_slot_or_bus(vdev->pdev,
						    vfio_pci_count_devs,
						    &count, slot);
		if (ret)
			return ret;

		/* Somewhere between 1 and count is OK */
		if (!hdr.count || hdr.count > count)
			return -EINVAL;

		group_fds = kcalloc(hdr.count, sizeof(*group_fds), GFP_KERNEL);
		groups = kcalloc(hdr.count, sizeof(*groups), GFP_KERNEL);
		if (!group_fds || !groups) {
			kfree(group_fds);
			kfree(groups);
			return -ENOMEM;
		}

		if (copy_from_user(group_fds, (void __user *)(arg + minsz),
				   hdr.count * sizeof(*group_fds))) {
			kfree(group_fds);
			kfree(groups);
			return -EFAULT;
		}

		/*
		 * For each group_fd, get the group through the vfio external
		 * user interface and store the group and iommu ID.  This
		 * ensures the group is held across the reset.
		 */
		for (i = 0; i < hdr.count; i++) {
			struct vfio_group *group;
			struct fd f = fdget(group_fds[i]);
			if (!f.file) {
				ret = -EBADF;
				break;
			}

			group = vfio_group_get_external_user(f.file);
			fdput(f);
			if (IS_ERR(group)) {
				ret = PTR_ERR(group);
				break;
			}

			groups[i].group = group;
			groups[i].id = vfio_external_user_iommu_id(group);
		}

		kfree(group_fds);

		/* release reference to groups on error */
		if (ret)
			goto hot_reset_release;

		info.count = hdr.count;
		info.groups = groups;

		/*
		 * Test whether all the affected devices are contained
		 * by the set of groups provided by the user.
		 */
		ret = vfio_pci_for_each_slot_or_bus(vdev->pdev,
						    vfio_pci_validate_devs,
						    &info, slot);
		if (!ret)
			/* User has access, do the reset */
			ret = slot ? pci_try_reset_slot(vdev->pdev->slot) :
				     pci_try_reset_bus(vdev->pdev->bus);

hot_reset_release:
		for (i--; i >= 0; i--)
			vfio_group_put_external_user(groups[i].group);

		kfree(groups);
		return ret;
	}

	return -ENOTTY;
}
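
/*
 * Example (illustrative userspace sketch, not part of the driver):
 * querying the device and one of its regions through the ioctls above.
 * The device fd is assumed to come from VFIO_GROUP_GET_DEVICE_FD as in
 * the earlier example.
 */
#if 0
	struct vfio_device_info device_info = { .argsz = sizeof(device_info) };
	struct vfio_region_info region_info = { .argsz = sizeof(region_info) };

	ioctl(device, VFIO_DEVICE_GET_INFO, &device_info);
	/* device_info.num_regions is VFIO_PCI_NUM_REGIONS for vfio-pci */

	region_info.index = VFIO_PCI_BAR0_REGION_INDEX;
	ioctl(device, VFIO_DEVICE_GET_REGION_INFO, &region_info);
	/* region_info.offset/.size/.flags now describe BAR0 access */
#endif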

static ssize_t vfio_pci_rw(void *device_data, char __user *buf,
			   size_t count, loff_t *ppos, bool iswrite)
{
	unsigned int index = VFIO_PCI_OFFSET_TO_INDEX(*ppos);
	struct vfio_pci_device *vdev = device_data;

	if (index >= VFIO_PCI_NUM_REGIONS)
		return -EINVAL;

	switch (index) {
	case VFIO_PCI_CONFIG_REGION_INDEX:
		return vfio_pci_config_rw(vdev, buf, count, ppos, iswrite);

	case VFIO_PCI_ROM_REGION_INDEX:
		if (iswrite)
			return -EINVAL;
		return vfio_pci_bar_rw(vdev, buf, count, ppos, false);

	case VFIO_PCI_BAR0_REGION_INDEX ... VFIO_PCI_BAR5_REGION_INDEX:
		return vfio_pci_bar_rw(vdev, buf, count, ppos, iswrite);

	case VFIO_PCI_VGA_REGION_INDEX:
		return vfio_pci_vga_rw(vdev, buf, count, ppos, iswrite);
	}

	return -EINVAL;
}

static ssize_t vfio_pci_read(void *device_data, char __user *buf,
			     size_t count, loff_t *ppos)
{
	if (!count)
		return 0;

	return vfio_pci_rw(device_data, buf, count, ppos, false);
}

static ssize_t vfio_pci_write(void *device_data, const char __user *buf,
			      size_t count, loff_t *ppos)
{
	if (!count)
		return 0;

	return vfio_pci_rw(device_data, (char __user *)buf, count, ppos, true);
}
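
/*
 * Example (illustrative userspace sketch): the file offset encodes the
 * region index, so config space can be read with pread() at the offset
 * reported by VFIO_DEVICE_GET_REGION_INFO for
 * VFIO_PCI_CONFIG_REGION_INDEX (region_info below is assumed to hold
 * that region's info).
 */
#if 0
	uint16_t vendor;

	pread(device, &vendor, sizeof(vendor),
	      region_info.offset + PCI_VENDOR_ID);
#endif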

static int vfio_pci_mmap(void *device_data, struct vm_area_struct *vma)
{
	struct vfio_pci_device *vdev = device_data;
	struct pci_dev *pdev = vdev->pdev;
	unsigned int index;
	u64 phys_len, req_len, pgoff, req_start;
	int ret;

	index = vma->vm_pgoff >> (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT);

	if (vma->vm_end < vma->vm_start)
		return -EINVAL;
	if ((vma->vm_flags & VM_SHARED) == 0)
		return -EINVAL;
	if (index >= VFIO_PCI_ROM_REGION_INDEX)
		return -EINVAL;
	if (!(pci_resource_flags(pdev, index) & IORESOURCE_MEM))
		return -EINVAL;

	phys_len = pci_resource_len(pdev, index);
	req_len = vma->vm_end - vma->vm_start;
	pgoff = vma->vm_pgoff &
		((1U << (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT)) - 1);
	req_start = pgoff << PAGE_SHIFT;

	if (phys_len < PAGE_SIZE || req_start + req_len > phys_len)
		return -EINVAL;

	if (index == vdev->msix_bar) {
		/*
		 * Disallow mmaps overlapping the MSI-X table; users don't
		 * get to touch this directly.  We could find somewhere
		 * else to map the overlap, but page granularity is only
		 * a recommendation, not a requirement, so the user needs
		 * to know which bits are real.  Requiring them to mmap
		 * around the table makes that clear.
		 */

		/* If neither entirely above nor below, then it overlaps */
		if (!(req_start >= vdev->msix_offset + vdev->msix_size ||
		      req_start + req_len <= vdev->msix_offset))
			return -EINVAL;
	}

	/*
	 * Even though we don't make use of the barmap for the mmap,
	 * we need to request the region and the barmap tracks that.
	 */
	if (!vdev->barmap[index]) {
		ret = pci_request_selected_regions(pdev,
						   1 << index, "vfio-pci");
		if (ret)
			return ret;

		vdev->barmap[index] = pci_iomap(pdev, index, 0);
	}

	vma->vm_private_data = vdev;
	vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
	vma->vm_pgoff = (pci_resource_start(pdev, index) >> PAGE_SHIFT) + pgoff;

	return remap_pfn_range(vma, vma->vm_start, vma->vm_pgoff,
			       req_len, vma->vm_page_prot);
}
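
/*
 * Example (illustrative userspace sketch): mmap'ing BAR0 using the offset
 * and size reported by VFIO_DEVICE_GET_REGION_INFO.  Only regions flagged
 * VFIO_REGION_INFO_FLAG_MMAP (memory BARs of at least PAGE_SIZE, outside
 * the MSI-X table) will succeed.
 */
#if 0
	void *bar0 = mmap(NULL, region_info.size, PROT_READ | PROT_WRITE,
			  MAP_SHARED, device, region_info.offset);
#endif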

static const struct vfio_device_ops vfio_pci_ops = {
	.name		= "vfio-pci",
	.open		= vfio_pci_open,
	.release	= vfio_pci_release,
	.ioctl		= vfio_pci_ioctl,
	.read		= vfio_pci_read,
	.write		= vfio_pci_write,
	.mmap		= vfio_pci_mmap,
};

static int vfio_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id)
{
	u8 type;
	struct vfio_pci_device *vdev;
	struct iommu_group *group;
	int ret;

	pci_read_config_byte(pdev, PCI_HEADER_TYPE, &type);
	if ((type & PCI_HEADER_TYPE) != PCI_HEADER_TYPE_NORMAL)
		return -EINVAL;

	group = iommu_group_get(&pdev->dev);
	if (!group)
		return -EINVAL;

	vdev = kzalloc(sizeof(*vdev), GFP_KERNEL);
	if (!vdev) {
		iommu_group_put(group);
		return -ENOMEM;
	}

	vdev->pdev = pdev;
	vdev->irq_type = VFIO_PCI_NUM_IRQS;
	mutex_init(&vdev->igate);
	spin_lock_init(&vdev->irqlock);
	atomic_set(&vdev->refcnt, 0);

	ret = vfio_add_group_dev(&pdev->dev, &vfio_pci_ops, vdev);
	if (ret) {
		iommu_group_put(group);
		kfree(vdev);
	}

	return ret;
}
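
/*
 * Note: vfio-pci matches no devices by default (.id_table is NULL below),
 * so probe only runs for devices bound dynamically, e.g. (illustrative
 * device/IDs):
 *
 *	echo 0000:06:0d.0 > /sys/bus/pci/devices/0000:06:0d.0/driver/unbind
 *	echo 8086 10ca > /sys/bus/pci/drivers/vfio-pci/new_id
 */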

static void vfio_pci_remove(struct pci_dev *pdev)
{
	struct vfio_pci_device *vdev;

	vdev = vfio_del_group_dev(&pdev->dev);
	if (!vdev)
		return;

	iommu_group_put(pdev->dev.iommu_group);
	kfree(vdev);
}

static pci_ers_result_t vfio_pci_aer_err_detected(struct pci_dev *pdev,
						  pci_channel_state_t state)
{
	struct vfio_pci_device *vdev;
	struct vfio_device *device;

	device = vfio_device_get_from_dev(&pdev->dev);
	if (device == NULL)
		return PCI_ERS_RESULT_DISCONNECT;

	vdev = vfio_device_data(device);
	if (vdev == NULL) {
		vfio_device_put(device);
		return PCI_ERS_RESULT_DISCONNECT;
	}

	mutex_lock(&vdev->igate);

	if (vdev->err_trigger)
		eventfd_signal(vdev->err_trigger, 1);

	mutex_unlock(&vdev->igate);

	vfio_device_put(device);

	return PCI_ERS_RESULT_CAN_RECOVER;
}

static struct pci_error_handlers vfio_err_handlers = {
	.error_detected = vfio_pci_aer_err_detected,
};

static struct pci_driver vfio_pci_driver = {
	.name		= "vfio-pci",
	.id_table	= NULL, /* only dynamic ids */
	.probe		= vfio_pci_probe,
	.remove		= vfio_pci_remove,
	.err_handler	= &vfio_err_handlers,
};

static void __exit vfio_pci_cleanup(void)
{
	pci_unregister_driver(&vfio_pci_driver);
	vfio_pci_virqfd_exit();
	vfio_pci_uninit_perm_bits();
}

static int __init vfio_pci_init(void)
{
	int ret;

	/* Allocate shared config space permission data used by all devices */
	ret = vfio_pci_init_perm_bits();
	if (ret)
		return ret;

	/* Start the virqfd cleanup handler */
	ret = vfio_pci_virqfd_init();
	if (ret)
		goto out_virqfd;

	/* Register and scan for devices */
	ret = pci_register_driver(&vfio_pci_driver);
	if (ret)
		goto out_driver;

	return 0;

out_driver:
	vfio_pci_virqfd_exit();
out_virqfd:
	vfio_pci_uninit_perm_bits();
	return ret;
}

module_init(vfio_pci_init);
module_exit(vfio_pci_cleanup);

MODULE_VERSION(DRIVER_VERSION);
MODULE_LICENSE("GPL v2");
MODULE_AUTHOR(DRIVER_AUTHOR);
MODULE_DESCRIPTION(DRIVER_DESC);