vfio/mlx5: Implement vfio_pci driver for mlx5 devices
[linux-2.6-block.git] / drivers / vfio / pci / vfio_pci_core.c
CommitLineData
d2912cb1 1// SPDX-License-Identifier: GPL-2.0-only
89e1f7d4
AW
2/*
3 * Copyright (C) 2012 Red Hat, Inc. All rights reserved.
4 * Author: Alex Williamson <alex.williamson@redhat.com>
5 *
89e1f7d4
AW
6 * Derived from original vfio:
7 * Copyright 2010 Cisco Systems, Inc. All rights reserved.
8 * Author: Tom Lyon, pugs@cisco.com
9 */
10
7fa005ca
MG
11#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
12
89e1f7d4
AW
13#include <linux/device.h>
14#include <linux/eventfd.h>
8b27ee60 15#include <linux/file.h>
89e1f7d4
AW
16#include <linux/interrupt.h>
17#include <linux/iommu.h>
18#include <linux/module.h>
19#include <linux/mutex.h>
20#include <linux/notifier.h>
21#include <linux/pci.h>
22#include <linux/pm_runtime.h>
23#include <linux/slab.h>
24#include <linux/types.h>
25#include <linux/uaccess.h>
ecaa1f6a 26#include <linux/vgaarb.h>
0e714d27 27#include <linux/nospec.h>
abafbc55 28#include <linux/sched/mm.h>
89e1f7d4 29
7fa005ca
MG
30#include <linux/vfio_pci_core.h>
31
32#define DRIVER_AUTHOR "Alex Williamson <alex.williamson@redhat.com>"
33#define DRIVER_DESC "core driver for VFIO based PCI devices"
89e1f7d4 34
89e1f7d4 35static bool nointxmask;
88c0dead 36static bool disable_vga;
6eb70187 37static bool disable_idle_d3;
6eb70187 38
88c0dead
AW
39static inline bool vfio_vga_disabled(void)
40{
41#ifdef CONFIG_VFIO_PCI_VGA
42 return disable_vga;
43#else
44 return true;
45#endif
46}
47
ecaa1f6a
AW
48/*
49 * Our VGA arbiter participation is limited since we don't know anything
50 * about the device itself. However, if the device is the only VGA device
51 * downstream of a bridge and VFIO VGA support is disabled, then we can
52 * safely return legacy VGA IO and memory as not decoded since the user
53 * has no way to get to it and routing can be disabled externally at the
54 * bridge.
55 */
89b6b8cd 56static unsigned int vfio_pci_set_decode(struct pci_dev *pdev, bool single_vga)
ecaa1f6a 57{
89b6b8cd 58 struct pci_dev *tmp = NULL;
ecaa1f6a
AW
59 unsigned char max_busnr;
60 unsigned int decodes;
61
62 if (single_vga || !vfio_vga_disabled() || pci_is_root_bus(pdev->bus))
63 return VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM |
64 VGA_RSRC_LEGACY_IO | VGA_RSRC_LEGACY_MEM;
65
66 max_busnr = pci_bus_max_busnr(pdev->bus);
67 decodes = VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM;
68
69 while ((tmp = pci_get_class(PCI_CLASS_DISPLAY_VGA << 8, tmp)) != NULL) {
70 if (tmp == pdev ||
71 pci_domain_nr(tmp->bus) != pci_domain_nr(pdev->bus) ||
72 pci_is_root_bus(tmp->bus))
73 continue;
74
75 if (tmp->bus->number >= pdev->bus->number &&
76 tmp->bus->number <= max_busnr) {
77 pci_dev_put(tmp);
78 decodes |= VGA_RSRC_LEGACY_IO | VGA_RSRC_LEGACY_MEM;
79 break;
80 }
81 }
82
83 return decodes;
84}
85
53647510 86static void vfio_pci_probe_mmaps(struct vfio_pci_core_device *vdev)
05f0c03f
YX
87{
88 struct resource *res;
c9c13ba4 89 int i;
05f0c03f
YX
90 struct vfio_pci_dummy_resource *dummy_res;
91
c9c13ba4
DE
92 for (i = 0; i < PCI_STD_NUM_BARS; i++) {
93 int bar = i + PCI_STD_RESOURCES;
94
95 res = &vdev->pdev->resource[bar];
05f0c03f
YX
96
97 if (!IS_ENABLED(CONFIG_VFIO_PCI_MMAP))
98 goto no_mmap;
99
100 if (!(res->flags & IORESOURCE_MEM))
101 goto no_mmap;
102
103 /*
104 * The PCI core shouldn't set up a resource with a
105 * type but zero size. But there may be bugs that
106 * cause us to do that.
107 */
108 if (!resource_size(res))
109 goto no_mmap;
110
111 if (resource_size(res) >= PAGE_SIZE) {
112 vdev->bar_mmap_supported[bar] = true;
113 continue;
114 }
115
116 if (!(res->start & ~PAGE_MASK)) {
117 /*
118 * Add a dummy resource to reserve the remainder
119 * of the exclusive page in case that hot-add
120 * device's bar is assigned into it.
121 */
122 dummy_res = kzalloc(sizeof(*dummy_res), GFP_KERNEL);
123 if (dummy_res == NULL)
124 goto no_mmap;
125
126 dummy_res->resource.name = "vfio sub-page reserved";
127 dummy_res->resource.start = res->end + 1;
128 dummy_res->resource.end = res->start + PAGE_SIZE - 1;
129 dummy_res->resource.flags = res->flags;
130 if (request_resource(res->parent,
131 &dummy_res->resource)) {
132 kfree(dummy_res);
133 goto no_mmap;
134 }
135 dummy_res->index = bar;
136 list_add(&dummy_res->res_next,
137 &vdev->dummy_resources_list);
138 vdev->bar_mmap_supported[bar] = true;
139 continue;
140 }
141 /*
142 * Here we don't handle the case when the BAR is not page
143 * aligned because we can't expect the BAR will be
144 * assigned into the same location in a page in guest
145 * when we passthrough the BAR. And it's hard to access
146 * this BAR in userspace because we have no way to get
147 * the BAR's location in a page.
148 */
149no_mmap:
150 vdev->bar_mmap_supported[bar] = false;
151 }
152}
153
db44c174 154struct vfio_pci_group_info;
a882c16a 155static bool vfio_pci_dev_set_try_reset(struct vfio_device_set *dev_set);
db44c174
JG
156static int vfio_pci_dev_set_hot_reset(struct vfio_device_set *dev_set,
157 struct vfio_pci_group_info *groups);
bc4fba77 158
45074405
AW
159/*
160 * INTx masking requires the ability to disable INTx signaling via PCI_COMMAND
161 * _and_ the ability detect when the device is asserting INTx via PCI_STATUS.
162 * If a device implements the former but not the latter we would typically
163 * expect broken_intx_masking be set and require an exclusive interrupt.
164 * However since we do have control of the device's ability to assert INTx,
165 * we can instead pretend that the device does not implement INTx, virtualizing
166 * the pin register to report zero and maintaining DisINTx set on the host.
167 */
168static bool vfio_pci_nointx(struct pci_dev *pdev)
169{
170 switch (pdev->vendor) {
171 case PCI_VENDOR_ID_INTEL:
172 switch (pdev->device) {
7d57e5e9 173 /* All i40e (XL710/X710/XXV710) 10/20/25/40GbE NICs */
45074405
AW
174 case 0x1572:
175 case 0x1574:
176 case 0x1580 ... 0x1581:
7d57e5e9 177 case 0x1583 ... 0x158b:
45074405 178 case 0x37d0 ... 0x37d2:
bf3551e1
AW
179 /* X550 */
180 case 0x1563:
45074405
AW
181 return true;
182 default:
183 return false;
184 }
185 }
186
187 return false;
188}
189
53647510 190static void vfio_pci_probe_power_state(struct vfio_pci_core_device *vdev)
51ef3a00
AW
191{
192 struct pci_dev *pdev = vdev->pdev;
193 u16 pmcsr;
194
195 if (!pdev->pm_cap)
196 return;
197
198 pci_read_config_word(pdev, pdev->pm_cap + PCI_PM_CTRL, &pmcsr);
199
200 vdev->needs_pm_restore = !(pmcsr & PCI_PM_CTRL_NO_SOFT_RESET);
201}
202
203/*
204 * pci_set_power_state() wrapper handling devices which perform a soft reset on
205 * D3->D0 transition. Save state prior to D0/1/2->D3, stash it on the vdev,
206 * restore when returned to D0. Saved separately from pci_saved_state for use
207 * by PM capability emulation and separately from pci_dev internal saved state
208 * to avoid it being overwritten and consumed around other resets.
209 */
53647510 210int vfio_pci_set_power_state(struct vfio_pci_core_device *vdev, pci_power_t state)
51ef3a00
AW
211{
212 struct pci_dev *pdev = vdev->pdev;
213 bool needs_restore = false, needs_save = false;
214 int ret;
215
216 if (vdev->needs_pm_restore) {
217 if (pdev->current_state < PCI_D3hot && state >= PCI_D3hot) {
218 pci_save_state(pdev);
219 needs_save = true;
220 }
221
222 if (pdev->current_state >= PCI_D3hot && state <= PCI_D0)
223 needs_restore = true;
224 }
225
226 ret = pci_set_power_state(pdev, state);
227
228 if (!ret) {
229 /* D3 might be unsupported via quirk, skip unless in D3 */
230 if (needs_save && pdev->current_state >= PCI_D3hot) {
231 vdev->pm_save = pci_store_saved_state(pdev);
232 } else if (needs_restore) {
233 pci_load_and_free_saved_state(pdev, &vdev->pm_save);
234 pci_restore_state(pdev);
235 }
236 }
237
238 return ret;
239}
240
2fb89f56 241int vfio_pci_core_enable(struct vfio_pci_core_device *vdev)
89e1f7d4
AW
242{
243 struct pci_dev *pdev = vdev->pdev;
244 int ret;
245 u16 cmd;
246 u8 msix_pos;
247
51ef3a00 248 vfio_pci_set_power_state(vdev, PCI_D0);
6eb70187 249
9c22e660
AW
250 /* Don't allow our initial saved state to include busmaster */
251 pci_clear_master(pdev);
252
9a92c509
AW
253 ret = pci_enable_device(pdev);
254 if (ret)
255 return ret;
256
9f478035
AW
257 /* If reset fails because of the device lock, fail this path entirely */
258 ret = pci_try_reset_function(pdev);
259 if (ret == -EAGAIN) {
260 pci_disable_device(pdev);
261 return ret;
262 }
263
264 vdev->reset_works = !ret;
89e1f7d4
AW
265 pci_save_state(pdev);
266 vdev->pci_saved_state = pci_store_saved_state(pdev);
267 if (!vdev->pci_saved_state)
a88a7b3e 268 pci_dbg(pdev, "%s: Couldn't store saved state\n", __func__);
89e1f7d4 269
45074405
AW
270 if (likely(!nointxmask)) {
271 if (vfio_pci_nointx(pdev)) {
a88a7b3e 272 pci_info(pdev, "Masking broken INTx support\n");
45074405
AW
273 vdev->nointx = true;
274 pci_intx(pdev, 0);
275 } else
276 vdev->pci_2_3 = pci_intx_mask_supported(pdev);
9a92c509 277 }
89e1f7d4 278
89e1f7d4
AW
279 pci_read_config_word(pdev, PCI_COMMAND, &cmd);
280 if (vdev->pci_2_3 && (cmd & PCI_COMMAND_INTX_DISABLE)) {
281 cmd &= ~PCI_COMMAND_INTX_DISABLE;
282 pci_write_config_word(pdev, PCI_COMMAND, cmd);
283 }
284
45074405
AW
285 ret = vfio_config_init(vdev);
286 if (ret) {
287 kfree(vdev->pci_saved_state);
288 vdev->pci_saved_state = NULL;
289 pci_disable_device(pdev);
290 return ret;
291 }
292
a9047f24 293 msix_pos = pdev->msix_cap;
89e1f7d4
AW
294 if (msix_pos) {
295 u16 flags;
296 u32 table;
297
298 pci_read_config_word(pdev, msix_pos + PCI_MSIX_FLAGS, &flags);
299 pci_read_config_dword(pdev, msix_pos + PCI_MSIX_TABLE, &table);
300
508d1aa6
BH
301 vdev->msix_bar = table & PCI_MSIX_TABLE_BIR;
302 vdev->msix_offset = table & PCI_MSIX_TABLE_OFFSET;
89e1f7d4
AW
303 vdev->msix_size = ((flags & PCI_MSIX_FLAGS_QSIZE) + 1) * 16;
304 } else
305 vdev->msix_bar = 0xFF;
306
ecaa1f6a 307 if (!vfio_vga_disabled() && vfio_pci_is_vga(pdev))
84237a82 308 vdev->has_vga = true;
84237a82 309
05f0c03f 310
9a92c509 311 return 0;
89e1f7d4 312}
7fa005ca 313EXPORT_SYMBOL_GPL(vfio_pci_core_enable);
89e1f7d4 314
2fb89f56 315void vfio_pci_core_disable(struct vfio_pci_core_device *vdev)
89e1f7d4 316{
2007722a 317 struct pci_dev *pdev = vdev->pdev;
05f0c03f 318 struct vfio_pci_dummy_resource *dummy_res, *tmp;
30656177 319 struct vfio_pci_ioeventfd *ioeventfd, *ioeventfd_tmp;
28541d41 320 int i, bar;
89e1f7d4 321
a882c16a
JG
322 /* For needs_reset */
323 lockdep_assert_held(&vdev->vdev.dev_set->lock);
324
9c22e660
AW
325 /* Stop the device from further DMA */
326 pci_clear_master(pdev);
89e1f7d4
AW
327
328 vfio_pci_set_irqs_ioctl(vdev, VFIO_IRQ_SET_DATA_NONE |
329 VFIO_IRQ_SET_ACTION_TRIGGER,
330 vdev->irq_type, 0, 0, NULL);
331
30656177
AW
332 /* Device closed, don't need mutex here */
333 list_for_each_entry_safe(ioeventfd, ioeventfd_tmp,
334 &vdev->ioeventfds_list, next) {
335 vfio_virqfd_disable(&ioeventfd->virqfd);
336 list_del(&ioeventfd->next);
337 kfree(ioeventfd);
338 }
339 vdev->ioeventfds_nr = 0;
340
89e1f7d4
AW
341 vdev->virq_disabled = false;
342
28541d41
AW
343 for (i = 0; i < vdev->num_regions; i++)
344 vdev->region[i].ops->release(vdev, &vdev->region[i]);
345
346 vdev->num_regions = 0;
347 kfree(vdev->region);
348 vdev->region = NULL; /* don't krealloc a freed pointer */
349
89e1f7d4
AW
350 vfio_config_free(vdev);
351
c9c13ba4
DE
352 for (i = 0; i < PCI_STD_NUM_BARS; i++) {
353 bar = i + PCI_STD_RESOURCES;
89e1f7d4
AW
354 if (!vdev->barmap[bar])
355 continue;
2007722a
AW
356 pci_iounmap(pdev, vdev->barmap[bar]);
357 pci_release_selected_regions(pdev, 1 << bar);
89e1f7d4
AW
358 vdev->barmap[bar] = NULL;
359 }
2007722a 360
05f0c03f
YX
361 list_for_each_entry_safe(dummy_res, tmp,
362 &vdev->dummy_resources_list, res_next) {
363 list_del(&dummy_res->res_next);
364 release_resource(&dummy_res->resource);
365 kfree(dummy_res);
366 }
367
bc4fba77
AW
368 vdev->needs_reset = true;
369
2007722a
AW
370 /*
371 * If we have saved state, restore it. If we can reset the device,
372 * even better. Resetting with current state seems better than
373 * nothing, but saving and restoring current state without reset
374 * is just busy work.
375 */
376 if (pci_load_and_free_saved_state(pdev, &vdev->pci_saved_state)) {
a88a7b3e 377 pci_info(pdev, "%s: Couldn't reload saved state\n", __func__);
2007722a
AW
378
379 if (!vdev->reset_works)
9c22e660 380 goto out;
2007722a
AW
381
382 pci_save_state(pdev);
383 }
384
385 /*
386 * Disable INTx and MSI, presumably to avoid spurious interrupts
387 * during reset. Stolen from pci_reset_function()
388 */
389 pci_write_config_word(pdev, PCI_COMMAND, PCI_COMMAND_INTX_DISABLE);
390
d24cdbfd 391 /*
92c80268 392 * Try to get the locks ourselves to prevent a deadlock. The
393 * success of this is dependent on being able to lock the device,
394 * which is not always possible.
395 * We can not use the "try" reset interface here, which will
396 * overwrite the previously restored configuration information.
d24cdbfd 397 */
742b4c0d
LC
398 if (vdev->reset_works && pci_dev_trylock(pdev)) {
399 if (!__pci_reset_function_locked(pdev))
400 vdev->needs_reset = false;
401 pci_dev_unlock(pdev);
92c80268 402 }
2007722a
AW
403
404 pci_restore_state(pdev);
9c22e660
AW
405out:
406 pci_disable_device(pdev);
bc4fba77 407
a882c16a 408 if (!vfio_pci_dev_set_try_reset(vdev->vdev.dev_set) && !disable_idle_d3)
51ef3a00 409 vfio_pci_set_power_state(vdev, PCI_D3hot);
89e1f7d4 410}
7fa005ca 411EXPORT_SYMBOL_GPL(vfio_pci_core_disable);
89e1f7d4 412
53647510 413static struct vfio_pci_core_device *get_pf_vdev(struct vfio_pci_core_device *vdev)
cc20d799
AW
414{
415 struct pci_dev *physfn = pci_physfn(vdev->pdev);
07d47b42 416 struct vfio_device *pf_dev;
cc20d799
AW
417
418 if (!vdev->pdev->is_virtfn)
419 return NULL;
420
07d47b42
JG
421 pf_dev = vfio_device_get_from_dev(&physfn->dev);
422 if (!pf_dev)
cc20d799
AW
423 return NULL;
424
ff53edf6 425 if (pci_dev_driver(physfn) != pci_dev_driver(vdev->pdev)) {
07d47b42 426 vfio_device_put(pf_dev);
cc20d799
AW
427 return NULL;
428 }
429
53647510 430 return container_of(pf_dev, struct vfio_pci_core_device, vdev);
cc20d799
AW
431}
432
53647510 433static void vfio_pci_vf_token_user_add(struct vfio_pci_core_device *vdev, int val)
cc20d799 434{
53647510 435 struct vfio_pci_core_device *pf_vdev = get_pf_vdev(vdev);
cc20d799
AW
436
437 if (!pf_vdev)
438 return;
439
440 mutex_lock(&pf_vdev->vf_token->lock);
441 pf_vdev->vf_token->users += val;
442 WARN_ON(pf_vdev->vf_token->users < 0);
443 mutex_unlock(&pf_vdev->vf_token->lock);
444
07d47b42 445 vfio_device_put(&pf_vdev->vdev);
cc20d799
AW
446}
447
ff53edf6 448void vfio_pci_core_close_device(struct vfio_device *core_vdev)
89e1f7d4 449{
53647510
MG
450 struct vfio_pci_core_device *vdev =
451 container_of(core_vdev, struct vfio_pci_core_device, vdev);
89e1f7d4 452
2cd8b14a
YH
453 vfio_pci_vf_token_user_add(vdev, -1);
454 vfio_spapr_pci_eeh_release(vdev->pdev);
2fb89f56 455 vfio_pci_core_disable(vdev);
924b51ab 456
2cd8b14a
YH
457 mutex_lock(&vdev->igate);
458 if (vdev->err_trigger) {
459 eventfd_ctx_put(vdev->err_trigger);
460 vdev->err_trigger = NULL;
1b69be5e 461 }
2cd8b14a
YH
462 if (vdev->req_trigger) {
463 eventfd_ctx_put(vdev->req_trigger);
464 vdev->req_trigger = NULL;
465 }
466 mutex_unlock(&vdev->igate);
89e1f7d4 467}
7fa005ca 468EXPORT_SYMBOL_GPL(vfio_pci_core_close_device);
89e1f7d4 469
2fb89f56 470void vfio_pci_core_finish_enable(struct vfio_pci_core_device *vdev)
89e1f7d4 471{
2fb89f56 472 vfio_pci_probe_mmaps(vdev);
2cd8b14a
YH
473 vfio_spapr_pci_eeh_open(vdev->pdev);
474 vfio_pci_vf_token_user_add(vdev, 1);
89e1f7d4 475}
7fa005ca 476EXPORT_SYMBOL_GPL(vfio_pci_core_finish_enable);
89e1f7d4 477
53647510 478static int vfio_pci_get_irq_count(struct vfio_pci_core_device *vdev, int irq_type)
89e1f7d4
AW
479{
480 if (irq_type == VFIO_PCI_INTX_IRQ_INDEX) {
481 u8 pin;
db04264f
AW
482
483 if (!IS_ENABLED(CONFIG_VFIO_PCI_INTX) ||
484 vdev->nointx || vdev->pdev->is_virtfn)
485 return 0;
486
89e1f7d4 487 pci_read_config_byte(vdev->pdev, PCI_INTERRUPT_PIN, &pin);
89e1f7d4 488
db04264f 489 return pin ? 1 : 0;
89e1f7d4
AW
490 } else if (irq_type == VFIO_PCI_MSI_IRQ_INDEX) {
491 u8 pos;
492 u16 flags;
493
a9047f24 494 pos = vdev->pdev->msi_cap;
89e1f7d4
AW
495 if (pos) {
496 pci_read_config_word(vdev->pdev,
497 pos + PCI_MSI_FLAGS, &flags);
fd49c81f 498 return 1 << ((flags & PCI_MSI_FLAGS_QMASK) >> 1);
89e1f7d4
AW
499 }
500 } else if (irq_type == VFIO_PCI_MSIX_IRQ_INDEX) {
501 u8 pos;
502 u16 flags;
503
a9047f24 504 pos = vdev->pdev->msix_cap;
89e1f7d4
AW
505 if (pos) {
506 pci_read_config_word(vdev->pdev,
507 pos + PCI_MSIX_FLAGS, &flags);
508
509 return (flags & PCI_MSIX_FLAGS_QSIZE) + 1;
510 }
6140a8f5 511 } else if (irq_type == VFIO_PCI_ERR_IRQ_INDEX) {
dad9f897
VMP
512 if (pci_is_pcie(vdev->pdev))
513 return 1;
6140a8f5
AW
514 } else if (irq_type == VFIO_PCI_REQ_IRQ_INDEX) {
515 return 1;
516 }
89e1f7d4
AW
517
518 return 0;
519}
520
8b27ee60
AW
521static int vfio_pci_count_devs(struct pci_dev *pdev, void *data)
522{
523 (*(int *)data)++;
524 return 0;
525}
526
527struct vfio_pci_fill_info {
528 int max;
529 int cur;
530 struct vfio_pci_dependent_device *devices;
531};
532
533static int vfio_pci_fill_devs(struct pci_dev *pdev, void *data)
534{
535 struct vfio_pci_fill_info *fill = data;
536 struct iommu_group *iommu_group;
537
538 if (fill->cur == fill->max)
539 return -EAGAIN; /* Something changed, try again */
540
541 iommu_group = iommu_group_get(&pdev->dev);
542 if (!iommu_group)
543 return -EPERM; /* Cannot reset non-isolated devices */
544
545 fill->devices[fill->cur].group_id = iommu_group_id(iommu_group);
546 fill->devices[fill->cur].segment = pci_domain_nr(pdev->bus);
547 fill->devices[fill->cur].bus = pdev->bus->number;
548 fill->devices[fill->cur].devfn = pdev->devfn;
549 fill->cur++;
550 iommu_group_put(iommu_group);
551 return 0;
552}
553
8b27ee60
AW
554struct vfio_pci_group_info {
555 int count;
db44c174 556 struct vfio_group **groups;
8b27ee60
AW
557};
558
8b27ee60
AW
559static bool vfio_pci_dev_below_slot(struct pci_dev *pdev, struct pci_slot *slot)
560{
561 for (; pdev; pdev = pdev->bus->self)
562 if (pdev->bus == slot->bus)
563 return (pdev->slot == slot);
564 return false;
565}
566
567struct vfio_pci_walk_info {
8bd8d1df 568 int (*fn)(struct pci_dev *pdev, void *data);
8b27ee60
AW
569 void *data;
570 struct pci_dev *pdev;
571 bool slot;
572 int ret;
573};
574
575static int vfio_pci_walk_wrapper(struct pci_dev *pdev, void *data)
576{
577 struct vfio_pci_walk_info *walk = data;
578
579 if (!walk->slot || vfio_pci_dev_below_slot(pdev, walk->pdev->slot))
580 walk->ret = walk->fn(pdev, walk->data);
581
582 return walk->ret;
583}
584
585static int vfio_pci_for_each_slot_or_bus(struct pci_dev *pdev,
586 int (*fn)(struct pci_dev *,
587 void *data), void *data,
588 bool slot)
589{
590 struct vfio_pci_walk_info walk = {
591 .fn = fn, .data = data, .pdev = pdev, .slot = slot, .ret = 0,
592 };
593
594 pci_walk_bus(pdev->bus, vfio_pci_walk_wrapper, &walk);
595
596 return walk.ret;
597}
598
53647510 599static int msix_mmappable_cap(struct vfio_pci_core_device *vdev,
a32295c6 600 struct vfio_info_cap *caps)
188ad9d6 601{
a32295c6
AK
602 struct vfio_info_cap_header header = {
603 .id = VFIO_REGION_INFO_CAP_MSIX_MAPPABLE,
604 .version = 1
605 };
28541d41 606
a32295c6 607 return vfio_info_add_capability(caps, &header, sizeof(header));
28541d41
AW
608}
609
53647510 610int vfio_pci_register_dev_region(struct vfio_pci_core_device *vdev,
28541d41
AW
611 unsigned int type, unsigned int subtype,
612 const struct vfio_pci_regops *ops,
613 size_t size, u32 flags, void *data)
614{
615 struct vfio_pci_region *region;
616
617 region = krealloc(vdev->region,
618 (vdev->num_regions + 1) * sizeof(*region),
619 GFP_KERNEL);
620 if (!region)
621 return -ENOMEM;
622
623 vdev->region = region;
624 vdev->region[vdev->num_regions].type = type;
625 vdev->region[vdev->num_regions].subtype = subtype;
626 vdev->region[vdev->num_regions].ops = ops;
627 vdev->region[vdev->num_regions].size = size;
628 vdev->region[vdev->num_regions].flags = flags;
629 vdev->region[vdev->num_regions].data = data;
630
631 vdev->num_regions++;
632
633 return 0;
634}
7fa005ca 635EXPORT_SYMBOL_GPL(vfio_pci_register_dev_region);
28541d41 636
ff53edf6
MG
637long vfio_pci_core_ioctl(struct vfio_device *core_vdev, unsigned int cmd,
638 unsigned long arg)
89e1f7d4 639{
53647510
MG
640 struct vfio_pci_core_device *vdev =
641 container_of(core_vdev, struct vfio_pci_core_device, vdev);
89e1f7d4
AW
642 unsigned long minsz;
643
644 if (cmd == VFIO_DEVICE_GET_INFO) {
645 struct vfio_device_info info;
e6b817d4
MR
646 struct vfio_info_cap caps = { .buf = NULL, .size = 0 };
647 unsigned long capsz;
b9abef43 648 int ret;
89e1f7d4
AW
649
650 minsz = offsetofend(struct vfio_device_info, num_irqs);
651
e6b817d4
MR
652 /* For backward compatibility, cannot require this */
653 capsz = offsetofend(struct vfio_iommu_type1_info, cap_offset);
654
89e1f7d4
AW
655 if (copy_from_user(&info, (void __user *)arg, minsz))
656 return -EFAULT;
657
658 if (info.argsz < minsz)
659 return -EINVAL;
660
e6b817d4
MR
661 if (info.argsz >= capsz) {
662 minsz = capsz;
663 info.cap_offset = 0;
664 }
665
89e1f7d4
AW
666 info.flags = VFIO_DEVICE_FLAGS_PCI;
667
668 if (vdev->reset_works)
669 info.flags |= VFIO_DEVICE_FLAGS_RESET;
670
28541d41 671 info.num_regions = VFIO_PCI_NUM_REGIONS + vdev->num_regions;
89e1f7d4
AW
672 info.num_irqs = VFIO_PCI_NUM_IRQS;
673
b9abef43
MG
674 ret = vfio_pci_info_zdev_add_caps(vdev, &caps);
675 if (ret && ret != -ENODEV) {
676 pci_warn(vdev->pdev, "Failed to setup zPCI info capabilities\n");
677 return ret;
e6b817d4
MR
678 }
679
680 if (caps.size) {
681 info.flags |= VFIO_DEVICE_FLAGS_CAPS;
682 if (info.argsz < sizeof(info) + caps.size) {
683 info.argsz = sizeof(info) + caps.size;
684 } else {
685 vfio_info_cap_shift(&caps, sizeof(info));
686 if (copy_to_user((void __user *)arg +
687 sizeof(info), caps.buf,
688 caps.size)) {
689 kfree(caps.buf);
690 return -EFAULT;
691 }
692 info.cap_offset = sizeof(info);
693 }
694
695 kfree(caps.buf);
696 }
697
8160c4e4
MT
698 return copy_to_user((void __user *)arg, &info, minsz) ?
699 -EFAULT : 0;
89e1f7d4
AW
700
701 } else if (cmd == VFIO_DEVICE_GET_REGION_INFO) {
702 struct pci_dev *pdev = vdev->pdev;
703 struct vfio_region_info info;
188ad9d6 704 struct vfio_info_cap caps = { .buf = NULL, .size = 0 };
28541d41 705 int i, ret;
89e1f7d4
AW
706
707 minsz = offsetofend(struct vfio_region_info, offset);
708
709 if (copy_from_user(&info, (void __user *)arg, minsz))
710 return -EFAULT;
711
712 if (info.argsz < minsz)
713 return -EINVAL;
714
715 switch (info.index) {
716 case VFIO_PCI_CONFIG_REGION_INDEX:
717 info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index);
718 info.size = pdev->cfg_size;
719 info.flags = VFIO_REGION_INFO_FLAG_READ |
720 VFIO_REGION_INFO_FLAG_WRITE;
721 break;
722 case VFIO_PCI_BAR0_REGION_INDEX ... VFIO_PCI_BAR5_REGION_INDEX:
723 info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index);
724 info.size = pci_resource_len(pdev, info.index);
725 if (!info.size) {
726 info.flags = 0;
727 break;
728 }
729
730 info.flags = VFIO_REGION_INFO_FLAG_READ |
731 VFIO_REGION_INFO_FLAG_WRITE;
05f0c03f 732 if (vdev->bar_mmap_supported[info.index]) {
89e1f7d4 733 info.flags |= VFIO_REGION_INFO_FLAG_MMAP;
188ad9d6 734 if (info.index == vdev->msix_bar) {
a32295c6 735 ret = msix_mmappable_cap(vdev, &caps);
188ad9d6
AW
736 if (ret)
737 return ret;
738 }
739 }
740
89e1f7d4
AW
741 break;
742 case VFIO_PCI_ROM_REGION_INDEX:
743 {
744 void __iomem *io;
745 size_t size;
abafbc55 746 u16 cmd;
89e1f7d4
AW
747
748 info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index);
749 info.flags = 0;
750
751 /* Report the BAR size, not the ROM size */
752 info.size = pci_resource_len(pdev, info.index);
a13b6459
AW
753 if (!info.size) {
754 /* Shadow ROMs appear as PCI option ROMs */
755 if (pdev->resource[PCI_ROM_RESOURCE].flags &
756 IORESOURCE_ROM_SHADOW)
757 info.size = 0x20000;
758 else
759 break;
760 }
89e1f7d4 761
0cfd027b
EA
762 /*
763 * Is it really there? Enable memory decode for
764 * implicit access in pci_map_rom().
765 */
abafbc55 766 cmd = vfio_pci_memory_lock_and_enable(vdev);
89e1f7d4 767 io = pci_map_rom(pdev, &size);
0cfd027b
EA
768 if (io) {
769 info.flags = VFIO_REGION_INFO_FLAG_READ;
770 pci_unmap_rom(pdev, io);
771 } else {
89e1f7d4 772 info.size = 0;
89e1f7d4 773 }
abafbc55 774 vfio_pci_memory_unlock_and_restore(vdev, cmd);
89e1f7d4 775
89e1f7d4
AW
776 break;
777 }
84237a82
AW
778 case VFIO_PCI_VGA_REGION_INDEX:
779 if (!vdev->has_vga)
780 return -EINVAL;
781
782 info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index);
783 info.size = 0xc0000;
784 info.flags = VFIO_REGION_INFO_FLAG_READ |
785 VFIO_REGION_INFO_FLAG_WRITE;
786
787 break;
89e1f7d4 788 default:
c535d345 789 {
dda01f78
AW
790 struct vfio_region_info_cap_type cap_type = {
791 .header.id = VFIO_REGION_INFO_CAP_TYPE,
792 .header.version = 1 };
c535d345 793
28541d41
AW
794 if (info.index >=
795 VFIO_PCI_NUM_REGIONS + vdev->num_regions)
796 return -EINVAL;
0e714d27
GS
797 info.index = array_index_nospec(info.index,
798 VFIO_PCI_NUM_REGIONS +
799 vdev->num_regions);
28541d41
AW
800
801 i = info.index - VFIO_PCI_NUM_REGIONS;
802
803 info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index);
804 info.size = vdev->region[i].size;
805 info.flags = vdev->region[i].flags;
806
c535d345
KW
807 cap_type.type = vdev->region[i].type;
808 cap_type.subtype = vdev->region[i].subtype;
809
dda01f78
AW
810 ret = vfio_info_add_capability(&caps, &cap_type.header,
811 sizeof(cap_type));
28541d41
AW
812 if (ret)
813 return ret;
c535d345 814
c2c0f1cd
AK
815 if (vdev->region[i].ops->add_capability) {
816 ret = vdev->region[i].ops->add_capability(vdev,
817 &vdev->region[i], &caps);
818 if (ret)
819 return ret;
820 }
c535d345 821 }
89e1f7d4
AW
822 }
823
188ad9d6
AW
824 if (caps.size) {
825 info.flags |= VFIO_REGION_INFO_FLAG_CAPS;
826 if (info.argsz < sizeof(info) + caps.size) {
827 info.argsz = sizeof(info) + caps.size;
828 info.cap_offset = 0;
829 } else {
830 vfio_info_cap_shift(&caps, sizeof(info));
c4aec310
DC
831 if (copy_to_user((void __user *)arg +
832 sizeof(info), caps.buf,
833 caps.size)) {
188ad9d6 834 kfree(caps.buf);
c4aec310 835 return -EFAULT;
188ad9d6
AW
836 }
837 info.cap_offset = sizeof(info);
838 }
839
840 kfree(caps.buf);
89e1f7d4
AW
841 }
842
8160c4e4
MT
843 return copy_to_user((void __user *)arg, &info, minsz) ?
844 -EFAULT : 0;
89e1f7d4
AW
845
846 } else if (cmd == VFIO_DEVICE_GET_IRQ_INFO) {
847 struct vfio_irq_info info;
848
849 minsz = offsetofend(struct vfio_irq_info, count);
850
851 if (copy_from_user(&info, (void __user *)arg, minsz))
852 return -EFAULT;
853
854 if (info.argsz < minsz || info.index >= VFIO_PCI_NUM_IRQS)
855 return -EINVAL;
856
dad9f897
VMP
857 switch (info.index) {
858 case VFIO_PCI_INTX_IRQ_INDEX ... VFIO_PCI_MSIX_IRQ_INDEX:
6140a8f5 859 case VFIO_PCI_REQ_IRQ_INDEX:
dad9f897
VMP
860 break;
861 case VFIO_PCI_ERR_IRQ_INDEX:
862 if (pci_is_pcie(vdev->pdev))
863 break;
df561f66 864 fallthrough;
dad9f897
VMP
865 default:
866 return -EINVAL;
867 }
868
89e1f7d4
AW
869 info.flags = VFIO_IRQ_INFO_EVENTFD;
870
871 info.count = vfio_pci_get_irq_count(vdev, info.index);
872
873 if (info.index == VFIO_PCI_INTX_IRQ_INDEX)
874 info.flags |= (VFIO_IRQ_INFO_MASKABLE |
875 VFIO_IRQ_INFO_AUTOMASKED);
876 else
877 info.flags |= VFIO_IRQ_INFO_NORESIZE;
878
8160c4e4
MT
879 return copy_to_user((void __user *)arg, &info, minsz) ?
880 -EFAULT : 0;
89e1f7d4
AW
881
882 } else if (cmd == VFIO_DEVICE_SET_IRQS) {
883 struct vfio_irq_set hdr;
884 u8 *data = NULL;
05692d70 885 int max, ret = 0;
ef198aaa 886 size_t data_size = 0;
89e1f7d4
AW
887
888 minsz = offsetofend(struct vfio_irq_set, count);
889
890 if (copy_from_user(&hdr, (void __user *)arg, minsz))
891 return -EFAULT;
892
05692d70 893 max = vfio_pci_get_irq_count(vdev, hdr.index);
89e1f7d4 894
ef198aaa
KW
895 ret = vfio_set_irqs_validate_and_prepare(&hdr, max,
896 VFIO_PCI_NUM_IRQS, &data_size);
897 if (ret)
898 return ret;
89e1f7d4 899
ef198aaa 900 if (data_size) {
3a1f7041 901 data = memdup_user((void __user *)(arg + minsz),
ef198aaa 902 data_size);
3a1f7041
FW
903 if (IS_ERR(data))
904 return PTR_ERR(data);
89e1f7d4
AW
905 }
906
907 mutex_lock(&vdev->igate);
908
909 ret = vfio_pci_set_irqs_ioctl(vdev, hdr.flags, hdr.index,
910 hdr.start, hdr.count, data);
911
912 mutex_unlock(&vdev->igate);
913 kfree(data);
914
915 return ret;
916
8b27ee60 917 } else if (cmd == VFIO_DEVICE_RESET) {
abafbc55
AW
918 int ret;
919
920 if (!vdev->reset_works)
921 return -EINVAL;
922
923 vfio_pci_zap_and_down_write_memory_lock(vdev);
924 ret = pci_try_reset_function(vdev->pdev);
925 up_write(&vdev->memory_lock);
926
927 return ret;
89e1f7d4 928
8b27ee60
AW
929 } else if (cmd == VFIO_DEVICE_GET_PCI_HOT_RESET_INFO) {
930 struct vfio_pci_hot_reset_info hdr;
931 struct vfio_pci_fill_info fill = { 0 };
932 struct vfio_pci_dependent_device *devices = NULL;
933 bool slot = false;
934 int ret = 0;
935
936 minsz = offsetofend(struct vfio_pci_hot_reset_info, count);
937
938 if (copy_from_user(&hdr, (void __user *)arg, minsz))
939 return -EFAULT;
940
941 if (hdr.argsz < minsz)
942 return -EINVAL;
943
944 hdr.flags = 0;
945
946 /* Can we do a slot or bus reset or neither? */
947 if (!pci_probe_reset_slot(vdev->pdev->slot))
948 slot = true;
949 else if (pci_probe_reset_bus(vdev->pdev->bus))
950 return -ENODEV;
951
952 /* How many devices are affected? */
953 ret = vfio_pci_for_each_slot_or_bus(vdev->pdev,
954 vfio_pci_count_devs,
955 &fill.max, slot);
956 if (ret)
957 return ret;
958
959 WARN_ON(!fill.max); /* Should always be at least one */
960
961 /*
962 * If there's enough space, fill it now, otherwise return
963 * -ENOSPC and the number of devices affected.
964 */
965 if (hdr.argsz < sizeof(hdr) + (fill.max * sizeof(*devices))) {
966 ret = -ENOSPC;
967 hdr.count = fill.max;
968 goto reset_info_exit;
969 }
970
971 devices = kcalloc(fill.max, sizeof(*devices), GFP_KERNEL);
972 if (!devices)
973 return -ENOMEM;
974
975 fill.devices = devices;
976
977 ret = vfio_pci_for_each_slot_or_bus(vdev->pdev,
978 vfio_pci_fill_devs,
979 &fill, slot);
980
981 /*
982 * If a device was removed between counting and filling,
983 * we may come up short of fill.max. If a device was
984 * added, we'll have a return of -EAGAIN above.
985 */
986 if (!ret)
987 hdr.count = fill.cur;
988
989reset_info_exit:
990 if (copy_to_user((void __user *)arg, &hdr, minsz))
991 ret = -EFAULT;
992
993 if (!ret) {
994 if (copy_to_user((void __user *)(arg + minsz), devices,
995 hdr.count * sizeof(*devices)))
996 ret = -EFAULT;
997 }
998
999 kfree(devices);
1000 return ret;
1001
1002 } else if (cmd == VFIO_DEVICE_PCI_HOT_RESET) {
1003 struct vfio_pci_hot_reset hdr;
1004 int32_t *group_fds;
db44c174 1005 struct vfio_group **groups;
8b27ee60
AW
1006 struct vfio_pci_group_info info;
1007 bool slot = false;
db44c174 1008 int group_idx, count = 0, ret = 0;
8b27ee60
AW
1009
1010 minsz = offsetofend(struct vfio_pci_hot_reset, count);
1011
1012 if (copy_from_user(&hdr, (void __user *)arg, minsz))
1013 return -EFAULT;
1014
1015 if (hdr.argsz < minsz || hdr.flags)
1016 return -EINVAL;
1017
1018 /* Can we do a slot or bus reset or neither? */
1019 if (!pci_probe_reset_slot(vdev->pdev->slot))
1020 slot = true;
1021 else if (pci_probe_reset_bus(vdev->pdev->bus))
1022 return -ENODEV;
1023
1024 /*
1025 * We can't let userspace give us an arbitrarily large
1026 * buffer to copy, so verify how many we think there
1027 * could be. Note groups can have multiple devices so
1028 * one group per device is the max.
1029 */
1030 ret = vfio_pci_for_each_slot_or_bus(vdev->pdev,
1031 vfio_pci_count_devs,
1032 &count, slot);
1033 if (ret)
1034 return ret;
1035
1036 /* Somewhere between 1 and count is OK */
1037 if (!hdr.count || hdr.count > count)
1038 return -EINVAL;
1039
1040 group_fds = kcalloc(hdr.count, sizeof(*group_fds), GFP_KERNEL);
1041 groups = kcalloc(hdr.count, sizeof(*groups), GFP_KERNEL);
1042 if (!group_fds || !groups) {
1043 kfree(group_fds);
1044 kfree(groups);
1045 return -ENOMEM;
1046 }
1047
1048 if (copy_from_user(group_fds, (void __user *)(arg + minsz),
1049 hdr.count * sizeof(*group_fds))) {
1050 kfree(group_fds);
1051 kfree(groups);
1052 return -EFAULT;
1053 }
1054
1055 /*
1056 * For each group_fd, get the group through the vfio external
1057 * user interface and store the group and iommu ID. This
1058 * ensures the group is held across the reset.
1059 */
abafbc55 1060 for (group_idx = 0; group_idx < hdr.count; group_idx++) {
8b27ee60 1061 struct vfio_group *group;
abafbc55 1062 struct fd f = fdget(group_fds[group_idx]);
8b27ee60
AW
1063 if (!f.file) {
1064 ret = -EBADF;
1065 break;
1066 }
1067
1068 group = vfio_group_get_external_user(f.file);
1069 fdput(f);
1070 if (IS_ERR(group)) {
1071 ret = PTR_ERR(group);
1072 break;
1073 }
1074
db44c174 1075 groups[group_idx] = group;
8b27ee60
AW
1076 }
1077
1078 kfree(group_fds);
1079
1080 /* release reference to groups on error */
1081 if (ret)
1082 goto hot_reset_release;
1083
1084 info.count = hdr.count;
1085 info.groups = groups;
1086
db44c174 1087 ret = vfio_pci_dev_set_hot_reset(vdev->vdev.dev_set, &info);
8b27ee60
AW
1088
1089hot_reset_release:
abafbc55 1090 for (group_idx--; group_idx >= 0; group_idx--)
db44c174 1091 vfio_group_put_external_user(groups[group_idx]);
8b27ee60
AW
1092
1093 kfree(groups);
1094 return ret;
30656177
AW
1095 } else if (cmd == VFIO_DEVICE_IOEVENTFD) {
1096 struct vfio_device_ioeventfd ioeventfd;
1097 int count;
1098
1099 minsz = offsetofend(struct vfio_device_ioeventfd, fd);
1100
1101 if (copy_from_user(&ioeventfd, (void __user *)arg, minsz))
1102 return -EFAULT;
1103
1104 if (ioeventfd.argsz < minsz)
1105 return -EINVAL;
1106
1107 if (ioeventfd.flags & ~VFIO_DEVICE_IOEVENTFD_SIZE_MASK)
1108 return -EINVAL;
1109
1110 count = ioeventfd.flags & VFIO_DEVICE_IOEVENTFD_SIZE_MASK;
1111
1112 if (hweight8(count) != 1 || ioeventfd.fd < -1)
1113 return -EINVAL;
1114
1115 return vfio_pci_ioeventfd(vdev, ioeventfd.offset,
1116 ioeventfd.data, count, ioeventfd.fd);
445ad495
JG
1117 }
1118 return -ENOTTY;
1119}
1120EXPORT_SYMBOL_GPL(vfio_pci_core_ioctl);
43eeeecc 1121
445ad495
JG
1122static int vfio_pci_core_feature_token(struct vfio_device *device, u32 flags,
1123 void __user *arg, size_t argsz)
1124{
1125 struct vfio_pci_core_device *vdev =
1126 container_of(device, struct vfio_pci_core_device, vdev);
1127 uuid_t uuid;
1128 int ret;
43eeeecc 1129
445ad495
JG
1130 if (!vdev->vf_token)
1131 return -ENOTTY;
1132 /*
1133 * We do not support GET of the VF Token UUID as this could
1134 * expose the token of the previous device user.
1135 */
1136 ret = vfio_check_feature(flags, argsz, VFIO_DEVICE_FEATURE_SET,
1137 sizeof(uuid));
1138 if (ret != 1)
1139 return ret;
43eeeecc 1140
445ad495
JG
1141 if (copy_from_user(&uuid, arg, sizeof(uuid)))
1142 return -EFAULT;
43eeeecc 1143
445ad495
JG
1144 mutex_lock(&vdev->vf_token->lock);
1145 uuid_copy(&vdev->vf_token->uuid, &uuid);
1146 mutex_unlock(&vdev->vf_token->lock);
1147 return 0;
1148}
43eeeecc 1149
445ad495
JG
1150int vfio_pci_core_ioctl_feature(struct vfio_device *device, u32 flags,
1151 void __user *arg, size_t argsz)
1152{
1153 switch (flags & VFIO_DEVICE_FEATURE_MASK) {
1154 case VFIO_DEVICE_FEATURE_PCI_VF_TOKEN:
1155 return vfio_pci_core_feature_token(device, flags, arg, argsz);
1156 default:
1157 return -ENOTTY;
8b27ee60 1158 }
89e1f7d4 1159}
445ad495 1160EXPORT_SYMBOL_GPL(vfio_pci_core_ioctl_feature);
89e1f7d4 1161
53647510 1162static ssize_t vfio_pci_rw(struct vfio_pci_core_device *vdev, char __user *buf,
5b279a11 1163 size_t count, loff_t *ppos, bool iswrite)
89e1f7d4
AW
1164{
1165 unsigned int index = VFIO_PCI_OFFSET_TO_INDEX(*ppos);
89e1f7d4 1166
28541d41 1167 if (index >= VFIO_PCI_NUM_REGIONS + vdev->num_regions)
89e1f7d4
AW
1168 return -EINVAL;
1169
5b279a11
AW
1170 switch (index) {
1171 case VFIO_PCI_CONFIG_REGION_INDEX:
906ee99d
AW
1172 return vfio_pci_config_rw(vdev, buf, count, ppos, iswrite);
1173
5b279a11
AW
1174 case VFIO_PCI_ROM_REGION_INDEX:
1175 if (iswrite)
1176 return -EINVAL;
906ee99d 1177 return vfio_pci_bar_rw(vdev, buf, count, ppos, false);
89e1f7d4 1178
5b279a11 1179 case VFIO_PCI_BAR0_REGION_INDEX ... VFIO_PCI_BAR5_REGION_INDEX:
906ee99d 1180 return vfio_pci_bar_rw(vdev, buf, count, ppos, iswrite);
84237a82
AW
1181
1182 case VFIO_PCI_VGA_REGION_INDEX:
1183 return vfio_pci_vga_rw(vdev, buf, count, ppos, iswrite);
28541d41
AW
1184 default:
1185 index -= VFIO_PCI_NUM_REGIONS;
1186 return vdev->region[index].ops->rw(vdev, buf,
1187 count, ppos, iswrite);
5b279a11
AW
1188 }
1189
89e1f7d4
AW
1190 return -EINVAL;
1191}
1192
ff53edf6
MG
1193ssize_t vfio_pci_core_read(struct vfio_device *core_vdev, char __user *buf,
1194 size_t count, loff_t *ppos)
5b279a11 1195{
53647510
MG
1196 struct vfio_pci_core_device *vdev =
1197 container_of(core_vdev, struct vfio_pci_core_device, vdev);
6df62c5b 1198
906ee99d
AW
1199 if (!count)
1200 return 0;
1201
6df62c5b 1202 return vfio_pci_rw(vdev, buf, count, ppos, false);
5b279a11 1203}
7fa005ca 1204EXPORT_SYMBOL_GPL(vfio_pci_core_read);
5b279a11 1205
ff53edf6
MG
1206ssize_t vfio_pci_core_write(struct vfio_device *core_vdev, const char __user *buf,
1207 size_t count, loff_t *ppos)
89e1f7d4 1208{
53647510
MG
1209 struct vfio_pci_core_device *vdev =
1210 container_of(core_vdev, struct vfio_pci_core_device, vdev);
6df62c5b 1211
906ee99d
AW
1212 if (!count)
1213 return 0;
1214
6df62c5b 1215 return vfio_pci_rw(vdev, (char __user *)buf, count, ppos, true);
89e1f7d4 1216}
7fa005ca 1217EXPORT_SYMBOL_GPL(vfio_pci_core_write);
89e1f7d4 1218
abafbc55 1219/* Return 1 on zap and vma_lock acquired, 0 on contention (only with @try) */
53647510 1220static int vfio_pci_zap_and_vma_lock(struct vfio_pci_core_device *vdev, bool try)
abafbc55
AW
1221{
1222 struct vfio_pci_mmap_vma *mmap_vma, *tmp;
1223
1224 /*
1225 * Lock ordering:
c1e8d7c6 1226 * vma_lock is nested under mmap_lock for vm_ops callback paths.
abafbc55
AW
1227 * The memory_lock semaphore is used by both code paths calling
1228 * into this function to zap vmas and the vm_ops.fault callback
1229 * to protect the memory enable state of the device.
1230 *
c1e8d7c6 1231 * When zapping vmas we need to maintain the mmap_lock => vma_lock
abafbc55 1232 * ordering, which requires using vma_lock to walk vma_list to
c1e8d7c6 1233 * acquire an mm, then dropping vma_lock to get the mmap_lock and
abafbc55
AW
1234 * reacquiring vma_lock. This logic is derived from similar
1235 * requirements in uverbs_user_mmap_disassociate().
1236 *
c1e8d7c6 1237 * mmap_lock must always be the top-level lock when it is taken.
abafbc55 1238 * Therefore we can only hold the memory_lock write lock when
c1e8d7c6 1239 * vma_list is empty, as we'd need to take mmap_lock to clear
abafbc55
AW
1240 * entries. vma_list can only be guaranteed empty when holding
1241 * vma_lock, thus memory_lock is nested under vma_lock.
1242 *
1243 * This enables the vm_ops.fault callback to acquire vma_lock,
1244 * followed by memory_lock read lock, while already holding
c1e8d7c6 1245 * mmap_lock without risk of deadlock.
abafbc55
AW
1246 */
1247 while (1) {
1248 struct mm_struct *mm = NULL;
1249
1250 if (try) {
1251 if (!mutex_trylock(&vdev->vma_lock))
1252 return 0;
1253 } else {
1254 mutex_lock(&vdev->vma_lock);
1255 }
1256 while (!list_empty(&vdev->vma_list)) {
1257 mmap_vma = list_first_entry(&vdev->vma_list,
1258 struct vfio_pci_mmap_vma,
1259 vma_next);
1260 mm = mmap_vma->vma->vm_mm;
1261 if (mmget_not_zero(mm))
1262 break;
1263
1264 list_del(&mmap_vma->vma_next);
1265 kfree(mmap_vma);
1266 mm = NULL;
1267 }
1268 if (!mm)
1269 return 1;
1270 mutex_unlock(&vdev->vma_lock);
1271
1272 if (try) {
89154dd5 1273 if (!mmap_read_trylock(mm)) {
abafbc55
AW
1274 mmput(mm);
1275 return 0;
1276 }
1277 } else {
89154dd5 1278 mmap_read_lock(mm);
abafbc55 1279 }
4d45e75a
JH
1280 if (try) {
1281 if (!mutex_trylock(&vdev->vma_lock)) {
1282 mmap_read_unlock(mm);
1283 mmput(mm);
1284 return 0;
abafbc55 1285 }
4d45e75a
JH
1286 } else {
1287 mutex_lock(&vdev->vma_lock);
1288 }
1289 list_for_each_entry_safe(mmap_vma, tmp,
1290 &vdev->vma_list, vma_next) {
1291 struct vm_area_struct *vma = mmap_vma->vma;
abafbc55 1292
4d45e75a
JH
1293 if (vma->vm_mm != mm)
1294 continue;
abafbc55 1295
4d45e75a
JH
1296 list_del(&mmap_vma->vma_next);
1297 kfree(mmap_vma);
abafbc55 1298
4d45e75a
JH
1299 zap_vma_ptes(vma, vma->vm_start,
1300 vma->vm_end - vma->vm_start);
abafbc55 1301 }
4d45e75a 1302 mutex_unlock(&vdev->vma_lock);
89154dd5 1303 mmap_read_unlock(mm);
abafbc55
AW
1304 mmput(mm);
1305 }
1306}
1307
53647510 1308void vfio_pci_zap_and_down_write_memory_lock(struct vfio_pci_core_device *vdev)
abafbc55
AW
1309{
1310 vfio_pci_zap_and_vma_lock(vdev, false);
1311 down_write(&vdev->memory_lock);
1312 mutex_unlock(&vdev->vma_lock);
1313}
1314
53647510 1315u16 vfio_pci_memory_lock_and_enable(struct vfio_pci_core_device *vdev)
abafbc55
AW
1316{
1317 u16 cmd;
1318
1319 down_write(&vdev->memory_lock);
1320 pci_read_config_word(vdev->pdev, PCI_COMMAND, &cmd);
1321 if (!(cmd & PCI_COMMAND_MEMORY))
1322 pci_write_config_word(vdev->pdev, PCI_COMMAND,
1323 cmd | PCI_COMMAND_MEMORY);
1324
1325 return cmd;
1326}
1327
53647510 1328void vfio_pci_memory_unlock_and_restore(struct vfio_pci_core_device *vdev, u16 cmd)
abafbc55
AW
1329{
1330 pci_write_config_word(vdev->pdev, PCI_COMMAND, cmd);
1331 up_write(&vdev->memory_lock);
1332}
1333
1334/* Caller holds vma_lock */
53647510 1335static int __vfio_pci_add_vma(struct vfio_pci_core_device *vdev,
abafbc55 1336 struct vm_area_struct *vma)
11c4cd07
AW
1337{
1338 struct vfio_pci_mmap_vma *mmap_vma;
1339
1340 mmap_vma = kmalloc(sizeof(*mmap_vma), GFP_KERNEL);
1341 if (!mmap_vma)
1342 return -ENOMEM;
1343
1344 mmap_vma->vma = vma;
11c4cd07 1345 list_add(&mmap_vma->vma_next, &vdev->vma_list);
11c4cd07
AW
1346
1347 return 0;
1348}
1349
1350/*
1351 * Zap mmaps on open so that we can fault them in on access and therefore
1352 * our vma_list only tracks mappings accessed since last zap.
1353 */
1354static void vfio_pci_mmap_open(struct vm_area_struct *vma)
1355{
1356 zap_vma_ptes(vma, vma->vm_start, vma->vm_end - vma->vm_start);
1357}
1358
1359static void vfio_pci_mmap_close(struct vm_area_struct *vma)
1360{
53647510 1361 struct vfio_pci_core_device *vdev = vma->vm_private_data;
11c4cd07
AW
1362 struct vfio_pci_mmap_vma *mmap_vma;
1363
1364 mutex_lock(&vdev->vma_lock);
1365 list_for_each_entry(mmap_vma, &vdev->vma_list, vma_next) {
1366 if (mmap_vma->vma == vma) {
1367 list_del(&mmap_vma->vma_next);
1368 kfree(mmap_vma);
1369 break;
1370 }
1371 }
1372 mutex_unlock(&vdev->vma_lock);
1373}
1374
1375static vm_fault_t vfio_pci_mmap_fault(struct vm_fault *vmf)
1376{
1377 struct vm_area_struct *vma = vmf->vma;
53647510 1378 struct vfio_pci_core_device *vdev = vma->vm_private_data;
6a45ece4 1379 struct vfio_pci_mmap_vma *mmap_vma;
abafbc55
AW
1380 vm_fault_t ret = VM_FAULT_NOPAGE;
1381
1382 mutex_lock(&vdev->vma_lock);
1383 down_read(&vdev->memory_lock);
1384
1385 if (!__vfio_pci_memory_enabled(vdev)) {
1386 ret = VM_FAULT_SIGBUS;
abafbc55
AW
1387 goto up_out;
1388 }
11c4cd07 1389
6a45ece4
AW
1390 /*
1391 * We populate the whole vma on fault, so we need to test whether
1392 * the vma has already been mapped, such as for concurrent faults
1393 * to the same vma. io_remap_pfn_range() will trigger a BUG_ON if
1394 * we ask it to fill the same range again.
1395 */
1396 list_for_each_entry(mmap_vma, &vdev->vma_list, vma_next) {
1397 if (mmap_vma->vma == vma)
1398 goto up_out;
abafbc55
AW
1399 }
1400
7b06a56d 1401 if (io_remap_pfn_range(vma, vma->vm_start, vma->vm_pgoff,
6a45ece4
AW
1402 vma->vm_end - vma->vm_start,
1403 vma->vm_page_prot)) {
abafbc55 1404 ret = VM_FAULT_SIGBUS;
6a45ece4
AW
1405 zap_vma_ptes(vma, vma->vm_start, vma->vm_end - vma->vm_start);
1406 goto up_out;
1407 }
1408
1409 if (__vfio_pci_add_vma(vdev, vma)) {
1410 ret = VM_FAULT_OOM;
1411 zap_vma_ptes(vma, vma->vm_start, vma->vm_end - vma->vm_start);
1412 }
11c4cd07 1413
abafbc55
AW
1414up_out:
1415 up_read(&vdev->memory_lock);
6a45ece4 1416 mutex_unlock(&vdev->vma_lock);
abafbc55 1417 return ret;
11c4cd07
AW
1418}
1419
1420static const struct vm_operations_struct vfio_pci_mmap_ops = {
1421 .open = vfio_pci_mmap_open,
1422 .close = vfio_pci_mmap_close,
1423 .fault = vfio_pci_mmap_fault,
1424};
1425
ff53edf6 1426int vfio_pci_core_mmap(struct vfio_device *core_vdev, struct vm_area_struct *vma)
89e1f7d4 1427{
53647510
MG
1428 struct vfio_pci_core_device *vdev =
1429 container_of(core_vdev, struct vfio_pci_core_device, vdev);
89e1f7d4
AW
1430 struct pci_dev *pdev = vdev->pdev;
1431 unsigned int index;
34002f54 1432 u64 phys_len, req_len, pgoff, req_start;
89e1f7d4
AW
1433 int ret;
1434
1435 index = vma->vm_pgoff >> (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT);
1436
90929078
CE
1437 if (index >= VFIO_PCI_NUM_REGIONS + vdev->num_regions)
1438 return -EINVAL;
89e1f7d4
AW
1439 if (vma->vm_end < vma->vm_start)
1440 return -EINVAL;
1441 if ((vma->vm_flags & VM_SHARED) == 0)
1442 return -EINVAL;
a15b1883
AK
1443 if (index >= VFIO_PCI_NUM_REGIONS) {
1444 int regnum = index - VFIO_PCI_NUM_REGIONS;
1445 struct vfio_pci_region *region = vdev->region + regnum;
1446
90929078 1447 if (region->ops && region->ops->mmap &&
a15b1883
AK
1448 (region->flags & VFIO_REGION_INFO_FLAG_MMAP))
1449 return region->ops->mmap(vdev, region, vma);
1450 return -EINVAL;
1451 }
89e1f7d4
AW
1452 if (index >= VFIO_PCI_ROM_REGION_INDEX)
1453 return -EINVAL;
05f0c03f 1454 if (!vdev->bar_mmap_supported[index])
89e1f7d4
AW
1455 return -EINVAL;
1456
05f0c03f 1457 phys_len = PAGE_ALIGN(pci_resource_len(pdev, index));
89e1f7d4
AW
1458 req_len = vma->vm_end - vma->vm_start;
1459 pgoff = vma->vm_pgoff &
1460 ((1U << (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT)) - 1);
1461 req_start = pgoff << PAGE_SHIFT;
1462
05f0c03f 1463 if (req_start + req_len > phys_len)
89e1f7d4
AW
1464 return -EINVAL;
1465
89e1f7d4
AW
1466 /*
1467 * Even though we don't make use of the barmap for the mmap,
1468 * we need to request the region and the barmap tracks that.
1469 */
1470 if (!vdev->barmap[index]) {
1471 ret = pci_request_selected_regions(pdev,
1472 1 << index, "vfio-pci");
1473 if (ret)
1474 return ret;
1475
1476 vdev->barmap[index] = pci_iomap(pdev, index, 0);
e19f32da
AY
1477 if (!vdev->barmap[index]) {
1478 pci_release_selected_regions(pdev, 1 << index);
1479 return -ENOMEM;
1480 }
89e1f7d4
AW
1481 }
1482
1483 vma->vm_private_data = vdev;
89e1f7d4 1484 vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
34002f54 1485 vma->vm_pgoff = (pci_resource_start(pdev, index) >> PAGE_SHIFT) + pgoff;
89e1f7d4 1486
11c4cd07
AW
1487 /*
1488 * See remap_pfn_range(), called from vfio_pci_fault() but we can't
1489 * change vm_flags within the fault handler. Set them now.
1490 */
1491 vma->vm_flags |= VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP;
1492 vma->vm_ops = &vfio_pci_mmap_ops;
1493
1494 return 0;
89e1f7d4 1495}
7fa005ca 1496EXPORT_SYMBOL_GPL(vfio_pci_core_mmap);
89e1f7d4 1497
ff53edf6 1498void vfio_pci_core_request(struct vfio_device *core_vdev, unsigned int count)
6140a8f5 1499{
53647510
MG
1500 struct vfio_pci_core_device *vdev =
1501 container_of(core_vdev, struct vfio_pci_core_device, vdev);
a88a7b3e 1502 struct pci_dev *pdev = vdev->pdev;
6140a8f5
AW
1503
1504 mutex_lock(&vdev->igate);
1505
1506 if (vdev->req_trigger) {
5f55d2ae 1507 if (!(count % 10))
a88a7b3e 1508 pci_notice_ratelimited(pdev,
5f55d2ae
AW
1509 "Relaying device request to user (#%u)\n",
1510 count);
6140a8f5 1511 eventfd_signal(vdev->req_trigger, 1);
5f55d2ae 1512 } else if (count == 0) {
a88a7b3e 1513 pci_warn(pdev,
5f55d2ae 1514 "No device request channel registered, blocked until released by user\n");
6140a8f5
AW
1515 }
1516
1517 mutex_unlock(&vdev->igate);
1518}
7fa005ca 1519EXPORT_SYMBOL_GPL(vfio_pci_core_request);
6140a8f5 1520
53647510 1521static int vfio_pci_validate_vf_token(struct vfio_pci_core_device *vdev,
cc20d799
AW
1522 bool vf_token, uuid_t *uuid)
1523{
1524 /*
1525 * There's always some degree of trust or collaboration between SR-IOV
1526 * PF and VFs, even if just that the PF hosts the SR-IOV capability and
1527 * can disrupt VFs with a reset, but often the PF has more explicit
1528 * access to deny service to the VF or access data passed through the
1529 * VF. We therefore require an opt-in via a shared VF token (UUID) to
1530 * represent this trust. This both prevents that a VF driver might
1531 * assume the PF driver is a trusted, in-kernel driver, and also that
1532 * a PF driver might be replaced with a rogue driver, unknown to in-use
1533 * VF drivers.
1534 *
1535 * Therefore when presented with a VF, if the PF is a vfio device and
1536 * it is bound to the vfio-pci driver, the user needs to provide a VF
1537 * token to access the device, in the form of appending a vf_token to
1538 * the device name, for example:
1539 *
1540 * "0000:04:10.0 vf_token=bd8d9d2b-5a5f-4f5a-a211-f591514ba1f3"
1541 *
1542 * When presented with a PF which has VFs in use, the user must also
1543 * provide the current VF token to prove collaboration with existing
1544 * VF users. If VFs are not in use, the VF token provided for the PF
1545 * device will act to set the VF token.
1546 *
1547 * If the VF token is provided but unused, an error is generated.
1548 */
1549 if (!vdev->pdev->is_virtfn && !vdev->vf_token && !vf_token)
1550 return 0; /* No VF token provided or required */
1551
1552 if (vdev->pdev->is_virtfn) {
53647510 1553 struct vfio_pci_core_device *pf_vdev = get_pf_vdev(vdev);
cc20d799
AW
1554 bool match;
1555
1556 if (!pf_vdev) {
1557 if (!vf_token)
1558 return 0; /* PF is not vfio-pci, no VF token */
1559
1560 pci_info_ratelimited(vdev->pdev,
1561 "VF token incorrectly provided, PF not bound to vfio-pci\n");
1562 return -EINVAL;
1563 }
1564
1565 if (!vf_token) {
07d47b42 1566 vfio_device_put(&pf_vdev->vdev);
cc20d799
AW
1567 pci_info_ratelimited(vdev->pdev,
1568 "VF token required to access device\n");
1569 return -EACCES;
1570 }
1571
1572 mutex_lock(&pf_vdev->vf_token->lock);
1573 match = uuid_equal(uuid, &pf_vdev->vf_token->uuid);
1574 mutex_unlock(&pf_vdev->vf_token->lock);
1575
07d47b42 1576 vfio_device_put(&pf_vdev->vdev);
cc20d799
AW
1577
1578 if (!match) {
1579 pci_info_ratelimited(vdev->pdev,
1580 "Incorrect VF token provided for device\n");
1581 return -EACCES;
1582 }
1583 } else if (vdev->vf_token) {
1584 mutex_lock(&vdev->vf_token->lock);
1585 if (vdev->vf_token->users) {
1586 if (!vf_token) {
1587 mutex_unlock(&vdev->vf_token->lock);
1588 pci_info_ratelimited(vdev->pdev,
1589 "VF token required to access device\n");
1590 return -EACCES;
1591 }
1592
1593 if (!uuid_equal(uuid, &vdev->vf_token->uuid)) {
1594 mutex_unlock(&vdev->vf_token->lock);
1595 pci_info_ratelimited(vdev->pdev,
1596 "Incorrect VF token provided for device\n");
1597 return -EACCES;
1598 }
1599 } else if (vf_token) {
1600 uuid_copy(&vdev->vf_token->uuid, uuid);
1601 }
1602
1603 mutex_unlock(&vdev->vf_token->lock);
1604 } else if (vf_token) {
1605 pci_info_ratelimited(vdev->pdev,
1606 "VF token incorrectly provided, not a PF or VF\n");
1607 return -EINVAL;
1608 }
1609
1610 return 0;
1611}
1612
1613#define VF_TOKEN_ARG "vf_token="
1614
ff53edf6 1615int vfio_pci_core_match(struct vfio_device *core_vdev, char *buf)
467c084f 1616{
53647510
MG
1617 struct vfio_pci_core_device *vdev =
1618 container_of(core_vdev, struct vfio_pci_core_device, vdev);
cc20d799
AW
1619 bool vf_token = false;
1620 uuid_t uuid;
1621 int ret;
1622
1623 if (strncmp(pci_name(vdev->pdev), buf, strlen(pci_name(vdev->pdev))))
1624 return 0; /* No match */
1625
1626 if (strlen(buf) > strlen(pci_name(vdev->pdev))) {
1627 buf += strlen(pci_name(vdev->pdev));
1628
1629 if (*buf != ' ')
1630 return 0; /* No match: non-whitespace after name */
1631
1632 while (*buf) {
1633 if (*buf == ' ') {
1634 buf++;
1635 continue;
1636 }
1637
1638 if (!vf_token && !strncmp(buf, VF_TOKEN_ARG,
1639 strlen(VF_TOKEN_ARG))) {
1640 buf += strlen(VF_TOKEN_ARG);
1641
1642 if (strlen(buf) < UUID_STRING_LEN)
1643 return -EINVAL;
1644
1645 ret = uuid_parse(buf, &uuid);
1646 if (ret)
1647 return ret;
467c084f 1648
cc20d799
AW
1649 vf_token = true;
1650 buf += UUID_STRING_LEN;
1651 } else {
1652 /* Unknown/duplicate option */
1653 return -EINVAL;
1654 }
1655 }
1656 }
1657
1658 ret = vfio_pci_validate_vf_token(vdev, vf_token, &uuid);
1659 if (ret)
1660 return ret;
1661
1662 return 1; /* Match */
467c084f 1663}
7fa005ca 1664EXPORT_SYMBOL_GPL(vfio_pci_core_match);
467c084f 1665
137e5531
AW
1666static int vfio_pci_bus_notifier(struct notifier_block *nb,
1667 unsigned long action, void *data)
1668{
53647510
MG
1669 struct vfio_pci_core_device *vdev = container_of(nb,
1670 struct vfio_pci_core_device, nb);
137e5531
AW
1671 struct device *dev = data;
1672 struct pci_dev *pdev = to_pci_dev(dev);
1673 struct pci_dev *physfn = pci_physfn(pdev);
1674
1675 if (action == BUS_NOTIFY_ADD_DEVICE &&
1676 pdev->is_virtfn && physfn == vdev->pdev) {
1677 pci_info(vdev->pdev, "Captured SR-IOV VF %s driver_override\n",
1678 pci_name(pdev));
1679 pdev->driver_override = kasprintf(GFP_KERNEL, "%s",
ff53edf6 1680 vdev->vdev.ops->name);
137e5531
AW
1681 } else if (action == BUS_NOTIFY_BOUND_DRIVER &&
1682 pdev->is_virtfn && physfn == vdev->pdev) {
1683 struct pci_driver *drv = pci_dev_driver(pdev);
1684
ff53edf6 1685 if (drv && drv != pci_dev_driver(vdev->pdev))
137e5531 1686 pci_warn(vdev->pdev,
ff53edf6
MG
1687 "VF %s bound to driver %s while PF bound to driver %s\n",
1688 pci_name(pdev), drv->name,
1689 pci_dev_driver(vdev->pdev)->name);
137e5531
AW
1690 }
1691
1692 return 0;
1693}
e309df5b 1694
53647510 1695static int vfio_pci_vf_init(struct vfio_pci_core_device *vdev)
61e90817
JG
1696{
1697 struct pci_dev *pdev = vdev->pdev;
1698 int ret;
1699
1700 if (!pdev->is_physfn)
1701 return 0;
1702
1703 vdev->vf_token = kzalloc(sizeof(*vdev->vf_token), GFP_KERNEL);
1704 if (!vdev->vf_token)
1705 return -ENOMEM;
1706
1707 mutex_init(&vdev->vf_token->lock);
1708 uuid_gen(&vdev->vf_token->uuid);
1709
1710 vdev->nb.notifier_call = vfio_pci_bus_notifier;
1711 ret = bus_register_notifier(&pci_bus_type, &vdev->nb);
1712 if (ret) {
1713 kfree(vdev->vf_token);
1714 return ret;
1715 }
1716 return 0;
1717}
1718
53647510 1719static void vfio_pci_vf_uninit(struct vfio_pci_core_device *vdev)
61e90817
JG
1720{
1721 if (!vdev->vf_token)
1722 return;
1723
1724 bus_unregister_notifier(&pci_bus_type, &vdev->nb);
1725 WARN_ON(vdev->vf_token->users);
1726 mutex_destroy(&vdev->vf_token->lock);
1727 kfree(vdev->vf_token);
1728}
1729
53647510 1730static int vfio_pci_vga_init(struct vfio_pci_core_device *vdev)
61e90817
JG
1731{
1732 struct pci_dev *pdev = vdev->pdev;
1733 int ret;
1734
1735 if (!vfio_pci_is_vga(pdev))
1736 return 0;
1737
89b6b8cd 1738 ret = vga_client_register(pdev, vfio_pci_set_decode);
61e90817
JG
1739 if (ret)
1740 return ret;
89b6b8cd 1741 vga_set_legacy_decoding(pdev, vfio_pci_set_decode(pdev, false));
61e90817
JG
1742 return 0;
1743}
1744
53647510 1745static void vfio_pci_vga_uninit(struct vfio_pci_core_device *vdev)
61e90817
JG
1746{
1747 struct pci_dev *pdev = vdev->pdev;
1748
1749 if (!vfio_pci_is_vga(pdev))
1750 return;
89b6b8cd 1751 vga_client_unregister(pdev);
61e90817
JG
1752 vga_set_legacy_decoding(pdev, VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM |
1753 VGA_RSRC_LEGACY_IO |
1754 VGA_RSRC_LEGACY_MEM);
1755}
1756
ff53edf6
MG
1757void vfio_pci_core_init_device(struct vfio_pci_core_device *vdev,
1758 struct pci_dev *pdev,
1759 const struct vfio_device_ops *vfio_pci_ops)
89e1f7d4 1760{
ff53edf6
MG
1761 vfio_init_group_dev(&vdev->vdev, &pdev->dev, vfio_pci_ops);
1762 vdev->pdev = pdev;
1763 vdev->irq_type = VFIO_PCI_NUM_IRQS;
1764 mutex_init(&vdev->igate);
1765 spin_lock_init(&vdev->irqlock);
1766 mutex_init(&vdev->ioeventfds_lock);
1767 INIT_LIST_HEAD(&vdev->dummy_resources_list);
1768 INIT_LIST_HEAD(&vdev->ioeventfds_list);
1769 mutex_init(&vdev->vma_lock);
1770 INIT_LIST_HEAD(&vdev->vma_list);
1771 init_rwsem(&vdev->memory_lock);
1772}
7fa005ca 1773EXPORT_SYMBOL_GPL(vfio_pci_core_init_device);
ff53edf6
MG
1774
1775void vfio_pci_core_uninit_device(struct vfio_pci_core_device *vdev)
1776{
1777 mutex_destroy(&vdev->igate);
1778 mutex_destroy(&vdev->ioeventfds_lock);
1779 mutex_destroy(&vdev->vma_lock);
1780 vfio_uninit_group_dev(&vdev->vdev);
1781 kfree(vdev->region);
1782 kfree(vdev->pm_save);
1783}
7fa005ca 1784EXPORT_SYMBOL_GPL(vfio_pci_core_uninit_device);
ff53edf6
MG
1785
1786int vfio_pci_core_register_device(struct vfio_pci_core_device *vdev)
1787{
1788 struct pci_dev *pdev = vdev->pdev;
89e1f7d4
AW
1789 int ret;
1790
7c2e211f 1791 if (pdev->hdr_type != PCI_HEADER_TYPE_NORMAL)
89e1f7d4
AW
1792 return -EINVAL;
1793
0dd0e297 1794 /*
137e5531
AW
1795 * Prevent binding to PFs with VFs enabled, the VFs might be in use
1796 * by the host or other users. We cannot capture the VFs if they
1797 * already exist, nor can we track VF users. Disabling SR-IOV here
1798 * would initiate removing the VFs, which would unbind the driver,
1799 * which is prone to blocking if that VF is also in use by vfio-pci.
1800 * Just reject these PFs and let the user sort it out.
0dd0e297
AW
1801 */
1802 if (pci_num_vf(pdev)) {
1803 pci_warn(pdev, "Cannot bind to PF with SR-IOV enabled\n");
1804 return -EBUSY;
1805 }
1806
2cd8b14a
YH
1807 if (pci_is_root_bus(pdev->bus)) {
1808 ret = vfio_assign_device_set(&vdev->vdev, vdev);
1809 } else if (!pci_probe_reset_slot(pdev->slot)) {
1810 ret = vfio_assign_device_set(&vdev->vdev, pdev->slot);
1811 } else {
1812 /*
1813 * If there is no slot reset support for this device, the whole
1814 * bus needs to be grouped together to support bus-wide resets.
1815 */
1816 ret = vfio_assign_device_set(&vdev->vdev, pdev->bus);
1817 }
1818
b66574a3 1819 if (ret)
38a68934 1820 return ret;
61e90817 1821 ret = vfio_pci_vf_init(vdev);
b66574a3 1822 if (ret)
38a68934 1823 return ret;
61e90817
JG
1824 ret = vfio_pci_vga_init(vdev);
1825 if (ret)
1826 goto out_vf;
ecaa1f6a 1827
51ef3a00
AW
1828 vfio_pci_probe_power_state(vdev);
1829
6eb70187
AW
1830 if (!disable_idle_d3) {
1831 /*
1832 * pci-core sets the device power state to an unknown value at
1833 * bootup and after being removed from a driver. The only
1834 * transition it allows from this unknown state is to D0, which
1835 * typically happens when a driver calls pci_enable_device().
1836 * We're not ready to enable the device yet, but we do want to
1837 * be able to get to D3. Therefore first do a D0 transition
1838 * before going to D3.
1839 */
51ef3a00
AW
1840 vfio_pci_set_power_state(vdev, PCI_D0);
1841 vfio_pci_set_power_state(vdev, PCI_D3hot);
6eb70187
AW
1842 }
1843
6b018e20 1844 ret = vfio_register_group_dev(&vdev->vdev);
4aeec398
JG
1845 if (ret)
1846 goto out_power;
1847 return 0;
b66574a3 1848
4aeec398
JG
1849out_power:
1850 if (!disable_idle_d3)
1851 vfio_pci_set_power_state(vdev, PCI_D0);
61e90817
JG
1852out_vf:
1853 vfio_pci_vf_uninit(vdev);
b66574a3 1854 return ret;
89e1f7d4 1855}
7fa005ca 1856EXPORT_SYMBOL_GPL(vfio_pci_core_register_device);
89e1f7d4 1857
ff53edf6 1858void vfio_pci_core_unregister_device(struct vfio_pci_core_device *vdev)
89e1f7d4 1859{
ff53edf6 1860 struct pci_dev *pdev = vdev->pdev;
89e1f7d4 1861
137e5531
AW
1862 pci_disable_sriov(pdev);
1863
6b018e20 1864 vfio_unregister_group_dev(&vdev->vdev);
137e5531 1865
61e90817 1866 vfio_pci_vf_uninit(vdev);
61e90817 1867 vfio_pci_vga_uninit(vdev);
e309df5b 1868
51ef3a00
AW
1869 if (!disable_idle_d3)
1870 vfio_pci_set_power_state(vdev, PCI_D0);
89e1f7d4 1871}
7fa005ca 1872EXPORT_SYMBOL_GPL(vfio_pci_core_unregister_device);
89e1f7d4 1873
dad9f897
VMP
1874static pci_ers_result_t vfio_pci_aer_err_detected(struct pci_dev *pdev,
1875 pci_channel_state_t state)
1876{
53647510 1877 struct vfio_pci_core_device *vdev;
dad9f897
VMP
1878 struct vfio_device *device;
1879
1880 device = vfio_device_get_from_dev(&pdev->dev);
1881 if (device == NULL)
1882 return PCI_ERS_RESULT_DISCONNECT;
1883
53647510 1884 vdev = container_of(device, struct vfio_pci_core_device, vdev);
dad9f897 1885
3be3a074
AW
1886 mutex_lock(&vdev->igate);
1887
dad9f897
VMP
1888 if (vdev->err_trigger)
1889 eventfd_signal(vdev->err_trigger, 1);
1890
3be3a074
AW
1891 mutex_unlock(&vdev->igate);
1892
dad9f897
VMP
1893 vfio_device_put(device);
1894
1895 return PCI_ERS_RESULT_CAN_RECOVER;
1896}
1897
ff53edf6 1898int vfio_pci_core_sriov_configure(struct pci_dev *pdev, int nr_virtfn)
137e5531 1899{
137e5531
AW
1900 struct vfio_device *device;
1901 int ret = 0;
1902
137e5531
AW
1903 device = vfio_device_get_from_dev(&pdev->dev);
1904 if (!device)
1905 return -ENODEV;
1906
137e5531
AW
1907 if (nr_virtfn == 0)
1908 pci_disable_sriov(pdev);
1909 else
1910 ret = pci_enable_sriov(pdev, nr_virtfn);
1911
1912 vfio_device_put(device);
1913
1914 return ret < 0 ? ret : nr_virtfn;
1915}
7fa005ca 1916EXPORT_SYMBOL_GPL(vfio_pci_core_sriov_configure);
137e5531 1917
ff53edf6 1918const struct pci_error_handlers vfio_pci_core_err_handlers = {
dad9f897
VMP
1919 .error_detected = vfio_pci_aer_err_detected,
1920};
7fa005ca 1921EXPORT_SYMBOL_GPL(vfio_pci_core_err_handlers);
dad9f897 1922
53647510 1923static bool vfio_dev_in_groups(struct vfio_pci_core_device *vdev,
db44c174 1924 struct vfio_pci_group_info *groups)
bc4fba77 1925{
db44c174 1926 unsigned int i;
bc4fba77 1927
db44c174
JG
1928 for (i = 0; i < groups->count; i++)
1929 if (groups->groups[i] == vdev->vdev.group)
1930 return true;
1931 return false;
bc4fba77
AW
1932}
1933
a882c16a 1934static int vfio_pci_is_device_in_set(struct pci_dev *pdev, void *data)
abafbc55 1935{
a882c16a
JG
1936 struct vfio_device_set *dev_set = data;
1937 struct vfio_device *cur;
abafbc55 1938
a882c16a
JG
1939 list_for_each_entry(cur, &dev_set->device_list, dev_set_list)
1940 if (cur->dev == &pdev->dev)
1941 return 0;
1942 return -EBUSY;
1943}
abafbc55 1944
a882c16a
JG
1945/*
1946 * vfio-core considers a group to be viable and will create a vfio_device even
1947 * if some devices are bound to drivers like pci-stub or pcieport. Here we
1948 * require all PCI devices to be inside our dev_set since that ensures they stay
1949 * put and that every driver controlling the device can co-ordinate with the
1950 * device reset.
1951 *
1952 * Returns the pci_dev to pass to pci_reset_bus() if every PCI device to be
1953 * reset is inside the dev_set, and pci_reset_bus() can succeed. NULL otherwise.
1954 */
1955static struct pci_dev *
1956vfio_pci_dev_set_resettable(struct vfio_device_set *dev_set)
1957{
1958 struct pci_dev *pdev;
abafbc55 1959
a882c16a 1960 lockdep_assert_held(&dev_set->lock);
abafbc55
AW
1961
1962 /*
a882c16a
JG
1963 * By definition all PCI devices in the dev_set share the same PCI
1964 * reset, so any pci_dev will have the same outcomes for
1965 * pci_probe_reset_*() and pci_reset_bus().
abafbc55 1966 */
53647510
MG
1967 pdev = list_first_entry(&dev_set->device_list,
1968 struct vfio_pci_core_device,
a882c16a 1969 vdev.dev_set_list)->pdev;
abafbc55 1970
a882c16a
JG
1971 /* pci_reset_bus() is supported */
1972 if (pci_probe_reset_slot(pdev->slot) && pci_probe_reset_bus(pdev->bus))
1973 return NULL;
1974
1975 if (vfio_pci_for_each_slot_or_bus(pdev, vfio_pci_is_device_in_set,
1976 dev_set,
1977 !pci_probe_reset_slot(pdev->slot)))
1978 return NULL;
1979 return pdev;
1980}
1981
db44c174
JG
1982/*
1983 * We need to get memory_lock for each device, but devices can share mmap_lock,
1984 * therefore we need to zap and hold the vma_lock for each device, and only then
1985 * get each memory_lock.
1986 */
1987static int vfio_pci_dev_set_hot_reset(struct vfio_device_set *dev_set,
1988 struct vfio_pci_group_info *groups)
1989{
53647510
MG
1990 struct vfio_pci_core_device *cur_mem;
1991 struct vfio_pci_core_device *cur_vma;
1992 struct vfio_pci_core_device *cur;
db44c174
JG
1993 struct pci_dev *pdev;
1994 bool is_mem = true;
1995 int ret;
1996
1997 mutex_lock(&dev_set->lock);
1998 cur_mem = list_first_entry(&dev_set->device_list,
53647510
MG
1999 struct vfio_pci_core_device,
2000 vdev.dev_set_list);
db44c174
JG
2001
2002 pdev = vfio_pci_dev_set_resettable(dev_set);
2003 if (!pdev) {
2004 ret = -EINVAL;
2005 goto err_unlock;
2006 }
2007
2008 list_for_each_entry(cur_vma, &dev_set->device_list, vdev.dev_set_list) {
2009 /*
2010 * Test whether all the affected devices are contained by the
2011 * set of groups provided by the user.
2012 */
2013 if (!vfio_dev_in_groups(cur_vma, groups)) {
2014 ret = -EINVAL;
2015 goto err_undo;
2016 }
2017
2018 /*
2019 * Locking multiple devices is prone to deadlock, runaway and
2020 * unwind if we hit contention.
2021 */
2022 if (!vfio_pci_zap_and_vma_lock(cur_vma, true)) {
2023 ret = -EBUSY;
2024 goto err_undo;
2025 }
2026 }
2027 cur_vma = NULL;
2028
2029 list_for_each_entry(cur_mem, &dev_set->device_list, vdev.dev_set_list) {
2030 if (!down_write_trylock(&cur_mem->memory_lock)) {
2031 ret = -EBUSY;
2032 goto err_undo;
2033 }
2034 mutex_unlock(&cur_mem->vma_lock);
2035 }
2036 cur_mem = NULL;
2037
2038 ret = pci_reset_bus(pdev);
2039
2040err_undo:
2041 list_for_each_entry(cur, &dev_set->device_list, vdev.dev_set_list) {
2042 if (cur == cur_mem)
2043 is_mem = false;
2044 if (cur == cur_vma)
2045 break;
2046 if (is_mem)
2047 up_write(&cur->memory_lock);
2048 else
2049 mutex_unlock(&cur->vma_lock);
2050 }
2051err_unlock:
2052 mutex_unlock(&dev_set->lock);
2053 return ret;
2054}
2055
a882c16a
JG
2056static bool vfio_pci_dev_set_needs_reset(struct vfio_device_set *dev_set)
2057{
53647510 2058 struct vfio_pci_core_device *cur;
a882c16a
JG
2059 bool needs_reset = false;
2060
2061 list_for_each_entry(cur, &dev_set->device_list, vdev.dev_set_list) {
2062 /* No VFIO device in the set can have an open device FD */
2063 if (cur->vdev.open_count)
2064 return false;
2065 needs_reset |= cur->needs_reset;
2066 }
2067 return needs_reset;
abafbc55
AW
2068}
2069
bc4fba77 2070/*
a882c16a 2071 * If a bus or slot reset is available for the provided dev_set and:
e309df5b 2072 * - All of the devices affected by that bus or slot reset are unused
e309df5b
AW
2073 * - At least one of the affected devices is marked dirty via
2074 * needs_reset (such as by lack of FLR support)
a882c16a
JG
2075 * Then attempt to perform that bus or slot reset.
2076 * Returns true if the dev_set was reset.
bc4fba77 2077 */
a882c16a 2078static bool vfio_pci_dev_set_try_reset(struct vfio_device_set *dev_set)
bc4fba77 2079{
53647510 2080 struct vfio_pci_core_device *cur;
a882c16a
JG
2081 struct pci_dev *pdev;
2082 int ret;
93899a67 2083
a882c16a
JG
2084 if (!vfio_pci_dev_set_needs_reset(dev_set))
2085 return false;
e309df5b 2086
a882c16a
JG
2087 pdev = vfio_pci_dev_set_resettable(dev_set);
2088 if (!pdev)
2089 return false;
6eb70187 2090
a882c16a
JG
2091 ret = pci_reset_bus(pdev);
2092 if (ret)
2093 return false;
6eb70187 2094
a882c16a
JG
2095 list_for_each_entry(cur, &dev_set->device_list, vdev.dev_set_list) {
2096 cur->needs_reset = false;
2097 if (!disable_idle_d3)
2098 vfio_pci_set_power_state(cur, PCI_D3hot);
93899a67 2099 }
a882c16a 2100 return true;
bc4fba77
AW
2101}
2102
c61302aa
YH
2103void vfio_pci_core_set_params(bool is_nointxmask, bool is_disable_vga,
2104 bool is_disable_idle_d3)
2105{
2106 nointxmask = is_nointxmask;
2107 disable_vga = is_disable_vga;
2108 disable_idle_d3 = is_disable_idle_d3;
2109}
7fa005ca 2110EXPORT_SYMBOL_GPL(vfio_pci_core_set_params);
c61302aa 2111
7fa005ca 2112static void vfio_pci_core_cleanup(void)
89e1f7d4 2113{
89e1f7d4
AW
2114 vfio_pci_uninit_perm_bits();
2115}
2116
7fa005ca 2117static int __init vfio_pci_core_init(void)
80c7e8cc 2118{
fbc9d371 2119 /* Allocate shared config space permission data used by all devices */
ff53edf6 2120 return vfio_pci_init_perm_bits();
89e1f7d4 2121}
7fa005ca
MG
2122
2123module_init(vfio_pci_core_init);
2124module_exit(vfio_pci_core_cleanup);
2125
2126MODULE_LICENSE("GPL v2");
2127MODULE_AUTHOR(DRIVER_AUTHOR);
2128MODULE_DESCRIPTION(DRIVER_DESC);