Commit | Line | Data |
---|---|---|
89e1f7d4 AW |
1 | /* |
2 | * VFIO PCI config space virtualization | |
3 | * | |
4 | * Copyright (C) 2012 Red Hat, Inc. All rights reserved. | |
5 | * Author: Alex Williamson <alex.williamson@redhat.com> | |
6 | * | |
7 | * This program is free software; you can redistribute it and/or modify | |
8 | * it under the terms of the GNU General Public License version 2 as | |
9 | * published by the Free Software Foundation. | |
10 | * | |
11 | * Derived from original vfio: | |
12 | * Copyright 2010 Cisco Systems, Inc. All rights reserved. | |
13 | * Author: Tom Lyon, pugs@cisco.com | |
14 | */ | |
15 | ||
16 | /* | |
17 | * This code handles reading and writing of PCI configuration registers. | |
18 | * This is hairy because we want to allow a lot of flexibility to the | |
19 | * user driver, but cannot trust it with all of the config fields. | |
20 | * Tables determine which fields can be read and written, as well as | |
21 | * which fields are 'virtualized' - special actions and translations to | |
22 | * make it appear to the user that he has control, when in fact things | |
23 | * must be negotiated with the underlying OS. | |
24 | */ | |
25 | ||
26 | #include <linux/fs.h> | |
27 | #include <linux/pci.h> | |
28 | #include <linux/uaccess.h> | |
29 | #include <linux/vfio.h> | |
25e9789d | 30 | #include <linux/slab.h> |
89e1f7d4 AW |
31 | |
32 | #include "vfio_pci_private.h" | |
33 | ||
/* Offsets at or above this value are treated as extended config space */
#define PCI_CFG_SPACE_SIZE	256

/* Fake capability ID for standard config space */
#define PCI_CAP_ID_BASIC	0

/*
 * True when a config space offset falls inside one of the BAR registers
 * (PCI_BASE_ADDRESS_0 through the last byte of PCI_BASE_ADDRESS_5) or
 * inside the ROM address register.
 *
 * The argument is parenthesized so that a compound expression is tested
 * as a single value.  It is still evaluated more than once, so callers
 * must not pass an expression with side effects.
 */
#define is_bar(offset)	\
	(((offset) >= PCI_BASE_ADDRESS_0 && \
	  (offset) < PCI_BASE_ADDRESS_5 + 4) || \
	 ((offset) >= PCI_ROM_ADDRESS && (offset) < PCI_ROM_ADDRESS + 4))
42 | ||
43 | /* | |
44 | * Lengths of PCI Config Capabilities | |
45 | * 0: Removed from the user visible capability list | |
46 | * FF: Variable length | |
47 | */ | |
222e684c | 48 | static const u8 pci_cap_length[PCI_CAP_ID_MAX + 1] = { |
89e1f7d4 AW |
49 | [PCI_CAP_ID_BASIC] = PCI_STD_HEADER_SIZEOF, /* pci config header */ |
50 | [PCI_CAP_ID_PM] = PCI_PM_SIZEOF, | |
51 | [PCI_CAP_ID_AGP] = PCI_AGP_SIZEOF, | |
52 | [PCI_CAP_ID_VPD] = PCI_CAP_VPD_SIZEOF, | |
53 | [PCI_CAP_ID_SLOTID] = 0, /* bridge - don't care */ | |
54 | [PCI_CAP_ID_MSI] = 0xFF, /* 10, 14, 20, or 24 */ | |
55 | [PCI_CAP_ID_CHSWP] = 0, /* cpci - not yet */ | |
56 | [PCI_CAP_ID_PCIX] = 0xFF, /* 8 or 24 */ | |
57 | [PCI_CAP_ID_HT] = 0xFF, /* hypertransport */ | |
58 | [PCI_CAP_ID_VNDR] = 0xFF, /* variable */ | |
59 | [PCI_CAP_ID_DBG] = 0, /* debug - don't care */ | |
60 | [PCI_CAP_ID_CCRC] = 0, /* cpci - not yet */ | |
61 | [PCI_CAP_ID_SHPC] = 0, /* hotswap - not yet */ | |
62 | [PCI_CAP_ID_SSVID] = 0, /* bridge - don't care */ | |
63 | [PCI_CAP_ID_AGP3] = 0, /* AGP8x - not yet */ | |
64 | [PCI_CAP_ID_SECDEV] = 0, /* secure device not yet */ | |
65 | [PCI_CAP_ID_EXP] = 0xFF, /* 20 or 44 */ | |
66 | [PCI_CAP_ID_MSIX] = PCI_CAP_MSIX_SIZEOF, | |
67 | [PCI_CAP_ID_SATA] = 0xFF, | |
68 | [PCI_CAP_ID_AF] = PCI_CAP_AF_SIZEOF, | |
69 | }; | |
70 | ||
71 | /* | |
72 | * Lengths of PCIe/PCI-X Extended Config Capabilities | |
73 | * 0: Removed or masked from the user visible capabilty list | |
74 | * FF: Variable length | |
75 | */ | |
222e684c | 76 | static const u16 pci_ext_cap_length[PCI_EXT_CAP_ID_MAX + 1] = { |
89e1f7d4 AW |
77 | [PCI_EXT_CAP_ID_ERR] = PCI_ERR_ROOT_COMMAND, |
78 | [PCI_EXT_CAP_ID_VC] = 0xFF, | |
79 | [PCI_EXT_CAP_ID_DSN] = PCI_EXT_CAP_DSN_SIZEOF, | |
80 | [PCI_EXT_CAP_ID_PWR] = PCI_EXT_CAP_PWR_SIZEOF, | |
81 | [PCI_EXT_CAP_ID_RCLD] = 0, /* root only - don't care */ | |
82 | [PCI_EXT_CAP_ID_RCILC] = 0, /* root only - don't care */ | |
83 | [PCI_EXT_CAP_ID_RCEC] = 0, /* root only - don't care */ | |
84 | [PCI_EXT_CAP_ID_MFVC] = 0xFF, | |
85 | [PCI_EXT_CAP_ID_VC9] = 0xFF, /* same as CAP_ID_VC */ | |
86 | [PCI_EXT_CAP_ID_RCRB] = 0, /* root only - don't care */ | |
87 | [PCI_EXT_CAP_ID_VNDR] = 0xFF, | |
88 | [PCI_EXT_CAP_ID_CAC] = 0, /* obsolete */ | |
89 | [PCI_EXT_CAP_ID_ACS] = 0xFF, | |
90 | [PCI_EXT_CAP_ID_ARI] = PCI_EXT_CAP_ARI_SIZEOF, | |
91 | [PCI_EXT_CAP_ID_ATS] = PCI_EXT_CAP_ATS_SIZEOF, | |
92 | [PCI_EXT_CAP_ID_SRIOV] = PCI_EXT_CAP_SRIOV_SIZEOF, | |
93 | [PCI_EXT_CAP_ID_MRIOV] = 0, /* not yet */ | |
94 | [PCI_EXT_CAP_ID_MCAST] = PCI_EXT_CAP_MCAST_ENDPOINT_SIZEOF, | |
95 | [PCI_EXT_CAP_ID_PRI] = PCI_EXT_CAP_PRI_SIZEOF, | |
96 | [PCI_EXT_CAP_ID_AMD_XXX] = 0, /* not yet */ | |
97 | [PCI_EXT_CAP_ID_REBAR] = 0xFF, | |
98 | [PCI_EXT_CAP_ID_DPA] = 0xFF, | |
99 | [PCI_EXT_CAP_ID_TPH] = 0xFF, | |
100 | [PCI_EXT_CAP_ID_LTR] = PCI_EXT_CAP_LTR_SIZEOF, | |
101 | [PCI_EXT_CAP_ID_SECPCI] = 0, /* not yet */ | |
102 | [PCI_EXT_CAP_ID_PMUX] = 0, /* not yet */ | |
103 | [PCI_EXT_CAP_ID_PASID] = 0, /* not yet */ | |
104 | }; | |
105 | ||
106 | /* | |
107 | * Read/Write Permission Bits - one bit for each bit in capability | |
108 | * Any field can be read if it exists, but what is read depends on | |
109 | * whether the field is 'virtualized', or just pass thru to the | |
110 | * hardware. Any virtualized field is also virtualized for writes. | |
111 | * Writes are only permitted if they have a 1 bit here. | |
112 | */ | |
113 | struct perm_bits { | |
114 | u8 *virt; /* read/write virtual data, not hw */ | |
115 | u8 *write; /* writeable bits */ | |
116 | int (*readfn)(struct vfio_pci_device *vdev, int pos, int count, | |
117 | struct perm_bits *perm, int offset, __le32 *val); | |
118 | int (*writefn)(struct vfio_pci_device *vdev, int pos, int count, | |
119 | struct perm_bits *perm, int offset, __le32 val); | |
120 | }; | |
121 | ||
122 | #define NO_VIRT 0 | |
123 | #define ALL_VIRT 0xFFFFFFFFU | |
124 | #define NO_WRITE 0 | |
125 | #define ALL_WRITE 0xFFFFFFFFU | |
126 | ||
127 | static int vfio_user_config_read(struct pci_dev *pdev, int offset, | |
128 | __le32 *val, int count) | |
129 | { | |
130 | int ret = -EINVAL; | |
131 | u32 tmp_val = 0; | |
132 | ||
133 | switch (count) { | |
134 | case 1: | |
135 | { | |
136 | u8 tmp; | |
137 | ret = pci_user_read_config_byte(pdev, offset, &tmp); | |
138 | tmp_val = tmp; | |
139 | break; | |
140 | } | |
141 | case 2: | |
142 | { | |
143 | u16 tmp; | |
144 | ret = pci_user_read_config_word(pdev, offset, &tmp); | |
145 | tmp_val = tmp; | |
146 | break; | |
147 | } | |
148 | case 4: | |
149 | ret = pci_user_read_config_dword(pdev, offset, &tmp_val); | |
150 | break; | |
151 | } | |
152 | ||
153 | *val = cpu_to_le32(tmp_val); | |
154 | ||
155 | return pcibios_err_to_errno(ret); | |
156 | } | |
157 | ||
158 | static int vfio_user_config_write(struct pci_dev *pdev, int offset, | |
159 | __le32 val, int count) | |
160 | { | |
161 | int ret = -EINVAL; | |
162 | u32 tmp_val = le32_to_cpu(val); | |
163 | ||
164 | switch (count) { | |
165 | case 1: | |
166 | ret = pci_user_write_config_byte(pdev, offset, tmp_val); | |
167 | break; | |
168 | case 2: | |
169 | ret = pci_user_write_config_word(pdev, offset, tmp_val); | |
170 | break; | |
171 | case 4: | |
172 | ret = pci_user_write_config_dword(pdev, offset, tmp_val); | |
173 | break; | |
174 | } | |
175 | ||
176 | return pcibios_err_to_errno(ret); | |
177 | } | |
178 | ||
179 | static int vfio_default_config_read(struct vfio_pci_device *vdev, int pos, | |
180 | int count, struct perm_bits *perm, | |
181 | int offset, __le32 *val) | |
182 | { | |
183 | __le32 virt = 0; | |
184 | ||
185 | memcpy(val, vdev->vconfig + pos, count); | |
186 | ||
187 | memcpy(&virt, perm->virt + offset, count); | |
188 | ||
189 | /* Any non-virtualized bits? */ | |
190 | if (cpu_to_le32(~0U >> (32 - (count * 8))) != virt) { | |
191 | struct pci_dev *pdev = vdev->pdev; | |
192 | __le32 phys_val = 0; | |
193 | int ret; | |
194 | ||
195 | ret = vfio_user_config_read(pdev, pos, &phys_val, count); | |
196 | if (ret) | |
197 | return ret; | |
198 | ||
199 | *val = (phys_val & ~virt) | (*val & virt); | |
200 | } | |
201 | ||
202 | return count; | |
203 | } | |
204 | ||
205 | static int vfio_default_config_write(struct vfio_pci_device *vdev, int pos, | |
206 | int count, struct perm_bits *perm, | |
207 | int offset, __le32 val) | |
208 | { | |
209 | __le32 virt = 0, write = 0; | |
210 | ||
211 | memcpy(&write, perm->write + offset, count); | |
212 | ||
213 | if (!write) | |
214 | return count; /* drop, no writable bits */ | |
215 | ||
216 | memcpy(&virt, perm->virt + offset, count); | |
217 | ||
218 | /* Virtualized and writable bits go to vconfig */ | |
219 | if (write & virt) { | |
220 | __le32 virt_val = 0; | |
221 | ||
222 | memcpy(&virt_val, vdev->vconfig + pos, count); | |
223 | ||
224 | virt_val &= ~(write & virt); | |
225 | virt_val |= (val & (write & virt)); | |
226 | ||
227 | memcpy(vdev->vconfig + pos, &virt_val, count); | |
228 | } | |
229 | ||
230 | /* Non-virtualzed and writable bits go to hardware */ | |
231 | if (write & ~virt) { | |
232 | struct pci_dev *pdev = vdev->pdev; | |
233 | __le32 phys_val = 0; | |
234 | int ret; | |
235 | ||
236 | ret = vfio_user_config_read(pdev, pos, &phys_val, count); | |
237 | if (ret) | |
238 | return ret; | |
239 | ||
240 | phys_val &= ~(write & ~virt); | |
241 | phys_val |= (val & (write & ~virt)); | |
242 | ||
243 | ret = vfio_user_config_write(pdev, pos, phys_val, count); | |
244 | if (ret) | |
245 | return ret; | |
246 | } | |
247 | ||
248 | return count; | |
249 | } | |
250 | ||
251 | /* Allow direct read from hardware, except for capability next pointer */ | |
252 | static int vfio_direct_config_read(struct vfio_pci_device *vdev, int pos, | |
253 | int count, struct perm_bits *perm, | |
254 | int offset, __le32 *val) | |
255 | { | |
256 | int ret; | |
257 | ||
258 | ret = vfio_user_config_read(vdev->pdev, pos, val, count); | |
259 | if (ret) | |
260 | return pcibios_err_to_errno(ret); | |
261 | ||
262 | if (pos >= PCI_CFG_SPACE_SIZE) { /* Extended cap header mangling */ | |
263 | if (offset < 4) | |
264 | memcpy(val, vdev->vconfig + pos, count); | |
265 | } else if (pos >= PCI_STD_HEADER_SIZEOF) { /* Std cap mangling */ | |
266 | if (offset == PCI_CAP_LIST_ID && count > 1) | |
267 | memcpy(val, vdev->vconfig + pos, | |
268 | min(PCI_CAP_FLAGS, count)); | |
269 | else if (offset == PCI_CAP_LIST_NEXT) | |
270 | memcpy(val, vdev->vconfig + pos, 1); | |
271 | } | |
272 | ||
273 | return count; | |
274 | } | |
275 | ||
a7d1ea1c AW |
276 | /* Raw access skips any kind of virtualization */ |
277 | static int vfio_raw_config_write(struct vfio_pci_device *vdev, int pos, | |
278 | int count, struct perm_bits *perm, | |
279 | int offset, __le32 val) | |
89e1f7d4 AW |
280 | { |
281 | int ret; | |
282 | ||
283 | ret = vfio_user_config_write(vdev->pdev, pos, val, count); | |
284 | if (ret) | |
285 | return ret; | |
286 | ||
287 | return count; | |
288 | } | |
289 | ||
a7d1ea1c AW |
290 | static int vfio_raw_config_read(struct vfio_pci_device *vdev, int pos, |
291 | int count, struct perm_bits *perm, | |
292 | int offset, __le32 *val) | |
293 | { | |
294 | int ret; | |
295 | ||
296 | ret = vfio_user_config_read(vdev->pdev, pos, val, count); | |
297 | if (ret) | |
298 | return pcibios_err_to_errno(ret); | |
299 | ||
300 | return count; | |
301 | } | |
302 | ||
345d7104 AW |
303 | /* Virt access uses only virtualization */ |
304 | static int vfio_virt_config_write(struct vfio_pci_device *vdev, int pos, | |
305 | int count, struct perm_bits *perm, | |
306 | int offset, __le32 val) | |
307 | { | |
308 | memcpy(vdev->vconfig + pos, &val, count); | |
309 | return count; | |
310 | } | |
311 | ||
312 | static int vfio_virt_config_read(struct vfio_pci_device *vdev, int pos, | |
313 | int count, struct perm_bits *perm, | |
314 | int offset, __le32 *val) | |
315 | { | |
316 | memcpy(val, vdev->vconfig + pos, count); | |
317 | return count; | |
318 | } | |
319 | ||
a7d1ea1c | 320 | /* Default capability regions to read-only, no-virtualization */ |
89e1f7d4 AW |
321 | static struct perm_bits cap_perms[PCI_CAP_ID_MAX + 1] = { |
322 | [0 ... PCI_CAP_ID_MAX] = { .readfn = vfio_direct_config_read } | |
323 | }; | |
324 | static struct perm_bits ecap_perms[PCI_EXT_CAP_ID_MAX + 1] = { | |
325 | [0 ... PCI_EXT_CAP_ID_MAX] = { .readfn = vfio_direct_config_read } | |
326 | }; | |
a7d1ea1c AW |
327 | /* |
328 | * Default unassigned regions to raw read-write access. Some devices | |
329 | * require this to function as they hide registers between the gaps in | |
330 | * config space (be2net). Like MMIO and I/O port registers, we have | |
331 | * to trust the hardware isolation. | |
332 | */ | |
333 | static struct perm_bits unassigned_perms = { | |
334 | .readfn = vfio_raw_config_read, | |
335 | .writefn = vfio_raw_config_write | |
336 | }; | |
89e1f7d4 | 337 | |
345d7104 AW |
338 | static struct perm_bits virt_perms = { |
339 | .readfn = vfio_virt_config_read, | |
340 | .writefn = vfio_virt_config_write | |
341 | }; | |
342 | ||
89e1f7d4 AW |
343 | static void free_perm_bits(struct perm_bits *perm) |
344 | { | |
345 | kfree(perm->virt); | |
346 | kfree(perm->write); | |
347 | perm->virt = NULL; | |
348 | perm->write = NULL; | |
349 | } | |
350 | ||
351 | static int alloc_perm_bits(struct perm_bits *perm, int size) | |
352 | { | |
353 | /* | |
354 | * Round up all permission bits to the next dword, this lets us | |
355 | * ignore whether a read/write exceeds the defined capability | |
356 | * structure. We can do this because: | |
357 | * - Standard config space is already dword aligned | |
358 | * - Capabilities are all dword alinged (bits 0:1 of next reserved) | |
359 | * - Express capabilities defined as dword aligned | |
360 | */ | |
361 | size = round_up(size, 4); | |
362 | ||
363 | /* | |
364 | * Zero state is | |
365 | * - All Readable, None Writeable, None Virtualized | |
366 | */ | |
367 | perm->virt = kzalloc(size, GFP_KERNEL); | |
368 | perm->write = kzalloc(size, GFP_KERNEL); | |
369 | if (!perm->virt || !perm->write) { | |
370 | free_perm_bits(perm); | |
371 | return -ENOMEM; | |
372 | } | |
373 | ||
374 | perm->readfn = vfio_default_config_read; | |
375 | perm->writefn = vfio_default_config_write; | |
376 | ||
377 | return 0; | |
378 | } | |
379 | ||
380 | /* | |
381 | * Helper functions for filling in permission tables | |
382 | */ | |
383 | static inline void p_setb(struct perm_bits *p, int off, u8 virt, u8 write) | |
384 | { | |
385 | p->virt[off] = virt; | |
386 | p->write[off] = write; | |
387 | } | |
388 | ||
389 | /* Handle endian-ness - pci and tables are little-endian */ | |
390 | static inline void p_setw(struct perm_bits *p, int off, u16 virt, u16 write) | |
391 | { | |
392 | *(__le16 *)(&p->virt[off]) = cpu_to_le16(virt); | |
393 | *(__le16 *)(&p->write[off]) = cpu_to_le16(write); | |
394 | } | |
395 | ||
396 | /* Handle endian-ness - pci and tables are little-endian */ | |
397 | static inline void p_setd(struct perm_bits *p, int off, u32 virt, u32 write) | |
398 | { | |
399 | *(__le32 *)(&p->virt[off]) = cpu_to_le32(virt); | |
400 | *(__le32 *)(&p->write[off]) = cpu_to_le32(write); | |
401 | } | |
402 | ||
403 | /* | |
404 | * Restore the *real* BARs after we detect a FLR or backdoor reset. | |
405 | * (backdoor = some device specific technique that we didn't catch) | |
406 | */ | |
407 | static void vfio_bar_restore(struct vfio_pci_device *vdev) | |
408 | { | |
409 | struct pci_dev *pdev = vdev->pdev; | |
410 | u32 *rbar = vdev->rbar; | |
45074405 | 411 | u16 cmd; |
89e1f7d4 AW |
412 | int i; |
413 | ||
414 | if (pdev->is_virtfn) | |
415 | return; | |
416 | ||
417 | pr_info("%s: %s reset recovery - restoring bars\n", | |
418 | __func__, dev_name(&pdev->dev)); | |
419 | ||
420 | for (i = PCI_BASE_ADDRESS_0; i <= PCI_BASE_ADDRESS_5; i += 4, rbar++) | |
421 | pci_user_write_config_dword(pdev, i, *rbar); | |
422 | ||
423 | pci_user_write_config_dword(pdev, PCI_ROM_ADDRESS, *rbar); | |
45074405 AW |
424 | |
425 | if (vdev->nointx) { | |
426 | pci_user_read_config_word(pdev, PCI_COMMAND, &cmd); | |
427 | cmd |= PCI_COMMAND_INTX_DISABLE; | |
428 | pci_user_write_config_word(pdev, PCI_COMMAND, cmd); | |
429 | } | |
89e1f7d4 AW |
430 | } |
431 | ||
432 | static __le32 vfio_generate_bar_flags(struct pci_dev *pdev, int bar) | |
433 | { | |
434 | unsigned long flags = pci_resource_flags(pdev, bar); | |
435 | u32 val; | |
436 | ||
437 | if (flags & IORESOURCE_IO) | |
438 | return cpu_to_le32(PCI_BASE_ADDRESS_SPACE_IO); | |
439 | ||
440 | val = PCI_BASE_ADDRESS_SPACE_MEMORY; | |
441 | ||
442 | if (flags & IORESOURCE_PREFETCH) | |
443 | val |= PCI_BASE_ADDRESS_MEM_PREFETCH; | |
444 | ||
445 | if (flags & IORESOURCE_MEM_64) | |
446 | val |= PCI_BASE_ADDRESS_MEM_TYPE_64; | |
447 | ||
448 | return cpu_to_le32(val); | |
449 | } | |
450 | ||
451 | /* | |
452 | * Pretend we're hardware and tweak the values of the *virtual* PCI BARs | |
453 | * to reflect the hardware capabilities. This implements BAR sizing. | |
454 | */ | |
455 | static void vfio_bar_fixup(struct vfio_pci_device *vdev) | |
456 | { | |
457 | struct pci_dev *pdev = vdev->pdev; | |
458 | int i; | |
459 | __le32 *bar; | |
460 | u64 mask; | |
461 | ||
462 | bar = (__le32 *)&vdev->vconfig[PCI_BASE_ADDRESS_0]; | |
463 | ||
464 | for (i = PCI_STD_RESOURCES; i <= PCI_STD_RESOURCE_END; i++, bar++) { | |
465 | if (!pci_resource_start(pdev, i)) { | |
466 | *bar = 0; /* Unmapped by host = unimplemented to user */ | |
467 | continue; | |
468 | } | |
469 | ||
470 | mask = ~(pci_resource_len(pdev, i) - 1); | |
471 | ||
472 | *bar &= cpu_to_le32((u32)mask); | |
473 | *bar |= vfio_generate_bar_flags(pdev, i); | |
474 | ||
475 | if (*bar & cpu_to_le32(PCI_BASE_ADDRESS_MEM_TYPE_64)) { | |
476 | bar++; | |
477 | *bar &= cpu_to_le32((u32)(mask >> 32)); | |
478 | i++; | |
479 | } | |
480 | } | |
481 | ||
482 | bar = (__le32 *)&vdev->vconfig[PCI_ROM_ADDRESS]; | |
483 | ||
484 | /* | |
a13b6459 AW |
485 | * NB. REGION_INFO will have reported zero size if we weren't able |
486 | * to read the ROM, but we still return the actual BAR size here if | |
487 | * it exists (or the shadow ROM space). | |
89e1f7d4 AW |
488 | */ |
489 | if (pci_resource_start(pdev, PCI_ROM_RESOURCE)) { | |
490 | mask = ~(pci_resource_len(pdev, PCI_ROM_RESOURCE) - 1); | |
491 | mask |= PCI_ROM_ADDRESS_ENABLE; | |
492 | *bar &= cpu_to_le32((u32)mask); | |
a13b6459 AW |
493 | } else if (pdev->resource[PCI_ROM_RESOURCE].flags & |
494 | IORESOURCE_ROM_SHADOW) { | |
495 | mask = ~(0x20000 - 1); | |
496 | mask |= PCI_ROM_ADDRESS_ENABLE; | |
497 | *bar &= cpu_to_le32((u32)mask); | |
89e1f7d4 AW |
498 | } else |
499 | *bar = 0; | |
500 | ||
501 | vdev->bardirty = false; | |
502 | } | |
503 | ||
504 | static int vfio_basic_config_read(struct vfio_pci_device *vdev, int pos, | |
505 | int count, struct perm_bits *perm, | |
506 | int offset, __le32 *val) | |
507 | { | |
508 | if (is_bar(offset)) /* pos == offset for basic config */ | |
509 | vfio_bar_fixup(vdev); | |
510 | ||
511 | count = vfio_default_config_read(vdev, pos, count, perm, offset, val); | |
512 | ||
513 | /* Mask in virtual memory enable for SR-IOV devices */ | |
514 | if (offset == PCI_COMMAND && vdev->pdev->is_virtfn) { | |
515 | u16 cmd = le16_to_cpu(*(__le16 *)&vdev->vconfig[PCI_COMMAND]); | |
516 | u32 tmp_val = le32_to_cpu(*val); | |
517 | ||
518 | tmp_val |= cmd & PCI_COMMAND_MEMORY; | |
519 | *val = cpu_to_le32(tmp_val); | |
520 | } | |
521 | ||
522 | return count; | |
523 | } | |
524 | ||
dc928109 AW |
525 | /* Test whether BARs match the value we think they should contain */ |
526 | static bool vfio_need_bar_restore(struct vfio_pci_device *vdev) | |
527 | { | |
528 | int i = 0, pos = PCI_BASE_ADDRESS_0, ret; | |
529 | u32 bar; | |
530 | ||
531 | for (; pos <= PCI_BASE_ADDRESS_5; i++, pos += 4) { | |
532 | if (vdev->rbar[i]) { | |
533 | ret = pci_user_read_config_dword(vdev->pdev, pos, &bar); | |
534 | if (ret || vdev->rbar[i] != bar) | |
535 | return true; | |
536 | } | |
537 | } | |
538 | ||
539 | return false; | |
540 | } | |
541 | ||
89e1f7d4 AW |
542 | static int vfio_basic_config_write(struct vfio_pci_device *vdev, int pos, |
543 | int count, struct perm_bits *perm, | |
544 | int offset, __le32 val) | |
545 | { | |
546 | struct pci_dev *pdev = vdev->pdev; | |
547 | __le16 *virt_cmd; | |
548 | u16 new_cmd = 0; | |
549 | int ret; | |
550 | ||
551 | virt_cmd = (__le16 *)&vdev->vconfig[PCI_COMMAND]; | |
552 | ||
553 | if (offset == PCI_COMMAND) { | |
554 | bool phys_mem, virt_mem, new_mem, phys_io, virt_io, new_io; | |
555 | u16 phys_cmd; | |
556 | ||
557 | ret = pci_user_read_config_word(pdev, PCI_COMMAND, &phys_cmd); | |
558 | if (ret) | |
559 | return ret; | |
560 | ||
561 | new_cmd = le32_to_cpu(val); | |
562 | ||
563 | phys_mem = !!(phys_cmd & PCI_COMMAND_MEMORY); | |
564 | virt_mem = !!(le16_to_cpu(*virt_cmd) & PCI_COMMAND_MEMORY); | |
565 | new_mem = !!(new_cmd & PCI_COMMAND_MEMORY); | |
566 | ||
567 | phys_io = !!(phys_cmd & PCI_COMMAND_IO); | |
568 | virt_io = !!(le16_to_cpu(*virt_cmd) & PCI_COMMAND_IO); | |
569 | new_io = !!(new_cmd & PCI_COMMAND_IO); | |
570 | ||
571 | /* | |
572 | * If the user is writing mem/io enable (new_mem/io) and we | |
573 | * think it's already enabled (virt_mem/io), but the hardware | |
574 | * shows it disabled (phys_mem/io, then the device has | |
575 | * undergone some kind of backdoor reset and needs to be | |
576 | * restored before we allow it to enable the bars. | |
577 | * SR-IOV devices will trigger this, but we catch them later | |
578 | */ | |
579 | if ((new_mem && virt_mem && !phys_mem) || | |
dc928109 AW |
580 | (new_io && virt_io && !phys_io) || |
581 | vfio_need_bar_restore(vdev)) | |
89e1f7d4 AW |
582 | vfio_bar_restore(vdev); |
583 | } | |
584 | ||
585 | count = vfio_default_config_write(vdev, pos, count, perm, offset, val); | |
586 | if (count < 0) | |
587 | return count; | |
588 | ||
589 | /* | |
590 | * Save current memory/io enable bits in vconfig to allow for | |
591 | * the test above next time. | |
592 | */ | |
593 | if (offset == PCI_COMMAND) { | |
594 | u16 mask = PCI_COMMAND_MEMORY | PCI_COMMAND_IO; | |
595 | ||
596 | *virt_cmd &= cpu_to_le16(~mask); | |
597 | *virt_cmd |= cpu_to_le16(new_cmd & mask); | |
598 | } | |
599 | ||
600 | /* Emulate INTx disable */ | |
601 | if (offset >= PCI_COMMAND && offset <= PCI_COMMAND + 1) { | |
602 | bool virt_intx_disable; | |
603 | ||
604 | virt_intx_disable = !!(le16_to_cpu(*virt_cmd) & | |
605 | PCI_COMMAND_INTX_DISABLE); | |
606 | ||
607 | if (virt_intx_disable && !vdev->virq_disabled) { | |
608 | vdev->virq_disabled = true; | |
609 | vfio_pci_intx_mask(vdev); | |
610 | } else if (!virt_intx_disable && vdev->virq_disabled) { | |
611 | vdev->virq_disabled = false; | |
612 | vfio_pci_intx_unmask(vdev); | |
613 | } | |
614 | } | |
615 | ||
616 | if (is_bar(offset)) | |
617 | vdev->bardirty = true; | |
618 | ||
619 | return count; | |
620 | } | |
621 | ||
622 | /* Permissions for the Basic PCI Header */ | |
623 | static int __init init_pci_cap_basic_perm(struct perm_bits *perm) | |
624 | { | |
625 | if (alloc_perm_bits(perm, PCI_STD_HEADER_SIZEOF)) | |
626 | return -ENOMEM; | |
627 | ||
628 | perm->readfn = vfio_basic_config_read; | |
629 | perm->writefn = vfio_basic_config_write; | |
630 | ||
631 | /* Virtualized for SR-IOV functions, which just have FFFF */ | |
632 | p_setw(perm, PCI_VENDOR_ID, (u16)ALL_VIRT, NO_WRITE); | |
633 | p_setw(perm, PCI_DEVICE_ID, (u16)ALL_VIRT, NO_WRITE); | |
634 | ||
635 | /* | |
636 | * Virtualize INTx disable, we use it internally for interrupt | |
637 | * control and can emulate it for non-PCI 2.3 devices. | |
638 | */ | |
639 | p_setw(perm, PCI_COMMAND, PCI_COMMAND_INTX_DISABLE, (u16)ALL_WRITE); | |
640 | ||
641 | /* Virtualize capability list, we might want to skip/disable */ | |
642 | p_setw(perm, PCI_STATUS, PCI_STATUS_CAP_LIST, NO_WRITE); | |
643 | ||
644 | /* No harm to write */ | |
645 | p_setb(perm, PCI_CACHE_LINE_SIZE, NO_VIRT, (u8)ALL_WRITE); | |
646 | p_setb(perm, PCI_LATENCY_TIMER, NO_VIRT, (u8)ALL_WRITE); | |
647 | p_setb(perm, PCI_BIST, NO_VIRT, (u8)ALL_WRITE); | |
648 | ||
649 | /* Virtualize all bars, can't touch the real ones */ | |
650 | p_setd(perm, PCI_BASE_ADDRESS_0, ALL_VIRT, ALL_WRITE); | |
651 | p_setd(perm, PCI_BASE_ADDRESS_1, ALL_VIRT, ALL_WRITE); | |
652 | p_setd(perm, PCI_BASE_ADDRESS_2, ALL_VIRT, ALL_WRITE); | |
653 | p_setd(perm, PCI_BASE_ADDRESS_3, ALL_VIRT, ALL_WRITE); | |
654 | p_setd(perm, PCI_BASE_ADDRESS_4, ALL_VIRT, ALL_WRITE); | |
655 | p_setd(perm, PCI_BASE_ADDRESS_5, ALL_VIRT, ALL_WRITE); | |
656 | p_setd(perm, PCI_ROM_ADDRESS, ALL_VIRT, ALL_WRITE); | |
657 | ||
658 | /* Allow us to adjust capability chain */ | |
659 | p_setb(perm, PCI_CAPABILITY_LIST, (u8)ALL_VIRT, NO_WRITE); | |
660 | ||
661 | /* Sometimes used by sw, just virtualize */ | |
662 | p_setb(perm, PCI_INTERRUPT_LINE, (u8)ALL_VIRT, (u8)ALL_WRITE); | |
1d53a3a7 FB |
663 | |
664 | /* Virtualize interrupt pin to allow hiding INTx */ | |
665 | p_setb(perm, PCI_INTERRUPT_PIN, (u8)ALL_VIRT, (u8)NO_WRITE); | |
666 | ||
89e1f7d4 AW |
667 | return 0; |
668 | } | |
669 | ||
2dd11948 AW |
670 | static int vfio_pm_config_write(struct vfio_pci_device *vdev, int pos, |
671 | int count, struct perm_bits *perm, | |
672 | int offset, __le32 val) | |
673 | { | |
674 | count = vfio_default_config_write(vdev, pos, count, perm, offset, val); | |
675 | if (count < 0) | |
676 | return count; | |
677 | ||
678 | if (offset == PCI_PM_CTRL) { | |
679 | pci_power_t state; | |
680 | ||
681 | switch (le32_to_cpu(val) & PCI_PM_CTRL_STATE_MASK) { | |
682 | case 0: | |
683 | state = PCI_D0; | |
684 | break; | |
685 | case 1: | |
686 | state = PCI_D1; | |
687 | break; | |
688 | case 2: | |
689 | state = PCI_D2; | |
690 | break; | |
691 | case 3: | |
692 | state = PCI_D3hot; | |
693 | break; | |
694 | } | |
695 | ||
696 | pci_set_power_state(vdev->pdev, state); | |
697 | } | |
698 | ||
699 | return count; | |
700 | } | |
701 | ||
89e1f7d4 AW |
702 | /* Permissions for the Power Management capability */ |
703 | static int __init init_pci_cap_pm_perm(struct perm_bits *perm) | |
704 | { | |
705 | if (alloc_perm_bits(perm, pci_cap_length[PCI_CAP_ID_PM])) | |
706 | return -ENOMEM; | |
707 | ||
2dd11948 AW |
708 | perm->writefn = vfio_pm_config_write; |
709 | ||
89e1f7d4 AW |
710 | /* |
711 | * We always virtualize the next field so we can remove | |
712 | * capabilities from the chain if we want to. | |
713 | */ | |
714 | p_setb(perm, PCI_CAP_LIST_NEXT, (u8)ALL_VIRT, NO_WRITE); | |
715 | ||
716 | /* | |
2dd11948 AW |
717 | * Power management is defined *per function*, so we can let |
718 | * the user change power state, but we trap and initiate the | |
719 | * change ourselves, so the state bits are read-only. | |
89e1f7d4 | 720 | */ |
2dd11948 | 721 | p_setd(perm, PCI_PM_CTRL, NO_VIRT, ~PCI_PM_CTRL_STATE_MASK); |
89e1f7d4 AW |
722 | return 0; |
723 | } | |
724 | ||
4e1a6355 AW |
725 | static int vfio_vpd_config_write(struct vfio_pci_device *vdev, int pos, |
726 | int count, struct perm_bits *perm, | |
727 | int offset, __le32 val) | |
728 | { | |
729 | struct pci_dev *pdev = vdev->pdev; | |
730 | __le16 *paddr = (__le16 *)(vdev->vconfig + pos - offset + PCI_VPD_ADDR); | |
731 | __le32 *pdata = (__le32 *)(vdev->vconfig + pos - offset + PCI_VPD_DATA); | |
732 | u16 addr; | |
733 | u32 data; | |
734 | ||
735 | /* | |
736 | * Write through to emulation. If the write includes the upper byte | |
737 | * of PCI_VPD_ADDR, then the PCI_VPD_ADDR_F bit is written and we | |
738 | * have work to do. | |
739 | */ | |
740 | count = vfio_default_config_write(vdev, pos, count, perm, offset, val); | |
741 | if (count < 0 || offset > PCI_VPD_ADDR + 1 || | |
742 | offset + count <= PCI_VPD_ADDR + 1) | |
743 | return count; | |
744 | ||
745 | addr = le16_to_cpu(*paddr); | |
746 | ||
747 | if (addr & PCI_VPD_ADDR_F) { | |
748 | data = le32_to_cpu(*pdata); | |
749 | if (pci_write_vpd(pdev, addr & ~PCI_VPD_ADDR_F, 4, &data) != 4) | |
750 | return count; | |
751 | } else { | |
ce7585f3 AW |
752 | data = 0; |
753 | if (pci_read_vpd(pdev, addr, 4, &data) < 0) | |
4e1a6355 AW |
754 | return count; |
755 | *pdata = cpu_to_le32(data); | |
756 | } | |
757 | ||
758 | /* | |
759 | * Toggle PCI_VPD_ADDR_F in the emulated PCI_VPD_ADDR register to | |
760 | * signal completion. If an error occurs above, we assume that not | |
761 | * toggling this bit will induce a driver timeout. | |
762 | */ | |
763 | addr ^= PCI_VPD_ADDR_F; | |
764 | *paddr = cpu_to_le16(addr); | |
765 | ||
766 | return count; | |
767 | } | |
768 | ||
769 | /* Permissions for Vital Product Data capability */ | |
770 | static int __init init_pci_cap_vpd_perm(struct perm_bits *perm) | |
771 | { | |
772 | if (alloc_perm_bits(perm, pci_cap_length[PCI_CAP_ID_VPD])) | |
773 | return -ENOMEM; | |
774 | ||
775 | perm->writefn = vfio_vpd_config_write; | |
776 | ||
777 | /* | |
778 | * We always virtualize the next field so we can remove | |
779 | * capabilities from the chain if we want to. | |
780 | */ | |
781 | p_setb(perm, PCI_CAP_LIST_NEXT, (u8)ALL_VIRT, NO_WRITE); | |
782 | ||
783 | /* | |
784 | * Both the address and data registers are virtualized to | |
785 | * enable access through the pci_vpd_read/write functions | |
786 | */ | |
787 | p_setw(perm, PCI_VPD_ADDR, (u16)ALL_VIRT, (u16)ALL_WRITE); | |
788 | p_setd(perm, PCI_VPD_DATA, ALL_VIRT, ALL_WRITE); | |
789 | ||
790 | return 0; | |
791 | } | |
792 | ||
89e1f7d4 AW |
793 | /* Permissions for PCI-X capability */ |
794 | static int __init init_pci_cap_pcix_perm(struct perm_bits *perm) | |
795 | { | |
796 | /* Alloc 24, but only 8 are used in v0 */ | |
797 | if (alloc_perm_bits(perm, PCI_CAP_PCIX_SIZEOF_V2)) | |
798 | return -ENOMEM; | |
799 | ||
800 | p_setb(perm, PCI_CAP_LIST_NEXT, (u8)ALL_VIRT, NO_WRITE); | |
801 | ||
802 | p_setw(perm, PCI_X_CMD, NO_VIRT, (u16)ALL_WRITE); | |
803 | p_setd(perm, PCI_X_ECC_CSR, NO_VIRT, ALL_WRITE); | |
804 | return 0; | |
805 | } | |
806 | ||
807 | /* Permissions for PCI Express capability */ | |
808 | static int __init init_pci_cap_exp_perm(struct perm_bits *perm) | |
809 | { | |
810 | /* Alloc larger of two possible sizes */ | |
811 | if (alloc_perm_bits(perm, PCI_CAP_EXP_ENDPOINT_SIZEOF_V2)) | |
812 | return -ENOMEM; | |
813 | ||
814 | p_setb(perm, PCI_CAP_LIST_NEXT, (u8)ALL_VIRT, NO_WRITE); | |
815 | ||
816 | /* | |
817 | * Allow writes to device control fields (includes FLR!) | |
818 | * but not to devctl_phantom which could confuse IOMMU | |
819 | * or to the ARI bit in devctl2 which is set at probe time | |
820 | */ | |
821 | p_setw(perm, PCI_EXP_DEVCTL, NO_VIRT, ~PCI_EXP_DEVCTL_PHANTOM); | |
822 | p_setw(perm, PCI_EXP_DEVCTL2, NO_VIRT, ~PCI_EXP_DEVCTL2_ARI); | |
823 | return 0; | |
824 | } | |
825 | ||
826 | /* Permissions for Advanced Function capability */ | |
827 | static int __init init_pci_cap_af_perm(struct perm_bits *perm) | |
828 | { | |
829 | if (alloc_perm_bits(perm, pci_cap_length[PCI_CAP_ID_AF])) | |
830 | return -ENOMEM; | |
831 | ||
832 | p_setb(perm, PCI_CAP_LIST_NEXT, (u8)ALL_VIRT, NO_WRITE); | |
833 | p_setb(perm, PCI_AF_CTRL, NO_VIRT, PCI_AF_CTRL_FLR); | |
834 | return 0; | |
835 | } | |
836 | ||
837 | /* Permissions for Advanced Error Reporting extended capability */ | |
838 | static int __init init_pci_ext_cap_err_perm(struct perm_bits *perm) | |
839 | { | |
840 | u32 mask; | |
841 | ||
842 | if (alloc_perm_bits(perm, pci_ext_cap_length[PCI_EXT_CAP_ID_ERR])) | |
843 | return -ENOMEM; | |
844 | ||
845 | /* | |
846 | * Virtualize the first dword of all express capabilities | |
847 | * because it includes the next pointer. This lets us later | |
848 | * remove capabilities from the chain if we need to. | |
849 | */ | |
850 | p_setd(perm, 0, ALL_VIRT, NO_WRITE); | |
851 | ||
852 | /* Writable bits mask */ | |
846fc709 | 853 | mask = PCI_ERR_UNC_UND | /* Undefined */ |
89e1f7d4 AW |
854 | PCI_ERR_UNC_DLP | /* Data Link Protocol */ |
855 | PCI_ERR_UNC_SURPDN | /* Surprise Down */ | |
856 | PCI_ERR_UNC_POISON_TLP | /* Poisoned TLP */ | |
857 | PCI_ERR_UNC_FCP | /* Flow Control Protocol */ | |
858 | PCI_ERR_UNC_COMP_TIME | /* Completion Timeout */ | |
859 | PCI_ERR_UNC_COMP_ABORT | /* Completer Abort */ | |
860 | PCI_ERR_UNC_UNX_COMP | /* Unexpected Completion */ | |
861 | PCI_ERR_UNC_RX_OVER | /* Receiver Overflow */ | |
862 | PCI_ERR_UNC_MALF_TLP | /* Malformed TLP */ | |
863 | PCI_ERR_UNC_ECRC | /* ECRC Error Status */ | |
864 | PCI_ERR_UNC_UNSUP | /* Unsupported Request */ | |
865 | PCI_ERR_UNC_ACSV | /* ACS Violation */ | |
866 | PCI_ERR_UNC_INTN | /* internal error */ | |
867 | PCI_ERR_UNC_MCBTLP | /* MC blocked TLP */ | |
868 | PCI_ERR_UNC_ATOMEG | /* Atomic egress blocked */ | |
869 | PCI_ERR_UNC_TLPPRE; /* TLP prefix blocked */ | |
870 | p_setd(perm, PCI_ERR_UNCOR_STATUS, NO_VIRT, mask); | |
871 | p_setd(perm, PCI_ERR_UNCOR_MASK, NO_VIRT, mask); | |
872 | p_setd(perm, PCI_ERR_UNCOR_SEVER, NO_VIRT, mask); | |
873 | ||
874 | mask = PCI_ERR_COR_RCVR | /* Receiver Error Status */ | |
875 | PCI_ERR_COR_BAD_TLP | /* Bad TLP Status */ | |
876 | PCI_ERR_COR_BAD_DLLP | /* Bad DLLP Status */ | |
877 | PCI_ERR_COR_REP_ROLL | /* REPLAY_NUM Rollover */ | |
878 | PCI_ERR_COR_REP_TIMER | /* Replay Timer Timeout */ | |
879 | PCI_ERR_COR_ADV_NFAT | /* Advisory Non-Fatal */ | |
880 | PCI_ERR_COR_INTERNAL | /* Corrected Internal */ | |
881 | PCI_ERR_COR_LOG_OVER; /* Header Log Overflow */ | |
882 | p_setd(perm, PCI_ERR_COR_STATUS, NO_VIRT, mask); | |
883 | p_setd(perm, PCI_ERR_COR_MASK, NO_VIRT, mask); | |
884 | ||
885 | mask = PCI_ERR_CAP_ECRC_GENE | /* ECRC Generation Enable */ | |
886 | PCI_ERR_CAP_ECRC_CHKE; /* ECRC Check Enable */ | |
887 | p_setd(perm, PCI_ERR_CAP, NO_VIRT, mask); | |
888 | return 0; | |
889 | } | |
890 | ||
891 | /* Permissions for Power Budgeting extended capability */ | |
892 | static int __init init_pci_ext_cap_pwr_perm(struct perm_bits *perm) | |
893 | { | |
894 | if (alloc_perm_bits(perm, pci_ext_cap_length[PCI_EXT_CAP_ID_PWR])) | |
895 | return -ENOMEM; | |
896 | ||
897 | p_setd(perm, 0, ALL_VIRT, NO_WRITE); | |
898 | ||
899 | /* Writing the data selector is OK, the info is still read-only */ | |
900 | p_setb(perm, PCI_PWR_DATA, NO_VIRT, (u8)ALL_WRITE); | |
901 | return 0; | |
902 | } | |
903 | ||
904 | /* | |
905 | * Initialize the shared permission tables | |
906 | */ | |
907 | void vfio_pci_uninit_perm_bits(void) | |
908 | { | |
909 | free_perm_bits(&cap_perms[PCI_CAP_ID_BASIC]); | |
910 | ||
911 | free_perm_bits(&cap_perms[PCI_CAP_ID_PM]); | |
4e1a6355 | 912 | free_perm_bits(&cap_perms[PCI_CAP_ID_VPD]); |
89e1f7d4 AW |
913 | free_perm_bits(&cap_perms[PCI_CAP_ID_PCIX]); |
914 | free_perm_bits(&cap_perms[PCI_CAP_ID_EXP]); | |
915 | free_perm_bits(&cap_perms[PCI_CAP_ID_AF]); | |
916 | ||
917 | free_perm_bits(&ecap_perms[PCI_EXT_CAP_ID_ERR]); | |
918 | free_perm_bits(&ecap_perms[PCI_EXT_CAP_ID_PWR]); | |
919 | } | |
920 | ||
921 | int __init vfio_pci_init_perm_bits(void) | |
922 | { | |
923 | int ret; | |
924 | ||
925 | /* Basic config space */ | |
926 | ret = init_pci_cap_basic_perm(&cap_perms[PCI_CAP_ID_BASIC]); | |
927 | ||
928 | /* Capabilities */ | |
929 | ret |= init_pci_cap_pm_perm(&cap_perms[PCI_CAP_ID_PM]); | |
4e1a6355 | 930 | ret |= init_pci_cap_vpd_perm(&cap_perms[PCI_CAP_ID_VPD]); |
89e1f7d4 | 931 | ret |= init_pci_cap_pcix_perm(&cap_perms[PCI_CAP_ID_PCIX]); |
a7d1ea1c | 932 | cap_perms[PCI_CAP_ID_VNDR].writefn = vfio_raw_config_write; |
89e1f7d4 AW |
933 | ret |= init_pci_cap_exp_perm(&cap_perms[PCI_CAP_ID_EXP]); |
934 | ret |= init_pci_cap_af_perm(&cap_perms[PCI_CAP_ID_AF]); | |
935 | ||
936 | /* Extended capabilities */ | |
937 | ret |= init_pci_ext_cap_err_perm(&ecap_perms[PCI_EXT_CAP_ID_ERR]); | |
938 | ret |= init_pci_ext_cap_pwr_perm(&ecap_perms[PCI_EXT_CAP_ID_PWR]); | |
a7d1ea1c | 939 | ecap_perms[PCI_EXT_CAP_ID_VNDR].writefn = vfio_raw_config_write; |
89e1f7d4 AW |
940 | |
941 | if (ret) | |
942 | vfio_pci_uninit_perm_bits(); | |
943 | ||
944 | return ret; | |
945 | } | |
946 | ||
947 | static int vfio_find_cap_start(struct vfio_pci_device *vdev, int pos) | |
948 | { | |
949 | u8 cap; | |
950 | int base = (pos >= PCI_CFG_SPACE_SIZE) ? PCI_CFG_SPACE_SIZE : | |
951 | PCI_STD_HEADER_SIZEOF; | |
89e1f7d4 AW |
952 | cap = vdev->pci_config_map[pos]; |
953 | ||
954 | if (cap == PCI_CAP_ID_BASIC) | |
955 | return 0; | |
956 | ||
957 | /* XXX Can we have to abutting capabilities of the same type? */ | |
958 | while (pos - 1 >= base && vdev->pci_config_map[pos - 1] == cap) | |
959 | pos--; | |
960 | ||
180b1381 | 961 | return pos; |
89e1f7d4 AW |
962 | } |
963 | ||
964 | static int vfio_msi_config_read(struct vfio_pci_device *vdev, int pos, | |
965 | int count, struct perm_bits *perm, | |
966 | int offset, __le32 *val) | |
967 | { | |
968 | /* Update max available queue size from msi_qmax */ | |
969 | if (offset <= PCI_MSI_FLAGS && offset + count >= PCI_MSI_FLAGS) { | |
970 | __le16 *flags; | |
971 | int start; | |
972 | ||
973 | start = vfio_find_cap_start(vdev, pos); | |
974 | ||
975 | flags = (__le16 *)&vdev->vconfig[start]; | |
976 | ||
977 | *flags &= cpu_to_le16(~PCI_MSI_FLAGS_QMASK); | |
978 | *flags |= cpu_to_le16(vdev->msi_qmax << 1); | |
979 | } | |
980 | ||
981 | return vfio_default_config_read(vdev, pos, count, perm, offset, val); | |
982 | } | |
983 | ||
984 | static int vfio_msi_config_write(struct vfio_pci_device *vdev, int pos, | |
985 | int count, struct perm_bits *perm, | |
986 | int offset, __le32 val) | |
987 | { | |
988 | count = vfio_default_config_write(vdev, pos, count, perm, offset, val); | |
989 | if (count < 0) | |
990 | return count; | |
991 | ||
992 | /* Fixup and write configured queue size and enable to hardware */ | |
993 | if (offset <= PCI_MSI_FLAGS && offset + count >= PCI_MSI_FLAGS) { | |
994 | __le16 *pflags; | |
995 | u16 flags; | |
996 | int start, ret; | |
997 | ||
998 | start = vfio_find_cap_start(vdev, pos); | |
999 | ||
1000 | pflags = (__le16 *)&vdev->vconfig[start + PCI_MSI_FLAGS]; | |
1001 | ||
1002 | flags = le16_to_cpu(*pflags); | |
1003 | ||
1004 | /* MSI is enabled via ioctl */ | |
1005 | if (!is_msi(vdev)) | |
1006 | flags &= ~PCI_MSI_FLAGS_ENABLE; | |
1007 | ||
1008 | /* Check queue size */ | |
1009 | if ((flags & PCI_MSI_FLAGS_QSIZE) >> 4 > vdev->msi_qmax) { | |
1010 | flags &= ~PCI_MSI_FLAGS_QSIZE; | |
1011 | flags |= vdev->msi_qmax << 4; | |
1012 | } | |
1013 | ||
1014 | /* Write back to virt and to hardware */ | |
1015 | *pflags = cpu_to_le16(flags); | |
1016 | ret = pci_user_write_config_word(vdev->pdev, | |
1017 | start + PCI_MSI_FLAGS, | |
1018 | flags); | |
1019 | if (ret) | |
1020 | return pcibios_err_to_errno(ret); | |
1021 | } | |
1022 | ||
1023 | return count; | |
1024 | } | |
1025 | ||
1026 | /* | |
1027 | * MSI determination is per-device, so this routine gets used beyond | |
1028 | * initialization time. Don't add __init | |
1029 | */ | |
1030 | static int init_pci_cap_msi_perm(struct perm_bits *perm, int len, u16 flags) | |
1031 | { | |
1032 | if (alloc_perm_bits(perm, len)) | |
1033 | return -ENOMEM; | |
1034 | ||
1035 | perm->readfn = vfio_msi_config_read; | |
1036 | perm->writefn = vfio_msi_config_write; | |
1037 | ||
1038 | p_setb(perm, PCI_CAP_LIST_NEXT, (u8)ALL_VIRT, NO_WRITE); | |
1039 | ||
1040 | /* | |
1041 | * The upper byte of the control register is reserved, | |
1042 | * just setup the lower byte. | |
1043 | */ | |
1044 | p_setb(perm, PCI_MSI_FLAGS, (u8)ALL_VIRT, (u8)ALL_WRITE); | |
1045 | p_setd(perm, PCI_MSI_ADDRESS_LO, ALL_VIRT, ALL_WRITE); | |
1046 | if (flags & PCI_MSI_FLAGS_64BIT) { | |
1047 | p_setd(perm, PCI_MSI_ADDRESS_HI, ALL_VIRT, ALL_WRITE); | |
1048 | p_setw(perm, PCI_MSI_DATA_64, (u16)ALL_VIRT, (u16)ALL_WRITE); | |
1049 | if (flags & PCI_MSI_FLAGS_MASKBIT) { | |
1050 | p_setd(perm, PCI_MSI_MASK_64, NO_VIRT, ALL_WRITE); | |
1051 | p_setd(perm, PCI_MSI_PENDING_64, NO_VIRT, ALL_WRITE); | |
1052 | } | |
1053 | } else { | |
1054 | p_setw(perm, PCI_MSI_DATA_32, (u16)ALL_VIRT, (u16)ALL_WRITE); | |
1055 | if (flags & PCI_MSI_FLAGS_MASKBIT) { | |
1056 | p_setd(perm, PCI_MSI_MASK_32, NO_VIRT, ALL_WRITE); | |
1057 | p_setd(perm, PCI_MSI_PENDING_32, NO_VIRT, ALL_WRITE); | |
1058 | } | |
1059 | } | |
1060 | return 0; | |
1061 | } | |
1062 | ||
1063 | /* Determine MSI CAP field length; initialize msi_perms on 1st call per vdev */ | |
1064 | static int vfio_msi_cap_len(struct vfio_pci_device *vdev, u8 pos) | |
1065 | { | |
1066 | struct pci_dev *pdev = vdev->pdev; | |
1067 | int len, ret; | |
1068 | u16 flags; | |
1069 | ||
1070 | ret = pci_read_config_word(pdev, pos + PCI_MSI_FLAGS, &flags); | |
1071 | if (ret) | |
1072 | return pcibios_err_to_errno(ret); | |
1073 | ||
1074 | len = 10; /* Minimum size */ | |
1075 | if (flags & PCI_MSI_FLAGS_64BIT) | |
1076 | len += 4; | |
1077 | if (flags & PCI_MSI_FLAGS_MASKBIT) | |
1078 | len += 10; | |
1079 | ||
1080 | if (vdev->msi_perm) | |
1081 | return len; | |
1082 | ||
1083 | vdev->msi_perm = kmalloc(sizeof(struct perm_bits), GFP_KERNEL); | |
1084 | if (!vdev->msi_perm) | |
1085 | return -ENOMEM; | |
1086 | ||
1087 | ret = init_pci_cap_msi_perm(vdev->msi_perm, len, flags); | |
1088 | if (ret) | |
1089 | return ret; | |
1090 | ||
1091 | return len; | |
1092 | } | |
1093 | ||
1094 | /* Determine extended capability length for VC (2 & 9) and MFVC */ | |
1095 | static int vfio_vc_cap_len(struct vfio_pci_device *vdev, u16 pos) | |
1096 | { | |
1097 | struct pci_dev *pdev = vdev->pdev; | |
1098 | u32 tmp; | |
1099 | int ret, evcc, phases, vc_arb; | |
1100 | int len = PCI_CAP_VC_BASE_SIZEOF; | |
1101 | ||
274127a1 | 1102 | ret = pci_read_config_dword(pdev, pos + PCI_VC_PORT_CAP1, &tmp); |
89e1f7d4 AW |
1103 | if (ret) |
1104 | return pcibios_err_to_errno(ret); | |
1105 | ||
274127a1 AW |
1106 | evcc = tmp & PCI_VC_CAP1_EVCC; /* extended vc count */ |
1107 | ret = pci_read_config_dword(pdev, pos + PCI_VC_PORT_CAP2, &tmp); | |
89e1f7d4 AW |
1108 | if (ret) |
1109 | return pcibios_err_to_errno(ret); | |
1110 | ||
274127a1 | 1111 | if (tmp & PCI_VC_CAP2_128_PHASE) |
89e1f7d4 | 1112 | phases = 128; |
274127a1 | 1113 | else if (tmp & PCI_VC_CAP2_64_PHASE) |
89e1f7d4 | 1114 | phases = 64; |
274127a1 | 1115 | else if (tmp & PCI_VC_CAP2_32_PHASE) |
89e1f7d4 AW |
1116 | phases = 32; |
1117 | else | |
1118 | phases = 0; | |
1119 | ||
1120 | vc_arb = phases * 4; | |
1121 | ||
1122 | /* | |
1123 | * Port arbitration tables are root & switch only; | |
1124 | * function arbitration tables are function 0 only. | |
1125 | * In either case, we'll never let user write them so | |
1126 | * we don't care how big they are | |
1127 | */ | |
1128 | len += (1 + evcc) * PCI_CAP_VC_PER_VC_SIZEOF; | |
1129 | if (vc_arb) { | |
1130 | len = round_up(len, 16); | |
1131 | len += vc_arb / 8; | |
1132 | } | |
1133 | return len; | |
1134 | } | |
1135 | ||
1136 | static int vfio_cap_len(struct vfio_pci_device *vdev, u8 cap, u8 pos) | |
1137 | { | |
1138 | struct pci_dev *pdev = vdev->pdev; | |
17638db1 | 1139 | u32 dword; |
89e1f7d4 AW |
1140 | u16 word; |
1141 | u8 byte; | |
1142 | int ret; | |
1143 | ||
1144 | switch (cap) { | |
1145 | case PCI_CAP_ID_MSI: | |
1146 | return vfio_msi_cap_len(vdev, pos); | |
1147 | case PCI_CAP_ID_PCIX: | |
1148 | ret = pci_read_config_word(pdev, pos + PCI_X_CMD, &word); | |
1149 | if (ret) | |
1150 | return pcibios_err_to_errno(ret); | |
1151 | ||
1152 | if (PCI_X_CMD_VERSION(word)) { | |
f7055280 AK |
1153 | if (pdev->cfg_size > PCI_CFG_SPACE_SIZE) { |
1154 | /* Test for extended capabilities */ | |
1155 | pci_read_config_dword(pdev, PCI_CFG_SPACE_SIZE, | |
1156 | &dword); | |
1157 | vdev->extended_caps = (dword != 0); | |
1158 | } | |
89e1f7d4 AW |
1159 | return PCI_CAP_PCIX_SIZEOF_V2; |
1160 | } else | |
1161 | return PCI_CAP_PCIX_SIZEOF_V0; | |
1162 | case PCI_CAP_ID_VNDR: | |
1163 | /* length follows next field */ | |
1164 | ret = pci_read_config_byte(pdev, pos + PCI_CAP_FLAGS, &byte); | |
1165 | if (ret) | |
1166 | return pcibios_err_to_errno(ret); | |
1167 | ||
1168 | return byte; | |
1169 | case PCI_CAP_ID_EXP: | |
f7055280 AK |
1170 | if (pdev->cfg_size > PCI_CFG_SPACE_SIZE) { |
1171 | /* Test for extended capabilities */ | |
1172 | pci_read_config_dword(pdev, PCI_CFG_SPACE_SIZE, &dword); | |
1173 | vdev->extended_caps = (dword != 0); | |
1174 | } | |
5641ade4 | 1175 | |
17638db1 | 1176 | /* length based on version */ |
aa2cba51 | 1177 | if ((pcie_caps_reg(pdev) & PCI_EXP_FLAGS_VERS) == 1) |
89e1f7d4 | 1178 | return PCI_CAP_EXP_ENDPOINT_SIZEOF_V1; |
5641ade4 | 1179 | else |
89e1f7d4 | 1180 | return PCI_CAP_EXP_ENDPOINT_SIZEOF_V2; |
89e1f7d4 AW |
1181 | case PCI_CAP_ID_HT: |
1182 | ret = pci_read_config_byte(pdev, pos + 3, &byte); | |
1183 | if (ret) | |
1184 | return pcibios_err_to_errno(ret); | |
1185 | ||
1186 | return (byte & HT_3BIT_CAP_MASK) ? | |
1187 | HT_CAP_SIZEOF_SHORT : HT_CAP_SIZEOF_LONG; | |
1188 | case PCI_CAP_ID_SATA: | |
1189 | ret = pci_read_config_byte(pdev, pos + PCI_SATA_REGS, &byte); | |
1190 | if (ret) | |
1191 | return pcibios_err_to_errno(ret); | |
1192 | ||
1193 | byte &= PCI_SATA_REGS_MASK; | |
1194 | if (byte == PCI_SATA_REGS_INLINE) | |
1195 | return PCI_SATA_SIZEOF_LONG; | |
1196 | else | |
1197 | return PCI_SATA_SIZEOF_SHORT; | |
1198 | default: | |
1199 | pr_warn("%s: %s unknown length for pci cap 0x%x@0x%x\n", | |
1200 | dev_name(&pdev->dev), __func__, cap, pos); | |
1201 | } | |
1202 | ||
1203 | return 0; | |
1204 | } | |
1205 | ||
1206 | static int vfio_ext_cap_len(struct vfio_pci_device *vdev, u16 ecap, u16 epos) | |
1207 | { | |
1208 | struct pci_dev *pdev = vdev->pdev; | |
1209 | u8 byte; | |
1210 | u32 dword; | |
1211 | int ret; | |
1212 | ||
1213 | switch (ecap) { | |
1214 | case PCI_EXT_CAP_ID_VNDR: | |
1215 | ret = pci_read_config_dword(pdev, epos + PCI_VSEC_HDR, &dword); | |
1216 | if (ret) | |
1217 | return pcibios_err_to_errno(ret); | |
1218 | ||
1219 | return dword >> PCI_VSEC_HDR_LEN_SHIFT; | |
1220 | case PCI_EXT_CAP_ID_VC: | |
1221 | case PCI_EXT_CAP_ID_VC9: | |
1222 | case PCI_EXT_CAP_ID_MFVC: | |
1223 | return vfio_vc_cap_len(vdev, epos); | |
1224 | case PCI_EXT_CAP_ID_ACS: | |
1225 | ret = pci_read_config_byte(pdev, epos + PCI_ACS_CAP, &byte); | |
1226 | if (ret) | |
1227 | return pcibios_err_to_errno(ret); | |
1228 | ||
1229 | if (byte & PCI_ACS_EC) { | |
1230 | int bits; | |
1231 | ||
1232 | ret = pci_read_config_byte(pdev, | |
1233 | epos + PCI_ACS_EGRESS_BITS, | |
1234 | &byte); | |
1235 | if (ret) | |
1236 | return pcibios_err_to_errno(ret); | |
1237 | ||
1238 | bits = byte ? round_up(byte, 32) : 256; | |
1239 | return 8 + (bits / 8); | |
1240 | } | |
1241 | return 8; | |
1242 | ||
1243 | case PCI_EXT_CAP_ID_REBAR: | |
1244 | ret = pci_read_config_byte(pdev, epos + PCI_REBAR_CTRL, &byte); | |
1245 | if (ret) | |
1246 | return pcibios_err_to_errno(ret); | |
1247 | ||
1248 | byte &= PCI_REBAR_CTRL_NBAR_MASK; | |
1249 | byte >>= PCI_REBAR_CTRL_NBAR_SHIFT; | |
1250 | ||
1251 | return 4 + (byte * 8); | |
1252 | case PCI_EXT_CAP_ID_DPA: | |
1253 | ret = pci_read_config_byte(pdev, epos + PCI_DPA_CAP, &byte); | |
1254 | if (ret) | |
1255 | return pcibios_err_to_errno(ret); | |
1256 | ||
1257 | byte &= PCI_DPA_CAP_SUBSTATE_MASK; | |
afa63252 | 1258 | return PCI_DPA_BASE_SIZEOF + byte + 1; |
89e1f7d4 AW |
1259 | case PCI_EXT_CAP_ID_TPH: |
1260 | ret = pci_read_config_dword(pdev, epos + PCI_TPH_CAP, &dword); | |
1261 | if (ret) | |
1262 | return pcibios_err_to_errno(ret); | |
1263 | ||
1264 | if ((dword & PCI_TPH_CAP_LOC_MASK) == PCI_TPH_LOC_CAP) { | |
1265 | int sts; | |
1266 | ||
afa63252 | 1267 | sts = dword & PCI_TPH_CAP_ST_MASK; |
89e1f7d4 | 1268 | sts >>= PCI_TPH_CAP_ST_SHIFT; |
afa63252 | 1269 | return PCI_TPH_BASE_SIZEOF + (sts * 2) + 2; |
89e1f7d4 AW |
1270 | } |
1271 | return PCI_TPH_BASE_SIZEOF; | |
1272 | default: | |
1273 | pr_warn("%s: %s unknown length for pci ecap 0x%x@0x%x\n", | |
1274 | dev_name(&pdev->dev), __func__, ecap, epos); | |
1275 | } | |
1276 | ||
1277 | return 0; | |
1278 | } | |
1279 | ||
1280 | static int vfio_fill_vconfig_bytes(struct vfio_pci_device *vdev, | |
1281 | int offset, int size) | |
1282 | { | |
1283 | struct pci_dev *pdev = vdev->pdev; | |
1284 | int ret = 0; | |
1285 | ||
1286 | /* | |
1287 | * We try to read physical config space in the largest chunks | |
1288 | * we can, assuming that all of the fields support dword access. | |
1289 | * pci_save_state() makes this same assumption and seems to do ok. | |
1290 | */ | |
1291 | while (size) { | |
1292 | int filled; | |
1293 | ||
1294 | if (size >= 4 && !(offset % 4)) { | |
1295 | __le32 *dwordp = (__le32 *)&vdev->vconfig[offset]; | |
1296 | u32 dword; | |
1297 | ||
1298 | ret = pci_read_config_dword(pdev, offset, &dword); | |
1299 | if (ret) | |
1300 | return ret; | |
1301 | *dwordp = cpu_to_le32(dword); | |
1302 | filled = 4; | |
1303 | } else if (size >= 2 && !(offset % 2)) { | |
1304 | __le16 *wordp = (__le16 *)&vdev->vconfig[offset]; | |
1305 | u16 word; | |
1306 | ||
1307 | ret = pci_read_config_word(pdev, offset, &word); | |
1308 | if (ret) | |
1309 | return ret; | |
1310 | *wordp = cpu_to_le16(word); | |
1311 | filled = 2; | |
1312 | } else { | |
1313 | u8 *byte = &vdev->vconfig[offset]; | |
1314 | ret = pci_read_config_byte(pdev, offset, byte); | |
1315 | if (ret) | |
1316 | return ret; | |
1317 | filled = 1; | |
1318 | } | |
1319 | ||
1320 | offset += filled; | |
1321 | size -= filled; | |
1322 | } | |
1323 | ||
1324 | return ret; | |
1325 | } | |
1326 | ||
1327 | static int vfio_cap_init(struct vfio_pci_device *vdev) | |
1328 | { | |
1329 | struct pci_dev *pdev = vdev->pdev; | |
1330 | u8 *map = vdev->pci_config_map; | |
1331 | u16 status; | |
1332 | u8 pos, *prev, cap; | |
1333 | int loops, ret, caps = 0; | |
1334 | ||
1335 | /* Any capabilities? */ | |
1336 | ret = pci_read_config_word(pdev, PCI_STATUS, &status); | |
1337 | if (ret) | |
1338 | return ret; | |
1339 | ||
1340 | if (!(status & PCI_STATUS_CAP_LIST)) | |
1341 | return 0; /* Done */ | |
1342 | ||
1343 | ret = pci_read_config_byte(pdev, PCI_CAPABILITY_LIST, &pos); | |
1344 | if (ret) | |
1345 | return ret; | |
1346 | ||
1347 | /* Mark the previous position in case we want to skip a capability */ | |
1348 | prev = &vdev->vconfig[PCI_CAPABILITY_LIST]; | |
1349 | ||
1350 | /* We can bound our loop, capabilities are dword aligned */ | |
1351 | loops = (PCI_CFG_SPACE_SIZE - PCI_STD_HEADER_SIZEOF) / PCI_CAP_SIZEOF; | |
1352 | while (pos && loops--) { | |
1353 | u8 next; | |
1354 | int i, len = 0; | |
1355 | ||
1356 | ret = pci_read_config_byte(pdev, pos, &cap); | |
1357 | if (ret) | |
1358 | return ret; | |
1359 | ||
1360 | ret = pci_read_config_byte(pdev, | |
1361 | pos + PCI_CAP_LIST_NEXT, &next); | |
1362 | if (ret) | |
1363 | return ret; | |
1364 | ||
1365 | if (cap <= PCI_CAP_ID_MAX) { | |
1366 | len = pci_cap_length[cap]; | |
1367 | if (len == 0xFF) { /* Variable length */ | |
1368 | len = vfio_cap_len(vdev, cap, pos); | |
1369 | if (len < 0) | |
1370 | return len; | |
1371 | } | |
1372 | } | |
1373 | ||
1374 | if (!len) { | |
1375 | pr_info("%s: %s hiding cap 0x%x\n", | |
1376 | __func__, dev_name(&pdev->dev), cap); | |
1377 | *prev = next; | |
1378 | pos = next; | |
1379 | continue; | |
1380 | } | |
1381 | ||
1382 | /* Sanity check, do we overlap other capabilities? */ | |
180b1381 AW |
1383 | for (i = 0; i < len; i++) { |
1384 | if (likely(map[pos + i] == PCI_CAP_ID_INVALID)) | |
89e1f7d4 AW |
1385 | continue; |
1386 | ||
1387 | pr_warn("%s: %s pci config conflict @0x%x, was cap 0x%x now cap 0x%x\n", | |
1388 | __func__, dev_name(&pdev->dev), | |
1389 | pos + i, map[pos + i], cap); | |
1390 | } | |
1391 | ||
345d7104 AW |
1392 | BUILD_BUG_ON(PCI_CAP_ID_MAX >= PCI_CAP_ID_INVALID_VIRT); |
1393 | ||
180b1381 | 1394 | memset(map + pos, cap, len); |
89e1f7d4 AW |
1395 | ret = vfio_fill_vconfig_bytes(vdev, pos, len); |
1396 | if (ret) | |
1397 | return ret; | |
1398 | ||
1399 | prev = &vdev->vconfig[pos + PCI_CAP_LIST_NEXT]; | |
1400 | pos = next; | |
1401 | caps++; | |
1402 | } | |
1403 | ||
1404 | /* If we didn't fill any capabilities, clear the status flag */ | |
1405 | if (!caps) { | |
1406 | __le16 *vstatus = (__le16 *)&vdev->vconfig[PCI_STATUS]; | |
1407 | *vstatus &= ~cpu_to_le16(PCI_STATUS_CAP_LIST); | |
1408 | } | |
1409 | ||
1410 | return 0; | |
1411 | } | |
1412 | ||
1413 | static int vfio_ecap_init(struct vfio_pci_device *vdev) | |
1414 | { | |
1415 | struct pci_dev *pdev = vdev->pdev; | |
1416 | u8 *map = vdev->pci_config_map; | |
1417 | u16 epos; | |
1418 | __le32 *prev = NULL; | |
1419 | int loops, ret, ecaps = 0; | |
1420 | ||
1421 | if (!vdev->extended_caps) | |
1422 | return 0; | |
1423 | ||
1424 | epos = PCI_CFG_SPACE_SIZE; | |
1425 | ||
1426 | loops = (pdev->cfg_size - PCI_CFG_SPACE_SIZE) / PCI_CAP_SIZEOF; | |
1427 | ||
1428 | while (loops-- && epos >= PCI_CFG_SPACE_SIZE) { | |
1429 | u32 header; | |
1430 | u16 ecap; | |
1431 | int i, len = 0; | |
1432 | bool hidden = false; | |
1433 | ||
1434 | ret = pci_read_config_dword(pdev, epos, &header); | |
1435 | if (ret) | |
1436 | return ret; | |
1437 | ||
1438 | ecap = PCI_EXT_CAP_ID(header); | |
1439 | ||
1440 | if (ecap <= PCI_EXT_CAP_ID_MAX) { | |
1441 | len = pci_ext_cap_length[ecap]; | |
1442 | if (len == 0xFF) { | |
1443 | len = vfio_ext_cap_len(vdev, ecap, epos); | |
1444 | if (len < 0) | |
1445 | return ret; | |
1446 | } | |
1447 | } | |
1448 | ||
1449 | if (!len) { | |
1450 | pr_info("%s: %s hiding ecap 0x%x@0x%x\n", | |
1451 | __func__, dev_name(&pdev->dev), ecap, epos); | |
1452 | ||
1453 | /* If not the first in the chain, we can skip over it */ | |
1454 | if (prev) { | |
1455 | u32 val = epos = PCI_EXT_CAP_NEXT(header); | |
1456 | *prev &= cpu_to_le32(~(0xffcU << 20)); | |
1457 | *prev |= cpu_to_le32(val << 20); | |
1458 | continue; | |
1459 | } | |
1460 | ||
1461 | /* | |
1462 | * Otherwise, fill in a placeholder, the direct | |
1463 | * readfn will virtualize this automatically | |
1464 | */ | |
1465 | len = PCI_CAP_SIZEOF; | |
1466 | hidden = true; | |
1467 | } | |
1468 | ||
180b1381 AW |
1469 | for (i = 0; i < len; i++) { |
1470 | if (likely(map[epos + i] == PCI_CAP_ID_INVALID)) | |
89e1f7d4 AW |
1471 | continue; |
1472 | ||
1473 | pr_warn("%s: %s pci config conflict @0x%x, was ecap 0x%x now ecap 0x%x\n", | |
1474 | __func__, dev_name(&pdev->dev), | |
1475 | epos + i, map[epos + i], ecap); | |
1476 | } | |
1477 | ||
1478 | /* | |
1479 | * Even though ecap is 2 bytes, we're currently a long way | |
1480 | * from exceeding 1 byte capabilities. If we ever make it | |
345d7104 | 1481 | * up to 0xFE we'll need to up this to a two-byte, byte map. |
89e1f7d4 | 1482 | */ |
345d7104 | 1483 | BUILD_BUG_ON(PCI_EXT_CAP_ID_MAX >= PCI_CAP_ID_INVALID_VIRT); |
89e1f7d4 | 1484 | |
180b1381 | 1485 | memset(map + epos, ecap, len); |
89e1f7d4 AW |
1486 | ret = vfio_fill_vconfig_bytes(vdev, epos, len); |
1487 | if (ret) | |
1488 | return ret; | |
1489 | ||
1490 | /* | |
1491 | * If we're just using this capability to anchor the list, | |
1492 | * hide the real ID. Only count real ecaps. XXX PCI spec | |
1493 | * indicates to use cap id = 0, version = 0, next = 0 if | |
1494 | * ecaps are absent, hope users check all the way to next. | |
1495 | */ | |
1496 | if (hidden) | |
1497 | *(__le32 *)&vdev->vconfig[epos] &= | |
1498 | cpu_to_le32((0xffcU << 20)); | |
1499 | else | |
1500 | ecaps++; | |
1501 | ||
1502 | prev = (__le32 *)&vdev->vconfig[epos]; | |
1503 | epos = PCI_EXT_CAP_NEXT(header); | |
1504 | } | |
1505 | ||
1506 | if (!ecaps) | |
1507 | *(u32 *)&vdev->vconfig[PCI_CFG_SPACE_SIZE] = 0; | |
1508 | ||
1509 | return 0; | |
1510 | } | |
1511 | ||
1512 | /* | |
1513 | * For each device we allocate a pci_config_map that indicates the | |
1514 | * capability occupying each dword and thus the struct perm_bits we | |
1515 | * use for read and write. We also allocate a virtualized config | |
1516 | * space which tracks reads and writes to bits that we emulate for | |
1517 | * the user. Initial values filled from device. | |
1518 | * | |
1519 | * Using shared stuct perm_bits between all vfio-pci devices saves | |
1520 | * us from allocating cfg_size buffers for virt and write for every | |
1521 | * device. We could remove vconfig and allocate individual buffers | |
1522 | * for each area requring emulated bits, but the array of pointers | |
1523 | * would be comparable in size (at least for standard config space). | |
1524 | */ | |
1525 | int vfio_config_init(struct vfio_pci_device *vdev) | |
1526 | { | |
1527 | struct pci_dev *pdev = vdev->pdev; | |
1528 | u8 *map, *vconfig; | |
1529 | int ret; | |
1530 | ||
1531 | /* | |
180b1381 AW |
1532 | * Config space, caps and ecaps are all dword aligned, so we could |
1533 | * use one byte per dword to record the type. However, there are | |
1534 | * no requiremenst on the length of a capability, so the gap between | |
1535 | * capabilities needs byte granularity. | |
89e1f7d4 | 1536 | */ |
180b1381 | 1537 | map = kmalloc(pdev->cfg_size, GFP_KERNEL); |
89e1f7d4 AW |
1538 | if (!map) |
1539 | return -ENOMEM; | |
1540 | ||
1541 | vconfig = kmalloc(pdev->cfg_size, GFP_KERNEL); | |
1542 | if (!vconfig) { | |
1543 | kfree(map); | |
1544 | return -ENOMEM; | |
1545 | } | |
1546 | ||
1547 | vdev->pci_config_map = map; | |
1548 | vdev->vconfig = vconfig; | |
1549 | ||
180b1381 AW |
1550 | memset(map, PCI_CAP_ID_BASIC, PCI_STD_HEADER_SIZEOF); |
1551 | memset(map + PCI_STD_HEADER_SIZEOF, PCI_CAP_ID_INVALID, | |
1552 | pdev->cfg_size - PCI_STD_HEADER_SIZEOF); | |
89e1f7d4 AW |
1553 | |
1554 | ret = vfio_fill_vconfig_bytes(vdev, 0, PCI_STD_HEADER_SIZEOF); | |
1555 | if (ret) | |
1556 | goto out; | |
1557 | ||
1558 | vdev->bardirty = true; | |
1559 | ||
1560 | /* | |
1561 | * XXX can we just pci_load_saved_state/pci_restore_state? | |
1562 | * may need to rebuild vconfig after that | |
1563 | */ | |
1564 | ||
1565 | /* For restore after reset */ | |
1566 | vdev->rbar[0] = le32_to_cpu(*(__le32 *)&vconfig[PCI_BASE_ADDRESS_0]); | |
1567 | vdev->rbar[1] = le32_to_cpu(*(__le32 *)&vconfig[PCI_BASE_ADDRESS_1]); | |
1568 | vdev->rbar[2] = le32_to_cpu(*(__le32 *)&vconfig[PCI_BASE_ADDRESS_2]); | |
1569 | vdev->rbar[3] = le32_to_cpu(*(__le32 *)&vconfig[PCI_BASE_ADDRESS_3]); | |
1570 | vdev->rbar[4] = le32_to_cpu(*(__le32 *)&vconfig[PCI_BASE_ADDRESS_4]); | |
1571 | vdev->rbar[5] = le32_to_cpu(*(__le32 *)&vconfig[PCI_BASE_ADDRESS_5]); | |
1572 | vdev->rbar[6] = le32_to_cpu(*(__le32 *)&vconfig[PCI_ROM_ADDRESS]); | |
1573 | ||
1574 | if (pdev->is_virtfn) { | |
1575 | *(__le16 *)&vconfig[PCI_VENDOR_ID] = cpu_to_le16(pdev->vendor); | |
1576 | *(__le16 *)&vconfig[PCI_DEVICE_ID] = cpu_to_le16(pdev->device); | |
1577 | } | |
1578 | ||
45074405 | 1579 | if (!IS_ENABLED(CONFIG_VFIO_PCI_INTX) || vdev->nointx) |
1d53a3a7 FB |
1580 | vconfig[PCI_INTERRUPT_PIN] = 0; |
1581 | ||
89e1f7d4 AW |
1582 | ret = vfio_cap_init(vdev); |
1583 | if (ret) | |
1584 | goto out; | |
1585 | ||
1586 | ret = vfio_ecap_init(vdev); | |
1587 | if (ret) | |
1588 | goto out; | |
1589 | ||
1590 | return 0; | |
1591 | ||
1592 | out: | |
1593 | kfree(map); | |
1594 | vdev->pci_config_map = NULL; | |
1595 | kfree(vconfig); | |
1596 | vdev->vconfig = NULL; | |
1597 | return pcibios_err_to_errno(ret); | |
1598 | } | |
1599 | ||
1600 | void vfio_config_free(struct vfio_pci_device *vdev) | |
1601 | { | |
1602 | kfree(vdev->vconfig); | |
1603 | vdev->vconfig = NULL; | |
1604 | kfree(vdev->pci_config_map); | |
1605 | vdev->pci_config_map = NULL; | |
1606 | kfree(vdev->msi_perm); | |
1607 | vdev->msi_perm = NULL; | |
1608 | } | |
1609 | ||
180b1381 AW |
1610 | /* |
1611 | * Find the remaining number of bytes in a dword that match the given | |
1612 | * position. Stop at either the end of the capability or the dword boundary. | |
1613 | */ | |
1614 | static size_t vfio_pci_cap_remaining_dword(struct vfio_pci_device *vdev, | |
1615 | loff_t pos) | |
1616 | { | |
1617 | u8 cap = vdev->pci_config_map[pos]; | |
1618 | size_t i; | |
1619 | ||
1620 | for (i = 1; (pos + i) % 4 && vdev->pci_config_map[pos + i] == cap; i++) | |
1621 | /* nop */; | |
1622 | ||
1623 | return i; | |
1624 | } | |
1625 | ||
89e1f7d4 AW |
1626 | static ssize_t vfio_config_do_rw(struct vfio_pci_device *vdev, char __user *buf, |
1627 | size_t count, loff_t *ppos, bool iswrite) | |
1628 | { | |
1629 | struct pci_dev *pdev = vdev->pdev; | |
1630 | struct perm_bits *perm; | |
1631 | __le32 val = 0; | |
1632 | int cap_start = 0, offset; | |
1633 | u8 cap_id; | |
180b1381 | 1634 | ssize_t ret; |
89e1f7d4 | 1635 | |
180b1381 AW |
1636 | if (*ppos < 0 || *ppos >= pdev->cfg_size || |
1637 | *ppos + count > pdev->cfg_size) | |
89e1f7d4 AW |
1638 | return -EFAULT; |
1639 | ||
1640 | /* | |
180b1381 AW |
1641 | * Chop accesses into aligned chunks containing no more than a |
1642 | * single capability. Caller increments to the next chunk. | |
89e1f7d4 | 1643 | */ |
180b1381 AW |
1644 | count = min(count, vfio_pci_cap_remaining_dword(vdev, *ppos)); |
1645 | if (count >= 4 && !(*ppos % 4)) | |
1646 | count = 4; | |
1647 | else if (count >= 2 && !(*ppos % 2)) | |
1648 | count = 2; | |
1649 | else | |
1650 | count = 1; | |
89e1f7d4 | 1651 | |
180b1381 | 1652 | ret = count; |
89e1f7d4 | 1653 | |
180b1381 | 1654 | cap_id = vdev->pci_config_map[*ppos]; |
89e1f7d4 | 1655 | |
89e1f7d4 | 1656 | if (cap_id == PCI_CAP_ID_INVALID) { |
a7d1ea1c AW |
1657 | perm = &unassigned_perms; |
1658 | cap_start = *ppos; | |
345d7104 AW |
1659 | } else if (cap_id == PCI_CAP_ID_INVALID_VIRT) { |
1660 | perm = &virt_perms; | |
1661 | cap_start = *ppos; | |
89e1f7d4 | 1662 | } else { |
a7d1ea1c AW |
1663 | if (*ppos >= PCI_CFG_SPACE_SIZE) { |
1664 | WARN_ON(cap_id > PCI_EXT_CAP_ID_MAX); | |
89e1f7d4 | 1665 | |
a7d1ea1c AW |
1666 | perm = &ecap_perms[cap_id]; |
1667 | cap_start = vfio_find_cap_start(vdev, *ppos); | |
1668 | } else { | |
1669 | WARN_ON(cap_id > PCI_CAP_ID_MAX); | |
89e1f7d4 | 1670 | |
a7d1ea1c | 1671 | perm = &cap_perms[cap_id]; |
89e1f7d4 | 1672 | |
a7d1ea1c AW |
1673 | if (cap_id == PCI_CAP_ID_MSI) |
1674 | perm = vdev->msi_perm; | |
89e1f7d4 | 1675 | |
a7d1ea1c AW |
1676 | if (cap_id > PCI_CAP_ID_BASIC) |
1677 | cap_start = vfio_find_cap_start(vdev, *ppos); | |
1678 | } | |
89e1f7d4 AW |
1679 | } |
1680 | ||
1681 | WARN_ON(!cap_start && cap_id != PCI_CAP_ID_BASIC); | |
1682 | WARN_ON(cap_start > *ppos); | |
1683 | ||
1684 | offset = *ppos - cap_start; | |
1685 | ||
1686 | if (iswrite) { | |
1687 | if (!perm->writefn) | |
1688 | return ret; | |
1689 | ||
1690 | if (copy_from_user(&val, buf, count)) | |
1691 | return -EFAULT; | |
1692 | ||
1693 | ret = perm->writefn(vdev, *ppos, count, perm, offset, val); | |
1694 | } else { | |
1695 | if (perm->readfn) { | |
1696 | ret = perm->readfn(vdev, *ppos, count, | |
1697 | perm, offset, &val); | |
1698 | if (ret < 0) | |
1699 | return ret; | |
1700 | } | |
1701 | ||
1702 | if (copy_to_user(buf, &val, count)) | |
1703 | return -EFAULT; | |
1704 | } | |
1705 | ||
1706 | return ret; | |
1707 | } | |
1708 | ||
906ee99d AW |
1709 | ssize_t vfio_pci_config_rw(struct vfio_pci_device *vdev, char __user *buf, |
1710 | size_t count, loff_t *ppos, bool iswrite) | |
89e1f7d4 AW |
1711 | { |
1712 | size_t done = 0; | |
1713 | int ret = 0; | |
1714 | loff_t pos = *ppos; | |
1715 | ||
1716 | pos &= VFIO_PCI_OFFSET_MASK; | |
1717 | ||
89e1f7d4 | 1718 | while (count) { |
180b1381 | 1719 | ret = vfio_config_do_rw(vdev, buf, count, &pos, iswrite); |
89e1f7d4 AW |
1720 | if (ret < 0) |
1721 | return ret; | |
1722 | ||
1723 | count -= ret; | |
1724 | done += ret; | |
1725 | buf += ret; | |
1726 | pos += ret; | |
1727 | } | |
1728 | ||
1729 | *ppos += done; | |
1730 | ||
1731 | return done; | |
1732 | } |