Commit | Line | Data |
---|---|---|
77bd7415 LV |
1 | /* |
2 | * PCI Error Recovery Driver for RPA-compliant PPC64 platform. | |
3c8c90ab LV |
3 | * Copyright IBM Corp. 2004 2005 |
4 | * Copyright Linas Vepstas <linas@linas.org> 2004, 2005 | |
77bd7415 LV |
5 | * |
6 | * All rights reserved. | |
7 | * | |
8 | * This program is free software; you can redistribute it and/or modify | |
9 | * it under the terms of the GNU General Public License as published by | |
10 | * the Free Software Foundation; either version 2 of the License, or (at | |
11 | * your option) any later version. | |
12 | * | |
13 | * This program is distributed in the hope that it will be useful, but | |
14 | * WITHOUT ANY WARRANTY; without even the implied warranty of | |
15 | * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or | |
16 | * NON INFRINGEMENT. See the GNU General Public License for more | |
17 | * details. | |
18 | * | |
19 | * You should have received a copy of the GNU General Public License | |
20 | * along with this program; if not, write to the Free Software | |
21 | * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. | |
22 | * | |
3c8c90ab | 23 | * Send comments and feedback to Linas Vepstas <linas@austin.ibm.com> |
77bd7415 LV |
24 | */ |
25 | #include <linux/delay.h> | |
77bd7415 | 26 | #include <linux/interrupt.h> |
ac325acd | 27 | #include <linux/irq.h> |
feadf7c0 | 28 | #include <linux/module.h> |
77bd7415 LV |
29 | #include <linux/pci.h> |
30 | #include <asm/eeh.h> | |
31 | #include <asm/eeh_event.h> | |
32 | #include <asm/ppc-pci.h> | |
33 | #include <asm/pci-bridge.h> | |
34 | #include <asm/prom.h> | |
35 | #include <asm/rtas.h> | |
36 | ||
/*
 * Bookkeeping handed to eeh_rmv_device() as its userdata argument while
 * devices are being removed ahead of a reset.
 *
 * NOTE: the field order is relied upon by a positional initializer in
 * eeh_handle_normal_event(); do not reorder.
 */
struct eeh_rmv_data {
	/* VF edevs queued by eeh_rmv_device() through edev->rmv_entry */
	struct list_head removed_vf_list;
	/* Incremented by eeh_rmv_device() for every device it goes on to remove */
	int removed_dev_count;
};
41 | ||
30424e38 SB |
42 | static int eeh_result_priority(enum pci_ers_result result) |
43 | { | |
44 | switch (result) { | |
45 | case PCI_ERS_RESULT_NONE: | |
46 | return 1; | |
47 | case PCI_ERS_RESULT_NO_AER_DRIVER: | |
48 | return 2; | |
49 | case PCI_ERS_RESULT_RECOVERED: | |
50 | return 3; | |
51 | case PCI_ERS_RESULT_CAN_RECOVER: | |
52 | return 4; | |
53 | case PCI_ERS_RESULT_DISCONNECT: | |
54 | return 5; | |
55 | case PCI_ERS_RESULT_NEED_RESET: | |
56 | return 6; | |
57 | default: | |
58 | WARN_ONCE(1, "Unknown pci_ers_result value: %d\n", (int)result); | |
59 | return 0; | |
60 | } | |
61 | }; | |
62 | ||
c36c5ffd | 63 | static const char *pci_ers_result_name(enum pci_ers_result result) |
20b34497 SB |
64 | { |
65 | switch (result) { | |
66 | case PCI_ERS_RESULT_NONE: | |
67 | return "none"; | |
68 | case PCI_ERS_RESULT_CAN_RECOVER: | |
69 | return "can recover"; | |
70 | case PCI_ERS_RESULT_NEED_RESET: | |
71 | return "need reset"; | |
72 | case PCI_ERS_RESULT_DISCONNECT: | |
73 | return "disconnect"; | |
74 | case PCI_ERS_RESULT_RECOVERED: | |
75 | return "recovered"; | |
76 | case PCI_ERS_RESULT_NO_AER_DRIVER: | |
77 | return "no AER driver"; | |
78 | default: | |
79 | WARN_ONCE(1, "Unknown result type: %d\n", (int)result); | |
80 | return "unknown"; | |
81 | } | |
82 | }; | |
83 | ||
84 | static __printf(2, 3) void eeh_edev_info(const struct eeh_dev *edev, | |
85 | const char *fmt, ...) | |
86 | { | |
87 | struct va_format vaf; | |
88 | va_list args; | |
89 | ||
90 | va_start(args, fmt); | |
91 | ||
92 | vaf.fmt = fmt; | |
93 | vaf.va = &args; | |
94 | ||
95 | printk(KERN_INFO "EEH: PE#%x (PCI %s): %pV\n", edev->pe_config_addr, | |
96 | edev->pdev ? dev_name(&edev->pdev->dev) : "none", &vaf); | |
97 | ||
98 | va_end(args); | |
99 | } | |
100 | ||
30424e38 SB |
101 | static enum pci_ers_result pci_ers_merge_result(enum pci_ers_result old, |
102 | enum pci_ers_result new) | |
103 | { | |
104 | if (eeh_result_priority(new) > eeh_result_priority(old)) | |
105 | return new; | |
106 | return old; | |
107 | } | |
108 | ||
e2b810d5 SB |
109 | static bool eeh_dev_removed(struct eeh_dev *edev) |
110 | { | |
111 | return !edev || (edev->mode & EEH_DEV_REMOVED); | |
112 | } | |
113 | ||
114 | static bool eeh_edev_actionable(struct eeh_dev *edev) | |
115 | { | |
116 | return (edev->pdev && !eeh_dev_removed(edev) && | |
117 | !eeh_pe_passed(edev->pe)); | |
118 | } | |
119 | ||
feadf7c0 GS |
120 | /** |
121 | * eeh_pcid_get - Get the PCI device driver | |
122 | * @pdev: PCI device | |
123 | * | |
124 | * The function is used to retrieve the PCI device driver for | |
125 | * the indicated PCI device. Besides, we will increase the reference | |
126 | * of the PCI device driver to prevent that being unloaded on | |
127 | * the fly. Otherwise, kernel crash would be seen. | |
128 | */ | |
129 | static inline struct pci_driver *eeh_pcid_get(struct pci_dev *pdev) | |
130 | { | |
131 | if (!pdev || !pdev->driver) | |
132 | return NULL; | |
133 | ||
134 | if (!try_module_get(pdev->driver->driver.owner)) | |
135 | return NULL; | |
136 | ||
137 | return pdev->driver; | |
138 | } | |
139 | ||
140 | /** | |
141 | * eeh_pcid_put - Dereference on the PCI device driver | |
142 | * @pdev: PCI device | |
143 | * | |
144 | * The function is called to do dereference on the PCI device | |
145 | * driver of the indicated PCI device. | |
146 | */ | |
147 | static inline void eeh_pcid_put(struct pci_dev *pdev) | |
148 | { | |
149 | if (!pdev || !pdev->driver) | |
150 | return; | |
151 | ||
152 | module_put(pdev->driver->driver.owner); | |
153 | } | |
154 | ||
8535ef05 | 155 | /** |
29f8bf1b GS |
156 | * eeh_disable_irq - Disable interrupt for the recovering device |
157 | * @dev: PCI device | |
158 | * | |
159 | * This routine must be called when reporting temporary or permanent | |
160 | * error to the particular PCI device to disable interrupt of that | |
161 | * device. If the device has enabled MSI or MSI-X interrupt, we needn't | |
162 | * do real work because EEH should freeze DMA transfers for those PCI | |
163 | * devices encountering EEH errors, which includes MSI or MSI-X. | |
8535ef05 | 164 | */ |
010acfa1 | 165 | static void eeh_disable_irq(struct eeh_dev *edev) |
8535ef05 | 166 | { |
8535ef05 MM |
167 | /* Don't disable MSI and MSI-X interrupts. They are |
168 | * effectively disabled by the DMA Stopped state | |
169 | * when an EEH error occurs. | |
29f8bf1b | 170 | */ |
010acfa1 | 171 | if (edev->pdev->msi_enabled || edev->pdev->msix_enabled) |
8535ef05 MM |
172 | return; |
173 | ||
010acfa1 | 174 | if (!irq_has_action(edev->pdev->irq)) |
8535ef05 MM |
175 | return; |
176 | ||
dbbceee1 | 177 | edev->mode |= EEH_DEV_IRQ_DISABLED; |
010acfa1 | 178 | disable_irq_nosync(edev->pdev->irq); |
8535ef05 MM |
179 | } |
180 | ||
181 | /** | |
29f8bf1b GS |
182 | * eeh_enable_irq - Enable interrupt for the recovering device |
183 | * @dev: PCI device | |
184 | * | |
185 | * This routine must be called to enable interrupt while failed | |
186 | * device could be resumed. | |
8535ef05 | 187 | */ |
010acfa1 | 188 | static void eeh_enable_irq(struct eeh_dev *edev) |
8535ef05 | 189 | { |
dbbceee1 GS |
190 | if ((edev->mode) & EEH_DEV_IRQ_DISABLED) { |
191 | edev->mode &= ~EEH_DEV_IRQ_DISABLED; | |
b8a9a11b TG |
192 | /* |
193 | * FIXME !!!!! | |
194 | * | |
195 | * This is just ass backwards. This maze has | |
196 | * unbalanced irq_enable/disable calls. So instead of | |
197 | * finding the root cause it works around the warning | |
198 | * in the irq_enable code by conditionally calling | |
199 | * into it. | |
200 | * | |
201 | * That's just wrong.The warning in the core code is | |
027dfac6 | 202 | * there to tell people to fix their asymmetries in |
b8a9a11b TG |
203 | * their own code, not by abusing the core information |
204 | * to avoid it. | |
205 | * | |
206 | * I so wish that the assymetry would be the other way | |
207 | * round and a few more irq_disable calls render that | |
208 | * shit unusable forever. | |
209 | * | |
210 | * tglx | |
211 | */ | |
010acfa1 SB |
212 | if (irqd_irq_disabled(irq_get_irq_data(edev->pdev->irq))) |
213 | enable_irq(edev->pdev->irq); | |
57310c3c | 214 | } |
8535ef05 MM |
215 | } |
216 | ||
d6c4932f | 217 | static void *eeh_dev_save_state(struct eeh_dev *edev, void *userdata) |
5cfb20b9 | 218 | { |
5cfb20b9 GS |
219 | struct pci_dev *pdev; |
220 | ||
221 | if (!edev) | |
222 | return NULL; | |
223 | ||
5a0cdbfd GS |
224 | /* |
225 | * We cannot access the config space on some adapters. | |
226 | * Otherwise, it will cause fenced PHB. We don't save | |
227 | * the content in their config space and will restore | |
228 | * from the initial config space saved when the EEH | |
229 | * device is created. | |
230 | */ | |
231 | if (edev->pe && (edev->pe->state & EEH_PE_CFG_RESTRICTED)) | |
232 | return NULL; | |
233 | ||
5cfb20b9 GS |
234 | pdev = eeh_dev_to_pci_dev(edev); |
235 | if (!pdev) | |
236 | return NULL; | |
237 | ||
238 | pci_save_state(pdev); | |
239 | return NULL; | |
240 | } | |
241 | ||
47cc8c1c SB |
242 | static void eeh_set_channel_state(struct eeh_pe *root, enum pci_channel_state s) |
243 | { | |
244 | struct eeh_pe *pe; | |
245 | struct eeh_dev *edev, *tmp; | |
246 | ||
247 | eeh_for_each_pe(root, pe) | |
248 | eeh_pe_for_each_dev(pe, edev, tmp) | |
249 | if (eeh_edev_actionable(edev)) | |
250 | edev->pdev->error_state = s; | |
251 | } | |
252 | ||
010acfa1 SB |
253 | static void eeh_set_irq_state(struct eeh_pe *root, bool enable) |
254 | { | |
255 | struct eeh_pe *pe; | |
256 | struct eeh_dev *edev, *tmp; | |
257 | ||
258 | eeh_for_each_pe(root, pe) { | |
259 | eeh_pe_for_each_dev(pe, edev, tmp) { | |
260 | if (!eeh_edev_actionable(edev)) | |
261 | continue; | |
262 | ||
263 | if (!eeh_pcid_get(edev->pdev)) | |
264 | continue; | |
265 | ||
266 | if (enable) | |
267 | eeh_enable_irq(edev); | |
268 | else | |
269 | eeh_disable_irq(edev); | |
270 | ||
271 | eeh_pcid_put(edev->pdev); | |
272 | } | |
273 | } | |
274 | } | |
275 | ||
20b34497 SB |
276 | typedef enum pci_ers_result (*eeh_report_fn)(struct eeh_dev *, |
277 | struct pci_driver *); | |
278 | static void eeh_pe_report_edev(struct eeh_dev *edev, eeh_report_fn fn, | |
279 | enum pci_ers_result *result) | |
77bd7415 | 280 | { |
feadf7c0 | 281 | struct pci_driver *driver; |
20b34497 SB |
282 | enum pci_ers_result new_result; |
283 | ||
bcbe3730 SB |
284 | if (!edev->pdev) { |
285 | eeh_edev_info(edev, "no device"); | |
286 | return; | |
287 | } | |
20b34497 SB |
288 | device_lock(&edev->pdev->dev); |
289 | if (eeh_edev_actionable(edev)) { | |
290 | driver = eeh_pcid_get(edev->pdev); | |
291 | ||
292 | if (!driver) | |
293 | eeh_edev_info(edev, "no driver"); | |
294 | else if (!driver->err_handler) | |
295 | eeh_edev_info(edev, "driver not EEH aware"); | |
296 | else if (edev->mode & EEH_DEV_NO_HANDLER) | |
297 | eeh_edev_info(edev, "driver bound too late"); | |
298 | else { | |
299 | new_result = fn(edev, driver); | |
300 | eeh_edev_info(edev, "%s driver reports: '%s'", | |
301 | driver->name, | |
302 | pci_ers_result_name(new_result)); | |
303 | if (result) | |
304 | *result = pci_ers_merge_result(*result, | |
305 | new_result); | |
306 | } | |
307 | if (driver) | |
308 | eeh_pcid_put(edev->pdev); | |
309 | } else { | |
310 | eeh_edev_info(edev, "not actionable (%d,%d,%d)", !!edev->pdev, | |
311 | !eeh_dev_removed(edev), !eeh_pe_passed(edev->pe)); | |
312 | } | |
313 | device_unlock(&edev->pdev->dev); | |
314 | } | |
77bd7415 | 315 | |
20b34497 SB |
316 | static void eeh_pe_report(const char *name, struct eeh_pe *root, |
317 | eeh_report_fn fn, enum pci_ers_result *result) | |
318 | { | |
319 | struct eeh_pe *pe; | |
320 | struct eeh_dev *edev, *tmp; | |
f0295e04 | 321 | |
20b34497 SB |
322 | pr_info("EEH: Beginning: '%s'\n", name); |
323 | eeh_for_each_pe(root, pe) eeh_pe_for_each_dev(pe, edev, tmp) | |
324 | eeh_pe_report_edev(edev, fn, result); | |
325 | if (result) | |
326 | pr_info("EEH: Finished:'%s' with aggregate recovery state:'%s'\n", | |
327 | name, pci_ers_result_name(*result)); | |
328 | else | |
329 | pr_info("EEH: Finished:'%s'", name); | |
330 | } | |
77bd7415 | 331 | |
20b34497 SB |
332 | /** |
333 | * eeh_report_error - Report pci error to each device driver | |
334 | * @edev: eeh device | |
335 | * @driver: device's PCI driver | |
336 | * | |
337 | * Report an EEH error to each device driver. | |
338 | */ | |
339 | static enum pci_ers_result eeh_report_error(struct eeh_dev *edev, | |
340 | struct pci_driver *driver) | |
341 | { | |
342 | enum pci_ers_result rc; | |
343 | struct pci_dev *dev = edev->pdev; | |
77bd7415 | 344 | |
20b34497 SB |
345 | if (!driver->err_handler->error_detected) |
346 | return PCI_ERS_RESULT_NONE; | |
77bd7415 | 347 | |
20b34497 SB |
348 | eeh_edev_info(edev, "Invoking %s->error_detected(IO frozen)", |
349 | driver->name); | |
29f8bf1b | 350 | rc = driver->err_handler->error_detected(dev, pci_channel_io_frozen); |
2a50f144 | 351 | |
67086e32 | 352 | edev->in_error = true; |
856e1eb9 | 353 | pci_uevent_ers(dev, PCI_ERS_RESULT_NONE); |
20b34497 | 354 | return rc; |
6a1ca373 LV |
355 | } |
356 | ||
357 | /** | |
29f8bf1b | 358 | * eeh_report_mmio_enabled - Tell drivers that MMIO has been enabled |
20b34497 SB |
359 | * @edev: eeh device |
360 | * @driver: device's PCI driver | |
6a1ca373 | 361 | * |
638799b3 | 362 | * Tells each device driver that IO ports, MMIO and config space I/O |
20b34497 | 363 | * are now enabled. |
6a1ca373 | 364 | */ |
20b34497 SB |
365 | static enum pci_ers_result eeh_report_mmio_enabled(struct eeh_dev *edev, |
366 | struct pci_driver *driver) | |
6a1ca373 | 367 | { |
20b34497 SB |
368 | if (!driver->err_handler->mmio_enabled) |
369 | return PCI_ERS_RESULT_NONE; | |
370 | eeh_edev_info(edev, "Invoking %s->mmio_enabled()", driver->name); | |
371 | return driver->err_handler->mmio_enabled(edev->pdev); | |
77bd7415 LV |
372 | } |
373 | ||
cb5b5624 | 374 | /** |
29f8bf1b | 375 | * eeh_report_reset - Tell device that slot has been reset |
20b34497 SB |
376 | * @edev: eeh device |
377 | * @driver: device's PCI driver | |
29f8bf1b GS |
378 | * |
379 | * This routine must be called while EEH tries to reset particular | |
380 | * PCI device so that the associated PCI device driver could take | |
381 | * some actions, usually to save data the driver needs so that the | |
382 | * driver can work again while the device is recovered. | |
77bd7415 | 383 | */ |
20b34497 SB |
384 | static enum pci_ers_result eeh_report_reset(struct eeh_dev *edev, |
385 | struct pci_driver *driver) | |
77bd7415 | 386 | { |
20b34497 SB |
387 | if (!driver->err_handler->slot_reset || !edev->in_error) |
388 | return PCI_ERS_RESULT_NONE; | |
389 | eeh_edev_info(edev, "Invoking %s->slot_reset()", driver->name); | |
390 | return driver->err_handler->slot_reset(edev->pdev); | |
77bd7415 LV |
391 | } |
392 | ||
d6c4932f | 393 | static void *eeh_dev_restore_state(struct eeh_dev *edev, void *userdata) |
5cfb20b9 | 394 | { |
5cfb20b9 GS |
395 | struct pci_dev *pdev; |
396 | ||
397 | if (!edev) | |
398 | return NULL; | |
399 | ||
5a0cdbfd GS |
400 | /* |
401 | * The content in the config space isn't saved because | |
402 | * the blocked config space on some adapters. We have | |
403 | * to restore the initial saved config space when the | |
404 | * EEH device is created. | |
405 | */ | |
406 | if (edev->pe && (edev->pe->state & EEH_PE_CFG_RESTRICTED)) { | |
80e65b00 | 407 | if (list_is_last(&edev->entry, &edev->pe->edevs)) |
5a0cdbfd GS |
408 | eeh_pe_restore_bars(edev->pe); |
409 | ||
410 | return NULL; | |
411 | } | |
412 | ||
5cfb20b9 GS |
413 | pdev = eeh_dev_to_pci_dev(edev); |
414 | if (!pdev) | |
415 | return NULL; | |
416 | ||
417 | pci_restore_state(pdev); | |
418 | return NULL; | |
419 | } | |
420 | ||
cb5b5624 | 421 | /** |
29f8bf1b | 422 | * eeh_report_resume - Tell device to resume normal operations |
20b34497 SB |
423 | * @edev: eeh device |
424 | * @driver: device's PCI driver | |
29f8bf1b GS |
425 | * |
426 | * This routine must be called to notify the device driver that it | |
427 | * could resume so that the device driver can do some initialization | |
428 | * to make the recovered device work again. | |
cb5b5624 | 429 | */ |
20b34497 SB |
430 | static enum pci_ers_result eeh_report_resume(struct eeh_dev *edev, |
431 | struct pci_driver *driver) | |
77bd7415 | 432 | { |
20b34497 SB |
433 | if (!driver->err_handler->resume || !edev->in_error) |
434 | return PCI_ERS_RESULT_NONE; | |
d0e70341 | 435 | |
20b34497 SB |
436 | eeh_edev_info(edev, "Invoking %s->resume()", driver->name); |
437 | driver->err_handler->resume(edev->pdev); | |
8535ef05 | 438 | |
20b34497 | 439 | pci_uevent_ers(edev->pdev, PCI_ERS_RESULT_RECOVERED); |
856e1eb9 | 440 | #ifdef CONFIG_PCI_IOV |
521ca5a9 JA |
441 | if (eeh_ops->notify_resume && eeh_dev_to_pdn(edev)) |
442 | eeh_ops->notify_resume(eeh_dev_to_pdn(edev)); | |
856e1eb9 | 443 | #endif |
20b34497 | 444 | return PCI_ERS_RESULT_NONE; |
77bd7415 LV |
445 | } |
446 | ||
cb5b5624 | 447 | /** |
29f8bf1b | 448 | * eeh_report_failure - Tell device driver that device is dead. |
20b34497 SB |
449 | * @edev: eeh device |
450 | * @driver: device's PCI driver | |
cb5b5624 LV |
451 | * |
452 | * This informs the device driver that the device is permanently | |
453 | * dead, and that no further recovery attempts will be made on it. | |
454 | */ | |
20b34497 SB |
455 | static enum pci_ers_result eeh_report_failure(struct eeh_dev *edev, |
456 | struct pci_driver *driver) | |
77bd7415 | 457 | { |
20b34497 | 458 | enum pci_ers_result rc; |
77bd7415 | 459 | |
20b34497 SB |
460 | if (!driver->err_handler->error_detected) |
461 | return PCI_ERS_RESULT_NONE; | |
8535ef05 | 462 | |
20b34497 SB |
463 | eeh_edev_info(edev, "Invoking %s->error_detected(permanent failure)", |
464 | driver->name); | |
465 | rc = driver->err_handler->error_detected(edev->pdev, | |
466 | pci_channel_io_perm_failure); | |
70298c6e | 467 | |
20b34497 SB |
468 | pci_uevent_ers(edev->pdev, PCI_ERS_RESULT_DISCONNECT); |
469 | return rc; | |
77bd7415 LV |
470 | } |
471 | ||
bf773df9 | 472 | static void *eeh_add_virt_device(struct eeh_dev *edev) |
67086e32 WY |
473 | { |
474 | struct pci_driver *driver; | |
67086e32 WY |
475 | struct pci_dev *dev = eeh_dev_to_pci_dev(edev); |
476 | struct pci_dn *pdn = eeh_dev_to_pdn(edev); | |
477 | ||
478 | if (!(edev->physfn)) { | |
479 | pr_warn("%s: EEH dev %04x:%02x:%02x.%01x not for VF\n", | |
69672bd7 | 480 | __func__, pdn->phb->global_number, pdn->busno, |
67086e32 WY |
481 | PCI_SLOT(pdn->devfn), PCI_FUNC(pdn->devfn)); |
482 | return NULL; | |
483 | } | |
484 | ||
485 | driver = eeh_pcid_get(dev); | |
486 | if (driver) { | |
46d4be41 SB |
487 | if (driver->err_handler) { |
488 | eeh_pcid_put(dev); | |
67086e32 | 489 | return NULL; |
46d4be41 SB |
490 | } |
491 | eeh_pcid_put(dev); | |
67086e32 WY |
492 | } |
493 | ||
988fc3ba | 494 | #ifdef CONFIG_PCI_IOV |
753f6124 | 495 | pci_iov_add_virtfn(edev->physfn, pdn->vf_index); |
67086e32 WY |
496 | #endif |
497 | return NULL; | |
498 | } | |
499 | ||
/*
 * Traversal callback: remove one EEH device from the PCI subsystem
 * ahead of a reset.
 *
 * @edev:     EEH device under consideration
 * @userdata: optional struct eeh_rmv_data. When supplied, devices on a
 *            passed PE and devices whose driver provides both
 *            error_detected() and slot_reset() are kept, removals are
 *            counted, and removed VFs are queued on removed_vf_list.
 *            When NULL, every non-bridge, not-yet-removed device is
 *            removed.
 *
 * Always returns NULL.
 */
static void *eeh_rmv_device(struct eeh_dev *edev, void *userdata)
{
	struct pci_driver *driver;
	struct pci_dev *dev = eeh_dev_to_pci_dev(edev);
	struct eeh_rmv_data *rmv_data = (struct eeh_rmv_data *)userdata;

	/*
	 * Actually, we should remove the PCI bridges as well.
	 * However, that's lots of complexity to do that,
	 * particularly some of devices under the bridge might
	 * support EEH. So we just care about PCI devices for
	 * simplicity here.
	 */
	if (!dev || (dev->hdr_type == PCI_HEADER_TYPE_BRIDGE))
		return NULL;

	/*
	 * We rely on count-based pcibios_release_device() to
	 * detach permanently offlined PEs. Unfortunately, that's
	 * not reliable enough. We might have the permanently
	 * offlined PEs attached, but we needn't take care of
	 * them and their child devices.
	 */
	if (eeh_dev_removed(edev))
		return NULL;

	if (rmv_data) {
		if (eeh_pe_passed(edev->pe))
			return NULL;
		driver = eeh_pcid_get(dev);
		if (driver) {
			/*
			 * A driver providing both handlers is left bound;
			 * the module reference is dropped on either path.
			 */
			if (driver->err_handler &&
			    driver->err_handler->error_detected &&
			    driver->err_handler->slot_reset) {
				eeh_pcid_put(dev);
				return NULL;
			}
			eeh_pcid_put(dev);
		}
	}

	/* Remove it from PCI subsystem */
	pr_debug("EEH: Removing %s without EEH sensitive driver\n",
		 pci_name(dev));
	/* eeh_pe_detach_dev() later acts on devices carrying this flag */
	edev->mode |= EEH_DEV_DISCONNECTED;
	if (rmv_data)
		rmv_data->removed_dev_count++;

	if (edev->physfn) {
#ifdef CONFIG_PCI_IOV
		struct pci_dn *pdn = eeh_dev_to_pdn(edev);

		pci_iov_remove_virtfn(edev->physfn, pdn->vf_index);
		edev->pdev = NULL;

		/*
		 * We have to set the VF PE number to invalid one, which is
		 * required to plug the VF successfully.
		 */
		pdn->pe_number = IODA_INVALID_PE;
#endif
		if (rmv_data)
			list_add(&edev->rmv_entry, &rmv_data->removed_vf_list);
	} else {
		/* Non-VF devices are removed under the rescan/remove lock */
		pci_lock_rescan_remove();
		pci_stop_and_remove_bus_device(dev);
		pci_unlock_rescan_remove();
	}

	return NULL;
}
571 | ||
d6c4932f | 572 | static void *eeh_pe_detach_dev(struct eeh_pe *pe, void *userdata) |
f5c57710 | 573 | { |
f5c57710 GS |
574 | struct eeh_dev *edev, *tmp; |
575 | ||
576 | eeh_pe_for_each_dev(pe, edev, tmp) { | |
577 | if (!(edev->mode & EEH_DEV_DISCONNECTED)) | |
578 | continue; | |
579 | ||
580 | edev->mode &= ~(EEH_DEV_DISCONNECTED | EEH_DEV_IRQ_DISABLED); | |
581 | eeh_rmv_from_parent_pe(edev); | |
582 | } | |
583 | ||
584 | return NULL; | |
585 | } | |
586 | ||
78954700 GS |
587 | /* |
588 | * Explicitly clear PE's frozen state for PowerNV where | |
589 | * we have frozen PE until BAR restore is completed. It's | |
590 | * harmless to clear it for pSeries. To be consistent with | |
591 | * PE reset (for 3 times), we try to clear the frozen state | |
592 | * for 3 times as well. | |
593 | */ | |
d6c4932f | 594 | static void *__eeh_clear_pe_frozen_state(struct eeh_pe *pe, void *flag) |
78954700 | 595 | { |
f05fea5b | 596 | bool clear_sw_state = *(bool *)flag; |
c9dd0143 | 597 | int i, rc = 1; |
78954700 | 598 | |
c9dd0143 | 599 | for (i = 0; rc && i < 3; i++) |
5cfb20b9 | 600 | rc = eeh_unfreeze_pe(pe, clear_sw_state); |
78954700 | 601 | |
c9dd0143 | 602 | /* Stop immediately on any errors */ |
2c665992 | 603 | if (rc) { |
c9dd0143 GS |
604 | pr_warn("%s: Failure %d unfreezing PHB#%x-PE#%x\n", |
605 | __func__, rc, pe->phb->global_number, pe->addr); | |
2c665992 GS |
606 | return (void *)pe; |
607 | } | |
608 | ||
609 | return NULL; | |
610 | } | |
611 | ||
5cfb20b9 GS |
612 | static int eeh_clear_pe_frozen_state(struct eeh_pe *pe, |
613 | bool clear_sw_state) | |
2c665992 GS |
614 | { |
615 | void *rc; | |
616 | ||
5cfb20b9 | 617 | rc = eeh_pe_traverse(pe, __eeh_clear_pe_frozen_state, &clear_sw_state); |
2c665992 | 618 | if (!rc) |
78954700 GS |
619 | eeh_pe_state_clear(pe, EEH_PE_ISOLATED); |
620 | ||
2c665992 | 621 | return rc ? -EIO : 0; |
78954700 GS |
622 | } |
623 | ||
5cfb20b9 GS |
624 | int eeh_pe_reset_and_recover(struct eeh_pe *pe) |
625 | { | |
2efc771f | 626 | int ret; |
5cfb20b9 GS |
627 | |
628 | /* Bail if the PE is being recovered */ | |
629 | if (pe->state & EEH_PE_RECOVERING) | |
630 | return 0; | |
631 | ||
632 | /* Put the PE into recovery mode */ | |
633 | eeh_pe_state_mark(pe, EEH_PE_RECOVERING); | |
634 | ||
635 | /* Save states */ | |
636 | eeh_pe_dev_traverse(pe, eeh_dev_save_state, NULL); | |
637 | ||
5cfb20b9 | 638 | /* Issue reset */ |
6654c936 | 639 | ret = eeh_pe_reset_full(pe); |
5cfb20b9 | 640 | if (ret) { |
28bf36f9 | 641 | eeh_pe_state_clear(pe, EEH_PE_RECOVERING); |
5cfb20b9 GS |
642 | return ret; |
643 | } | |
5cfb20b9 GS |
644 | |
645 | /* Unfreeze the PE */ | |
646 | ret = eeh_clear_pe_frozen_state(pe, true); | |
647 | if (ret) { | |
648 | eeh_pe_state_clear(pe, EEH_PE_RECOVERING); | |
649 | return ret; | |
650 | } | |
651 | ||
5cfb20b9 GS |
652 | /* Restore device state */ |
653 | eeh_pe_dev_traverse(pe, eeh_dev_restore_state, NULL); | |
654 | ||
5cfb20b9 GS |
655 | /* Clear recovery mode */ |
656 | eeh_pe_state_clear(pe, EEH_PE_RECOVERING); | |
657 | ||
658 | return 0; | |
659 | } | |
660 | ||
77bd7415 | 661 | /** |
29f8bf1b | 662 | * eeh_reset_device - Perform actual reset of a pci slot |
5fd13460 | 663 | * @driver_eeh_aware: Does the device's driver provide EEH support? |
9b3c76f0 | 664 | * @pe: EEH PE |
29f8bf1b | 665 | * @bus: PCI bus corresponding to the isolcated slot |
5fd13460 | 666 | * @rmv_data: Optional, list to record removed devices |
77bd7415 | 667 | * |
29f8bf1b GS |
668 | * This routine must be called to do reset on the indicated PE. |
669 | * During the reset, udev might be invoked because those affected | |
670 | * PCI devices will be removed and then added. | |
77bd7415 | 671 | */ |
67086e32 | 672 | static int eeh_reset_device(struct eeh_pe *pe, struct pci_bus *bus, |
5fd13460 SB |
673 | struct eeh_rmv_data *rmv_data, |
674 | bool driver_eeh_aware) | |
77bd7415 | 675 | { |
edfd17ff | 676 | time64_t tstamp; |
67086e32 WY |
677 | int cnt, rc; |
678 | struct eeh_dev *edev; | |
42405456 LV |
679 | |
680 | /* pcibios will clear the counter; save the value */ | |
9b3c76f0 | 681 | cnt = pe->freeze_count; |
5a71978e | 682 | tstamp = pe->tstamp; |
42405456 | 683 | |
20ee6a97 GS |
684 | /* |
685 | * We don't remove the corresponding PE instances because | |
686 | * we need the information afterwords. The attached EEH | |
687 | * devices are expected to be attached soon when calling | |
bd251b89 | 688 | * into pci_hp_add_devices(). |
20ee6a97 | 689 | */ |
f5c57710 | 690 | eeh_pe_state_mark(pe, EEH_PE_KEEP); |
54048cf8 | 691 | if (driver_eeh_aware || (pe->type & EEH_PE_VF)) { |
cca0e542 | 692 | eeh_pe_dev_traverse(pe, eeh_rmv_device, rmv_data); |
54048cf8 SB |
693 | } else { |
694 | pci_lock_rescan_remove(); | |
695 | pci_hp_remove_devices(bus); | |
696 | pci_unlock_rescan_remove(); | |
1c2042c8 | 697 | } |
77bd7415 | 698 | |
d0914f50 GS |
699 | /* |
700 | * Reset the pci controller. (Asserts RST#; resets config space). | |
b6495c0c | 701 | * Reconfigure bridges and devices. Don't try to bring the system |
29f8bf1b | 702 | * up if the reset failed for some reason. |
d0914f50 GS |
703 | * |
704 | * During the reset, it's very dangerous to have uncontrolled PCI | |
705 | * config accesses. So we prefer to block them. However, controlled | |
706 | * PCI config accesses initiated from EEH itself are allowed. | |
29f8bf1b | 707 | */ |
6654c936 | 708 | rc = eeh_pe_reset_full(pe); |
28bf36f9 | 709 | if (rc) |
b6495c0c | 710 | return rc; |
77bd7415 | 711 | |
1c2042c8 RW |
712 | pci_lock_rescan_remove(); |
713 | ||
9b3c76f0 GS |
714 | /* Restore PE */ |
715 | eeh_ops->configure_bridge(pe); | |
716 | eeh_pe_restore_bars(pe); | |
77bd7415 | 717 | |
dc9c41bd AD |
718 | /* Clear frozen state */ |
719 | rc = eeh_clear_pe_frozen_state(pe, false); | |
409bf7f8 AD |
720 | if (rc) { |
721 | pci_unlock_rescan_remove(); | |
dc9c41bd | 722 | return rc; |
409bf7f8 | 723 | } |
78954700 | 724 | |
77bd7415 | 725 | /* Give the system 5 seconds to finish running the user-space |
a84f273c GS |
726 | * hotplug shutdown scripts, e.g. ifdown for ethernet. Yes, |
727 | * this is a hack, but if we don't do this, and try to bring | |
728 | * the device up before the scripts have taken it down, | |
77bd7415 LV |
729 | * potentially weird things happen. |
730 | */ | |
1c5c533b | 731 | if (!driver_eeh_aware || rmv_data->removed_dev_count) { |
54048cf8 SB |
732 | pr_info("EEH: Sleep 5s ahead of %s hotplug\n", |
733 | (driver_eeh_aware ? "partial" : "complete")); | |
29f8bf1b | 734 | ssleep(5); |
f5c57710 GS |
735 | |
736 | /* | |
737 | * The EEH device is still connected with its parent | |
738 | * PE. We should disconnect it so the binding can be | |
739 | * rebuilt when adding PCI devices. | |
740 | */ | |
80e65b00 | 741 | edev = list_first_entry(&pe->edevs, struct eeh_dev, entry); |
f5c57710 | 742 | eeh_pe_traverse(pe, eeh_pe_detach_dev, NULL); |
a3aa256b | 743 | if (pe->type & EEH_PE_VF) { |
bf773df9 | 744 | eeh_add_virt_device(edev); |
a3aa256b | 745 | } else { |
54048cf8 SB |
746 | if (!driver_eeh_aware) |
747 | eeh_pe_state_clear(pe, EEH_PE_PRI_BUS); | |
bd251b89 | 748 | pci_hp_add_devices(bus); |
a3aa256b | 749 | } |
77bd7415 | 750 | } |
f5c57710 | 751 | eeh_pe_state_clear(pe, EEH_PE_KEEP); |
5a71978e GS |
752 | |
753 | pe->tstamp = tstamp; | |
9b3c76f0 | 754 | pe->freeze_count = cnt; |
b6495c0c | 755 | |
1c2042c8 | 756 | pci_unlock_rescan_remove(); |
b6495c0c | 757 | return 0; |
77bd7415 LV |
758 | } |
759 | ||
/*
 * The longest amount of time to wait for a pci device
 * to come back on line, in seconds. eeh_handle_normal_event()
 * multiplies this by 1000 before handing it to eeh_wait_state().
 */
#define MAX_WAIT_FOR_RECOVERY 300
77bd7415 | 764 | |
c0b64978 RC |
765 | /** |
766 | * eeh_handle_normal_event - Handle EEH events on a specific PE | |
37fd8125 SB |
767 | * @pe: EEH PE - which should not be used after we return, as it may |
768 | * have been invalidated. | |
c0b64978 RC |
769 | * |
770 | * Attempts to recover the given PE. If recovery fails or the PE has failed | |
771 | * too many times, remove the PE. | |
772 | * | |
68701780 SB |
773 | * While PHB detects address or data parity errors on particular PCI |
774 | * slot, the associated PE will be frozen. Besides, DMA's occurring | |
775 | * to wild addresses (which usually happen due to bugs in device | |
776 | * drivers or in PCI adapter firmware) can cause EEH error. #SERR, | |
777 | * #PERR or other misc PCI-related errors also can trigger EEH errors. | |
778 | * | |
779 | * Recovery process consists of unplugging the device driver (which | |
780 | * generated hotplug events to userspace), then issuing a PCI #RST to | |
781 | * the device, then reconfiguring the PCI config space for all bridges | |
782 | * & devices under this slot, and then finally restarting the device | |
783 | * drivers (which cause a second set of hotplug events to go out to | |
784 | * userspace). | |
c0b64978 | 785 | */ |
37fd8125 | 786 | void eeh_handle_normal_event(struct eeh_pe *pe) |
77bd7415 | 787 | { |
cd95f804 | 788 | struct pci_bus *bus; |
67086e32 | 789 | struct eeh_dev *edev, *tmp; |
665012c5 | 790 | struct eeh_pe *tmp_pe; |
b6495c0c | 791 | int rc = 0; |
18eb3b39 | 792 | enum pci_ers_result result = PCI_ERS_RESULT_NONE; |
1c5c533b SB |
793 | struct eeh_rmv_data rmv_data = |
794 | {LIST_HEAD_INIT(rmv_data.removed_vf_list), 0}; | |
77bd7415 | 795 | |
cd95f804 SB |
796 | bus = eeh_pe_bus_get(pe); |
797 | if (!bus) { | |
1f52f176 | 798 | pr_err("%s: Cannot find PCI bus for PHB#%x-PE#%x\n", |
9b3c76f0 | 799 | __func__, pe->phb->global_number, pe->addr); |
37fd8125 | 800 | return; |
77bd7415 LV |
801 | } |
802 | ||
37fd8125 SB |
803 | eeh_pe_state_mark(pe, EEH_PE_RECOVERING); |
804 | ||
5a71978e | 805 | eeh_pe_update_time_stamp(pe); |
9b3c76f0 | 806 | pe->freeze_count++; |
c0b64978 | 807 | if (pe->freeze_count > eeh_max_freezes) { |
796b9f5b | 808 | pr_err("EEH: PHB#%x-PE#%x has failed %d times in the last hour and has been permanently disabled.\n", |
c0b64978 RC |
809 | pe->phb->global_number, pe->addr, |
810 | pe->freeze_count); | |
b90484ec | 811 | result = PCI_ERS_RESULT_DISCONNECT; |
c0b64978 | 812 | } |
77bd7415 LV |
813 | |
814 | /* Walk the various device drivers attached to this slot through | |
815 | * a reset sequence, giving each an opportunity to do what it needs | |
816 | * to accomplish the reset. Each child gets a report of the | |
817 | * status ... if any child can't handle the reset, then the entire | |
818 | * slot is dlpar removed and added. | |
8234fced GS |
819 | * |
820 | * When the PHB is fenced, we have to issue a reset to recover from | |
821 | * the error. Override the result if necessary to have partially | |
822 | * hotplug for this case. | |
77bd7415 | 823 | */ |
b90484ec SB |
824 | if (result != PCI_ERS_RESULT_DISCONNECT) { |
825 | pr_warn("EEH: This PCI device has failed %d times in the last hour and will be permanently disabled after %d failures.\n", | |
826 | pe->freeze_count, eeh_max_freezes); | |
827 | pr_info("EEH: Notify device drivers to shutdown\n"); | |
828 | eeh_set_channel_state(pe, pci_channel_io_frozen); | |
829 | eeh_set_irq_state(pe, false); | |
830 | eeh_pe_report("error_detected(IO frozen)", pe, | |
831 | eeh_report_error, &result); | |
832 | if ((pe->type & EEH_PE_PHB) && | |
833 | result != PCI_ERS_RESULT_NONE && | |
834 | result != PCI_ERS_RESULT_NEED_RESET) | |
835 | result = PCI_ERS_RESULT_NEED_RESET; | |
836 | } | |
77bd7415 | 837 | |
5f1a7c81 | 838 | /* Get the current PCI slot state. This can take a long time, |
2ac3990c | 839 | * sometimes over 300 seconds for certain systems. |
29f8bf1b | 840 | */ |
b90484ec SB |
841 | if (result != PCI_ERS_RESULT_DISCONNECT) { |
842 | rc = eeh_wait_state(pe, MAX_WAIT_FOR_RECOVERY*1000); | |
843 | if (rc < 0 || rc == EEH_STATE_NOT_SUPPORT) { | |
844 | pr_warn("EEH: Permanent failure\n"); | |
845 | result = PCI_ERS_RESULT_DISCONNECT; | |
846 | } | |
5f1a7c81 LV |
847 | } |
848 | ||
ede8ca26 LV |
849 | /* Since rtas may enable MMIO when posting the error log, |
850 | * don't post the error log until after all dev drivers | |
17213c3b LV |
851 | * have been informed. |
852 | */ | |
b90484ec SB |
853 | if (result != PCI_ERS_RESULT_DISCONNECT) { |
854 | pr_info("EEH: Collect temporary log\n"); | |
855 | eeh_slot_error_detail(pe, EEH_LOG_TEMP); | |
856 | } | |
ede8ca26 | 857 | |
77bd7415 LV |
858 | /* If all device drivers were EEH-unaware, then shut |
859 | * down all of the device drivers, and hope they | |
860 | * go down willingly, without panicing the system. | |
861 | */ | |
18eb3b39 | 862 | if (result == PCI_ERS_RESULT_NONE) { |
56ca4fde | 863 | pr_info("EEH: Reset with hotplug activity\n"); |
5fd13460 | 864 | rc = eeh_reset_device(pe, bus, NULL, false); |
e0f90b64 | 865 | if (rc) { |
0dae2743 GS |
866 | pr_warn("%s: Unable to reset, err=%d\n", |
867 | __func__, rc); | |
b90484ec | 868 | result = PCI_ERS_RESULT_DISCONNECT; |
e0f90b64 | 869 | } |
77bd7415 LV |
870 | } |
871 | ||
6a1ca373 LV |
872 | /* If all devices reported they can proceed, then re-enable MMIO */ |
873 | if (result == PCI_ERS_RESULT_CAN_RECOVER) { | |
56ca4fde | 874 | pr_info("EEH: Enable I/O for affected devices\n"); |
9b3c76f0 | 875 | rc = eeh_pci_enable(pe, EEH_OPT_THAW_MMIO); |
6a1ca373 | 876 | |
b90484ec SB |
877 | if (rc < 0) { |
878 | result = PCI_ERS_RESULT_DISCONNECT; | |
879 | } else if (rc) { | |
6a1ca373 LV |
880 | result = PCI_ERS_RESULT_NEED_RESET; |
881 | } else { | |
56ca4fde | 882 | pr_info("EEH: Notify device drivers to resume I/O\n"); |
20b34497 SB |
883 | eeh_pe_report("mmio_enabled", pe, |
884 | eeh_report_mmio_enabled, &result); | |
6a1ca373 | 885 | } |
77bd7415 LV |
886 | } |
887 | ||
6a1ca373 | 888 | /* If all devices reported they can proceed, then re-enable DMA */ |
18eb3b39 | 889 | if (result == PCI_ERS_RESULT_CAN_RECOVER) { |
56ca4fde | 890 | pr_info("EEH: Enabled DMA for affected devices\n"); |
9b3c76f0 | 891 | rc = eeh_pci_enable(pe, EEH_OPT_THAW_DMA); |
6a1ca373 | 892 | |
b90484ec SB |
893 | if (rc < 0) { |
894 | result = PCI_ERS_RESULT_DISCONNECT; | |
895 | } else if (rc) { | |
6a1ca373 | 896 | result = PCI_ERS_RESULT_NEED_RESET; |
35845a78 GS |
897 | } else { |
898 | /* | |
899 | * We didn't do PE reset for the case. The PE | |
900 | * is still in frozen state. Clear it before | |
901 | * resuming the PE. | |
902 | */ | |
903 | eeh_pe_state_clear(pe, EEH_PE_ISOLATED); | |
d0e70341 | 904 | result = PCI_ERS_RESULT_RECOVERED; |
35845a78 | 905 | } |
6a1ca373 LV |
906 | } |
907 | ||
6a1ca373 LV |
908 | /* If any device called out for a reset, then reset the slot */ |
909 | if (result == PCI_ERS_RESULT_NEED_RESET) { | |
56ca4fde | 910 | pr_info("EEH: Reset without hotplug activity\n"); |
5fd13460 | 911 | rc = eeh_reset_device(pe, bus, &rmv_data, true); |
e0f90b64 | 912 | if (rc) { |
0dae2743 GS |
913 | pr_warn("%s: Cannot reset, err=%d\n", |
914 | __func__, rc); | |
b90484ec SB |
915 | result = PCI_ERS_RESULT_DISCONNECT; |
916 | } else { | |
917 | result = PCI_ERS_RESULT_NONE; | |
918 | eeh_set_channel_state(pe, pci_channel_io_normal); | |
919 | eeh_set_irq_state(pe, true); | |
920 | eeh_pe_report("slot_reset", pe, eeh_report_reset, | |
921 | &result); | |
e0f90b64 | 922 | } |
e0f90b64 | 923 | } |
6a1ca373 | 924 | |
b90484ec SB |
925 | if ((result == PCI_ERS_RESULT_RECOVERED) || |
926 | (result == PCI_ERS_RESULT_NONE)) { | |
927 | /* | |
928 | * For those hot removed VFs, we should add back them after PF | |
929 | * get recovered properly. | |
930 | */ | |
931 | list_for_each_entry_safe(edev, tmp, &rmv_data.removed_vf_list, | |
932 | rmv_entry) { | |
933 | eeh_add_virt_device(edev); | |
934 | list_del(&edev->rmv_entry); | |
20b34497 | 935 | } |
665012c5 | 936 | |
b90484ec SB |
937 | /* Tell all device drivers that they can resume operations */ |
938 | pr_info("EEH: Notify device driver to resume\n"); | |
939 | eeh_set_channel_state(pe, pci_channel_io_normal); | |
940 | eeh_set_irq_state(pe, true); | |
941 | eeh_pe_report("resume", pe, eeh_report_resume, NULL); | |
942 | eeh_for_each_pe(pe, tmp_pe) { | |
943 | eeh_pe_for_each_dev(tmp_pe, edev, tmp) { | |
944 | edev->mode &= ~EEH_DEV_NO_HANDLER; | |
945 | edev->in_error = false; | |
946 | } | |
947 | } | |
a84f273c | 948 | |
b90484ec SB |
949 | pr_info("EEH: Recovery successful.\n"); |
950 | } else { | |
951 | /* | |
952 | * About 90% of all real-life EEH failures in the field | |
953 | * are due to poorly seated PCI cards. Only 10% or so are | |
954 | * due to actual, failed cards. | |
955 | */ | |
956 | pr_err("EEH: Unable to recover from failure from PHB#%x-PE#%x.\n" | |
957 | "Please try reseating or replacing it\n", | |
958 | pe->phb->global_number, pe->addr); | |
b6495c0c | 959 | |
b90484ec | 960 | eeh_slot_error_detail(pe, EEH_LOG_PERM); |
b6495c0c | 961 | |
b90484ec SB |
962 | /* Notify all devices that they're about to go down. */ |
963 | eeh_set_channel_state(pe, pci_channel_io_perm_failure); | |
964 | eeh_set_irq_state(pe, false); | |
965 | eeh_pe_report("error_detected(permanent failure)", pe, | |
966 | eeh_report_failure, NULL); | |
b6495c0c | 967 | |
b90484ec SB |
968 | /* Mark the PE to be removed permanently */ |
969 | eeh_pe_state_mark(pe, EEH_PE_REMOVED); | |
d2b0f6f7 | 970 | |
b90484ec SB |
971 | /* |
972 | * Shut down the device drivers for good. We mark | |
973 | * all removed devices correctly to avoid access | |
974 | * the their PCI config any more. | |
975 | */ | |
976 | if (pe->type & EEH_PE_VF) { | |
977 | eeh_pe_dev_traverse(pe, eeh_rmv_device, NULL); | |
978 | eeh_pe_dev_mode_mark(pe, EEH_DEV_REMOVED); | |
979 | } else { | |
980 | eeh_pe_state_clear(pe, EEH_PE_PRI_BUS); | |
981 | eeh_pe_dev_mode_mark(pe, EEH_DEV_REMOVED); | |
982 | ||
983 | pci_lock_rescan_remove(); | |
984 | pci_hp_remove_devices(bus); | |
985 | pci_unlock_rescan_remove(); | |
986 | /* The passed PE should no longer be used */ | |
987 | return; | |
988 | } | |
1c2042c8 | 989 | } |
37fd8125 | 990 | eeh_pe_state_clear(pe, EEH_PE_RECOVERING); |
77bd7415 | 991 | } |
8a6b1bc7 | 992 | |
c0b64978 RC |
993 | /** |
994 | * eeh_handle_special_event - Handle EEH events without a specific failing PE | |
995 | * | |
996 | * Called when an EEH event is detected but can't be narrowed down to a | |
997 | * specific PE. Iterates through possible failures and handles them as | |
998 | * necessary. | |
999 | */ | |
68701780 | 1000 | void eeh_handle_special_event(void) |
8a6b1bc7 GS |
1001 | { |
1002 | struct eeh_pe *pe, *phb_pe; | |
1003 | struct pci_bus *bus; | |
7e4e7867 | 1004 | struct pci_controller *hose; |
8a6b1bc7 | 1005 | unsigned long flags; |
7e4e7867 | 1006 | int rc; |
8a6b1bc7 | 1007 | |
8a6b1bc7 | 1008 | |
7e4e7867 GS |
1009 | do { |
1010 | rc = eeh_ops->next_error(&pe); | |
1011 | ||
1012 | switch (rc) { | |
1013 | case EEH_NEXT_ERR_DEAD_IOC: | |
1014 | /* Mark all PHBs in dead state */ | |
1015 | eeh_serialize_lock(&flags); | |
1016 | ||
1017 | /* Purge all events */ | |
5c7a35e3 | 1018 | eeh_remove_event(NULL, true); |
7e4e7867 GS |
1019 | |
1020 | list_for_each_entry(hose, &hose_list, list_node) { | |
1021 | phb_pe = eeh_phb_pe_get(hose); | |
1022 | if (!phb_pe) continue; | |
1023 | ||
e762bb89 | 1024 | eeh_pe_mark_isolated(phb_pe); |
7e4e7867 GS |
1025 | } |
1026 | ||
1027 | eeh_serialize_unlock(flags); | |
1028 | ||
1029 | break; | |
1030 | case EEH_NEXT_ERR_FROZEN_PE: | |
1031 | case EEH_NEXT_ERR_FENCED_PHB: | |
1032 | case EEH_NEXT_ERR_DEAD_PHB: | |
1033 | /* Mark the PE in fenced state */ | |
1034 | eeh_serialize_lock(&flags); | |
1035 | ||
1036 | /* Purge all events of the PHB */ | |
5c7a35e3 | 1037 | eeh_remove_event(pe, true); |
7e4e7867 | 1038 | |
e762bb89 SB |
1039 | if (rc != EEH_NEXT_ERR_DEAD_PHB) |
1040 | eeh_pe_state_mark(pe, EEH_PE_RECOVERING); | |
1041 | eeh_pe_mark_isolated(pe); | |
7e4e7867 GS |
1042 | |
1043 | eeh_serialize_unlock(flags); | |
1044 | ||
1045 | break; | |
1046 | case EEH_NEXT_ERR_NONE: | |
1047 | return; | |
1048 | default: | |
1049 | pr_warn("%s: Invalid value %d from next_error()\n", | |
1050 | __func__, rc); | |
1051 | return; | |
8a6b1bc7 | 1052 | } |
8a6b1bc7 | 1053 | |
7e4e7867 GS |
1054 | /* |
1055 | * For fenced PHB and frozen PE, it's handled as normal | |
1056 | * event. We have to remove the affected PHBs for dead | |
1057 | * PHB and IOC | |
1058 | */ | |
1059 | if (rc == EEH_NEXT_ERR_FROZEN_PE || | |
1060 | rc == EEH_NEXT_ERR_FENCED_PHB) { | |
37fd8125 | 1061 | eeh_handle_normal_event(pe); |
7e4e7867 | 1062 | } else { |
1b17366d | 1063 | pci_lock_rescan_remove(); |
7e4e7867 GS |
1064 | list_for_each_entry(hose, &hose_list, list_node) { |
1065 | phb_pe = eeh_phb_pe_get(hose); | |
1066 | if (!phb_pe || | |
9e049375 GS |
1067 | !(phb_pe->state & EEH_PE_ISOLATED) || |
1068 | (phb_pe->state & EEH_PE_RECOVERING)) | |
7e4e7867 GS |
1069 | continue; |
1070 | ||
1071 | /* Notify all devices to be down */ | |
05ba75f8 | 1072 | eeh_pe_state_clear(pe, EEH_PE_PRI_BUS); |
47cc8c1c | 1073 | eeh_set_channel_state(pe, pci_channel_io_perm_failure); |
20b34497 SB |
1074 | eeh_pe_report( |
1075 | "error_detected(permanent failure)", pe, | |
af2e3a00 | 1076 | eeh_report_failure, NULL); |
7e4e7867 | 1077 | bus = eeh_pe_bus_get(phb_pe); |
04fec21c RC |
1078 | if (!bus) { |
1079 | pr_err("%s: Cannot find PCI bus for " | |
1f52f176 | 1080 | "PHB#%x-PE#%x\n", |
04fec21c RC |
1081 | __func__, |
1082 | pe->phb->global_number, | |
1083 | pe->addr); | |
1084 | break; | |
1085 | } | |
bd251b89 | 1086 | pci_hp_remove_devices(bus); |
7e4e7867 | 1087 | } |
1b17366d | 1088 | pci_unlock_rescan_remove(); |
8a6b1bc7 | 1089 | } |
7e4e7867 GS |
1090 | |
1091 | /* | |
1092 | * If we have detected dead IOC, we needn't proceed | |
1093 | * any more since all PHBs would have been removed | |
1094 | */ | |
1095 | if (rc == EEH_NEXT_ERR_DEAD_IOC) | |
1096 | break; | |
1097 | } while (rc != EEH_NEXT_ERR_NONE); | |
8a6b1bc7 | 1098 | } |