powerpc/eeh: Remove unused eeh_pcid_name()
[linux-2.6-block.git] / arch / powerpc / kernel / eeh_driver.c
CommitLineData
77bd7415
LV
1/*
2 * PCI Error Recovery Driver for RPA-compliant PPC64 platform.
3c8c90ab
LV
3 * Copyright IBM Corp. 2004 2005
4 * Copyright Linas Vepstas <linas@linas.org> 2004, 2005
77bd7415
LV
5 *
6 * All rights reserved.
7 *
8 * This program is free software; you can redistribute it and/or modify
9 * it under the terms of the GNU General Public License as published by
10 * the Free Software Foundation; either version 2 of the License, or (at
11 * your option) any later version.
12 *
13 * This program is distributed in the hope that it will be useful, but
14 * WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
16 * NON INFRINGEMENT. See the GNU General Public License for more
17 * details.
18 *
19 * You should have received a copy of the GNU General Public License
20 * along with this program; if not, write to the Free Software
21 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
22 *
3c8c90ab 23 * Send comments and feedback to Linas Vepstas <linas@austin.ibm.com>
77bd7415
LV
24 */
25#include <linux/delay.h>
77bd7415 26#include <linux/interrupt.h>
ac325acd 27#include <linux/irq.h>
feadf7c0 28#include <linux/module.h>
77bd7415
LV
29#include <linux/pci.h>
30#include <asm/eeh.h>
31#include <asm/eeh_event.h>
32#include <asm/ppc-pci.h>
33#include <asm/pci-bridge.h>
34#include <asm/prom.h>
35#include <asm/rtas.h>
36
67086e32
WY
37struct eeh_rmv_data {
38 struct list_head edev_list;
39 int removed;
40};
41
feadf7c0
GS
42/**
43 * eeh_pcid_get - Get the PCI device driver
44 * @pdev: PCI device
45 *
46 * The function is used to retrieve the PCI device driver for
47 * the indicated PCI device. Besides, we will increase the reference
48 * of the PCI device driver to prevent that being unloaded on
49 * the fly. Otherwise, kernel crash would be seen.
50 */
51static inline struct pci_driver *eeh_pcid_get(struct pci_dev *pdev)
52{
53 if (!pdev || !pdev->driver)
54 return NULL;
55
56 if (!try_module_get(pdev->driver->driver.owner))
57 return NULL;
58
59 return pdev->driver;
60}
61
62/**
63 * eeh_pcid_put - Dereference on the PCI device driver
64 * @pdev: PCI device
65 *
66 * The function is called to do dereference on the PCI device
67 * driver of the indicated PCI device.
68 */
69static inline void eeh_pcid_put(struct pci_dev *pdev)
70{
71 if (!pdev || !pdev->driver)
72 return;
73
74 module_put(pdev->driver->driver.owner);
75}
76
8535ef05 77/**
29f8bf1b
GS
78 * eeh_disable_irq - Disable interrupt for the recovering device
79 * @dev: PCI device
80 *
81 * This routine must be called when reporting temporary or permanent
82 * error to the particular PCI device to disable interrupt of that
83 * device. If the device has enabled MSI or MSI-X interrupt, we needn't
84 * do real work because EEH should freeze DMA transfers for those PCI
85 * devices encountering EEH errors, which includes MSI or MSI-X.
8535ef05
MM
86 */
87static void eeh_disable_irq(struct pci_dev *dev)
88{
40a7cd92 89 struct eeh_dev *edev = pci_dev_to_eeh_dev(dev);
8535ef05
MM
90
91 /* Don't disable MSI and MSI-X interrupts. They are
92 * effectively disabled by the DMA Stopped state
93 * when an EEH error occurs.
29f8bf1b 94 */
8535ef05
MM
95 if (dev->msi_enabled || dev->msix_enabled)
96 return;
97
59e3f837 98 if (!irq_has_action(dev->irq))
8535ef05
MM
99 return;
100
dbbceee1 101 edev->mode |= EEH_DEV_IRQ_DISABLED;
8535ef05
MM
102 disable_irq_nosync(dev->irq);
103}
104
105/**
29f8bf1b
GS
106 * eeh_enable_irq - Enable interrupt for the recovering device
107 * @dev: PCI device
108 *
109 * This routine must be called to enable interrupt while failed
110 * device could be resumed.
8535ef05
MM
111 */
112static void eeh_enable_irq(struct pci_dev *dev)
113{
40a7cd92 114 struct eeh_dev *edev = pci_dev_to_eeh_dev(dev);
8535ef05 115
dbbceee1
GS
116 if ((edev->mode) & EEH_DEV_IRQ_DISABLED) {
117 edev->mode &= ~EEH_DEV_IRQ_DISABLED;
b8a9a11b
TG
118 /*
119 * FIXME !!!!!
120 *
121 * This is just ass backwards. This maze has
122 * unbalanced irq_enable/disable calls. So instead of
123 * finding the root cause it works around the warning
124 * in the irq_enable code by conditionally calling
125 * into it.
126 *
127 * That's just wrong.The warning in the core code is
027dfac6 128 * there to tell people to fix their asymmetries in
b8a9a11b
TG
129 * their own code, not by abusing the core information
130 * to avoid it.
131 *
132 * I so wish that the assymetry would be the other way
133 * round and a few more irq_disable calls render that
134 * shit unusable forever.
135 *
136 * tglx
137 */
57310c3c 138 if (irqd_irq_disabled(irq_get_irq_data(dev->irq)))
91150af3 139 enable_irq(dev->irq);
57310c3c 140 }
8535ef05
MM
141}
142
d2b0f6f7
GS
143static bool eeh_dev_removed(struct eeh_dev *edev)
144{
145 /* EEH device removed ? */
146 if (!edev || (edev->mode & EEH_DEV_REMOVED))
147 return true;
148
149 return false;
150}
151
5cfb20b9
GS
152static void *eeh_dev_save_state(void *data, void *userdata)
153{
154 struct eeh_dev *edev = data;
155 struct pci_dev *pdev;
156
157 if (!edev)
158 return NULL;
159
5a0cdbfd
GS
160 /*
161 * We cannot access the config space on some adapters.
162 * Otherwise, it will cause fenced PHB. We don't save
163 * the content in their config space and will restore
164 * from the initial config space saved when the EEH
165 * device is created.
166 */
167 if (edev->pe && (edev->pe->state & EEH_PE_CFG_RESTRICTED))
168 return NULL;
169
5cfb20b9
GS
170 pdev = eeh_dev_to_pci_dev(edev);
171 if (!pdev)
172 return NULL;
173
174 pci_save_state(pdev);
175 return NULL;
176}
177
cb5b5624 178/**
29f8bf1b 179 * eeh_report_error - Report pci error to each device driver
9b3c76f0 180 * @data: eeh device
29f8bf1b 181 * @userdata: return value
a84f273c
GS
182 *
183 * Report an EEH error to each device driver, collect up and
184 * merge the device driver responses. Cumulative response
cb5b5624 185 * passed back in "userdata".
77bd7415 186 */
9b3c76f0 187static void *eeh_report_error(void *data, void *userdata)
77bd7415 188{
9b3c76f0
GS
189 struct eeh_dev *edev = (struct eeh_dev *)data;
190 struct pci_dev *dev = eeh_dev_to_pci_dev(edev);
18eb3b39 191 enum pci_ers_result rc, *res = userdata;
feadf7c0 192 struct pci_driver *driver;
77bd7415 193
2311cca5 194 if (!dev || eeh_dev_removed(edev) || eeh_pe_passed(edev->pe))
d2b0f6f7 195 return NULL;
f0295e04
MN
196
197 device_lock(&dev->dev);
77bd7415
LV
198 dev->error_state = pci_channel_io_frozen;
199
feadf7c0 200 driver = eeh_pcid_get(dev);
f0295e04 201 if (!driver) goto out_no_dev;
77bd7415 202
8535ef05
MM
203 eeh_disable_irq(dev);
204
6a1ca373 205 if (!driver->err_handler ||
f0295e04
MN
206 !driver->err_handler->error_detected)
207 goto out;
77bd7415 208
29f8bf1b 209 rc = driver->err_handler->error_detected(dev, pci_channel_io_frozen);
2a50f144
LV
210
211 /* A driver that needs a reset trumps all others */
212 if (rc == PCI_ERS_RESULT_NEED_RESET) *res = rc;
18eb3b39 213 if (*res == PCI_ERS_RESULT_NONE) *res = rc;
70298c6e 214
67086e32 215 edev->in_error = true;
856e1eb9 216 pci_uevent_ers(dev, PCI_ERS_RESULT_NONE);
f0295e04
MN
217
218out:
219 eeh_pcid_put(dev);
220out_no_dev:
221 device_unlock(&dev->dev);
9b3c76f0 222 return NULL;
6a1ca373
LV
223}
224
225/**
29f8bf1b 226 * eeh_report_mmio_enabled - Tell drivers that MMIO has been enabled
9b3c76f0 227 * @data: eeh device
29f8bf1b 228 * @userdata: return value
6a1ca373 229 *
638799b3
LV
230 * Tells each device driver that IO ports, MMIO and config space I/O
231 * are now enabled. Collects up and merges the device driver responses.
232 * Cumulative response passed back in "userdata".
6a1ca373 233 */
9b3c76f0 234static void *eeh_report_mmio_enabled(void *data, void *userdata)
6a1ca373 235{
9b3c76f0
GS
236 struct eeh_dev *edev = (struct eeh_dev *)data;
237 struct pci_dev *dev = eeh_dev_to_pci_dev(edev);
6a1ca373 238 enum pci_ers_result rc, *res = userdata;
9b3c76f0 239 struct pci_driver *driver;
6a1ca373 240
2311cca5 241 if (!dev || eeh_dev_removed(edev) || eeh_pe_passed(edev->pe))
d2b0f6f7
GS
242 return NULL;
243
f0295e04 244 device_lock(&dev->dev);
feadf7c0 245 driver = eeh_pcid_get(dev);
f0295e04 246 if (!driver) goto out_no_dev;
9b3c76f0 247
feadf7c0 248 if (!driver->err_handler ||
f26c7a03 249 !driver->err_handler->mmio_enabled ||
f0295e04
MN
250 (edev->mode & EEH_DEV_NO_HANDLER))
251 goto out;
6a1ca373 252
29f8bf1b 253 rc = driver->err_handler->mmio_enabled(dev);
2a50f144
LV
254
255 /* A driver that needs a reset trumps all others */
256 if (rc == PCI_ERS_RESULT_NEED_RESET) *res = rc;
6a1ca373 257 if (*res == PCI_ERS_RESULT_NONE) *res = rc;
70298c6e 258
f0295e04 259out:
feadf7c0 260 eeh_pcid_put(dev);
f0295e04
MN
261out_no_dev:
262 device_unlock(&dev->dev);
9b3c76f0 263 return NULL;
77bd7415
LV
264}
265
cb5b5624 266/**
29f8bf1b 267 * eeh_report_reset - Tell device that slot has been reset
9b3c76f0 268 * @data: eeh device
29f8bf1b
GS
269 * @userdata: return value
270 *
271 * This routine must be called while EEH tries to reset particular
272 * PCI device so that the associated PCI device driver could take
273 * some actions, usually to save data the driver needs so that the
274 * driver can work again while the device is recovered.
77bd7415 275 */
9b3c76f0 276static void *eeh_report_reset(void *data, void *userdata)
77bd7415 277{
9b3c76f0
GS
278 struct eeh_dev *edev = (struct eeh_dev *)data;
279 struct pci_dev *dev = eeh_dev_to_pci_dev(edev);
6a1ca373 280 enum pci_ers_result rc, *res = userdata;
9b3c76f0 281 struct pci_driver *driver;
77bd7415 282
2311cca5 283 if (!dev || eeh_dev_removed(edev) || eeh_pe_passed(edev->pe))
d2b0f6f7 284 return NULL;
f0295e04
MN
285
286 device_lock(&dev->dev);
c58dc575
MM
287 dev->error_state = pci_channel_io_normal;
288
feadf7c0 289 driver = eeh_pcid_get(dev);
f0295e04 290 if (!driver) goto out_no_dev;
feadf7c0 291
8535ef05
MM
292 eeh_enable_irq(dev);
293
6a1ca373 294 if (!driver->err_handler ||
f26c7a03 295 !driver->err_handler->slot_reset ||
67086e32 296 (edev->mode & EEH_DEV_NO_HANDLER) ||
f0295e04
MN
297 (!edev->in_error))
298 goto out;
77bd7415 299
6a1ca373 300 rc = driver->err_handler->slot_reset(dev);
5794dbcb
LV
301 if ((*res == PCI_ERS_RESULT_NONE) ||
302 (*res == PCI_ERS_RESULT_RECOVERED)) *res = rc;
6a1ca373
LV
303 if (*res == PCI_ERS_RESULT_DISCONNECT &&
304 rc == PCI_ERS_RESULT_NEED_RESET) *res = rc;
70298c6e 305
f0295e04 306out:
feadf7c0 307 eeh_pcid_put(dev);
f0295e04
MN
308out_no_dev:
309 device_unlock(&dev->dev);
9b3c76f0 310 return NULL;
77bd7415
LV
311}
312
5cfb20b9
GS
313static void *eeh_dev_restore_state(void *data, void *userdata)
314{
315 struct eeh_dev *edev = data;
316 struct pci_dev *pdev;
317
318 if (!edev)
319 return NULL;
320
5a0cdbfd
GS
321 /*
322 * The content in the config space isn't saved because
323 * the blocked config space on some adapters. We have
324 * to restore the initial saved config space when the
325 * EEH device is created.
326 */
327 if (edev->pe && (edev->pe->state & EEH_PE_CFG_RESTRICTED)) {
328 if (list_is_last(&edev->list, &edev->pe->edevs))
329 eeh_pe_restore_bars(edev->pe);
330
331 return NULL;
332 }
333
5cfb20b9
GS
334 pdev = eeh_dev_to_pci_dev(edev);
335 if (!pdev)
336 return NULL;
337
338 pci_restore_state(pdev);
339 return NULL;
340}
341
cb5b5624 342/**
29f8bf1b 343 * eeh_report_resume - Tell device to resume normal operations
9b3c76f0 344 * @data: eeh device
29f8bf1b
GS
345 * @userdata: return value
346 *
347 * This routine must be called to notify the device driver that it
348 * could resume so that the device driver can do some initialization
349 * to make the recovered device work again.
cb5b5624 350 */
9b3c76f0 351static void *eeh_report_resume(void *data, void *userdata)
77bd7415 352{
9b3c76f0
GS
353 struct eeh_dev *edev = (struct eeh_dev *)data;
354 struct pci_dev *dev = eeh_dev_to_pci_dev(edev);
67086e32 355 bool was_in_error;
9b3c76f0
GS
356 struct pci_driver *driver;
357
2311cca5 358 if (!dev || eeh_dev_removed(edev) || eeh_pe_passed(edev->pe))
d2b0f6f7 359 return NULL;
f0295e04
MN
360
361 device_lock(&dev->dev);
77bd7415
LV
362 dev->error_state = pci_channel_io_normal;
363
feadf7c0 364 driver = eeh_pcid_get(dev);
f0295e04 365 if (!driver) goto out_no_dev;
d0e70341 366
67086e32
WY
367 was_in_error = edev->in_error;
368 edev->in_error = false;
8535ef05
MM
369 eeh_enable_irq(dev);
370
d0e70341 371 if (!driver->err_handler ||
f26c7a03 372 !driver->err_handler->resume ||
67086e32 373 (edev->mode & EEH_DEV_NO_HANDLER) || !was_in_error) {
f26c7a03 374 edev->mode &= ~EEH_DEV_NO_HANDLER;
f0295e04 375 goto out;
feadf7c0 376 }
77bd7415
LV
377
378 driver->err_handler->resume(dev);
70298c6e 379
856e1eb9 380 pci_uevent_ers(dev, PCI_ERS_RESULT_RECOVERED);
f0295e04
MN
381out:
382 eeh_pcid_put(dev);
856e1eb9 383#ifdef CONFIG_PCI_IOV
521ca5a9
JA
384 if (eeh_ops->notify_resume && eeh_dev_to_pdn(edev))
385 eeh_ops->notify_resume(eeh_dev_to_pdn(edev));
856e1eb9 386#endif
f0295e04
MN
387out_no_dev:
388 device_unlock(&dev->dev);
9b3c76f0 389 return NULL;
77bd7415
LV
390}
391
cb5b5624 392/**
29f8bf1b 393 * eeh_report_failure - Tell device driver that device is dead.
9b3c76f0 394 * @data: eeh device
29f8bf1b 395 * @userdata: return value
cb5b5624
LV
396 *
397 * This informs the device driver that the device is permanently
398 * dead, and that no further recovery attempts will be made on it.
399 */
9b3c76f0 400static void *eeh_report_failure(void *data, void *userdata)
77bd7415 401{
9b3c76f0
GS
402 struct eeh_dev *edev = (struct eeh_dev *)data;
403 struct pci_dev *dev = eeh_dev_to_pci_dev(edev);
404 struct pci_driver *driver;
405
2311cca5 406 if (!dev || eeh_dev_removed(edev) || eeh_pe_passed(edev->pe))
d2b0f6f7 407 return NULL;
f0295e04
MN
408
409 device_lock(&dev->dev);
77bd7415
LV
410 dev->error_state = pci_channel_io_perm_failure;
411
feadf7c0 412 driver = eeh_pcid_get(dev);
f0295e04 413 if (!driver) goto out_no_dev;
77bd7415 414
8535ef05
MM
415 eeh_disable_irq(dev);
416
417 if (!driver->err_handler ||
f0295e04
MN
418 !driver->err_handler->error_detected)
419 goto out;
8535ef05 420
77bd7415 421 driver->err_handler->error_detected(dev, pci_channel_io_perm_failure);
70298c6e 422
856e1eb9 423 pci_uevent_ers(dev, PCI_ERS_RESULT_DISCONNECT);
f0295e04
MN
424out:
425 eeh_pcid_put(dev);
426out_no_dev:
427 device_unlock(&dev->dev);
9b3c76f0 428 return NULL;
77bd7415
LV
429}
430
67086e32
WY
431static void *eeh_add_virt_device(void *data, void *userdata)
432{
433 struct pci_driver *driver;
434 struct eeh_dev *edev = (struct eeh_dev *)data;
435 struct pci_dev *dev = eeh_dev_to_pci_dev(edev);
436 struct pci_dn *pdn = eeh_dev_to_pdn(edev);
437
438 if (!(edev->physfn)) {
439 pr_warn("%s: EEH dev %04x:%02x:%02x.%01x not for VF\n",
69672bd7 440 __func__, pdn->phb->global_number, pdn->busno,
67086e32
WY
441 PCI_SLOT(pdn->devfn), PCI_FUNC(pdn->devfn));
442 return NULL;
443 }
444
445 driver = eeh_pcid_get(dev);
446 if (driver) {
46d4be41
SB
447 if (driver->err_handler) {
448 eeh_pcid_put(dev);
67086e32 449 return NULL;
46d4be41
SB
450 }
451 eeh_pcid_put(dev);
67086e32
WY
452 }
453
988fc3ba 454#ifdef CONFIG_PCI_IOV
753f6124 455 pci_iov_add_virtfn(edev->physfn, pdn->vf_index);
67086e32
WY
456#endif
457 return NULL;
458}
459
f5c57710
GS
460static void *eeh_rmv_device(void *data, void *userdata)
461{
462 struct pci_driver *driver;
463 struct eeh_dev *edev = (struct eeh_dev *)data;
464 struct pci_dev *dev = eeh_dev_to_pci_dev(edev);
67086e32
WY
465 struct eeh_rmv_data *rmv_data = (struct eeh_rmv_data *)userdata;
466 int *removed = rmv_data ? &rmv_data->removed : NULL;
f5c57710
GS
467
468 /*
469 * Actually, we should remove the PCI bridges as well.
470 * However, that's lots of complexity to do that,
471 * particularly some of devices under the bridge might
472 * support EEH. So we just care about PCI devices for
473 * simplicity here.
474 */
93de6901 475 if (!dev || (dev->hdr_type == PCI_HEADER_TYPE_BRIDGE))
f5c57710 476 return NULL;
8cc6b6cd 477
d2b0f6f7
GS
478 /*
479 * We rely on count-based pcibios_release_device() to
480 * detach permanently offlined PEs. Unfortunately, that's
481 * not reliable enough. We might have the permanently
482 * offlined PEs attached, but we needn't take care of
483 * them and their child devices.
484 */
485 if (eeh_dev_removed(edev))
486 return NULL;
487
46d4be41
SB
488 if (removed) {
489 if (eeh_pe_passed(edev->pe))
8cc6b6cd 490 return NULL;
46d4be41
SB
491 driver = eeh_pcid_get(dev);
492 if (driver) {
493 if (driver->err_handler &&
494 driver->err_handler->error_detected &&
495 driver->err_handler->slot_reset) {
496 eeh_pcid_put(dev);
497 return NULL;
498 }
499 eeh_pcid_put(dev);
500 }
8cc6b6cd 501 }
f5c57710
GS
502
503 /* Remove it from PCI subsystem */
504 pr_debug("EEH: Removing %s without EEH sensitive driver\n",
505 pci_name(dev));
506 edev->bus = dev->bus;
507 edev->mode |= EEH_DEV_DISCONNECTED;
67086e32
WY
508 if (removed)
509 (*removed)++;
f5c57710 510
67086e32 511 if (edev->physfn) {
988fc3ba 512#ifdef CONFIG_PCI_IOV
67086e32
WY
513 struct pci_dn *pdn = eeh_dev_to_pdn(edev);
514
753f6124 515 pci_iov_remove_virtfn(edev->physfn, pdn->vf_index);
67086e32
WY
516 edev->pdev = NULL;
517
518 /*
519 * We have to set the VF PE number to invalid one, which is
520 * required to plug the VF successfully.
521 */
522 pdn->pe_number = IODA_INVALID_PE;
523#endif
524 if (rmv_data)
525 list_add(&edev->rmv_list, &rmv_data->edev_list);
526 } else {
527 pci_lock_rescan_remove();
528 pci_stop_and_remove_bus_device(dev);
529 pci_unlock_rescan_remove();
530 }
f5c57710
GS
531
532 return NULL;
533}
534
535static void *eeh_pe_detach_dev(void *data, void *userdata)
536{
537 struct eeh_pe *pe = (struct eeh_pe *)data;
538 struct eeh_dev *edev, *tmp;
539
540 eeh_pe_for_each_dev(pe, edev, tmp) {
541 if (!(edev->mode & EEH_DEV_DISCONNECTED))
542 continue;
543
544 edev->mode &= ~(EEH_DEV_DISCONNECTED | EEH_DEV_IRQ_DISABLED);
545 eeh_rmv_from_parent_pe(edev);
546 }
547
548 return NULL;
549}
550
78954700
GS
551/*
552 * Explicitly clear PE's frozen state for PowerNV where
553 * we have frozen PE until BAR restore is completed. It's
554 * harmless to clear it for pSeries. To be consistent with
555 * PE reset (for 3 times), we try to clear the frozen state
556 * for 3 times as well.
557 */
2c665992 558static void *__eeh_clear_pe_frozen_state(void *data, void *flag)
78954700 559{
2c665992 560 struct eeh_pe *pe = (struct eeh_pe *)data;
f05fea5b 561 bool clear_sw_state = *(bool *)flag;
c9dd0143 562 int i, rc = 1;
78954700 563
c9dd0143 564 for (i = 0; rc && i < 3; i++)
5cfb20b9 565 rc = eeh_unfreeze_pe(pe, clear_sw_state);
78954700 566
c9dd0143 567 /* Stop immediately on any errors */
2c665992 568 if (rc) {
c9dd0143
GS
569 pr_warn("%s: Failure %d unfreezing PHB#%x-PE#%x\n",
570 __func__, rc, pe->phb->global_number, pe->addr);
2c665992
GS
571 return (void *)pe;
572 }
573
574 return NULL;
575}
576
5cfb20b9
GS
577static int eeh_clear_pe_frozen_state(struct eeh_pe *pe,
578 bool clear_sw_state)
2c665992
GS
579{
580 void *rc;
581
5cfb20b9 582 rc = eeh_pe_traverse(pe, __eeh_clear_pe_frozen_state, &clear_sw_state);
2c665992 583 if (!rc)
78954700
GS
584 eeh_pe_state_clear(pe, EEH_PE_ISOLATED);
585
2c665992 586 return rc ? -EIO : 0;
78954700
GS
587}
588
5cfb20b9
GS
589int eeh_pe_reset_and_recover(struct eeh_pe *pe)
590{
2efc771f 591 int ret;
5cfb20b9
GS
592
593 /* Bail if the PE is being recovered */
594 if (pe->state & EEH_PE_RECOVERING)
595 return 0;
596
597 /* Put the PE into recovery mode */
598 eeh_pe_state_mark(pe, EEH_PE_RECOVERING);
599
600 /* Save states */
601 eeh_pe_dev_traverse(pe, eeh_dev_save_state, NULL);
602
5cfb20b9 603 /* Issue reset */
6654c936 604 ret = eeh_pe_reset_full(pe);
5cfb20b9 605 if (ret) {
28bf36f9 606 eeh_pe_state_clear(pe, EEH_PE_RECOVERING);
5cfb20b9
GS
607 return ret;
608 }
5cfb20b9
GS
609
610 /* Unfreeze the PE */
611 ret = eeh_clear_pe_frozen_state(pe, true);
612 if (ret) {
613 eeh_pe_state_clear(pe, EEH_PE_RECOVERING);
614 return ret;
615 }
616
5cfb20b9
GS
617 /* Restore device state */
618 eeh_pe_dev_traverse(pe, eeh_dev_restore_state, NULL);
619
5cfb20b9
GS
620 /* Clear recovery mode */
621 eeh_pe_state_clear(pe, EEH_PE_RECOVERING);
622
623 return 0;
624}
625
77bd7415 626/**
29f8bf1b 627 * eeh_reset_device - Perform actual reset of a pci slot
5fd13460 628 * @driver_eeh_aware: Does the device's driver provide EEH support?
9b3c76f0 629 * @pe: EEH PE
29f8bf1b 630 * @bus: PCI bus corresponding to the isolcated slot
5fd13460 631 * @rmv_data: Optional, list to record removed devices
77bd7415 632 *
29f8bf1b
GS
633 * This routine must be called to do reset on the indicated PE.
634 * During the reset, udev might be invoked because those affected
635 * PCI devices will be removed and then added.
77bd7415 636 */
67086e32 637static int eeh_reset_device(struct eeh_pe *pe, struct pci_bus *bus,
5fd13460
SB
638 struct eeh_rmv_data *rmv_data,
639 bool driver_eeh_aware)
77bd7415 640{
edfd17ff 641 time64_t tstamp;
67086e32
WY
642 int cnt, rc;
643 struct eeh_dev *edev;
42405456
LV
644
645 /* pcibios will clear the counter; save the value */
9b3c76f0 646 cnt = pe->freeze_count;
5a71978e 647 tstamp = pe->tstamp;
42405456 648
20ee6a97
GS
649 /*
650 * We don't remove the corresponding PE instances because
651 * we need the information afterwords. The attached EEH
652 * devices are expected to be attached soon when calling
bd251b89 653 * into pci_hp_add_devices().
20ee6a97 654 */
f5c57710 655 eeh_pe_state_mark(pe, EEH_PE_KEEP);
54048cf8 656 if (driver_eeh_aware || (pe->type & EEH_PE_VF)) {
cca0e542 657 eeh_pe_dev_traverse(pe, eeh_rmv_device, rmv_data);
54048cf8
SB
658 } else {
659 pci_lock_rescan_remove();
660 pci_hp_remove_devices(bus);
661 pci_unlock_rescan_remove();
1c2042c8 662 }
77bd7415 663
d0914f50
GS
664 /*
665 * Reset the pci controller. (Asserts RST#; resets config space).
b6495c0c 666 * Reconfigure bridges and devices. Don't try to bring the system
29f8bf1b 667 * up if the reset failed for some reason.
d0914f50
GS
668 *
669 * During the reset, it's very dangerous to have uncontrolled PCI
670 * config accesses. So we prefer to block them. However, controlled
671 * PCI config accesses initiated from EEH itself are allowed.
29f8bf1b 672 */
6654c936 673 rc = eeh_pe_reset_full(pe);
28bf36f9 674 if (rc)
b6495c0c 675 return rc;
77bd7415 676
1c2042c8
RW
677 pci_lock_rescan_remove();
678
9b3c76f0
GS
679 /* Restore PE */
680 eeh_ops->configure_bridge(pe);
681 eeh_pe_restore_bars(pe);
77bd7415 682
dc9c41bd
AD
683 /* Clear frozen state */
684 rc = eeh_clear_pe_frozen_state(pe, false);
409bf7f8
AD
685 if (rc) {
686 pci_unlock_rescan_remove();
dc9c41bd 687 return rc;
409bf7f8 688 }
78954700 689
77bd7415 690 /* Give the system 5 seconds to finish running the user-space
a84f273c
GS
691 * hotplug shutdown scripts, e.g. ifdown for ethernet. Yes,
692 * this is a hack, but if we don't do this, and try to bring
693 * the device up before the scripts have taken it down,
77bd7415
LV
694 * potentially weird things happen.
695 */
54048cf8
SB
696 if (!driver_eeh_aware || rmv_data->removed) {
697 pr_info("EEH: Sleep 5s ahead of %s hotplug\n",
698 (driver_eeh_aware ? "partial" : "complete"));
29f8bf1b 699 ssleep(5);
f5c57710
GS
700
701 /*
702 * The EEH device is still connected with its parent
703 * PE. We should disconnect it so the binding can be
704 * rebuilt when adding PCI devices.
705 */
67086e32 706 edev = list_first_entry(&pe->edevs, struct eeh_dev, list);
f5c57710 707 eeh_pe_traverse(pe, eeh_pe_detach_dev, NULL);
a3aa256b 708 if (pe->type & EEH_PE_VF) {
67086e32 709 eeh_add_virt_device(edev, NULL);
a3aa256b 710 } else {
54048cf8
SB
711 if (!driver_eeh_aware)
712 eeh_pe_state_clear(pe, EEH_PE_PRI_BUS);
bd251b89 713 pci_hp_add_devices(bus);
a3aa256b 714 }
77bd7415 715 }
f5c57710 716 eeh_pe_state_clear(pe, EEH_PE_KEEP);
5a71978e
GS
717
718 pe->tstamp = tstamp;
9b3c76f0 719 pe->freeze_count = cnt;
b6495c0c 720
1c2042c8 721 pci_unlock_rescan_remove();
b6495c0c 722 return 0;
77bd7415
LV
723}
724
/*
 * Upper bound, in seconds, on how long to wait for a PCI device to
 * come back online.
 */
#define MAX_WAIT_FOR_RECOVERY 300
77bd7415 729
c0b64978
RC
730/**
731 * eeh_handle_normal_event - Handle EEH events on a specific PE
37fd8125
SB
732 * @pe: EEH PE - which should not be used after we return, as it may
733 * have been invalidated.
c0b64978
RC
734 *
735 * Attempts to recover the given PE. If recovery fails or the PE has failed
736 * too many times, remove the PE.
737 *
68701780
SB
738 * While PHB detects address or data parity errors on particular PCI
739 * slot, the associated PE will be frozen. Besides, DMA's occurring
740 * to wild addresses (which usually happen due to bugs in device
741 * drivers or in PCI adapter firmware) can cause EEH error. #SERR,
742 * #PERR or other misc PCI-related errors also can trigger EEH errors.
743 *
744 * Recovery process consists of unplugging the device driver (which
745 * generated hotplug events to userspace), then issuing a PCI #RST to
746 * the device, then reconfiguring the PCI config space for all bridges
747 * & devices under this slot, and then finally restarting the device
748 * drivers (which cause a second set of hotplug events to go out to
749 * userspace).
c0b64978 750 */
37fd8125 751void eeh_handle_normal_event(struct eeh_pe *pe)
77bd7415 752{
cd95f804 753 struct pci_bus *bus;
67086e32 754 struct eeh_dev *edev, *tmp;
b6495c0c 755 int rc = 0;
18eb3b39 756 enum pci_ers_result result = PCI_ERS_RESULT_NONE;
67086e32 757 struct eeh_rmv_data rmv_data = {LIST_HEAD_INIT(rmv_data.edev_list), 0};
77bd7415 758
cd95f804
SB
759 bus = eeh_pe_bus_get(pe);
760 if (!bus) {
1f52f176 761 pr_err("%s: Cannot find PCI bus for PHB#%x-PE#%x\n",
9b3c76f0 762 __func__, pe->phb->global_number, pe->addr);
37fd8125 763 return;
77bd7415
LV
764 }
765
37fd8125
SB
766 eeh_pe_state_mark(pe, EEH_PE_RECOVERING);
767
5a71978e 768 eeh_pe_update_time_stamp(pe);
9b3c76f0 769 pe->freeze_count++;
c0b64978 770 if (pe->freeze_count > eeh_max_freezes) {
796b9f5b 771 pr_err("EEH: PHB#%x-PE#%x has failed %d times in the last hour and has been permanently disabled.\n",
c0b64978
RC
772 pe->phb->global_number, pe->addr,
773 pe->freeze_count);
774 goto hard_fail;
775 }
796b9f5b
SB
776 pr_warn("EEH: This PCI device has failed %d times in the last hour and will be permanently disabled after %d failures.\n",
777 pe->freeze_count, eeh_max_freezes);
77bd7415
LV
778
779 /* Walk the various device drivers attached to this slot through
780 * a reset sequence, giving each an opportunity to do what it needs
781 * to accomplish the reset. Each child gets a report of the
782 * status ... if any child can't handle the reset, then the entire
783 * slot is dlpar removed and added.
8234fced
GS
784 *
785 * When the PHB is fenced, we have to issue a reset to recover from
786 * the error. Override the result if necessary to have partially
787 * hotplug for this case.
77bd7415 788 */
56ca4fde 789 pr_info("EEH: Notify device drivers to shutdown\n");
9b3c76f0 790 eeh_pe_dev_traverse(pe, eeh_report_error, &result);
8234fced
GS
791 if ((pe->type & EEH_PE_PHB) &&
792 result != PCI_ERS_RESULT_NONE &&
793 result != PCI_ERS_RESULT_NEED_RESET)
794 result = PCI_ERS_RESULT_NEED_RESET;
77bd7415 795
5f1a7c81 796 /* Get the current PCI slot state. This can take a long time,
2ac3990c 797 * sometimes over 300 seconds for certain systems.
29f8bf1b 798 */
9b3c76f0 799 rc = eeh_ops->wait_state(pe, MAX_WAIT_FOR_RECOVERY*1000);
eb594a47 800 if (rc < 0 || rc == EEH_STATE_NOT_SUPPORT) {
0dae2743 801 pr_warn("EEH: Permanent failure\n");
5f1a7c81
LV
802 goto hard_fail;
803 }
804
ede8ca26
LV
805 /* Since rtas may enable MMIO when posting the error log,
806 * don't post the error log until after all dev drivers
17213c3b
LV
807 * have been informed.
808 */
56ca4fde 809 pr_info("EEH: Collect temporary log\n");
9b3c76f0 810 eeh_slot_error_detail(pe, EEH_LOG_TEMP);
ede8ca26 811
77bd7415
LV
812 /* If all device drivers were EEH-unaware, then shut
813 * down all of the device drivers, and hope they
814 * go down willingly, without panicing the system.
815 */
18eb3b39 816 if (result == PCI_ERS_RESULT_NONE) {
56ca4fde 817 pr_info("EEH: Reset with hotplug activity\n");
5fd13460 818 rc = eeh_reset_device(pe, bus, NULL, false);
e0f90b64 819 if (rc) {
0dae2743
GS
820 pr_warn("%s: Unable to reset, err=%d\n",
821 __func__, rc);
b6495c0c 822 goto hard_fail;
e0f90b64 823 }
77bd7415
LV
824 }
825
6a1ca373
LV
826 /* If all devices reported they can proceed, then re-enable MMIO */
827 if (result == PCI_ERS_RESULT_CAN_RECOVER) {
56ca4fde 828 pr_info("EEH: Enable I/O for affected devices\n");
9b3c76f0 829 rc = eeh_pci_enable(pe, EEH_OPT_THAW_MMIO);
6a1ca373 830
fa1be476
LV
831 if (rc < 0)
832 goto hard_fail;
6a1ca373
LV
833 if (rc) {
834 result = PCI_ERS_RESULT_NEED_RESET;
835 } else {
56ca4fde 836 pr_info("EEH: Notify device drivers to resume I/O\n");
9b3c76f0 837 eeh_pe_dev_traverse(pe, eeh_report_mmio_enabled, &result);
6a1ca373 838 }
77bd7415
LV
839 }
840
6a1ca373 841 /* If all devices reported they can proceed, then re-enable DMA */
18eb3b39 842 if (result == PCI_ERS_RESULT_CAN_RECOVER) {
56ca4fde 843 pr_info("EEH: Enabled DMA for affected devices\n");
9b3c76f0 844 rc = eeh_pci_enable(pe, EEH_OPT_THAW_DMA);
6a1ca373 845
fa1be476
LV
846 if (rc < 0)
847 goto hard_fail;
35845a78 848 if (rc) {
6a1ca373 849 result = PCI_ERS_RESULT_NEED_RESET;
35845a78
GS
850 } else {
851 /*
852 * We didn't do PE reset for the case. The PE
853 * is still in frozen state. Clear it before
854 * resuming the PE.
855 */
856 eeh_pe_state_clear(pe, EEH_PE_ISOLATED);
d0e70341 857 result = PCI_ERS_RESULT_RECOVERED;
35845a78 858 }
6a1ca373
LV
859 }
860
861 /* If any device has a hard failure, then shut off everything. */
e0f90b64 862 if (result == PCI_ERS_RESULT_DISCONNECT) {
0dae2743 863 pr_warn("EEH: Device driver gave up\n");
6a1ca373 864 goto hard_fail;
e0f90b64 865 }
6a1ca373
LV
866
867 /* If any device called out for a reset, then reset the slot */
868 if (result == PCI_ERS_RESULT_NEED_RESET) {
56ca4fde 869 pr_info("EEH: Reset without hotplug activity\n");
5fd13460 870 rc = eeh_reset_device(pe, bus, &rmv_data, true);
e0f90b64 871 if (rc) {
0dae2743
GS
872 pr_warn("%s: Cannot reset, err=%d\n",
873 __func__, rc);
b6495c0c 874 goto hard_fail;
e0f90b64 875 }
56ca4fde
GS
876
877 pr_info("EEH: Notify device drivers "
878 "the completion of reset\n");
6a1ca373 879 result = PCI_ERS_RESULT_NONE;
9b3c76f0 880 eeh_pe_dev_traverse(pe, eeh_report_reset, &result);
77bd7415
LV
881 }
882
6a1ca373 883 /* All devices should claim they have recovered by now. */
90fdd613
LV
884 if ((result != PCI_ERS_RESULT_RECOVERED) &&
885 (result != PCI_ERS_RESULT_NONE)) {
0dae2743 886 pr_warn("EEH: Not recovered\n");
6a1ca373 887 goto hard_fail;
e0f90b64 888 }
6a1ca373 889
67086e32
WY
890 /*
891 * For those hot removed VFs, we should add back them after PF get
892 * recovered properly.
893 */
894 list_for_each_entry_safe(edev, tmp, &rmv_data.edev_list, rmv_list) {
895 eeh_add_virt_device(edev, NULL);
896 list_del(&edev->rmv_list);
897 }
898
77bd7415 899 /* Tell all device drivers that they can resume operations */
56ca4fde 900 pr_info("EEH: Notify device driver to resume\n");
9b3c76f0 901 eeh_pe_dev_traverse(pe, eeh_report_resume, NULL);
b6495c0c 902
796b9f5b 903 pr_info("EEH: Recovery successful.\n");
37fd8125 904 goto final;
a84f273c 905
c0b64978 906hard_fail:
b6495c0c
LV
907 /*
908 * About 90% of all real-life EEH failures in the field
909 * are due to poorly seated PCI cards. Only 10% or so are
910 * due to actual, failed cards.
911 */
1f52f176 912 pr_err("EEH: Unable to recover from failure from PHB#%x-PE#%x.\n"
9b3c76f0
GS
913 "Please try reseating or replacing it\n",
914 pe->phb->global_number, pe->addr);
b6495c0c 915
9b3c76f0 916 eeh_slot_error_detail(pe, EEH_LOG_PERM);
b6495c0c
LV
917
918 /* Notify all devices that they're about to go down. */
9b3c76f0 919 eeh_pe_dev_traverse(pe, eeh_report_failure, NULL);
b6495c0c 920
d2b0f6f7 921 /* Mark the PE to be removed permanently */
432227e9 922 eeh_pe_state_mark(pe, EEH_PE_REMOVED);
d2b0f6f7
GS
923
924 /*
925 * Shut down the device drivers for good. We mark
926 * all removed devices correctly to avoid access
927 * the their PCI config any more.
928 */
5b86ac9e
SB
929 if (pe->type & EEH_PE_VF) {
930 eeh_pe_dev_traverse(pe, eeh_rmv_device, NULL);
931 eeh_pe_dev_mode_mark(pe, EEH_DEV_REMOVED);
932 } else {
933 eeh_pe_state_clear(pe, EEH_PE_PRI_BUS);
934 eeh_pe_dev_mode_mark(pe, EEH_DEV_REMOVED);
d2b0f6f7 935
5b86ac9e 936 pci_lock_rescan_remove();
cd95f804 937 pci_hp_remove_devices(bus);
5b86ac9e
SB
938 pci_unlock_rescan_remove();
939 /* The passed PE should no longer be used */
940 return;
1c2042c8 941 }
37fd8125
SB
942final:
943 eeh_pe_state_clear(pe, EEH_PE_RECOVERING);
77bd7415 944}
8a6b1bc7 945
c0b64978
RC
946/**
947 * eeh_handle_special_event - Handle EEH events without a specific failing PE
948 *
949 * Called when an EEH event is detected but can't be narrowed down to a
950 * specific PE. Iterates through possible failures and handles them as
951 * necessary.
952 */
68701780 953void eeh_handle_special_event(void)
8a6b1bc7
GS
954{
955 struct eeh_pe *pe, *phb_pe;
956 struct pci_bus *bus;
7e4e7867 957 struct pci_controller *hose;
8a6b1bc7 958 unsigned long flags;
7e4e7867 959 int rc;
8a6b1bc7 960
8a6b1bc7 961
7e4e7867
GS
962 do {
963 rc = eeh_ops->next_error(&pe);
964
965 switch (rc) {
966 case EEH_NEXT_ERR_DEAD_IOC:
967 /* Mark all PHBs in dead state */
968 eeh_serialize_lock(&flags);
969
970 /* Purge all events */
5c7a35e3 971 eeh_remove_event(NULL, true);
7e4e7867
GS
972
973 list_for_each_entry(hose, &hose_list, list_node) {
974 phb_pe = eeh_phb_pe_get(hose);
975 if (!phb_pe) continue;
976
9e049375 977 eeh_pe_state_mark(phb_pe, EEH_PE_ISOLATED);
7e4e7867
GS
978 }
979
980 eeh_serialize_unlock(flags);
981
982 break;
983 case EEH_NEXT_ERR_FROZEN_PE:
984 case EEH_NEXT_ERR_FENCED_PHB:
985 case EEH_NEXT_ERR_DEAD_PHB:
986 /* Mark the PE in fenced state */
987 eeh_serialize_lock(&flags);
988
989 /* Purge all events of the PHB */
5c7a35e3 990 eeh_remove_event(pe, true);
7e4e7867
GS
991
992 if (rc == EEH_NEXT_ERR_DEAD_PHB)
9e049375 993 eeh_pe_state_mark(pe, EEH_PE_ISOLATED);
7e4e7867
GS
994 else
995 eeh_pe_state_mark(pe,
996 EEH_PE_ISOLATED | EEH_PE_RECOVERING);
997
998 eeh_serialize_unlock(flags);
999
1000 break;
1001 case EEH_NEXT_ERR_NONE:
1002 return;
1003 default:
1004 pr_warn("%s: Invalid value %d from next_error()\n",
1005 __func__, rc);
1006 return;
8a6b1bc7 1007 }
8a6b1bc7 1008
7e4e7867
GS
1009 /*
1010 * For fenced PHB and frozen PE, it's handled as normal
1011 * event. We have to remove the affected PHBs for dead
1012 * PHB and IOC
1013 */
1014 if (rc == EEH_NEXT_ERR_FROZEN_PE ||
1015 rc == EEH_NEXT_ERR_FENCED_PHB) {
37fd8125 1016 eeh_handle_normal_event(pe);
7e4e7867 1017 } else {
1b17366d 1018 pci_lock_rescan_remove();
7e4e7867
GS
1019 list_for_each_entry(hose, &hose_list, list_node) {
1020 phb_pe = eeh_phb_pe_get(hose);
1021 if (!phb_pe ||
9e049375
GS
1022 !(phb_pe->state & EEH_PE_ISOLATED) ||
1023 (phb_pe->state & EEH_PE_RECOVERING))
7e4e7867
GS
1024 continue;
1025
1026 /* Notify all devices to be down */
05ba75f8 1027 eeh_pe_state_clear(pe, EEH_PE_PRI_BUS);
af2e3a00
RC
1028 eeh_pe_dev_traverse(pe,
1029 eeh_report_failure, NULL);
7e4e7867 1030 bus = eeh_pe_bus_get(phb_pe);
04fec21c
RC
1031 if (!bus) {
1032 pr_err("%s: Cannot find PCI bus for "
1f52f176 1033 "PHB#%x-PE#%x\n",
04fec21c
RC
1034 __func__,
1035 pe->phb->global_number,
1036 pe->addr);
1037 break;
1038 }
bd251b89 1039 pci_hp_remove_devices(bus);
7e4e7867 1040 }
1b17366d 1041 pci_unlock_rescan_remove();
8a6b1bc7 1042 }
7e4e7867
GS
1043
1044 /*
1045 * If we have detected dead IOC, we needn't proceed
1046 * any more since all PHBs would have been removed
1047 */
1048 if (rc == EEH_NEXT_ERR_DEAD_IOC)
1049 break;
1050 } while (rc != EEH_NEXT_ERR_NONE);
8a6b1bc7 1051}