Commit | Line | Data |
---|---|---|
172ca926 | 1 | /* |
172ca926 LV |
2 | * This program is free software; you can redistribute it and/or modify |
3 | * it under the terms of the GNU General Public License as published by | |
4 | * the Free Software Foundation; either version 2 of the License, or | |
5 | * (at your option) any later version. | |
6 | * | |
7 | * This program is distributed in the hope that it will be useful, | |
8 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | |
9 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
10 | * GNU General Public License for more details. | |
11 | * | |
12 | * You should have received a copy of the GNU General Public License | |
13 | * along with this program; if not, write to the Free Software | |
14 | * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA | |
15 | * | |
16 | * Copyright (c) 2005 Linas Vepstas <linas@linas.org> | |
17 | */ | |
18 | ||
ac325acd | 19 | #include <linux/delay.h> |
172ca926 | 20 | #include <linux/list.h> |
62fe91bb | 21 | #include <linux/sched.h> |
c8608558 | 22 | #include <linux/semaphore.h> |
172ca926 | 23 | #include <linux/pci.h> |
5a0e3ad6 | 24 | #include <linux/slab.h> |
ecf89e58 | 25 | #include <linux/kthread.h> |
172ca926 | 26 | #include <asm/eeh_event.h> |
77bd7415 | 27 | #include <asm/ppc-pci.h> |
172ca926 LV |
28 | |
29 | /** Overview: | |
30 | * EEH error states may be detected within exception handlers; | |
31 | * however, the recovery processing needs to occur asynchronously | |
32 | * in a normal kernel context and not an interrupt context. | |
33 | * This pair of routines creates an event and adds it to a list of | |
34 | * pending events, which a dedicated kernel thread drains to drive recovery. | |
35 | */ | |
36 | ||
34af946a | 37 | static DEFINE_SPINLOCK(eeh_eventlist_lock); |
c8608558 | 38 | static struct semaphore eeh_eventlist_sem; |
172ca926 | 39 | LIST_HEAD(eeh_eventlist); |
8c33fd11 | 40 | |
172ca926 | 41 | /** |
29f8bf1b | 42 | * eeh_event_handler - Dispatch EEH events. |
172ca926 | 43 | * @dummy - unused |
8c33fd11 LV |
44 | * |
45 | * The detection of a frozen slot can occur inside an interrupt, | |
46 | * where it can be hard to do anything about it. The goal of this | |
47 | * routine is to pull these detection events out of the context | |
48 | * of the interrupt handler, and re-dispatch them for processing | |
49 | * at a later time in a normal context. | |
172ca926 LV |
50 | */ |
51 | static int eeh_event_handler(void * dummy) | |
52 | { | |
53 | unsigned long flags; | |
40a7cd92 | 54 | struct eeh_event *event; |
120dc496 | 55 | struct eeh_pe *pe; |
172ca926 | 56 | |
c8608558 | 57 | while (!kthread_should_stop()) { |
5459ae14 GS |
58 | if (down_interruptible(&eeh_eventlist_sem)) |
59 | break; | |
c8608558 GS |
60 | |
61 | /* Fetch EEH event from the queue */ | |
62 | spin_lock_irqsave(&eeh_eventlist_lock, flags); | |
63 | event = NULL; | |
64 | if (!list_empty(&eeh_eventlist)) { | |
65 | event = list_entry(eeh_eventlist.next, | |
66 | struct eeh_event, list); | |
67 | list_del(&event->list); | |
68 | } | |
69 | spin_unlock_irqrestore(&eeh_eventlist_lock, flags); | |
70 | if (!event) | |
71 | continue; | |
72 | ||
73 | /* We might have event without binding PE */ | |
74 | pe = event->pe; | |
75 | if (pe) { | |
76 | eeh_pe_state_mark(pe, EEH_PE_RECOVERING); | |
0b5381a6 GS |
77 | if (pe->type & EEH_PE_PHB) |
78 | pr_info("EEH: Detected error on PHB#%d\n", | |
79 | pe->phb->global_number); | |
80 | else | |
81 | pr_info("EEH: Detected PCI bus error on " | |
82 | "PHB#%d-PE#%x\n", | |
83 | pe->phb->global_number, pe->addr); | |
c8608558 GS |
84 | eeh_handle_event(pe); |
85 | eeh_pe_state_clear(pe, EEH_PE_RECOVERING); | |
86 | } else { | |
87 | eeh_handle_event(NULL); | |
88 | } | |
89 | ||
90 | kfree(event); | |
172ca926 LV |
91 | } |
92 | ||
93 | return 0; | |
94 | } | |
95 | ||
96 | /** | |
c8608558 | 97 | * eeh_event_init - Start kernel thread to handle EEH events |
29f8bf1b GS |
98 | * |
99 | * This routine is called to start the kernel thread for processing | |
100 | * EEH event. | |
172ca926 | 101 | */ |
c8608558 | 102 | int eeh_event_init(void) |
172ca926 | 103 | { |
c8608558 GS |
104 | struct task_struct *t; |
105 | int ret = 0; | |
106 | ||
107 | /* Initialize semaphore */ | |
108 | sema_init(&eeh_eventlist_sem, 0); | |
109 | ||
110 | t = kthread_run(eeh_event_handler, NULL, "eehd"); | |
111 | if (IS_ERR(t)) { | |
112 | ret = PTR_ERR(t); | |
113 | pr_err("%s: Failed to start EEH daemon (%d)\n", | |
114 | __func__, ret); | |
115 | return ret; | |
116 | } | |
117 | ||
118 | return 0; | |
172ca926 LV |
119 | } |
120 | ||
121 | /** | |
29f8bf1b | 122 | * eeh_send_failure_event - Generate a PCI error event |
c533b46c | 123 | * @pe: EEH PE |
172ca926 LV |
124 | * |
125 | * This routine can be called within an interrupt context; | |
126 | * the actual event will be delivered in a normal context | |
127 | * (from a workqueue). | |
128 | */ | |
c533b46c | 129 | int eeh_send_failure_event(struct eeh_pe *pe) |
172ca926 LV |
130 | { |
131 | unsigned long flags; | |
132 | struct eeh_event *event; | |
172ca926 | 133 | |
c533b46c GS |
134 | event = kzalloc(sizeof(*event), GFP_ATOMIC); |
135 | if (!event) { | |
136 | pr_err("EEH: out of memory, event not handled\n"); | |
137 | return -ENOMEM; | |
138 | } | |
139 | event->pe = pe; | |
172ca926 LV |
140 | |
141 | /* We may or may not be called in an interrupt context */ | |
142 | spin_lock_irqsave(&eeh_eventlist_lock, flags); | |
143 | list_add(&event->list, &eeh_eventlist); | |
144 | spin_unlock_irqrestore(&eeh_eventlist_lock, flags); | |
145 | ||
c8608558 GS |
146 | /* For EEH deamon to knick in */ |
147 | up(&eeh_eventlist_sem); | |
172ca926 LV |
148 | |
149 | return 0; | |
150 | } | |
99866595 GS |
151 | |
152 | /** | |
153 | * eeh_remove_event - Remove EEH event from the queue | |
154 | * @pe: Event binding to the PE | |
5c7a35e3 | 155 | * @force: Event will be removed unconditionally |
99866595 GS |
156 | * |
157 | * On PowerNV platform, we might have subsequent coming events | |
158 | * is part of the former one. For that case, those subsequent | |
159 | * coming events are totally duplicated and unnecessary, thus | |
160 | * they should be removed. | |
161 | */ | |
5c7a35e3 | 162 | void eeh_remove_event(struct eeh_pe *pe, bool force) |
99866595 GS |
163 | { |
164 | unsigned long flags; | |
165 | struct eeh_event *event, *tmp; | |
166 | ||
5c7a35e3 GS |
167 | /* |
168 | * If we have NULL PE passed in, we have dead IOC | |
169 | * or we're sure we can report all existing errors | |
170 | * by the caller. | |
171 | * | |
172 | * With "force", the event with associated PE that | |
173 | * have been isolated, the event won't be removed | |
174 | * to avoid event lost. | |
175 | */ | |
99866595 GS |
176 | spin_lock_irqsave(&eeh_eventlist_lock, flags); |
177 | list_for_each_entry_safe(event, tmp, &eeh_eventlist, list) { | |
5c7a35e3 GS |
178 | if (!force && event->pe && |
179 | (event->pe->state & EEH_PE_ISOLATED)) | |
180 | continue; | |
181 | ||
99866595 GS |
182 | if (!pe) { |
183 | list_del(&event->list); | |
184 | kfree(event); | |
185 | } else if (pe->type & EEH_PE_PHB) { | |
186 | if (event->pe && event->pe->phb == pe->phb) { | |
187 | list_del(&event->list); | |
188 | kfree(event); | |
189 | } | |
190 | } else if (event->pe == pe) { | |
191 | list_del(&event->list); | |
192 | kfree(event); | |
193 | } | |
194 | } | |
195 | spin_unlock_irqrestore(&eeh_eventlist_lock, flags); | |
196 | } |