Commit | Line | Data |
---|---|---|
0ef95b41 MS |
1 | /* |
2 | * OPAL hypervisor Maintenance interrupt handling support in PowerNV. | |
3 | * | |
4 | * This program is free software; you can redistribute it and/or modify | |
5 | * it under the terms of the GNU General Public License as published by | |
6 | * the Free Software Foundation; either version 2 of the License, or | |
7 | * (at your option) any later version. | |
8 | * | |
9 | * This program is distributed in the hope that it will be useful, | |
10 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | |
11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
12 | * GNU General Public License for more details. | |
13 | * | |
14 | * You should have received a copy of the GNU General Public License | |
15 | * along with this program; If not, see <http://www.gnu.org/licenses/>. | |
16 | * | |
17 | * Copyright 2014 IBM Corporation | |
18 | * Author: Mahesh Salgaonkar <mahesh@linux.vnet.ibm.com> | |
19 | */ | |
20 | ||
21 | #undef DEBUG | |
22 | ||
23 | #include <linux/kernel.h> | |
24 | #include <linux/init.h> | |
25 | #include <linux/of.h> | |
26 | #include <linux/mm.h> | |
27 | #include <linux/slab.h> | |
28 | ||
29 | #include <asm/opal.h> | |
30 | #include <asm/cputable.h> | |
c1c8a92f | 31 | #include <asm/machdep.h> |
0ef95b41 MS |
32 | |
33 | static int opal_hmi_handler_nb_init; | |
34 | struct OpalHmiEvtNode { | |
35 | struct list_head list; | |
36 | struct OpalHMIEvent hmi_evt; | |
37 | }; | |
c33e11d0 MS |
38 | |
/*
 * One row of a checkstop decode table: a reason bit mask together with
 * the text printed when that mask matches the event's xstop_reason.
 */
struct xstop_reason {
	uint32_t xstop_reason;		/* mask tested against the event's reason word */
	const char *unit_failed;	/* text printed inside "[Unit: ...]" */
	const char *description;	/* text printed after the unit */
};
44 | ||
0ef95b41 MS |
/* HMI events waiting to be logged by hmi_event_handler(). */
static LIST_HEAD(opal_hmi_evt_list);

/* Guards every insertion into and removal from opal_hmi_evt_list. */
static DEFINE_SPINLOCK(opal_hmi_evt_lock);
47 | ||
c33e11d0 MS |
48 | static void print_core_checkstop_reason(const char *level, |
49 | struct OpalHMIEvent *hmi_evt) | |
50 | { | |
51 | int i; | |
52 | static const struct xstop_reason xstop_reason[] = { | |
53 | { CORE_CHECKSTOP_IFU_REGFILE, "IFU", | |
54 | "RegFile core check stop" }, | |
55 | { CORE_CHECKSTOP_IFU_LOGIC, "IFU", "Logic core check stop" }, | |
56 | { CORE_CHECKSTOP_PC_DURING_RECOV, "PC", | |
57 | "Core checkstop during recovery" }, | |
58 | { CORE_CHECKSTOP_ISU_REGFILE, "ISU", | |
59 | "RegFile core check stop (mapper error)" }, | |
60 | { CORE_CHECKSTOP_ISU_LOGIC, "ISU", "Logic core check stop" }, | |
61 | { CORE_CHECKSTOP_FXU_LOGIC, "FXU", "Logic core check stop" }, | |
62 | { CORE_CHECKSTOP_VSU_LOGIC, "VSU", "Logic core check stop" }, | |
63 | { CORE_CHECKSTOP_PC_RECOV_IN_MAINT_MODE, "PC", | |
64 | "Recovery in maintenance mode" }, | |
65 | { CORE_CHECKSTOP_LSU_REGFILE, "LSU", | |
66 | "RegFile core check stop" }, | |
67 | { CORE_CHECKSTOP_PC_FWD_PROGRESS, "PC", | |
68 | "Forward Progress Error" }, | |
69 | { CORE_CHECKSTOP_LSU_LOGIC, "LSU", "Logic core check stop" }, | |
70 | { CORE_CHECKSTOP_PC_LOGIC, "PC", "Logic core check stop" }, | |
71 | { CORE_CHECKSTOP_PC_HYP_RESOURCE, "PC", | |
72 | "Hypervisor Resource error - core check stop" }, | |
73 | { CORE_CHECKSTOP_PC_HANG_RECOV_FAILED, "PC", | |
74 | "Hang Recovery Failed (core check stop)" }, | |
75 | { CORE_CHECKSTOP_PC_AMBI_HANG_DETECTED, "PC", | |
76 | "Ambiguous Hang Detected (unknown source)" }, | |
77 | { CORE_CHECKSTOP_PC_DEBUG_TRIG_ERR_INJ, "PC", | |
78 | "Debug Trigger Error inject" }, | |
79 | { CORE_CHECKSTOP_PC_SPRD_HYP_ERR_INJ, "PC", | |
80 | "Hypervisor check stop via SPRC/SPRD" }, | |
81 | }; | |
82 | ||
83 | /* Validity check */ | |
84 | if (!hmi_evt->u.xstop_error.xstop_reason) { | |
85 | printk("%s Unknown Core check stop.\n", level); | |
86 | return; | |
87 | } | |
88 | ||
89 | printk("%s CPU PIR: %08x\n", level, | |
90 | be32_to_cpu(hmi_evt->u.xstop_error.u.pir)); | |
91 | for (i = 0; i < ARRAY_SIZE(xstop_reason); i++) | |
92 | if (be32_to_cpu(hmi_evt->u.xstop_error.xstop_reason) & | |
93 | xstop_reason[i].xstop_reason) | |
94 | printk("%s [Unit: %-3s] %s\n", level, | |
95 | xstop_reason[i].unit_failed, | |
96 | xstop_reason[i].description); | |
97 | } | |
98 | ||
99 | static void print_nx_checkstop_reason(const char *level, | |
100 | struct OpalHMIEvent *hmi_evt) | |
101 | { | |
102 | int i; | |
103 | static const struct xstop_reason xstop_reason[] = { | |
104 | { NX_CHECKSTOP_SHM_INVAL_STATE_ERR, "DMA & Engine", | |
105 | "SHM invalid state error" }, | |
106 | { NX_CHECKSTOP_DMA_INVAL_STATE_ERR_1, "DMA & Engine", | |
107 | "DMA invalid state error bit 15" }, | |
108 | { NX_CHECKSTOP_DMA_INVAL_STATE_ERR_2, "DMA & Engine", | |
109 | "DMA invalid state error bit 16" }, | |
110 | { NX_CHECKSTOP_DMA_CH0_INVAL_STATE_ERR, "DMA & Engine", | |
111 | "Channel 0 invalid state error" }, | |
112 | { NX_CHECKSTOP_DMA_CH1_INVAL_STATE_ERR, "DMA & Engine", | |
113 | "Channel 1 invalid state error" }, | |
114 | { NX_CHECKSTOP_DMA_CH2_INVAL_STATE_ERR, "DMA & Engine", | |
115 | "Channel 2 invalid state error" }, | |
116 | { NX_CHECKSTOP_DMA_CH3_INVAL_STATE_ERR, "DMA & Engine", | |
117 | "Channel 3 invalid state error" }, | |
118 | { NX_CHECKSTOP_DMA_CH4_INVAL_STATE_ERR, "DMA & Engine", | |
119 | "Channel 4 invalid state error" }, | |
120 | { NX_CHECKSTOP_DMA_CH5_INVAL_STATE_ERR, "DMA & Engine", | |
121 | "Channel 5 invalid state error" }, | |
122 | { NX_CHECKSTOP_DMA_CH6_INVAL_STATE_ERR, "DMA & Engine", | |
123 | "Channel 6 invalid state error" }, | |
124 | { NX_CHECKSTOP_DMA_CH7_INVAL_STATE_ERR, "DMA & Engine", | |
125 | "Channel 7 invalid state error" }, | |
126 | { NX_CHECKSTOP_DMA_CRB_UE, "DMA & Engine", | |
127 | "UE error on CRB(CSB address, CCB)" }, | |
128 | { NX_CHECKSTOP_DMA_CRB_SUE, "DMA & Engine", | |
129 | "SUE error on CRB(CSB address, CCB)" }, | |
130 | { NX_CHECKSTOP_PBI_ISN_UE, "PowerBus Interface", | |
131 | "CRB Kill ISN received while holding ISN with UE error" }, | |
132 | }; | |
133 | ||
134 | /* Validity check */ | |
135 | if (!hmi_evt->u.xstop_error.xstop_reason) { | |
136 | printk("%s Unknown NX check stop.\n", level); | |
137 | return; | |
138 | } | |
139 | ||
140 | printk("%s NX checkstop on CHIP ID: %x\n", level, | |
141 | be32_to_cpu(hmi_evt->u.xstop_error.u.chip_id)); | |
142 | for (i = 0; i < ARRAY_SIZE(xstop_reason); i++) | |
143 | if (be32_to_cpu(hmi_evt->u.xstop_error.xstop_reason) & | |
144 | xstop_reason[i].xstop_reason) | |
145 | printk("%s [Unit: %-3s] %s\n", level, | |
146 | xstop_reason[i].unit_failed, | |
147 | xstop_reason[i].description); | |
148 | } | |
149 | ||
150 | static void print_checkstop_reason(const char *level, | |
151 | struct OpalHMIEvent *hmi_evt) | |
152 | { | |
f8a25db4 RC |
153 | uint8_t type = hmi_evt->u.xstop_error.xstop_type; |
154 | switch (type) { | |
c33e11d0 MS |
155 | case CHECKSTOP_TYPE_CORE: |
156 | print_core_checkstop_reason(level, hmi_evt); | |
157 | break; | |
158 | case CHECKSTOP_TYPE_NX: | |
159 | print_nx_checkstop_reason(level, hmi_evt); | |
160 | break; | |
f8a25db4 RC |
161 | default: |
162 | printk("%s Unknown Malfunction Alert of type %d\n", | |
163 | level, type); | |
c33e11d0 MS |
164 | break; |
165 | } | |
166 | } | |
167 | ||
0ef95b41 MS |
168 | static void print_hmi_event_info(struct OpalHMIEvent *hmi_evt) |
169 | { | |
170 | const char *level, *sevstr, *error_info; | |
171 | static const char *hmi_error_types[] = { | |
172 | "Malfunction Alert", | |
173 | "Processor Recovery done", | |
174 | "Processor recovery occurred again", | |
175 | "Processor recovery occurred for masked error", | |
176 | "Timer facility experienced an error", | |
177 | "TFMR SPR is corrupted", | |
178 | "UPS (Uniterrupted Power System) Overflow indication", | |
179 | "An XSCOM operation failure", | |
180 | "An XSCOM operation completed", | |
181 | "SCOM has set a reserved FIR bit to cause recovery", | |
182 | "Debug trigger has set a reserved FIR bit to cause recovery", | |
183 | "A hypervisor resource error occurred" | |
184 | }; | |
185 | ||
186 | /* Print things out */ | |
6acbc5a1 | 187 | if (hmi_evt->version < OpalHMIEvt_V1) { |
0ef95b41 MS |
188 | pr_err("HMI Interrupt, Unknown event version %d !\n", |
189 | hmi_evt->version); | |
190 | return; | |
191 | } | |
192 | switch (hmi_evt->severity) { | |
193 | case OpalHMI_SEV_NO_ERROR: | |
194 | level = KERN_INFO; | |
195 | sevstr = "Harmless"; | |
196 | break; | |
197 | case OpalHMI_SEV_WARNING: | |
198 | level = KERN_WARNING; | |
199 | sevstr = ""; | |
200 | break; | |
201 | case OpalHMI_SEV_ERROR_SYNC: | |
202 | level = KERN_ERR; | |
203 | sevstr = "Severe"; | |
204 | break; | |
205 | case OpalHMI_SEV_FATAL: | |
206 | default: | |
207 | level = KERN_ERR; | |
208 | sevstr = "Fatal"; | |
209 | break; | |
210 | } | |
211 | ||
212 | printk("%s%s Hypervisor Maintenance interrupt [%s]\n", | |
213 | level, sevstr, | |
214 | hmi_evt->disposition == OpalHMI_DISPOSITION_RECOVERED ? | |
215 | "Recovered" : "Not recovered"); | |
216 | error_info = hmi_evt->type < ARRAY_SIZE(hmi_error_types) ? | |
217 | hmi_error_types[hmi_evt->type] | |
218 | : "Unknown"; | |
219 | printk("%s Error detail: %s\n", level, error_info); | |
220 | printk("%s HMER: %016llx\n", level, be64_to_cpu(hmi_evt->hmer)); | |
221 | if ((hmi_evt->type == OpalHMI_ERROR_TFAC) || | |
222 | (hmi_evt->type == OpalHMI_ERROR_TFMR_PARITY)) | |
223 | printk("%s TFMR: %016llx\n", level, | |
224 | be64_to_cpu(hmi_evt->tfmr)); | |
c33e11d0 MS |
225 | |
226 | if (hmi_evt->version < OpalHMIEvt_V2) | |
227 | return; | |
228 | ||
229 | /* OpalHMIEvt_V2 and above provides reason for malfunction alert. */ | |
230 | if (hmi_evt->type == OpalHMI_ERROR_MALFUNC_ALERT) | |
231 | print_checkstop_reason(level, hmi_evt); | |
0ef95b41 MS |
232 | } |
233 | ||
234 | static void hmi_event_handler(struct work_struct *work) | |
235 | { | |
236 | unsigned long flags; | |
237 | struct OpalHMIEvent *hmi_evt; | |
238 | struct OpalHmiEvtNode *msg_node; | |
239 | uint8_t disposition; | |
1852ae27 MS |
240 | struct opal_msg msg; |
241 | int unrecoverable = 0; | |
0ef95b41 MS |
242 | |
243 | spin_lock_irqsave(&opal_hmi_evt_lock, flags); | |
244 | while (!list_empty(&opal_hmi_evt_list)) { | |
245 | msg_node = list_entry(opal_hmi_evt_list.next, | |
246 | struct OpalHmiEvtNode, list); | |
247 | list_del(&msg_node->list); | |
248 | spin_unlock_irqrestore(&opal_hmi_evt_lock, flags); | |
249 | ||
250 | hmi_evt = (struct OpalHMIEvent *) &msg_node->hmi_evt; | |
251 | print_hmi_event_info(hmi_evt); | |
252 | disposition = hmi_evt->disposition; | |
253 | kfree(msg_node); | |
254 | ||
255 | /* | |
256 | * Check if HMI event has been recovered or not. If not | |
1852ae27 MS |
257 | * then kernel can't continue, we need to panic. |
258 | * But before we do that, display all the HMI event | |
259 | * available on the list and set unrecoverable flag to 1. | |
0ef95b41 MS |
260 | */ |
261 | if (disposition != OpalHMI_DISPOSITION_RECOVERED) | |
1852ae27 | 262 | unrecoverable = 1; |
0ef95b41 MS |
263 | |
264 | spin_lock_irqsave(&opal_hmi_evt_lock, flags); | |
265 | } | |
266 | spin_unlock_irqrestore(&opal_hmi_evt_lock, flags); | |
1852ae27 MS |
267 | |
268 | if (unrecoverable) { | |
62521ea6 MS |
269 | int ret; |
270 | ||
1852ae27 MS |
271 | /* Pull all HMI events from OPAL before we panic. */ |
272 | while (opal_get_msg(__pa(&msg), sizeof(msg)) == OPAL_SUCCESS) { | |
273 | u32 type; | |
274 | ||
275 | type = be32_to_cpu(msg.msg_type); | |
276 | ||
277 | /* skip if not HMI event */ | |
278 | if (type != OPAL_MSG_HMI_EVT) | |
279 | continue; | |
280 | ||
281 | /* HMI event info starts from param[0] */ | |
282 | hmi_evt = (struct OpalHMIEvent *)&msg.params[0]; | |
283 | print_hmi_event_info(hmi_evt); | |
284 | } | |
62521ea6 MS |
285 | |
286 | /* | |
287 | * Unrecoverable HMI exception. We need to inform BMC/OCC | |
288 | * about this error so that it can collect relevant data | |
289 | * for error analysis before rebooting. | |
290 | */ | |
291 | ret = opal_cec_reboot2(OPAL_REBOOT_PLATFORM_ERROR, | |
292 | "Unrecoverable HMI exception"); | |
293 | if (ret == OPAL_UNSUPPORTED) { | |
294 | pr_emerg("Reboot type %d not supported\n", | |
295 | OPAL_REBOOT_PLATFORM_ERROR); | |
296 | } | |
297 | ||
298 | /* | |
299 | * Fall through and panic if opal_cec_reboot2() returns | |
300 | * OPAL_UNSUPPORTED. | |
301 | */ | |
1852ae27 MS |
302 | panic("Unrecoverable HMI exception"); |
303 | } | |
0ef95b41 MS |
304 | } |
305 | ||
/* Work item that runs hmi_event_handler() to log queued HMI events. */
static DECLARE_WORK(hmi_event_work, hmi_event_handler);
307 | /* | |
308 | * opal_handle_hmi_event - notifier handler that queues up HMI events | |
309 | * to be preocessed later. | |
310 | */ | |
311 | static int opal_handle_hmi_event(struct notifier_block *nb, | |
312 | unsigned long msg_type, void *msg) | |
313 | { | |
314 | unsigned long flags; | |
315 | struct OpalHMIEvent *hmi_evt; | |
316 | struct opal_msg *hmi_msg = msg; | |
317 | struct OpalHmiEvtNode *msg_node; | |
318 | ||
319 | /* Sanity Checks */ | |
320 | if (msg_type != OPAL_MSG_HMI_EVT) | |
321 | return 0; | |
322 | ||
323 | /* HMI event info starts from param[0] */ | |
324 | hmi_evt = (struct OpalHMIEvent *)&hmi_msg->params[0]; | |
325 | ||
326 | /* Delay the logging of HMI events to workqueue. */ | |
327 | msg_node = kzalloc(sizeof(*msg_node), GFP_ATOMIC); | |
328 | if (!msg_node) { | |
329 | pr_err("HMI: out of memory, Opal message event not handled\n"); | |
330 | return -ENOMEM; | |
331 | } | |
332 | memcpy(&msg_node->hmi_evt, hmi_evt, sizeof(struct OpalHMIEvent)); | |
333 | ||
334 | spin_lock_irqsave(&opal_hmi_evt_lock, flags); | |
335 | list_add(&msg_node->list, &opal_hmi_evt_list); | |
336 | spin_unlock_irqrestore(&opal_hmi_evt_lock, flags); | |
337 | ||
338 | schedule_work(&hmi_event_work); | |
339 | return 0; | |
340 | } | |
341 | ||
342 | static struct notifier_block opal_hmi_handler_nb = { | |
343 | .notifier_call = opal_handle_hmi_event, | |
344 | .next = NULL, | |
345 | .priority = 0, | |
346 | }; | |
347 | ||
96e023e7 | 348 | int __init opal_hmi_handler_init(void) |
0ef95b41 MS |
349 | { |
350 | int ret; | |
351 | ||
352 | if (!opal_hmi_handler_nb_init) { | |
353 | ret = opal_message_notifier_register( | |
354 | OPAL_MSG_HMI_EVT, &opal_hmi_handler_nb); | |
355 | if (ret) { | |
356 | pr_err("%s: Can't register OPAL event notifier (%d)\n", | |
357 | __func__, ret); | |
358 | return ret; | |
359 | } | |
360 | opal_hmi_handler_nb_init = 1; | |
361 | } | |
362 | return 0; | |
363 | } |