x86, mce, cmci: define MSR names and fields for new CMCI registers
[linux-2.6-block.git] arch/x86/kernel/cpu/mcheck/mce_64.c
/*
 * Machine check handler.
 * K8 parts Copyright 2002,2003 Andi Kleen, SuSE Labs.
 * Rest from unknown author(s).
 * 2004 Andi Kleen. Rewrote most of it.
 * Copyright 2008 Intel Corporation
 * Author: Andi Kleen
 */

#include <linux/init.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/smp_lock.h>
#include <linux/string.h>
#include <linux/rcupdate.h>
#include <linux/kallsyms.h>
#include <linux/sysdev.h>
#include <linux/miscdevice.h>
#include <linux/fs.h>
#include <linux/capability.h>
#include <linux/cpu.h>
#include <linux/percpu.h>
#include <linux/poll.h>
#include <linux/thread_info.h>
#include <linux/ctype.h>
#include <linux/kmod.h>
#include <linux/kdebug.h>
#include <linux/kobject.h>
#include <linux/sysfs.h>
#include <linux/ratelimit.h>
#include <asm/processor.h>
#include <asm/msr.h>
#include <asm/mce.h>
#include <asm/uaccess.h>
#include <asm/smp.h>
#include <asm/idle.h>

#define MISC_MCELOG_MINOR 227

atomic_t mce_entry;

static int mce_dont_init;

/*
 * Tolerant levels:
 * 0: always panic on uncorrected errors, log corrected errors
 * 1: panic or SIGBUS on uncorrected errors, log corrected errors
 * 2: SIGBUS or log uncorrected errors (if possible), log corrected errors
 * 3: never panic or SIGBUS, log all errors (for testing only)
 */
static int tolerant = 1;
static int banks;
static u64 *bank;
static unsigned long notify_user;
static int rip_msr;
static int mce_bootlog = -1;
static atomic_t mce_events;

static char trigger[128];
static char *trigger_argv[2] = { trigger, NULL };

static DECLARE_WAIT_QUEUE_HEAD(mce_wait);

/* MCA banks polled by the periodic polling timer for corrected events */
DEFINE_PER_CPU(mce_banks_t, mce_poll_banks) = {
	[0 ... BITS_TO_LONGS(MAX_NR_BANKS)-1] = ~0UL
};

/* Do initial setup of a struct mce */
void mce_setup(struct mce *m)
{
	memset(m, 0, sizeof(struct mce));
	m->cpu = smp_processor_id();
	rdtscll(m->tsc);
}

/*
 * Lockless MCE logging infrastructure.
 * This avoids deadlocks on printk locks without having to break locks. It
 * also keeps MCEs separate from kernel messages to avoid bogus bug reports.
 */

static struct mce_log mcelog = {
	MCE_LOG_SIGNATURE,
	MCE_LOG_LEN,
};

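/*
 * Writers race for a slot with cmpxchg on mcelog.next; an entry only
 * becomes visible to readers once its 'finished' flag is set after the
 * copy, with write barriers ordering the two stores.
 */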
void mce_log(struct mce *mce)
{
	unsigned next, entry;
	atomic_inc(&mce_events);
	mce->finished = 0;
	wmb();
	for (;;) {
		entry = rcu_dereference(mcelog.next);
		for (;;) {
			/*
			 * When the buffer fills up, discard new entries.
			 * Assume that the earlier errors are the more
			 * interesting ones.
			 */
			if (entry >= MCE_LOG_LEN) {
				set_bit(MCE_OVERFLOW, (unsigned long *)&mcelog.flags);
				return;
			}
			/* Old left over entry. Skip. */
			if (mcelog.entry[entry].finished) {
				entry++;
				continue;
			}
			break;
		}
		smp_rmb();
		next = entry + 1;
		if (cmpxchg(&mcelog.next, entry, next) == entry)
			break;
	}
	memcpy(mcelog.entry + entry, mce, sizeof(struct mce));
	wmb();
	mcelog.entry[entry].finished = 1;
	wmb();

	set_bit(0, &notify_user);
}

static void print_mce(struct mce *m)
{
	printk(KERN_EMERG "\n"
	       KERN_EMERG "HARDWARE ERROR\n"
	       KERN_EMERG
	       "CPU %d: Machine Check Exception: %16Lx Bank %d: %016Lx\n",
	       m->cpu, m->mcgstatus, m->bank, m->status);
	if (m->ip) {
		printk(KERN_EMERG "RIP%s %02x:<%016Lx> ",
		       !(m->mcgstatus & MCG_STATUS_EIPV) ? " !INEXACT!" : "",
		       m->cs, m->ip);
		if (m->cs == __KERNEL_CS)
			print_symbol("{%s}", m->ip);
		printk("\n");
	}
	printk(KERN_EMERG "TSC %llx ", m->tsc);
	if (m->addr)
		printk("ADDR %llx ", m->addr);
	if (m->misc)
		printk("MISC %llx ", m->misc);
	printk("\n");
	printk(KERN_EMERG "This is not a software problem!\n");
	printk(KERN_EMERG "Run through mcelog --ascii to decode "
	       "and contact your hardware vendor\n");
}

static void mce_panic(char *msg, struct mce *backup, unsigned long start)
{
	int i;

	oops_begin();
	for (i = 0; i < MCE_LOG_LEN; i++) {
		unsigned long tsc = mcelog.entry[i].tsc;

		if (time_before(tsc, start))
			continue;
		print_mce(&mcelog.entry[i]);
		if (backup && mcelog.entry[i].tsc == backup->tsc)
			backup = NULL;
	}
	if (backup)
		print_mce(backup);
	panic(msg);
}

static int mce_available(struct cpuinfo_x86 *c)
{
	if (mce_dont_init)
		return 0;
	return cpu_has(c, X86_FEATURE_MCE) && cpu_has(c, X86_FEATURE_MCA);
}

static inline void mce_get_rip(struct mce *m, struct pt_regs *regs)
{
	if (regs && (m->mcgstatus & MCG_STATUS_RIPV)) {
		m->ip = regs->ip;
		m->cs = regs->cs;
	} else {
		m->ip = 0;
		m->cs = 0;
	}
	if (rip_msr) {
		/* Assume the RIP in the MSR is exact. Is this true? */
		m->mcgstatus |= MCG_STATUS_EIPV;
		rdmsrl(rip_msr, m->ip);
		m->cs = 0;
	}
}

/*
 * Poll for corrected events or events that happened before reset.
 * Those are just logged through /dev/mcelog.
 *
 * This is executed in standard interrupt context.
 */
void machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
{
	struct mce m;
	int i;

	mce_setup(&m);

	rdmsrl(MSR_IA32_MCG_STATUS, m.mcgstatus);
	for (i = 0; i < banks; i++) {
		if (!bank[i] || !test_bit(i, *b))
			continue;

		m.misc = 0;
		m.addr = 0;
		m.bank = i;
		m.tsc = 0;

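		/*
		 * Each MCA bank has four consecutive MSRs (CTL, STATUS,
		 * ADDR, MISC), hence the i*4 stride from MSR_IA32_MC0_*.
		 */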
		barrier();
		rdmsrl(MSR_IA32_MC0_STATUS + i*4, m.status);
		if (!(m.status & MCI_STATUS_VAL))
			continue;

		/*
		 * Uncorrected events are handled by the exception handler
		 * when it is enabled. But when the exception is disabled log
		 * everything.
		 *
		 * TBD do the same check for MCI_STATUS_EN here?
		 */
		if ((m.status & MCI_STATUS_UC) && !(flags & MCP_UC))
			continue;

		if (m.status & MCI_STATUS_MISCV)
			rdmsrl(MSR_IA32_MC0_MISC + i*4, m.misc);
		if (m.status & MCI_STATUS_ADDRV)
			rdmsrl(MSR_IA32_MC0_ADDR + i*4, m.addr);

		if (!(flags & MCP_TIMESTAMP))
			m.tsc = 0;
		/*
		 * Don't get the IP here because it's unlikely to
		 * have anything to do with the actual error location.
		 */

		mce_log(&m);
		add_taint(TAINT_MACHINE_CHECK);

		/*
		 * Clear state for this bank.
		 */
		wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0);
	}

	/*
	 * Don't clear MCG_STATUS here because it's only defined for
	 * exceptions.
	 */
}

/*
 * The actual machine check handler. This only handles real
 * exceptions when something got corrupted coming in through int 18.
 *
 * This is executed in NMI context not subject to normal locking rules. This
 * implies that most kernel services cannot be safely used. Don't even
 * think about putting a printk in there!
 */
void do_machine_check(struct pt_regs *regs, long error_code)
{
	struct mce m, panicm;
	u64 mcestart = 0;
	int i;
	int panicm_found = 0;
	/*
	 * If no_way_out gets set, there is no safe way to recover from this
	 * MCE. If tolerant is cranked up, we'll try anyway.
	 */
	int no_way_out = 0;
	/*
	 * If kill_it gets set, there might be a way to recover from this
	 * error.
	 */
	int kill_it = 0;
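	/* Banks whose status we will need to clear on the way out */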
	DECLARE_BITMAP(toclear, MAX_NR_BANKS);

	atomic_inc(&mce_entry);

	if (notify_die(DIE_NMI, "machine check", regs, error_code,
		       18, SIGKILL) == NOTIFY_STOP)
		goto out2;
	if (!banks)
		goto out2;

	mce_setup(&m);

	rdmsrl(MSR_IA32_MCG_STATUS, m.mcgstatus);
	/* if the restart IP is not valid, we're done for */
	if (!(m.mcgstatus & MCG_STATUS_RIPV))
		no_way_out = 1;

	rdtscll(mcestart);
	barrier();

	for (i = 0; i < banks; i++) {
		__clear_bit(i, toclear);
		if (!bank[i])
			continue;

		m.misc = 0;
		m.addr = 0;
		m.bank = i;

		rdmsrl(MSR_IA32_MC0_STATUS + i*4, m.status);
		if ((m.status & MCI_STATUS_VAL) == 0)
			continue;

		/*
		 * Corrected errors are handled by machine_check_poll().
		 * Leave them alone.
		 */
		if ((m.status & MCI_STATUS_UC) == 0)
			continue;

		/*
		 * Set taint even when machine check was not enabled.
		 */
		add_taint(TAINT_MACHINE_CHECK);

		__set_bit(i, toclear);

		if (m.status & MCI_STATUS_EN) {
			/* if PCC was set, there's no way out */
			no_way_out |= !!(m.status & MCI_STATUS_PCC);
			/*
			 * If this error was uncorrectable and there was
			 * an overflow, we're in trouble. If no overflow,
			 * we might get away with just killing a task.
			 */
			if (m.status & MCI_STATUS_UC) {
				if (tolerant < 1 || m.status & MCI_STATUS_OVER)
					no_way_out = 1;
				kill_it = 1;
			}
		} else {
			/*
			 * Machine check event was not enabled. Clear, but
			 * ignore.
			 */
			continue;
		}

		if (m.status & MCI_STATUS_MISCV)
			rdmsrl(MSR_IA32_MC0_MISC + i*4, m.misc);
		if (m.status & MCI_STATUS_ADDRV)
			rdmsrl(MSR_IA32_MC0_ADDR + i*4, m.addr);

		mce_get_rip(&m, regs);
		mce_log(&m);

		/* Did this bank cause the exception? */
		/* Assume that the bank with uncorrectable errors did it,
		   and that there is only a single one. */
		if ((m.status & MCI_STATUS_UC) && (m.status & MCI_STATUS_EN)) {
			panicm = m;
			panicm_found = 1;
		}
	}

	/* If we didn't find an uncorrectable error, pick
	   the last one (shouldn't happen, just being safe). */
	if (!panicm_found)
		panicm = m;

	/*
	 * If we have decided that we just CAN'T continue, and the user
	 * has not set tolerant to an insane level, give up and die.
	 */
	if (no_way_out && tolerant < 3)
		mce_panic("Machine check", &panicm, mcestart);

	/*
	 * If the error seems to be unrecoverable, something should be
	 * done. Try to kill as little as possible. If we can kill just
	 * one task, do that. If the user has set the tolerance very
	 * high, don't try to do anything at all.
	 */
	if (kill_it && tolerant < 3) {
		int user_space = 0;

		/*
		 * If the EIPV bit is set, it means the saved IP is the
		 * instruction which caused the MCE.
		 */
		if (m.mcgstatus & MCG_STATUS_EIPV)
			user_space = panicm.ip && (panicm.cs & 3);

		/*
		 * If we know that the error was in user space, send a
		 * SIGBUS. Otherwise, panic if tolerance is low.
		 *
		 * force_sig() takes an awful lot of locks and has a slight
		 * risk of deadlocking.
		 */
		if (user_space) {
			force_sig(SIGBUS, current);
		} else if (panic_on_oops || tolerant < 2) {
			mce_panic("Uncorrected machine check",
				  &panicm, mcestart);
		}
	}

	/* notify userspace ASAP */
	set_thread_flag(TIF_MCE_NOTIFY);

	/* the last thing we do is clear state */
	for (i = 0; i < banks; i++) {
		if (test_bit(i, toclear))
			wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0);
	}
	wrmsrl(MSR_IA32_MCG_STATUS, 0);
 out2:
	atomic_dec(&mce_entry);
}

#ifdef CONFIG_X86_MCE_INTEL
/**
 * mce_log_therm_throt_event - Logs the thermal throttling event to mcelog
 * @status: Event status information
 *
 * This function should be called by the thermal interrupt after the
 * event has been processed and the decision was made to log the event
 * further.
 *
 * The status parameter will be saved to the 'status' field of 'struct mce'
 * and historically has been the register value of the
 * MSR_IA32_THERMAL_STATUS (Intel) msr.
 */
void mce_log_therm_throt_event(__u64 status)
{
	struct mce m;

	mce_setup(&m);
	m.bank = MCE_THERMAL_BANK;
	m.status = status;
	mce_log(&m);
}
#endif /* CONFIG_X86_MCE_INTEL */

/*
 * Periodic polling timer for "silent" machine check errors. If the
 * poller finds an MCE, poll 2x faster. When the poller finds no more
 * errors, poll 2x slower (up to check_interval seconds).
 */

static int check_interval = 5 * 60; /* 5 minutes */
static int next_interval; /* in jiffies */
static void mcheck_timer(unsigned long);
static DEFINE_PER_CPU(struct timer_list, mce_timer);

static void mcheck_timer(unsigned long data)
{
	struct timer_list *t = &per_cpu(mce_timer, data);

	WARN_ON(smp_processor_id() != data);

	if (mce_available(&current_cpu_data))
		machine_check_poll(MCP_TIMESTAMP,
				   &__get_cpu_var(mce_poll_banks));

	/*
	 * Alert userspace if needed. If we logged an MCE, reduce the
	 * polling interval, otherwise increase the polling interval.
	 */
	if (mce_notify_user()) {
		next_interval = max(next_interval/2, HZ/100);
	} else {
		next_interval = min(next_interval * 2,
				    (int)round_jiffies_relative(check_interval*HZ));
	}

	t->expires = jiffies + next_interval;
	add_timer(t);
}

static void mce_do_trigger(struct work_struct *work)
{
	call_usermodehelper(trigger, trigger_argv, NULL, UMH_NO_WAIT);
}

static DECLARE_WORK(mce_trigger_work, mce_do_trigger);

/*
 * Notify the user(s) about new machine check events.
 * Can be called from interrupt context, but not from machine check/NMI
 * context.
 */
int mce_notify_user(void)
{
	/* Not more than two messages every minute */
	static DEFINE_RATELIMIT_STATE(ratelimit, 60*HZ, 2);

	clear_thread_flag(TIF_MCE_NOTIFY);
	if (test_and_clear_bit(0, &notify_user)) {
		wake_up_interruptible(&mce_wait);

		/*
		 * There is no risk of missing notifications because
		 * work_pending is always cleared before the function is
		 * executed.
		 */
		if (trigger[0] && !work_pending(&mce_trigger_work))
			schedule_work(&mce_trigger_work);

		if (__ratelimit(&ratelimit))
			printk(KERN_INFO "Machine check events logged\n");

		return 1;
	}
	return 0;
}

/* see if the idle task needs to notify userspace */
static int
mce_idle_callback(struct notifier_block *nfb, unsigned long action, void *junk)
{
	/* IDLE_END should be safe - interrupts are back on */
	if (action == IDLE_END && test_thread_flag(TIF_MCE_NOTIFY))
		mce_notify_user();

	return NOTIFY_OK;
}

static struct notifier_block mce_idle_notifier = {
	.notifier_call = mce_idle_callback,
};

static __init int periodic_mcheck_init(void)
{
	idle_notifier_register(&mce_idle_notifier);
	return 0;
}
__initcall(periodic_mcheck_init);

/*
 * Initialize Machine Checks for a CPU.
 */
static int mce_cap_init(void)
{
	u64 cap;
	unsigned b;

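	/* MCG_CAP[7:0] holds the number of implemented error-reporting banks */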
	rdmsrl(MSR_IA32_MCG_CAP, cap);
	b = cap & 0xff;
	if (b > MAX_NR_BANKS) {
		printk(KERN_WARNING
		       "MCE: Using only %u machine check banks out of %u\n",
		       MAX_NR_BANKS, b);
		b = MAX_NR_BANKS;
	}

	/* Don't support asymmetric configurations today */
	WARN_ON(banks != 0 && b != banks);
	banks = b;
	if (!bank) {
		bank = kmalloc(banks * sizeof(u64), GFP_KERNEL);
		if (!bank)
			return -ENOMEM;
		memset(bank, 0xff, banks * sizeof(u64));
	}

	/* Use accurate RIP reporting if available. */
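	/* MCG_CAP bit 9 is MCG_EXT_P; bits 23:16 give the extended register
	   count, and MCG_EIP is only defined when at least 9 are present */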
	if ((cap & (1<<9)) && ((cap >> 16) & 0xff) >= 9)
		rip_msr = MSR_IA32_MCG_EIP;

	return 0;
}

static void mce_init(void *dummy)
{
	u64 cap;
	int i;
	mce_banks_t all_banks;

	/*
	 * Log the machine checks left over from the previous reset.
	 */
	bitmap_fill(all_banks, MAX_NR_BANKS);
	machine_check_poll(MCP_UC, &all_banks);

	set_in_cr4(X86_CR4_MCE);

	rdmsrl(MSR_IA32_MCG_CAP, cap);
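	/* MCG_CTL_P (MCG_CAP bit 8) means MCG_CTL is present; writing all
	   ones enables every reporting control the CPU implements */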
	if (cap & MCG_CTL_P)
		wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff);

	for (i = 0; i < banks; i++) {
		wrmsrl(MSR_IA32_MC0_CTL+4*i, bank[i]);
		wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0);
	}
}

/* Add per CPU specific workarounds here */
static void mce_cpu_quirks(struct cpuinfo_x86 *c)
{
	/* This should be disabled by the BIOS, but isn't always */
	if (c->x86_vendor == X86_VENDOR_AMD) {
		if (c->x86 == 15 && banks > 4)
			/* disable GART TBL walk error reporting, which trips off
			   incorrectly with the IOMMU & 3ware & Cerberus. */
			clear_bit(10, (unsigned long *)&bank[4]);
		if (c->x86 <= 17 && mce_bootlog < 0)
			/* Lots of broken BIOSes around that don't clear them
			   by default and leave crap in there. Don't log. */
			mce_bootlog = 0;
	}
}

static void mce_cpu_features(struct cpuinfo_x86 *c)
{
	switch (c->x86_vendor) {
	case X86_VENDOR_INTEL:
		mce_intel_feature_init(c);
		break;
	case X86_VENDOR_AMD:
		mce_amd_feature_init(c);
		break;
	default:
		break;
	}
}

static void mce_init_timer(void)
{
	struct timer_list *t = &__get_cpu_var(mce_timer);

	/* data race harmless because everyone sets to the same value */
	if (!next_interval)
		next_interval = check_interval * HZ;
	if (!next_interval)
		return;
	setup_timer(t, mcheck_timer, smp_processor_id());
	t->expires = round_jiffies_relative(jiffies + next_interval);
	add_timer(t);
}

/*
 * Called for each booted CPU to set up machine checks.
 * Must be called with preempt off.
 */
void __cpuinit mcheck_init(struct cpuinfo_x86 *c)
{
	if (!mce_available(c))
		return;

	if (mce_cap_init() < 0) {
		mce_dont_init = 1;
		return;
	}
	mce_cpu_quirks(c);

	mce_init(NULL);
	mce_cpu_features(c);
	mce_init_timer();
}

/*
 * Character device to read and clear the MCE log.
 */

static DEFINE_SPINLOCK(mce_state_lock);
static int open_count;	/* #times opened */
static int open_exclu;	/* already open exclusive? */

static int mce_open(struct inode *inode, struct file *file)
{
	lock_kernel();
	spin_lock(&mce_state_lock);

	if (open_exclu || (open_count && (file->f_flags & O_EXCL))) {
		spin_unlock(&mce_state_lock);
		unlock_kernel();
		return -EBUSY;
	}

	if (file->f_flags & O_EXCL)
		open_exclu = 1;
	open_count++;

	spin_unlock(&mce_state_lock);
	unlock_kernel();

	return nonseekable_open(inode, file);
}

static int mce_release(struct inode *inode, struct file *file)
{
	spin_lock(&mce_state_lock);

	open_count--;
	open_exclu = 0;

	spin_unlock(&mce_state_lock);

	return 0;
}

static void collect_tscs(void *data)
{
	unsigned long *cpu_tsc = (unsigned long *)data;

	rdtscll(cpu_tsc[smp_processor_id()]);
}

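/*
 * Drain the log in two passes: first copy out everything below mcelog.next
 * (briefly spinning on entries whose writers have not set 'finished' yet),
 * then, after a synchronize_sched() and a per-CPU TSC snapshot, pick up
 * entries that were still being written during the first pass.
 */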
static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize,
			loff_t *off)
{
	unsigned long *cpu_tsc;
	static DEFINE_MUTEX(mce_read_mutex);
	unsigned prev, next;
	char __user *buf = ubuf;
	int i, err;

	cpu_tsc = kmalloc(nr_cpu_ids * sizeof(long), GFP_KERNEL);
	if (!cpu_tsc)
		return -ENOMEM;

	mutex_lock(&mce_read_mutex);
	next = rcu_dereference(mcelog.next);

	/* Only supports full reads right now */
	if (*off != 0 || usize < MCE_LOG_LEN*sizeof(struct mce)) {
		mutex_unlock(&mce_read_mutex);
		kfree(cpu_tsc);
		return -EINVAL;
	}

	err = 0;
	prev = 0;
	do {
		for (i = prev; i < next; i++) {
			unsigned long start = jiffies;

			while (!mcelog.entry[i].finished) {
				if (time_after_eq(jiffies, start + 2)) {
					memset(mcelog.entry + i, 0,
					       sizeof(struct mce));
					goto timeout;
				}
				cpu_relax();
			}
			smp_rmb();
			err |= copy_to_user(buf, mcelog.entry + i,
					    sizeof(struct mce));
			buf += sizeof(struct mce);
timeout:
			;
		}

		memset(mcelog.entry + prev, 0,
		       (next - prev) * sizeof(struct mce));
		prev = next;
		next = cmpxchg(&mcelog.next, prev, 0);
	} while (next != prev);

	synchronize_sched();

	/*
	 * Collect entries that were still getting written before the
	 * synchronize.
	 */
	on_each_cpu(collect_tscs, cpu_tsc, 1);
	for (i = next; i < MCE_LOG_LEN; i++) {
		if (mcelog.entry[i].finished &&
		    mcelog.entry[i].tsc < cpu_tsc[mcelog.entry[i].cpu]) {
			err |= copy_to_user(buf, mcelog.entry+i,
					    sizeof(struct mce));
			smp_rmb();
			buf += sizeof(struct mce);
			memset(&mcelog.entry[i], 0, sizeof(struct mce));
		}
	}
	mutex_unlock(&mce_read_mutex);
	kfree(cpu_tsc);
	return err ? -EFAULT : buf - ubuf;
}

static unsigned int mce_poll(struct file *file, poll_table *wait)
{
	poll_wait(file, &mce_wait, wait);
	if (rcu_dereference(mcelog.next))
		return POLLIN | POLLRDNORM;
	return 0;
}

static long mce_ioctl(struct file *f, unsigned int cmd, unsigned long arg)
{
	int __user *p = (int __user *)arg;

	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;
	switch (cmd) {
	case MCE_GET_RECORD_LEN:
		return put_user(sizeof(struct mce), p);
	case MCE_GET_LOG_LEN:
		return put_user(MCE_LOG_LEN, p);
	case MCE_GETCLEAR_FLAGS: {
		unsigned flags;

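		/* Atomically fetch-and-clear flags against concurrent setters */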
		do {
			flags = mcelog.flags;
		} while (cmpxchg(&mcelog.flags, flags, 0) != flags);
		return put_user(flags, p);
	}
	default:
		return -ENOTTY;
	}
}

static const struct file_operations mce_chrdev_ops = {
	.open = mce_open,
	.release = mce_release,
	.read = mce_read,
	.poll = mce_poll,
	.unlocked_ioctl = mce_ioctl,
};

static struct miscdevice mce_log_device = {
	MISC_MCELOG_MINOR,
	"mcelog",
	&mce_chrdev_ops,
};

/*
 * Old style boot options parsing. Only for compatibility.
 */
static int __init mcheck_disable(char *str)
{
	mce_dont_init = 1;
	return 1;
}

/* mce=off disables machine check.
   mce=TOLERANCELEVEL (number, see above)
   mce=bootlog Log MCEs from before booting. Disabled by default on AMD.
   mce=nobootlog Don't log MCEs from before booting. */
static int __init mcheck_enable(char *str)
{
	if (!strcmp(str, "off"))
		mce_dont_init = 1;
	else if (!strcmp(str, "bootlog") || !strcmp(str, "nobootlog"))
		mce_bootlog = str[0] == 'b';
	else if (isdigit(str[0]))
		get_option(&str, &tolerant);
	else
		printk(KERN_INFO "mce= argument %s ignored. Please use /sys\n",
		       str);
	return 1;
}

__setup("nomce", mcheck_disable);
__setup("mce=", mcheck_enable);

/*
 * Sysfs support
 */

/*
 * Disable machine checks on suspend and shutdown. We can't really handle
 * them later.
 */
static int mce_disable(void)
{
	int i;

	for (i = 0; i < banks; i++)
		wrmsrl(MSR_IA32_MC0_CTL + i*4, 0);
	return 0;
}

static int mce_suspend(struct sys_device *dev, pm_message_t state)
{
	return mce_disable();
}

static int mce_shutdown(struct sys_device *dev)
{
	return mce_disable();
}

/* On resume clear all MCE state. Don't want to see leftovers from the BIOS.
   Only one CPU is active at this time, the others get re-added later using
   CPU hotplug. */
static int mce_resume(struct sys_device *dev)
{
	mce_init(NULL);
	mce_cpu_features(&current_cpu_data);
	return 0;
}

static void mce_cpu_restart(void *data)
{
	del_timer_sync(&__get_cpu_var(mce_timer));
	if (mce_available(&current_cpu_data))
		mce_init(NULL);
	mce_init_timer();
}

/* Reinit MCEs after user configuration changes */
static void mce_restart(void)
{
	next_interval = check_interval * HZ;
	on_each_cpu(mce_cpu_restart, NULL, 1);
}

static struct sysdev_class mce_sysclass = {
	.suspend = mce_suspend,
	.shutdown = mce_shutdown,
	.resume = mce_resume,
	.name = "machinecheck",
};

DEFINE_PER_CPU(struct sys_device, device_mce);
void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu) __cpuinitdata;

/* Why are there no generic functions for this? */
#define ACCESSOR(name, var, start) \
	static ssize_t show_ ## name(struct sys_device *s,		\
				     struct sysdev_attribute *attr,	\
				     char *buf) {			\
		return sprintf(buf, "%lx\n", (unsigned long)var);	\
	}								\
	static ssize_t set_ ## name(struct sys_device *s,		\
				    struct sysdev_attribute *attr,	\
				    const char *buf, size_t siz) {	\
		char *end;						\
		unsigned long new = simple_strtoul(buf, &end, 0);	\
		if (end == buf) return -EINVAL;				\
		var = new;						\
		start;							\
		return end-buf;						\
	}								\
	static SYSDEV_ATTR(name, 0644, show_ ## name, set_ ## name);

static struct sysdev_attribute *bank_attrs;

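/* The bank index is the attribute's offset within the bank_attrs array */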
static ssize_t show_bank(struct sys_device *s, struct sysdev_attribute *attr,
			 char *buf)
{
	u64 b = bank[attr - bank_attrs];
	return sprintf(buf, "%llx\n", b);
}

static ssize_t set_bank(struct sys_device *s, struct sysdev_attribute *attr,
			const char *buf, size_t siz)
{
	char *end;
	u64 new = simple_strtoull(buf, &end, 0);
	if (end == buf)
		return -EINVAL;
	bank[attr - bank_attrs] = new;
	mce_restart();
	return end-buf;
}

static ssize_t show_trigger(struct sys_device *s, struct sysdev_attribute *attr,
			    char *buf)
{
	strcpy(buf, trigger);
	strcat(buf, "\n");
	return strlen(trigger) + 1;
}

static ssize_t set_trigger(struct sys_device *s, struct sysdev_attribute *attr,
			   const char *buf, size_t siz)
{
	char *p;
	int len;
	strncpy(trigger, buf, sizeof(trigger));
	trigger[sizeof(trigger)-1] = 0;
	len = strlen(trigger);
	p = strchr(trigger, '\n');
	if (p)
		*p = 0;
	return len;
}

static SYSDEV_ATTR(trigger, 0644, show_trigger, set_trigger);
static SYSDEV_INT_ATTR(tolerant, 0644, tolerant);
ACCESSOR(check_interval, check_interval, mce_restart())
static struct sysdev_attribute *mce_attributes[] = {
	&attr_tolerant.attr, &attr_check_interval, &attr_trigger,
	NULL
};

static cpumask_t mce_device_initialized = CPU_MASK_NONE;

/* Per cpu sysdev init. All of the cpus still share the same ctl bank */
static __cpuinit int mce_create_device(unsigned int cpu)
{
	int err;
	int i;

	if (!mce_available(&boot_cpu_data))
		return -EIO;

	memset(&per_cpu(device_mce, cpu).kobj, 0, sizeof(struct kobject));
	per_cpu(device_mce, cpu).id = cpu;
	per_cpu(device_mce, cpu).cls = &mce_sysclass;

	err = sysdev_register(&per_cpu(device_mce, cpu));
	if (err)
		return err;

	for (i = 0; mce_attributes[i]; i++) {
		err = sysdev_create_file(&per_cpu(device_mce, cpu),
					 mce_attributes[i]);
		if (err)
			goto error;
	}
	for (i = 0; i < banks; i++) {
		err = sysdev_create_file(&per_cpu(device_mce, cpu),
					 &bank_attrs[i]);
		if (err)
			goto error2;
	}
	cpu_set(cpu, mce_device_initialized);

	return 0;
error2:
	while (--i >= 0) {
		sysdev_remove_file(&per_cpu(device_mce, cpu),
				   &bank_attrs[i]);
	}
error:
	while (--i >= 0) {
		sysdev_remove_file(&per_cpu(device_mce, cpu),
				   mce_attributes[i]);
	}
	sysdev_unregister(&per_cpu(device_mce, cpu));

	return err;
}

static __cpuinit void mce_remove_device(unsigned int cpu)
{
	int i;

	if (!cpu_isset(cpu, mce_device_initialized))
		return;

	for (i = 0; mce_attributes[i]; i++)
		sysdev_remove_file(&per_cpu(device_mce, cpu),
				   mce_attributes[i]);
	for (i = 0; i < banks; i++)
		sysdev_remove_file(&per_cpu(device_mce, cpu),
				   &bank_attrs[i]);
	sysdev_unregister(&per_cpu(device_mce, cpu));
	cpu_clear(cpu, mce_device_initialized);
}

/* Make sure there are no machine checks on offlined CPUs. */
static void mce_disable_cpu(void *h)
{
	int i;

	if (!mce_available(&current_cpu_data))
		return;
	for (i = 0; i < banks; i++)
		wrmsrl(MSR_IA32_MC0_CTL + i*4, 0);
}

static void mce_reenable_cpu(void *h)
{
	int i;

	if (!mce_available(&current_cpu_data))
		return;
	for (i = 0; i < banks; i++)
		wrmsrl(MSR_IA32_MC0_CTL + i*4, bank[i]);
}

/* Get notified when a cpu comes on/off. Be hotplug friendly. */
static int __cpuinit mce_cpu_callback(struct notifier_block *nfb,
				      unsigned long action, void *hcpu)
{
	unsigned int cpu = (unsigned long)hcpu;
	struct timer_list *t = &per_cpu(mce_timer, cpu);

	switch (action) {
	case CPU_ONLINE:
	case CPU_ONLINE_FROZEN:
		mce_create_device(cpu);
		if (threshold_cpu_callback)
			threshold_cpu_callback(action, cpu);
		break;
	case CPU_DEAD:
	case CPU_DEAD_FROZEN:
		if (threshold_cpu_callback)
			threshold_cpu_callback(action, cpu);
		mce_remove_device(cpu);
		break;
	case CPU_DOWN_PREPARE:
	case CPU_DOWN_PREPARE_FROZEN:
		del_timer_sync(t);
		smp_call_function_single(cpu, mce_disable_cpu, NULL, 1);
		break;
	case CPU_DOWN_FAILED:
	case CPU_DOWN_FAILED_FROZEN:
		t->expires = round_jiffies_relative(jiffies + next_interval);
		add_timer_on(t, cpu);
		smp_call_function_single(cpu, mce_reenable_cpu, NULL, 1);
		break;
	}
	return NOTIFY_OK;
}

static struct notifier_block mce_cpu_notifier __cpuinitdata = {
	.notifier_call = mce_cpu_callback,
};

static __init int mce_init_banks(void)
{
	int i;

	bank_attrs = kzalloc(sizeof(struct sysdev_attribute) * banks,
			     GFP_KERNEL);
	if (!bank_attrs)
		return -ENOMEM;

	for (i = 0; i < banks; i++) {
		struct sysdev_attribute *a = &bank_attrs[i];
		a->attr.name = kasprintf(GFP_KERNEL, "bank%d", i);
		if (!a->attr.name)
			goto nomem;
		a->attr.mode = 0644;
		a->show = show_bank;
		a->store = set_bank;
	}
	return 0;

nomem:
	while (--i >= 0)
		kfree(bank_attrs[i].attr.name);
	kfree(bank_attrs);
	bank_attrs = NULL;
	return -ENOMEM;
}

static __init int mce_init_device(void)
{
	int err;
	int i = 0;

	if (!mce_available(&boot_cpu_data))
		return -EIO;

	err = mce_init_banks();
	if (err)
		return err;

	err = sysdev_class_register(&mce_sysclass);
	if (err)
		return err;

	for_each_online_cpu(i) {
		err = mce_create_device(i);
		if (err)
			return err;
	}

	register_hotcpu_notifier(&mce_cpu_notifier);
	misc_register(&mce_log_device);
	return err;
}

device_initcall(mce_init_device);