KVM: Add VT-x machine check support
[linux-2.6-block.git] / arch / x86 / kernel / cpu / mcheck / mce_64.c
CommitLineData
1da177e4
LT
1/*
2 * Machine check handler.
3 * K8 parts Copyright 2002,2003 Andi Kleen, SuSE Labs.
d88203d1
TG
4 * Rest from unknown author(s).
5 * 2004 Andi Kleen. Rewrote most of it.
b79109c3
AK
6 * Copyright 2008 Intel Corporation
7 * Author: Andi Kleen
1da177e4
LT
8 */
9
10#include <linux/init.h>
11#include <linux/types.h>
12#include <linux/kernel.h>
13#include <linux/sched.h>
38c4c97c 14#include <linux/smp_lock.h>
1da177e4
LT
15#include <linux/string.h>
16#include <linux/rcupdate.h>
17#include <linux/kallsyms.h>
18#include <linux/sysdev.h>
19#include <linux/miscdevice.h>
20#include <linux/fs.h>
a9415644 21#include <linux/capability.h>
91c6d400
AK
22#include <linux/cpu.h>
23#include <linux/percpu.h>
e02e68d3
TH
24#include <linux/poll.h>
25#include <linux/thread_info.h>
8c566ef5 26#include <linux/ctype.h>
a98f0dd3 27#include <linux/kmod.h>
1eeb66a1 28#include <linux/kdebug.h>
0d7482e3
AK
29#include <linux/kobject.h>
30#include <linux/sysfs.h>
8457c84d 31#include <linux/ratelimit.h>
d88203d1 32#include <asm/processor.h>
1da177e4
LT
33#include <asm/msr.h>
34#include <asm/mce.h>
1da177e4 35#include <asm/uaccess.h>
0a9c3ee7 36#include <asm/smp.h>
e02e68d3 37#include <asm/idle.h>
1da177e4
LT
38
39#define MISC_MCELOG_MINOR 227
0d7482e3 40
553f265f
AK
41atomic_t mce_entry;
42
1da177e4
LT
43static int mce_dont_init;
44
bd78432c
TH
45/*
46 * Tolerant levels:
47 * 0: always panic on uncorrected errors, log corrected errors
48 * 1: panic or SIGBUS on uncorrected errors, log corrected errors
49 * 2: SIGBUS or log uncorrected errors (if possible), log corrected errors
50 * 3: never panic or SIGBUS, log all errors (for testing only)
51 */
1da177e4
LT
52static int tolerant = 1;
53static int banks;
0d7482e3 54static u64 *bank;
e02e68d3 55static unsigned long notify_user;
94ad8474 56static int rip_msr;
911f6a7b 57static int mce_bootlog = -1;
a98f0dd3
AK
58static atomic_t mce_events;
59
60static char trigger[128];
61static char *trigger_argv[2] = { trigger, NULL };
1da177e4 62
e02e68d3
TH
63static DECLARE_WAIT_QUEUE_HEAD(mce_wait);
64
ee031c31
AK
65/* MCA banks polled by the period polling timer for corrected events */
66DEFINE_PER_CPU(mce_banks_t, mce_poll_banks) = {
67 [0 ... BITS_TO_LONGS(MAX_NR_BANKS)-1] = ~0UL
68};
69
b5f2fa4e
AK
70/* Do initial initialization of a struct mce */
71void mce_setup(struct mce *m)
72{
73 memset(m, 0, sizeof(struct mce));
74 m->cpu = smp_processor_id();
75 rdtscll(m->tsc);
76}
77
1da177e4
LT
/*
 * Lockless MCE logging infrastructure.
 * This avoids deadlocks on printk locks without having to break locks. Also
 * separate MCEs from kernel messages to avoid bogus bug reports.
 */

static struct mce_log mcelog = {
	MCE_LOG_SIGNATURE,
	MCE_LOG_LEN,
};

/*
 * Append one record to the global mcelog ring. Safe to call from
 * machine-check/NMI context: a free slot is claimed with cmpxchg on
 * mcelog.next, so no lock is taken.
 */
void mce_log(struct mce *mce)
{
	unsigned next, entry;
	atomic_inc(&mce_events);
	/* Publish the record as "in flight" before copying the payload. */
	mce->finished = 0;
	wmb();
	for (;;) {
		entry = rcu_dereference(mcelog.next);
		for (;;) {
			/* When the buffer fills up discard new entries. Assume
			   that the earlier errors are the more interesting. */
			if (entry >= MCE_LOG_LEN) {
				set_bit(MCE_OVERFLOW, (unsigned long *)&mcelog.flags);
				return;
			}
			/* Old left over entry. Skip. */
			if (mcelog.entry[entry].finished) {
				entry++;
				continue;
			}
			break;
		}
		smp_rmb();
		next = entry + 1;
		/* Claim the slot; retry from the top if another CPU raced us. */
		if (cmpxchg(&mcelog.next, entry, next) == entry)
			break;
	}
	memcpy(mcelog.entry + entry, mce, sizeof(struct mce));
	/* Order the payload copy before the finished flag so readers
	   never see finished=1 with a half-written record. */
	wmb();
	mcelog.entry[entry].finished = 1;
	wmb();

	/* Tell mce_notify_user() there is something to report. */
	set_bit(0, &notify_user);
}
123
124static void print_mce(struct mce *m)
125{
126 printk(KERN_EMERG "\n"
4855170f 127 KERN_EMERG "HARDWARE ERROR\n"
1da177e4
LT
128 KERN_EMERG
129 "CPU %d: Machine Check Exception: %16Lx Bank %d: %016Lx\n",
130 m->cpu, m->mcgstatus, m->bank, m->status);
65ea5b03 131 if (m->ip) {
d88203d1 132 printk(KERN_EMERG "RIP%s %02x:<%016Lx> ",
1da177e4 133 !(m->mcgstatus & MCG_STATUS_EIPV) ? " !INEXACT!" : "",
65ea5b03 134 m->cs, m->ip);
1da177e4 135 if (m->cs == __KERNEL_CS)
65ea5b03 136 print_symbol("{%s}", m->ip);
1da177e4
LT
137 printk("\n");
138 }
f6d1826d 139 printk(KERN_EMERG "TSC %llx ", m->tsc);
1da177e4 140 if (m->addr)
f6d1826d 141 printk("ADDR %llx ", m->addr);
1da177e4 142 if (m->misc)
f6d1826d 143 printk("MISC %llx ", m->misc);
1da177e4 144 printk("\n");
4855170f 145 printk(KERN_EMERG "This is not a software problem!\n");
d88203d1
TG
146 printk(KERN_EMERG "Run through mcelog --ascii to decode "
147 "and contact your hardware vendor\n");
1da177e4
LT
148}
149
150static void mce_panic(char *msg, struct mce *backup, unsigned long start)
d88203d1 151{
1da177e4 152 int i;
e02e68d3 153
1da177e4
LT
154 oops_begin();
155 for (i = 0; i < MCE_LOG_LEN; i++) {
156 unsigned long tsc = mcelog.entry[i].tsc;
d88203d1 157
1da177e4
LT
158 if (time_before(tsc, start))
159 continue;
d88203d1 160 print_mce(&mcelog.entry[i]);
1da177e4
LT
161 if (backup && mcelog.entry[i].tsc == backup->tsc)
162 backup = NULL;
163 }
164 if (backup)
165 print_mce(backup);
e02e68d3 166 panic(msg);
d88203d1 167}
1da177e4 168
88ccbedd 169int mce_available(struct cpuinfo_x86 *c)
1da177e4 170{
5b4408fd
AK
171 if (mce_dont_init)
172 return 0;
3d1712c9 173 return cpu_has(c, X86_FEATURE_MCE) && cpu_has(c, X86_FEATURE_MCA);
1da177e4
LT
174}
175
94ad8474
AK
176static inline void mce_get_rip(struct mce *m, struct pt_regs *regs)
177{
178 if (regs && (m->mcgstatus & MCG_STATUS_RIPV)) {
65ea5b03 179 m->ip = regs->ip;
94ad8474
AK
180 m->cs = regs->cs;
181 } else {
65ea5b03 182 m->ip = 0;
94ad8474
AK
183 m->cs = 0;
184 }
185 if (rip_msr) {
186 /* Assume the RIP in the MSR is exact. Is this true? */
187 m->mcgstatus |= MCG_STATUS_EIPV;
65ea5b03 188 rdmsrl(rip_msr, m->ip);
94ad8474
AK
189 m->cs = 0;
190 }
191}
192
/*
 * Poll for corrected events or events that happened before reset.
 * Those are just logged through /dev/mcelog.
 *
 * This is executed in standard interrupt context.
 *
 * @flags: MCP_* bits selecting timestamping, UC logging and log suppression.
 * @b:     bitmap of banks to examine (e.g. per-cpu mce_poll_banks).
 */
void machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
{
	struct mce m;
	int i;

	mce_setup(&m);

	rdmsrl(MSR_IA32_MCG_STATUS, m.mcgstatus);
	for (i = 0; i < banks; i++) {
		/* Skip banks disabled in their CTL mask or not requested. */
		if (!bank[i] || !test_bit(i, *b))
			continue;

		m.misc = 0;
		m.addr = 0;
		m.bank = i;
		m.tsc = 0;

		barrier();
		rdmsrl(MSR_IA32_MC0_STATUS + i*4, m.status);
		/* VAL clear means the bank holds no event. */
		if (!(m.status & MCI_STATUS_VAL))
			continue;

		/*
		 * Uncorrected events are handled by the exception handler
		 * when it is enabled. But when the exception is disabled log
		 * everything.
		 *
		 * TBD do the same check for MCI_STATUS_EN here?
		 */
		if ((m.status & MCI_STATUS_UC) && !(flags & MCP_UC))
			continue;

		if (m.status & MCI_STATUS_MISCV)
			rdmsrl(MSR_IA32_MC0_MISC + i*4, m.misc);
		if (m.status & MCI_STATUS_ADDRV)
			rdmsrl(MSR_IA32_MC0_ADDR + i*4, m.addr);

		if (!(flags & MCP_TIMESTAMP))
			m.tsc = 0;
		/*
		 * Don't get the IP here because it's unlikely to
		 * have anything to do with the actual error location.
		 */
		if (!(flags & MCP_DONTLOG)) {
			mce_log(&m);
			add_taint(TAINT_MACHINE_CHECK);
		}

		/*
		 * Clear state for this bank.
		 */
		wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0);
	}

	/*
	 * Don't clear MCG_STATUS here because it's only defined for
	 * exceptions.
	 */
}
258
/*
 * The actual machine check handler. This only handles real
 * exceptions when something got corrupted coming in through int 18.
 *
 * This is executed in NMI context not subject to normal locking rules. This
 * implies that most kernel services cannot be safely used. Don't even
 * think about putting a printk in there!
 */
void do_machine_check(struct pt_regs * regs, long error_code)
{
	struct mce m, panicm;
	u64 mcestart = 0;
	int i;
	int panicm_found = 0;
	/*
	 * If no_way_out gets set, there is no safe way to recover from this
	 * MCE.  If tolerant is cranked up, we'll try anyway.
	 */
	int no_way_out = 0;
	/*
	 * If kill_it gets set, there might be a way to recover from this
	 * error.
	 */
	int kill_it = 0;
	/* Banks with an event we must clear before returning. */
	DECLARE_BITMAP(toclear, MAX_NR_BANKS);

	atomic_inc(&mce_entry);

	if (notify_die(DIE_NMI, "machine check", regs, error_code,
		       18, SIGKILL) == NOTIFY_STOP)
		goto out2;
	if (!banks)
		goto out2;

	mce_setup(&m);

	rdmsrl(MSR_IA32_MCG_STATUS, m.mcgstatus);
	/* if the restart IP is not valid, we're done for */
	if (!(m.mcgstatus & MCG_STATUS_RIPV))
		no_way_out = 1;

	rdtscll(mcestart);
	barrier();

	for (i = 0; i < banks; i++) {
		__clear_bit(i, toclear);
		if (!bank[i])
			continue;

		m.misc = 0;
		m.addr = 0;
		m.bank = i;

		rdmsrl(MSR_IA32_MC0_STATUS + i*4, m.status);
		if ((m.status & MCI_STATUS_VAL) == 0)
			continue;

		/*
		 * Non uncorrected errors are handled by machine_check_poll
		 * Leave them alone.
		 */
		if ((m.status & MCI_STATUS_UC) == 0)
			continue;

		/*
		 * Set taint even when machine check was not enabled.
		 */
		add_taint(TAINT_MACHINE_CHECK);

		__set_bit(i, toclear);

		if (m.status & MCI_STATUS_EN) {
			/* if PCC was set, there's no way out */
			no_way_out |= !!(m.status & MCI_STATUS_PCC);
			/*
			 * If this error was uncorrectable and there was
			 * an overflow, we're in trouble.  If no overflow,
			 * we might get away with just killing a task.
			 */
			if (m.status & MCI_STATUS_UC) {
				if (tolerant < 1 || m.status & MCI_STATUS_OVER)
					no_way_out = 1;
				kill_it = 1;
			}
		} else {
			/*
			 * Machine check event was not enabled. Clear, but
			 * ignore.
			 */
			continue;
		}

		if (m.status & MCI_STATUS_MISCV)
			rdmsrl(MSR_IA32_MC0_MISC + i*4, m.misc);
		if (m.status & MCI_STATUS_ADDRV)
			rdmsrl(MSR_IA32_MC0_ADDR + i*4, m.addr);

		mce_get_rip(&m, regs);
		mce_log(&m);

		/* Did this bank cause the exception? */
		/* Assume that the bank with uncorrectable errors did it,
		   and that there is only a single one. */
		if ((m.status & MCI_STATUS_UC) && (m.status & MCI_STATUS_EN)) {
			panicm = m;
			panicm_found = 1;
		}
	}

	/* If we didn't find an uncorrectable error, pick
	   the last one (shouldn't happen, just being safe). */
	if (!panicm_found)
		panicm = m;

	/*
	 * If we have decided that we just CAN'T continue, and the user
	 * has not set tolerant to an insane level, give up and die.
	 */
	if (no_way_out && tolerant < 3)
		mce_panic("Machine check", &panicm, mcestart);

	/*
	 * If the error seems to be unrecoverable, something should be
	 * done.  Try to kill as little as possible.  If we can kill just
	 * one task, do that.  If the user has set the tolerance very
	 * high, don't try to do anything at all.
	 */
	if (kill_it && tolerant < 3) {
		int user_space = 0;

		/*
		 * If the EIPV bit is set, it means the saved IP is the
		 * instruction which caused the MCE.
		 */
		if (m.mcgstatus & MCG_STATUS_EIPV)
			user_space = panicm.ip && (panicm.cs & 3);

		/*
		 * If we know that the error was in user space, send a
		 * SIGBUS.  Otherwise, panic if tolerance is low.
		 *
		 * force_sig() takes an awful lot of locks and has a slight
		 * risk of deadlocking.
		 */
		if (user_space) {
			force_sig(SIGBUS, current);
		} else if (panic_on_oops || tolerant < 2) {
			mce_panic("Uncorrected machine check",
				&panicm, mcestart);
		}
	}

	/* notify userspace ASAP */
	set_thread_flag(TIF_MCE_NOTIFY);

	/* the last thing we do is clear state */
	for (i = 0; i < banks; i++) {
		if (test_bit(i, toclear))
			wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0);
	}
	wrmsrl(MSR_IA32_MCG_STATUS, 0);
 out2:
	atomic_dec(&mce_entry);
}
EXPORT_SYMBOL_GPL(do_machine_check);
1da177e4 424
15d5f839
DZ
#ifdef CONFIG_X86_MCE_INTEL
/**
 * mce_log_therm_throt_event - Logs the thermal throttling event to mcelog
 * @status: Event status information
 *
 * This function should be called by the thermal interrupt after the
 * event has been processed and the decision was made to log the event
 * further.
 *
 * The status parameter will be saved to the 'status' field of 'struct mce'
 * and historically has been the register value of the
 * MSR_IA32_THERMAL_STATUS (Intel) msr.
 */
void mce_log_therm_throt_event(__u64 status)
{
	struct mce m;

	mce_setup(&m);
	m.bank = MCE_THERMAL_BANK;
	m.status = status;
	mce_log(&m);
}
#endif /* CONFIG_X86_MCE_INTEL */
449
1da177e4 450/*
8a336b0a
TH
451 * Periodic polling timer for "silent" machine check errors. If the
452 * poller finds an MCE, poll 2x faster. When the poller finds no more
453 * errors, poll 2x slower (up to check_interval seconds).
1da177e4
LT
454 */
455
456static int check_interval = 5 * 60; /* 5 minutes */
6298c512 457static DEFINE_PER_CPU(int, next_interval); /* in jiffies */
52d168e2
AK
458static void mcheck_timer(unsigned long);
459static DEFINE_PER_CPU(struct timer_list, mce_timer);
1da177e4 460
52d168e2 461static void mcheck_timer(unsigned long data)
1da177e4 462{
52d168e2 463 struct timer_list *t = &per_cpu(mce_timer, data);
6298c512 464 int *n;
52d168e2
AK
465
466 WARN_ON(smp_processor_id() != data);
467
1da177e4 468 if (mce_available(&current_cpu_data))
ee031c31
AK
469 machine_check_poll(MCP_TIMESTAMP,
470 &__get_cpu_var(mce_poll_banks));
1da177e4
LT
471
472 /*
e02e68d3
TH
473 * Alert userspace if needed. If we logged an MCE, reduce the
474 * polling interval, otherwise increase the polling interval.
1da177e4 475 */
6298c512 476 n = &__get_cpu_var(next_interval);
e02e68d3 477 if (mce_notify_user()) {
6298c512 478 *n = max(*n/2, HZ/100);
e02e68d3 479 } else {
6298c512 480 *n = min(*n*2, (int)round_jiffies_relative(check_interval*HZ));
e02e68d3
TH
481 }
482
6298c512 483 t->expires = jiffies + *n;
52d168e2 484 add_timer(t);
e02e68d3
TH
485}
486
9bd98405
AK
487static void mce_do_trigger(struct work_struct *work)
488{
489 call_usermodehelper(trigger, trigger_argv, NULL, UMH_NO_WAIT);
490}
491
492static DECLARE_WORK(mce_trigger_work, mce_do_trigger);
493
e02e68d3 494/*
9bd98405
AK
495 * Notify the user(s) about new machine check events.
496 * Can be called from interrupt context, but not from machine check/NMI
497 * context.
e02e68d3
TH
498 */
499int mce_notify_user(void)
500{
8457c84d
AK
501 /* Not more than two messages every minute */
502 static DEFINE_RATELIMIT_STATE(ratelimit, 60*HZ, 2);
503
e02e68d3
TH
504 clear_thread_flag(TIF_MCE_NOTIFY);
505 if (test_and_clear_bit(0, &notify_user)) {
e02e68d3 506 wake_up_interruptible(&mce_wait);
9bd98405
AK
507
508 /*
509 * There is no risk of missing notifications because
510 * work_pending is always cleared before the function is
511 * executed.
512 */
513 if (trigger[0] && !work_pending(&mce_trigger_work))
514 schedule_work(&mce_trigger_work);
e02e68d3 515
8457c84d 516 if (__ratelimit(&ratelimit))
8a336b0a 517 printk(KERN_INFO "Machine check events logged\n");
e02e68d3
TH
518
519 return 1;
1da177e4 520 }
e02e68d3
TH
521 return 0;
522}
8a336b0a 523
e02e68d3
TH
524/* see if the idle task needs to notify userspace */
525static int
526mce_idle_callback(struct notifier_block *nfb, unsigned long action, void *junk)
527{
528 /* IDLE_END should be safe - interrupts are back on */
529 if (action == IDLE_END && test_thread_flag(TIF_MCE_NOTIFY))
530 mce_notify_user();
531
532 return NOTIFY_OK;
1da177e4
LT
533}
534
e02e68d3
TH
535static struct notifier_block mce_idle_notifier = {
536 .notifier_call = mce_idle_callback,
537};
1da177e4
LT
538
539static __init int periodic_mcheck_init(void)
d88203d1 540{
52d168e2
AK
541 idle_notifier_register(&mce_idle_notifier);
542 return 0;
d88203d1 543}
1da177e4
LT
544__initcall(periodic_mcheck_init);
545
d88203d1 546/*
1da177e4
LT
547 * Initialize Machine Checks for a CPU.
548 */
0d7482e3 549static int mce_cap_init(void)
1da177e4
LT
550{
551 u64 cap;
0d7482e3 552 unsigned b;
1da177e4
LT
553
554 rdmsrl(MSR_IA32_MCG_CAP, cap);
0d7482e3
AK
555 b = cap & 0xff;
556 if (b > MAX_NR_BANKS) {
557 printk(KERN_WARNING
558 "MCE: Using only %u machine check banks out of %u\n",
559 MAX_NR_BANKS, b);
560 b = MAX_NR_BANKS;
561 }
562
563 /* Don't support asymmetric configurations today */
564 WARN_ON(banks != 0 && b != banks);
565 banks = b;
566 if (!bank) {
567 bank = kmalloc(banks * sizeof(u64), GFP_KERNEL);
568 if (!bank)
569 return -ENOMEM;
570 memset(bank, 0xff, banks * sizeof(u64));
1da177e4 571 }
0d7482e3 572
94ad8474
AK
573 /* Use accurate RIP reporting if available. */
574 if ((cap & (1<<9)) && ((cap >> 16) & 0xff) >= 9)
575 rip_msr = MSR_IA32_MCG_EIP;
1da177e4 576
0d7482e3
AK
577 return 0;
578}
579
580static void mce_init(void *dummy)
581{
582 u64 cap;
583 int i;
ee031c31 584 mce_banks_t all_banks;
0d7482e3 585
b79109c3
AK
586 /*
587 * Log the machine checks left over from the previous reset.
588 */
ee031c31 589 bitmap_fill(all_banks, MAX_NR_BANKS);
5679af4c 590 machine_check_poll(MCP_UC|(!mce_bootlog ? MCP_DONTLOG : 0), &all_banks);
1da177e4
LT
591
592 set_in_cr4(X86_CR4_MCE);
593
0d7482e3 594 rdmsrl(MSR_IA32_MCG_CAP, cap);
1da177e4
LT
595 if (cap & MCG_CTL_P)
596 wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff);
597
598 for (i = 0; i < banks; i++) {
0d7482e3 599 wrmsrl(MSR_IA32_MC0_CTL+4*i, bank[i]);
1da177e4 600 wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0);
d88203d1 601 }
1da177e4
LT
602}
603
604/* Add per CPU specific workarounds here */
ec5b3d32 605static void mce_cpu_quirks(struct cpuinfo_x86 *c)
d88203d1 606{
1da177e4 607 /* This should be disabled by the BIOS, but isn't always */
911f6a7b 608 if (c->x86_vendor == X86_VENDOR_AMD) {
0d7482e3 609 if (c->x86 == 15 && banks > 4)
911f6a7b
JB
610 /* disable GART TBL walk error reporting, which trips off
611 incorrectly with the IOMMU & 3ware & Cerberus. */
0d7482e3 612 clear_bit(10, (unsigned long *)&bank[4]);
911f6a7b
JB
613 if(c->x86 <= 17 && mce_bootlog < 0)
614 /* Lots of broken BIOS around that don't clear them
615 by default and leave crap in there. Don't log. */
616 mce_bootlog = 0;
1da177e4 617 }
e583538f 618
d88203d1 619}
1da177e4 620
cc3ca220 621static void mce_cpu_features(struct cpuinfo_x86 *c)
1da177e4
LT
622{
623 switch (c->x86_vendor) {
624 case X86_VENDOR_INTEL:
625 mce_intel_feature_init(c);
626 break;
89b831ef
JS
627 case X86_VENDOR_AMD:
628 mce_amd_feature_init(c);
629 break;
1da177e4
LT
630 default:
631 break;
632 }
633}
634
52d168e2
AK
635static void mce_init_timer(void)
636{
637 struct timer_list *t = &__get_cpu_var(mce_timer);
6298c512 638 int *n = &__get_cpu_var(next_interval);
52d168e2 639
6298c512
AK
640 *n = check_interval * HZ;
641 if (!*n)
52d168e2
AK
642 return;
643 setup_timer(t, mcheck_timer, smp_processor_id());
6298c512 644 t->expires = round_jiffies(jiffies + *n);
52d168e2
AK
645 add_timer(t);
646}
647
d88203d1 648/*
1da177e4 649 * Called for each booted CPU to set up machine checks.
d88203d1 650 * Must be called with preempt off.
1da177e4 651 */
e6982c67 652void __cpuinit mcheck_init(struct cpuinfo_x86 *c)
1da177e4 653{
5b4408fd 654 if (!mce_available(c))
1da177e4
LT
655 return;
656
0d7482e3
AK
657 if (mce_cap_init() < 0) {
658 mce_dont_init = 1;
659 return;
660 }
661 mce_cpu_quirks(c);
662
1da177e4
LT
663 mce_init(NULL);
664 mce_cpu_features(c);
52d168e2 665 mce_init_timer();
1da177e4
LT
666}
667
668/*
669 * Character device to read and clear the MCE log.
670 */
671
f528e7ba
TH
672static DEFINE_SPINLOCK(mce_state_lock);
673static int open_count; /* #times opened */
674static int open_exclu; /* already open exclusive? */
675
676static int mce_open(struct inode *inode, struct file *file)
677{
38c4c97c 678 lock_kernel();
f528e7ba
TH
679 spin_lock(&mce_state_lock);
680
681 if (open_exclu || (open_count && (file->f_flags & O_EXCL))) {
682 spin_unlock(&mce_state_lock);
38c4c97c 683 unlock_kernel();
f528e7ba
TH
684 return -EBUSY;
685 }
686
687 if (file->f_flags & O_EXCL)
688 open_exclu = 1;
689 open_count++;
690
691 spin_unlock(&mce_state_lock);
38c4c97c 692 unlock_kernel();
f528e7ba 693
bd78432c 694 return nonseekable_open(inode, file);
f528e7ba
TH
695}
696
697static int mce_release(struct inode *inode, struct file *file)
698{
699 spin_lock(&mce_state_lock);
700
701 open_count--;
702 open_exclu = 0;
703
704 spin_unlock(&mce_state_lock);
705
706 return 0;
707}
708
d88203d1
TG
/* IPI helper: record the calling CPU's TSC into the passed array. */
static void collect_tscs(void *data)
{
	unsigned long *cpu_tsc = (unsigned long *)data;

	rdtscll(cpu_tsc[smp_processor_id()]);
}

/*
 * Read and drain the MCE log. Only whole-log reads are supported;
 * entries are zeroed as they are copied out. Serialized against
 * concurrent readers by mce_read_mutex; writers (mce_log) are lockless,
 * hence the finished-flag spinning and cmpxchg on mcelog.next below.
 */
static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize,
			loff_t *off)
{
	unsigned long *cpu_tsc;
	static DEFINE_MUTEX(mce_read_mutex);
	unsigned prev, next;
	char __user *buf = ubuf;
	int i, err;

	cpu_tsc = kmalloc(nr_cpu_ids * sizeof(long), GFP_KERNEL);
	if (!cpu_tsc)
		return -ENOMEM;

	mutex_lock(&mce_read_mutex);
	next = rcu_dereference(mcelog.next);

	/* Only supports full reads right now */
	if (*off != 0 || usize < MCE_LOG_LEN*sizeof(struct mce)) {
		mutex_unlock(&mce_read_mutex);
		kfree(cpu_tsc);
		return -EINVAL;
	}

	err = 0;
	prev = 0;
	do {
		for (i = prev; i < next; i++) {
			unsigned long start = jiffies;

			/* Wait briefly for an in-flight writer to finish;
			   give up and discard the slot after ~2 jiffies. */
			while (!mcelog.entry[i].finished) {
				if (time_after_eq(jiffies, start + 2)) {
					memset(mcelog.entry + i, 0,
					       sizeof(struct mce));
					goto timeout;
				}
				cpu_relax();
			}
			smp_rmb();
			err |= copy_to_user(buf, mcelog.entry + i,
					    sizeof(struct mce));
			buf += sizeof(struct mce);
timeout:
			;
		}

		memset(mcelog.entry + prev, 0,
		       (next - prev) * sizeof(struct mce));
		prev = next;
		/* Reset the ring head; loop again if writers appended
		   more entries while we were copying. */
		next = cmpxchg(&mcelog.next, prev, 0);
	} while (next != prev);

	synchronize_sched();

	/*
	 * Collect entries that were still getting written before the
	 * synchronize.
	 */
	on_each_cpu(collect_tscs, cpu_tsc, 1);
	for (i = next; i < MCE_LOG_LEN; i++) {
		if (mcelog.entry[i].finished &&
		    mcelog.entry[i].tsc < cpu_tsc[mcelog.entry[i].cpu]) {
			err |= copy_to_user(buf, mcelog.entry+i,
					    sizeof(struct mce));
			smp_rmb();
			buf += sizeof(struct mce);
			memset(&mcelog.entry[i], 0, sizeof(struct mce));
		}
	}
	mutex_unlock(&mce_read_mutex);
	kfree(cpu_tsc);
	return err ? -EFAULT : buf - ubuf;
}
788
e02e68d3
TH
789static unsigned int mce_poll(struct file *file, poll_table *wait)
790{
791 poll_wait(file, &mce_wait, wait);
792 if (rcu_dereference(mcelog.next))
793 return POLLIN | POLLRDNORM;
794 return 0;
795}
796
c68461b6 797static long mce_ioctl(struct file *f, unsigned int cmd, unsigned long arg)
1da177e4
LT
798{
799 int __user *p = (int __user *)arg;
d88203d1 800
1da177e4 801 if (!capable(CAP_SYS_ADMIN))
d88203d1 802 return -EPERM;
1da177e4 803 switch (cmd) {
d88203d1 804 case MCE_GET_RECORD_LEN:
1da177e4
LT
805 return put_user(sizeof(struct mce), p);
806 case MCE_GET_LOG_LEN:
d88203d1 807 return put_user(MCE_LOG_LEN, p);
1da177e4
LT
808 case MCE_GETCLEAR_FLAGS: {
809 unsigned flags;
d88203d1
TG
810
811 do {
1da177e4 812 flags = mcelog.flags;
d88203d1
TG
813 } while (cmpxchg(&mcelog.flags, flags, 0) != flags);
814 return put_user(flags, p);
1da177e4
LT
815 }
816 default:
d88203d1
TG
817 return -ENOTTY;
818 }
1da177e4
LT
819}
820
5dfe4c96 821static const struct file_operations mce_chrdev_ops = {
f528e7ba
TH
822 .open = mce_open,
823 .release = mce_release,
1da177e4 824 .read = mce_read,
e02e68d3 825 .poll = mce_poll,
c68461b6 826 .unlocked_ioctl = mce_ioctl,
1da177e4
LT
827};
828
829static struct miscdevice mce_log_device = {
830 MISC_MCELOG_MINOR,
831 "mcelog",
832 &mce_chrdev_ops,
833};
834
d88203d1
TG
835/*
836 * Old style boot options parsing. Only for compatibility.
1da177e4 837 */
1da177e4
LT
838static int __init mcheck_disable(char *str)
839{
840 mce_dont_init = 1;
9b41046c 841 return 1;
1da177e4
LT
842}
843
5b4408fd 844/* mce=off disables machine check.
8c566ef5 845 mce=TOLERANCELEVEL (number, see above)
e583538f
AK
846 mce=bootlog Log MCEs from before booting. Disabled by default on AMD.
847 mce=nobootlog Don't log MCEs from before booting. */
1da177e4
LT
848static int __init mcheck_enable(char *str)
849{
850 if (!strcmp(str, "off"))
851 mce_dont_init = 1;
e583538f
AK
852 else if (!strcmp(str, "bootlog") || !strcmp(str,"nobootlog"))
853 mce_bootlog = str[0] == 'b';
8c566ef5
AK
854 else if (isdigit(str[0]))
855 get_option(&str, &tolerant);
1da177e4 856 else
d88203d1 857 printk("mce= argument %s ignored. Please use /sys", str);
9b41046c 858 return 1;
1da177e4
LT
859}
860
861__setup("nomce", mcheck_disable);
909dd324 862__setup("mce=", mcheck_enable);
1da177e4 863
d88203d1 864/*
1da177e4 865 * Sysfs support
d88203d1 866 */
1da177e4 867
973a2dd1
AK
868/*
869 * Disable machine checks on suspend and shutdown. We can't really handle
870 * them later.
871 */
872static int mce_disable(void)
873{
874 int i;
875
876 for (i = 0; i < banks; i++)
877 wrmsrl(MSR_IA32_MC0_CTL + i*4, 0);
878 return 0;
879}
880
881static int mce_suspend(struct sys_device *dev, pm_message_t state)
882{
883 return mce_disable();
884}
885
886static int mce_shutdown(struct sys_device *dev)
887{
888 return mce_disable();
889}
890
413588c7
AK
891/* On resume clear all MCE state. Don't want to see leftovers from the BIOS.
892 Only one CPU is active at this time, the others get readded later using
893 CPU hotplug. */
1da177e4
LT
894static int mce_resume(struct sys_device *dev)
895{
413588c7 896 mce_init(NULL);
6ec68bff 897 mce_cpu_features(&current_cpu_data);
1da177e4
LT
898 return 0;
899}
900
52d168e2
AK
901static void mce_cpu_restart(void *data)
902{
903 del_timer_sync(&__get_cpu_var(mce_timer));
904 if (mce_available(&current_cpu_data))
905 mce_init(NULL);
906 mce_init_timer();
907}
908
1da177e4 909/* Reinit MCEs after user configuration changes */
d88203d1
TG
910static void mce_restart(void)
911{
52d168e2 912 on_each_cpu(mce_cpu_restart, NULL, 1);
1da177e4
LT
913}
914
915static struct sysdev_class mce_sysclass = {
973a2dd1
AK
916 .suspend = mce_suspend,
917 .shutdown = mce_shutdown,
1da177e4 918 .resume = mce_resume,
af5ca3f4 919 .name = "machinecheck",
1da177e4
LT
920};
921
fff2e89f 922DEFINE_PER_CPU(struct sys_device, device_mce);
8735728e 923void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu) __cpuinitdata;
1da177e4
LT
924
925/* Why are there no generic functions for this? */
926#define ACCESSOR(name, var, start) \
4a0b2b4d
AK
927 static ssize_t show_ ## name(struct sys_device *s, \
928 struct sysdev_attribute *attr, \
929 char *buf) { \
d88203d1
TG
930 return sprintf(buf, "%lx\n", (unsigned long)var); \
931 } \
4a0b2b4d
AK
932 static ssize_t set_ ## name(struct sys_device *s, \
933 struct sysdev_attribute *attr, \
934 const char *buf, size_t siz) { \
d88203d1
TG
935 char *end; \
936 unsigned long new = simple_strtoul(buf, &end, 0); \
937 if (end == buf) return -EINVAL; \
938 var = new; \
939 start; \
940 return end-buf; \
941 } \
1da177e4
LT
942 static SYSDEV_ATTR(name, 0644, show_ ## name, set_ ## name);
943
0d7482e3
AK
944static struct sysdev_attribute *bank_attrs;
945
946static ssize_t show_bank(struct sys_device *s, struct sysdev_attribute *attr,
947 char *buf)
948{
949 u64 b = bank[attr - bank_attrs];
f6d1826d 950 return sprintf(buf, "%llx\n", b);
0d7482e3
AK
951}
952
953static ssize_t set_bank(struct sys_device *s, struct sysdev_attribute *attr,
954 const char *buf, size_t siz)
955{
956 char *end;
957 u64 new = simple_strtoull(buf, &end, 0);
958 if (end == buf)
959 return -EINVAL;
960 bank[attr - bank_attrs] = new;
961 mce_restart();
962 return end-buf;
963}
a98f0dd3 964
4a0b2b4d
AK
965static ssize_t show_trigger(struct sys_device *s, struct sysdev_attribute *attr,
966 char *buf)
a98f0dd3
AK
967{
968 strcpy(buf, trigger);
969 strcat(buf, "\n");
970 return strlen(trigger) + 1;
971}
972
4a0b2b4d
AK
973static ssize_t set_trigger(struct sys_device *s, struct sysdev_attribute *attr,
974 const char *buf,size_t siz)
a98f0dd3
AK
975{
976 char *p;
977 int len;
978 strncpy(trigger, buf, sizeof(trigger));
979 trigger[sizeof(trigger)-1] = 0;
980 len = strlen(trigger);
981 p = strchr(trigger, '\n');
982 if (*p) *p = 0;
983 return len;
984}
985
986static SYSDEV_ATTR(trigger, 0644, show_trigger, set_trigger);
d95d62c0 987static SYSDEV_INT_ATTR(tolerant, 0644, tolerant);
1da177e4 988ACCESSOR(check_interval,check_interval,mce_restart())
a98f0dd3 989static struct sysdev_attribute *mce_attributes[] = {
d95d62c0 990 &attr_tolerant.attr, &attr_check_interval, &attr_trigger,
a98f0dd3
AK
991 NULL
992};
1da177e4 993
996867d0 994static cpumask_var_t mce_device_initialized;
bae19fe0 995
91c6d400
AK
996/* Per cpu sysdev init. All of the cpus still share the same ctl bank */
997static __cpuinit int mce_create_device(unsigned int cpu)
1da177e4
LT
998{
999 int err;
73ca5358 1000 int i;
92cb7612 1001
90367556 1002 if (!mce_available(&boot_cpu_data))
91c6d400
AK
1003 return -EIO;
1004
d435d862 1005 memset(&per_cpu(device_mce, cpu).kobj, 0, sizeof(struct kobject));
91c6d400
AK
1006 per_cpu(device_mce,cpu).id = cpu;
1007 per_cpu(device_mce,cpu).cls = &mce_sysclass;
1008
1009 err = sysdev_register(&per_cpu(device_mce,cpu));
d435d862
AM
1010 if (err)
1011 return err;
1012
1013 for (i = 0; mce_attributes[i]; i++) {
1014 err = sysdev_create_file(&per_cpu(device_mce,cpu),
1015 mce_attributes[i]);
1016 if (err)
1017 goto error;
1018 }
0d7482e3
AK
1019 for (i = 0; i < banks; i++) {
1020 err = sysdev_create_file(&per_cpu(device_mce, cpu),
1021 &bank_attrs[i]);
1022 if (err)
1023 goto error2;
1024 }
996867d0 1025 cpumask_set_cpu(cpu, mce_device_initialized);
91c6d400 1026
d435d862 1027 return 0;
0d7482e3
AK
1028error2:
1029 while (--i >= 0) {
1030 sysdev_remove_file(&per_cpu(device_mce, cpu),
1031 &bank_attrs[i]);
1032 }
d435d862 1033error:
0d7482e3 1034 while (--i >= 0) {
d435d862
AM
1035 sysdev_remove_file(&per_cpu(device_mce,cpu),
1036 mce_attributes[i]);
91c6d400 1037 }
d435d862
AM
1038 sysdev_unregister(&per_cpu(device_mce,cpu));
1039
91c6d400
AK
1040 return err;
1041}
1042
2d9cd6c2 1043static __cpuinit void mce_remove_device(unsigned int cpu)
91c6d400 1044{
73ca5358
SL
1045 int i;
1046
996867d0 1047 if (!cpumask_test_cpu(cpu, mce_device_initialized))
bae19fe0
AH
1048 return;
1049
a98f0dd3 1050 for (i = 0; mce_attributes[i]; i++)
73ca5358 1051 sysdev_remove_file(&per_cpu(device_mce,cpu),
a98f0dd3 1052 mce_attributes[i]);
0d7482e3
AK
1053 for (i = 0; i < banks; i++)
1054 sysdev_remove_file(&per_cpu(device_mce, cpu),
1055 &bank_attrs[i]);
91c6d400 1056 sysdev_unregister(&per_cpu(device_mce,cpu));
996867d0 1057 cpumask_clear_cpu(cpu, mce_device_initialized);
91c6d400 1058}
91c6d400 1059
d6b75584 1060/* Make sure there are no machine checks on offlined CPUs. */
ec5b3d32 1061static void mce_disable_cpu(void *h)
d6b75584
AK
1062{
1063 int i;
88ccbedd 1064 unsigned long action = *(unsigned long *)h;
d6b75584
AK
1065
1066 if (!mce_available(&current_cpu_data))
1067 return;
88ccbedd
AK
1068 if (!(action & CPU_TASKS_FROZEN))
1069 cmci_clear();
d6b75584
AK
1070 for (i = 0; i < banks; i++)
1071 wrmsrl(MSR_IA32_MC0_CTL + i*4, 0);
1072}
1073
ec5b3d32 1074static void mce_reenable_cpu(void *h)
d6b75584
AK
1075{
1076 int i;
88ccbedd 1077 unsigned long action = *(unsigned long *)h;
d6b75584
AK
1078
1079 if (!mce_available(&current_cpu_data))
1080 return;
88ccbedd
AK
1081 if (!(action & CPU_TASKS_FROZEN))
1082 cmci_reenable();
d6b75584
AK
1083 for (i = 0; i < banks; i++)
1084 wrmsrl(MSR_IA32_MC0_CTL + i*4, bank[i]);
1085}
1086
91c6d400 1087/* Get notified when a cpu comes on/off. Be hotplug friendly. */
1e35669d
SR
1088static int __cpuinit mce_cpu_callback(struct notifier_block *nfb,
1089 unsigned long action, void *hcpu)
91c6d400
AK
1090{
1091 unsigned int cpu = (unsigned long)hcpu;
52d168e2 1092 struct timer_list *t = &per_cpu(mce_timer, cpu);
91c6d400
AK
1093
1094 switch (action) {
bae19fe0
AH
1095 case CPU_ONLINE:
1096 case CPU_ONLINE_FROZEN:
1097 mce_create_device(cpu);
8735728e
RW
1098 if (threshold_cpu_callback)
1099 threshold_cpu_callback(action, cpu);
91c6d400 1100 break;
91c6d400 1101 case CPU_DEAD:
8bb78442 1102 case CPU_DEAD_FROZEN:
8735728e
RW
1103 if (threshold_cpu_callback)
1104 threshold_cpu_callback(action, cpu);
91c6d400
AK
1105 mce_remove_device(cpu);
1106 break;
52d168e2
AK
1107 case CPU_DOWN_PREPARE:
1108 case CPU_DOWN_PREPARE_FROZEN:
1109 del_timer_sync(t);
88ccbedd 1110 smp_call_function_single(cpu, mce_disable_cpu, &action, 1);
52d168e2
AK
1111 break;
1112 case CPU_DOWN_FAILED:
1113 case CPU_DOWN_FAILED_FROZEN:
6298c512
AK
1114 t->expires = round_jiffies(jiffies +
1115 __get_cpu_var(next_interval));
52d168e2 1116 add_timer_on(t, cpu);
88ccbedd
AK
1117 smp_call_function_single(cpu, mce_reenable_cpu, &action, 1);
1118 break;
1119 case CPU_POST_DEAD:
1120 /* intentionally ignoring frozen here */
1121 cmci_rediscover(cpu);
52d168e2 1122 break;
91c6d400 1123 }
bae19fe0 1124 return NOTIFY_OK;
91c6d400
AK
1125}
1126
1e35669d 1127static struct notifier_block mce_cpu_notifier __cpuinitdata = {
91c6d400
AK
1128 .notifier_call = mce_cpu_callback,
1129};
1130
0d7482e3
AK
1131static __init int mce_init_banks(void)
1132{
1133 int i;
1134
1135 bank_attrs = kzalloc(sizeof(struct sysdev_attribute) * banks,
1136 GFP_KERNEL);
1137 if (!bank_attrs)
1138 return -ENOMEM;
1139
1140 for (i = 0; i < banks; i++) {
1141 struct sysdev_attribute *a = &bank_attrs[i];
1142 a->attr.name = kasprintf(GFP_KERNEL, "bank%d", i);
1143 if (!a->attr.name)
1144 goto nomem;
1145 a->attr.mode = 0644;
1146 a->show = show_bank;
1147 a->store = set_bank;
1148 }
1149 return 0;
1150
1151nomem:
1152 while (--i >= 0)
1153 kfree(bank_attrs[i].attr.name);
1154 kfree(bank_attrs);
1155 bank_attrs = NULL;
1156 return -ENOMEM;
1157}
1158
91c6d400
AK
1159static __init int mce_init_device(void)
1160{
1161 int err;
1162 int i = 0;
1163
1da177e4
LT
1164 if (!mce_available(&boot_cpu_data))
1165 return -EIO;
0d7482e3 1166
eaa95840 1167 zalloc_cpumask_var(&mce_device_initialized, GFP_KERNEL);
996867d0 1168
0d7482e3
AK
1169 err = mce_init_banks();
1170 if (err)
1171 return err;
1172
1da177e4 1173 err = sysdev_class_register(&mce_sysclass);
d435d862
AM
1174 if (err)
1175 return err;
91c6d400
AK
1176
1177 for_each_online_cpu(i) {
d435d862
AM
1178 err = mce_create_device(i);
1179 if (err)
1180 return err;
91c6d400
AK
1181 }
1182
be6b5a35 1183 register_hotcpu_notifier(&mce_cpu_notifier);
1da177e4
LT
1184 misc_register(&mce_log_device);
1185 return err;
1da177e4 1186}
91c6d400 1187
1da177e4 1188device_initcall(mce_init_device);