/*
 * Machine check handler.
 * K8 parts Copyright 2002,2003 Andi Kleen, SuSE Labs.
 * Rest from unknown author(s).
 * 2004 Andi Kleen. Rewrote most of it.
 */

#include <linux/init.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/string.h>
#include <linux/rcupdate.h>
#include <linux/kallsyms.h>
#include <linux/sysdev.h>
#include <linux/miscdevice.h>
#include <linux/fs.h>
#include <linux/cpu.h>
#include <linux/percpu.h>
#include <linux/ctype.h>
#include <asm/processor.h>
#include <asm/msr.h>
#include <asm/mce.h>
#include <asm/kdebug.h>
#include <asm/uaccess.h>

#define MISC_MCELOG_MINOR 227
#define NR_BANKS 5

static int mce_dont_init;

/* 0: always panic, 1: panic if deadlock possible, 2: try to avoid panic,
   3: never panic or exit (for testing only) */
static int tolerant = 1;
static int banks;
static unsigned long bank[NR_BANKS] = { [0 ... NR_BANKS-1] = ~0UL };
static unsigned long console_logged;
static int notify_user;
static int rip_msr;
static int mce_bootlog;

/*
 * Lockless MCE logging infrastructure.
 * This avoids deadlocks on printk locks without having to break locks. Also
 * separate MCEs from kernel messages to avoid bogus bug reports.
 */
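/*
 * Roughly: a writer reserves a slot by advancing mcelog.next with cmpxchg
 * and only then fills it in, setting ->finished last; a reader treats an
 * entry as valid only once ->finished is set. The barriers below order
 * those two steps against each other.
 */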

struct mce_log mcelog = {
	MCE_LOG_SIGNATURE,
	MCE_LOG_LEN,
};

void mce_log(struct mce *mce)
{
	unsigned next, entry;
	mce->finished = 0;
	wmb();
	for (;;) {
		entry = rcu_dereference(mcelog.next);
		/* The rmb forces the compiler to reload next in each
		   iteration */
		rmb();
		for (;;) {
			/* When the buffer fills up discard new entries. Assume
			   that the earlier errors are the more interesting. */
			if (entry >= MCE_LOG_LEN) {
				set_bit(MCE_OVERFLOW, &mcelog.flags);
				return;
			}
			/* Old left over entry. Skip. */
			if (mcelog.entry[entry].finished) {
				entry++;
				continue;
			}
			break;
		}
		smp_rmb();
		next = entry + 1;
		if (cmpxchg(&mcelog.next, entry, next) == entry)
			break;
	}
	memcpy(mcelog.entry + entry, mce, sizeof(struct mce));
	wmb();
	mcelog.entry[entry].finished = 1;
	wmb();

	if (!test_and_set_bit(0, &console_logged))
		notify_user = 1;
}

static void print_mce(struct mce *m)
{
	printk(KERN_EMERG "\n"
	       KERN_EMERG
	       "CPU %d: Machine Check Exception: %16Lx Bank %d: %016Lx\n",
	       m->cpu, m->mcgstatus, m->bank, m->status);
	if (m->rip) {
		printk(KERN_EMERG
		       "RIP%s %02x:<%016Lx> ",
		       !(m->mcgstatus & MCG_STATUS_EIPV) ? " !INEXACT!" : "",
		       m->cs, m->rip);
		if (m->cs == __KERNEL_CS)
			print_symbol("{%s}", m->rip);
		printk("\n");
	}
	printk(KERN_EMERG "TSC %Lx ", m->tsc);
	if (m->addr)
		printk("ADDR %Lx ", m->addr);
	if (m->misc)
		printk("MISC %Lx ", m->misc);
	printk("\n");
}

static void mce_panic(char *msg, struct mce *backup, unsigned long start)
{
	int i;
	oops_begin();
	for (i = 0; i < MCE_LOG_LEN; i++) {
		unsigned long tsc = mcelog.entry[i].tsc;
		if (time_before(tsc, start))
			continue;
		print_mce(&mcelog.entry[i]);
		if (backup && mcelog.entry[i].tsc == backup->tsc)
			backup = NULL;
	}
	if (backup)
		print_mce(backup);
	if (tolerant >= 3)
		printk("Fake panic: %s\n", msg);
	else
		panic(msg);
}

static int mce_available(struct cpuinfo_x86 *c)
{
	return test_bit(X86_FEATURE_MCE, &c->x86_capability) &&
	       test_bit(X86_FEATURE_MCA, &c->x86_capability);
}

static inline void mce_get_rip(struct mce *m, struct pt_regs *regs)
{
	if (regs && (m->mcgstatus & MCG_STATUS_RIPV)) {
		m->rip = regs->rip;
		m->cs = regs->cs;
	} else {
		m->rip = 0;
		m->cs = 0;
	}
	if (rip_msr) {
		/* Assume the RIP in the MSR is exact. Is this true? */
		m->mcgstatus |= MCG_STATUS_EIPV;
		rdmsrl(rip_msr, m->rip);
		m->cs = 0;
	}
}

/*
 * The actual machine check handler
 */

void do_machine_check(struct pt_regs * regs, long error_code)
{
	struct mce m, panicm;
	int nowayout = (tolerant < 1);
	int kill_it = 0;
	u64 mcestart = 0;
	int i;
	int panicm_found = 0;

	if (regs)
		notify_die(DIE_NMI, "machine check", regs, error_code, 255, SIGKILL);
	if (!banks)
		return;

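	/* Convention for error_code (see the checks below): >= 0 means a
	   real exception or the poll timer, -1 means log machine checks
	   left over from before boot, -2 means clear them silently.
	   regs is NULL when we were not called from the #MC vector. */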
	memset(&m, 0, sizeof(struct mce));
	m.cpu = hard_smp_processor_id();
	rdmsrl(MSR_IA32_MCG_STATUS, m.mcgstatus);
	if (!(m.mcgstatus & MCG_STATUS_RIPV))
		kill_it = 1;

	rdtscll(mcestart);
	barrier();

	for (i = 0; i < banks; i++) {
		if (!bank[i])
			continue;

		m.misc = 0;
		m.addr = 0;
		m.bank = i;
		m.tsc = 0;

		rdmsrl(MSR_IA32_MC0_STATUS + i*4, m.status);
		if ((m.status & MCI_STATUS_VAL) == 0)
			continue;

		if (m.status & MCI_STATUS_EN) {
			/* In theory _OVER could be a nowayout too, but
			   assume any overflowed errors were not fatal. */
			nowayout |= !!(m.status & MCI_STATUS_PCC);
			kill_it |= !!(m.status & MCI_STATUS_UC);
		}

		if (m.status & MCI_STATUS_MISCV)
			rdmsrl(MSR_IA32_MC0_MISC + i*4, m.misc);
		if (m.status & MCI_STATUS_ADDRV)
			rdmsrl(MSR_IA32_MC0_ADDR + i*4, m.addr);

		mce_get_rip(&m, regs);
		if (error_code >= 0)
			rdtscll(m.tsc);
		wrmsrl(MSR_IA32_MC0_STATUS + i*4, 0);
		if (error_code != -2)
			mce_log(&m);

		/* Did this bank cause the exception? */
		/* Assume that the bank with uncorrectable errors did it,
		   and that there is only a single one. */
		if ((m.status & MCI_STATUS_UC) && (m.status & MCI_STATUS_EN)) {
			panicm = m;
			panicm_found = 1;
		}

		add_taint(TAINT_MACHINE_CHECK);
	}

	/* Never do anything final in the polling timer */
	if (!regs)
		goto out;

	/* If we didn't find an uncorrectable error, pick
	   the last one (shouldn't happen, just being safe). */
	if (!panicm_found)
		panicm = m;
	if (nowayout)
		mce_panic("Machine check", &panicm, mcestart);
	if (kill_it) {
		int user_space = 0;

		if (m.mcgstatus & MCG_STATUS_RIPV)
			user_space = panicm.rip && (panicm.cs & 3);

		/* When the machine was in user space and the CPU didn't get
		   confused it's normally not necessary to panic, unless you
		   are paranoid (tolerant == 0)

		   RED-PEN could be more tolerant for MCEs in idle,
		   but most likely they occur at boot anyways, where
		   it is best to just halt the machine. */
		if ((!user_space && (panic_on_oops || tolerant < 2)) ||
		    (unsigned)current->pid <= 1)
			mce_panic("Uncorrected machine check", &panicm, mcestart);

		/* do_exit takes an awful lot of locks and has a
		   slight risk of deadlocking. If you don't want that
		   don't set tolerant >= 2 */
		if (tolerant < 3)
			do_exit(SIGBUS);
	}

 out:
	/* Last thing done in the machine check exception to clear state. */
	wrmsrl(MSR_IA32_MCG_STATUS, 0);
}

/*
 * Periodic polling timer for "silent" machine check errors.
 */

static int check_interval = 5 * 60; /* 5 minutes */
static void mcheck_timer(void *data);
static DECLARE_WORK(mcheck_work, mcheck_timer, NULL);
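
/* mcheck_timer() reschedules itself, so the single schedule_delayed_work()
   in periodic_mcheck_init() keeps the poll running every check_interval
   seconds. */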
static void mcheck_check_cpu(void *info)
{
	if (mce_available(&current_cpu_data))
		do_machine_check(NULL, 0);
}

static void mcheck_timer(void *data)
{
	on_each_cpu(mcheck_check_cpu, NULL, 1, 1);
	schedule_delayed_work(&mcheck_work, check_interval * HZ);

	/*
	 * It's ok to read stale data here for notify_user and
	 * console_logged as we'll simply get the updated versions
	 * on the next mcheck_timer execution and atomic operations
	 * on console_logged act as synchronization for notify_user
	 * writes.
	 */
	if (notify_user && console_logged) {
		notify_user = 0;
		clear_bit(0, &console_logged);
		printk(KERN_INFO "Machine check events logged\n");
	}
}


static __init int periodic_mcheck_init(void)
{
	if (check_interval)
		schedule_delayed_work(&mcheck_work, check_interval*HZ);
	return 0;
}
__initcall(periodic_mcheck_init);


/*
 * Initialize Machine Checks for a CPU.
 */
static void mce_init(void *dummy)
{
	u64 cap;
	int i;

	rdmsrl(MSR_IA32_MCG_CAP, cap);
	banks = cap & 0xff;
	if (banks > NR_BANKS) {
		printk(KERN_INFO "MCE: warning: using only %d banks\n", NR_BANKS);
		banks = NR_BANKS;
	}
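	/* In MCG_CAP, bit 9 (MCG_EXT_P) advertises the extended machine
	   check register bank and bits 16-23 give its size; a bank of at
	   least 9 registers includes the MCG_EIP register used below. */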
	/* Use accurate RIP reporting if available. */
	if ((cap & (1<<9)) && ((cap >> 16) & 0xff) >= 9)
		rip_msr = MSR_IA32_MCG_EIP;

	/* Log the machine checks left over from the previous reset.
	   This also clears all registers */
	do_machine_check(NULL, mce_bootlog ? -1 : -2);

	set_in_cr4(X86_CR4_MCE);

	if (cap & MCG_CTL_P)
		wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff);

	for (i = 0; i < banks; i++) {
		wrmsrl(MSR_IA32_MC0_CTL+4*i, bank[i]);
		wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0);
	}
}

/* Add per CPU specific workarounds here */
static void __cpuinit mce_cpu_quirks(struct cpuinfo_x86 *c)
{
	/* This should be disabled by the BIOS, but isn't always */
	if (c->x86_vendor == X86_VENDOR_AMD && c->x86 == 15) {
		/* disable GART TBL walk error reporting, which trips off
		   incorrectly with the IOMMU & 3ware & Cerberus. */
		clear_bit(10, &bank[4]);
	}
}

static void __cpuinit mce_cpu_features(struct cpuinfo_x86 *c)
{
	switch (c->x86_vendor) {
	case X86_VENDOR_INTEL:
		mce_intel_feature_init(c);
		break;
	case X86_VENDOR_AMD:
		mce_amd_feature_init(c);
		break;
	default:
		break;
	}
}

/*
 * Called for each booted CPU to set up machine checks.
 * Must be called with preempt off.
 */
void __cpuinit mcheck_init(struct cpuinfo_x86 *c)
{
	static cpumask_t mce_cpus __initdata = CPU_MASK_NONE;

	mce_cpu_quirks(c);

	if (mce_dont_init ||
	    cpu_test_and_set(smp_processor_id(), mce_cpus) ||
	    !mce_available(c))
		return;

	mce_init(NULL);
	mce_cpu_features(c);
}

/*
 * Character device to read and clear the MCE log.
 */

static void collect_tscs(void *data)
{
	unsigned long *cpu_tsc = (unsigned long *)data;
	rdtscll(cpu_tsc[smp_processor_id()]);
}

static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize, loff_t *off)
{
	unsigned long *cpu_tsc;
	static DECLARE_MUTEX(mce_read_sem);
	unsigned next;
	char __user *buf = ubuf;
	int i, err;

	cpu_tsc = kmalloc(NR_CPUS * sizeof(long), GFP_KERNEL);
	if (!cpu_tsc)
		return -ENOMEM;

	down(&mce_read_sem);
	next = rcu_dereference(mcelog.next);

	/* Only supports full reads right now */
	if (*off != 0 || usize < MCE_LOG_LEN*sizeof(struct mce)) {
		up(&mce_read_sem);
		kfree(cpu_tsc);
		return -EINVAL;
	}

	err = 0;
	for (i = 0; i < next; i++) {
		unsigned long start = jiffies;
		while (!mcelog.entry[i].finished) {
			if (!time_before(jiffies, start + 2)) {
				/* The writer stalled mid-entry; drop the
				   entry and move on, finished will never
				   be set for it. */
				memset(mcelog.entry + i, 0, sizeof(struct mce));
				goto timeout;
			}
			cpu_relax();
		}
		smp_rmb();
		err |= copy_to_user(buf, mcelog.entry + i, sizeof(struct mce));
		buf += sizeof(struct mce);
 timeout:
		;
	}

	memset(mcelog.entry, 0, next * sizeof(struct mce));
	mcelog.next = 0;

	synchronize_sched();

	/* Collect entries that were still getting written before the synchronize. */

	on_each_cpu(collect_tscs, cpu_tsc, 1, 1);
	for (i = next; i < MCE_LOG_LEN; i++) {
		if (mcelog.entry[i].finished &&
		    mcelog.entry[i].tsc < cpu_tsc[mcelog.entry[i].cpu]) {
			err |= copy_to_user(buf, mcelog.entry+i, sizeof(struct mce));
			smp_rmb();
			buf += sizeof(struct mce);
			memset(&mcelog.entry[i], 0, sizeof(struct mce));
		}
	}
	up(&mce_read_sem);
	kfree(cpu_tsc);
	return err ? -EFAULT : buf - ubuf;
}

static int mce_ioctl(struct inode *i, struct file *f, unsigned int cmd, unsigned long arg)
{
	int __user *p = (int __user *)arg;
	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;
	switch (cmd) {
	case MCE_GET_RECORD_LEN:
		return put_user(sizeof(struct mce), p);
	case MCE_GET_LOG_LEN:
		return put_user(MCE_LOG_LEN, p);
	case MCE_GETCLEAR_FLAGS: {
		unsigned flags;
		do {
			flags = mcelog.flags;
		} while (cmpxchg(&mcelog.flags, flags, 0) != flags);
		return put_user(flags, p);
	}
	default:
		return -ENOTTY;
	}
}

static struct file_operations mce_chrdev_ops = {
	.read = mce_read,
	.ioctl = mce_ioctl,
};

static struct miscdevice mce_log_device = {
	MISC_MCELOG_MINOR,
	"mcelog",
	&mce_chrdev_ops,
};
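
/*
 * Roughly how a userspace consumer such as mcelog(8) uses this device
 * (a sketch, not code from this file):
 *
 *	int fd = open("/dev/mcelog", O_RDONLY);
 *	int reclen;
 *	ioctl(fd, MCE_GET_RECORD_LEN, &reclen);
 *	char *buf = malloc(reclen * MCE_LOG_LEN);
 *	int n = read(fd, buf, reclen * MCE_LOG_LEN);  // reads and clears the log
 */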

/*
 * Old style boot options parsing. Only for compatibility.
 */

static int __init mcheck_disable(char *str)
{
	mce_dont_init = 1;
	return 0;
}

/* mce=off disables machine check. Note you can reenable it later
   using sysfs.
   mce=TOLERANCELEVEL (number, see above)
   mce=bootlog Log MCEs from before booting. Disabled by default to work
   around buggy BIOSes that leave bogus MCEs. */
static int __init mcheck_enable(char *str)
{
	if (*str == '=')
		str++;
	if (!strcmp(str, "off"))
		mce_dont_init = 1;
	else if (!strcmp(str, "bootlog"))
		mce_bootlog = 1;
	else if (isdigit(str[0]))
		get_option(&str, &tolerant);
	else
		printk("mce= argument %s ignored. Please use /sys\n", str);
	return 0;
}

__setup("nomce", mcheck_disable);
__setup("mce", mcheck_enable);
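
/* e.g. booting with "mce=off", "mce=bootlog", or "mce=2" (tolerant level 2). */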

/*
 * Sysfs support
 */

/* On resume clear all MCE state. Don't want to see leftovers from the BIOS.
   Only one CPU is active at this time, the others get readded later using
   CPU hotplug. */
static int mce_resume(struct sys_device *dev)
{
	mce_init(NULL);
	return 0;
}

/* Reinit MCEs after user configuration changes */
static void mce_restart(void)
{
	if (check_interval)
		cancel_delayed_work(&mcheck_work);
	/* Timer race is harmless here */
	on_each_cpu(mce_init, NULL, 1, 1);
	if (check_interval)
		schedule_delayed_work(&mcheck_work, check_interval*HZ);
}

static struct sysdev_class mce_sysclass = {
	.resume = mce_resume,
	set_kset_name("machinecheck"),
};

static DEFINE_PER_CPU(struct sys_device, device_mce);

/* Why are there no generic functions for this? */
#define ACCESSOR(name, var, start) \
	static ssize_t show_ ## name(struct sys_device *s, char *buf) { \
		return sprintf(buf, "%lx\n", (unsigned long)var); \
	} \
	static ssize_t set_ ## name(struct sys_device *s, const char *buf, size_t siz) { \
		char *end; \
		unsigned long new = simple_strtoul(buf, &end, 0); \
		if (end == buf) return -EINVAL; \
		var = new; \
		start; \
		return end-buf; \
	} \
	static SYSDEV_ATTR(name, 0644, show_ ## name, set_ ## name);
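
/* Each ACCESSOR below becomes a file under
   /sys/devices/system/machinecheck/machinecheck<cpu>/; writing the bank
   controls or check_interval reinitializes MCE via mce_restart(). */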
566
567ACCESSOR(bank0ctl,bank[0],mce_restart())
568ACCESSOR(bank1ctl,bank[1],mce_restart())
569ACCESSOR(bank2ctl,bank[2],mce_restart())
570ACCESSOR(bank3ctl,bank[3],mce_restart())
571ACCESSOR(bank4ctl,bank[4],mce_restart())
572ACCESSOR(tolerant,tolerant,)
573ACCESSOR(check_interval,check_interval,mce_restart())
574
91c6d400
AK
575/* Per cpu sysdev init. All of the cpus still share the same ctl bank */
576static __cpuinit int mce_create_device(unsigned int cpu)
1da177e4
LT
577{
578 int err;
91c6d400
AK
579 if (!mce_available(&cpu_data[cpu]))
580 return -EIO;
581
582 per_cpu(device_mce,cpu).id = cpu;
583 per_cpu(device_mce,cpu).cls = &mce_sysclass;
584
585 err = sysdev_register(&per_cpu(device_mce,cpu));
586
587 if (!err) {
588 sysdev_create_file(&per_cpu(device_mce,cpu), &attr_bank0ctl);
589 sysdev_create_file(&per_cpu(device_mce,cpu), &attr_bank1ctl);
590 sysdev_create_file(&per_cpu(device_mce,cpu), &attr_bank2ctl);
591 sysdev_create_file(&per_cpu(device_mce,cpu), &attr_bank3ctl);
592 sysdev_create_file(&per_cpu(device_mce,cpu), &attr_bank4ctl);
593 sysdev_create_file(&per_cpu(device_mce,cpu), &attr_tolerant);
594 sysdev_create_file(&per_cpu(device_mce,cpu), &attr_check_interval);
595 }
596 return err;
597}
598
599#ifdef CONFIG_HOTPLUG_CPU
600static __cpuinit void mce_remove_device(unsigned int cpu)
601{
602 sysdev_remove_file(&per_cpu(device_mce,cpu), &attr_bank0ctl);
603 sysdev_remove_file(&per_cpu(device_mce,cpu), &attr_bank1ctl);
604 sysdev_remove_file(&per_cpu(device_mce,cpu), &attr_bank2ctl);
605 sysdev_remove_file(&per_cpu(device_mce,cpu), &attr_bank3ctl);
606 sysdev_remove_file(&per_cpu(device_mce,cpu), &attr_bank4ctl);
607 sysdev_remove_file(&per_cpu(device_mce,cpu), &attr_tolerant);
608 sysdev_remove_file(&per_cpu(device_mce,cpu), &attr_check_interval);
609 sysdev_unregister(&per_cpu(device_mce,cpu));
610}
611#endif
612
613/* Get notified when a cpu comes on/off. Be hotplug friendly. */
614static __cpuinit int
615mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
616{
617 unsigned int cpu = (unsigned long)hcpu;
618
619 switch (action) {
620 case CPU_ONLINE:
621 mce_create_device(cpu);
622 break;
623#ifdef CONFIG_HOTPLUG_CPU
624 case CPU_DEAD:
625 mce_remove_device(cpu);
626 break;
627#endif
628 }
629 return NOTIFY_OK;
630}
631
632static struct notifier_block mce_cpu_notifier = {
633 .notifier_call = mce_cpu_callback,
634};
635
636static __init int mce_init_device(void)
637{
638 int err;
639 int i = 0;
640
1da177e4
LT
641 if (!mce_available(&boot_cpu_data))
642 return -EIO;
643 err = sysdev_class_register(&mce_sysclass);
91c6d400
AK
644
645 for_each_online_cpu(i) {
646 mce_create_device(i);
647 }
648
649 register_cpu_notifier(&mce_cpu_notifier);
1da177e4
LT
650 misc_register(&mce_log_device);
651 return err;
1da177e4 652}
91c6d400 653
1da177e4 654device_initcall(mce_init_device);