/*
 * arch/x86_64/kernel/mce.c - Machine check handler.
 * K8 parts Copyright 2002,2003 Andi Kleen, SuSE Labs.
 * Rest from unknown author(s).
 * 2004 Andi Kleen. Rewrote most of it.
 */

#include <linux/init.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/string.h>
#include <linux/rcupdate.h>
#include <linux/kallsyms.h>
#include <linux/sysdev.h>
#include <linux/miscdevice.h>
#include <linux/fs.h>
#include <linux/cpu.h>
#include <linux/percpu.h>
#include <linux/ctype.h>
#include <asm/processor.h>
#include <asm/msr.h>
#include <asm/mce.h>
#include <asm/kdebug.h>
#include <asm/uaccess.h>

#define MISC_MCELOG_MINOR 227
#define NR_BANKS 5

static int mce_dont_init;

/* 0: always panic, 1: panic if deadlock possible, 2: try to avoid panic,
   3: never panic or exit (for testing only) */
static int tolerant = 1;
static int banks;
static unsigned long bank[NR_BANKS] = { [0 ... NR_BANKS-1] = ~0UL };
static unsigned long console_logged;
static int notify_user;
static int rip_msr;
static int mce_bootlog;

/*
 * Lockless MCE logging infrastructure.
 * This avoids deadlocks on printk locks without having to break them. It also
 * keeps MCEs separate from kernel messages to avoid bogus bug reports.
 */

struct mce_log mcelog = {
	MCE_LOG_SIGNATURE,
	MCE_LOG_LEN,
};

void mce_log(struct mce *mce)
{
	unsigned next, entry;
	mce->finished = 0;
	wmb();
	for (;;) {
		entry = rcu_dereference(mcelog.next);
		/* The rmb forces the compiler to reload next in each
		   iteration */
		rmb();
		for (;;) {
			/* When the buffer fills up discard new entries. Assume
			   that the earlier errors are the more interesting. */
			if (entry >= MCE_LOG_LEN) {
				set_bit(MCE_OVERFLOW, &mcelog.flags);
				return;
			}
			/* Old left over entry. Skip. */
			if (mcelog.entry[entry].finished) {
				entry++;
				continue;
			}
			break;
		}
		smp_rmb();
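		/* Try to claim the slot atomically; if another CPU raced us,
		   rescan from the updated mcelog.next. */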
		next = entry + 1;
		if (cmpxchg(&mcelog.next, entry, next) == entry)
			break;
	}
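	/* The slot is ours now; fill it in completely and only then set
	   ->finished, so readers never see a half-written record. */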
	memcpy(mcelog.entry + entry, mce, sizeof(struct mce));
	wmb();
	mcelog.entry[entry].finished = 1;
	wmb();

	if (!test_and_set_bit(0, &console_logged))
		notify_user = 1;
}

static void print_mce(struct mce *m)
{
	printk(KERN_EMERG "\n"
	       KERN_EMERG
	       "CPU %d: Machine Check Exception: %16Lx Bank %d: %016Lx\n",
	       m->cpu, m->mcgstatus, m->bank, m->status);
	if (m->rip) {
		printk(KERN_EMERG
		       "RIP%s %02x:<%016Lx> ",
		       !(m->mcgstatus & MCG_STATUS_EIPV) ? " !INEXACT!" : "",
		       m->cs, m->rip);
		if (m->cs == __KERNEL_CS)
			print_symbol("{%s}", m->rip);
		printk("\n");
	}
	printk(KERN_EMERG "TSC %Lx ", m->tsc);
	if (m->addr)
		printk("ADDR %Lx ", m->addr);
	if (m->misc)
		printk("MISC %Lx ", m->misc);
	printk("\n");
}

static void mce_panic(char *msg, struct mce *backup, unsigned long start)
{
	int i;
	oops_begin();
	for (i = 0; i < MCE_LOG_LEN; i++) {
		unsigned long tsc = mcelog.entry[i].tsc;
		if (time_before(tsc, start))
			continue;
		print_mce(&mcelog.entry[i]);
		if (backup && mcelog.entry[i].tsc == backup->tsc)
			backup = NULL;
	}
	if (backup)
		print_mce(backup);
	if (tolerant >= 3)
		printk("Fake panic: %s\n", msg);
	else
		panic(msg);
}

static int mce_available(struct cpuinfo_x86 *c)
{
	return test_bit(X86_FEATURE_MCE, &c->x86_capability) &&
	       test_bit(X86_FEATURE_MCA, &c->x86_capability);
}

static inline void mce_get_rip(struct mce *m, struct pt_regs *regs)
{
	if (regs && (m->mcgstatus & MCG_STATUS_RIPV)) {
		m->rip = regs->rip;
		m->cs = regs->cs;
	} else {
		m->rip = 0;
		m->cs = 0;
	}
	if (rip_msr) {
		/* Assume the RIP in the MSR is exact. Is this true? */
		m->mcgstatus |= MCG_STATUS_EIPV;
		rdmsrl(rip_msr, m->rip);
		m->cs = 0;
	}
}

/*
 * The actual machine check handler
 */

void do_machine_check(struct pt_regs * regs, long error_code)
{
	struct mce m, panicm;
	int nowayout = (tolerant < 1);
	int kill_it = 0;
	u64 mcestart = 0;
	int i;
	int panicm_found = 0;

	if (regs)
		notify_die(DIE_NMI, "machine check", regs, error_code, 255, SIGKILL);
	if (!banks)
		return;

	memset(&m, 0, sizeof(struct mce));
	m.cpu = hard_smp_processor_id();
	rdmsrl(MSR_IA32_MCG_STATUS, m.mcgstatus);
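	/* If RIPV is clear the saved RIP is not valid for a restart and
	   the interrupted context cannot safely continue. */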
	if (!(m.mcgstatus & MCG_STATUS_RIPV))
		kill_it = 1;

	rdtscll(mcestart);
	barrier();

	for (i = 0; i < banks; i++) {
		if (!bank[i])
			continue;

		m.misc = 0;
		m.addr = 0;
		m.bank = i;
		m.tsc = 0;

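		/* Each bank occupies four consecutive MSRs
		   (CTL, STATUS, ADDR, MISC), hence the i*4 stride. */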
		rdmsrl(MSR_IA32_MC0_STATUS + i*4, m.status);
		if ((m.status & MCI_STATUS_VAL) == 0)
			continue;

		if (m.status & MCI_STATUS_EN) {
			/* In theory _OVER could be a nowayout too, but
			   assume any overflowed errors were not fatal. */
			nowayout |= !!(m.status & MCI_STATUS_PCC);
			kill_it |= !!(m.status & MCI_STATUS_UC);
		}

		if (m.status & MCI_STATUS_MISCV)
			rdmsrl(MSR_IA32_MC0_MISC + i*4, m.misc);
		if (m.status & MCI_STATUS_ADDRV)
			rdmsrl(MSR_IA32_MC0_ADDR + i*4, m.addr);

		mce_get_rip(&m, regs);
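		/* error_code < 0 marks a call from mce_init(): -1 logs
		   boot leftovers, -2 discards them (see mce_bootlog). */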
		if (error_code >= 0)
			rdtscll(m.tsc);
		wrmsrl(MSR_IA32_MC0_STATUS + i*4, 0);
		if (error_code != -2)
			mce_log(&m);

		/* Did this bank cause the exception? */
		/* Assume that the bank with uncorrectable errors did it,
		   and that there is only a single one. */
		if ((m.status & MCI_STATUS_UC) && (m.status & MCI_STATUS_EN)) {
			panicm = m;
			panicm_found = 1;
		}

		add_taint(TAINT_MACHINE_CHECK);
	}

	/* Never do anything final in the polling timer */
	if (!regs)
		goto out;

	/* If we didn't find an uncorrectable error, pick
	   the last one (shouldn't happen, just being safe). */
	if (!panicm_found)
		panicm = m;
	if (nowayout)
		mce_panic("Machine check", &panicm, mcestart);
	if (kill_it) {
		int user_space = 0;

		if (m.mcgstatus & MCG_STATUS_RIPV)
			user_space = panicm.rip && (panicm.cs & 3);

		/* When the machine was in user space and the CPU didn't get
		   confused it's normally not necessary to panic, unless you
		   are paranoid (tolerant == 0)

		   RED-PEN could be more tolerant for MCEs in idle,
		   but most likely they occur at boot anyway, where
		   it is best to just halt the machine. */
		if ((!user_space && (panic_on_oops || tolerant < 2)) ||
		    (unsigned)current->pid <= 1)
			mce_panic("Uncorrected machine check", &panicm, mcestart);

		/* do_exit takes an awful lot of locks and has a slight
		   risk of deadlocking. If you don't want that, don't set
		   tolerant >= 2 */
		if (tolerant < 3)
			do_exit(SIGBUS);
	}

 out:
	/* Last thing done in the machine check exception to clear state. */
	wrmsrl(MSR_IA32_MCG_STATUS, 0);
}

/*
 * Periodic polling timer for "silent" machine check errors.
 */

static int check_interval = 5 * 60; /* 5 minutes */
static void mcheck_timer(void *data);
static DECLARE_WORK(mcheck_work, mcheck_timer, NULL);

static void mcheck_check_cpu(void *info)
{
	if (mce_available(&current_cpu_data))
		do_machine_check(NULL, 0);
}

static void mcheck_timer(void *data)
{
	on_each_cpu(mcheck_check_cpu, NULL, 1, 1);
	schedule_delayed_work(&mcheck_work, check_interval * HZ);

	/*
	 * It's ok to read stale data here for notify_user and
	 * console_logged as we'll simply get the updated versions
	 * on the next mcheck_timer execution and atomic operations
	 * on console_logged act as synchronization for notify_user
	 * writes.
	 */
	if (notify_user && console_logged) {
		notify_user = 0;
		clear_bit(0, &console_logged);
		printk(KERN_INFO "Machine check events logged\n");
	}
}

static __init int periodic_mcheck_init(void)
{
	if (check_interval)
		schedule_delayed_work(&mcheck_work, check_interval*HZ);
	return 0;
}
__initcall(periodic_mcheck_init);

/*
 * Initialize Machine Checks for a CPU.
 */
static void mce_init(void *dummy)
{
	u64 cap;
	int i;

	rdmsrl(MSR_IA32_MCG_CAP, cap);
	banks = cap & 0xff;
	if (banks > NR_BANKS) {
		printk(KERN_INFO "MCE: warning: using only %d banks\n", banks);
		banks = NR_BANKS;
	}
	/* Use accurate RIP reporting if available. */
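	/* MCG_EXT_P (bit 9) set and an extended register count of at
	   least 9 mean the MCG_EIP extended MSR is implemented. */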
	if ((cap & (1<<9)) && ((cap >> 16) & 0xff) >= 9)
		rip_msr = MSR_IA32_MCG_EIP;

	/* Log the machine checks left over from the previous reset.
	   This also clears all registers */
	do_machine_check(NULL, mce_bootlog ? -1 : -2);

	set_in_cr4(X86_CR4_MCE);

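	/* MCG_CTL_P: a global MCG_CTL register exists; enable all of
	   its reporting bits. */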
	if (cap & MCG_CTL_P)
		wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff);

	for (i = 0; i < banks; i++) {
		wrmsrl(MSR_IA32_MC0_CTL+4*i, bank[i]);
		wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0);
	}
}

/* Add per CPU specific workarounds here */
static void __cpuinit mce_cpu_quirks(struct cpuinfo_x86 *c)
{
	/* This should be disabled by the BIOS, but isn't always */
	if (c->x86_vendor == X86_VENDOR_AMD && c->x86 == 15) {
		/* disable GART TBL walk error reporting, which trips off
		   incorrectly with the IOMMU & 3ware & Cerberus. */
		clear_bit(10, &bank[4]);
	}
}

static void __cpuinit mce_cpu_features(struct cpuinfo_x86 *c)
{
	switch (c->x86_vendor) {
	case X86_VENDOR_INTEL:
		mce_intel_feature_init(c);
		break;
	default:
		break;
	}
}

/*
 * Called for each booted CPU to set up machine checks.
 * Must be called with preempt off.
 */
void __cpuinit mcheck_init(struct cpuinfo_x86 *c)
{
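	/* Make sure each CPU is set up only once, even if mcheck_init()
	   gets called again for it. */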
	static cpumask_t mce_cpus __initdata = CPU_MASK_NONE;

	mce_cpu_quirks(c);

	if (mce_dont_init ||
	    cpu_test_and_set(smp_processor_id(), mce_cpus) ||
	    !mce_available(c))
		return;

	mce_init(NULL);
	mce_cpu_features(c);
}

/*
 * Character device to read and clear the MCE log.
 */

static void collect_tscs(void *data)
{
	unsigned long *cpu_tsc = (unsigned long *)data;
	rdtscll(cpu_tsc[smp_processor_id()]);
}

static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize, loff_t *off)
{
	unsigned long *cpu_tsc;
	static DECLARE_MUTEX(mce_read_sem);
	unsigned next;
	char __user *buf = ubuf;
	int i, err;

	cpu_tsc = kmalloc(NR_CPUS * sizeof(long), GFP_KERNEL);
	if (!cpu_tsc)
		return -ENOMEM;

	down(&mce_read_sem);
	next = rcu_dereference(mcelog.next);

	/* Only supports full reads right now */
	if (*off != 0 || usize < MCE_LOG_LEN*sizeof(struct mce)) {
		up(&mce_read_sem);
		kfree(cpu_tsc);
		return -EINVAL;
	}

	err = 0;
	for (i = 0; i < next; i++) {
		unsigned long start = jiffies;
		while (!mcelog.entry[i].finished) {
			if (!time_before(jiffies, start + 2)) {
				/* Entry never got finished; discard it and
				   stop waiting, otherwise we would spin
				   here forever. */
				memset(mcelog.entry + i, 0, sizeof(struct mce));
				break;
			}
			cpu_relax();
		}
		smp_rmb();
		err |= copy_to_user(buf, mcelog.entry + i, sizeof(struct mce));
		buf += sizeof(struct mce);
	}

	memset(mcelog.entry, 0, next * sizeof(struct mce));
	mcelog.next = 0;

	synchronize_sched();
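	/* After synchronize_sched() any mce_log() call that sampled the
	   old mcelog.next has completed; later writers start at slot 0. */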

	/* Collect entries that were still getting written before the synchronize. */

	on_each_cpu(collect_tscs, cpu_tsc, 1, 1);
	for (i = next; i < MCE_LOG_LEN; i++) {
		if (mcelog.entry[i].finished &&
		    mcelog.entry[i].tsc < cpu_tsc[mcelog.entry[i].cpu]) {
			err |= copy_to_user(buf, mcelog.entry+i, sizeof(struct mce));
			smp_rmb();
			buf += sizeof(struct mce);
			memset(&mcelog.entry[i], 0, sizeof(struct mce));
		}
	}
	up(&mce_read_sem);
	kfree(cpu_tsc);
	return err ? -EFAULT : buf - ubuf;
}

static int mce_ioctl(struct inode *i, struct file *f, unsigned int cmd, unsigned long arg)
{
	int __user *p = (int __user *)arg;
	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;
	switch (cmd) {
	case MCE_GET_RECORD_LEN:
		return put_user(sizeof(struct mce), p);
	case MCE_GET_LOG_LEN:
		return put_user(MCE_LOG_LEN, p);
	case MCE_GETCLEAR_FLAGS: {
		unsigned flags;
		do {
			flags = mcelog.flags;
		} while (cmpxchg(&mcelog.flags, flags, 0) != flags);
		return put_user(flags, p);
	}
	default:
		return -ENOTTY;
	}
}

static struct file_operations mce_chrdev_ops = {
	.read = mce_read,
	.ioctl = mce_ioctl,
};

static struct miscdevice mce_log_device = {
	MISC_MCELOG_MINOR,
	"mcelog",
	&mce_chrdev_ops,
};

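/*
 * A minimal sketch of how a userspace consumer (such as an mcelog daemon)
 * might use this device; illustrative only, error handling omitted:
 *
 *	int fd = open("/dev/mcelog", O_RDONLY);
 *	int recordlen, loglen;
 *	ioctl(fd, MCE_GET_RECORD_LEN, &recordlen);
 *	ioctl(fd, MCE_GET_LOG_LEN, &loglen);
 *	char *buf = malloc(recordlen * loglen);
 *	int n = read(fd, buf, recordlen * loglen);
 *
 * mce_read() only accepts requests for the full buffer, and reading
 * drains the log, so each record is returned at most once.
 */
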
/*
 * Old style boot options parsing. Only for compatibility.
 */

static int __init mcheck_disable(char *str)
{
	mce_dont_init = 1;
	return 0;
}

/* mce=off disables machine check. Note you can reenable it later
   using sysfs.
   mce=TOLERANCELEVEL (number, see above)
   mce=bootlog Log MCEs from before booting. Disabled by default to work
   around buggy BIOSes that leave bogus MCEs. */
static int __init mcheck_enable(char *str)
{
	if (*str == '=')
		str++;
	if (!strcmp(str, "off"))
		mce_dont_init = 1;
	else if (!strcmp(str, "bootlog"))
		mce_bootlog = 1;
	else if (isdigit(str[0]))
		get_option(&str, &tolerant);
	else
		printk("mce= argument %s ignored. Please use /sys\n", str);
	return 0;
}

__setup("nomce", mcheck_disable);
__setup("mce", mcheck_enable);

/*
 * Sysfs support
 */

/* On resume clear all MCE state. Don't want to see leftovers from the BIOS.
   Only one CPU is active at this time, the others get readded later using
   CPU hotplug. */
static int mce_resume(struct sys_device *dev)
{
	mce_init(NULL);
	return 0;
}

/* Reinit MCEs after user configuration changes */
static void mce_restart(void)
{
	if (check_interval)
		cancel_delayed_work(&mcheck_work);
	/* Timer race is harmless here */
	on_each_cpu(mce_init, NULL, 1, 1);
	if (check_interval)
		schedule_delayed_work(&mcheck_work, check_interval*HZ);
}

static struct sysdev_class mce_sysclass = {
	.resume = mce_resume,
	set_kset_name("machinecheck"),
};

static DEFINE_PER_CPU(struct sys_device, device_mce);
548
549/* Why are there no generic functions for this? */
550#define ACCESSOR(name, var, start) \
551 static ssize_t show_ ## name(struct sys_device *s, char *buf) { \
552 return sprintf(buf, "%lx\n", (unsigned long)var); \
553 } \
554 static ssize_t set_ ## name(struct sys_device *s,const char *buf,size_t siz) { \
555 char *end; \
556 unsigned long new = simple_strtoul(buf, &end, 0); \
557 if (end == buf) return -EINVAL; \
558 var = new; \
559 start; \
560 return end-buf; \
561 } \
562 static SYSDEV_ATTR(name, 0644, show_ ## name, set_ ## name);
563
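/* For example, ACCESSOR(bank0ctl, bank[0], mce_restart()) expands into
   show_bank0ctl(), set_bank0ctl() and attr_bank0ctl; writing the sysfs
   file reprograms the bank control value and triggers mce_restart(). */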
ACCESSOR(bank0ctl,bank[0],mce_restart())
ACCESSOR(bank1ctl,bank[1],mce_restart())
ACCESSOR(bank2ctl,bank[2],mce_restart())
ACCESSOR(bank3ctl,bank[3],mce_restart())
ACCESSOR(bank4ctl,bank[4],mce_restart())
ACCESSOR(tolerant,tolerant,)
ACCESSOR(check_interval,check_interval,mce_restart())

/* Per cpu sysdev init. All of the cpus still share the same ctl bank */
static __cpuinit int mce_create_device(unsigned int cpu)
{
	int err;
	if (!mce_available(&cpu_data[cpu]))
		return -EIO;

	per_cpu(device_mce,cpu).id = cpu;
	per_cpu(device_mce,cpu).cls = &mce_sysclass;

	err = sysdev_register(&per_cpu(device_mce,cpu));

	if (!err) {
		sysdev_create_file(&per_cpu(device_mce,cpu), &attr_bank0ctl);
		sysdev_create_file(&per_cpu(device_mce,cpu), &attr_bank1ctl);
		sysdev_create_file(&per_cpu(device_mce,cpu), &attr_bank2ctl);
		sysdev_create_file(&per_cpu(device_mce,cpu), &attr_bank3ctl);
		sysdev_create_file(&per_cpu(device_mce,cpu), &attr_bank4ctl);
		sysdev_create_file(&per_cpu(device_mce,cpu), &attr_tolerant);
		sysdev_create_file(&per_cpu(device_mce,cpu), &attr_check_interval);
	}
	return err;
}

#ifdef CONFIG_HOTPLUG_CPU
static __cpuinit void mce_remove_device(unsigned int cpu)
{
	sysdev_remove_file(&per_cpu(device_mce,cpu), &attr_bank0ctl);
	sysdev_remove_file(&per_cpu(device_mce,cpu), &attr_bank1ctl);
	sysdev_remove_file(&per_cpu(device_mce,cpu), &attr_bank2ctl);
	sysdev_remove_file(&per_cpu(device_mce,cpu), &attr_bank3ctl);
	sysdev_remove_file(&per_cpu(device_mce,cpu), &attr_bank4ctl);
	sysdev_remove_file(&per_cpu(device_mce,cpu), &attr_tolerant);
	sysdev_remove_file(&per_cpu(device_mce,cpu), &attr_check_interval);
	sysdev_unregister(&per_cpu(device_mce,cpu));
}
#endif

/* Get notified when a cpu comes on/off. Be hotplug friendly. */
static __cpuinit int
mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
{
	unsigned int cpu = (unsigned long)hcpu;

	switch (action) {
	case CPU_ONLINE:
		mce_create_device(cpu);
		break;
#ifdef CONFIG_HOTPLUG_CPU
	case CPU_DEAD:
		mce_remove_device(cpu);
		break;
#endif
	}
	return NOTIFY_OK;
}

static struct notifier_block mce_cpu_notifier = {
	.notifier_call = mce_cpu_callback,
};

static __init int mce_init_device(void)
{
	int err;
	int i = 0;

	if (!mce_available(&boot_cpu_data))
		return -EIO;
	err = sysdev_class_register(&mce_sysclass);

	for_each_online_cpu(i) {
		mce_create_device(i);
	}

	register_cpu_notifier(&mce_cpu_notifier);
	misc_register(&mce_log_device);
	return err;
}

device_initcall(mce_init_device);