arch/x86/kernel/cpu/mcheck/mce.c (Linux 4.11-rc2, linux-2.6-block.git)
1da177e4
LT
1/*
2 * Machine check handler.
e9eee03e 3 *
1da177e4 4 * K8 parts Copyright 2002,2003 Andi Kleen, SuSE Labs.
d88203d1
TG
5 * Rest from unknown author(s).
6 * 2004 Andi Kleen. Rewrote most of it.
b79109c3
AK
7 * Copyright 2008 Intel Corporation
8 * Author: Andi Kleen
1da177e4 9 */
c767a54b
JP
10
11#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
12
e9eee03e
IM
13#include <linux/thread_info.h>
14#include <linux/capability.h>
15#include <linux/miscdevice.h>
16#include <linux/ratelimit.h>
17#include <linux/kallsyms.h>
18#include <linux/rcupdate.h>
e9eee03e 19#include <linux/kobject.h>
14a02530 20#include <linux/uaccess.h>
e9eee03e
IM
21#include <linux/kdebug.h>
22#include <linux/kernel.h>
23#include <linux/percpu.h>
1da177e4 24#include <linux/string.h>
8a25a2fd 25#include <linux/device.h>
f3c6ea1b 26#include <linux/syscore_ops.h>
3c079792 27#include <linux/delay.h>
8c566ef5 28#include <linux/ctype.h>
e9eee03e 29#include <linux/sched.h>
0d7482e3 30#include <linux/sysfs.h>
e9eee03e 31#include <linux/types.h>
5a0e3ad6 32#include <linux/slab.h>
e9eee03e
IM
33#include <linux/init.h>
34#include <linux/kmod.h>
35#include <linux/poll.h>
3c079792 36#include <linux/nmi.h>
e9eee03e 37#include <linux/cpu.h>
14a02530 38#include <linux/smp.h>
e9eee03e 39#include <linux/fs.h>
9b1beaf2 40#include <linux/mm.h>
5be9ed25 41#include <linux/debugfs.h>
b77e70bf 42#include <linux/irq_work.h>
69c60c88 43#include <linux/export.h>
3637efb0 44#include <linux/jump_label.h>
e9eee03e 45
3f5a7896 46#include <asm/intel-family.h>
d88203d1 47#include <asm/processor.h>
95927475 48#include <asm/traps.h>
375074cc 49#include <asm/tlbflush.h>
e9eee03e
IM
50#include <asm/mce.h>
51#include <asm/msr.h>
1da177e4 52
bd19a5e6 53#include "mce-internal.h"
711c2e48 54
93b62c3c 55static DEFINE_MUTEX(mce_chrdev_read_mutex);
2aa2b50d 56
9a7783d0 57#define mce_log_get_idx_check(p) \
e90328b8 58({ \
f78f5b90
PM
59 RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held() && \
60 !lockdep_is_held(&mce_chrdev_read_mutex), \
3959df1d 61 "suspicious mce_log_get_idx_check() usage"); \
e90328b8
PM
62 smp_load_acquire(&(p)); \
63})
f56e8a07 64
8968f9d3
HS
65#define CREATE_TRACE_POINTS
66#include <trace/events/mce.h>
67
3f2f0680 68#define SPINUNIT 100 /* 100ns */
3c079792 69
01ca79f1
AK
70DEFINE_PER_CPU(unsigned, mce_exception_count);
71
1462594b 72struct mce_bank *mce_banks __read_mostly;
bf80bbd7 73struct mce_vendor_flags mce_flags __read_mostly;
cebe1820 74
d203f0b8 75struct mca_config mca_cfg __read_mostly = {
84c2559d 76 .bootlog = -1,
d203f0b8
BP
77 /*
78 * Tolerant levels:
79 * 0: always panic on uncorrected errors, log corrected errors
80 * 1: panic or SIGBUS on uncorrected errors, log corrected errors
81 * 2: SIGBUS or log uncorrected errors (if possible), log corr. errors
82 * 3: never panic or SIGBUS, log all errors (for testing only)
83 */
84c2559d
BP
84 .tolerant = 1,
85 .monarch_timeout = -1
d203f0b8
BP
86};
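/*
 * Rough pointer, not a full description: tolerant, monarch_timeout and
 * friends can be overridden at boot time via the "mce=" command line
 * option (parsed by mcheck_enable() further down in this file), and most
 * of them again at run time through the machinecheck sysfs device.
 */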
87
1020bcbc
HS
88/* User mode helper program triggered by machine check event */
89static unsigned long mce_need_notify;
90static char mce_helper[128];
91static char *mce_helper_argv[2] = { mce_helper, NULL };
1da177e4 92
93b62c3c
HS
93static DECLARE_WAIT_QUEUE_HEAD(mce_chrdev_wait);
94
3c079792
AK
95static DEFINE_PER_CPU(struct mce, mces_seen);
96static int cpu_missing;
97
0644414e
NR
98/*
 99 * MCA banks polled by the periodic polling timer for corrected events.
100 * With Intel CMCI, this only has MCA banks which do not support CMCI (if any).
101 */
ee031c31
AK
102DEFINE_PER_CPU(mce_banks_t, mce_poll_banks) = {
103 [0 ... BITS_TO_LONGS(MAX_NR_BANKS)-1] = ~0UL
104};
105
c3d1fb56
NR
106/*
107 * MCA banks controlled through firmware first for corrected errors.
108 * This is a global list of banks for which we won't enable CMCI and we
109 * won't poll. Firmware controls these banks and is responsible for
110 * reporting corrected errors through GHES. Uncorrected/recoverable
111 * errors are still notified through a machine check.
112 */
113mce_banks_t mce_banks_ce_disabled;
114
061120ae
CG
115static struct work_struct mce_work;
116static struct irq_work mce_irq_work;
9b1beaf2 117
61b0fccd
TL
118static void (*quirk_no_way_out)(int bank, struct mce *m, struct pt_regs *regs);
119
3653ada5
BP
120/*
121 * CPU/chipset specific EDAC code can register a notifier call here to print
122 * MCE errors in a human-readable form.
123 */
648ed940 124ATOMIC_NOTIFIER_HEAD(x86_mce_decoder_chain);
3653ada5 125
b5f2fa4e
AK
126/* Do initial initialization of a struct mce */
127void mce_setup(struct mce *m)
128{
129 memset(m, 0, sizeof(struct mce));
d620c67f 130 m->cpu = m->extcpu = smp_processor_id();
8ee08347
AK
131 /* We hope get_seconds stays lockless */
132 m->time = get_seconds();
133 m->cpuvendor = boot_cpu_data.x86_vendor;
134 m->cpuid = cpuid_eax(1);
8ee08347 135 m->socketid = cpu_data(m->extcpu).phys_proc_id;
8ee08347
AK
136 m->apicid = cpu_data(m->extcpu).initial_apicid;
137 rdmsrl(MSR_IA32_MCG_CAP, m->mcgcap);
3f5a7896
TL
138
139 if (this_cpu_has(X86_FEATURE_INTEL_PPIN))
140 rdmsrl(MSR_PPIN, m->ppin);
b5f2fa4e
AK
141}
142
ea149b36
AK
143DEFINE_PER_CPU(struct mce, injectm);
144EXPORT_PER_CPU_SYMBOL_GPL(injectm);
145
1da177e4
LT
146/*
147 * Lockless MCE logging infrastructure.
148 * This avoids deadlocks on printk locks without having to break locks. Also
149 * separate MCEs from kernel messages to avoid bogus bug reports.
150 */
151
231fd906 152static struct mce_log mcelog = {
f6fb0ac0
AK
153 .signature = MCE_LOG_SIGNATURE,
154 .len = MCE_LOG_LEN,
155 .recordlen = sizeof(struct mce),
d88203d1 156};
1da177e4
LT
157
158void mce_log(struct mce *mce)
159{
160 unsigned next, entry;
e9eee03e 161
8968f9d3
HS
162 /* Emit the trace record: */
163 trace_mce_record(mce);
164
f29a7aff
CG
165 if (!mce_gen_pool_add(mce))
166 irq_work_queue(&mce_irq_work);
f0cb5452 167
7644143c 168 wmb();
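	/*
	 * Reserve a slot in the legacy ring buffer: read mcelog.next
	 * (acquire, see mce_log_get_idx_check()), skip old finished
	 * entries, then race to advance .next with cmpxchg().  The
	 * finished flag, bracketed by wmb()s, tells readers that the
	 * copy of the record below is complete.
	 */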
1da177e4 169 for (;;) {
9a7783d0 170 entry = mce_log_get_idx_check(mcelog.next);
673242c1 171 for (;;) {
696e409d 172
e9eee03e
IM
173 /*
174 * When the buffer fills up discard new entries.
175 * Assume that the earlier errors are the more
176 * interesting ones:
177 */
673242c1 178 if (entry >= MCE_LOG_LEN) {
14a02530
HS
179 set_bit(MCE_OVERFLOW,
180 (unsigned long *)&mcelog.flags);
673242c1
AK
181 return;
182 }
e9eee03e 183 /* Old left over entry. Skip: */
673242c1
AK
184 if (mcelog.entry[entry].finished) {
185 entry++;
186 continue;
187 }
7644143c 188 break;
1da177e4 189 }
1da177e4
LT
190 smp_rmb();
191 next = entry + 1;
192 if (cmpxchg(&mcelog.next, entry, next) == entry)
193 break;
194 }
195 memcpy(mcelog.entry + entry, mce, sizeof(struct mce));
7644143c 196 wmb();
1da177e4 197 mcelog.entry[entry].finished = 1;
7644143c 198 wmb();
1da177e4 199
1020bcbc 200 set_bit(0, &mce_need_notify);
1da177e4
LT
201}
202
a79da384 203void mce_inject_log(struct mce *m)
09371957 204{
a79da384
BP
205 mutex_lock(&mce_chrdev_read_mutex);
206 mce_log(m);
207 mutex_unlock(&mce_chrdev_read_mutex);
09371957 208}
a79da384 209EXPORT_SYMBOL_GPL(mce_inject_log);
09371957 210
fd4cf79f 211static struct notifier_block mce_srao_nb;
09371957 212
cd9c57ca
BP
213static atomic_t num_notifiers;
214
3653ada5
BP
215void mce_register_decode_chain(struct notifier_block *nb)
216{
cd9c57ca
BP
217 atomic_inc(&num_notifiers);
218
9026cc82 219 WARN_ON(nb->priority > MCE_PRIO_LOWEST && nb->priority < MCE_PRIO_EDAC);
fd4cf79f 220
3653ada5
BP
221 atomic_notifier_chain_register(&x86_mce_decoder_chain, nb);
222}
223EXPORT_SYMBOL_GPL(mce_register_decode_chain);
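/*
 * Illustrative sketch of a caller (names below are hypothetical, not
 * from this file): an EDAC/decoder module typically does something like
 *
 *	static int my_decode(struct notifier_block *nb, unsigned long val,
 *			     void *data)
 *	{
 *		struct mce *m = data;
 *
 *		... decode m->status / m->addr / m->misc ...
 *		return NOTIFY_OK;
 *	}
 *
 *	static struct notifier_block my_nb = {
 *		.notifier_call	= my_decode,
 *		.priority	= MCE_PRIO_EDAC,
 *	};
 *
 *	mce_register_decode_chain(&my_nb);
 *
 * and calls mce_unregister_decode_chain(&my_nb) on module exit.
 */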
224
225void mce_unregister_decode_chain(struct notifier_block *nb)
226{
cd9c57ca
BP
227 atomic_dec(&num_notifiers);
228
3653ada5
BP
229 atomic_notifier_chain_unregister(&x86_mce_decoder_chain, nb);
230}
231EXPORT_SYMBOL_GPL(mce_unregister_decode_chain);
232
a9750a31
YG
233static inline u32 ctl_reg(int bank)
234{
235 return MSR_IA32_MCx_CTL(bank);
236}
237
238static inline u32 status_reg(int bank)
239{
240 return MSR_IA32_MCx_STATUS(bank);
241}
242
243static inline u32 addr_reg(int bank)
244{
245 return MSR_IA32_MCx_ADDR(bank);
246}
247
248static inline u32 misc_reg(int bank)
249{
250 return MSR_IA32_MCx_MISC(bank);
251}
252
253static inline u32 smca_ctl_reg(int bank)
254{
255 return MSR_AMD64_SMCA_MCx_CTL(bank);
256}
257
258static inline u32 smca_status_reg(int bank)
259{
260 return MSR_AMD64_SMCA_MCx_STATUS(bank);
261}
262
263static inline u32 smca_addr_reg(int bank)
264{
265 return MSR_AMD64_SMCA_MCx_ADDR(bank);
266}
267
268static inline u32 smca_misc_reg(int bank)
269{
270 return MSR_AMD64_SMCA_MCx_MISC(bank);
271}
272
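/*
 * Default (legacy MCA) register layout.  __mcheck_cpu_init_vendor()
 * switches these to the smca_*_reg() helpers above when the CPU
 * advertises X86_FEATURE_SMCA.
 */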
273struct mca_msr_regs msr_ops = {
274 .ctl = ctl_reg,
275 .status = status_reg,
276 .addr = addr_reg,
277 .misc = misc_reg
278};
279
cd9c57ca 280static void __print_mce(struct mce *m)
1da177e4 281{
cd9c57ca
BP
282 pr_emerg(HW_ERR "CPU %d: Machine Check%s: %Lx Bank %d: %016Lx\n",
283 m->extcpu,
284 (m->mcgstatus & MCG_STATUS_MCIP ? " Exception" : ""),
285 m->mcgstatus, m->bank, m->status);
f436f8bb 286
65ea5b03 287 if (m->ip) {
a2d7b0d4 288 pr_emerg(HW_ERR "RIP%s %02x:<%016Lx> ",
f436f8bb 289 !(m->mcgstatus & MCG_STATUS_EIPV) ? " !INEXACT!" : "",
cd9c57ca 290 m->cs, m->ip);
f436f8bb 291
1da177e4 292 if (m->cs == __KERNEL_CS)
65ea5b03 293 print_symbol("{%s}", m->ip);
f436f8bb 294 pr_cont("\n");
1da177e4 295 }
f436f8bb 296
a2d7b0d4 297 pr_emerg(HW_ERR "TSC %llx ", m->tsc);
1da177e4 298 if (m->addr)
f436f8bb 299 pr_cont("ADDR %llx ", m->addr);
1da177e4 300 if (m->misc)
f436f8bb 301 pr_cont("MISC %llx ", m->misc);
549d042d 302
4b711f92
YG
303 if (mce_flags.smca) {
304 if (m->synd)
305 pr_cont("SYND %llx ", m->synd);
306 if (m->ipid)
307 pr_cont("IPID %llx ", m->ipid);
308 }
309
f436f8bb 310 pr_cont("\n");
506ed6b5
AK
311 /*
312 * Note this output is parsed by external tools and old fields
313 * should not be changed.
314 */
881e23e5 315 pr_emerg(HW_ERR "PROCESSOR %u:%x TIME %llu SOCKET %u APIC %x microcode %x\n",
506ed6b5
AK
316 m->cpuvendor, m->cpuid, m->time, m->socketid, m->apicid,
317 cpu_data(m->extcpu).microcode);
cd9c57ca
BP
318}
319
320static void print_mce(struct mce *m)
321{
322 int ret = 0;
323
324 __print_mce(m);
f436f8bb
IM
325
326 /*
327 * Print out human-readable details about the MCE error,
fb253195 328 * (if the CPU has an implementation for that)
f436f8bb 329 */
dffa4b2f
BP
330 ret = atomic_notifier_call_chain(&x86_mce_decoder_chain, 0, m);
331 if (ret == NOTIFY_STOP)
332 return;
333
334 pr_emerg_ratelimited(HW_ERR "Run the above through 'mcelog --ascii'\n");
86503560
AK
335}
336
f94b61c2
AK
337#define PANIC_TIMEOUT 5 /* 5 seconds */
338
c7c9b392 339static atomic_t mce_panicked;
f94b61c2 340
bf783f9f 341static int fake_panic;
c7c9b392 342static atomic_t mce_fake_panicked;
bf783f9f 343
f94b61c2
AK
344/* Panic in progress. Enable interrupts and wait for final IPI */
345static void wait_for_panic(void)
346{
347 long timeout = PANIC_TIMEOUT*USEC_PER_SEC;
f436f8bb 348
f94b61c2
AK
349 preempt_disable();
350 local_irq_enable();
351 while (timeout-- > 0)
352 udelay(1);
29b0f591 353 if (panic_timeout == 0)
7af19e4a 354 panic_timeout = mca_cfg.panic_timeout;
f94b61c2
AK
355 panic("Panicing machine check CPU died");
356}
357
6c80f87e 358static void mce_panic(const char *msg, struct mce *final, char *exp)
d88203d1 359{
5541c93c
TL
360 int apei_err = 0;
361 struct llist_node *pending;
362 struct mce_evt_llist *l;
e02e68d3 363
bf783f9f
HY
364 if (!fake_panic) {
365 /*
366 * Make sure only one CPU runs in machine check panic
367 */
c7c9b392 368 if (atomic_inc_return(&mce_panicked) > 1)
bf783f9f
HY
369 wait_for_panic();
370 barrier();
f94b61c2 371
bf783f9f
HY
372 bust_spinlocks(1);
373 console_verbose();
374 } else {
375 /* Don't log too much for fake panic */
c7c9b392 376 if (atomic_inc_return(&mce_fake_panicked) > 1)
bf783f9f
HY
377 return;
378 }
5541c93c 379 pending = mce_gen_pool_prepare_records();
a0189c70 380 /* First print corrected ones that are still unlogged */
5541c93c
TL
381 llist_for_each_entry(l, pending, llnode) {
382 struct mce *m = &l->mce;
482908b4 383 if (!(m->status & MCI_STATUS_UC)) {
77e26cca 384 print_mce(m);
482908b4
HY
385 if (!apei_err)
386 apei_err = apei_write_mce(m);
387 }
a0189c70
AK
388 }
389 /* Now print uncorrected but with the final one last */
5541c93c
TL
390 llist_for_each_entry(l, pending, llnode) {
391 struct mce *m = &l->mce;
77e26cca
HS
392 if (!(m->status & MCI_STATUS_UC))
393 continue;
5541c93c 394 if (!final || mce_cmp(m, final)) {
77e26cca 395 print_mce(m);
482908b4
HY
396 if (!apei_err)
397 apei_err = apei_write_mce(m);
398 }
1da177e4 399 }
482908b4 400 if (final) {
77e26cca 401 print_mce(final);
482908b4
HY
402 if (!apei_err)
403 apei_err = apei_write_mce(final);
404 }
3c079792 405 if (cpu_missing)
a2d7b0d4 406 pr_emerg(HW_ERR "Some CPUs didn't answer in synchronization\n");
bd19a5e6 407 if (exp)
a2d7b0d4 408 pr_emerg(HW_ERR "Machine check: %s\n", exp);
bf783f9f
HY
409 if (!fake_panic) {
410 if (panic_timeout == 0)
7af19e4a 411 panic_timeout = mca_cfg.panic_timeout;
bf783f9f
HY
412 panic(msg);
413 } else
a2d7b0d4 414 pr_emerg(HW_ERR "Fake kernel panic: %s\n", msg);
d88203d1 415}
1da177e4 416
ea149b36
AK
417/* Support code for software error injection */
418
419static int msr_to_offset(u32 msr)
420{
0a3aee0d 421 unsigned bank = __this_cpu_read(injectm.bank);
f436f8bb 422
84c2559d 423 if (msr == mca_cfg.rip_msr)
ea149b36 424 return offsetof(struct mce, ip);
d9d73fcc 425 if (msr == msr_ops.status(bank))
ea149b36 426 return offsetof(struct mce, status);
d9d73fcc 427 if (msr == msr_ops.addr(bank))
ea149b36 428 return offsetof(struct mce, addr);
d9d73fcc 429 if (msr == msr_ops.misc(bank))
ea149b36
AK
430 return offsetof(struct mce, misc);
431 if (msr == MSR_IA32_MCG_STATUS)
432 return offsetof(struct mce, mcgstatus);
433 return -1;
434}
435
5f8c1a54
AK
436/* MSR access wrappers used for error injection */
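/*
 * While the per-CPU injectm.finished flag is set (e.g. by the mce-inject
 * machinery), reads and writes of the MCA MSRs are redirected through
 * msr_to_offset() to the corresponding fields of injectm, so injected
 * errors never touch the real hardware registers.
 */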
437static u64 mce_rdmsrl(u32 msr)
438{
439 u64 v;
11868a2d 440
0a3aee0d 441 if (__this_cpu_read(injectm.finished)) {
ea149b36 442 int offset = msr_to_offset(msr);
11868a2d 443
ea149b36
AK
444 if (offset < 0)
445 return 0;
89cbc767 446 return *(u64 *)((char *)this_cpu_ptr(&injectm) + offset);
ea149b36 447 }
11868a2d
IM
448
449 if (rdmsrl_safe(msr, &v)) {
38c54ccb 450 WARN_ONCE(1, "mce: Unable to read MSR 0x%x!\n", msr);
11868a2d
IM
451 /*
452 * Return zero in case the access faulted. This should
453 * not happen normally but can happen if the CPU does
454 * something weird, or if the code is buggy.
455 */
456 v = 0;
457 }
458
5f8c1a54
AK
459 return v;
460}
461
462static void mce_wrmsrl(u32 msr, u64 v)
463{
0a3aee0d 464 if (__this_cpu_read(injectm.finished)) {
ea149b36 465 int offset = msr_to_offset(msr);
11868a2d 466
ea149b36 467 if (offset >= 0)
89cbc767 468 *(u64 *)((char *)this_cpu_ptr(&injectm) + offset) = v;
ea149b36
AK
469 return;
470 }
5f8c1a54
AK
471 wrmsrl(msr, v);
472}
473
b8325c5b
HS
474/*
475 * Collect all global (w.r.t. this processor) status about this machine
476 * check into our "mce" struct so that we can use it later to assess
477 * the severity of the problem as we read per-bank specific details.
478 */
479static inline void mce_gather_info(struct mce *m, struct pt_regs *regs)
480{
481 mce_setup(m);
482
483 m->mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS);
484 if (regs) {
485 /*
486 * Get the address of the instruction at the time of
487 * the machine check error.
488 */
489 if (m->mcgstatus & (MCG_STATUS_RIPV|MCG_STATUS_EIPV)) {
490 m->ip = regs->ip;
491 m->cs = regs->cs;
a129a7c8
AK
492
493 /*
494 * When in VM86 mode make the cs look like ring 3
495 * always. This is a lie, but it's better than passing
496 * the additional vm86 bit around everywhere.
497 */
498 if (v8086_mode(regs))
499 m->cs |= 3;
b8325c5b
HS
500 }
501 /* Use accurate RIP reporting if available. */
84c2559d
BP
502 if (mca_cfg.rip_msr)
503 m->ip = mce_rdmsrl(mca_cfg.rip_msr);
b8325c5b
HS
504 }
505}
506
88ccbedd 507int mce_available(struct cpuinfo_x86 *c)
1da177e4 508{
1462594b 509 if (mca_cfg.disabled)
5b4408fd 510 return 0;
3d1712c9 511 return cpu_has(c, X86_FEATURE_MCE) && cpu_has(c, X86_FEATURE_MCA);
1da177e4
LT
512}
513
9b1beaf2
AK
514static void mce_schedule_work(void)
515{
a2c2727d 516 if (!mce_gen_pool_empty())
061120ae 517 schedule_work(&mce_work);
9b1beaf2
AK
518}
519
b77e70bf 520static void mce_irq_work_cb(struct irq_work *entry)
ccc3c319 521{
9ff36ee9 522 mce_notify_irq();
9b1beaf2 523 mce_schedule_work();
ccc3c319 524}
ccc3c319
AK
525
526static void mce_report_event(struct pt_regs *regs)
527{
528 if (regs->flags & (X86_VM_MASK|X86_EFLAGS_IF)) {
9ff36ee9 529 mce_notify_irq();
9b1beaf2
AK
530 /*
531 * Triggering the work queue here is just an insurance
532 * policy in case the syscall exit notify handler
533 * doesn't run soon enough or ends up running on the
534 * wrong CPU (can happen when audit sleeps)
535 */
536 mce_schedule_work();
ccc3c319
AK
537 return;
538 }
539
061120ae 540 irq_work_queue(&mce_irq_work);
ccc3c319
AK
541}
542
feab21f8
BP
543/*
544 * Check if the address reported by the CPU is in a format we can parse.
545 * It would be possible to add code for most other cases, but all would
546 * be somewhat complicated (e.g. segment offset would require an instruction
 547 * parser). So only support physical addresses up to page granularity for now.
548 */
549static int mce_usable_address(struct mce *m)
550{
551 if (!(m->status & MCI_STATUS_MISCV) || !(m->status & MCI_STATUS_ADDRV))
552 return 0;
553
554 /* Checks after this one are Intel-specific: */
555 if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL)
556 return 1;
557
558 if (MCI_MISC_ADDR_LSB(m->misc) > PAGE_SHIFT)
559 return 0;
560 if (MCI_MISC_ADDR_MODE(m->misc) != MCI_MISC_ADDR_PHYS)
561 return 0;
562 return 1;
563}
564
fd4cf79f
CG
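/*
 * SRAO (Software Recoverable Action Optional) errors report a valid
 * address but need no immediate action; this notifier offlines the
 * affected page via memory_failure() after the error has been logged.
 */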
565static int srao_decode_notifier(struct notifier_block *nb, unsigned long val,
566 void *data)
567{
568 struct mce *mce = (struct mce *)data;
569 unsigned long pfn;
570
571 if (!mce)
572 return NOTIFY_DONE;
573
c0ec382e 574 if (mce_usable_address(mce) && (mce->severity == MCE_AO_SEVERITY)) {
fd4cf79f
CG
575 pfn = mce->addr >> PAGE_SHIFT;
576 memory_failure(pfn, MCE_VECTOR, 0);
577 }
578
579 return NOTIFY_OK;
ccc3c319 580}
fd4cf79f
CG
581static struct notifier_block mce_srao_nb = {
582 .notifier_call = srao_decode_notifier,
9026cc82 583 .priority = MCE_PRIO_SRAO,
fd4cf79f 584};
ccc3c319 585
cd9c57ca
BP
586static int mce_default_notifier(struct notifier_block *nb, unsigned long val,
587 void *data)
588{
589 struct mce *m = (struct mce *)data;
590
591 if (!m)
592 return NOTIFY_DONE;
593
594 /*
595 * Run the default notifier if we have only the SRAO
596 * notifier and us registered.
597 */
598 if (atomic_read(&num_notifiers) > 2)
599 return NOTIFY_DONE;
600
601 __print_mce(m);
602
603 return NOTIFY_DONE;
604}
605
606static struct notifier_block mce_default_nb = {
607 .notifier_call = mce_default_notifier,
608 /* lowest prio, we want it to run last. */
9026cc82 609 .priority = MCE_PRIO_LOWEST,
cd9c57ca
BP
610};
611
85f92694
TL
612/*
613 * Read ADDR and MISC registers.
614 */
615static void mce_read_aux(struct mce *m, int i)
616{
617 if (m->status & MCI_STATUS_MISCV)
d9d73fcc 618 m->misc = mce_rdmsrl(msr_ops.misc(i));
db819d60 619
85f92694 620 if (m->status & MCI_STATUS_ADDRV) {
d9d73fcc 621 m->addr = mce_rdmsrl(msr_ops.addr(i));
85f92694
TL
622
623 /*
624 * Mask the reported address by the reported granularity.
625 */
1462594b 626 if (mca_cfg.ser && (m->status & MCI_STATUS_MISCV)) {
85f92694
TL
627 u8 shift = MCI_MISC_ADDR_LSB(m->misc);
628 m->addr >>= shift;
629 m->addr <<= shift;
630 }
4f29b73b
YG
631
632 /*
633 * Extract [55:<lsb>] where lsb is the least significant
634 * *valid* bit of the address bits.
635 */
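		/*
		 * For example, lsb == 12 (4K granularity) keeps
		 * addr[55:12], i.e. a page-aligned physical address.
		 */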
636 if (mce_flags.smca) {
637 u8 lsb = (m->addr >> 56) & 0x3f;
638
639 m->addr &= GENMASK_ULL(55, lsb);
640 }
85f92694 641 }
db819d60 642
5828c46f
YG
643 if (mce_flags.smca) {
644 m->ipid = mce_rdmsrl(MSR_AMD64_SMCA_MCx_IPID(i));
645
646 if (m->status & MCI_STATUS_SYNDV)
647 m->synd = mce_rdmsrl(MSR_AMD64_SMCA_MCx_SYND(i));
648 }
85f92694
TL
649}
650
fa92c586
CY
651static bool memory_error(struct mce *m)
652{
653 struct cpuinfo_x86 *c = &boot_cpu_data;
654
655 if (c->x86_vendor == X86_VENDOR_AMD) {
db548a28
BP
656 /* ErrCodeExt[20:16] */
657 u8 xec = (m->status >> 16) & 0x1f;
658
659 return (xec == 0x0 || xec == 0x8);
fa92c586
CY
660 } else if (c->x86_vendor == X86_VENDOR_INTEL) {
661 /*
662 * Intel SDM Volume 3B - 15.9.2 Compound Error Codes
663 *
664 * Bit 7 of the MCACOD field of IA32_MCi_STATUS is used for
665 * indicating a memory error. Bit 8 is used for indicating a
666 * cache hierarchy error. The combination of bit 2 and bit 3
 667 * is used for indicating a `generic' cache hierarchy error.
 668 * But we can't just blindly check the above bits, because if
 669 * bit 11 is set, then it is a bus/interconnect error - and
 670 * either way the above bits just give more detail on what
671 * bus/interconnect error happened. Note that bit 12 can be
672 * ignored, as it's the "filter" bit.
673 */
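		/*
		 * For example, MCACOD 0x009f (a memory read error) has bit 7
		 * set and bits 8/11 clear, so the first check below matches,
		 * while any code with bit 11 set (bus/interconnect) fails
		 * all three checks.
		 */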
674 return (m->status & 0xef80) == BIT(7) ||
675 (m->status & 0xef00) == BIT(8) ||
676 (m->status & 0xeffc) == 0xc;
677 }
678
679 return false;
680}
681
ca84f696
AK
682DEFINE_PER_CPU(unsigned, mce_poll_count);
683
d88203d1 684/*
b79109c3
AK
685 * Poll for corrected events or events that happened before reset.
686 * Those are just logged through /dev/mcelog.
687 *
688 * This is executed in standard interrupt context.
ed7290d0
AK
689 *
690 * Note: spec recommends to panic for fatal unsignalled
691 * errors here. However this would be quite problematic --
692 * we would need to reimplement the Monarch handling and
693 * it would mess up the exclusion between exception handler
 694 * and poll handler -- so we skip this for now.
 695 * These cases should not happen anyway, or only when the CPU
 696 * is already totally confused. In this case it's likely it will
697 * not fully execute the machine check handler either.
b79109c3 698 */
3f2f0680 699bool machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
b79109c3 700{
8b38937b 701 bool error_seen = false;
b79109c3 702 struct mce m;
fa92c586 703 int severity;
b79109c3
AK
704 int i;
705
c6ae41e7 706 this_cpu_inc(mce_poll_count);
ca84f696 707
b8325c5b 708 mce_gather_info(&m, NULL);
b79109c3 709
669c00f0
BP
710 if (flags & MCP_TIMESTAMP)
711 m.tsc = rdtsc();
54467353 712
d203f0b8 713 for (i = 0; i < mca_cfg.banks; i++) {
cebe1820 714 if (!mce_banks[i].ctl || !test_bit(i, *b))
b79109c3
AK
715 continue;
716
717 m.misc = 0;
718 m.addr = 0;
719 m.bank = i;
b79109c3
AK
720
721 barrier();
d9d73fcc 722 m.status = mce_rdmsrl(msr_ops.status(i));
b79109c3
AK
723 if (!(m.status & MCI_STATUS_VAL))
724 continue;
725
726 /*
ed7290d0
AK
727 * Uncorrected or signalled events are handled by the exception
728 * handler when it is enabled, so don't process those here.
b79109c3
AK
729 *
730 * TBD do the same check for MCI_STATUS_EN here?
731 */
ed7290d0 732 if (!(flags & MCP_UC) &&
1462594b 733 (m.status & (mca_cfg.ser ? MCI_STATUS_S : MCI_STATUS_UC)))
b79109c3
AK
734 continue;
735
8b38937b
TL
736 error_seen = true;
737
85f92694 738 mce_read_aux(&m, i);
b79109c3 739
fa92c586
CY
740 severity = mce_severity(&m, mca_cfg.tolerant, NULL, false);
741
c0ec382e
BP
742 if (severity == MCE_DEFERRED_SEVERITY && memory_error(&m))
743 if (m.status & MCI_STATUS_ADDRV)
fd4cf79f 744 m.severity = severity;
fa92c586 745
b79109c3
AK
746 /*
747 * Don't get the IP here because it's unlikely to
748 * have anything to do with the actual error location.
749 */
8b38937b 750 if (!(flags & MCP_DONTLOG) && !mca_cfg.dont_log_ce)
5679af4c 751 mce_log(&m);
c0ec382e 752 else if (mce_usable_address(&m)) {
8b38937b
TL
753 /*
754 * Although we skipped logging this, we still want
755 * to take action. Add to the pool so the registered
756 * notifiers will see it.
757 */
758 if (!mce_gen_pool_add(&m))
759 mce_schedule_work();
3f2f0680 760 }
b79109c3
AK
761
762 /*
763 * Clear state for this bank.
764 */
d9d73fcc 765 mce_wrmsrl(msr_ops.status(i), 0);
b79109c3
AK
766 }
767
768 /*
769 * Don't clear MCG_STATUS here because it's only defined for
770 * exceptions.
771 */
88921be3
AK
772
773 sync_core();
3f2f0680 774
8b38937b 775 return error_seen;
b79109c3 776}
ea149b36 777EXPORT_SYMBOL_GPL(machine_check_poll);
b79109c3 778
bd19a5e6
AK
779/*
780 * Do a quick check if any of the events requires a panic.
781 * This decides if we keep the events around or clear them.
782 */
61b0fccd
TL
783static int mce_no_way_out(struct mce *m, char **msg, unsigned long *validp,
784 struct pt_regs *regs)
bd19a5e6 785{
95022b8c 786 int i, ret = 0;
17fea54b 787 char *tmp;
bd19a5e6 788
d203f0b8 789 for (i = 0; i < mca_cfg.banks; i++) {
d9d73fcc 790 m->status = mce_rdmsrl(msr_ops.status(i));
61b0fccd 791 if (m->status & MCI_STATUS_VAL) {
95022b8c 792 __set_bit(i, validp);
61b0fccd
TL
793 if (quirk_no_way_out)
794 quirk_no_way_out(i, m, regs);
795 }
17fea54b
BP
796
797 if (mce_severity(m, mca_cfg.tolerant, &tmp, true) >= MCE_PANIC_SEVERITY) {
798 *msg = tmp;
95022b8c 799 ret = 1;
17fea54b 800 }
bd19a5e6 801 }
95022b8c 802 return ret;
bd19a5e6
AK
803}
804
3c079792
AK
805/*
806 * Variable to establish order between CPUs while scanning.
807 * Each CPU spins initially until executing is equal its number.
808 */
809static atomic_t mce_executing;
810
811/*
812 * Defines order of CPUs on entry. First CPU becomes Monarch.
813 */
814static atomic_t mce_callin;
815
816/*
817 * Check if a timeout waiting for other CPUs happened.
818 */
6c80f87e 819static int mce_timed_out(u64 *t, const char *msg)
3c079792
AK
820{
821 /*
822 * The others already did panic for some reason.
823 * Bail out like in a timeout.
824 * rmb() to tell the compiler that system_state
825 * might have been modified by someone else.
826 */
827 rmb();
c7c9b392 828 if (atomic_read(&mce_panicked))
3c079792 829 wait_for_panic();
84c2559d 830 if (!mca_cfg.monarch_timeout)
3c079792
AK
831 goto out;
832 if ((s64)*t < SPINUNIT) {
716079f6 833 if (mca_cfg.tolerant <= 1)
6c80f87e 834 mce_panic(msg, NULL, NULL);
3c079792
AK
835 cpu_missing = 1;
836 return 1;
837 }
838 *t -= SPINUNIT;
839out:
840 touch_nmi_watchdog();
841 return 0;
842}
843
844/*
845 * The Monarch's reign. The Monarch is the CPU who entered
846 * the machine check handler first. It waits for the others to
847 * raise the exception too and then grades them. When any
 848 * error is fatal, panic. Only then let the others continue.
849 *
850 * The other CPUs entering the MCE handler will be controlled by the
851 * Monarch. They are called Subjects.
852 *
 853 * This way we prevent any potential data corruption in an unrecoverable case
 854 * and also make sure all CPUs' errors are always examined.
855 *
680b6cfd 856 * Also this detects the case of a machine check event coming from outer
3c079792
AK
 857 * space (not detected by any CPUs). In this case some external agent wants
858 * us to shut down, so panic too.
859 *
860 * The other CPUs might still decide to panic if the handler happens
 861 * in an unrecoverable place, but in this case the system is in a semi-stable
862 * state and won't corrupt anything by itself. It's ok to let the others
863 * continue for a bit first.
864 *
865 * All the spin loops have timeouts; when a timeout happens a CPU
866 * typically elects itself to be Monarch.
867 */
868static void mce_reign(void)
869{
870 int cpu;
871 struct mce *m = NULL;
872 int global_worst = 0;
873 char *msg = NULL;
874 char *nmsg = NULL;
875
876 /*
877 * This CPU is the Monarch and the other CPUs have run
878 * through their handlers.
879 * Grade the severity of the errors of all the CPUs.
880 */
881 for_each_possible_cpu(cpu) {
d203f0b8
BP
882 int severity = mce_severity(&per_cpu(mces_seen, cpu),
883 mca_cfg.tolerant,
e3480271 884 &nmsg, true);
3c079792
AK
885 if (severity > global_worst) {
886 msg = nmsg;
887 global_worst = severity;
888 m = &per_cpu(mces_seen, cpu);
889 }
890 }
891
892 /*
893 * Cannot recover? Panic here then.
894 * This dumps all the mces in the log buffer and stops the
895 * other CPUs.
896 */
d203f0b8 897 if (m && global_worst >= MCE_PANIC_SEVERITY && mca_cfg.tolerant < 3)
8af7043a 898 mce_panic("Fatal machine check", m, msg);
3c079792
AK
899
900 /*
901 * For UC somewhere we let the CPU who detects it handle it.
902 * Also must let continue the others, otherwise the handling
903 * CPU could deadlock on a lock.
904 */
905
906 /*
907 * No machine check event found. Must be some external
908 * source or one CPU is hung. Panic.
909 */
d203f0b8 910 if (global_worst <= MCE_KEEP_SEVERITY && mca_cfg.tolerant < 3)
8af7043a 911 mce_panic("Fatal machine check from unknown source", NULL, NULL);
3c079792
AK
912
913 /*
914 * Now clear all the mces_seen so that they don't reappear on
915 * the next mce.
916 */
917 for_each_possible_cpu(cpu)
918 memset(&per_cpu(mces_seen, cpu), 0, sizeof(struct mce));
919}
920
921static atomic_t global_nwo;
922
923/*
924 * Start of Monarch synchronization. This waits until all CPUs have
925 * entered the exception handler and then determines if any of them
926 * saw a fatal event that requires panic. Then it executes them
927 * in the entry order.
928 * TBD double check parallel CPU hotunplug
929 */
7fb06fc9 930static int mce_start(int *no_way_out)
3c079792 931{
7fb06fc9 932 int order;
3c079792 933 int cpus = num_online_cpus();
84c2559d 934 u64 timeout = (u64)mca_cfg.monarch_timeout * NSEC_PER_USEC;
3c079792 935
7fb06fc9
HS
936 if (!timeout)
937 return -1;
3c079792 938
7fb06fc9 939 atomic_add(*no_way_out, &global_nwo);
184e1fdf 940 /*
bf92b1fe
DB
941 * Rely on the implied barrier below, such that global_nwo
942 * is updated before mce_callin.
184e1fdf 943 */
a95436e4 944 order = atomic_inc_return(&mce_callin);
3c079792
AK
945
946 /*
947 * Wait for everyone.
948 */
949 while (atomic_read(&mce_callin) != cpus) {
6c80f87e
AL
950 if (mce_timed_out(&timeout,
951 "Timeout: Not all CPUs entered broadcast exception handler")) {
3c079792 952 atomic_set(&global_nwo, 0);
7fb06fc9 953 return -1;
3c079792
AK
954 }
955 ndelay(SPINUNIT);
956 }
957
184e1fdf
HY
958 /*
959 * mce_callin should be read before global_nwo
960 */
961 smp_rmb();
3c079792 962
7fb06fc9
HS
963 if (order == 1) {
964 /*
965 * Monarch: Starts executing now, the others wait.
966 */
3c079792 967 atomic_set(&mce_executing, 1);
7fb06fc9
HS
968 } else {
969 /*
970 * Subject: Now start the scanning loop one by one in
971 * the original callin order.
972 * This way when there are any shared banks it will be
973 * only seen by one CPU before cleared, avoiding duplicates.
974 */
975 while (atomic_read(&mce_executing) < order) {
6c80f87e
AL
976 if (mce_timed_out(&timeout,
977 "Timeout: Subject CPUs unable to finish machine check processing")) {
7fb06fc9
HS
978 atomic_set(&global_nwo, 0);
979 return -1;
980 }
981 ndelay(SPINUNIT);
982 }
3c079792
AK
983 }
984
985 /*
7fb06fc9 986 * Cache the global no_way_out state.
3c079792 987 */
7fb06fc9
HS
988 *no_way_out = atomic_read(&global_nwo);
989
990 return order;
3c079792
AK
991}
992
993/*
994 * Synchronize between CPUs after main scanning loop.
995 * This invokes the bulk of the Monarch processing.
996 */
997static int mce_end(int order)
998{
999 int ret = -1;
84c2559d 1000 u64 timeout = (u64)mca_cfg.monarch_timeout * NSEC_PER_USEC;
3c079792
AK
1001
1002 if (!timeout)
1003 goto reset;
1004 if (order < 0)
1005 goto reset;
1006
1007 /*
1008 * Allow others to run.
1009 */
1010 atomic_inc(&mce_executing);
1011
1012 if (order == 1) {
1013 /* CHECKME: Can this race with a parallel hotplug? */
1014 int cpus = num_online_cpus();
1015
1016 /*
1017 * Monarch: Wait for everyone to go through their scanning
1018 * loops.
1019 */
1020 while (atomic_read(&mce_executing) <= cpus) {
6c80f87e
AL
1021 if (mce_timed_out(&timeout,
1022 "Timeout: Monarch CPU unable to finish machine check processing"))
3c079792
AK
1023 goto reset;
1024 ndelay(SPINUNIT);
1025 }
1026
1027 mce_reign();
1028 barrier();
1029 ret = 0;
1030 } else {
1031 /*
1032 * Subject: Wait for Monarch to finish.
1033 */
1034 while (atomic_read(&mce_executing) != 0) {
6c80f87e
AL
1035 if (mce_timed_out(&timeout,
1036 "Timeout: Monarch CPU did not finish machine check processing"))
3c079792
AK
1037 goto reset;
1038 ndelay(SPINUNIT);
1039 }
1040
1041 /*
1042 * Don't reset anything. That's done by the Monarch.
1043 */
1044 return 0;
1045 }
1046
1047 /*
1048 * Reset all global state.
1049 */
1050reset:
1051 atomic_set(&global_nwo, 0);
1052 atomic_set(&mce_callin, 0);
1053 barrier();
1054
1055 /*
1056 * Let others run again.
1057 */
1058 atomic_set(&mce_executing, 0);
1059 return ret;
1060}
1061
1062static void mce_clear_state(unsigned long *toclear)
1063{
1064 int i;
1065
d203f0b8 1066 for (i = 0; i < mca_cfg.banks; i++) {
3c079792 1067 if (test_bit(i, toclear))
d9d73fcc 1068 mce_wrmsrl(msr_ops.status(i), 0);
3c079792
AK
1069 }
1070}
1071
b2f9d678
TL
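/*
 * Recovery path for action-required faults in user mode: hand the
 * affected page to memory_failure().  When the exception is not
 * restartable (no RIPV) the task must be killed outright.
 */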
1072static int do_memory_failure(struct mce *m)
1073{
1074 int flags = MF_ACTION_REQUIRED;
1075 int ret;
1076
1077 pr_err("Uncorrected hardware memory error in user-access at %llx", m->addr);
1078 if (!(m->mcgstatus & MCG_STATUS_RIPV))
1079 flags |= MF_MUST_KILL;
1080 ret = memory_failure(m->addr >> PAGE_SHIFT, MCE_VECTOR, flags);
1081 if (ret)
1082 pr_err("Memory error not recovered");
1083 return ret;
1084}
1085
b79109c3
AK
1086/*
1087 * The actual machine check handler. This only handles real
1088 * exceptions when something got corrupted coming in through int 18.
1089 *
1090 * This is executed in NMI context not subject to normal locking rules. This
1091 * implies that most kernel services cannot be safely used. Don't even
1092 * think about putting a printk in there!
3c079792
AK
1093 *
1094 * On Intel systems this is entered on all CPUs in parallel through
1095 * MCE broadcast. However some CPUs might be broken beyond repair,
1096 * so be always careful when synchronizing with others.
1da177e4 1097 */
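/*
 * Rough flow: gather global state, take a quick no_way_out pass over all
 * banks, rendezvous with the other CPUs via mce_start()/mce_end() unless
 * this is a local MCE, grade each valid bank with mce_severity() and log
 * what we keep, then either panic, poison the affected user page, or
 * attempt a kernel exception fixup.
 */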
e9eee03e 1098void do_machine_check(struct pt_regs *regs, long error_code)
1da177e4 1099{
1462594b 1100 struct mca_config *cfg = &mca_cfg;
3c079792 1101 struct mce m, *final;
1da177e4 1102 int i;
3c079792
AK
1103 int worst = 0;
1104 int severity;
fead35c6 1105
3c079792
AK
1106 /*
1107 * Establish sequential order between the CPUs entering the machine
1108 * check handler.
1109 */
fead35c6 1110 int order = -1;
bd78432c
TH
1111 /*
1112 * If no_way_out gets set, there is no safe way to recover from this
d203f0b8 1113 * MCE. If mca_cfg.tolerant is cranked up, we'll try anyway.
bd78432c
TH
1114 */
1115 int no_way_out = 0;
1116 /*
1117 * If kill_it gets set, there might be a way to recover from this
1118 * error.
1119 */
1120 int kill_it = 0;
b79109c3 1121 DECLARE_BITMAP(toclear, MAX_NR_BANKS);
95022b8c 1122 DECLARE_BITMAP(valid_banks, MAX_NR_BANKS);
bd19a5e6 1123 char *msg = "Unknown";
fead35c6
YG
1124
1125 /*
1126 * MCEs are always local on AMD. Same is determined by MCG_STATUS_LMCES
1127 * on Intel.
1128 */
1129 int lmce = 1;
1da177e4 1130
d90167a9
AR
1131 /* If this CPU is offline, just bail out. */
1132 if (cpu_is_offline(smp_processor_id())) {
1133 u64 mcgstatus;
1134
1135 mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS);
1136 if (mcgstatus & MCG_STATUS_RIPV) {
1137 mce_wrmsrl(MSR_IA32_MCG_STATUS, 0);
1138 return;
1139 }
1140 }
1141
8c84014f 1142 ist_enter(regs);
95927475 1143
c6ae41e7 1144 this_cpu_inc(mce_exception_count);
01ca79f1 1145
1462594b 1146 if (!cfg->banks)
32561696 1147 goto out;
1da177e4 1148
b8325c5b 1149 mce_gather_info(&m, regs);
669c00f0 1150 m.tsc = rdtsc();
b5f2fa4e 1151
89cbc767 1152 final = this_cpu_ptr(&mces_seen);
3c079792
AK
1153 *final = m;
1154
95022b8c 1155 memset(valid_banks, 0, sizeof(valid_banks));
61b0fccd 1156 no_way_out = mce_no_way_out(&m, &msg, valid_banks, regs);
680b6cfd 1157
1da177e4
LT
1158 barrier();
1159
ed7290d0 1160 /*
a8c321fb
TL
 1161 * When there is no restart IP we might need to kill or panic.
1162 * Assume the worst for now, but if we find the
1163 * severity is MCE_AR_SEVERITY we have other options.
ed7290d0
AK
1164 */
1165 if (!(m.mcgstatus & MCG_STATUS_RIPV))
1166 kill_it = 1;
1167
3c079792 1168 /*
fead35c6
YG
1169 * Check if this MCE is signaled to only this logical processor,
1170 * on Intel only.
3c079792 1171 */
fead35c6
YG
1172 if (m.cpuvendor == X86_VENDOR_INTEL)
1173 lmce = m.mcgstatus & MCG_STATUS_LMCES;
1174
1175 /*
1176 * Go through all banks in exclusion of the other CPUs. This way we
1177 * don't report duplicated events on shared banks because the first one
1178 * to see it will clear it. If this is a Local MCE, then no need to
1179 * perform rendezvous.
1180 */
1181 if (!lmce)
243d657e 1182 order = mce_start(&no_way_out);
243d657e 1183
1462594b 1184 for (i = 0; i < cfg->banks; i++) {
b79109c3 1185 __clear_bit(i, toclear);
95022b8c
TL
1186 if (!test_bit(i, valid_banks))
1187 continue;
cebe1820 1188 if (!mce_banks[i].ctl)
1da177e4 1189 continue;
d88203d1
TG
1190
1191 m.misc = 0;
1da177e4
LT
1192 m.addr = 0;
1193 m.bank = i;
1da177e4 1194
d9d73fcc 1195 m.status = mce_rdmsrl(msr_ops.status(i));
1da177e4
LT
1196 if ((m.status & MCI_STATUS_VAL) == 0)
1197 continue;
1198
b79109c3 1199 /*
ed7290d0
AK
 1200 * Non-uncorrected or non-signaled errors are handled by
 1201 * machine_check_poll. Leave them alone, unless this panics.
b79109c3 1202 */
1462594b 1203 if (!(m.status & (cfg->ser ? MCI_STATUS_S : MCI_STATUS_UC)) &&
ed7290d0 1204 !no_way_out)
b79109c3
AK
1205 continue;
1206
1207 /*
1208 * Set taint even when machine check was not enabled.
1209 */
373d4d09 1210 add_taint(TAINT_MACHINE_CHECK, LOCKDEP_NOW_UNRELIABLE);
b79109c3 1211
e3480271 1212 severity = mce_severity(&m, cfg->tolerant, NULL, true);
b79109c3 1213
ed7290d0 1214 /*
e3480271
CY
1215 * When machine check was for corrected/deferred handler don't
1216 * touch, unless we're panicing.
ed7290d0 1217 */
e3480271
CY
1218 if ((severity == MCE_KEEP_SEVERITY ||
1219 severity == MCE_UCNA_SEVERITY) && !no_way_out)
ed7290d0
AK
1220 continue;
1221 __set_bit(i, toclear);
1222 if (severity == MCE_NO_SEVERITY) {
b79109c3
AK
1223 /*
1224 * Machine check event was not enabled. Clear, but
1225 * ignore.
1226 */
1227 continue;
1da177e4
LT
1228 }
1229
85f92694 1230 mce_read_aux(&m, i);
1da177e4 1231
fd4cf79f
CG
1232 /* assuming valid severity level != 0 */
1233 m.severity = severity;
9b1beaf2 1234
b79109c3 1235 mce_log(&m);
1da177e4 1236
3c079792
AK
1237 if (severity > worst) {
1238 *final = m;
1239 worst = severity;
1da177e4 1240 }
1da177e4
LT
1241 }
1242
a8c321fb
TL
1243 /* mce_clear_state will clear *final, save locally for use later */
1244 m = *final;
1245
3c079792
AK
1246 if (!no_way_out)
1247 mce_clear_state(toclear);
1248
e9eee03e 1249 /*
3c079792
AK
1250 * Do most of the synchronization with other CPUs.
1251 * When there's any problem use only local no_way_out state.
e9eee03e 1252 */
243d657e
AR
1253 if (!lmce) {
1254 if (mce_end(order) < 0)
1255 no_way_out = worst >= MCE_PANIC_SEVERITY;
1256 } else {
1257 /*
1258 * Local MCE skipped calling mce_reign()
1259 * If we found a fatal error, we need to panic here.
1260 */
1261 if (worst >= MCE_PANIC_SEVERITY && mca_cfg.tolerant < 3)
1262 mce_panic("Machine check from unknown source",
1263 NULL, NULL);
1264 }
bd78432c
TH
1265
1266 /*
b2f9d678
TL
1267 * If tolerant is at an insane level we drop requests to kill
1268 * processes and continue even when there is no way out.
bd78432c 1269 */
b2f9d678
TL
1270 if (cfg->tolerant == 3)
1271 kill_it = 0;
1272 else if (no_way_out)
1273 mce_panic("Fatal machine check on current CPU", &m, msg);
e02e68d3 1274
3c079792
AK
1275 if (worst > 0)
1276 mce_report_event(regs);
5f8c1a54 1277 mce_wrmsrl(MSR_IA32_MCG_STATUS, 0);
32561696 1278out:
88921be3 1279 sync_core();
d4812e16 1280
b2f9d678
TL
1281 if (worst != MCE_AR_SEVERITY && !kill_it)
1282 goto out_ist;
d4812e16 1283
b2f9d678
TL
1284 /* Fault was in user mode and we need to take some action */
1285 if ((m.cs & 3) == 3) {
1286 ist_begin_non_atomic(regs);
1287 local_irq_enable();
1288
1289 if (kill_it || do_memory_failure(&m))
1290 force_sig(SIGBUS, current);
1291 local_irq_disable();
1292 ist_end_non_atomic();
1293 } else {
1294 if (!fixup_exception(regs, X86_TRAP_MC))
1295 mce_panic("Failed kernel mode recovery", &m, NULL);
d4812e16 1296 }
b2f9d678
TL
1297
1298out_ist:
8c84014f 1299 ist_exit(regs);
1da177e4 1300}
ea149b36 1301EXPORT_SYMBOL_GPL(do_machine_check);
1da177e4 1302
cd42f4a3
TL
1303#ifndef CONFIG_MEMORY_FAILURE
1304int memory_failure(unsigned long pfn, int vector, int flags)
9b1beaf2 1305{
a8c321fb
TL
1306 /* mce_severity() should not hand us an ACTION_REQUIRED error */
1307 BUG_ON(flags & MF_ACTION_REQUIRED);
c767a54b
JP
1308 pr_err("Uncorrected memory error in page 0x%lx ignored\n"
1309 "Rebuild kernel with CONFIG_MEMORY_FAILURE=y for smarter handling\n",
1310 pfn);
cd42f4a3
TL
1311
1312 return 0;
9b1beaf2 1313}
cd42f4a3 1314#endif
9b1beaf2 1315
1da177e4 1316/*
8a336b0a
TH
1317 * Periodic polling timer for "silent" machine check errors. If the
1318 * poller finds an MCE, poll 2x faster. When the poller finds no more
1319 * errors, poll 2x slower (up to check_interval seconds).
1da177e4 1320 */
3f2f0680 1321static unsigned long check_interval = INITIAL_CHECK_INTERVAL;
e9eee03e 1322
82f7af09 1323static DEFINE_PER_CPU(unsigned long, mce_next_interval); /* in jiffies */
52d168e2 1324static DEFINE_PER_CPU(struct timer_list, mce_timer);
1da177e4 1325
55babd8f
CG
1326static unsigned long mce_adjust_timer_default(unsigned long interval)
1327{
1328 return interval;
1329}
1330
3f2f0680 1331static unsigned long (*mce_adjust_timer)(unsigned long interval) = mce_adjust_timer_default;
55babd8f 1332
0becc0ae 1333static void __start_timer(struct timer_list *t, unsigned long interval)
27f6c573 1334{
3f2f0680
BP
1335 unsigned long when = jiffies + interval;
1336 unsigned long flags;
27f6c573 1337
3f2f0680 1338 local_irq_save(flags);
27f6c573 1339
0becc0ae
TG
1340 if (!timer_pending(t) || time_before(when, t->expires))
1341 mod_timer(t, round_jiffies(when));
3f2f0680
BP
1342
1343 local_irq_restore(flags);
27f6c573
CG
1344}
1345
82f7af09 1346static void mce_timer_fn(unsigned long data)
1da177e4 1347{
89cbc767 1348 struct timer_list *t = this_cpu_ptr(&mce_timer);
3f2f0680 1349 int cpu = smp_processor_id();
82f7af09 1350 unsigned long iv;
52d168e2 1351
3f2f0680
BP
1352 WARN_ON(cpu != data);
1353
1354 iv = __this_cpu_read(mce_next_interval);
52d168e2 1355
89cbc767 1356 if (mce_available(this_cpu_ptr(&cpu_info))) {
54467353 1357 machine_check_poll(0, this_cpu_ptr(&mce_poll_banks));
3f2f0680
BP
1358
1359 if (mce_intel_cmci_poll()) {
1360 iv = mce_adjust_timer(iv);
1361 goto done;
1362 }
e9eee03e 1363 }
1da177e4
LT
1364
1365 /*
3f2f0680
BP
1366 * Alert userspace if needed. If we logged an MCE, reduce the polling
1367 * interval, otherwise increase the polling interval.
1da177e4 1368 */
3f2f0680 1369 if (mce_notify_irq())
958fb3c5 1370 iv = max(iv / 2, (unsigned long) HZ/100);
3f2f0680 1371 else
82f7af09 1372 iv = min(iv * 2, round_jiffies_relative(check_interval * HZ));
3f2f0680
BP
1373
1374done:
82f7af09 1375 __this_cpu_write(mce_next_interval, iv);
0becc0ae 1376 __start_timer(t, iv);
55babd8f 1377}
e02e68d3 1378
55babd8f
CG
1379/*
1380 * Ensure that the timer is firing in @interval from now.
1381 */
1382void mce_timer_kick(unsigned long interval)
1383{
89cbc767 1384 struct timer_list *t = this_cpu_ptr(&mce_timer);
55babd8f
CG
1385 unsigned long iv = __this_cpu_read(mce_next_interval);
1386
0becc0ae 1387 __start_timer(t, interval);
3f2f0680 1388
55babd8f
CG
1389 if (interval < iv)
1390 __this_cpu_write(mce_next_interval, interval);
e02e68d3
TH
1391}
1392
9aaef96f
HS
1393/* Must not be called in IRQ context where del_timer_sync() can deadlock */
1394static void mce_timer_delete_all(void)
1395{
1396 int cpu;
1397
1398 for_each_online_cpu(cpu)
1399 del_timer_sync(&per_cpu(mce_timer, cpu));
1400}
1401
9bd98405
AK
1402static void mce_do_trigger(struct work_struct *work)
1403{
1020bcbc 1404 call_usermodehelper(mce_helper, mce_helper_argv, NULL, UMH_NO_WAIT);
9bd98405
AK
1405}
1406
1407static DECLARE_WORK(mce_trigger_work, mce_do_trigger);
1408
e02e68d3 1409/*
9bd98405
AK
1410 * Notify the user(s) about new machine check events.
1411 * Can be called from interrupt context, but not from machine check/NMI
1412 * context.
e02e68d3 1413 */
9ff36ee9 1414int mce_notify_irq(void)
e02e68d3 1415{
8457c84d
AK
1416 /* Not more than two messages every minute */
1417 static DEFINE_RATELIMIT_STATE(ratelimit, 60*HZ, 2);
1418
1020bcbc 1419 if (test_and_clear_bit(0, &mce_need_notify)) {
93b62c3c
HS
1420 /* wake processes polling /dev/mcelog */
1421 wake_up_interruptible(&mce_chrdev_wait);
9bd98405 1422
4d899be5 1423 if (mce_helper[0])
9bd98405 1424 schedule_work(&mce_trigger_work);
e02e68d3 1425
8457c84d 1426 if (__ratelimit(&ratelimit))
a2d7b0d4 1427 pr_info(HW_ERR "Machine check events logged\n");
e02e68d3
TH
1428
1429 return 1;
1da177e4 1430 }
e02e68d3
TH
1431 return 0;
1432}
9ff36ee9 1433EXPORT_SYMBOL_GPL(mce_notify_irq);
8a336b0a 1434
148f9bb8 1435static int __mcheck_cpu_mce_banks_init(void)
cebe1820
AK
1436{
1437 int i;
d203f0b8 1438 u8 num_banks = mca_cfg.banks;
cebe1820 1439
d203f0b8 1440 mce_banks = kzalloc(num_banks * sizeof(struct mce_bank), GFP_KERNEL);
cebe1820
AK
1441 if (!mce_banks)
1442 return -ENOMEM;
d203f0b8
BP
1443
1444 for (i = 0; i < num_banks; i++) {
cebe1820 1445 struct mce_bank *b = &mce_banks[i];
11868a2d 1446
cebe1820
AK
1447 b->ctl = -1ULL;
1448 b->init = 1;
1449 }
1450 return 0;
1451}
1452
d88203d1 1453/*
1da177e4
LT
1454 * Initialize Machine Checks for a CPU.
1455 */
148f9bb8 1456static int __mcheck_cpu_cap_init(void)
1da177e4 1457{
0d7482e3 1458 unsigned b;
e9eee03e 1459 u64 cap;
1da177e4
LT
1460
1461 rdmsrl(MSR_IA32_MCG_CAP, cap);
01c6680a
TG
1462
1463 b = cap & MCG_BANKCNT_MASK;
d203f0b8 1464 if (!mca_cfg.banks)
c767a54b 1465 pr_info("CPU supports %d MCE banks\n", b);
b659294b 1466
0d7482e3 1467 if (b > MAX_NR_BANKS) {
c767a54b 1468 pr_warn("Using only %u machine check banks out of %u\n",
0d7482e3
AK
1469 MAX_NR_BANKS, b);
1470 b = MAX_NR_BANKS;
1471 }
1472
1473 /* Don't support asymmetric configurations today */
d203f0b8
BP
1474 WARN_ON(mca_cfg.banks != 0 && b != mca_cfg.banks);
1475 mca_cfg.banks = b;
1476
cebe1820 1477 if (!mce_banks) {
cffd377e 1478 int err = __mcheck_cpu_mce_banks_init();
11868a2d 1479
cebe1820
AK
1480 if (err)
1481 return err;
1da177e4 1482 }
0d7482e3 1483
94ad8474 1484 /* Use accurate RIP reporting if available. */
01c6680a 1485 if ((cap & MCG_EXT_P) && MCG_EXT_CNT(cap) >= 9)
84c2559d 1486 mca_cfg.rip_msr = MSR_IA32_MCG_EIP;
1da177e4 1487
ed7290d0 1488 if (cap & MCG_SER_P)
1462594b 1489 mca_cfg.ser = true;
ed7290d0 1490
0d7482e3
AK
1491 return 0;
1492}
1493
5e09954a 1494static void __mcheck_cpu_init_generic(void)
0d7482e3 1495{
84c2559d 1496 enum mcp_flags m_fl = 0;
e9eee03e 1497 mce_banks_t all_banks;
0d7482e3 1498 u64 cap;
0d7482e3 1499
84c2559d
BP
1500 if (!mca_cfg.bootlog)
1501 m_fl = MCP_DONTLOG;
1502
b79109c3
AK
1503 /*
1504 * Log the machine checks left over from the previous reset.
1505 */
ee031c31 1506 bitmap_fill(all_banks, MAX_NR_BANKS);
84c2559d 1507 machine_check_poll(MCP_UC | m_fl, &all_banks);
1da177e4 1508
375074cc 1509 cr4_set_bits(X86_CR4_MCE);
1da177e4 1510
0d7482e3 1511 rdmsrl(MSR_IA32_MCG_CAP, cap);
1da177e4
LT
1512 if (cap & MCG_CTL_P)
1513 wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff);
bb91f8c0
AG
1514}
1515
1516static void __mcheck_cpu_init_clear_banks(void)
1517{
1518 int i;
1da177e4 1519
d203f0b8 1520 for (i = 0; i < mca_cfg.banks; i++) {
cebe1820 1521 struct mce_bank *b = &mce_banks[i];
11868a2d 1522
cebe1820 1523 if (!b->init)
06b7a7a5 1524 continue;
d9d73fcc
YG
1525 wrmsrl(msr_ops.ctl(i), b->ctl);
1526 wrmsrl(msr_ops.status(i), 0);
d88203d1 1527 }
1da177e4
LT
1528}
1529
61b0fccd
TL
1530/*
1531 * During IFU recovery Sandy Bridge -EP4S processors set the RIPV and
1532 * EIPV bits in MCG_STATUS to zero on the affected logical processor (SDM
1533 * Vol 3B Table 15-20). But this confuses both the code that determines
1534 * whether the machine check occurred in kernel or user mode, and also
1535 * the severity assessment code. Pretend that EIPV was set, and take the
1536 * ip/cs values from the pt_regs that mce_gather_info() ignored earlier.
1537 */
1538static void quirk_sandybridge_ifu(int bank, struct mce *m, struct pt_regs *regs)
1539{
1540 if (bank != 0)
1541 return;
1542 if ((m->mcgstatus & (MCG_STATUS_EIPV|MCG_STATUS_RIPV)) != 0)
1543 return;
1544 if ((m->status & (MCI_STATUS_OVER|MCI_STATUS_UC|
1545 MCI_STATUS_EN|MCI_STATUS_MISCV|MCI_STATUS_ADDRV|
1546 MCI_STATUS_PCC|MCI_STATUS_S|MCI_STATUS_AR|
1547 MCACOD)) !=
1548 (MCI_STATUS_UC|MCI_STATUS_EN|
1549 MCI_STATUS_MISCV|MCI_STATUS_ADDRV|MCI_STATUS_S|
1550 MCI_STATUS_AR|MCACOD_INSTR))
1551 return;
1552
1553 m->mcgstatus |= MCG_STATUS_EIPV;
1554 m->ip = regs->ip;
1555 m->cs = regs->cs;
1556}
1557
1da177e4 1558/* Add per CPU specific workarounds here */
148f9bb8 1559static int __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c)
d88203d1 1560{
d203f0b8
BP
1561 struct mca_config *cfg = &mca_cfg;
1562
e412cd25 1563 if (c->x86_vendor == X86_VENDOR_UNKNOWN) {
c767a54b 1564 pr_info("unknown CPU type - not enabling MCE support\n");
e412cd25
IM
1565 return -EOPNOTSUPP;
1566 }
1567
1da177e4 1568 /* This should be disabled by the BIOS, but isn't always */
911f6a7b 1569 if (c->x86_vendor == X86_VENDOR_AMD) {
d203f0b8 1570 if (c->x86 == 15 && cfg->banks > 4) {
e9eee03e
IM
1571 /*
1572 * disable GART TBL walk error reporting, which
1573 * trips off incorrectly with the IOMMU & 3ware
1574 * & Cerberus:
1575 */
cebe1820 1576 clear_bit(10, (unsigned long *)&mce_banks[4].ctl);
e9eee03e 1577 }
10001d91 1578 if (c->x86 < 17 && cfg->bootlog < 0) {
e9eee03e
IM
1579 /*
1580 * Lots of broken BIOS around that don't clear them
1581 * by default and leave crap in there. Don't log:
1582 */
84c2559d 1583 cfg->bootlog = 0;
e9eee03e 1584 }
2e6f694f
AK
1585 /*
1586 * Various K7s with broken bank 0 around. Always disable
1587 * by default.
1588 */
c9ce8712 1589 if (c->x86 == 6 && cfg->banks > 0)
cebe1820 1590 mce_banks[0].ctl = 0;
575203b4 1591
bf80bbd7
AG
1592 /*
1593 * overflow_recov is supported for F15h Models 00h-0fh
1594 * even though we don't have a CPUID bit for it.
1595 */
1596 if (c->x86 == 0x15 && c->x86_model <= 0xf)
1597 mce_flags.overflow_recov = 1;
1598
c9ce8712
BP
1599 /*
1600 * Turn off MC4_MISC thresholding banks on those models since
1601 * they're not supported there.
1602 */
1603 if (c->x86 == 0x15 &&
1604 (c->x86_model >= 0x10 && c->x86_model <= 0x1f)) {
1605 int i;
1606 u64 hwcr;
1607 bool need_toggle;
1608 u32 msrs[] = {
575203b4
BP
1609 0x00000413, /* MC4_MISC0 */
1610 0xc0000408, /* MC4_MISC1 */
c9ce8712 1611 };
575203b4 1612
c9ce8712 1613 rdmsrl(MSR_K7_HWCR, hwcr);
575203b4 1614
c9ce8712
BP
1615 /* McStatusWrEn has to be set */
1616 need_toggle = !(hwcr & BIT(18));
575203b4 1617
c9ce8712
BP
1618 if (need_toggle)
1619 wrmsrl(MSR_K7_HWCR, hwcr | BIT(18));
575203b4 1620
c9ce8712
BP
1621 /* Clear CntP bit safely */
1622 for (i = 0; i < ARRAY_SIZE(msrs); i++)
1623 msr_clear_bit(msrs[i], 62);
575203b4 1624
c9ce8712
BP
1625 /* restore old settings */
1626 if (need_toggle)
1627 wrmsrl(MSR_K7_HWCR, hwcr);
1628 }
1da177e4 1629 }
e583538f 1630
06b7a7a5
AK
1631 if (c->x86_vendor == X86_VENDOR_INTEL) {
1632 /*
1633 * SDM documents that on family 6 bank 0 should not be written
1634 * because it aliases to another special BIOS controlled
1635 * register.
1636 * But it's not aliased anymore on model 0x1a+
1637 * Don't ignore bank 0 completely because there could be a
1638 * valid event later, merely don't write CTL0.
1639 */
1640
d203f0b8 1641 if (c->x86 == 6 && c->x86_model < 0x1A && cfg->banks > 0)
cebe1820 1642 mce_banks[0].init = 0;
3c079792
AK
1643
1644 /*
1645 * All newer Intel systems support MCE broadcasting. Enable
1646 * synchronization with a one second timeout.
1647 */
1648 if ((c->x86 > 6 || (c->x86 == 6 && c->x86_model >= 0xe)) &&
84c2559d
BP
1649 cfg->monarch_timeout < 0)
1650 cfg->monarch_timeout = USEC_PER_SEC;
c7f6fa44 1651
e412cd25
IM
1652 /*
1653 * There are also broken BIOSes on some Pentium M and
1654 * earlier systems:
1655 */
84c2559d
BP
1656 if (c->x86 == 6 && c->x86_model <= 13 && cfg->bootlog < 0)
1657 cfg->bootlog = 0;
61b0fccd
TL
1658
1659 if (c->x86 == 6 && c->x86_model == 45)
1660 quirk_no_way_out = quirk_sandybridge_ifu;
06b7a7a5 1661 }
84c2559d
BP
1662 if (cfg->monarch_timeout < 0)
1663 cfg->monarch_timeout = 0;
1664 if (cfg->bootlog != 0)
7af19e4a 1665 cfg->panic_timeout = 30;
e412cd25
IM
1666
1667 return 0;
d88203d1 1668}
1da177e4 1669
148f9bb8 1670static int __mcheck_cpu_ancient_init(struct cpuinfo_x86 *c)
4efc0670
AK
1671{
1672 if (c->x86 != 5)
3a97fc34
HS
1673 return 0;
1674
4efc0670
AK
1675 switch (c->x86_vendor) {
1676 case X86_VENDOR_INTEL:
c6978369 1677 intel_p5_mcheck_init(c);
3a97fc34 1678 return 1;
4efc0670
AK
1679 break;
1680 case X86_VENDOR_CENTAUR:
1681 winchip_mcheck_init(c);
3a97fc34 1682 return 1;
4efc0670 1683 break;
dc34bdd2
BP
1684 default:
1685 return 0;
4efc0670 1686 }
3a97fc34
HS
1687
1688 return 0;
4efc0670
AK
1689}
1690
5e09954a 1691static void __mcheck_cpu_init_vendor(struct cpuinfo_x86 *c)
1da177e4
LT
1692{
1693 switch (c->x86_vendor) {
1694 case X86_VENDOR_INTEL:
1695 mce_intel_feature_init(c);
3f2f0680 1696 mce_adjust_timer = cmci_intel_adjust_timer;
1da177e4 1697 break;
7559e13f
AG
1698
1699 case X86_VENDOR_AMD: {
14cddfd5
YG
1700 mce_flags.overflow_recov = !!cpu_has(c, X86_FEATURE_OVERFLOW_RECOV);
1701 mce_flags.succor = !!cpu_has(c, X86_FEATURE_SUCCOR);
1702 mce_flags.smca = !!cpu_has(c, X86_FEATURE_SMCA);
d9d73fcc
YG
1703
1704 /*
1705 * Install proper ops for Scalable MCA enabled processors
1706 */
1707 if (mce_flags.smca) {
1708 msr_ops.ctl = smca_ctl_reg;
1709 msr_ops.status = smca_status_reg;
1710 msr_ops.addr = smca_addr_reg;
1711 msr_ops.misc = smca_misc_reg;
1712 }
bfbe0eeb 1713 mce_amd_feature_init(c);
c7f54d21 1714
89b831ef 1715 break;
7559e13f
AG
1716 }
1717
1da177e4
LT
1718 default:
1719 break;
1720 }
1721}
1722
8838eb6c
AR
1723static void __mcheck_cpu_clear_vendor(struct cpuinfo_x86 *c)
1724{
1725 switch (c->x86_vendor) {
1726 case X86_VENDOR_INTEL:
1727 mce_intel_feature_clear(c);
1728 break;
1729 default:
1730 break;
1731 }
1732}
1733
0becc0ae 1734static void mce_start_timer(struct timer_list *t)
52d168e2 1735{
4f75d841 1736 unsigned long iv = check_interval * HZ;
bc09effa 1737
7af19e4a 1738 if (mca_cfg.ignore_ce || !iv)
62fdac59
HS
1739 return;
1740
0becc0ae
TG
1741 this_cpu_write(mce_next_interval, iv);
1742 __start_timer(t, iv);
52d168e2
AK
1743}
1744
39f152ff
SAS
1745static void __mcheck_cpu_setup_timer(void)
1746{
1747 struct timer_list *t = this_cpu_ptr(&mce_timer);
1748 unsigned int cpu = smp_processor_id();
1749
1750 setup_pinned_timer(t, mce_timer_fn, cpu);
1751}
1752
26c3c283
TG
1753static void __mcheck_cpu_init_timer(void)
1754{
89cbc767 1755 struct timer_list *t = this_cpu_ptr(&mce_timer);
26c3c283
TG
1756 unsigned int cpu = smp_processor_id();
1757
f9c287ba 1758 setup_pinned_timer(t, mce_timer_fn, cpu);
0becc0ae 1759 mce_start_timer(t);
26c3c283
TG
1760}
1761
9eda8cb3
AK
1762/* Handle unconfigured int18 (should never happen) */
1763static void unexpected_machine_check(struct pt_regs *regs, long error_code)
1764{
c767a54b 1765 pr_err("CPU#%d: Unexpected int18 (Machine Check)\n",
9eda8cb3
AK
1766 smp_processor_id());
1767}
1768
1769/* Call the installed machine check handler for this CPU setup. */
1770void (*machine_check_vector)(struct pt_regs *, long error_code) =
1771 unexpected_machine_check;
1772
d88203d1 1773/*
1da177e4 1774 * Called for each booted CPU to set up machine checks.
e9eee03e 1775 * Must be called with preempt off:
1da177e4 1776 */
148f9bb8 1777void mcheck_cpu_init(struct cpuinfo_x86 *c)
1da177e4 1778{
1462594b 1779 if (mca_cfg.disabled)
4efc0670
AK
1780 return;
1781
3a97fc34
HS
1782 if (__mcheck_cpu_ancient_init(c))
1783 return;
4efc0670 1784
5b4408fd 1785 if (!mce_available(c))
1da177e4
LT
1786 return;
1787
5e09954a 1788 if (__mcheck_cpu_cap_init() < 0 || __mcheck_cpu_apply_quirks(c) < 0) {
1462594b 1789 mca_cfg.disabled = true;
0d7482e3
AK
1790 return;
1791 }
0d7482e3 1792
648ed940
CG
1793 if (mce_gen_pool_init()) {
1794 mca_cfg.disabled = true;
1795 pr_emerg("Couldn't allocate MCE records pool!\n");
1796 return;
1797 }
1798
5d727926
AK
1799 machine_check_vector = do_machine_check;
1800
5e09954a
BP
1801 __mcheck_cpu_init_generic();
1802 __mcheck_cpu_init_vendor(c);
bb91f8c0 1803 __mcheck_cpu_init_clear_banks();
39f152ff 1804 __mcheck_cpu_setup_timer();
1da177e4
LT
1805}
1806
8838eb6c
AR
1807/*
1808 * Called for each booted CPU to clear some machine checks opt-ins
1809 */
1810void mcheck_cpu_clear(struct cpuinfo_x86 *c)
1811{
1812 if (mca_cfg.disabled)
1813 return;
1814
1815 if (!mce_available(c))
1816 return;
1817
1818 /*
1819 * Possibly to clear general settings generic to x86
1820 * __mcheck_cpu_clear_generic(c);
1821 */
1822 __mcheck_cpu_clear_vendor(c);
1823
1da177e4
LT
1824}
1825
1826/*
93b62c3c 1827 * mce_chrdev: Character device /dev/mcelog to read and clear the MCE log.
1da177e4
LT
1828 */
1829
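/*
 * Illustrative userspace sketch (not part of this file).  The device only
 * supports reading the whole log in one go, so a reader sizes its buffer
 * from the ioctls implemented below; assuming the uapi MCE definitions
 * (struct mce and the MCE_* ioctls) are available, roughly:
 *
 *	int fd = open("/dev/mcelog", O_RDONLY);
 *	int reclen, loglen;
 *	void *buf;
 *	ssize_t n;
 *
 *	ioctl(fd, MCE_GET_RECORD_LEN, &reclen);
 *	ioctl(fd, MCE_GET_LOG_LEN, &loglen);
 *	buf = malloc((size_t)reclen * loglen);
 *	n = read(fd, buf, (size_t)reclen * loglen);
 *	// n / reclen records were returned; reading also clears the log
 */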
93b62c3c
HS
1830static DEFINE_SPINLOCK(mce_chrdev_state_lock);
1831static int mce_chrdev_open_count; /* #times opened */
1832static int mce_chrdev_open_exclu; /* already open exclusive? */
f528e7ba 1833
93b62c3c 1834static int mce_chrdev_open(struct inode *inode, struct file *file)
f528e7ba 1835{
93b62c3c 1836 spin_lock(&mce_chrdev_state_lock);
f528e7ba 1837
93b62c3c
HS
1838 if (mce_chrdev_open_exclu ||
1839 (mce_chrdev_open_count && (file->f_flags & O_EXCL))) {
1840 spin_unlock(&mce_chrdev_state_lock);
e9eee03e 1841
f528e7ba
TH
1842 return -EBUSY;
1843 }
1844
1845 if (file->f_flags & O_EXCL)
93b62c3c
HS
1846 mce_chrdev_open_exclu = 1;
1847 mce_chrdev_open_count++;
f528e7ba 1848
93b62c3c 1849 spin_unlock(&mce_chrdev_state_lock);
f528e7ba 1850
bd78432c 1851 return nonseekable_open(inode, file);
f528e7ba
TH
1852}
1853
93b62c3c 1854static int mce_chrdev_release(struct inode *inode, struct file *file)
f528e7ba 1855{
93b62c3c 1856 spin_lock(&mce_chrdev_state_lock);
f528e7ba 1857
93b62c3c
HS
1858 mce_chrdev_open_count--;
1859 mce_chrdev_open_exclu = 0;
f528e7ba 1860
93b62c3c 1861 spin_unlock(&mce_chrdev_state_lock);
f528e7ba
TH
1862
1863 return 0;
1864}
1865
d88203d1
TG
1866static void collect_tscs(void *data)
1867{
1da177e4 1868 unsigned long *cpu_tsc = (unsigned long *)data;
d88203d1 1869
4ea1636b 1870 cpu_tsc[smp_processor_id()] = rdtsc();
d88203d1 1871}
1da177e4 1872
482908b4
HY
1873static int mce_apei_read_done;
1874
1875/* Collect MCE records from the previous boot, saved in persistent storage via APEI ERST. */
1876static int __mce_read_apei(char __user **ubuf, size_t usize)
1877{
1878 int rc;
1879 u64 record_id;
1880 struct mce m;
1881
1882 if (usize < sizeof(struct mce))
1883 return -EINVAL;
1884
1885 rc = apei_read_mce(&m, &record_id);
1886 /* Error or no more MCE record */
1887 if (rc <= 0) {
1888 mce_apei_read_done = 1;
fadd85f1
NH
1889 /*
1890 * When ERST is disabled, mce_chrdev_read() should return
1891 * "no record" instead of "no device."
1892 */
1893 if (rc == -ENODEV)
1894 return 0;
482908b4
HY
1895 return rc;
1896 }
1897 rc = -EFAULT;
1898 if (copy_to_user(*ubuf, &m, sizeof(struct mce)))
1899 return rc;
1900 /*
 1901 * Ideally the record would only be cleared once /sbin/mcelog has
 1902 * flushed it to disk or sent it over the network, but there is no
 1903 * interface for that yet, so clear it here to avoid handing out
 1904 * duplicates.
1905 */
1906 rc = apei_clear_mce(record_id);
1907 if (rc) {
1908 mce_apei_read_done = 1;
1909 return rc;
1910 }
1911 *ubuf += sizeof(struct mce);
1912
1913 return 0;
1914}
1915
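/*
 * Added summary (descriptive only): the reader below first drains records a
 * previous boot left in APEI ERST, then copies mcelog.entry[0..next), spinning
 * briefly on entries whose ->finished bit is not yet set.  It clears the
 * consumed slots and retries with cmpxchg() on mcelog.next until no new
 * entries raced in, and after synchronize_sched() it sweeps the remaining
 * slots for stragglers whose timestamp predates the per-CPU TSCs gathered by
 * collect_tscs().
 */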
93b62c3c
HS
1916static ssize_t mce_chrdev_read(struct file *filp, char __user *ubuf,
1917 size_t usize, loff_t *off)
1da177e4 1918{
e9eee03e 1919 char __user *buf = ubuf;
f0de53bb 1920 unsigned long *cpu_tsc;
ef41df43 1921 unsigned prev, next;
1da177e4
LT
1922 int i, err;
1923
6bca67f9 1924 cpu_tsc = kmalloc(nr_cpu_ids * sizeof(long), GFP_KERNEL);
f0de53bb
AK
1925 if (!cpu_tsc)
1926 return -ENOMEM;
1927
93b62c3c 1928 mutex_lock(&mce_chrdev_read_mutex);
482908b4
HY
1929
1930 if (!mce_apei_read_done) {
1931 err = __mce_read_apei(&buf, usize);
1932 if (err || buf != ubuf)
1933 goto out;
1934 }
1935
9a7783d0 1936 next = mce_log_get_idx_check(mcelog.next);
1da177e4
LT
1937
1938 /* Only supports full reads right now */
482908b4
HY
1939 err = -EINVAL;
1940 if (*off != 0 || usize < MCE_LOG_LEN*sizeof(struct mce))
1941 goto out;
1da177e4
LT
1942
1943 err = 0;
ef41df43
HY
1944 prev = 0;
1945 do {
1946 for (i = prev; i < next; i++) {
1947 unsigned long start = jiffies;
559faa6b 1948 struct mce *m = &mcelog.entry[i];
ef41df43 1949
559faa6b 1950 while (!m->finished) {
ef41df43 1951 if (time_after_eq(jiffies, start + 2)) {
559faa6b 1952 memset(m, 0, sizeof(*m));
ef41df43
HY
1953 goto timeout;
1954 }
1955 cpu_relax();
673242c1 1956 }
ef41df43 1957 smp_rmb();
559faa6b
HS
1958 err |= copy_to_user(buf, m, sizeof(*m));
1959 buf += sizeof(*m);
ef41df43
HY
1960timeout:
1961 ;
673242c1 1962 }
1da177e4 1963
ef41df43
HY
1964 memset(mcelog.entry + prev, 0,
1965 (next - prev) * sizeof(struct mce));
1966 prev = next;
1967 next = cmpxchg(&mcelog.next, prev, 0);
1968 } while (next != prev);
1da177e4 1969
b2b18660 1970 synchronize_sched();
1da177e4 1971
d88203d1
TG
1972 /*
1973 * Collect entries that were still getting written before the
1974 * synchronize.
1975 */
15c8b6c1 1976 on_each_cpu(collect_tscs, cpu_tsc, 1);
e9eee03e 1977
d88203d1 1978 for (i = next; i < MCE_LOG_LEN; i++) {
559faa6b
HS
1979 struct mce *m = &mcelog.entry[i];
1980
1981 if (m->finished && m->tsc < cpu_tsc[m->cpu]) {
1982 err |= copy_to_user(buf, m, sizeof(*m));
1da177e4 1983 smp_rmb();
559faa6b
HS
1984 buf += sizeof(*m);
1985 memset(m, 0, sizeof(*m));
1da177e4 1986 }
d88203d1 1987 }
482908b4
HY
1988
1989 if (err)
1990 err = -EFAULT;
1991
1992out:
93b62c3c 1993 mutex_unlock(&mce_chrdev_read_mutex);
f0de53bb 1994 kfree(cpu_tsc);
e9eee03e 1995
482908b4 1996 return err ? err : buf - ubuf;
1da177e4
LT
1997}
1998
93b62c3c 1999static unsigned int mce_chrdev_poll(struct file *file, poll_table *wait)
e02e68d3 2000{
93b62c3c 2001 poll_wait(file, &mce_chrdev_wait, wait);
e90328b8 2002 if (READ_ONCE(mcelog.next))
e02e68d3 2003 return POLLIN | POLLRDNORM;
482908b4
HY
2004 if (!mce_apei_read_done && apei_check_mce())
2005 return POLLIN | POLLRDNORM;
e02e68d3
TH
2006 return 0;
2007}
2008
93b62c3c
HS
2009static long mce_chrdev_ioctl(struct file *f, unsigned int cmd,
2010 unsigned long arg)
1da177e4
LT
2011{
2012 int __user *p = (int __user *)arg;
d88203d1 2013
1da177e4 2014 if (!capable(CAP_SYS_ADMIN))
d88203d1 2015 return -EPERM;
e9eee03e 2016
1da177e4 2017 switch (cmd) {
d88203d1 2018 case MCE_GET_RECORD_LEN:
1da177e4
LT
2019 return put_user(sizeof(struct mce), p);
2020 case MCE_GET_LOG_LEN:
d88203d1 2021 return put_user(MCE_LOG_LEN, p);
1da177e4
LT
2022 case MCE_GETCLEAR_FLAGS: {
2023 unsigned flags;
d88203d1
TG
2024
2025 do {
1da177e4 2026 flags = mcelog.flags;
d88203d1 2027 } while (cmpxchg(&mcelog.flags, flags, 0) != flags);
e9eee03e 2028
d88203d1 2029 return put_user(flags, p);
1da177e4
LT
2030 }
2031 default:
d88203d1
TG
2032 return -ENOTTY;
2033 }
1da177e4
LT
2034}
2035
66f5ddf3
LT
2036static ssize_t (*mce_write)(struct file *filp, const char __user *ubuf,
2037 size_t usize, loff_t *off);
2038
2039void register_mce_write_callback(ssize_t (*fn)(struct file *filp,
2040 const char __user *ubuf,
2041 size_t usize, loff_t *off))
2042{
2043 mce_write = fn;
2044}
2045EXPORT_SYMBOL_GPL(register_mce_write_callback);
2046
29c6820f
PM
2047static ssize_t mce_chrdev_write(struct file *filp, const char __user *ubuf,
2048 size_t usize, loff_t *off)
66f5ddf3
LT
2049{
2050 if (mce_write)
2051 return mce_write(filp, ubuf, usize, off);
2052 else
2053 return -EINVAL;
2054}
2055
2056static const struct file_operations mce_chrdev_ops = {
93b62c3c
HS
2057 .open = mce_chrdev_open,
2058 .release = mce_chrdev_release,
2059 .read = mce_chrdev_read,
66f5ddf3 2060 .write = mce_chrdev_write,
93b62c3c
HS
2061 .poll = mce_chrdev_poll,
2062 .unlocked_ioctl = mce_chrdev_ioctl,
2063 .llseek = no_llseek,
1da177e4
LT
2064};
2065
93b62c3c 2066static struct miscdevice mce_chrdev_device = {
1da177e4
LT
2067 MISC_MCELOG_MINOR,
2068 "mcelog",
2069 &mce_chrdev_ops,
2070};
2071
c3d1fb56
NR
2072static void __mce_disable_bank(void *arg)
2073{
2074 int bank = *((int *)arg);
89cbc767 2075 __clear_bit(bank, this_cpu_ptr(mce_poll_banks));
c3d1fb56
NR
2076 cmci_disable_bank(bank);
2077}
2078
2079void mce_disable_bank(int bank)
2080{
2081 if (bank >= mca_cfg.banks) {
2082 pr_warn(FW_BUG
2083 "Ignoring request to disable invalid MCA bank %d.\n",
2084 bank);
2085 return;
2086 }
2087 set_bit(bank, mce_banks_ce_disabled);
2088 on_each_cpu(__mce_disable_bank, &bank, 1);
2089}
2090
13503fa9 2091/*
62fdac59
HS
2092 * mce=off Disables machine check
2093 * mce=no_cmci Disables CMCI
88d53867 2094 * mce=no_lmce Disables LMCE
62fdac59
HS
2095 * mce=dont_log_ce Clears corrected events silently, no log created for CEs.
2096 * mce=ignore_ce Disables polling and CMCI, corrected events are not cleared.
3c079792
AK
2097 * mce=TOLERANCELEVEL[,monarchtimeout] (number, see above)
2098 * monarchtimeout is how long to wait for other CPUs on machine
2099 * check, or 0 to not wait
13503fa9
HS
2100 * mce=bootlog Log MCEs from before booting. Disabled by default on AMD.
2101 * mce=nobootlog Don't log MCEs from before booting.
450cc201 2102 * mce=bios_cmci_threshold Don't program the CMCI threshold
3637efb0 2103 * mce=recovery Force-enable memcpy_mcsafe()
13503fa9 2104 */
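/*
 * A few illustrative command lines (examples only, built from the list
 * above):
 *
 *	mce=off		disable the machine check code entirely
 *	mce=bootlog	do log MCEs left over from before boot
 *	mce=2,100	tolerant=2, monarch_timeout=100
 *
 * Most of these knobs can also be changed at runtime under
 * /sys/devices/system/machinecheck/.
 */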
1da177e4
LT
2105static int __init mcheck_enable(char *str)
2106{
d203f0b8
BP
2107 struct mca_config *cfg = &mca_cfg;
2108
e3346fc4 2109 if (*str == 0) {
4efc0670 2110 enable_p5_mce();
e3346fc4
BZ
2111 return 1;
2112 }
4efc0670
AK
2113 if (*str == '=')
2114 str++;
1da177e4 2115 if (!strcmp(str, "off"))
1462594b 2116 cfg->disabled = true;
62fdac59 2117 else if (!strcmp(str, "no_cmci"))
7af19e4a 2118 cfg->cmci_disabled = true;
88d53867
AR
2119 else if (!strcmp(str, "no_lmce"))
2120 cfg->lmce_disabled = true;
62fdac59 2121 else if (!strcmp(str, "dont_log_ce"))
d203f0b8 2122 cfg->dont_log_ce = true;
62fdac59 2123 else if (!strcmp(str, "ignore_ce"))
7af19e4a 2124 cfg->ignore_ce = true;
13503fa9 2125 else if (!strcmp(str, "bootlog") || !strcmp(str, "nobootlog"))
84c2559d 2126 cfg->bootlog = (str[0] == 'b');
450cc201 2127 else if (!strcmp(str, "bios_cmci_threshold"))
1462594b 2128 cfg->bios_cmci_threshold = true;
0f68c088
TL
2129 else if (!strcmp(str, "recovery"))
2130 cfg->recovery = true;
3c079792 2131 else if (isdigit(str[0])) {
5c31b280 2132 if (get_option(&str, &cfg->tolerant) == 2)
84c2559d 2133 get_option(&str, &(cfg->monarch_timeout));
3c079792 2134 } else {
c767a54b 2135 pr_info("mce argument %s ignored. Please use /sys\n", str);
13503fa9
HS
2136 return 0;
2137 }
9b41046c 2138 return 1;
1da177e4 2139}
4efc0670 2140__setup("mce", mcheck_enable);
1da177e4 2141
a2202aa2 2142int __init mcheck_init(void)
b33a6363 2143{
a2202aa2 2144 mcheck_intel_therm_init();
eef4dfa0 2145 mce_register_decode_chain(&mce_srao_nb);
cd9c57ca 2146 mce_register_decode_chain(&mce_default_nb);
43eaa2a1 2147 mcheck_vendor_init_severity();
a2202aa2 2148
cff4c039 2149 INIT_WORK(&mce_work, mce_gen_pool_process);
061120ae
CG
2150 init_irq_work(&mce_irq_work, mce_irq_work_cb);
2151
b33a6363
BP
2152 return 0;
2153}
b33a6363 2154
d88203d1 2155/*
c7cece89 2156 * mce_syscore: PM support
d88203d1 2157 */
1da177e4 2158
973a2dd1
AK
2159/*
2160 * Disable machine checks on suspend and shutdown. We can't really handle
2161 * them later.
2162 */
6e06780a 2163static void mce_disable_error_reporting(void)
973a2dd1
AK
2164{
2165 int i;
2166
d203f0b8 2167 for (i = 0; i < mca_cfg.banks; i++) {
cebe1820 2168 struct mce_bank *b = &mce_banks[i];
11868a2d 2169
cebe1820 2170 if (b->init)
d9d73fcc 2171 wrmsrl(msr_ops.ctl(i), 0);
06b7a7a5 2172 }
6e06780a
AR
2173 return;
2174}
2175
2176static void vendor_disable_error_reporting(void)
2177{
2178 /*
2179 * Don't clear on Intel CPUs. Some of these MSRs are socket-wide.
2180 * Disabling them for just a single offlined CPU is bad, since it will
2181 * inhibit reporting for all shared resources on the socket like the
2182 * last level cache (LLC), the integrated memory controller (iMC), etc.
2183 */
2184 if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL)
2185 return;
2186
2187 mce_disable_error_reporting();
973a2dd1
AK
2188}
2189
c7cece89 2190static int mce_syscore_suspend(void)
973a2dd1 2191{
6e06780a
AR
2192 vendor_disable_error_reporting();
2193 return 0;
973a2dd1
AK
2194}
2195
c7cece89 2196static void mce_syscore_shutdown(void)
973a2dd1 2197{
6e06780a 2198 vendor_disable_error_reporting();
973a2dd1
AK
2199}
2200
e9eee03e
IM
2201/*
2202 * On resume clear all MCE state. Don't want to see leftovers from the BIOS.
2203 * Only one CPU is active at this time, the others get re-added later using
2204 * CPU hotplug:
2205 */
c7cece89 2206static void mce_syscore_resume(void)
1da177e4 2207{
5e09954a 2208 __mcheck_cpu_init_generic();
89cbc767 2209 __mcheck_cpu_init_vendor(raw_cpu_ptr(&cpu_info));
bb91f8c0 2210 __mcheck_cpu_init_clear_banks();
1da177e4
LT
2211}
2212
f3c6ea1b 2213static struct syscore_ops mce_syscore_ops = {
c7cece89
HS
2214 .suspend = mce_syscore_suspend,
2215 .shutdown = mce_syscore_shutdown,
2216 .resume = mce_syscore_resume,
f3c6ea1b
RW
2217};
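/*
 * In short (added note): suspend and shutdown quiesce the non-Intel banks via
 * vendor_disable_error_reporting(), and resume re-runs generic, vendor and
 * bank-clearing init on the boot CPU; the other CPUs are brought back through
 * the CPU hotplug callbacks further down.
 */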
2218
c7cece89 2219/*
8a25a2fd 2220 * mce_device: Sysfs support
c7cece89
HS
2221 */
2222
52d168e2
AK
2223static void mce_cpu_restart(void *data)
2224{
89cbc767 2225 if (!mce_available(raw_cpu_ptr(&cpu_info)))
33edbf02 2226 return;
5e09954a 2227 __mcheck_cpu_init_generic();
bb91f8c0 2228 __mcheck_cpu_init_clear_banks();
5e09954a 2229 __mcheck_cpu_init_timer();
52d168e2
AK
2230}
2231
1da177e4 2232/* Reinit MCEs after user configuration changes */
d88203d1
TG
2233static void mce_restart(void)
2234{
9aaef96f 2235 mce_timer_delete_all();
52d168e2 2236 on_each_cpu(mce_cpu_restart, NULL, 1);
1da177e4
LT
2237}
2238
9af43b54 2239/* Toggle features for corrected errors */
9aaef96f 2240static void mce_disable_cmci(void *data)
9af43b54 2241{
89cbc767 2242 if (!mce_available(raw_cpu_ptr(&cpu_info)))
9af43b54 2243 return;
9af43b54
HS
2244 cmci_clear();
2245}
2246
2247static void mce_enable_ce(void *all)
2248{
89cbc767 2249 if (!mce_available(raw_cpu_ptr(&cpu_info)))
9af43b54
HS
2250 return;
2251 cmci_reenable();
2252 cmci_recheck();
2253 if (all)
5e09954a 2254 __mcheck_cpu_init_timer();
9af43b54
HS
2255}
2256
8a25a2fd 2257static struct bus_type mce_subsys = {
e9eee03e 2258 .name = "machinecheck",
8a25a2fd 2259 .dev_name = "machinecheck",
1da177e4
LT
2260};
2261
d6126ef5 2262DEFINE_PER_CPU(struct device *, mce_device);
e9eee03e 2263
8a25a2fd 2264static inline struct mce_bank *attr_to_bank(struct device_attribute *attr)
cebe1820
AK
2265{
2266 return container_of(attr, struct mce_bank, attr);
2267}
0d7482e3 2268
8a25a2fd 2269static ssize_t show_bank(struct device *s, struct device_attribute *attr,
0d7482e3
AK
2270 char *buf)
2271{
cebe1820 2272 return sprintf(buf, "%llx\n", attr_to_bank(attr)->ctl);
0d7482e3
AK
2273}
2274
8a25a2fd 2275static ssize_t set_bank(struct device *s, struct device_attribute *attr,
9319cec8 2276 const char *buf, size_t size)
0d7482e3 2277{
9319cec8 2278 u64 new;
e9eee03e 2279
164109e3 2280 if (kstrtou64(buf, 0, &new) < 0)
0d7482e3 2281 return -EINVAL;
e9eee03e 2282
cebe1820 2283 attr_to_bank(attr)->ctl = new;
0d7482e3 2284 mce_restart();
e9eee03e 2285
9319cec8 2286 return size;
0d7482e3 2287}
a98f0dd3 2288
e9eee03e 2289static ssize_t
8a25a2fd 2290show_trigger(struct device *s, struct device_attribute *attr, char *buf)
a98f0dd3 2291{
1020bcbc 2292 strcpy(buf, mce_helper);
a98f0dd3 2293 strcat(buf, "\n");
1020bcbc 2294 return strlen(mce_helper) + 1;
a98f0dd3
AK
2295}
2296
8a25a2fd 2297static ssize_t set_trigger(struct device *s, struct device_attribute *attr,
e9eee03e 2298 const char *buf, size_t siz)
a98f0dd3
AK
2299{
2300 char *p;
e9eee03e 2301
1020bcbc
HS
2302 strncpy(mce_helper, buf, sizeof(mce_helper));
2303 mce_helper[sizeof(mce_helper)-1] = 0;
1020bcbc 2304 p = strchr(mce_helper, '\n');
e9eee03e 2305
e9084ec9 2306 if (p)
e9eee03e
IM
2307 *p = 0;
2308
e9084ec9 2309 return strlen(mce_helper) + !!p;
a98f0dd3
AK
2310}
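/*
 * Illustrative note (example path only): writing a program path such as
 * "/usr/local/bin/mce-handler" to
 * /sys/devices/system/machinecheck/machinecheck0/trigger stores it in
 * mce_helper above; the notification code earlier in this file then runs
 * that helper when new machine check events are logged.
 */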
2311
8a25a2fd
KS
2312static ssize_t set_ignore_ce(struct device *s,
2313 struct device_attribute *attr,
9af43b54
HS
2314 const char *buf, size_t size)
2315{
2316 u64 new;
2317
164109e3 2318 if (kstrtou64(buf, 0, &new) < 0)
9af43b54
HS
2319 return -EINVAL;
2320
7af19e4a 2321 if (mca_cfg.ignore_ce ^ !!new) {
9af43b54
HS
2322 if (new) {
2323 /* disable ce features */
9aaef96f
HS
2324 mce_timer_delete_all();
2325 on_each_cpu(mce_disable_cmci, NULL, 1);
7af19e4a 2326 mca_cfg.ignore_ce = true;
9af43b54
HS
2327 } else {
2328 /* enable ce features */
7af19e4a 2329 mca_cfg.ignore_ce = false;
9af43b54
HS
2330 on_each_cpu(mce_enable_ce, (void *)1, 1);
2331 }
2332 }
2333 return size;
2334}
2335
8a25a2fd
KS
2336static ssize_t set_cmci_disabled(struct device *s,
2337 struct device_attribute *attr,
9af43b54
HS
2338 const char *buf, size_t size)
2339{
2340 u64 new;
2341
164109e3 2342 if (kstrtou64(buf, 0, &new) < 0)
9af43b54
HS
2343 return -EINVAL;
2344
7af19e4a 2345 if (mca_cfg.cmci_disabled ^ !!new) {
9af43b54
HS
2346 if (new) {
2347 /* disable cmci */
9aaef96f 2348 on_each_cpu(mce_disable_cmci, NULL, 1);
7af19e4a 2349 mca_cfg.cmci_disabled = true;
9af43b54
HS
2350 } else {
2351 /* enable cmci */
7af19e4a 2352 mca_cfg.cmci_disabled = false;
9af43b54
HS
2353 on_each_cpu(mce_enable_ce, NULL, 1);
2354 }
2355 }
2356 return size;
2357}
2358
8a25a2fd
KS
2359static ssize_t store_int_with_restart(struct device *s,
2360 struct device_attribute *attr,
b56f642d
AK
2361 const char *buf, size_t size)
2362{
8a25a2fd 2363 ssize_t ret = device_store_int(s, attr, buf, size);
b56f642d
AK
2364 mce_restart();
2365 return ret;
2366}
2367
8a25a2fd 2368static DEVICE_ATTR(trigger, 0644, show_trigger, set_trigger);
d203f0b8 2369static DEVICE_INT_ATTR(tolerant, 0644, mca_cfg.tolerant);
84c2559d 2370static DEVICE_INT_ATTR(monarch_timeout, 0644, mca_cfg.monarch_timeout);
d203f0b8 2371static DEVICE_BOOL_ATTR(dont_log_ce, 0644, mca_cfg.dont_log_ce);
e9eee03e 2372
8a25a2fd
KS
2373static struct dev_ext_attribute dev_attr_check_interval = {
2374 __ATTR(check_interval, 0644, device_show_int, store_int_with_restart),
b56f642d
AK
2375 &check_interval
2376};
e9eee03e 2377
8a25a2fd 2378static struct dev_ext_attribute dev_attr_ignore_ce = {
7af19e4a
BP
2379 __ATTR(ignore_ce, 0644, device_show_bool, set_ignore_ce),
2380 &mca_cfg.ignore_ce
9af43b54
HS
2381};
2382
8a25a2fd 2383static struct dev_ext_attribute dev_attr_cmci_disabled = {
7af19e4a
BP
2384 __ATTR(cmci_disabled, 0644, device_show_bool, set_cmci_disabled),
2385 &mca_cfg.cmci_disabled
9af43b54
HS
2386};
2387
8a25a2fd
KS
2388static struct device_attribute *mce_device_attrs[] = {
2389 &dev_attr_tolerant.attr,
2390 &dev_attr_check_interval.attr,
2391 &dev_attr_trigger,
2392 &dev_attr_monarch_timeout.attr,
2393 &dev_attr_dont_log_ce.attr,
2394 &dev_attr_ignore_ce.attr,
2395 &dev_attr_cmci_disabled.attr,
a98f0dd3
AK
2396 NULL
2397};
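/*
 * For reference (illustrative, assuming the usual sysfs layout): the
 * attributes above appear per CPU as
 *
 *	/sys/devices/system/machinecheck/machinecheckN/{tolerant,
 *		check_interval, trigger, monarch_timeout, dont_log_ce,
 *		ignore_ce, cmci_disabled}
 *
 * plus one bank%d file per MCA bank, created in mce_device_create() below.
 * Writes to check_interval and to the bank files go through
 * store_int_with_restart()/set_bank() and therefore trigger mce_restart().
 */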
1da177e4 2398
8a25a2fd 2399static cpumask_var_t mce_device_initialized;
bae19fe0 2400
e032d807
GKH
2401static void mce_device_release(struct device *dev)
2402{
2403 kfree(dev);
2404}
2405
8a25a2fd 2406/* Per cpu device init. All of the cpus still share the same ctrl bank: */
148f9bb8 2407static int mce_device_create(unsigned int cpu)
1da177e4 2408{
e032d807 2409 struct device *dev;
1da177e4 2410 int err;
b1f49f95 2411 int i, j;
92cb7612 2412
90367556 2413 if (!mce_available(&boot_cpu_data))
91c6d400
AK
2414 return -EIO;
2415
7f34b935
SAS
2416 dev = per_cpu(mce_device, cpu);
2417 if (dev)
2418 return 0;
2419
e032d807
GKH
2420 dev = kzalloc(sizeof *dev, GFP_KERNEL);
2421 if (!dev)
2422 return -ENOMEM;
8a25a2fd
KS
2423 dev->id = cpu;
2424 dev->bus = &mce_subsys;
e032d807 2425 dev->release = &mce_device_release;
91c6d400 2426
8a25a2fd 2427 err = device_register(dev);
853d9b18
LK
2428 if (err) {
2429 put_device(dev);
d435d862 2430 return err;
853d9b18 2431 }
d435d862 2432
8a25a2fd
KS
2433 for (i = 0; mce_device_attrs[i]; i++) {
2434 err = device_create_file(dev, mce_device_attrs[i]);
d435d862
AM
2435 if (err)
2436 goto error;
2437 }
d203f0b8 2438 for (j = 0; j < mca_cfg.banks; j++) {
8a25a2fd 2439 err = device_create_file(dev, &mce_banks[j].attr);
0d7482e3
AK
2440 if (err)
2441 goto error2;
2442 }
8a25a2fd 2443 cpumask_set_cpu(cpu, mce_device_initialized);
d6126ef5 2444 per_cpu(mce_device, cpu) = dev;
91c6d400 2445
d435d862 2446 return 0;
0d7482e3 2447error2:
b1f49f95 2448 while (--j >= 0)
8a25a2fd 2449 device_remove_file(dev, &mce_banks[j].attr);
d435d862 2450error:
cb491fca 2451 while (--i >= 0)
8a25a2fd 2452 device_remove_file(dev, mce_device_attrs[i]);
cb491fca 2453
8a25a2fd 2454 device_unregister(dev);
d435d862 2455
91c6d400
AK
2456 return err;
2457}
2458
148f9bb8 2459static void mce_device_remove(unsigned int cpu)
91c6d400 2460{
d6126ef5 2461 struct device *dev = per_cpu(mce_device, cpu);
73ca5358
SL
2462 int i;
2463
8a25a2fd 2464 if (!cpumask_test_cpu(cpu, mce_device_initialized))
bae19fe0
AH
2465 return;
2466
8a25a2fd
KS
2467 for (i = 0; mce_device_attrs[i]; i++)
2468 device_remove_file(dev, mce_device_attrs[i]);
cb491fca 2469
d203f0b8 2470 for (i = 0; i < mca_cfg.banks; i++)
8a25a2fd 2471 device_remove_file(dev, &mce_banks[i].attr);
cb491fca 2472
8a25a2fd
KS
2473 device_unregister(dev);
2474 cpumask_clear_cpu(cpu, mce_device_initialized);
d6126ef5 2475 per_cpu(mce_device, cpu) = NULL;
91c6d400 2476}
91c6d400 2477
d6b75584 2478/* Make sure there are no machine checks on offlined CPUs. */
39f152ff 2479static void mce_disable_cpu(void)
d6b75584 2480{
89cbc767 2481 if (!mce_available(raw_cpu_ptr(&cpu_info)))
d6b75584 2482 return;
767df1bd 2483
39f152ff 2484 if (!cpuhp_tasks_frozen)
88ccbedd 2485 cmci_clear();
11868a2d 2486
6e06780a 2487 vendor_disable_error_reporting();
d6b75584
AK
2488}
2489
39f152ff 2490static void mce_reenable_cpu(void)
d6b75584 2491{
e9eee03e 2492 int i;
d6b75584 2493
89cbc767 2494 if (!mce_available(raw_cpu_ptr(&cpu_info)))
d6b75584 2495 return;
e9eee03e 2496
39f152ff 2497 if (!cpuhp_tasks_frozen)
88ccbedd 2498 cmci_reenable();
d203f0b8 2499 for (i = 0; i < mca_cfg.banks; i++) {
cebe1820 2500 struct mce_bank *b = &mce_banks[i];
11868a2d 2501
cebe1820 2502 if (b->init)
d9d73fcc 2503 wrmsrl(msr_ops.ctl(i), b->ctl);
06b7a7a5 2504 }
d6b75584
AK
2505}
2506
0e285d36 2507static int mce_cpu_dead(unsigned int cpu)
91c6d400 2508{
0e285d36 2509 mce_intel_hcpu_update(cpu);
91c6d400 2510
0e285d36
SAS
2511 /* intentionally ignoring frozen here */
2512 if (!cpuhp_tasks_frozen)
2513 cmci_rediscover();
2514 return 0;
91c6d400
AK
2515}
2516
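/*
 * Added note: mce_cpu_online() and mce_cpu_pre_down() below are registered as
 * a dynamic CPUHP_AP_ONLINE_DYN pair in mcheck_init_device(), so a CPU coming
 * online gets its sysfs device, threshold device, banks and polling timer set
 * up, and a CPU going down stops its timer and tears the same things back
 * down.
 */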
8c0eeac8 2517static int mce_cpu_online(unsigned int cpu)
91c6d400 2518{
0becc0ae 2519 struct timer_list *t = this_cpu_ptr(&mce_timer);
8c0eeac8 2520 int ret;
91c6d400 2521
8c0eeac8 2522 mce_device_create(cpu);
38356c1f 2523
8c0eeac8
SAS
2524 ret = mce_threshold_create_device(cpu);
2525 if (ret) {
2526 mce_device_remove(cpu);
2527 return ret;
1a65f970 2528 }
8c0eeac8 2529 mce_reenable_cpu();
0becc0ae 2530 mce_start_timer(t);
8c0eeac8 2531 return 0;
91c6d400
AK
2532}
2533
8c0eeac8
SAS
2534static int mce_cpu_pre_down(unsigned int cpu)
2535{
0becc0ae 2536 struct timer_list *t = this_cpu_ptr(&mce_timer);
8c0eeac8
SAS
2537
2538 mce_disable_cpu();
2539 del_timer_sync(t);
2540 mce_threshold_remove_device(cpu);
2541 mce_device_remove(cpu);
2542 return 0;
2543}
91c6d400 2544
cebe1820 2545static __init void mce_init_banks(void)
0d7482e3
AK
2546{
2547 int i;
2548
d203f0b8 2549 for (i = 0; i < mca_cfg.banks; i++) {
cebe1820 2550 struct mce_bank *b = &mce_banks[i];
8a25a2fd 2551 struct device_attribute *a = &b->attr;
e9eee03e 2552
a07e4156 2553 sysfs_attr_init(&a->attr);
cebe1820
AK
2554 a->attr.name = b->attrname;
2555 snprintf(b->attrname, ATTR_LEN, "bank%d", i);
e9eee03e
IM
2556
2557 a->attr.mode = 0644;
2558 a->show = show_bank;
2559 a->store = set_bank;
0d7482e3 2560 }
0d7482e3
AK
2561}
2562
5e09954a 2563static __init int mcheck_init_device(void)
91c6d400 2564{
8c0eeac8 2565 enum cpuhp_state hp_online;
91c6d400 2566 int err;
91c6d400 2567
9c15a24b
MS
2568 if (!mce_available(&boot_cpu_data)) {
2569 err = -EIO;
2570 goto err_out;
2571 }
0d7482e3 2572
9c15a24b
MS
2573 if (!zalloc_cpumask_var(&mce_device_initialized, GFP_KERNEL)) {
2574 err = -ENOMEM;
2575 goto err_out;
2576 }
996867d0 2577
cebe1820 2578 mce_init_banks();
0d7482e3 2579
8a25a2fd 2580 err = subsys_system_register(&mce_subsys, NULL);
d435d862 2581 if (err)
9c15a24b 2582 goto err_out_mem;
91c6d400 2583
0e285d36
SAS
2584 err = cpuhp_setup_state(CPUHP_X86_MCE_DEAD, "x86/mce:dead", NULL,
2585 mce_cpu_dead);
2586 if (err)
2587 goto err_out_mem;
91c6d400 2588
8c0eeac8
SAS
2589 err = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "x86/mce:online",
2590 mce_cpu_online, mce_cpu_pre_down);
2591 if (err < 0)
0e285d36 2592 goto err_out_online;
8c0eeac8 2593 hp_online = err;
93b62c3c 2594
9c15a24b
MS
2595 register_syscore_ops(&mce_syscore_ops);
2596
93b62c3c 2597 /* register character device /dev/mcelog */
9c15a24b
MS
2598 err = misc_register(&mce_chrdev_device);
2599 if (err)
2600 goto err_register;
2601
2602 return 0;
2603
2604err_register:
2605 unregister_syscore_ops(&mce_syscore_ops);
8c0eeac8 2606 cpuhp_remove_state(hp_online);
9c15a24b 2607
0e285d36
SAS
2608err_out_online:
2609 cpuhp_remove_state(CPUHP_X86_MCE_DEAD);
9c15a24b
MS
2610
2611err_out_mem:
2612 free_cpumask_var(mce_device_initialized);
2613
2614err_out:
2615 pr_err("Unable to init device /dev/mcelog (rc: %d)\n", err);
e9eee03e 2616
1da177e4 2617 return err;
1da177e4 2618}
cef12ee5 2619device_initcall_sync(mcheck_init_device);
a988d334 2620
d7c3c9a6
AK
2621/*
2622 * Old style boot options parsing. Only for compatibility.
2623 */
2624static int __init mcheck_disable(char *str)
2625{
1462594b 2626 mca_cfg.disabled = true;
d7c3c9a6
AK
2627 return 1;
2628}
2629__setup("nomce", mcheck_disable);
a988d334 2630
5be9ed25
HY
2631#ifdef CONFIG_DEBUG_FS
2632struct dentry *mce_get_debugfs_dir(void)
a988d334 2633{
5be9ed25 2634 static struct dentry *dmce;
a988d334 2635
5be9ed25
HY
2636 if (!dmce)
2637 dmce = debugfs_create_dir("mce", NULL);
a988d334 2638
5be9ed25
HY
2639 return dmce;
2640}
a988d334 2641
bf783f9f
HY
2642static void mce_reset(void)
2643{
2644 cpu_missing = 0;
c7c9b392 2645 atomic_set(&mce_fake_panicked, 0);
bf783f9f
HY
2646 atomic_set(&mce_executing, 0);
2647 atomic_set(&mce_callin, 0);
2648 atomic_set(&global_nwo, 0);
2649}
a988d334 2650
bf783f9f
HY
2651static int fake_panic_get(void *data, u64 *val)
2652{
2653 *val = fake_panic;
2654 return 0;
a988d334
IM
2655}
2656
bf783f9f 2657static int fake_panic_set(void *data, u64 val)
a988d334 2658{
bf783f9f
HY
2659 mce_reset();
2660 fake_panic = val;
2661 return 0;
a988d334 2662}
a988d334 2663
bf783f9f
HY
2664DEFINE_SIMPLE_ATTRIBUTE(fake_panic_fops, fake_panic_get,
2665 fake_panic_set, "%llu\n");
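/*
 * Illustrative note (assuming debugfs is mounted at /sys/kernel/debug): the
 * file created below shows up as /sys/kernel/debug/mce/fake_panic; setting it
 * to a non-zero value makes the panic path only fake its panics for testing,
 * and each write also resets the rendezvous counters via mce_reset().
 */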
d7c3c9a6 2666
5e09954a 2667static int __init mcheck_debugfs_init(void)
d7c3c9a6 2668{
bf783f9f
HY
2669 struct dentry *dmce, *ffake_panic;
2670
2671 dmce = mce_get_debugfs_dir();
2672 if (!dmce)
2673 return -ENOMEM;
2674 ffake_panic = debugfs_create_file("fake_panic", 0444, dmce, NULL,
2675 &fake_panic_fops);
2676 if (!ffake_panic)
2677 return -ENOMEM;
2678
2679 return 0;
d7c3c9a6 2680}
fd4cf79f
CG
2681#else
2682static int __init mcheck_debugfs_init(void) { return -EINVAL; }
5be9ed25 2683#endif
fd4cf79f 2684
3637efb0
TL
2685DEFINE_STATIC_KEY_FALSE(mcsafe_key);
2686EXPORT_SYMBOL_GPL(mcsafe_key);
2687
fd4cf79f
CG
2688static int __init mcheck_late_init(void)
2689{
3637efb0
TL
2690 if (mca_cfg.recovery)
2691 static_branch_inc(&mcsafe_key);
2692
fd4cf79f
CG
2693 mcheck_debugfs_init();
2694
2695 /*
2696 * Flush out everything that has been logged during early boot, now that
2697 * everything has been initialized (workqueues, decoders, ...).
2698 */
2699 mce_schedule_work();
2700
2701 return 0;
2702}
2703late_initcall(mcheck_late_init);