x86/mce: Get rid of the ->quirk_no_way_out() indirect call
[linux-block.git] / arch / x86 / kernel / cpu / mce / core.c
CommitLineData
457c8996 1// SPDX-License-Identifier: GPL-2.0-only
1da177e4
LT
2/*
3 * Machine check handler.
e9eee03e 4 *
1da177e4 5 * K8 parts Copyright 2002,2003 Andi Kleen, SuSE Labs.
d88203d1
TG
6 * Rest from unknown author(s).
7 * 2004 Andi Kleen. Rewrote most of it.
b79109c3
AK
8 * Copyright 2008 Intel Corporation
9 * Author: Andi Kleen
1da177e4 10 */
c767a54b 11
e9eee03e
IM
12#include <linux/thread_info.h>
13#include <linux/capability.h>
14#include <linux/miscdevice.h>
15#include <linux/ratelimit.h>
e9eee03e 16#include <linux/rcupdate.h>
e9eee03e 17#include <linux/kobject.h>
14a02530 18#include <linux/uaccess.h>
e9eee03e
IM
19#include <linux/kdebug.h>
20#include <linux/kernel.h>
21#include <linux/percpu.h>
1da177e4 22#include <linux/string.h>
8a25a2fd 23#include <linux/device.h>
f3c6ea1b 24#include <linux/syscore_ops.h>
3c079792 25#include <linux/delay.h>
8c566ef5 26#include <linux/ctype.h>
e9eee03e 27#include <linux/sched.h>
0d7482e3 28#include <linux/sysfs.h>
e9eee03e 29#include <linux/types.h>
5a0e3ad6 30#include <linux/slab.h>
e9eee03e
IM
31#include <linux/init.h>
32#include <linux/kmod.h>
33#include <linux/poll.h>
3c079792 34#include <linux/nmi.h>
e9eee03e 35#include <linux/cpu.h>
011d8261 36#include <linux/ras.h>
14a02530 37#include <linux/smp.h>
e9eee03e 38#include <linux/fs.h>
9b1beaf2 39#include <linux/mm.h>
5be9ed25 40#include <linux/debugfs.h>
b77e70bf 41#include <linux/irq_work.h>
69c60c88 42#include <linux/export.h>
284ce401 43#include <linux/set_memory.h>
9998a983 44#include <linux/sync_core.h>
5567d11c 45#include <linux/task_work.h>
0d00449c 46#include <linux/hardirq.h>
e9eee03e 47
3f5a7896 48#include <asm/intel-family.h>
d88203d1 49#include <asm/processor.h>
95927475 50#include <asm/traps.h>
375074cc 51#include <asm/tlbflush.h>
e9eee03e
IM
52#include <asm/mce.h>
53#include <asm/msr.h>
5bc32950 54#include <asm/reboot.h>
1da177e4 55
21afaf18 56#include "internal.h"
711c2e48 57
b3b7c479
SH
58/* sysfs synchronization */
59static DEFINE_MUTEX(mce_sysfs_mutex);
60
8968f9d3
HS
61#define CREATE_TRACE_POINTS
62#include <trace/events/mce.h>
63
3f2f0680 64#define SPINUNIT 100 /* 100ns */
3c079792 65
01ca79f1
AK
66DEFINE_PER_CPU(unsigned, mce_exception_count);
67
c7d314f3
YG
68DEFINE_PER_CPU_READ_MOSTLY(unsigned int, mce_num_banks);
69
95fdce6b
YG
70struct mce_bank {
71 u64 ctl; /* subevents to enable */
72 bool init; /* initialise bank? */
b4914508
YG
73};
74static DEFINE_PER_CPU_READ_MOSTLY(struct mce_bank[MAX_NR_BANKS], mce_banks_array);
75
76#define ATTR_LEN 16
77/* One object for each MCE bank, shared by all CPUs */
78struct mce_bank_dev {
95fdce6b
YG
79 struct device_attribute attr; /* device attribute */
80 char attrname[ATTR_LEN]; /* attribute name */
b4914508 81 u8 bank; /* bank number */
95fdce6b 82};
b4914508 83static struct mce_bank_dev mce_bank_devs[MAX_NR_BANKS];
95fdce6b 84
bf80bbd7 85struct mce_vendor_flags mce_flags __read_mostly;
cebe1820 86
d203f0b8 87struct mca_config mca_cfg __read_mostly = {
84c2559d 88 .bootlog = -1,
d203f0b8
BP
89 /*
90 * Tolerant levels:
91 * 0: always panic on uncorrected errors, log corrected errors
92 * 1: panic or SIGBUS on uncorrected errors, log corrected errors
93 * 2: SIGBUS or log uncorrected errors (if possible), log corr. errors
94 * 3: never panic or SIGBUS, log all errors (for testing only)
95 */
84c2559d
BP
96 .tolerant = 1,
97 .monarch_timeout = -1
d203f0b8
BP
98};
99
3c079792 100static DEFINE_PER_CPU(struct mce, mces_seen);
5de97c9f
TL
101static unsigned long mce_need_notify;
102static int cpu_missing;
3c079792 103
0644414e
NR
104/*
105 * MCA banks polled by the period polling timer for corrected events.
106 * With Intel CMCI, this only has MCA banks which do not support CMCI (if any).
107 */
ee031c31
AK
108DEFINE_PER_CPU(mce_banks_t, mce_poll_banks) = {
109 [0 ... BITS_TO_LONGS(MAX_NR_BANKS)-1] = ~0UL
110};
111
c3d1fb56
NR
112/*
113 * MCA banks controlled through firmware first for corrected errors.
114 * This is a global list of banks for which we won't enable CMCI and we
115 * won't poll. Firmware controls these banks and is responsible for
116 * reporting corrected errors through GHES. Uncorrected/recoverable
117 * errors are still notified through a machine check.
118 */
119mce_banks_t mce_banks_ce_disabled;
120
061120ae
CG
121static struct work_struct mce_work;
122static struct irq_work mce_irq_work;
9b1beaf2 123
3653ada5
BP
124/*
125 * CPU/chipset specific EDAC code can register a notifier call here to print
126 * MCE errors in a human-readable form.
127 */
0dc9c639 128BLOCKING_NOTIFIER_HEAD(x86_mce_decoder_chain);
3653ada5 129
b5f2fa4e 130/* Do initial initialization of a struct mce */
865d3a9a 131noinstr void mce_setup(struct mce *m)
b5f2fa4e
AK
132{
133 memset(m, 0, sizeof(struct mce));
d620c67f 134 m->cpu = m->extcpu = smp_processor_id();
bc39f010
AB
135 /* need the internal __ version to avoid deadlocks */
136 m->time = __ktime_get_real_seconds();
8ee08347
AK
137 m->cpuvendor = boot_cpu_data.x86_vendor;
138 m->cpuid = cpuid_eax(1);
8ee08347 139 m->socketid = cpu_data(m->extcpu).phys_proc_id;
8ee08347 140 m->apicid = cpu_data(m->extcpu).initial_apicid;
865d3a9a 141 m->mcgcap = __rdmsr(MSR_IA32_MCG_CAP);
3f5a7896
TL
142
143 if (this_cpu_has(X86_FEATURE_INTEL_PPIN))
865d3a9a 144 m->ppin = __rdmsr(MSR_PPIN);
077168e2 145 else if (this_cpu_has(X86_FEATURE_AMD_PPIN))
865d3a9a 146 m->ppin = __rdmsr(MSR_AMD_PPIN);
fa94d0c6
TL
147
148 m->microcode = boot_cpu_data.microcode;
b5f2fa4e
AK
149}
150
ea149b36
AK
151DEFINE_PER_CPU(struct mce, injectm);
152EXPORT_PER_CPU_SYMBOL_GPL(injectm);
153
fe3ed20f 154void mce_log(struct mce *m)
1da177e4 155{
fe3ed20f 156 if (!mce_gen_pool_add(m))
f29a7aff 157 irq_work_queue(&mce_irq_work);
1da177e4 158}
81736abd 159EXPORT_SYMBOL_GPL(mce_log);
09371957 160
3653ada5
BP
161void mce_register_decode_chain(struct notifier_block *nb)
162{
15af3659
ZL
163 if (WARN_ON(nb->priority < MCE_PRIO_LOWEST ||
164 nb->priority > MCE_PRIO_HIGHEST))
32b40a82 165 return;
cd9c57ca 166
0dc9c639 167 blocking_notifier_chain_register(&x86_mce_decoder_chain, nb);
3653ada5
BP
168}
169EXPORT_SYMBOL_GPL(mce_register_decode_chain);
170
171void mce_unregister_decode_chain(struct notifier_block *nb)
172{
0dc9c639 173 blocking_notifier_chain_unregister(&x86_mce_decoder_chain, nb);
3653ada5
BP
174}
175EXPORT_SYMBOL_GPL(mce_unregister_decode_chain);
176
8121b8f9 177u32 mca_msr_reg(int bank, enum mca_msr reg)
a9750a31 178{
8121b8f9
BP
179 if (mce_flags.smca) {
180 switch (reg) {
181 case MCA_CTL: return MSR_AMD64_SMCA_MCx_CTL(bank);
182 case MCA_ADDR: return MSR_AMD64_SMCA_MCx_ADDR(bank);
183 case MCA_MISC: return MSR_AMD64_SMCA_MCx_MISC(bank);
184 case MCA_STATUS: return MSR_AMD64_SMCA_MCx_STATUS(bank);
185 }
186 }
a9750a31 187
8121b8f9
BP
188 switch (reg) {
189 case MCA_CTL: return MSR_IA32_MCx_CTL(bank);
190 case MCA_ADDR: return MSR_IA32_MCx_ADDR(bank);
191 case MCA_MISC: return MSR_IA32_MCx_MISC(bank);
192 case MCA_STATUS: return MSR_IA32_MCx_STATUS(bank);
193 }
a9750a31 194
8121b8f9 195 return 0;
a9750a31
YG
196}
197
cd9c57ca 198static void __print_mce(struct mce *m)
1da177e4 199{
cd9c57ca
BP
200 pr_emerg(HW_ERR "CPU %d: Machine Check%s: %Lx Bank %d: %016Lx\n",
201 m->extcpu,
202 (m->mcgstatus & MCG_STATUS_MCIP ? " Exception" : ""),
203 m->mcgstatus, m->bank, m->status);
f436f8bb 204
65ea5b03 205 if (m->ip) {
a2d7b0d4 206 pr_emerg(HW_ERR "RIP%s %02x:<%016Lx> ",
f436f8bb 207 !(m->mcgstatus & MCG_STATUS_EIPV) ? " !INEXACT!" : "",
cd9c57ca 208 m->cs, m->ip);
f436f8bb 209
1da177e4 210 if (m->cs == __KERNEL_CS)
c80c5ec1 211 pr_cont("{%pS}", (void *)(unsigned long)m->ip);
f436f8bb 212 pr_cont("\n");
1da177e4 213 }
f436f8bb 214
a2d7b0d4 215 pr_emerg(HW_ERR "TSC %llx ", m->tsc);
1da177e4 216 if (m->addr)
f436f8bb 217 pr_cont("ADDR %llx ", m->addr);
1da177e4 218 if (m->misc)
f436f8bb 219 pr_cont("MISC %llx ", m->misc);
bb2de0ad
SK
220 if (m->ppin)
221 pr_cont("PPIN %llx ", m->ppin);
549d042d 222
4b711f92
YG
223 if (mce_flags.smca) {
224 if (m->synd)
225 pr_cont("SYND %llx ", m->synd);
226 if (m->ipid)
227 pr_cont("IPID %llx ", m->ipid);
228 }
229
f436f8bb 230 pr_cont("\n");
925946cf 231
506ed6b5
AK
232 /*
233 * Note this output is parsed by external tools and old fields
234 * should not be changed.
235 */
881e23e5 236 pr_emerg(HW_ERR "PROCESSOR %u:%x TIME %llu SOCKET %u APIC %x microcode %x\n",
506ed6b5 237 m->cpuvendor, m->cpuid, m->time, m->socketid, m->apicid,
fa94d0c6 238 m->microcode);
cd9c57ca
BP
239}
240
241static void print_mce(struct mce *m)
242{
cd9c57ca 243 __print_mce(m);
b2fbf6f2 244
ac78bd72 245 if (m->cpuvendor != X86_VENDOR_AMD && m->cpuvendor != X86_VENDOR_HYGON)
b2fbf6f2 246 pr_emerg_ratelimited(HW_ERR "Run the above through 'mcelog --ascii'\n");
86503560
AK
247}
248
f94b61c2
AK
249#define PANIC_TIMEOUT 5 /* 5 seconds */
250
c7c9b392 251static atomic_t mce_panicked;
f94b61c2 252
bf783f9f 253static int fake_panic;
c7c9b392 254static atomic_t mce_fake_panicked;
bf783f9f 255
f94b61c2
AK
256/* Panic in progress. Enable interrupts and wait for final IPI */
257static void wait_for_panic(void)
258{
259 long timeout = PANIC_TIMEOUT*USEC_PER_SEC;
f436f8bb 260
f94b61c2
AK
261 preempt_disable();
262 local_irq_enable();
263 while (timeout-- > 0)
264 udelay(1);
29b0f591 265 if (panic_timeout == 0)
7af19e4a 266 panic_timeout = mca_cfg.panic_timeout;
f94b61c2
AK
267 panic("Panicing machine check CPU died");
268}
269
6c80f87e 270static void mce_panic(const char *msg, struct mce *final, char *exp)
d88203d1 271{
5541c93c
TL
272 int apei_err = 0;
273 struct llist_node *pending;
274 struct mce_evt_llist *l;
e02e68d3 275
bf783f9f
HY
276 if (!fake_panic) {
277 /*
278 * Make sure only one CPU runs in machine check panic
279 */
c7c9b392 280 if (atomic_inc_return(&mce_panicked) > 1)
bf783f9f
HY
281 wait_for_panic();
282 barrier();
f94b61c2 283
bf783f9f
HY
284 bust_spinlocks(1);
285 console_verbose();
286 } else {
287 /* Don't log too much for fake panic */
c7c9b392 288 if (atomic_inc_return(&mce_fake_panicked) > 1)
bf783f9f
HY
289 return;
290 }
5541c93c 291 pending = mce_gen_pool_prepare_records();
a0189c70 292 /* First print corrected ones that are still unlogged */
5541c93c
TL
293 llist_for_each_entry(l, pending, llnode) {
294 struct mce *m = &l->mce;
482908b4 295 if (!(m->status & MCI_STATUS_UC)) {
77e26cca 296 print_mce(m);
482908b4
HY
297 if (!apei_err)
298 apei_err = apei_write_mce(m);
299 }
a0189c70
AK
300 }
301 /* Now print uncorrected but with the final one last */
5541c93c
TL
302 llist_for_each_entry(l, pending, llnode) {
303 struct mce *m = &l->mce;
77e26cca
HS
304 if (!(m->status & MCI_STATUS_UC))
305 continue;
5541c93c 306 if (!final || mce_cmp(m, final)) {
77e26cca 307 print_mce(m);
482908b4
HY
308 if (!apei_err)
309 apei_err = apei_write_mce(m);
310 }
1da177e4 311 }
482908b4 312 if (final) {
77e26cca 313 print_mce(final);
482908b4
HY
314 if (!apei_err)
315 apei_err = apei_write_mce(final);
316 }
3c079792 317 if (cpu_missing)
a2d7b0d4 318 pr_emerg(HW_ERR "Some CPUs didn't answer in synchronization\n");
bd19a5e6 319 if (exp)
a2d7b0d4 320 pr_emerg(HW_ERR "Machine check: %s\n", exp);
bf783f9f
HY
321 if (!fake_panic) {
322 if (panic_timeout == 0)
7af19e4a 323 panic_timeout = mca_cfg.panic_timeout;
bf783f9f
HY
324 panic(msg);
325 } else
a2d7b0d4 326 pr_emerg(HW_ERR "Fake kernel panic: %s\n", msg);
d88203d1 327}
1da177e4 328
ea149b36
AK
329/* Support code for software error injection */
330
331static int msr_to_offset(u32 msr)
332{
0a3aee0d 333 unsigned bank = __this_cpu_read(injectm.bank);
f436f8bb 334
84c2559d 335 if (msr == mca_cfg.rip_msr)
ea149b36 336 return offsetof(struct mce, ip);
8121b8f9 337 if (msr == mca_msr_reg(bank, MCA_STATUS))
ea149b36 338 return offsetof(struct mce, status);
8121b8f9 339 if (msr == mca_msr_reg(bank, MCA_ADDR))
ea149b36 340 return offsetof(struct mce, addr);
8121b8f9 341 if (msr == mca_msr_reg(bank, MCA_MISC))
ea149b36
AK
342 return offsetof(struct mce, misc);
343 if (msr == MSR_IA32_MCG_STATUS)
344 return offsetof(struct mce, mcgstatus);
345 return -1;
346}
347
e2def7d4
BP
348__visible bool ex_handler_rdmsr_fault(const struct exception_table_entry *fixup,
349 struct pt_regs *regs, int trapnr,
350 unsigned long error_code,
351 unsigned long fault_addr)
352{
353 pr_emerg("MSR access error: RDMSR from 0x%x at rIP: 0x%lx (%pS)\n",
354 (unsigned int)regs->cx, regs->ip, (void *)regs->ip);
355
356 show_stack_regs(regs);
357
358 panic("MCA architectural violation!\n");
359
360 while (true)
361 cpu_relax();
362
363 return true;
364}
365
5f8c1a54 366/* MSR access wrappers used for error injection */
e1007770 367static noinstr u64 mce_rdmsrl(u32 msr)
5f8c1a54 368{
e2def7d4 369 DECLARE_ARGS(val, low, high);
11868a2d 370
0a3aee0d 371 if (__this_cpu_read(injectm.finished)) {
e1007770
BP
372 int offset;
373 u64 ret;
11868a2d 374
e1007770 375 instrumentation_begin();
11868a2d 376
e1007770 377 offset = msr_to_offset(msr);
ea149b36 378 if (offset < 0)
e1007770
BP
379 ret = 0;
380 else
381 ret = *(u64 *)((char *)this_cpu_ptr(&injectm) + offset);
11868a2d 382
e1007770
BP
383 instrumentation_end();
384
385 return ret;
11868a2d
IM
386 }
387
e2def7d4
BP
388 /*
389 * RDMSR on MCA MSRs should not fault. If they do, this is very much an
390 * architectural violation and needs to be reported to hw vendor. Panic
391 * the box to not allow any further progress.
392 */
393 asm volatile("1: rdmsr\n"
394 "2:\n"
395 _ASM_EXTABLE_HANDLE(1b, 2b, ex_handler_rdmsr_fault)
396 : EAX_EDX_RET(val, low, high) : "c" (msr));
11868a2d 397
e2def7d4
BP
398
399 return EAX_EDX_VAL(val, low, high);
400}
401
402__visible bool ex_handler_wrmsr_fault(const struct exception_table_entry *fixup,
403 struct pt_regs *regs, int trapnr,
404 unsigned long error_code,
405 unsigned long fault_addr)
406{
407 pr_emerg("MSR access error: WRMSR to 0x%x (tried to write 0x%08x%08x) at rIP: 0x%lx (%pS)\n",
408 (unsigned int)regs->cx, (unsigned int)regs->dx, (unsigned int)regs->ax,
409 regs->ip, (void *)regs->ip);
410
411 show_stack_regs(regs);
412
413 panic("MCA architectural violation!\n");
414
415 while (true)
416 cpu_relax();
417
418 return true;
5f8c1a54
AK
419}
420
e1007770 421static noinstr void mce_wrmsrl(u32 msr, u64 v)
5f8c1a54 422{
e2def7d4
BP
423 u32 low, high;
424
0a3aee0d 425 if (__this_cpu_read(injectm.finished)) {
e1007770 426 int offset;
11868a2d 427
e1007770 428 instrumentation_begin();
11868a2d 429
e1007770 430 offset = msr_to_offset(msr);
ea149b36 431 if (offset >= 0)
89cbc767 432 *(u64 *)((char *)this_cpu_ptr(&injectm) + offset) = v;
e1007770
BP
433
434 instrumentation_end();
435
ea149b36
AK
436 return;
437 }
e2def7d4
BP
438
439 low = (u32)v;
440 high = (u32)(v >> 32);
441
442 /* See comment in mce_rdmsrl() */
443 asm volatile("1: wrmsr\n"
444 "2:\n"
445 _ASM_EXTABLE_HANDLE(1b, 2b, ex_handler_wrmsr_fault)
446 : : "c" (msr), "a"(low), "d" (high) : "memory");
5f8c1a54
AK
447}
448
b8325c5b
HS
449/*
450 * Collect all global (w.r.t. this processor) status about this machine
451 * check into our "mce" struct so that we can use it later to assess
452 * the severity of the problem as we read per-bank specific details.
453 */
454static inline void mce_gather_info(struct mce *m, struct pt_regs *regs)
455{
456 mce_setup(m);
457
458 m->mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS);
459 if (regs) {
460 /*
461 * Get the address of the instruction at the time of
462 * the machine check error.
463 */
464 if (m->mcgstatus & (MCG_STATUS_RIPV|MCG_STATUS_EIPV)) {
465 m->ip = regs->ip;
466 m->cs = regs->cs;
a129a7c8
AK
467
468 /*
469 * When in VM86 mode make the cs look like ring 3
470 * always. This is a lie, but it's better than passing
471 * the additional vm86 bit around everywhere.
472 */
473 if (v8086_mode(regs))
474 m->cs |= 3;
b8325c5b
HS
475 }
476 /* Use accurate RIP reporting if available. */
84c2559d
BP
477 if (mca_cfg.rip_msr)
478 m->ip = mce_rdmsrl(mca_cfg.rip_msr);
b8325c5b
HS
479 }
480}
481
88ccbedd 482int mce_available(struct cpuinfo_x86 *c)
1da177e4 483{
1462594b 484 if (mca_cfg.disabled)
5b4408fd 485 return 0;
3d1712c9 486 return cpu_has(c, X86_FEATURE_MCE) && cpu_has(c, X86_FEATURE_MCA);
1da177e4
LT
487}
488
9b1beaf2
AK
489static void mce_schedule_work(void)
490{
a2c2727d 491 if (!mce_gen_pool_empty())
061120ae 492 schedule_work(&mce_work);
9b1beaf2
AK
493}
494
b77e70bf 495static void mce_irq_work_cb(struct irq_work *entry)
ccc3c319 496{
9b1beaf2 497 mce_schedule_work();
ccc3c319 498}
ccc3c319 499
feab21f8
BP
500/*
501 * Check if the address reported by the CPU is in a format we can parse.
502 * It would be possible to add code for most other cases, but all would
503 * be somewhat complicated (e.g. segment offset would require an instruction
d9f6e12f 504 * parser). So only support physical addresses up to page granularity for now.
feab21f8 505 */
e8a308e5 506int mce_usable_address(struct mce *m)
feab21f8 507{
c6a9583f 508 if (!(m->status & MCI_STATUS_ADDRV))
feab21f8
BP
509 return 0;
510
6e898d2b
TW
511 /* Checks after this one are Intel/Zhaoxin-specific: */
512 if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL &&
513 boot_cpu_data.x86_vendor != X86_VENDOR_ZHAOXIN)
feab21f8
BP
514 return 1;
515
c6a9583f
BP
516 if (!(m->status & MCI_STATUS_MISCV))
517 return 0;
518
feab21f8
BP
519 if (MCI_MISC_ADDR_LSB(m->misc) > PAGE_SHIFT)
520 return 0;
c6a9583f 521
feab21f8
BP
522 if (MCI_MISC_ADDR_MODE(m->misc) != MCI_MISC_ADDR_PHYS)
523 return 0;
c6a9583f 524
feab21f8
BP
525 return 1;
526}
e8a308e5 527EXPORT_SYMBOL_GPL(mce_usable_address);
feab21f8 528
2d1f4061 529bool mce_is_memory_error(struct mce *m)
011d8261 530{
6e898d2b
TW
531 switch (m->cpuvendor) {
532 case X86_VENDOR_AMD:
533 case X86_VENDOR_HYGON:
c6708d50 534 return amd_mce_is_memory_error(m);
6e898d2b
TW
535
536 case X86_VENDOR_INTEL:
537 case X86_VENDOR_ZHAOXIN:
011d8261
BP
538 /*
539 * Intel SDM Volume 3B - 15.9.2 Compound Error Codes
540 *
541 * Bit 7 of the MCACOD field of IA32_MCi_STATUS is used for
542 * indicating a memory error. Bit 8 is used for indicating a
543 * cache hierarchy error. The combination of bit 2 and bit 3
544 * is used for indicating a `generic' cache hierarchy error
545 * But we can't just blindly check the above bits, because if
546 * bit 11 is set, then it is a bus/interconnect error - and
547 * either way the above bits just gives more detail on what
548 * bus/interconnect error happened. Note that bit 12 can be
549 * ignored, as it's the "filter" bit.
550 */
551 return (m->status & 0xef80) == BIT(7) ||
552 (m->status & 0xef00) == BIT(8) ||
553 (m->status & 0xeffc) == 0xc;
011d8261 554
6e898d2b
TW
555 default:
556 return false;
557 }
011d8261 558}
2d1f4061 559EXPORT_SYMBOL_GPL(mce_is_memory_error);
011d8261 560
17fae129
TL
561static bool whole_page(struct mce *m)
562{
563 if (!mca_cfg.ser || !(m->status & MCI_STATUS_MISCV))
564 return true;
565
566 return MCI_MISC_ADDR_LSB(m->misc) >= PAGE_SHIFT;
567}
568
5d96c934 569bool mce_is_correctable(struct mce *m)
179eb850
YG
570{
571 if (m->cpuvendor == X86_VENDOR_AMD && m->status & MCI_STATUS_DEFERRED)
572 return false;
573
ac78bd72
PW
574 if (m->cpuvendor == X86_VENDOR_HYGON && m->status & MCI_STATUS_DEFERRED)
575 return false;
576
179eb850
YG
577 if (m->status & MCI_STATUS_UC)
578 return false;
579
580 return true;
581}
5d96c934 582EXPORT_SYMBOL_GPL(mce_is_correctable);
179eb850 583
c9c6d216 584static int mce_early_notifier(struct notifier_block *nb, unsigned long val,
011d8261
BP
585 void *data)
586{
587 struct mce *m = (struct mce *)data;
011d8261
BP
588
589 if (!m)
590 return NOTIFY_DONE;
591
011d8261
BP
592 /* Emit the trace record: */
593 trace_mce_record(m);
594
011d8261
BP
595 set_bit(0, &mce_need_notify);
596
597 mce_notify_irq();
598
599 return NOTIFY_DONE;
600}
601
c9c6d216
TL
602static struct notifier_block early_nb = {
603 .notifier_call = mce_early_notifier,
604 .priority = MCE_PRIO_EARLY,
011d8261
BP
605};
606
8438b84a
JS
607static int uc_decode_notifier(struct notifier_block *nb, unsigned long val,
608 void *data)
fd4cf79f
CG
609{
610 struct mce *mce = (struct mce *)data;
611 unsigned long pfn;
612
8438b84a 613 if (!mce || !mce_usable_address(mce))
fd4cf79f
CG
614 return NOTIFY_DONE;
615
8438b84a
JS
616 if (mce->severity != MCE_AO_SEVERITY &&
617 mce->severity != MCE_DEFERRED_SEVERITY)
618 return NOTIFY_DONE;
619
620 pfn = mce->addr >> PAGE_SHIFT;
23ba710a 621 if (!memory_failure(pfn, 0)) {
17fae129 622 set_mce_nospec(pfn, whole_page(mce));
23ba710a
TL
623 mce->kflags |= MCE_HANDLED_UC;
624 }
fd4cf79f
CG
625
626 return NOTIFY_OK;
ccc3c319 627}
8438b84a
JS
628
629static struct notifier_block mce_uc_nb = {
630 .notifier_call = uc_decode_notifier,
631 .priority = MCE_PRIO_UC,
fd4cf79f 632};
ccc3c319 633
cd9c57ca
BP
634static int mce_default_notifier(struct notifier_block *nb, unsigned long val,
635 void *data)
636{
637 struct mce *m = (struct mce *)data;
638
639 if (!m)
640 return NOTIFY_DONE;
641
43505646 642 if (mca_cfg.print_all || !m->kflags)
925946cf 643 __print_mce(m);
cd9c57ca
BP
644
645 return NOTIFY_DONE;
646}
647
648static struct notifier_block mce_default_nb = {
649 .notifier_call = mce_default_notifier,
650 /* lowest prio, we want it to run last. */
9026cc82 651 .priority = MCE_PRIO_LOWEST,
cd9c57ca
BP
652};
653
85f92694
TL
654/*
655 * Read ADDR and MISC registers.
656 */
657static void mce_read_aux(struct mce *m, int i)
658{
659 if (m->status & MCI_STATUS_MISCV)
8121b8f9 660 m->misc = mce_rdmsrl(mca_msr_reg(i, MCA_MISC));
db819d60 661
85f92694 662 if (m->status & MCI_STATUS_ADDRV) {
8121b8f9 663 m->addr = mce_rdmsrl(mca_msr_reg(i, MCA_ADDR));
85f92694
TL
664
665 /*
666 * Mask the reported address by the reported granularity.
667 */
1462594b 668 if (mca_cfg.ser && (m->status & MCI_STATUS_MISCV)) {
85f92694
TL
669 u8 shift = MCI_MISC_ADDR_LSB(m->misc);
670 m->addr >>= shift;
671 m->addr <<= shift;
672 }
4f29b73b
YG
673
674 /*
675 * Extract [55:<lsb>] where lsb is the least significant
676 * *valid* bit of the address bits.
677 */
678 if (mce_flags.smca) {
679 u8 lsb = (m->addr >> 56) & 0x3f;
680
681 m->addr &= GENMASK_ULL(55, lsb);
682 }
85f92694 683 }
db819d60 684
5828c46f
YG
685 if (mce_flags.smca) {
686 m->ipid = mce_rdmsrl(MSR_AMD64_SMCA_MCx_IPID(i));
687
688 if (m->status & MCI_STATUS_SYNDV)
689 m->synd = mce_rdmsrl(MSR_AMD64_SMCA_MCx_SYND(i));
690 }
85f92694
TL
691}
692
ca84f696
AK
693DEFINE_PER_CPU(unsigned, mce_poll_count);
694
d88203d1 695/*
b79109c3
AK
696 * Poll for corrected events or events that happened before reset.
697 * Those are just logged through /dev/mcelog.
698 *
699 * This is executed in standard interrupt context.
ed7290d0
AK
700 *
701 * Note: spec recommends to panic for fatal unsignalled
702 * errors here. However this would be quite problematic --
703 * we would need to reimplement the Monarch handling and
704 * it would mess up the exclusion between exception handler
a97673a1 705 * and poll handler -- * so we skip this for now.
ed7290d0
AK
706 * These cases should not happen anyways, or only when the CPU
707 * is already totally * confused. In this case it's likely it will
708 * not fully execute the machine check handler either.
b79109c3 709 */
3f2f0680 710bool machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
b79109c3 711{
b4914508 712 struct mce_bank *mce_banks = this_cpu_ptr(mce_banks_array);
8b38937b 713 bool error_seen = false;
b79109c3
AK
714 struct mce m;
715 int i;
716
c6ae41e7 717 this_cpu_inc(mce_poll_count);
ca84f696 718
b8325c5b 719 mce_gather_info(&m, NULL);
b79109c3 720
669c00f0
BP
721 if (flags & MCP_TIMESTAMP)
722 m.tsc = rdtsc();
54467353 723
c7d314f3 724 for (i = 0; i < this_cpu_read(mce_num_banks); i++) {
cebe1820 725 if (!mce_banks[i].ctl || !test_bit(i, *b))
b79109c3
AK
726 continue;
727
728 m.misc = 0;
729 m.addr = 0;
730 m.bank = i;
b79109c3
AK
731
732 barrier();
8121b8f9 733 m.status = mce_rdmsrl(mca_msr_reg(i, MCA_STATUS));
f19501aa
TL
734
735 /* If this entry is not valid, ignore it */
b79109c3
AK
736 if (!(m.status & MCI_STATUS_VAL))
737 continue;
738
739 /*
f19501aa
TL
740 * If we are logging everything (at CPU online) or this
741 * is a corrected error, then we must log it.
b79109c3 742 */
f19501aa
TL
743 if ((flags & MCP_UC) || !(m.status & MCI_STATUS_UC))
744 goto log_it;
745
746 /*
747 * Newer Intel systems that support software error
748 * recovery need to make additional checks. Other
749 * CPUs should skip over uncorrected errors, but log
750 * everything else.
751 */
752 if (!mca_cfg.ser) {
753 if (m.status & MCI_STATUS_UC)
754 continue;
755 goto log_it;
756 }
757
758 /* Log "not enabled" (speculative) errors */
759 if (!(m.status & MCI_STATUS_EN))
760 goto log_it;
761
762 /*
763 * Log UCNA (SDM: 15.6.3 "UCR Error Classification")
764 * UC == 1 && PCC == 0 && S == 0
765 */
766 if (!(m.status & MCI_STATUS_PCC) && !(m.status & MCI_STATUS_S))
767 goto log_it;
768
769 /*
770 * Skip anything else. Presumption is that our read of this
771 * bank is racing with a machine check. Leave the log alone
772 * for do_machine_check() to deal with it.
773 */
774 continue;
b79109c3 775
f19501aa 776log_it:
8b38937b
TL
777 error_seen = true;
778
90454e49
JS
779 if (flags & MCP_DONTLOG)
780 goto clear_it;
b79109c3 781
90454e49 782 mce_read_aux(&m, i);
41ce0564 783 m.severity = mce_severity(&m, NULL, mca_cfg.tolerant, NULL, false);
b79109c3
AK
784 /*
785 * Don't get the IP here because it's unlikely to
786 * have anything to do with the actual error location.
787 */
b79109c3 788
90454e49
JS
789 if (mca_cfg.dont_log_ce && !mce_usable_address(&m))
790 goto clear_it;
791
3bff147b
BP
792 if (flags & MCP_QUEUE_LOG)
793 mce_gen_pool_add(&m);
794 else
795 mce_log(&m);
90454e49
JS
796
797clear_it:
b79109c3
AK
798 /*
799 * Clear state for this bank.
800 */
8121b8f9 801 mce_wrmsrl(mca_msr_reg(i, MCA_STATUS), 0);
b79109c3
AK
802 }
803
804 /*
805 * Don't clear MCG_STATUS here because it's only defined for
806 * exceptions.
807 */
88921be3
AK
808
809 sync_core();
3f2f0680 810
8b38937b 811 return error_seen;
b79109c3 812}
ea149b36 813EXPORT_SYMBOL_GPL(machine_check_poll);
b79109c3 814
cc466666
BP
815/*
816 * During IFU recovery Sandy Bridge -EP4S processors set the RIPV and
817 * EIPV bits in MCG_STATUS to zero on the affected logical processor (SDM
818 * Vol 3B Table 15-20). But this confuses both the code that determines
819 * whether the machine check occurred in kernel or user mode, and also
820 * the severity assessment code. Pretend that EIPV was set, and take the
821 * ip/cs values from the pt_regs that mce_gather_info() ignored earlier.
822 */
823static void quirk_sandybridge_ifu(int bank, struct mce *m, struct pt_regs *regs)
824{
825 if (bank != 0)
826 return;
827 if ((m->mcgstatus & (MCG_STATUS_EIPV|MCG_STATUS_RIPV)) != 0)
828 return;
829 if ((m->status & (MCI_STATUS_OVER|MCI_STATUS_UC|
830 MCI_STATUS_EN|MCI_STATUS_MISCV|MCI_STATUS_ADDRV|
831 MCI_STATUS_PCC|MCI_STATUS_S|MCI_STATUS_AR|
832 MCACOD)) !=
833 (MCI_STATUS_UC|MCI_STATUS_EN|
834 MCI_STATUS_MISCV|MCI_STATUS_ADDRV|MCI_STATUS_S|
835 MCI_STATUS_AR|MCACOD_INSTR))
836 return;
837
838 m->mcgstatus |= MCG_STATUS_EIPV;
839 m->ip = regs->ip;
840 m->cs = regs->cs;
841}
842
bd19a5e6
AK
843/*
844 * Do a quick check if any of the events requires a panic.
845 * This decides if we keep the events around or clear them.
846 */
61b0fccd
TL
847static int mce_no_way_out(struct mce *m, char **msg, unsigned long *validp,
848 struct pt_regs *regs)
bd19a5e6 849{
7a8bc2b0 850 char *tmp = *msg;
1f74c8a6 851 int i;
bd19a5e6 852
c7d314f3 853 for (i = 0; i < this_cpu_read(mce_num_banks); i++) {
8121b8f9 854 m->status = mce_rdmsrl(mca_msr_reg(i, MCA_STATUS));
1f74c8a6
BP
855 if (!(m->status & MCI_STATUS_VAL))
856 continue;
857
858 __set_bit(i, validp);
cc466666
BP
859 if (mce_flags.snb_ifu_quirk)
860 quirk_sandybridge_ifu(i, m, regs);
17fea54b 861
a3a57dda 862 m->bank = i;
41ce0564 863 if (mce_severity(m, regs, mca_cfg.tolerant, &tmp, true) >= MCE_PANIC_SEVERITY) {
1f74c8a6 864 mce_read_aux(m, i);
17fea54b 865 *msg = tmp;
1f74c8a6 866 return 1;
17fea54b 867 }
bd19a5e6 868 }
1f74c8a6 869 return 0;
bd19a5e6
AK
870}
871
3c079792
AK
872/*
873 * Variable to establish order between CPUs while scanning.
874 * Each CPU spins initially until executing is equal its number.
875 */
876static atomic_t mce_executing;
877
878/*
879 * Defines order of CPUs on entry. First CPU becomes Monarch.
880 */
881static atomic_t mce_callin;
882
7bb39313
PM
883/*
884 * Track which CPUs entered the MCA broadcast synchronization and which not in
885 * order to print holdouts.
886 */
887static cpumask_t mce_missing_cpus = CPU_MASK_ALL;
888
3c079792
AK
889/*
890 * Check if a timeout waiting for other CPUs happened.
891 */
6c80f87e 892static int mce_timed_out(u64 *t, const char *msg)
3c079792
AK
893{
894 /*
895 * The others already did panic for some reason.
896 * Bail out like in a timeout.
897 * rmb() to tell the compiler that system_state
898 * might have been modified by someone else.
899 */
900 rmb();
c7c9b392 901 if (atomic_read(&mce_panicked))
3c079792 902 wait_for_panic();
84c2559d 903 if (!mca_cfg.monarch_timeout)
3c079792
AK
904 goto out;
905 if ((s64)*t < SPINUNIT) {
7bb39313
PM
906 if (mca_cfg.tolerant <= 1) {
907 if (cpumask_and(&mce_missing_cpus, cpu_online_mask, &mce_missing_cpus))
908 pr_emerg("CPUs not responding to MCE broadcast (may include false positives): %*pbl\n",
909 cpumask_pr_args(&mce_missing_cpus));
6c80f87e 910 mce_panic(msg, NULL, NULL);
7bb39313 911 }
3c079792
AK
912 cpu_missing = 1;
913 return 1;
914 }
915 *t -= SPINUNIT;
916out:
917 touch_nmi_watchdog();
918 return 0;
919}
920
921/*
922 * The Monarch's reign. The Monarch is the CPU who entered
923 * the machine check handler first. It waits for the others to
924 * raise the exception too and then grades them. When any
925 * error is fatal panic. Only then let the others continue.
926 *
927 * The other CPUs entering the MCE handler will be controlled by the
928 * Monarch. They are called Subjects.
929 *
930 * This way we prevent any potential data corruption in a unrecoverable case
931 * and also makes sure always all CPU's errors are examined.
932 *
680b6cfd 933 * Also this detects the case of a machine check event coming from outer
3c079792
AK
934 * space (not detected by any CPUs) In this case some external agent wants
935 * us to shut down, so panic too.
936 *
937 * The other CPUs might still decide to panic if the handler happens
938 * in a unrecoverable place, but in this case the system is in a semi-stable
939 * state and won't corrupt anything by itself. It's ok to let the others
940 * continue for a bit first.
941 *
942 * All the spin loops have timeouts; when a timeout happens a CPU
943 * typically elects itself to be Monarch.
944 */
945static void mce_reign(void)
946{
947 int cpu;
948 struct mce *m = NULL;
949 int global_worst = 0;
950 char *msg = NULL;
3c079792
AK
951
952 /*
953 * This CPU is the Monarch and the other CPUs have run
954 * through their handlers.
955 * Grade the severity of the errors of all the CPUs.
956 */
957 for_each_possible_cpu(cpu) {
13c877f4
TL
958 struct mce *mtmp = &per_cpu(mces_seen, cpu);
959
960 if (mtmp->severity > global_worst) {
961 global_worst = mtmp->severity;
3c079792
AK
962 m = &per_cpu(mces_seen, cpu);
963 }
964 }
965
966 /*
967 * Cannot recover? Panic here then.
968 * This dumps all the mces in the log buffer and stops the
969 * other CPUs.
970 */
13c877f4
TL
971 if (m && global_worst >= MCE_PANIC_SEVERITY && mca_cfg.tolerant < 3) {
972 /* call mce_severity() to get "msg" for panic */
41ce0564 973 mce_severity(m, NULL, mca_cfg.tolerant, &msg, true);
8af7043a 974 mce_panic("Fatal machine check", m, msg);
13c877f4 975 }
3c079792
AK
976
977 /*
978 * For UC somewhere we let the CPU who detects it handle it.
979 * Also must let continue the others, otherwise the handling
980 * CPU could deadlock on a lock.
981 */
982
983 /*
984 * No machine check event found. Must be some external
985 * source or one CPU is hung. Panic.
986 */
d203f0b8 987 if (global_worst <= MCE_KEEP_SEVERITY && mca_cfg.tolerant < 3)
8af7043a 988 mce_panic("Fatal machine check from unknown source", NULL, NULL);
3c079792
AK
989
990 /*
991 * Now clear all the mces_seen so that they don't reappear on
992 * the next mce.
993 */
994 for_each_possible_cpu(cpu)
995 memset(&per_cpu(mces_seen, cpu), 0, sizeof(struct mce));
996}
997
998static atomic_t global_nwo;
999
1000/*
1001 * Start of Monarch synchronization. This waits until all CPUs have
1002 * entered the exception handler and then determines if any of them
1003 * saw a fatal event that requires panic. Then it executes them
1004 * in the entry order.
1005 * TBD double check parallel CPU hotunplug
1006 */
7fb06fc9 1007static int mce_start(int *no_way_out)
3c079792 1008{
7fb06fc9 1009 int order;
3c079792 1010 int cpus = num_online_cpus();
84c2559d 1011 u64 timeout = (u64)mca_cfg.monarch_timeout * NSEC_PER_USEC;
3c079792 1012
7fb06fc9
HS
1013 if (!timeout)
1014 return -1;
3c079792 1015
7fb06fc9 1016 atomic_add(*no_way_out, &global_nwo);
184e1fdf 1017 /*
bf92b1fe
DB
1018 * Rely on the implied barrier below, such that global_nwo
1019 * is updated before mce_callin.
184e1fdf 1020 */
a95436e4 1021 order = atomic_inc_return(&mce_callin);
7bb39313 1022 cpumask_clear_cpu(smp_processor_id(), &mce_missing_cpus);
3c079792
AK
1023
1024 /*
1025 * Wait for everyone.
1026 */
1027 while (atomic_read(&mce_callin) != cpus) {
6c80f87e
AL
1028 if (mce_timed_out(&timeout,
1029 "Timeout: Not all CPUs entered broadcast exception handler")) {
3c079792 1030 atomic_set(&global_nwo, 0);
7fb06fc9 1031 return -1;
3c079792
AK
1032 }
1033 ndelay(SPINUNIT);
1034 }
1035
184e1fdf
HY
1036 /*
1037 * mce_callin should be read before global_nwo
1038 */
1039 smp_rmb();
3c079792 1040
7fb06fc9
HS
1041 if (order == 1) {
1042 /*
1043 * Monarch: Starts executing now, the others wait.
1044 */
3c079792 1045 atomic_set(&mce_executing, 1);
7fb06fc9
HS
1046 } else {
1047 /*
1048 * Subject: Now start the scanning loop one by one in
1049 * the original callin order.
1050 * This way when there are any shared banks it will be
1051 * only seen by one CPU before cleared, avoiding duplicates.
1052 */
1053 while (atomic_read(&mce_executing) < order) {
6c80f87e
AL
1054 if (mce_timed_out(&timeout,
1055 "Timeout: Subject CPUs unable to finish machine check processing")) {
7fb06fc9
HS
1056 atomic_set(&global_nwo, 0);
1057 return -1;
1058 }
1059 ndelay(SPINUNIT);
1060 }
3c079792
AK
1061 }
1062
1063 /*
7fb06fc9 1064 * Cache the global no_way_out state.
3c079792 1065 */
7fb06fc9
HS
1066 *no_way_out = atomic_read(&global_nwo);
1067
1068 return order;
3c079792
AK
1069}
1070
1071/*
1072 * Synchronize between CPUs after main scanning loop.
1073 * This invokes the bulk of the Monarch processing.
1074 */
1075static int mce_end(int order)
1076{
1077 int ret = -1;
84c2559d 1078 u64 timeout = (u64)mca_cfg.monarch_timeout * NSEC_PER_USEC;
3c079792
AK
1079
1080 if (!timeout)
1081 goto reset;
1082 if (order < 0)
1083 goto reset;
1084
1085 /*
1086 * Allow others to run.
1087 */
1088 atomic_inc(&mce_executing);
1089
1090 if (order == 1) {
1091 /* CHECKME: Can this race with a parallel hotplug? */
1092 int cpus = num_online_cpus();
1093
1094 /*
1095 * Monarch: Wait for everyone to go through their scanning
1096 * loops.
1097 */
1098 while (atomic_read(&mce_executing) <= cpus) {
6c80f87e
AL
1099 if (mce_timed_out(&timeout,
1100 "Timeout: Monarch CPU unable to finish machine check processing"))
3c079792
AK
1101 goto reset;
1102 ndelay(SPINUNIT);
1103 }
1104
1105 mce_reign();
1106 barrier();
1107 ret = 0;
1108 } else {
1109 /*
1110 * Subject: Wait for Monarch to finish.
1111 */
1112 while (atomic_read(&mce_executing) != 0) {
6c80f87e
AL
1113 if (mce_timed_out(&timeout,
1114 "Timeout: Monarch CPU did not finish machine check processing"))
3c079792
AK
1115 goto reset;
1116 ndelay(SPINUNIT);
1117 }
1118
1119 /*
1120 * Don't reset anything. That's done by the Monarch.
1121 */
1122 return 0;
1123 }
1124
1125 /*
1126 * Reset all global state.
1127 */
1128reset:
1129 atomic_set(&global_nwo, 0);
1130 atomic_set(&mce_callin, 0);
7bb39313 1131 cpumask_setall(&mce_missing_cpus);
3c079792
AK
1132 barrier();
1133
1134 /*
1135 * Let others run again.
1136 */
1137 atomic_set(&mce_executing, 0);
1138 return ret;
1139}
1140
1141static void mce_clear_state(unsigned long *toclear)
1142{
1143 int i;
1144
c7d314f3 1145 for (i = 0; i < this_cpu_read(mce_num_banks); i++) {
3c079792 1146 if (test_bit(i, toclear))
8121b8f9 1147 mce_wrmsrl(mca_msr_reg(i, MCA_STATUS), 0);
3c079792
AK
1148 }
1149}
1150
d3d6923c
BP
1151/*
1152 * Cases where we avoid rendezvous handler timeout:
1153 * 1) If this CPU is offline.
1154 *
1155 * 2) If crashing_cpu was set, e.g. we're entering kdump and we need to
1156 * skip those CPUs which remain looping in the 1st kernel - see
1157 * crash_nmi_callback().
1158 *
1159 * Note: there still is a small window between kexec-ing and the new,
1160 * kdump kernel establishing a new #MC handler where a broadcasted MCE
1161 * might not get handled properly.
1162 */
94a46d31 1163static noinstr bool mce_check_crashing_cpu(void)
d3d6923c 1164{
94a46d31
TG
1165 unsigned int cpu = smp_processor_id();
1166
14d3b376 1167 if (arch_cpu_is_offline(cpu) ||
d3d6923c
BP
1168 (crashing_cpu != -1 && crashing_cpu != cpu)) {
1169 u64 mcgstatus;
1170
aedbdeab 1171 mcgstatus = __rdmsr(MSR_IA32_MCG_STATUS);
70f0c230
TW
1172
1173 if (boot_cpu_data.x86_vendor == X86_VENDOR_ZHAOXIN) {
1174 if (mcgstatus & MCG_STATUS_LMCES)
1175 return false;
1176 }
1177
d3d6923c 1178 if (mcgstatus & MCG_STATUS_RIPV) {
aedbdeab 1179 __wrmsr(MSR_IA32_MCG_STATUS, 0, 0);
d3d6923c
BP
1180 return true;
1181 }
1182 }
1183 return false;
1184}
1185
41ce0564 1186static void __mc_scan_banks(struct mce *m, struct pt_regs *regs, struct mce *final,
f35565e3
BP
1187 unsigned long *toclear, unsigned long *valid_banks,
1188 int no_way_out, int *worst)
1189{
b4914508 1190 struct mce_bank *mce_banks = this_cpu_ptr(mce_banks_array);
f35565e3
BP
1191 struct mca_config *cfg = &mca_cfg;
1192 int severity, i;
1193
c7d314f3 1194 for (i = 0; i < this_cpu_read(mce_num_banks); i++) {
f35565e3
BP
1195 __clear_bit(i, toclear);
1196 if (!test_bit(i, valid_banks))
1197 continue;
d5c84ef2 1198
f35565e3
BP
1199 if (!mce_banks[i].ctl)
1200 continue;
1201
1202 m->misc = 0;
1203 m->addr = 0;
1204 m->bank = i;
1205
8121b8f9 1206 m->status = mce_rdmsrl(mca_msr_reg(i, MCA_STATUS));
d5c84ef2 1207 if (!(m->status & MCI_STATUS_VAL))
f35565e3
BP
1208 continue;
1209
1210 /*
d5c84ef2
BP
1211 * Corrected or non-signaled errors are handled by
1212 * machine_check_poll(). Leave them alone, unless this panics.
f35565e3
BP
1213 */
1214 if (!(m->status & (cfg->ser ? MCI_STATUS_S : MCI_STATUS_UC)) &&
1215 !no_way_out)
1216 continue;
1217
d5c84ef2 1218 /* Set taint even when machine check was not enabled. */
f35565e3
BP
1219 add_taint(TAINT_MACHINE_CHECK, LOCKDEP_NOW_UNRELIABLE);
1220
41ce0564 1221 severity = mce_severity(m, regs, cfg->tolerant, NULL, true);
f35565e3
BP
1222
1223 /*
1224 * When machine check was for corrected/deferred handler don't
d5c84ef2 1225 * touch, unless we're panicking.
f35565e3
BP
1226 */
1227 if ((severity == MCE_KEEP_SEVERITY ||
1228 severity == MCE_UCNA_SEVERITY) && !no_way_out)
1229 continue;
d5c84ef2 1230
f35565e3 1231 __set_bit(i, toclear);
d5c84ef2
BP
1232
1233 /* Machine check event was not enabled. Clear, but ignore. */
1234 if (severity == MCE_NO_SEVERITY)
f35565e3 1235 continue;
f35565e3
BP
1236
1237 mce_read_aux(m, i);
1238
1239 /* assuming valid severity level != 0 */
1240 m->severity = severity;
1241
1242 mce_log(m);
1243
1244 if (severity > *worst) {
1245 *final = *m;
1246 *worst = severity;
1247 }
1248 }
1249
1250 /* mce_clear_state will clear *final, save locally for use later */
1251 *m = *final;
1252}
1253
5567d11c
PZ
1254static void kill_me_now(struct callback_head *ch)
1255{
81065b35
TL
1256 struct task_struct *p = container_of(ch, struct task_struct, mce_kill_me);
1257
1258 p->mce_count = 0;
5567d11c
PZ
1259 force_sig(SIGBUS);
1260}
1261
1262static void kill_me_maybe(struct callback_head *cb)
1263{
1264 struct task_struct *p = container_of(cb, struct task_struct, mce_kill_me);
1265 int flags = MF_ACTION_REQUIRED;
a3f5d80e 1266 int ret;
5567d11c 1267
81065b35 1268 p->mce_count = 0;
5567d11c 1269 pr_err("Uncorrected hardware memory error in user-access at %llx", p->mce_addr);
17fae129
TL
1270
1271 if (!p->mce_ripv)
5567d11c
PZ
1272 flags |= MF_MUST_KILL;
1273
a3f5d80e 1274 ret = memory_failure(p->mce_addr >> PAGE_SHIFT, flags);
a6e3cf70 1275 if (!ret) {
17fae129 1276 set_mce_nospec(p->mce_addr >> PAGE_SHIFT, p->mce_whole_page);
1e36d9c6 1277 sync_core();
5567d11c
PZ
1278 return;
1279 }
1280
a3f5d80e
NH
1281 /*
1282 * -EHWPOISON from memory_failure() means that it already sent SIGBUS
1283 * to the current process with the proper error info, so no need to
1284 * send SIGBUS here again.
1285 */
1286 if (ret == -EHWPOISON)
1287 return;
1288
a6e3cf70
TL
1289 pr_err("Memory error not recovered");
1290 kill_me_now(cb);
1291}
1292
1293static void kill_me_never(struct callback_head *cb)
1294{
1295 struct task_struct *p = container_of(cb, struct task_struct, mce_kill_me);
1296
1297 p->mce_count = 0;
1298 pr_err("Kernel accessed poison in user space at %llx\n", p->mce_addr);
1299 if (!memory_failure(p->mce_addr >> PAGE_SHIFT, 0))
1300 set_mce_nospec(p->mce_addr >> PAGE_SHIFT, p->mce_whole_page);
5567d11c
PZ
1301}
1302
a6e3cf70 1303static void queue_task_work(struct mce *m, char *msg, void (*func)(struct callback_head *))
c0ab7ffc 1304{
81065b35
TL
1305 int count = ++current->mce_count;
1306
1307 /* First call, save all the details */
1308 if (count == 1) {
1309 current->mce_addr = m->addr;
1310 current->mce_kflags = m->kflags;
1311 current->mce_ripv = !!(m->mcgstatus & MCG_STATUS_RIPV);
1312 current->mce_whole_page = whole_page(m);
a6e3cf70 1313 current->mce_kill_me.func = func;
81065b35
TL
1314 }
1315
1316 /* Ten is likely overkill. Don't expect more than two faults before task_work() */
1317 if (count > 10)
1318 mce_panic("Too many consecutive machine checks while accessing user data", m, msg);
c0ab7ffc 1319
81065b35
TL
1320 /* Second or later call, make sure page address matches the one from first call */
1321 if (count > 1 && (current->mce_addr >> PAGE_SHIFT) != (m->addr >> PAGE_SHIFT))
1322 mce_panic("Consecutive machine checks to different user pages", m, msg);
1323
1324 /* Do not call task_work_add() more than once */
1325 if (count > 1)
1326 return;
c0ab7ffc 1327
91989c70 1328 task_work_add(current, &current->mce_kill_me, TWA_RESUME);
5567d11c
PZ
1329}
1330
cbe1de16
BP
1331/* Handle unconfigured int18 (should never happen) */
1332static noinstr void unexpected_machine_check(struct pt_regs *regs)
1333{
1334 instrumentation_begin();
1335 pr_err("CPU#%d: Unexpected int18 (Machine Check)\n",
1336 smp_processor_id());
1337 instrumentation_end();
1338}
1339
b79109c3
AK
1340/*
1341 * The actual machine check handler. This only handles real
1342 * exceptions when something got corrupted coming in through int 18.
1343 *
1344 * This is executed in NMI context not subject to normal locking rules. This
1345 * implies that most kernel services cannot be safely used. Don't even
1346 * think about putting a printk in there!
3c079792
AK
1347 *
1348 * On Intel systems this is entered on all CPUs in parallel through
1349 * MCE broadcast. However some CPUs might be broken beyond repair,
1350 * so be always careful when synchronizing with others.
55ba18d6
AL
1351 *
1352 * Tracing and kprobes are disabled: if we interrupted a kernel context
1353 * with IF=1, we need to minimize stack usage. There are also recursion
1354 * issues: if the machine check was due to a failure of the memory
1355 * backing the user stack, tracing that reads the user stack will cause
1356 * potentially infinite recursion.
1da177e4 1357 */
7f6fa101 1358noinstr void do_machine_check(struct pt_regs *regs)
1da177e4 1359{
cbe1de16 1360 int worst = 0, order, no_way_out, kill_current_task, lmce;
d3d6923c
BP
1361 DECLARE_BITMAP(valid_banks, MAX_NR_BANKS);
1362 DECLARE_BITMAP(toclear, MAX_NR_BANKS);
1462594b 1363 struct mca_config *cfg = &mca_cfg;
3c079792 1364 struct mce m, *final;
7a8bc2b0 1365 char *msg = NULL;
cbe1de16
BP
1366
1367 if (unlikely(mce_flags.p5))
1368 return pentium_machine_check(regs);
1369 else if (unlikely(mce_flags.winchip))
1370 return winchip_machine_check(regs);
1371 else if (unlikely(!mca_cfg.initialized))
1372 return unexpected_machine_check(regs);
fead35c6 1373
3c079792
AK
1374 /*
1375 * Establish sequential order between the CPUs entering the machine
1376 * check handler.
1377 */
cbe1de16 1378 order = -1;
d3d6923c 1379
bd78432c
TH
1380 /*
1381 * If no_way_out gets set, there is no safe way to recover from this
d203f0b8 1382 * MCE. If mca_cfg.tolerant is cranked up, we'll try anyway.
bd78432c 1383 */
cbe1de16 1384 no_way_out = 0;
d3d6923c 1385
bd78432c 1386 /*
e1c06d23 1387 * If kill_current_task is not set, there might be a way to recover from this
bd78432c
TH
1388 * error.
1389 */
cbe1de16 1390 kill_current_task = 0;
fead35c6
YG
1391
1392 /*
1393 * MCEs are always local on AMD. Same is determined by MCG_STATUS_LMCES
1394 * on Intel.
1395 */
cbe1de16 1396 lmce = 1;
1da177e4 1397
c6ae41e7 1398 this_cpu_inc(mce_exception_count);
01ca79f1 1399
b8325c5b 1400 mce_gather_info(&m, regs);
669c00f0 1401 m.tsc = rdtsc();
b5f2fa4e 1402
89cbc767 1403 final = this_cpu_ptr(&mces_seen);
3c079792
AK
1404 *final = m;
1405
95022b8c 1406 memset(valid_banks, 0, sizeof(valid_banks));
61b0fccd 1407 no_way_out = mce_no_way_out(&m, &msg, valid_banks, regs);
680b6cfd 1408
1da177e4
LT
1409 barrier();
1410
ed7290d0 1411 /*
a8c321fb
TL
1412 * When no restart IP might need to kill or panic.
1413 * Assume the worst for now, but if we find the
1414 * severity is MCE_AR_SEVERITY we have other options.
ed7290d0
AK
1415 */
1416 if (!(m.mcgstatus & MCG_STATUS_RIPV))
e1c06d23 1417 kill_current_task = (cfg->tolerant == 3) ? 0 : 1;
3c079792 1418 /*
fead35c6 1419 * Check if this MCE is signaled to only this logical processor,
70f0c230 1420 * on Intel, Zhaoxin only.
3c079792 1421 */
70f0c230
TW
1422 if (m.cpuvendor == X86_VENDOR_INTEL ||
1423 m.cpuvendor == X86_VENDOR_ZHAOXIN)
fead35c6
YG
1424 lmce = m.mcgstatus & MCG_STATUS_LMCES;
1425
1426 /*
40c36e27
TL
1427 * Local machine check may already know that we have to panic.
1428 * Broadcast machine check begins rendezvous in mce_start()
fead35c6
YG
1429 * Go through all banks in exclusion of the other CPUs. This way we
1430 * don't report duplicated events on shared banks because the first one
40c36e27 1431 * to see it will clear it.
fead35c6 1432 */
40c36e27 1433 if (lmce) {
3a866b16 1434 if (no_way_out && cfg->tolerant < 3)
40c36e27
TL
1435 mce_panic("Fatal local machine check", &m, msg);
1436 } else {
243d657e 1437 order = mce_start(&no_way_out);
40c36e27 1438 }
243d657e 1439
41ce0564 1440 __mc_scan_banks(&m, regs, final, toclear, valid_banks, no_way_out, &worst);
a8c321fb 1441
3c079792
AK
1442 if (!no_way_out)
1443 mce_clear_state(toclear);
1444
e9eee03e 1445 /*
3c079792
AK
1446 * Do most of the synchronization with other CPUs.
1447 * When there's any problem use only local no_way_out state.
e9eee03e 1448 */
243d657e 1449 if (!lmce) {
25bc65d8
GP
1450 if (mce_end(order) < 0) {
1451 if (!no_way_out)
1452 no_way_out = worst >= MCE_PANIC_SEVERITY;
e273e6e1
GP
1453
1454 if (no_way_out && cfg->tolerant < 3)
1455 mce_panic("Fatal machine check on current CPU", &m, msg);
25bc65d8 1456 }
243d657e
AR
1457 } else {
1458 /*
40c36e27
TL
1459 * If there was a fatal machine check we should have
1460 * already called mce_panic earlier in this function.
1461 * Since we re-read the banks, we might have found
1462 * something new. Check again to see if we found a
1463 * fatal error. We call "mce_severity()" again to
1464 * make sure we have the right "msg".
243d657e 1465 */
40c36e27 1466 if (worst >= MCE_PANIC_SEVERITY && mca_cfg.tolerant < 3) {
41ce0564 1467 mce_severity(&m, regs, cfg->tolerant, &msg, true);
40c36e27
TL
1468 mce_panic("Local fatal machine check!", &m, msg);
1469 }
243d657e 1470 }
bd78432c 1471
e1c06d23 1472 if (worst != MCE_AR_SEVERITY && !kill_current_task)
1e36d9c6 1473 goto out;
d4812e16 1474
b2f9d678
TL
1475 /* Fault was in user mode and we need to take some action */
1476 if ((m.cs & 3) == 3) {
b052df3d
TG
1477 /* If this triggers there is no way to recover. Die hard. */
1478 BUG_ON(!on_thread_stack() || !user_mode(regs));
b2f9d678 1479
a6e3cf70
TL
1480 if (kill_current_task)
1481 queue_task_work(&m, msg, kill_me_now);
1482 else
1483 queue_task_work(&m, msg, kill_me_maybe);
c0ab7ffc 1484
b2f9d678 1485 } else {
1df73b21
BP
1486 /*
1487 * Handle an MCE which has happened in kernel space but from
1488 * which the kernel can recover: ex_has_fault_handler() has
1489 * already verified that the rIP at which the error happened is
1490 * a rIP from which the kernel can recover (by jumping to
1491 * recovery code specified in _ASM_EXTABLE_FAULT()) and the
1492 * corresponding exception handler which would do that is the
1493 * proper one.
1494 */
1495 if (m.kflags & MCE_IN_KERNEL_RECOV) {
f77d26a9 1496 if (!fixup_exception(regs, X86_TRAP_MC, 0, 0))
1df73b21
BP
1497 mce_panic("Failed kernel mode recovery", &m, msg);
1498 }
c0ab7ffc
TL
1499
1500 if (m.kflags & MCE_IN_KERNEL_COPYIN)
a6e3cf70 1501 queue_task_work(&m, msg, kill_me_never);
d4812e16 1502 }
1e36d9c6
TL
1503out:
1504 mce_wrmsrl(MSR_IA32_MCG_STATUS, 0);
1da177e4 1505}
ea149b36 1506EXPORT_SYMBOL_GPL(do_machine_check);
1da177e4 1507
cd42f4a3 1508#ifndef CONFIG_MEMORY_FAILURE
83b57531 1509int memory_failure(unsigned long pfn, int flags)
9b1beaf2 1510{
a8c321fb
TL
1511 /* mce_severity() should not hand us an ACTION_REQUIRED error */
1512 BUG_ON(flags & MF_ACTION_REQUIRED);
c767a54b
JP
1513 pr_err("Uncorrected memory error in page 0x%lx ignored\n"
1514 "Rebuild kernel with CONFIG_MEMORY_FAILURE=y for smarter handling\n",
1515 pfn);
cd42f4a3
TL
1516
1517 return 0;
9b1beaf2 1518}
cd42f4a3 1519#endif
9b1beaf2 1520
1da177e4 1521/*
8a336b0a
TH
1522 * Periodic polling timer for "silent" machine check errors. If the
1523 * poller finds an MCE, poll 2x faster. When the poller finds no more
1524 * errors, poll 2x slower (up to check_interval seconds).
1da177e4 1525 */
3f2f0680 1526static unsigned long check_interval = INITIAL_CHECK_INTERVAL;
e9eee03e 1527
82f7af09 1528static DEFINE_PER_CPU(unsigned long, mce_next_interval); /* in jiffies */
52d168e2 1529static DEFINE_PER_CPU(struct timer_list, mce_timer);
1da177e4 1530
55babd8f
CG
1531static unsigned long mce_adjust_timer_default(unsigned long interval)
1532{
1533 return interval;
1534}
1535
3f2f0680 1536static unsigned long (*mce_adjust_timer)(unsigned long interval) = mce_adjust_timer_default;
55babd8f 1537
0becc0ae 1538static void __start_timer(struct timer_list *t, unsigned long interval)
27f6c573 1539{
3f2f0680
BP
1540 unsigned long when = jiffies + interval;
1541 unsigned long flags;
27f6c573 1542
3f2f0680 1543 local_irq_save(flags);
27f6c573 1544
0becc0ae
TG
1545 if (!timer_pending(t) || time_before(when, t->expires))
1546 mod_timer(t, round_jiffies(when));
3f2f0680
BP
1547
1548 local_irq_restore(flags);
27f6c573
CG
1549}
1550
92bb6cb1 1551static void mce_timer_fn(struct timer_list *t)
1da177e4 1552{
92bb6cb1 1553 struct timer_list *cpu_t = this_cpu_ptr(&mce_timer);
82f7af09 1554 unsigned long iv;
52d168e2 1555
92bb6cb1 1556 WARN_ON(cpu_t != t);
3f2f0680
BP
1557
1558 iv = __this_cpu_read(mce_next_interval);
52d168e2 1559
89cbc767 1560 if (mce_available(this_cpu_ptr(&cpu_info))) {
54467353 1561 machine_check_poll(0, this_cpu_ptr(&mce_poll_banks));
3f2f0680
BP
1562
1563 if (mce_intel_cmci_poll()) {
1564 iv = mce_adjust_timer(iv);
1565 goto done;
1566 }
e9eee03e 1567 }
1da177e4
LT
1568
1569 /*
3f2f0680
BP
1570 * Alert userspace if needed. If we logged an MCE, reduce the polling
1571 * interval, otherwise increase the polling interval.
1da177e4 1572 */
3f2f0680 1573 if (mce_notify_irq())
958fb3c5 1574 iv = max(iv / 2, (unsigned long) HZ/100);
3f2f0680 1575 else
82f7af09 1576 iv = min(iv * 2, round_jiffies_relative(check_interval * HZ));
3f2f0680
BP
1577
1578done:
82f7af09 1579 __this_cpu_write(mce_next_interval, iv);
0becc0ae 1580 __start_timer(t, iv);
55babd8f 1581}
e02e68d3 1582
55babd8f
CG
1583/*
1584 * Ensure that the timer is firing in @interval from now.
1585 */
1586void mce_timer_kick(unsigned long interval)
1587{
89cbc767 1588 struct timer_list *t = this_cpu_ptr(&mce_timer);
55babd8f
CG
1589 unsigned long iv = __this_cpu_read(mce_next_interval);
1590
0becc0ae 1591 __start_timer(t, interval);
3f2f0680 1592
55babd8f
CG
1593 if (interval < iv)
1594 __this_cpu_write(mce_next_interval, interval);
e02e68d3
TH
1595}
1596
9aaef96f
HS
1597/* Must not be called in IRQ context where del_timer_sync() can deadlock */
1598static void mce_timer_delete_all(void)
1599{
1600 int cpu;
1601
1602 for_each_online_cpu(cpu)
1603 del_timer_sync(&per_cpu(mce_timer, cpu));
1604}
1605
e02e68d3 1606/*
9bd98405
AK
1607 * Notify the user(s) about new machine check events.
1608 * Can be called from interrupt context, but not from machine check/NMI
1609 * context.
e02e68d3 1610 */
9ff36ee9 1611int mce_notify_irq(void)
e02e68d3 1612{
8457c84d
AK
1613 /* Not more than two messages every minute */
1614 static DEFINE_RATELIMIT_STATE(ratelimit, 60*HZ, 2);
1615
1020bcbc 1616 if (test_and_clear_bit(0, &mce_need_notify)) {
5de97c9f 1617 mce_work_trigger();
e02e68d3 1618
8457c84d 1619 if (__ratelimit(&ratelimit))
a2d7b0d4 1620 pr_info(HW_ERR "Machine check events logged\n");
e02e68d3
TH
1621
1622 return 1;
1da177e4 1623 }
e02e68d3
TH
1624 return 0;
1625}
9ff36ee9 1626EXPORT_SYMBOL_GPL(mce_notify_irq);
8a336b0a 1627
b4914508 1628static void __mcheck_cpu_mce_banks_init(void)
cebe1820 1629{
b4914508 1630 struct mce_bank *mce_banks = this_cpu_ptr(mce_banks_array);
c7d314f3 1631 u8 n_banks = this_cpu_read(mce_num_banks);
cebe1820
AK
1632 int i;
1633
c7d314f3 1634 for (i = 0; i < n_banks; i++) {
cebe1820 1635 struct mce_bank *b = &mce_banks[i];
11868a2d 1636
068b053d
YG
1637 /*
1638 * Init them all, __mcheck_cpu_apply_quirks() is going to apply
1639 * the required vendor quirks before
1640 * __mcheck_cpu_init_clear_banks() does the final bank setup.
1641 */
cebe1820 1642 b->ctl = -1ULL;
77080929 1643 b->init = true;
cebe1820 1644 }
cebe1820
AK
1645}
1646
d88203d1 1647/*
1da177e4
LT
1648 * Initialize Machine Checks for a CPU.
1649 */
b4914508 1650static void __mcheck_cpu_cap_init(void)
1da177e4 1651{
e9eee03e 1652 u64 cap;
006c0770 1653 u8 b;
1da177e4
LT
1654
1655 rdmsrl(MSR_IA32_MCG_CAP, cap);
01c6680a
TG
1656
1657 b = cap & MCG_BANKCNT_MASK;
c7d314f3
YG
1658
1659 if (b > MAX_NR_BANKS) {
1660 pr_warn("CPU%d: Using only %u machine check banks out of %u\n",
1661 smp_processor_id(), MAX_NR_BANKS, b);
0d7482e3 1662 b = MAX_NR_BANKS;
c7d314f3 1663 }
0d7482e3 1664
c7d314f3 1665 this_cpu_write(mce_num_banks, b);
d203f0b8 1666
b4914508 1667 __mcheck_cpu_mce_banks_init();
0d7482e3 1668
94ad8474 1669 /* Use accurate RIP reporting if available. */
01c6680a 1670 if ((cap & MCG_EXT_P) && MCG_EXT_CNT(cap) >= 9)
84c2559d 1671 mca_cfg.rip_msr = MSR_IA32_MCG_EIP;
1da177e4 1672
ed7290d0 1673 if (cap & MCG_SER_P)
09933946 1674 mca_cfg.ser = 1;
0d7482e3
AK
1675}
1676
5e09954a 1677static void __mcheck_cpu_init_generic(void)
0d7482e3 1678{
84c2559d 1679 enum mcp_flags m_fl = 0;
e9eee03e 1680 mce_banks_t all_banks;
0d7482e3 1681 u64 cap;
0d7482e3 1682
84c2559d
BP
1683 if (!mca_cfg.bootlog)
1684 m_fl = MCP_DONTLOG;
1685
b79109c3 1686 /*
3bff147b
BP
1687 * Log the machine checks left over from the previous reset. Log them
1688 * only, do not start processing them. That will happen in mcheck_late_init()
1689 * when all consumers have been registered on the notifier chain.
b79109c3 1690 */
ee031c31 1691 bitmap_fill(all_banks, MAX_NR_BANKS);
3bff147b 1692 machine_check_poll(MCP_UC | MCP_QUEUE_LOG | m_fl, &all_banks);
1da177e4 1693
375074cc 1694 cr4_set_bits(X86_CR4_MCE);
1da177e4 1695
0d7482e3 1696 rdmsrl(MSR_IA32_MCG_CAP, cap);
1da177e4
LT
1697 if (cap & MCG_CTL_P)
1698 wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff);
bb91f8c0
AG
1699}
1700
1701static void __mcheck_cpu_init_clear_banks(void)
1702{
b4914508 1703 struct mce_bank *mce_banks = this_cpu_ptr(mce_banks_array);
bb91f8c0 1704 int i;
1da177e4 1705
c7d314f3 1706 for (i = 0; i < this_cpu_read(mce_num_banks); i++) {
cebe1820 1707 struct mce_bank *b = &mce_banks[i];
11868a2d 1708
cebe1820 1709 if (!b->init)
06b7a7a5 1710 continue;
8121b8f9
BP
1711 wrmsrl(mca_msr_reg(i, MCA_CTL), b->ctl);
1712 wrmsrl(mca_msr_reg(i, MCA_STATUS), 0);
d88203d1 1713 }
1da177e4
LT
1714}
1715
068b053d
YG
1716/*
1717 * Do a final check to see if there are any unused/RAZ banks.
1718 *
1719 * This must be done after the banks have been initialized and any quirks have
1720 * been applied.
1721 *
1722 * Do not call this from any user-initiated flows, e.g. CPU hotplug or sysfs.
1723 * Otherwise, a user who disables a bank will not be able to re-enable it
1724 * without a system reboot.
1725 */
1726static void __mcheck_cpu_check_banks(void)
1727{
1728 struct mce_bank *mce_banks = this_cpu_ptr(mce_banks_array);
1729 u64 msrval;
1730 int i;
1731
1732 for (i = 0; i < this_cpu_read(mce_num_banks); i++) {
1733 struct mce_bank *b = &mce_banks[i];
1734
1735 if (!b->init)
1736 continue;
1737
8121b8f9 1738 rdmsrl(mca_msr_reg(i, MCA_CTL), msrval);
068b053d
YG
1739 b->init = !!msrval;
1740 }
1741}
1742
1da177e4 1743/* Add per CPU specific workarounds here */
148f9bb8 1744static int __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c)
d88203d1 1745{
b4914508 1746 struct mce_bank *mce_banks = this_cpu_ptr(mce_banks_array);
d203f0b8
BP
1747 struct mca_config *cfg = &mca_cfg;
1748
e412cd25 1749 if (c->x86_vendor == X86_VENDOR_UNKNOWN) {
c767a54b 1750 pr_info("unknown CPU type - not enabling MCE support\n");
e412cd25
IM
1751 return -EOPNOTSUPP;
1752 }
1753
1da177e4 1754 /* This should be disabled by the BIOS, but isn't always */
911f6a7b 1755 if (c->x86_vendor == X86_VENDOR_AMD) {
c7d314f3 1756 if (c->x86 == 15 && this_cpu_read(mce_num_banks) > 4) {
e9eee03e
IM
1757 /*
1758 * disable GART TBL walk error reporting, which
1759 * trips off incorrectly with the IOMMU & 3ware
1760 * & Cerberus:
1761 */
cebe1820 1762 clear_bit(10, (unsigned long *)&mce_banks[4].ctl);
e9eee03e 1763 }
6057077f 1764 if (c->x86 < 0x11 && cfg->bootlog < 0) {
e9eee03e
IM
1765 /*
 1766 * Lots of broken BIOSes around that don't clear them
1767 * by default and leave crap in there. Don't log:
1768 */
84c2559d 1769 cfg->bootlog = 0;
e9eee03e 1770 }
2e6f694f
AK
1771 /*
 1772 * Various K7s with broken bank 0 are around. Always disable
 1773 * it by default.
1774 */
c7d314f3 1775 if (c->x86 == 6 && this_cpu_read(mce_num_banks) > 0)
cebe1820 1776 mce_banks[0].ctl = 0;
575203b4 1777
bf80bbd7
AG
1778 /*
1779 * overflow_recov is supported for F15h Models 00h-0fh
1780 * even though we don't have a CPUID bit for it.
1781 */
1782 if (c->x86 == 0x15 && c->x86_model <= 0xf)
1783 mce_flags.overflow_recov = 1;
1784
1da177e4 1785 }
e583538f 1786
06b7a7a5
AK
1787 if (c->x86_vendor == X86_VENDOR_INTEL) {
1788 /*
 1789 * The SDM documents that on family 6, bank 0 should not be written
 1790 * because it aliases to another special BIOS-controlled
 1791 * register.
 1792 * However, it is no longer aliased on model 0x1a+.
 1793 * Don't ignore bank 0 completely because there could be a
 1794 * valid event later; merely don't write CTL0.
1795 */
1796
c7d314f3 1797 if (c->x86 == 6 && c->x86_model < 0x1A && this_cpu_read(mce_num_banks) > 0)
77080929 1798 mce_banks[0].init = false;
3c079792
AK
1799
1800 /*
1801 * All newer Intel systems support MCE broadcasting. Enable
1802 * synchronization with a one second timeout.
1803 */
1804 if ((c->x86 > 6 || (c->x86 == 6 && c->x86_model >= 0xe)) &&
84c2559d
BP
1805 cfg->monarch_timeout < 0)
1806 cfg->monarch_timeout = USEC_PER_SEC;
c7f6fa44 1807
e412cd25
IM
1808 /*
1809 * There are also broken BIOSes on some Pentium M and
1810 * earlier systems:
1811 */
84c2559d
BP
1812 if (c->x86 == 6 && c->x86_model <= 13 && cfg->bootlog < 0)
1813 cfg->bootlog = 0;
61b0fccd
TL
1814
1815 if (c->x86 == 6 && c->x86_model == 45)
cc466666 1816 mce_flags.snb_ifu_quirk = 1;
06b7a7a5 1817 }
6e898d2b
TW
1818
1819 if (c->x86_vendor == X86_VENDOR_ZHAOXIN) {
1820 /*
1821 * All newer Zhaoxin CPUs support MCE broadcasting. Enable
1822 * synchronization with a one second timeout.
1823 */
1824 if (c->x86 > 6 || (c->x86_model == 0x19 || c->x86_model == 0x1f)) {
1825 if (cfg->monarch_timeout < 0)
1826 cfg->monarch_timeout = USEC_PER_SEC;
1827 }
1828 }
1829
84c2559d
BP
1830 if (cfg->monarch_timeout < 0)
1831 cfg->monarch_timeout = 0;
1832 if (cfg->bootlog != 0)
7af19e4a 1833 cfg->panic_timeout = 30;
e412cd25
IM
1834
1835 return 0;
d88203d1 1836}
1da177e4 1837
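/*
 * Family 5 CPUs (Intel P5, Centaur/WinChip) have their own, simpler MCE
 * implementations. Returns 1 if one of them was set up here.
 */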
148f9bb8 1838static int __mcheck_cpu_ancient_init(struct cpuinfo_x86 *c)
4efc0670
AK
1839{
1840 if (c->x86 != 5)
3a97fc34
HS
1841 return 0;
1842
4efc0670
AK
1843 switch (c->x86_vendor) {
1844 case X86_VENDOR_INTEL:
c6978369 1845 intel_p5_mcheck_init(c);
cbe1de16 1846 mce_flags.p5 = 1;
3a97fc34 1847 return 1;
4efc0670
AK
1848 case X86_VENDOR_CENTAUR:
1849 winchip_mcheck_init(c);
cbe1de16 1850 mce_flags.winchip = 1;
3a97fc34 1851 return 1;
dc34bdd2
BP
1852 default:
1853 return 0;
4efc0670 1854 }
3a97fc34
HS
1855
1856 return 0;
4efc0670
AK
1857}
1858
5204bf17
YG
1859/*
1860 * Init basic CPU features needed for early decoding of MCEs.
1861 */
1862static void __mcheck_cpu_init_early(struct cpuinfo_x86 *c)
1da177e4 1863{
ac78bd72 1864 if (c->x86_vendor == X86_VENDOR_AMD || c->x86_vendor == X86_VENDOR_HYGON) {
14cddfd5
YG
1865 mce_flags.overflow_recov = !!cpu_has(c, X86_FEATURE_OVERFLOW_RECOV);
1866 mce_flags.succor = !!cpu_has(c, X86_FEATURE_SUCCOR);
1867 mce_flags.smca = !!cpu_has(c, X86_FEATURE_SMCA);
c9bf318f 1868 mce_flags.amd_threshold = 1;
5204bf17
YG
1869 }
1870}
c7f54d21 1871
13e85822
DW
1872static void mce_centaur_feature_init(struct cpuinfo_x86 *c)
1873{
1874 struct mca_config *cfg = &mca_cfg;
1875
1876 /*
1877 * All newer Centaur CPUs support MCE broadcasting. Enable
1878 * synchronization with a one second timeout.
1879 */
1880 if ((c->x86 == 6 && c->x86_model == 0xf && c->x86_stepping >= 0xe) ||
1881 c->x86 > 6) {
1882 if (cfg->monarch_timeout < 0)
1883 cfg->monarch_timeout = USEC_PER_SEC;
1884 }
1885}
1886
5a3d56a0
TW
1887static void mce_zhaoxin_feature_init(struct cpuinfo_x86 *c)
1888{
1889 struct mce_bank *mce_banks = this_cpu_ptr(mce_banks_array);
1890
1891 /*
1892 * These CPUs have MCA bank 8 which reports only one error type called
1893 * SVAD (System View Address Decoder). The reporting of that error is
1894 * controlled by IA32_MC8.CTL.0.
1895 *
1896 * If enabled, prefetching on these CPUs will cause SVAD MCE when
1897 * virtual machines start and result in a system panic. Always disable
1898 * bank 8 SVAD error by default.
1899 */
1900 if ((c->x86 == 7 && c->x86_model == 0x1b) ||
1901 (c->x86_model == 0x19 || c->x86_model == 0x1f)) {
1902 if (this_cpu_read(mce_num_banks) > 8)
1903 mce_banks[8].ctl = 0;
1904 }
1905
1906 intel_init_cmci();
70f0c230 1907 intel_init_lmce();
5a3d56a0
TW
1908 mce_adjust_timer = cmci_intel_adjust_timer;
1909}
1910
70f0c230
TW
1911static void mce_zhaoxin_feature_clear(struct cpuinfo_x86 *c)
1912{
1913 intel_clear_lmce();
1914}
1915
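/* Dispatch to the vendor-specific MCE feature initialization. */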
5204bf17
YG
1916static void __mcheck_cpu_init_vendor(struct cpuinfo_x86 *c)
1917{
1918 switch (c->x86_vendor) {
1919 case X86_VENDOR_INTEL:
1920 mce_intel_feature_init(c);
1921 mce_adjust_timer = cmci_intel_adjust_timer;
1922 break;
c7f54d21 1923
5204bf17
YG
1924 case X86_VENDOR_AMD: {
1925 mce_amd_feature_init(c);
89b831ef 1926 break;
7559e13f 1927 }
ac78bd72
PW
1928
1929 case X86_VENDOR_HYGON:
1930 mce_hygon_feature_init(c);
1931 break;
1932
13e85822
DW
1933 case X86_VENDOR_CENTAUR:
1934 mce_centaur_feature_init(c);
1935 break;
7559e13f 1936
5a3d56a0
TW
1937 case X86_VENDOR_ZHAOXIN:
1938 mce_zhaoxin_feature_init(c);
1939 break;
1940
1da177e4
LT
1941 default:
1942 break;
1943 }
1944}
1945
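/* Undo vendor-specific MCE opt-ins, currently LMCE on Intel and Zhaoxin. */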
8838eb6c
AR
1946static void __mcheck_cpu_clear_vendor(struct cpuinfo_x86 *c)
1947{
1948 switch (c->x86_vendor) {
1949 case X86_VENDOR_INTEL:
1950 mce_intel_feature_clear(c);
1951 break;
70f0c230
TW
1952
1953 case X86_VENDOR_ZHAOXIN:
1954 mce_zhaoxin_feature_clear(c);
1955 break;
1956
8838eb6c
AR
1957 default:
1958 break;
1959 }
1960}
1961
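/*
 * Arm the per-CPU polling timer unless corrected-error polling is disabled
 * (ignore_ce or a zero check_interval).
 */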
0becc0ae 1962static void mce_start_timer(struct timer_list *t)
52d168e2 1963{
4f75d841 1964 unsigned long iv = check_interval * HZ;
bc09effa 1965
7af19e4a 1966 if (mca_cfg.ignore_ce || !iv)
62fdac59
HS
1967 return;
1968
0becc0ae
TG
1969 this_cpu_write(mce_next_interval, iv);
1970 __start_timer(t, iv);
52d168e2
AK
1971}
1972
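/* Prepare the per-CPU timer; it is armed later via mce_start_timer(). */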
39f152ff
SAS
1973static void __mcheck_cpu_setup_timer(void)
1974{
1975 struct timer_list *t = this_cpu_ptr(&mce_timer);
39f152ff 1976
92bb6cb1 1977 timer_setup(t, mce_timer_fn, TIMER_PINNED);
39f152ff
SAS
1978}
1979
26c3c283
TG
1980static void __mcheck_cpu_init_timer(void)
1981{
89cbc767 1982 struct timer_list *t = this_cpu_ptr(&mce_timer);
26c3c283 1983
92bb6cb1 1984 timer_setup(t, mce_timer_fn, TIMER_PINNED);
0becc0ae 1985 mce_start_timer(t);
26c3c283
TG
1986}
1987
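/* Let the vendor code drop error records it considers spurious. */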
45d4b7b9
YG
1988bool filter_mce(struct mce *m)
1989{
71a84402
YG
1990 if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD)
1991 return amd_filter_mce(m);
2976908e
PB
1992 if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL)
1993 return intel_filter_mce(m);
71a84402 1994
45d4b7b9
YG
1995 return false;
1996}
1997
4c0dcd83 1998static __always_inline void exc_machine_check_kernel(struct pt_regs *regs)
6f41c34d 1999{
b6be002b 2000 irqentry_state_t irq_state;
bc21a291 2001
13cbc0cd
AL
2002 WARN_ON_ONCE(user_mode(regs));
2003
4c0dcd83
TG
2004 /*
2005 * Only required when from kernel mode. See
2006 * mce_check_crashing_cpu() for details.
2007 */
cbe1de16 2008 if (mca_cfg.initialized && mce_check_crashing_cpu())
94a46d31
TG
2009 return;
2010
b6be002b 2011 irq_state = irqentry_nmi_enter(regs);
73749536 2012
cbe1de16 2013 do_machine_check(regs);
73749536 2014
b6be002b 2015 irqentry_nmi_exit(regs, irq_state);
4c0dcd83 2016}
94a46d31 2017
4c0dcd83
TG
2018static __always_inline void exc_machine_check_user(struct pt_regs *regs)
2019{
517e4992 2020 irqentry_enter_from_user_mode(regs);
73749536 2021
cbe1de16 2022 do_machine_check(regs);
73749536 2023
517e4992 2024 irqentry_exit_to_user_mode(regs);
6f41c34d 2025}
94a46d31 2026
4c0dcd83
TG
2027#ifdef CONFIG_X86_64
2028/* MCE hit kernel mode */
2029DEFINE_IDTENTRY_MCE(exc_machine_check)
2030{
cd840e42
PZ
2031 unsigned long dr7;
2032
2033 dr7 = local_db_save();
4c0dcd83 2034 exc_machine_check_kernel(regs);
cd840e42 2035 local_db_restore(dr7);
4c0dcd83
TG
2036}
2037
2038/* The user mode variant. */
2039DEFINE_IDTENTRY_MCE_USER(exc_machine_check)
2040{
cd840e42
PZ
2041 unsigned long dr7;
2042
2043 dr7 = local_db_save();
4c0dcd83 2044 exc_machine_check_user(regs);
cd840e42 2045 local_db_restore(dr7);
4c0dcd83
TG
2046}
2047#else
2048/* 32bit unified entry point */
13cbc0cd 2049DEFINE_IDTENTRY_RAW(exc_machine_check)
4c0dcd83 2050{
cd840e42
PZ
2051 unsigned long dr7;
2052
2053 dr7 = local_db_save();
8cd501c1 2054 if (user_mode(regs))
4c0dcd83 2055 exc_machine_check_user(regs);
8cd501c1 2056 else
4c0dcd83 2057 exc_machine_check_kernel(regs);
cd840e42 2058 local_db_restore(dr7);
6f41c34d 2059}
4c0dcd83 2060#endif
6f41c34d 2061
d88203d1 2062/*
1da177e4 2063 * Called for each booted CPU to set up machine checks.
e9eee03e 2064 * Must be called with preempt off:
1da177e4 2065 */
148f9bb8 2066void mcheck_cpu_init(struct cpuinfo_x86 *c)
1da177e4 2067{
1462594b 2068 if (mca_cfg.disabled)
4efc0670
AK
2069 return;
2070
3a97fc34
HS
2071 if (__mcheck_cpu_ancient_init(c))
2072 return;
4efc0670 2073
5b4408fd 2074 if (!mce_available(c))
1da177e4
LT
2075 return;
2076
b4914508
YG
2077 __mcheck_cpu_cap_init();
2078
2079 if (__mcheck_cpu_apply_quirks(c) < 0) {
09933946 2080 mca_cfg.disabled = 1;
0d7482e3
AK
2081 return;
2082 }
0d7482e3 2083
648ed940 2084 if (mce_gen_pool_init()) {
09933946 2085 mca_cfg.disabled = 1;
648ed940
CG
2086 pr_emerg("Couldn't allocate MCE records pool!\n");
2087 return;
2088 }
2089
cbe1de16 2090 mca_cfg.initialized = 1;
5d727926 2091
5204bf17 2092 __mcheck_cpu_init_early(c);
5e09954a
BP
2093 __mcheck_cpu_init_generic();
2094 __mcheck_cpu_init_vendor(c);
bb91f8c0 2095 __mcheck_cpu_init_clear_banks();
068b053d 2096 __mcheck_cpu_check_banks();
39f152ff 2097 __mcheck_cpu_setup_timer();
1da177e4
LT
2098}
2099
8838eb6c
AR
2100/*
 2101 * Called for each booted CPU to clear some machine check opt-ins
2102 */
2103void mcheck_cpu_clear(struct cpuinfo_x86 *c)
2104{
2105 if (mca_cfg.disabled)
2106 return;
2107
2108 if (!mce_available(c))
2109 return;
2110
2111 /*
 2112 * Possibly clear general settings generic to x86:
2113 * __mcheck_cpu_clear_generic(c);
2114 */
2115 __mcheck_cpu_clear_vendor(c);
2116
1da177e4
LT
2117}
2118
c3d1fb56
NR
2119static void __mce_disable_bank(void *arg)
2120{
2121 int bank = *((int *)arg);
89cbc767 2122 __clear_bit(bank, this_cpu_ptr(mce_poll_banks));
c3d1fb56
NR
2123 cmci_disable_bank(bank);
2124}
2125
2126void mce_disable_bank(int bank)
2127{
c7d314f3 2128 if (bank >= this_cpu_read(mce_num_banks)) {
c3d1fb56
NR
2129 pr_warn(FW_BUG
2130 "Ignoring request to disable invalid MCA bank %d.\n",
2131 bank);
2132 return;
2133 }
2134 set_bit(bank, mce_banks_ce_disabled);
2135 on_each_cpu(__mce_disable_bank, &bank, 1);
2136}
2137
13503fa9 2138/*
62fdac59
HS
2139 * mce=off Disables machine check
2140 * mce=no_cmci Disables CMCI
88d53867 2141 * mce=no_lmce Disables LMCE
62fdac59 2142 * mce=dont_log_ce Clears corrected events silently, no log created for CEs.
43505646 2143 * mce=print_all Print all machine check logs to console
62fdac59 2144 * mce=ignore_ce Disables polling and CMCI, corrected events are not cleared.
3c079792
AK
2145 * mce=TOLERANCELEVEL[,monarchtimeout] (number, see above)
2146 * monarchtimeout is how long to wait for other CPUs on machine
2147 * check, or 0 to not wait
6057077f
YG
2148 * mce=bootlog Log MCEs from before booting. Disabled by default on AMD Fam10h
 2149 *	and older.
13503fa9 2150 * mce=nobootlog Don't log MCEs from before booting.
450cc201 2151 * mce=bios_cmci_threshold Don't program the CMCI threshold
ec6347bb 2152 * mce=recovery force enable copy_mc_fragile()
13503fa9 2153 */
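/*
 * Example kernel command line usage (one option per "mce=" instance),
 * e.g. "mce=no_cmci" or "mce=2,500000", the latter setting tolerant level 2
 * and a 500 ms (500000 us) monarch timeout.
 */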
1da177e4
LT
2154static int __init mcheck_enable(char *str)
2155{
d203f0b8
BP
2156 struct mca_config *cfg = &mca_cfg;
2157
e3346fc4 2158 if (*str == 0) {
4efc0670 2159 enable_p5_mce();
e3346fc4
BZ
2160 return 1;
2161 }
4efc0670
AK
2162 if (*str == '=')
2163 str++;
1da177e4 2164 if (!strcmp(str, "off"))
09933946 2165 cfg->disabled = 1;
62fdac59 2166 else if (!strcmp(str, "no_cmci"))
7af19e4a 2167 cfg->cmci_disabled = true;
88d53867 2168 else if (!strcmp(str, "no_lmce"))
09933946 2169 cfg->lmce_disabled = 1;
62fdac59 2170 else if (!strcmp(str, "dont_log_ce"))
d203f0b8 2171 cfg->dont_log_ce = true;
43505646
TL
2172 else if (!strcmp(str, "print_all"))
2173 cfg->print_all = true;
62fdac59 2174 else if (!strcmp(str, "ignore_ce"))
7af19e4a 2175 cfg->ignore_ce = true;
13503fa9 2176 else if (!strcmp(str, "bootlog") || !strcmp(str, "nobootlog"))
84c2559d 2177 cfg->bootlog = (str[0] == 'b');
450cc201 2178 else if (!strcmp(str, "bios_cmci_threshold"))
09933946 2179 cfg->bios_cmci_threshold = 1;
0f68c088 2180 else if (!strcmp(str, "recovery"))
09933946 2181 cfg->recovery = 1;
3c079792 2182 else if (isdigit(str[0])) {
5c31b280 2183 if (get_option(&str, &cfg->tolerant) == 2)
84c2559d 2184 get_option(&str, &(cfg->monarch_timeout));
3c079792 2185 } else {
c767a54b 2186 pr_info("mce argument %s ignored. Please use /sys\n", str);
13503fa9
HS
2187 return 0;
2188 }
9b41046c 2189 return 1;
1da177e4 2190}
4efc0670 2191__setup("mce", mcheck_enable);
1da177e4 2192
a2202aa2 2193int __init mcheck_init(void)
b33a6363 2194{
c9c6d216 2195 mce_register_decode_chain(&early_nb);
8438b84a 2196 mce_register_decode_chain(&mce_uc_nb);
cd9c57ca 2197 mce_register_decode_chain(&mce_default_nb);
a2202aa2 2198
cff4c039 2199 INIT_WORK(&mce_work, mce_gen_pool_process);
061120ae
CG
2200 init_irq_work(&mce_irq_work, mce_irq_work_cb);
2201
b33a6363
BP
2202 return 0;
2203}
b33a6363 2204
d88203d1 2205/*
c7cece89 2206 * mce_syscore: PM support
d88203d1 2207 */
1da177e4 2208
973a2dd1
AK
2209/*
2210 * Disable machine checks on suspend and shutdown. We can't really handle
2211 * them later.
2212 */
6e06780a 2213static void mce_disable_error_reporting(void)
973a2dd1 2214{
b4914508 2215 struct mce_bank *mce_banks = this_cpu_ptr(mce_banks_array);
973a2dd1
AK
2216 int i;
2217
c7d314f3 2218 for (i = 0; i < this_cpu_read(mce_num_banks); i++) {
cebe1820 2219 struct mce_bank *b = &mce_banks[i];
11868a2d 2220
cebe1820 2221 if (b->init)
8121b8f9 2222 wrmsrl(mca_msr_reg(i, MCA_CTL), 0);
06b7a7a5 2223 }
6e06780a
AR
2224 return;
2225}
2226
2227static void vendor_disable_error_reporting(void)
2228{
2229 /*
6e898d2b
TW
2230 * Don't clear on Intel or AMD or Hygon or Zhaoxin CPUs. Some of these
2231 * MSRs are socket-wide. Disabling them for just a single offlined CPU
2232 * is bad, since it will inhibit reporting for all shared resources on
2233 * the socket like the last level cache (LLC), the integrated memory
2234 * controller (iMC), etc.
6e06780a 2235 */
ec338382 2236 if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL ||
ac78bd72 2237 boot_cpu_data.x86_vendor == X86_VENDOR_HYGON ||
6e898d2b
TW
2238 boot_cpu_data.x86_vendor == X86_VENDOR_AMD ||
2239 boot_cpu_data.x86_vendor == X86_VENDOR_ZHAOXIN)
6e06780a
AR
2240 return;
2241
2242 mce_disable_error_reporting();
973a2dd1
AK
2243}
2244
c7cece89 2245static int mce_syscore_suspend(void)
973a2dd1 2246{
6e06780a
AR
2247 vendor_disable_error_reporting();
2248 return 0;
973a2dd1
AK
2249}
2250
c7cece89 2251static void mce_syscore_shutdown(void)
973a2dd1 2252{
6e06780a 2253 vendor_disable_error_reporting();
973a2dd1
AK
2254}
2255
e9eee03e
IM
2256/*
2257 * On resume clear all MCE state. Don't want to see leftovers from the BIOS.
2258 * Only one CPU is active at this time, the others get re-added later using
2259 * CPU hotplug:
2260 */
c7cece89 2261static void mce_syscore_resume(void)
1da177e4 2262{
5e09954a 2263 __mcheck_cpu_init_generic();
89cbc767 2264 __mcheck_cpu_init_vendor(raw_cpu_ptr(&cpu_info));
bb91f8c0 2265 __mcheck_cpu_init_clear_banks();
1da177e4
LT
2266}
2267
f3c6ea1b 2268static struct syscore_ops mce_syscore_ops = {
c7cece89
HS
2269 .suspend = mce_syscore_suspend,
2270 .shutdown = mce_syscore_shutdown,
2271 .resume = mce_syscore_resume,
f3c6ea1b
RW
2272};
2273
c7cece89 2274/*
8a25a2fd 2275 * mce_device: Sysfs support
c7cece89
HS
2276 */
2277
52d168e2
AK
2278static void mce_cpu_restart(void *data)
2279{
89cbc767 2280 if (!mce_available(raw_cpu_ptr(&cpu_info)))
33edbf02 2281 return;
5e09954a 2282 __mcheck_cpu_init_generic();
bb91f8c0 2283 __mcheck_cpu_init_clear_banks();
5e09954a 2284 __mcheck_cpu_init_timer();
52d168e2
AK
2285}
2286
1da177e4 2287/* Reinit MCEs after user configuration changes */
d88203d1
TG
2288static void mce_restart(void)
2289{
9aaef96f 2290 mce_timer_delete_all();
52d168e2 2291 on_each_cpu(mce_cpu_restart, NULL, 1);
1da177e4
LT
2292}
2293
9af43b54 2294/* Toggle features for corrected errors */
9aaef96f 2295static void mce_disable_cmci(void *data)
9af43b54 2296{
89cbc767 2297 if (!mce_available(raw_cpu_ptr(&cpu_info)))
9af43b54 2298 return;
9af43b54
HS
2299 cmci_clear();
2300}
2301
2302static void mce_enable_ce(void *all)
2303{
89cbc767 2304 if (!mce_available(raw_cpu_ptr(&cpu_info)))
9af43b54
HS
2305 return;
2306 cmci_reenable();
2307 cmci_recheck();
2308 if (all)
5e09954a 2309 __mcheck_cpu_init_timer();
9af43b54
HS
2310}
2311
8a25a2fd 2312static struct bus_type mce_subsys = {
e9eee03e 2313 .name = "machinecheck",
8a25a2fd 2314 .dev_name = "machinecheck",
1da177e4
LT
2315};
2316
d6126ef5 2317DEFINE_PER_CPU(struct device *, mce_device);
e9eee03e 2318
b4914508 2319static inline struct mce_bank_dev *attr_to_bank(struct device_attribute *attr)
cebe1820 2320{
b4914508 2321 return container_of(attr, struct mce_bank_dev, attr);
cebe1820 2322}
0d7482e3 2323
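/* sysfs: show the control value of one MCA bank for the given CPU. */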
8a25a2fd 2324static ssize_t show_bank(struct device *s, struct device_attribute *attr,
0d7482e3
AK
2325 char *buf)
2326{
b4914508
YG
2327 u8 bank = attr_to_bank(attr)->bank;
2328 struct mce_bank *b;
2329
c7d314f3 2330 if (bank >= per_cpu(mce_num_banks, s->id))
b4914508
YG
2331 return -EINVAL;
2332
2333 b = &per_cpu(mce_banks_array, s->id)[bank];
2334
068b053d
YG
2335 if (!b->init)
2336 return -ENODEV;
2337
b4914508 2338 return sprintf(buf, "%llx\n", b->ctl);
0d7482e3
AK
2339}
2340
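/* sysfs: set one bank's control value and reinitialize machine checks. */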
8a25a2fd 2341static ssize_t set_bank(struct device *s, struct device_attribute *attr,
9319cec8 2342 const char *buf, size_t size)
0d7482e3 2343{
b4914508
YG
2344 u8 bank = attr_to_bank(attr)->bank;
2345 struct mce_bank *b;
9319cec8 2346 u64 new;
e9eee03e 2347
164109e3 2348 if (kstrtou64(buf, 0, &new) < 0)
0d7482e3 2349 return -EINVAL;
e9eee03e 2350
c7d314f3 2351 if (bank >= per_cpu(mce_num_banks, s->id))
b4914508
YG
2352 return -EINVAL;
2353
2354 b = &per_cpu(mce_banks_array, s->id)[bank];
2355
068b053d
YG
2356 if (!b->init)
2357 return -ENODEV;
2358
b4914508 2359 b->ctl = new;
0d7482e3 2360 mce_restart();
e9eee03e 2361
9319cec8 2362 return size;
0d7482e3 2363}
a98f0dd3 2364
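/*
 * sysfs: flip mca_cfg.ignore_ce and enable/disable the corrected-error
 * polling timer and CMCI accordingly.
 */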
8a25a2fd
KS
2365static ssize_t set_ignore_ce(struct device *s,
2366 struct device_attribute *attr,
9af43b54
HS
2367 const char *buf, size_t size)
2368{
2369 u64 new;
2370
164109e3 2371 if (kstrtou64(buf, 0, &new) < 0)
9af43b54
HS
2372 return -EINVAL;
2373
b3b7c479 2374 mutex_lock(&mce_sysfs_mutex);
7af19e4a 2375 if (mca_cfg.ignore_ce ^ !!new) {
9af43b54
HS
2376 if (new) {
2377 /* disable ce features */
9aaef96f
HS
2378 mce_timer_delete_all();
2379 on_each_cpu(mce_disable_cmci, NULL, 1);
7af19e4a 2380 mca_cfg.ignore_ce = true;
9af43b54
HS
2381 } else {
2382 /* enable ce features */
7af19e4a 2383 mca_cfg.ignore_ce = false;
9af43b54
HS
2384 on_each_cpu(mce_enable_ce, (void *)1, 1);
2385 }
2386 }
b3b7c479
SH
2387 mutex_unlock(&mce_sysfs_mutex);
2388
9af43b54
HS
2389 return size;
2390}
2391
8a25a2fd
KS
2392static ssize_t set_cmci_disabled(struct device *s,
2393 struct device_attribute *attr,
9af43b54
HS
2394 const char *buf, size_t size)
2395{
2396 u64 new;
2397
164109e3 2398 if (kstrtou64(buf, 0, &new) < 0)
9af43b54
HS
2399 return -EINVAL;
2400
b3b7c479 2401 mutex_lock(&mce_sysfs_mutex);
7af19e4a 2402 if (mca_cfg.cmci_disabled ^ !!new) {
9af43b54
HS
2403 if (new) {
2404 /* disable cmci */
9aaef96f 2405 on_each_cpu(mce_disable_cmci, NULL, 1);
7af19e4a 2406 mca_cfg.cmci_disabled = true;
9af43b54
HS
2407 } else {
2408 /* enable cmci */
7af19e4a 2409 mca_cfg.cmci_disabled = false;
9af43b54
HS
2410 on_each_cpu(mce_enable_ce, NULL, 1);
2411 }
2412 }
b3b7c479
SH
2413 mutex_unlock(&mce_sysfs_mutex);
2414
9af43b54
HS
2415 return size;
2416}
2417
8a25a2fd
KS
2418static ssize_t store_int_with_restart(struct device *s,
2419 struct device_attribute *attr,
b56f642d
AK
2420 const char *buf, size_t size)
2421{
b3b7c479
SH
2422 unsigned long old_check_interval = check_interval;
2423 ssize_t ret = device_store_ulong(s, attr, buf, size);
2424
2425 if (check_interval == old_check_interval)
2426 return ret;
2427
b3b7c479 2428 mutex_lock(&mce_sysfs_mutex);
b56f642d 2429 mce_restart();
b3b7c479
SH
2430 mutex_unlock(&mce_sysfs_mutex);
2431
b56f642d
AK
2432 return ret;
2433}
2434
d203f0b8 2435static DEVICE_INT_ATTR(tolerant, 0644, mca_cfg.tolerant);
84c2559d 2436static DEVICE_INT_ATTR(monarch_timeout, 0644, mca_cfg.monarch_timeout);
d203f0b8 2437static DEVICE_BOOL_ATTR(dont_log_ce, 0644, mca_cfg.dont_log_ce);
43505646 2438static DEVICE_BOOL_ATTR(print_all, 0644, mca_cfg.print_all);
e9eee03e 2439
8a25a2fd
KS
2440static struct dev_ext_attribute dev_attr_check_interval = {
2441 __ATTR(check_interval, 0644, device_show_int, store_int_with_restart),
b56f642d
AK
2442 &check_interval
2443};
e9eee03e 2444
8a25a2fd 2445static struct dev_ext_attribute dev_attr_ignore_ce = {
7af19e4a
BP
2446 __ATTR(ignore_ce, 0644, device_show_bool, set_ignore_ce),
2447 &mca_cfg.ignore_ce
9af43b54
HS
2448};
2449
8a25a2fd 2450static struct dev_ext_attribute dev_attr_cmci_disabled = {
7af19e4a
BP
2451 __ATTR(cmci_disabled, 0644, device_show_bool, set_cmci_disabled),
2452 &mca_cfg.cmci_disabled
9af43b54
HS
2453};
2454
8a25a2fd
KS
2455static struct device_attribute *mce_device_attrs[] = {
2456 &dev_attr_tolerant.attr,
2457 &dev_attr_check_interval.attr,
5de97c9f 2458#ifdef CONFIG_X86_MCELOG_LEGACY
8a25a2fd 2459 &dev_attr_trigger,
5de97c9f 2460#endif
8a25a2fd
KS
2461 &dev_attr_monarch_timeout.attr,
2462 &dev_attr_dont_log_ce.attr,
43505646 2463 &dev_attr_print_all.attr,
8a25a2fd
KS
2464 &dev_attr_ignore_ce.attr,
2465 &dev_attr_cmci_disabled.attr,
a98f0dd3
AK
2466 NULL
2467};
1da177e4 2468
8a25a2fd 2469static cpumask_var_t mce_device_initialized;
bae19fe0 2470
e032d807
GKH
2471static void mce_device_release(struct device *dev)
2472{
2473 kfree(dev);
2474}
2475
b4914508 2476/* Per CPU device init. All of the CPUs still share the same bank device: */
148f9bb8 2477static int mce_device_create(unsigned int cpu)
1da177e4 2478{
e032d807 2479 struct device *dev;
1da177e4 2480 int err;
b1f49f95 2481 int i, j;
92cb7612 2482
90367556 2483 if (!mce_available(&boot_cpu_data))
91c6d400
AK
2484 return -EIO;
2485
7f34b935
SAS
2486 dev = per_cpu(mce_device, cpu);
2487 if (dev)
2488 return 0;
2489
0e96f31e 2490 dev = kzalloc(sizeof(*dev), GFP_KERNEL);
e032d807
GKH
2491 if (!dev)
2492 return -ENOMEM;
8a25a2fd
KS
2493 dev->id = cpu;
2494 dev->bus = &mce_subsys;
e032d807 2495 dev->release = &mce_device_release;
91c6d400 2496
8a25a2fd 2497 err = device_register(dev);
853d9b18
LK
2498 if (err) {
2499 put_device(dev);
d435d862 2500 return err;
853d9b18 2501 }
d435d862 2502
8a25a2fd
KS
2503 for (i = 0; mce_device_attrs[i]; i++) {
2504 err = device_create_file(dev, mce_device_attrs[i]);
d435d862
AM
2505 if (err)
2506 goto error;
2507 }
c7d314f3 2508 for (j = 0; j < per_cpu(mce_num_banks, cpu); j++) {
b4914508 2509 err = device_create_file(dev, &mce_bank_devs[j].attr);
0d7482e3
AK
2510 if (err)
2511 goto error2;
2512 }
8a25a2fd 2513 cpumask_set_cpu(cpu, mce_device_initialized);
d6126ef5 2514 per_cpu(mce_device, cpu) = dev;
91c6d400 2515
d435d862 2516 return 0;
0d7482e3 2517error2:
b1f49f95 2518 while (--j >= 0)
b4914508 2519 device_remove_file(dev, &mce_bank_devs[j].attr);
d435d862 2520error:
cb491fca 2521 while (--i >= 0)
8a25a2fd 2522 device_remove_file(dev, mce_device_attrs[i]);
cb491fca 2523
8a25a2fd 2524 device_unregister(dev);
d435d862 2525
91c6d400
AK
2526 return err;
2527}
2528
148f9bb8 2529static void mce_device_remove(unsigned int cpu)
91c6d400 2530{
d6126ef5 2531 struct device *dev = per_cpu(mce_device, cpu);
73ca5358
SL
2532 int i;
2533
8a25a2fd 2534 if (!cpumask_test_cpu(cpu, mce_device_initialized))
bae19fe0
AH
2535 return;
2536
8a25a2fd
KS
2537 for (i = 0; mce_device_attrs[i]; i++)
2538 device_remove_file(dev, mce_device_attrs[i]);
cb491fca 2539
c7d314f3 2540 for (i = 0; i < per_cpu(mce_num_banks, cpu); i++)
b4914508 2541 device_remove_file(dev, &mce_bank_devs[i].attr);
cb491fca 2542
8a25a2fd
KS
2543 device_unregister(dev);
2544 cpumask_clear_cpu(cpu, mce_device_initialized);
d6126ef5 2545 per_cpu(mce_device, cpu) = NULL;
91c6d400 2546}
91c6d400 2547
d6b75584 2548/* Make sure there are no machine checks on offlined CPUs. */
39f152ff 2549static void mce_disable_cpu(void)
d6b75584 2550{
89cbc767 2551 if (!mce_available(raw_cpu_ptr(&cpu_info)))
d6b75584 2552 return;
767df1bd 2553
39f152ff 2554 if (!cpuhp_tasks_frozen)
88ccbedd 2555 cmci_clear();
11868a2d 2556
6e06780a 2557 vendor_disable_error_reporting();
d6b75584
AK
2558}
2559
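/* Re-enable CMCI and restore the bank controls when a CPU comes back online. */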
39f152ff 2560static void mce_reenable_cpu(void)
d6b75584 2561{
b4914508 2562 struct mce_bank *mce_banks = this_cpu_ptr(mce_banks_array);
e9eee03e 2563 int i;
d6b75584 2564
89cbc767 2565 if (!mce_available(raw_cpu_ptr(&cpu_info)))
d6b75584 2566 return;
e9eee03e 2567
39f152ff 2568 if (!cpuhp_tasks_frozen)
88ccbedd 2569 cmci_reenable();
c7d314f3 2570 for (i = 0; i < this_cpu_read(mce_num_banks); i++) {
cebe1820 2571 struct mce_bank *b = &mce_banks[i];
11868a2d 2572
cebe1820 2573 if (b->init)
8121b8f9 2574 wrmsrl(mca_msr_reg(i, MCA_CTL), b->ctl);
06b7a7a5 2575 }
d6b75584
AK
2576}
2577
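/* CPU hotplug: a CPU went away, let the surviving CPUs take over its CMCI banks. */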
0e285d36 2578static int mce_cpu_dead(unsigned int cpu)
91c6d400 2579{
0e285d36 2580 mce_intel_hcpu_update(cpu);
91c6d400 2581
0e285d36
SAS
2582 /* intentionally ignoring frozen here */
2583 if (!cpuhp_tasks_frozen)
2584 cmci_rediscover();
2585 return 0;
91c6d400
AK
2586}
2587
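/* CPU hotplug: create the sysfs and threshold devices, re-enable banks and start the polling timer. */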
8c0eeac8 2588static int mce_cpu_online(unsigned int cpu)
91c6d400 2589{
0becc0ae 2590 struct timer_list *t = this_cpu_ptr(&mce_timer);
8c0eeac8 2591 int ret;
91c6d400 2592
8c0eeac8 2593 mce_device_create(cpu);
38356c1f 2594
8c0eeac8
SAS
2595 ret = mce_threshold_create_device(cpu);
2596 if (ret) {
2597 mce_device_remove(cpu);
2598 return ret;
1a65f970 2599 }
8c0eeac8 2600 mce_reenable_cpu();
0becc0ae 2601 mce_start_timer(t);
8c0eeac8 2602 return 0;
91c6d400
AK
2603}
2604
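/* CPU hotplug: quiesce MCE handling before the CPU goes offline. */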
8c0eeac8
SAS
2605static int mce_cpu_pre_down(unsigned int cpu)
2606{
0becc0ae 2607 struct timer_list *t = this_cpu_ptr(&mce_timer);
8c0eeac8
SAS
2608
2609 mce_disable_cpu();
2610 del_timer_sync(t);
2611 mce_threshold_remove_device(cpu);
2612 mce_device_remove(cpu);
2613 return 0;
2614}
91c6d400 2615
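/* Set up the shared sysfs attributes ("bank0" ... "bankN") for the MCA banks. */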
cebe1820 2616static __init void mce_init_banks(void)
0d7482e3
AK
2617{
2618 int i;
2619
b4914508
YG
2620 for (i = 0; i < MAX_NR_BANKS; i++) {
2621 struct mce_bank_dev *b = &mce_bank_devs[i];
8a25a2fd 2622 struct device_attribute *a = &b->attr;
e9eee03e 2623
b4914508
YG
2624 b->bank = i;
2625
a07e4156 2626 sysfs_attr_init(&a->attr);
cebe1820
AK
2627 a->attr.name = b->attrname;
2628 snprintf(b->attrname, ATTR_LEN, "bank%d", i);
e9eee03e
IM
2629
2630 a->attr.mode = 0644;
2631 a->show = show_bank;
2632 a->store = set_bank;
0d7482e3 2633 }
0d7482e3
AK
2634}
2635
6e7a41c6
TG
2636/*
2637 * When running on XEN, this initcall is ordered against the XEN mcelog
2638 * initcall:
2639 *
2640 * device_initcall(xen_late_init_mcelog);
2641 * device_initcall_sync(mcheck_init_device);
2642 */
5e09954a 2643static __init int mcheck_init_device(void)
91c6d400
AK
2644{
2645 int err;
91c6d400 2646
c65e774f
KS
2647 /*
2648 * Check if we have a spare virtual bit. This will only become
2649 * a problem if/when we move beyond 5-level page tables.
2650 */
2651 MAYBE_BUILD_BUG_ON(__VIRTUAL_MASK_SHIFT >= 63);
2652
9c15a24b
MS
2653 if (!mce_available(&boot_cpu_data)) {
2654 err = -EIO;
2655 goto err_out;
2656 }
0d7482e3 2657
9c15a24b
MS
2658 if (!zalloc_cpumask_var(&mce_device_initialized, GFP_KERNEL)) {
2659 err = -ENOMEM;
2660 goto err_out;
2661 }
996867d0 2662
cebe1820 2663 mce_init_banks();
0d7482e3 2664
8a25a2fd 2665 err = subsys_system_register(&mce_subsys, NULL);
d435d862 2666 if (err)
9c15a24b 2667 goto err_out_mem;
91c6d400 2668
0e285d36
SAS
2669 err = cpuhp_setup_state(CPUHP_X86_MCE_DEAD, "x86/mce:dead", NULL,
2670 mce_cpu_dead);
2671 if (err)
2672 goto err_out_mem;
91c6d400 2673
6e7a41c6
TG
2674 /*
2675 * Invokes mce_cpu_online() on all CPUs which are online when
2676 * the state is installed.
2677 */
8c0eeac8
SAS
2678 err = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "x86/mce:online",
2679 mce_cpu_online, mce_cpu_pre_down);
2680 if (err < 0)
0e285d36 2681 goto err_out_online;
93b62c3c 2682
9c15a24b
MS
2683 register_syscore_ops(&mce_syscore_ops);
2684
9c15a24b
MS
2685 return 0;
2686
0e285d36
SAS
2687err_out_online:
2688 cpuhp_remove_state(CPUHP_X86_MCE_DEAD);
9c15a24b
MS
2689
2690err_out_mem:
2691 free_cpumask_var(mce_device_initialized);
2692
2693err_out:
5de97c9f 2694 pr_err("Unable to init MCE device (rc: %d)\n", err);
e9eee03e 2695
1da177e4 2696 return err;
1da177e4 2697}
cef12ee5 2698device_initcall_sync(mcheck_init_device);
a988d334 2699
d7c3c9a6
AK
2700/*
2701 * Old style boot options parsing. Only for compatibility.
2702 */
2703static int __init mcheck_disable(char *str)
2704{
09933946 2705 mca_cfg.disabled = 1;
d7c3c9a6
AK
2706 return 1;
2707}
2708__setup("nomce", mcheck_disable);
a988d334 2709
5be9ed25
HY
2710#ifdef CONFIG_DEBUG_FS
2711struct dentry *mce_get_debugfs_dir(void)
a988d334 2712{
5be9ed25 2713 static struct dentry *dmce;
a988d334 2714
5be9ed25
HY
2715 if (!dmce)
2716 dmce = debugfs_create_dir("mce", NULL);
a988d334 2717
5be9ed25
HY
2718 return dmce;
2719}
a988d334 2720
bf783f9f
HY
2721static void mce_reset(void)
2722{
2723 cpu_missing = 0;
c7c9b392 2724 atomic_set(&mce_fake_panicked, 0);
bf783f9f
HY
2725 atomic_set(&mce_executing, 0);
2726 atomic_set(&mce_callin, 0);
2727 atomic_set(&global_nwo, 0);
7bb39313 2728 cpumask_setall(&mce_missing_cpus);
bf783f9f 2729}
a988d334 2730
bf783f9f
HY
2731static int fake_panic_get(void *data, u64 *val)
2732{
2733 *val = fake_panic;
2734 return 0;
a988d334
IM
2735}
2736
bf783f9f 2737static int fake_panic_set(void *data, u64 val)
a988d334 2738{
bf783f9f
HY
2739 mce_reset();
2740 fake_panic = val;
2741 return 0;
a988d334 2742}
a988d334 2743
28156d76
Y
2744DEFINE_DEBUGFS_ATTRIBUTE(fake_panic_fops, fake_panic_get, fake_panic_set,
2745 "%llu\n");
d7c3c9a6 2746
6e4f929e 2747static void __init mcheck_debugfs_init(void)
d7c3c9a6 2748{
6e4f929e 2749 struct dentry *dmce;
bf783f9f
HY
2750
2751 dmce = mce_get_debugfs_dir();
6e4f929e
GKH
2752 debugfs_create_file_unsafe("fake_panic", 0444, dmce, NULL,
2753 &fake_panic_fops);
d7c3c9a6 2754}
fd4cf79f 2755#else
6e4f929e 2756static void __init mcheck_debugfs_init(void) { }
5be9ed25 2757#endif
fd4cf79f
CG
2758
2759static int __init mcheck_late_init(void)
2760{
3637efb0 2761 if (mca_cfg.recovery)
ec6347bb 2762 enable_copy_mc_fragile();
3637efb0 2763
fd4cf79f
CG
2764 mcheck_debugfs_init();
2765
2766 /*
2767 * Flush out everything that has been logged during early boot, now that
2768 * everything has been initialized (workqueues, decoders, ...).
2769 */
2770 mce_schedule_work();
2771
2772 return 0;
2773}
2774late_initcall(mcheck_late_init);