}
}
+static int do_memory_failure(struct mce *m)
+{
+ int flags = MF_ACTION_REQUIRED;
+ int ret;
+
+ pr_err("Uncorrected hardware memory error in user-access at %llx", m->addr);
+ if (!(m->mcgstatus & MCG_STATUS_RIPV))
+ flags |= MF_MUST_KILL;
+ ret = memory_failure(m->addr >> PAGE_SHIFT, MCE_VECTOR, flags);
+ if (ret)
+ pr_err("Memory error not recovered");
+ return ret;
+}
+
/*
* The actual machine check handler. This only handles real
* exceptions when something got corrupted coming in through int 18.
DECLARE_BITMAP(toclear, MAX_NR_BANKS);
DECLARE_BITMAP(valid_banks, MAX_NR_BANKS);
char *msg = "Unknown";
- u64 recover_paddr = ~0ull;
- int flags = MF_ACTION_REQUIRED;
int lmce = 0;
/* If this CPU is offline, just bail out. */
}
/*
- * At insane "tolerant" levels we take no action. Otherwise
- * we only die if we have no other choice. For less serious
- * issues we try to recover, or limit damage to the current
- * process.
+ * If tolerant is at an insane level we drop requests to kill
+ * processes and continue even when there is no way out.
*/
- if (cfg->tolerant < 3) {
- if (no_way_out)
- mce_panic("Fatal machine check on current CPU", &m, msg);
- if (worst == MCE_AR_SEVERITY) {
- recover_paddr = m.addr;
- if (!(m.mcgstatus & MCG_STATUS_RIPV))
- flags |= MF_MUST_KILL;
- } else if (kill_it) {
- force_sig(SIGBUS, current);
- }
- }
+ if (cfg->tolerant == 3)
+ kill_it = 0;
+ else if (no_way_out)
+ mce_panic("Fatal machine check on current CPU", &m, msg);
if (worst > 0)
mce_report_event(regs);
out:
sync_core();
- if (recover_paddr == ~0ull)
- goto done;
+ if (worst != MCE_AR_SEVERITY && !kill_it)
+ goto out_ist;
- pr_err("Uncorrected hardware memory error in user-access at %llx",
- recover_paddr);
- /*
- * We must call memory_failure() here even if the current process is
- * doomed. We still need to mark the page as poisoned and alert any
- * other users of the page.
- */
- ist_begin_non_atomic(regs);
- local_irq_enable();
- if (memory_failure(recover_paddr >> PAGE_SHIFT, MCE_VECTOR, flags) < 0) {
- pr_err("Memory error not recovered");
- force_sig(SIGBUS, current);
+ /* Fault was in user mode and we need to take some action */
+ if ((m.cs & 3) == 3) {
+ ist_begin_non_atomic(regs);
+ local_irq_enable();
+
+ if (kill_it || do_memory_failure(&m))
+ force_sig(SIGBUS, current);
+ local_irq_disable();
+ ist_end_non_atomic();
+ } else {
+ if (!fixup_exception(regs, X86_TRAP_MC))
+ mce_panic("Failed kernel mode recovery", &m, NULL);
}
- local_irq_disable();
- ist_end_non_atomic();
-done:
+
+out_ist:
ist_exit(regs);
}
EXPORT_SYMBOL_GPL(do_machine_check);
if (c->x86 == 6 && c->x86_model == 45)
quirk_no_way_out = quirk_sandybridge_ifu;
+ /*
+ * MCG_CAP.MCG_SER_P is necessary but not sufficient to know
+ * whether this processor will actually generate recoverable
+ * machine checks. Check to see if this is an E7 model Xeon.
+ * We can't do a model number check because E5 and E7 use the
+ * same model number. E5 doesn't support recovery, E7 does.
+ */
+ if (mca_cfg.recovery || (mca_cfg.ser &&
+ !strncmp(c->x86_model_id,
+ "Intel(R) Xeon(R) CPU E7-", 24)))
+ set_cpu_cap(c, X86_FEATURE_MCE_RECOVERY);
}
if (cfg->monarch_timeout < 0)
cfg->monarch_timeout = 0;
case X86_VENDOR_AMD: {
u32 ebx = cpuid_ebx(0x80000007);
- mce_amd_feature_init(c);
mce_flags.overflow_recov = !!(ebx & BIT(0));
mce_flags.succor = !!(ebx & BIT(1));
mce_flags.smca = !!(ebx & BIT(3));
+ mce_amd_feature_init(c);
break;
}
cfg->bootlog = (str[0] == 'b');
else if (!strcmp(str, "bios_cmci_threshold"))
cfg->bios_cmci_threshold = true;
+ else if (!strcmp(str, "recovery"))
+ cfg->recovery = true;
else if (isdigit(str[0])) {
if (get_option(&str, &cfg->tolerant) == 2)
get_option(&str, &(cfg->monarch_timeout));