x86/mce: Streamline MCE subsystem's naming
author	Borislav Petkov <bp@suse.de>
Sun, 18 Nov 2018 14:15:05 +0000 (15:15 +0100)
committer	Borislav Petkov <bp@suse.de>
Wed, 5 Dec 2018 17:00:29 +0000 (18:00 +0100)
Rename the containing folder to "mce", which is the most widespread name.
Drop the "mce[-_]" filename prefix that some compilation units carry (and
others don't).

This unifies the file naming in the MCE subsystem:

mce/
|-- amd.c
|-- apei.c
|-- core.c
|-- dev-mcelog.c
|-- genpool.c
|-- inject.c
|-- intel.c
|-- internal.h
|-- Makefile
|-- p5.c
|-- severity.c
|-- therm_throt.c
|-- threshold.c
`-- winchip.c

No functional changes.

Signed-off-by: Borislav Petkov <bp@suse.de>
Reviewed-by: Ingo Molnar <mingo@kernel.org>
Acked-by: Tony Luck <tony.luck@intel.com>
Link: https://lkml.kernel.org/r/20181205141323.14995-1-bp@alien8.de
29 files changed:
arch/x86/kernel/cpu/Makefile
arch/x86/kernel/cpu/mce/Makefile [new file with mode: 0644]
arch/x86/kernel/cpu/mce/amd.c [new file with mode: 0644]
arch/x86/kernel/cpu/mce/apei.c [new file with mode: 0644]
arch/x86/kernel/cpu/mce/core.c [new file with mode: 0644]
arch/x86/kernel/cpu/mce/dev-mcelog.c [new file with mode: 0644]
arch/x86/kernel/cpu/mce/genpool.c [new file with mode: 0644]
arch/x86/kernel/cpu/mce/inject.c [new file with mode: 0644]
arch/x86/kernel/cpu/mce/intel.c [new file with mode: 0644]
arch/x86/kernel/cpu/mce/internal.h [new file with mode: 0644]
arch/x86/kernel/cpu/mce/p5.c [new file with mode: 0644]
arch/x86/kernel/cpu/mce/severity.c [new file with mode: 0644]
arch/x86/kernel/cpu/mce/therm_throt.c [new file with mode: 0644]
arch/x86/kernel/cpu/mce/threshold.c [new file with mode: 0644]
arch/x86/kernel/cpu/mce/winchip.c [new file with mode: 0644]
arch/x86/kernel/cpu/mcheck/Makefile [deleted file]
arch/x86/kernel/cpu/mcheck/dev-mcelog.c [deleted file]
arch/x86/kernel/cpu/mcheck/mce-apei.c [deleted file]
arch/x86/kernel/cpu/mcheck/mce-genpool.c [deleted file]
arch/x86/kernel/cpu/mcheck/mce-inject.c [deleted file]
arch/x86/kernel/cpu/mcheck/mce-internal.h [deleted file]
arch/x86/kernel/cpu/mcheck/mce-severity.c [deleted file]
arch/x86/kernel/cpu/mcheck/mce.c [deleted file]
arch/x86/kernel/cpu/mcheck/mce_amd.c [deleted file]
arch/x86/kernel/cpu/mcheck/mce_intel.c [deleted file]
arch/x86/kernel/cpu/mcheck/p5.c [deleted file]
arch/x86/kernel/cpu/mcheck/therm_throt.c [deleted file]
arch/x86/kernel/cpu/mcheck/threshold.c [deleted file]
arch/x86/kernel/cpu/mcheck/winchip.c [deleted file]

index 1f5d2291c31ec24a765bda0f4b0f0bcb7aae069d..43afe707c6fb90d0ae7627760eed3da4eae11a4a 100644 (file)
@@ -40,7 +40,7 @@ obj-$(CONFIG_INTEL_RDT)       += intel_rdt.o intel_rdt_rdtgroup.o intel_rdt_monitor.o
 obj-$(CONFIG_INTEL_RDT)        += intel_rdt_ctrlmondata.o intel_rdt_pseudo_lock.o
 CFLAGS_intel_rdt_pseudo_lock.o = -I$(src)
 
-obj-$(CONFIG_X86_MCE)                  += mcheck/
+obj-$(CONFIG_X86_MCE)                  += mce/
 obj-$(CONFIG_MTRR)                     += mtrr/
 obj-$(CONFIG_MICROCODE)                        += microcode/
 
diff --git a/arch/x86/kernel/cpu/mce/Makefile b/arch/x86/kernel/cpu/mce/Makefile
new file mode 100644 (file)
index 0000000..7657597
--- /dev/null
@@ -0,0 +1,14 @@
+# SPDX-License-Identifier: GPL-2.0
+obj-y                          =  core.o severity.o genpool.o
+
+obj-$(CONFIG_X86_ANCIENT_MCE)  += winchip.o p5.o
+obj-$(CONFIG_X86_MCE_INTEL)    += intel.o
+obj-$(CONFIG_X86_MCE_AMD)      += amd.o
+obj-$(CONFIG_X86_MCE_THRESHOLD) += threshold.o
+obj-$(CONFIG_X86_MCE_INJECT)   += inject.o
+
+obj-$(CONFIG_X86_THERMAL_VECTOR) += therm_throt.o
+
+obj-$(CONFIG_ACPI_APEI)                += apei.o
+
+obj-$(CONFIG_X86_MCELOG_LEGACY)        += dev-mcelog.o
diff --git a/arch/x86/kernel/cpu/mce/amd.c b/arch/x86/kernel/cpu/mce/amd.c
new file mode 100644 (file)
index 0000000..4a2fb59
--- /dev/null
@@ -0,0 +1,1437 @@
+/*
+ *  (c) 2005-2016 Advanced Micro Devices, Inc.
+ *  Your use of this code is subject to the terms and conditions of the
+ *  GNU general public license version 2. See "COPYING" or
+ *  http://www.gnu.org/licenses/gpl.html
+ *
+ *  Written by Jacob Shin - AMD, Inc.
+ *  Maintained by: Borislav Petkov <bp@alien8.de>
+ *
+ *  All MC4_MISCi registers are shared between cores on a node.
+ */
+#include <linux/interrupt.h>
+#include <linux/notifier.h>
+#include <linux/kobject.h>
+#include <linux/percpu.h>
+#include <linux/errno.h>
+#include <linux/sched.h>
+#include <linux/sysfs.h>
+#include <linux/slab.h>
+#include <linux/init.h>
+#include <linux/cpu.h>
+#include <linux/smp.h>
+#include <linux/string.h>
+
+#include <asm/amd_nb.h>
+#include <asm/apic.h>
+#include <asm/mce.h>
+#include <asm/msr.h>
+#include <asm/trace/irq_vectors.h>
+
+#include "internal.h"
+
+#define NR_BLOCKS         5
+#define THRESHOLD_MAX     0xFFF
+#define INT_TYPE_APIC     0x00020000
+#define MASK_VALID_HI     0x80000000
+#define MASK_CNTP_HI      0x40000000
+#define MASK_LOCKED_HI    0x20000000
+#define MASK_LVTOFF_HI    0x00F00000
+#define MASK_COUNT_EN_HI  0x00080000
+#define MASK_INT_TYPE_HI  0x00060000
+#define MASK_OVERFLOW_HI  0x00010000
+#define MASK_ERR_COUNT_HI 0x00000FFF
+#define MASK_BLKPTR_LO    0xFF000000
+#define MCG_XBLK_ADDR     0xC0000400
+
+/* Deferred error settings */
+#define MSR_CU_DEF_ERR         0xC0000410
+#define MASK_DEF_LVTOFF                0x000000F0
+#define MASK_DEF_INT_TYPE      0x00000006
+#define DEF_LVT_OFF            0x2
+#define DEF_INT_TYPE_APIC      0x2
+
+/* Scalable MCA: */
+
+/* Threshold LVT offset is at MSR0xC0000410[15:12] */
+#define SMCA_THR_LVT_OFF       0xF000
+
+static bool thresholding_irq_en;
+
+static const char * const th_names[] = {
+       "load_store",
+       "insn_fetch",
+       "combined_unit",
+       "decode_unit",
+       "northbridge",
+       "execution_unit",
+};
+
+static const char * const smca_umc_block_names[] = {
+       "dram_ecc",
+       "misc_umc"
+};
+
+struct smca_bank_name {
+       const char *name;       /* Short name for sysfs */
+       const char *long_name;  /* Long name for pretty-printing */
+};
+
+static struct smca_bank_name smca_names[] = {
+       [SMCA_LS]       = { "load_store",       "Load Store Unit" },
+       [SMCA_IF]       = { "insn_fetch",       "Instruction Fetch Unit" },
+       [SMCA_L2_CACHE] = { "l2_cache",         "L2 Cache" },
+       [SMCA_DE]       = { "decode_unit",      "Decode Unit" },
+       [SMCA_RESERVED] = { "reserved",         "Reserved" },
+       [SMCA_EX]       = { "execution_unit",   "Execution Unit" },
+       [SMCA_FP]       = { "floating_point",   "Floating Point Unit" },
+       [SMCA_L3_CACHE] = { "l3_cache",         "L3 Cache" },
+       [SMCA_CS]       = { "coherent_slave",   "Coherent Slave" },
+       [SMCA_PIE]      = { "pie",              "Power, Interrupts, etc." },
+       [SMCA_UMC]      = { "umc",              "Unified Memory Controller" },
+       [SMCA_PB]       = { "param_block",      "Parameter Block" },
+       [SMCA_PSP]      = { "psp",              "Platform Security Processor" },
+       [SMCA_SMU]      = { "smu",              "System Management Unit" },
+};
+
+static u32 smca_bank_addrs[MAX_NR_BANKS][NR_BLOCKS] __ro_after_init =
+{
+       [0 ... MAX_NR_BANKS - 1] = { [0 ... NR_BLOCKS - 1] = -1 }
+};
+
+const char *smca_get_name(enum smca_bank_types t)
+{
+       if (t >= N_SMCA_BANK_TYPES)
+               return NULL;
+
+       return smca_names[t].name;
+}
+
+const char *smca_get_long_name(enum smca_bank_types t)
+{
+       if (t >= N_SMCA_BANK_TYPES)
+               return NULL;
+
+       return smca_names[t].long_name;
+}
+EXPORT_SYMBOL_GPL(smca_get_long_name);
+
+static enum smca_bank_types smca_get_bank_type(unsigned int bank)
+{
+       struct smca_bank *b;
+
+       if (bank >= MAX_NR_BANKS)
+               return N_SMCA_BANK_TYPES;
+
+       b = &smca_banks[bank];
+       if (!b->hwid)
+               return N_SMCA_BANK_TYPES;
+
+       return b->hwid->bank_type;
+}
+
+static struct smca_hwid smca_hwid_mcatypes[] = {
+       /* { bank_type, hwid_mcatype, xec_bitmap } */
+
+       /* Reserved type */
+       { SMCA_RESERVED, HWID_MCATYPE(0x00, 0x0), 0x0 },
+
+       /* ZN Core (HWID=0xB0) MCA types */
+       { SMCA_LS,       HWID_MCATYPE(0xB0, 0x0), 0x1FFFEF },
+       { SMCA_IF,       HWID_MCATYPE(0xB0, 0x1), 0x3FFF },
+       { SMCA_L2_CACHE, HWID_MCATYPE(0xB0, 0x2), 0xF },
+       { SMCA_DE,       HWID_MCATYPE(0xB0, 0x3), 0x1FF },
+       /* HWID 0xB0 MCATYPE 0x4 is Reserved */
+       { SMCA_EX,       HWID_MCATYPE(0xB0, 0x5), 0x7FF },
+       { SMCA_FP,       HWID_MCATYPE(0xB0, 0x6), 0x7F },
+       { SMCA_L3_CACHE, HWID_MCATYPE(0xB0, 0x7), 0xFF },
+
+       /* Data Fabric MCA types */
+       { SMCA_CS,       HWID_MCATYPE(0x2E, 0x0), 0x1FF },
+       { SMCA_PIE,      HWID_MCATYPE(0x2E, 0x1), 0xF },
+
+       /* Unified Memory Controller MCA type */
+       { SMCA_UMC,      HWID_MCATYPE(0x96, 0x0), 0x3F },
+
+       /* Parameter Block MCA type */
+       { SMCA_PB,       HWID_MCATYPE(0x05, 0x0), 0x1 },
+
+       /* Platform Security Processor MCA type */
+       { SMCA_PSP,      HWID_MCATYPE(0xFF, 0x0), 0x1 },
+
+       /* System Management Unit MCA type */
+       { SMCA_SMU,      HWID_MCATYPE(0x01, 0x0), 0x1 },
+};
+
+struct smca_bank smca_banks[MAX_NR_BANKS];
+EXPORT_SYMBOL_GPL(smca_banks);
+
+/*
+ * In SMCA enabled processors, we can have multiple banks for a given IP type.
+ * So to define a unique name for each bank, we use a temp c-string to append
+ * the MCA_IPID[InstanceId] to type's name in get_name().
+ *
+ * InstanceId is 32 bits which is 8 characters. Make sure MAX_MCATYPE_NAME_LEN
+ * is greater than 8 plus 1 (for underscore) plus length of longest type name.
+ */
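+/*
+ * Illustrative length check (example values, not from the original file):
+ * the longest type name above is 14 characters (e.g. "execution_unit"),
+ * so a worst-case name such as "execution_unit_1" needs at most
+ * 14 + 1 (underscore) + 8 (hex InstanceId) + 1 (NUL) = 24 bytes,
+ * which fits within MAX_MCATYPE_NAME_LEN (30).
+ */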
+#define MAX_MCATYPE_NAME_LEN   30
+static char buf_mcatype[MAX_MCATYPE_NAME_LEN];
+
+static DEFINE_PER_CPU(struct threshold_bank **, threshold_banks);
+static DEFINE_PER_CPU(unsigned int, bank_map); /* see which banks are on */
+
+static void amd_threshold_interrupt(void);
+static void amd_deferred_error_interrupt(void);
+
+static void default_deferred_error_interrupt(void)
+{
+       pr_err("Unexpected deferred interrupt at vector %x\n", DEFERRED_ERROR_VECTOR);
+}
+void (*deferred_error_int_vector)(void) = default_deferred_error_interrupt;
+
+static void smca_configure(unsigned int bank, unsigned int cpu)
+{
+       unsigned int i, hwid_mcatype;
+       struct smca_hwid *s_hwid;
+       u32 high, low;
+       u32 smca_config = MSR_AMD64_SMCA_MCx_CONFIG(bank);
+
+       /* Set appropriate bits in MCA_CONFIG */
+       if (!rdmsr_safe(smca_config, &low, &high)) {
+               /*
+                * OS is required to set the MCAX bit to acknowledge that it is
+                * now using the new MSR ranges and new registers under each
+                * bank. It also means that the OS will configure deferred
+                * errors in the new MCx_CONFIG register. If the bit is not set,
+                * uncorrectable errors will cause a system panic.
+                *
+                * MCA_CONFIG[MCAX] is bit 32 (0 in the high portion of the MSR.)
+                */
+               high |= BIT(0);
+
+               /*
+                * SMCA sets the Deferred Error Interrupt type per bank.
+                *
+                * MCA_CONFIG[DeferredIntTypeSupported] is bit 5, and tells us
+                * if the DeferredIntType bit field is available.
+                *
+                * MCA_CONFIG[DeferredIntType] is bits [38:37] ([6:5] in the
+                * high portion of the MSR). OS should set this to 0x1 to enable
+                * APIC based interrupt. First, check that no interrupt has been
+                * set.
+                */
+               if ((low & BIT(5)) && !((high >> 5) & 0x3))
+                       high |= BIT(5);
+
+               wrmsr(smca_config, low, high);
+       }
+
+       /* Return early if this bank was already initialized. */
+       if (smca_banks[bank].hwid)
+               return;
+
+       if (rdmsr_safe_on_cpu(cpu, MSR_AMD64_SMCA_MCx_IPID(bank), &low, &high)) {
+               pr_warn("Failed to read MCA_IPID for bank %d\n", bank);
+               return;
+       }
+
+       hwid_mcatype = HWID_MCATYPE(high & MCI_IPID_HWID,
+                                   (high & MCI_IPID_MCATYPE) >> 16);
+
+       for (i = 0; i < ARRAY_SIZE(smca_hwid_mcatypes); i++) {
+               s_hwid = &smca_hwid_mcatypes[i];
+               if (hwid_mcatype == s_hwid->hwid_mcatype) {
+                       smca_banks[bank].hwid = s_hwid;
+                       smca_banks[bank].id = low;
+                       smca_banks[bank].sysfs_id = s_hwid->count++;
+                       break;
+               }
+       }
+}
+
+struct thresh_restart {
+       struct threshold_block  *b;
+       int                     reset;
+       int                     set_lvt_off;
+       int                     lvt_off;
+       u16                     old_limit;
+};
+
+static inline bool is_shared_bank(int bank)
+{
+       /*
+        * Scalable MCA provides for only one core to have access to the MSRs of
+        * a shared bank.
+        */
+       if (mce_flags.smca)
+               return false;
+
+       /* Bank 4 is for northbridge reporting and is thus shared */
+       return (bank == 4);
+}
+
+static const char *bank4_names(const struct threshold_block *b)
+{
+       switch (b->address) {
+       /* MSR4_MISC0 */
+       case 0x00000413:
+               return "dram";
+
+       case 0xc0000408:
+               return "ht_links";
+
+       case 0xc0000409:
+               return "l3_cache";
+
+       default:
+               WARN(1, "Funny MSR: 0x%08x\n", b->address);
+               return "";
+       }
+};
+
+
+static bool lvt_interrupt_supported(unsigned int bank, u32 msr_high_bits)
+{
+       /*
+        * bank 4 supports APIC LVT interrupts implicitly since forever.
+        */
+       if (bank == 4)
+               return true;
+
+       /*
+        * IntP: interrupt present; if this bit is set, the thresholding
+        * bank can generate APIC LVT interrupts
+        */
+       return msr_high_bits & BIT(28);
+}
+
+static int lvt_off_valid(struct threshold_block *b, int apic, u32 lo, u32 hi)
+{
+       int msr = (hi & MASK_LVTOFF_HI) >> 20;
+
+       if (apic < 0) {
+               pr_err(FW_BUG "cpu %d, failed to setup threshold interrupt "
+                      "for bank %d, block %d (MSR%08X=0x%x%08x)\n", b->cpu,
+                      b->bank, b->block, b->address, hi, lo);
+               return 0;
+       }
+
+       if (apic != msr) {
+               /*
+                * On SMCA CPUs, LVT offset is programmed at a different MSR, and
+                * the BIOS provides the value. The original field where LVT offset
+                * was set is reserved. Return early here:
+                */
+               if (mce_flags.smca)
+                       return 0;
+
+               pr_err(FW_BUG "cpu %d, invalid threshold interrupt offset %d "
+                      "for bank %d, block %d (MSR%08X=0x%x%08x)\n",
+                      b->cpu, apic, b->bank, b->block, b->address, hi, lo);
+               return 0;
+       }
+
+       return 1;
+};
+
+/* Reprogram MCx_MISC MSR behind this threshold bank. */
+static void threshold_restart_bank(void *_tr)
+{
+       struct thresh_restart *tr = _tr;
+       u32 hi, lo;
+
+       rdmsr(tr->b->address, lo, hi);
+
+       if (tr->b->threshold_limit < (hi & THRESHOLD_MAX))
+               tr->reset = 1;  /* limit cannot be lower than err count */
+
+       if (tr->reset) {                /* reset err count and overflow bit */
+               hi =
+                   (hi & ~(MASK_ERR_COUNT_HI | MASK_OVERFLOW_HI)) |
+                   (THRESHOLD_MAX - tr->b->threshold_limit);
+       } else if (tr->old_limit) {     /* change limit w/o reset */
+               int new_count = (hi & THRESHOLD_MAX) +
+                   (tr->old_limit - tr->b->threshold_limit);
+
+               hi = (hi & ~MASK_ERR_COUNT_HI) |
+                   (new_count & THRESHOLD_MAX);
+       }
+
+       /* clear IntType */
+       hi &= ~MASK_INT_TYPE_HI;
+
+       if (!tr->b->interrupt_capable)
+               goto done;
+
+       if (tr->set_lvt_off) {
+               if (lvt_off_valid(tr->b, tr->lvt_off, lo, hi)) {
+                       /* set new lvt offset */
+                       hi &= ~MASK_LVTOFF_HI;
+                       hi |= tr->lvt_off << 20;
+               }
+       }
+
+       if (tr->b->interrupt_enable)
+               hi |= INT_TYPE_APIC;
+
+ done:
+
+       hi |= MASK_COUNT_EN_HI;
+       wrmsr(tr->b->address, lo, hi);
+}
+
+static void mce_threshold_block_init(struct threshold_block *b, int offset)
+{
+       struct thresh_restart tr = {
+               .b                      = b,
+               .set_lvt_off            = 1,
+               .lvt_off                = offset,
+       };
+
+       b->threshold_limit              = THRESHOLD_MAX;
+       threshold_restart_bank(&tr);
+};
+
+static int setup_APIC_mce_threshold(int reserved, int new)
+{
+       if (reserved < 0 && !setup_APIC_eilvt(new, THRESHOLD_APIC_VECTOR,
+                                             APIC_EILVT_MSG_FIX, 0))
+               return new;
+
+       return reserved;
+}
+
+static int setup_APIC_deferred_error(int reserved, int new)
+{
+       if (reserved < 0 && !setup_APIC_eilvt(new, DEFERRED_ERROR_VECTOR,
+                                             APIC_EILVT_MSG_FIX, 0))
+               return new;
+
+       return reserved;
+}
+
+static void deferred_error_interrupt_enable(struct cpuinfo_x86 *c)
+{
+       u32 low = 0, high = 0;
+       int def_offset = -1, def_new;
+
+       if (rdmsr_safe(MSR_CU_DEF_ERR, &low, &high))
+               return;
+
+       def_new = (low & MASK_DEF_LVTOFF) >> 4;
+       if (!(low & MASK_DEF_LVTOFF)) {
+               pr_err(FW_BUG "Your BIOS is not setting up LVT offset 0x2 for deferred error IRQs correctly.\n");
+               def_new = DEF_LVT_OFF;
+               low = (low & ~MASK_DEF_LVTOFF) | (DEF_LVT_OFF << 4);
+       }
+
+       def_offset = setup_APIC_deferred_error(def_offset, def_new);
+       if ((def_offset == def_new) &&
+           (deferred_error_int_vector != amd_deferred_error_interrupt))
+               deferred_error_int_vector = amd_deferred_error_interrupt;
+
+       if (!mce_flags.smca)
+               low = (low & ~MASK_DEF_INT_TYPE) | DEF_INT_TYPE_APIC;
+
+       wrmsr(MSR_CU_DEF_ERR, low, high);
+}
+
+static u32 smca_get_block_address(unsigned int bank, unsigned int block)
+{
+       u32 low, high;
+       u32 addr = 0;
+
+       if (smca_get_bank_type(bank) == SMCA_RESERVED)
+               return addr;
+
+       if (!block)
+               return MSR_AMD64_SMCA_MCx_MISC(bank);
+
+       /* Check our cache first: */
+       if (smca_bank_addrs[bank][block] != -1)
+               return smca_bank_addrs[bank][block];
+
+       /*
+        * For SMCA enabled processors, BLKPTR field of the first MISC register
+        * (MCx_MISC0) indicates presence of additional MISC regs set (MISC1-4).
+        */
+       if (rdmsr_safe(MSR_AMD64_SMCA_MCx_CONFIG(bank), &low, &high))
+               goto out;
+
+       if (!(low & MCI_CONFIG_MCAX))
+               goto out;
+
+       if (!rdmsr_safe(MSR_AMD64_SMCA_MCx_MISC(bank), &low, &high) &&
+           (low & MASK_BLKPTR_LO))
+               addr = MSR_AMD64_SMCA_MCx_MISCy(bank, block - 1);
+
+out:
+       smca_bank_addrs[bank][block] = addr;
+       return addr;
+}
+
+static u32 get_block_address(u32 current_addr, u32 low, u32 high,
+                            unsigned int bank, unsigned int block)
+{
+       u32 addr = 0, offset = 0;
+
+       if ((bank >= mca_cfg.banks) || (block >= NR_BLOCKS))
+               return addr;
+
+       if (mce_flags.smca)
+               return smca_get_block_address(bank, block);
+
+       /* Fall back to method we used for older processors: */
+       switch (block) {
+       case 0:
+               addr = msr_ops.misc(bank);
+               break;
+       case 1:
+               offset = ((low & MASK_BLKPTR_LO) >> 21);
+               if (offset)
+                       addr = MCG_XBLK_ADDR + offset;
+               break;
+       default:
+               addr = ++current_addr;
+       }
+       return addr;
+}
+
+static int
+prepare_threshold_block(unsigned int bank, unsigned int block, u32 addr,
+                       int offset, u32 misc_high)
+{
+       unsigned int cpu = smp_processor_id();
+       u32 smca_low, smca_high;
+       struct threshold_block b;
+       int new;
+
+       if (!block)
+               per_cpu(bank_map, cpu) |= (1 << bank);
+
+       memset(&b, 0, sizeof(b));
+       b.cpu                   = cpu;
+       b.bank                  = bank;
+       b.block                 = block;
+       b.address               = addr;
+       b.interrupt_capable     = lvt_interrupt_supported(bank, misc_high);
+
+       if (!b.interrupt_capable)
+               goto done;
+
+       b.interrupt_enable = 1;
+
+       if (!mce_flags.smca) {
+               new = (misc_high & MASK_LVTOFF_HI) >> 20;
+               goto set_offset;
+       }
+
+       /* Gather LVT offset for thresholding: */
+       if (rdmsr_safe(MSR_CU_DEF_ERR, &smca_low, &smca_high))
+               goto out;
+
+       new = (smca_low & SMCA_THR_LVT_OFF) >> 12;
+
+set_offset:
+       offset = setup_APIC_mce_threshold(offset, new);
+       if (offset == new)
+               thresholding_irq_en = true;
+
+done:
+       mce_threshold_block_init(&b, offset);
+
+out:
+       return offset;
+}
+
+/* cpu init entry point, called from core.c with preempt off */
+void mce_amd_feature_init(struct cpuinfo_x86 *c)
+{
+       u32 low = 0, high = 0, address = 0;
+       unsigned int bank, block, cpu = smp_processor_id();
+       int offset = -1;
+
+       for (bank = 0; bank < mca_cfg.banks; ++bank) {
+               if (mce_flags.smca)
+                       smca_configure(bank, cpu);
+
+               for (block = 0; block < NR_BLOCKS; ++block) {
+                       address = get_block_address(address, low, high, bank, block);
+                       if (!address)
+                               break;
+
+                       if (rdmsr_safe(address, &low, &high))
+                               break;
+
+                       if (!(high & MASK_VALID_HI))
+                               continue;
+
+                       if (!(high & MASK_CNTP_HI)  ||
+                            (high & MASK_LOCKED_HI))
+                               continue;
+
+                       offset = prepare_threshold_block(bank, block, address, offset, high);
+               }
+       }
+
+       if (mce_flags.succor)
+               deferred_error_interrupt_enable(c);
+}
+
+int umc_normaddr_to_sysaddr(u64 norm_addr, u16 nid, u8 umc, u64 *sys_addr)
+{
+       u64 dram_base_addr, dram_limit_addr, dram_hole_base;
+       /* We start from the normalized address */
+       u64 ret_addr = norm_addr;
+
+       u32 tmp;
+
+       u8 die_id_shift, die_id_mask, socket_id_shift, socket_id_mask;
+       u8 intlv_num_dies, intlv_num_chan, intlv_num_sockets;
+       u8 intlv_addr_sel, intlv_addr_bit;
+       u8 num_intlv_bits, hashed_bit;
+       u8 lgcy_mmio_hole_en, base = 0;
+       u8 cs_mask, cs_id = 0;
+       bool hash_enabled = false;
+
+       /* Read D18F0x1B4 (DramOffset), check if base 1 is used. */
+       if (amd_df_indirect_read(nid, 0, 0x1B4, umc, &tmp))
+               goto out_err;
+
+       /* Remove HiAddrOffset from normalized address, if enabled: */
+       if (tmp & BIT(0)) {
+               u64 hi_addr_offset = (tmp & GENMASK_ULL(31, 20)) << 8;
+
+               if (norm_addr >= hi_addr_offset) {
+                       ret_addr -= hi_addr_offset;
+                       base = 1;
+               }
+       }
+
+       /* Read D18F0x110 (DramBaseAddress). */
+       if (amd_df_indirect_read(nid, 0, 0x110 + (8 * base), umc, &tmp))
+               goto out_err;
+
+       /* Check if address range is valid. */
+       if (!(tmp & BIT(0))) {
+               pr_err("%s: Invalid DramBaseAddress range: 0x%x.\n",
+                       __func__, tmp);
+               goto out_err;
+       }
+
+       lgcy_mmio_hole_en = tmp & BIT(1);
+       intlv_num_chan    = (tmp >> 4) & 0xF;
+       intlv_addr_sel    = (tmp >> 8) & 0x7;
+       dram_base_addr    = (tmp & GENMASK_ULL(31, 12)) << 16;
+
+       /* {0, 1, 2, 3} map to address bits {8, 9, 10, 11} respectively */
+       if (intlv_addr_sel > 3) {
+               pr_err("%s: Invalid interleave address select %d.\n",
+                       __func__, intlv_addr_sel);
+               goto out_err;
+       }
+
+       /* Read D18F0x114 (DramLimitAddress). */
+       if (amd_df_indirect_read(nid, 0, 0x114 + (8 * base), umc, &tmp))
+               goto out_err;
+
+       intlv_num_sockets = (tmp >> 8) & 0x1;
+       intlv_num_dies    = (tmp >> 10) & 0x3;
+       dram_limit_addr   = ((tmp & GENMASK_ULL(31, 12)) << 16) | GENMASK_ULL(27, 0);
+
+       intlv_addr_bit = intlv_addr_sel + 8;
+
+       /* Re-use intlv_num_chan by setting it equal to log2(#channels) */
+       switch (intlv_num_chan) {
+       case 0: intlv_num_chan = 0; break;
+       case 1: intlv_num_chan = 1; break;
+       case 3: intlv_num_chan = 2; break;
+       case 5: intlv_num_chan = 3; break;
+       case 7: intlv_num_chan = 4; break;
+
+       case 8: intlv_num_chan = 1;
+               hash_enabled = true;
+               break;
+       default:
+               pr_err("%s: Invalid number of interleaved channels %d.\n",
+                       __func__, intlv_num_chan);
+               goto out_err;
+       }
+
+       num_intlv_bits = intlv_num_chan;
+
+       if (intlv_num_dies > 2) {
+               pr_err("%s: Invalid number of interleaved nodes/dies %d.\n",
+                       __func__, intlv_num_dies);
+               goto out_err;
+       }
+
+       num_intlv_bits += intlv_num_dies;
+
+       /* Add a bit if sockets are interleaved. */
+       num_intlv_bits += intlv_num_sockets;
+
+       /* Assert num_intlv_bits <= 4 */
+       if (num_intlv_bits > 4) {
+               pr_err("%s: Invalid interleave bits %d.\n",
+                       __func__, num_intlv_bits);
+               goto out_err;
+       }
+
+       if (num_intlv_bits > 0) {
+               u64 temp_addr_x, temp_addr_i, temp_addr_y;
+               u8 die_id_bit, sock_id_bit, cs_fabric_id;
+
+               /*
+                * Read FabricBlockInstanceInformation3_CS[BlockFabricID].
+                * This is the fabric id for this coherent slave. Use
+                * umc/channel# as instance id of the coherent slave
+                * for FICAA.
+                */
+               if (amd_df_indirect_read(nid, 0, 0x50, umc, &tmp))
+                       goto out_err;
+
+               cs_fabric_id = (tmp >> 8) & 0xFF;
+               die_id_bit   = 0;
+
+               /* If interleaved over more than 1 channel: */
+               if (intlv_num_chan) {
+                       die_id_bit = intlv_num_chan;
+                       cs_mask    = (1 << die_id_bit) - 1;
+                       cs_id      = cs_fabric_id & cs_mask;
+               }
+
+               sock_id_bit = die_id_bit;
+
+               /* Read D18F1x208 (SystemFabricIdMask). */
+               if (intlv_num_dies || intlv_num_sockets)
+                       if (amd_df_indirect_read(nid, 1, 0x208, umc, &tmp))
+                               goto out_err;
+
+               /* If interleaved over more than 1 die. */
+               if (intlv_num_dies) {
+                       sock_id_bit  = die_id_bit + intlv_num_dies;
+                       die_id_shift = (tmp >> 24) & 0xF;
+                       die_id_mask  = (tmp >> 8) & 0xFF;
+
+                       cs_id |= ((cs_fabric_id & die_id_mask) >> die_id_shift) << die_id_bit;
+               }
+
+               /* If interleaved over more than 1 socket. */
+               if (intlv_num_sockets) {
+                       socket_id_shift = (tmp >> 28) & 0xF;
+                       socket_id_mask  = (tmp >> 16) & 0xFF;
+
+                       cs_id |= ((cs_fabric_id & socket_id_mask) >> socket_id_shift) << sock_id_bit;
+               }
+
+               /*
+                * The pre-interleaved address consists of XXXXXXIIIYYYYY
+                * where III is the ID for this CS, and XXXXXXYYYYY are the
+                * address bits from the post-interleaved address.
+                * "num_intlv_bits" has been calculated to tell us how many "I"
+                * bits there are. "intlv_addr_bit" tells us how many "Y" bits
+                * there are (where "I" starts).
+                */
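+               /*
+                * Worked example with hypothetical values: for
+                * intlv_addr_bit = 8, num_intlv_bits = 3, cs_id = 0x5 and
+                * ret_addr = 0x1234, Y = 0x34 (bits [7:0]) and X = 0x12
+                * (bits [63:8]), so the reassembled address is
+                * (0x12 << 11) | (0x5 << 8) | 0x34 = 0x9534.
+                */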
+               temp_addr_y = ret_addr & GENMASK_ULL(intlv_addr_bit-1, 0);
+               temp_addr_i = (cs_id << intlv_addr_bit);
+               temp_addr_x = (ret_addr & GENMASK_ULL(63, intlv_addr_bit)) << num_intlv_bits;
+               ret_addr    = temp_addr_x | temp_addr_i | temp_addr_y;
+       }
+
+       /* Add dram base address */
+       ret_addr += dram_base_addr;
+
+       /* If legacy MMIO hole enabled */
+       if (lgcy_mmio_hole_en) {
+               if (amd_df_indirect_read(nid, 0, 0x104, umc, &tmp))
+                       goto out_err;
+
+               dram_hole_base = tmp & GENMASK(31, 24);
+               if (ret_addr >= dram_hole_base)
+                       ret_addr += (BIT_ULL(32) - dram_hole_base);
+       }
+
+       if (hash_enabled) {
+               /* Save some parentheses and grab ls-bit at the end. */
+               hashed_bit =    (ret_addr >> 12) ^
+                               (ret_addr >> 18) ^
+                               (ret_addr >> 21) ^
+                               (ret_addr >> 30) ^
+                               cs_id;
+
+               hashed_bit &= BIT(0);
+
+               if (hashed_bit != ((ret_addr >> intlv_addr_bit) & BIT(0)))
+                       ret_addr ^= BIT(intlv_addr_bit);
+       }
+
+       /* Is the calculated system address above the DRAM limit address? */
+       if (ret_addr > dram_limit_addr)
+               goto out_err;
+
+       *sys_addr = ret_addr;
+       return 0;
+
+out_err:
+       return -EINVAL;
+}
+EXPORT_SYMBOL_GPL(umc_normaddr_to_sysaddr);
+
+bool amd_mce_is_memory_error(struct mce *m)
+{
+       /* ErrCodeExt[20:16] */
+       u8 xec = (m->status >> 16) & 0x1f;
+
+       if (mce_flags.smca)
+               return smca_get_bank_type(m->bank) == SMCA_UMC && xec == 0x0;
+
+       return m->bank == 4 && xec == 0x8;
+}
+
+static void __log_error(unsigned int bank, u64 status, u64 addr, u64 misc)
+{
+       struct mce m;
+
+       mce_setup(&m);
+
+       m.status = status;
+       m.misc   = misc;
+       m.bank   = bank;
+       m.tsc    = rdtsc();
+
+       if (m.status & MCI_STATUS_ADDRV) {
+               m.addr = addr;
+
+               /*
+                * Extract [55:<lsb>] where lsb is the least significant
+                * *valid* bit of the address bits.
+                */
+               if (mce_flags.smca) {
+                       u8 lsb = (m.addr >> 56) & 0x3f;
+
+                       m.addr &= GENMASK_ULL(55, lsb);
+               }
+       }
+
+       if (mce_flags.smca) {
+               rdmsrl(MSR_AMD64_SMCA_MCx_IPID(bank), m.ipid);
+
+               if (m.status & MCI_STATUS_SYNDV)
+                       rdmsrl(MSR_AMD64_SMCA_MCx_SYND(bank), m.synd);
+       }
+
+       mce_log(&m);
+}
+
+asmlinkage __visible void __irq_entry smp_deferred_error_interrupt(void)
+{
+       entering_irq();
+       trace_deferred_error_apic_entry(DEFERRED_ERROR_VECTOR);
+       inc_irq_stat(irq_deferred_error_count);
+       deferred_error_int_vector();
+       trace_deferred_error_apic_exit(DEFERRED_ERROR_VECTOR);
+       exiting_ack_irq();
+}
+
+/*
+ * Returns true if the logged error is deferred. False, otherwise.
+ */
+static inline bool
+_log_error_bank(unsigned int bank, u32 msr_stat, u32 msr_addr, u64 misc)
+{
+       u64 status, addr = 0;
+
+       rdmsrl(msr_stat, status);
+       if (!(status & MCI_STATUS_VAL))
+               return false;
+
+       if (status & MCI_STATUS_ADDRV)
+               rdmsrl(msr_addr, addr);
+
+       __log_error(bank, status, addr, misc);
+
+       wrmsrl(msr_stat, 0);
+
+       return status & MCI_STATUS_DEFERRED;
+}
+
+/*
+ * We have three scenarios for checking for Deferred errors:
+ *
+ * 1) Non-SMCA systems check MCA_STATUS and log error if found.
+ * 2) SMCA systems check MCA_STATUS. If error is found then log it and also
+ *    clear MCA_DESTAT.
+ * 3) SMCA systems check MCA_DESTAT, if error was not found in MCA_STATUS, and
+ *    log it.
+ */
+static void log_error_deferred(unsigned int bank)
+{
+       bool defrd;
+
+       defrd = _log_error_bank(bank, msr_ops.status(bank),
+                                       msr_ops.addr(bank), 0);
+
+       if (!mce_flags.smca)
+               return;
+
+       /* Clear MCA_DESTAT if we logged the deferred error from MCA_STATUS. */
+       if (defrd) {
+               wrmsrl(MSR_AMD64_SMCA_MCx_DESTAT(bank), 0);
+               return;
+       }
+
+       /*
+        * Only deferred errors are logged in MCA_DE{STAT,ADDR} so just check
+        * for a valid error.
+        */
+       _log_error_bank(bank, MSR_AMD64_SMCA_MCx_DESTAT(bank),
+                             MSR_AMD64_SMCA_MCx_DEADDR(bank), 0);
+}
+
+/* APIC interrupt handler for deferred errors */
+static void amd_deferred_error_interrupt(void)
+{
+       unsigned int bank;
+
+       for (bank = 0; bank < mca_cfg.banks; ++bank)
+               log_error_deferred(bank);
+}
+
+static void log_error_thresholding(unsigned int bank, u64 misc)
+{
+       _log_error_bank(bank, msr_ops.status(bank), msr_ops.addr(bank), misc);
+}
+
+static void log_and_reset_block(struct threshold_block *block)
+{
+       struct thresh_restart tr;
+       u32 low = 0, high = 0;
+
+       if (!block)
+               return;
+
+       if (rdmsr_safe(block->address, &low, &high))
+               return;
+
+       if (!(high & MASK_OVERFLOW_HI))
+               return;
+
+       /* Log the MCE which caused the threshold event. */
+       log_error_thresholding(block->bank, ((u64)high << 32) | low);
+
+       /* Reset threshold block after logging error. */
+       memset(&tr, 0, sizeof(tr));
+       tr.b = block;
+       threshold_restart_bank(&tr);
+}
+
+/*
+ * Threshold interrupt handler will service THRESHOLD_APIC_VECTOR. The interrupt
+ * goes off when error_count reaches threshold_limit.
+ */
+static void amd_threshold_interrupt(void)
+{
+       struct threshold_block *first_block = NULL, *block = NULL, *tmp = NULL;
+       unsigned int bank, cpu = smp_processor_id();
+
+       for (bank = 0; bank < mca_cfg.banks; ++bank) {
+               if (!(per_cpu(bank_map, cpu) & (1 << bank)))
+                       continue;
+
+               first_block = per_cpu(threshold_banks, cpu)[bank]->blocks;
+               if (!first_block)
+                       continue;
+
+               /*
+                * The first block is also the head of the list. Check it first
+                * before iterating over the rest.
+                */
+               log_and_reset_block(first_block);
+               list_for_each_entry_safe(block, tmp, &first_block->miscj, miscj)
+                       log_and_reset_block(block);
+       }
+}
+
+/*
+ * Sysfs Interface
+ */
+
+struct threshold_attr {
+       struct attribute attr;
+       ssize_t (*show) (struct threshold_block *, char *);
+       ssize_t (*store) (struct threshold_block *, const char *, size_t count);
+};
+
+#define SHOW_FIELDS(name)                                              \
+static ssize_t show_ ## name(struct threshold_block *b, char *buf)     \
+{                                                                      \
+       return sprintf(buf, "%lu\n", (unsigned long) b->name);          \
+}
+SHOW_FIELDS(interrupt_enable)
+SHOW_FIELDS(threshold_limit)
+
+static ssize_t
+store_interrupt_enable(struct threshold_block *b, const char *buf, size_t size)
+{
+       struct thresh_restart tr;
+       unsigned long new;
+
+       if (!b->interrupt_capable)
+               return -EINVAL;
+
+       if (kstrtoul(buf, 0, &new) < 0)
+               return -EINVAL;
+
+       b->interrupt_enable = !!new;
+
+       memset(&tr, 0, sizeof(tr));
+       tr.b            = b;
+
+       smp_call_function_single(b->cpu, threshold_restart_bank, &tr, 1);
+
+       return size;
+}
+
+static ssize_t
+store_threshold_limit(struct threshold_block *b, const char *buf, size_t size)
+{
+       struct thresh_restart tr;
+       unsigned long new;
+
+       if (kstrtoul(buf, 0, &new) < 0)
+               return -EINVAL;
+
+       if (new > THRESHOLD_MAX)
+               new = THRESHOLD_MAX;
+       if (new < 1)
+               new = 1;
+
+       memset(&tr, 0, sizeof(tr));
+       tr.old_limit = b->threshold_limit;
+       b->threshold_limit = new;
+       tr.b = b;
+
+       smp_call_function_single(b->cpu, threshold_restart_bank, &tr, 1);
+
+       return size;
+}
+
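+/*
+ * Note on the arithmetic below (hypothetical example values): the MISC
+ * counter is preloaded with THRESHOLD_MAX - threshold_limit so that it
+ * overflows after threshold_limit errors; the error count shown is thus
+ * the current count minus that starting value. E.g. with
+ * threshold_limit = 10 the counter starts at 0xff5, and a raw count of
+ * 0xff8 means 3 errors have been seen.
+ */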
+static ssize_t show_error_count(struct threshold_block *b, char *buf)
+{
+       u32 lo, hi;
+
+       rdmsr_on_cpu(b->cpu, b->address, &lo, &hi);
+
+       return sprintf(buf, "%u\n", ((hi & THRESHOLD_MAX) -
+                                    (THRESHOLD_MAX - b->threshold_limit)));
+}
+
+static struct threshold_attr error_count = {
+       .attr = {.name = __stringify(error_count), .mode = 0444 },
+       .show = show_error_count,
+};
+
+#define RW_ATTR(val)                                                   \
+static struct threshold_attr val = {                                   \
+       .attr   = {.name = __stringify(val), .mode = 0644 },            \
+       .show   = show_## val,                                          \
+       .store  = store_## val,                                         \
+};
+
+RW_ATTR(interrupt_enable);
+RW_ATTR(threshold_limit);
+
+static struct attribute *default_attrs[] = {
+       &threshold_limit.attr,
+       &error_count.attr,
+       NULL,   /* possibly interrupt_enable if supported, see below */
+       NULL,
+};
+
+#define to_block(k)    container_of(k, struct threshold_block, kobj)
+#define to_attr(a)     container_of(a, struct threshold_attr, attr)
+
+static ssize_t show(struct kobject *kobj, struct attribute *attr, char *buf)
+{
+       struct threshold_block *b = to_block(kobj);
+       struct threshold_attr *a = to_attr(attr);
+       ssize_t ret;
+
+       ret = a->show ? a->show(b, buf) : -EIO;
+
+       return ret;
+}
+
+static ssize_t store(struct kobject *kobj, struct attribute *attr,
+                    const char *buf, size_t count)
+{
+       struct threshold_block *b = to_block(kobj);
+       struct threshold_attr *a = to_attr(attr);
+       ssize_t ret;
+
+       ret = a->store ? a->store(b, buf, count) : -EIO;
+
+       return ret;
+}
+
+static const struct sysfs_ops threshold_ops = {
+       .show                   = show,
+       .store                  = store,
+};
+
+static struct kobj_type threshold_ktype = {
+       .sysfs_ops              = &threshold_ops,
+       .default_attrs          = default_attrs,
+};
+
+static const char *get_name(unsigned int bank, struct threshold_block *b)
+{
+       enum smca_bank_types bank_type;
+
+       if (!mce_flags.smca) {
+               if (b && bank == 4)
+                       return bank4_names(b);
+
+               return th_names[bank];
+       }
+
+       bank_type = smca_get_bank_type(bank);
+       if (bank_type >= N_SMCA_BANK_TYPES)
+               return NULL;
+
+       if (b && bank_type == SMCA_UMC) {
+               if (b->block < ARRAY_SIZE(smca_umc_block_names))
+                       return smca_umc_block_names[b->block];
+               return NULL;
+       }
+
+       if (smca_banks[bank].hwid->count == 1)
+               return smca_get_name(bank_type);
+
+       snprintf(buf_mcatype, MAX_MCATYPE_NAME_LEN,
+                "%s_%x", smca_get_name(bank_type),
+                         smca_banks[bank].sysfs_id);
+       return buf_mcatype;
+}
+
+static int allocate_threshold_blocks(unsigned int cpu, unsigned int bank,
+                                    unsigned int block, u32 address)
+{
+       struct threshold_block *b = NULL;
+       u32 low, high;
+       int err;
+
+       if ((bank >= mca_cfg.banks) || (block >= NR_BLOCKS))
+               return 0;
+
+       if (rdmsr_safe_on_cpu(cpu, address, &low, &high))
+               return 0;
+
+       if (!(high & MASK_VALID_HI)) {
+               if (block)
+                       goto recurse;
+               else
+                       return 0;
+       }
+
+       if (!(high & MASK_CNTP_HI)  ||
+            (high & MASK_LOCKED_HI))
+               goto recurse;
+
+       b = kzalloc(sizeof(struct threshold_block), GFP_KERNEL);
+       if (!b)
+               return -ENOMEM;
+
+       b->block                = block;
+       b->bank                 = bank;
+       b->cpu                  = cpu;
+       b->address              = address;
+       b->interrupt_enable     = 0;
+       b->interrupt_capable    = lvt_interrupt_supported(bank, high);
+       b->threshold_limit      = THRESHOLD_MAX;
+
+       if (b->interrupt_capable) {
+               threshold_ktype.default_attrs[2] = &interrupt_enable.attr;
+               b->interrupt_enable = 1;
+       } else {
+               threshold_ktype.default_attrs[2] = NULL;
+       }
+
+       INIT_LIST_HEAD(&b->miscj);
+
+       if (per_cpu(threshold_banks, cpu)[bank]->blocks) {
+               list_add(&b->miscj,
+                        &per_cpu(threshold_banks, cpu)[bank]->blocks->miscj);
+       } else {
+               per_cpu(threshold_banks, cpu)[bank]->blocks = b;
+       }
+
+       err = kobject_init_and_add(&b->kobj, &threshold_ktype,
+                                  per_cpu(threshold_banks, cpu)[bank]->kobj,
+                                  get_name(bank, b));
+       if (err)
+               goto out_free;
+recurse:
+       address = get_block_address(address, low, high, bank, ++block);
+       if (!address)
+               return 0;
+
+       err = allocate_threshold_blocks(cpu, bank, block, address);
+       if (err)
+               goto out_free;
+
+       if (b)
+               kobject_uevent(&b->kobj, KOBJ_ADD);
+
+       return err;
+
+out_free:
+       if (b) {
+               kobject_put(&b->kobj);
+               list_del(&b->miscj);
+               kfree(b);
+       }
+       return err;
+}
+
+static int __threshold_add_blocks(struct threshold_bank *b)
+{
+       struct list_head *head = &b->blocks->miscj;
+       struct threshold_block *pos = NULL;
+       struct threshold_block *tmp = NULL;
+       int err = 0;
+
+       err = kobject_add(&b->blocks->kobj, b->kobj, b->blocks->kobj.name);
+       if (err)
+               return err;
+
+       list_for_each_entry_safe(pos, tmp, head, miscj) {
+
+               err = kobject_add(&pos->kobj, b->kobj, pos->kobj.name);
+               if (err) {
+                       list_for_each_entry_safe_reverse(pos, tmp, head, miscj)
+                               kobject_del(&pos->kobj);
+
+                       return err;
+               }
+       }
+       return err;
+}
+
+static int threshold_create_bank(unsigned int cpu, unsigned int bank)
+{
+       struct device *dev = per_cpu(mce_device, cpu);
+       struct amd_northbridge *nb = NULL;
+       struct threshold_bank *b = NULL;
+       const char *name = get_name(bank, NULL);
+       int err = 0;
+
+       if (!dev)
+               return -ENODEV;
+
+       if (is_shared_bank(bank)) {
+               nb = node_to_amd_nb(amd_get_nb_id(cpu));
+
+               /* threshold descriptor already initialized on this node? */
+               if (nb && nb->bank4) {
+                       /* yes, use it */
+                       b = nb->bank4;
+                       err = kobject_add(b->kobj, &dev->kobj, name);
+                       if (err)
+                               goto out;
+
+                       per_cpu(threshold_banks, cpu)[bank] = b;
+                       refcount_inc(&b->cpus);
+
+                       err = __threshold_add_blocks(b);
+
+                       goto out;
+               }
+       }
+
+       b = kzalloc(sizeof(struct threshold_bank), GFP_KERNEL);
+       if (!b) {
+               err = -ENOMEM;
+               goto out;
+       }
+
+       b->kobj = kobject_create_and_add(name, &dev->kobj);
+       if (!b->kobj) {
+               err = -EINVAL;
+               goto out_free;
+       }
+
+       per_cpu(threshold_banks, cpu)[bank] = b;
+
+       if (is_shared_bank(bank)) {
+               refcount_set(&b->cpus, 1);
+
+               /* nb is already initialized, see above */
+               if (nb) {
+                       WARN_ON(nb->bank4);
+                       nb->bank4 = b;
+               }
+       }
+
+       err = allocate_threshold_blocks(cpu, bank, 0, msr_ops.misc(bank));
+       if (!err)
+               goto out;
+
+ out_free:
+       kfree(b);
+
+ out:
+       return err;
+}
+
+static void deallocate_threshold_block(unsigned int cpu,
+                                                unsigned int bank)
+{
+       struct threshold_block *pos = NULL;
+       struct threshold_block *tmp = NULL;
+       struct threshold_bank *head = per_cpu(threshold_banks, cpu)[bank];
+
+       if (!head)
+               return;
+
+       list_for_each_entry_safe(pos, tmp, &head->blocks->miscj, miscj) {
+               kobject_put(&pos->kobj);
+               list_del(&pos->miscj);
+               kfree(pos);
+       }
+
+       kfree(per_cpu(threshold_banks, cpu)[bank]->blocks);
+       per_cpu(threshold_banks, cpu)[bank]->blocks = NULL;
+}
+
+static void __threshold_remove_blocks(struct threshold_bank *b)
+{
+       struct threshold_block *pos = NULL;
+       struct threshold_block *tmp = NULL;
+
+       kobject_del(b->kobj);
+
+       list_for_each_entry_safe(pos, tmp, &b->blocks->miscj, miscj)
+               kobject_del(&pos->kobj);
+}
+
+static void threshold_remove_bank(unsigned int cpu, int bank)
+{
+       struct amd_northbridge *nb;
+       struct threshold_bank *b;
+
+       b = per_cpu(threshold_banks, cpu)[bank];
+       if (!b)
+               return;
+
+       if (!b->blocks)
+               goto free_out;
+
+       if (is_shared_bank(bank)) {
+               if (!refcount_dec_and_test(&b->cpus)) {
+                       __threshold_remove_blocks(b);
+                       per_cpu(threshold_banks, cpu)[bank] = NULL;
+                       return;
+               } else {
+                       /*
+                        * the last CPU on this node using the shared bank is
+                        * going away, remove that bank now.
+                        */
+                       nb = node_to_amd_nb(amd_get_nb_id(cpu));
+                       nb->bank4 = NULL;
+               }
+       }
+
+       deallocate_threshold_block(cpu, bank);
+
+free_out:
+       kobject_del(b->kobj);
+       kobject_put(b->kobj);
+       kfree(b);
+       per_cpu(threshold_banks, cpu)[bank] = NULL;
+}
+
+int mce_threshold_remove_device(unsigned int cpu)
+{
+       unsigned int bank;
+
+       for (bank = 0; bank < mca_cfg.banks; ++bank) {
+               if (!(per_cpu(bank_map, cpu) & (1 << bank)))
+                       continue;
+               threshold_remove_bank(cpu, bank);
+       }
+       kfree(per_cpu(threshold_banks, cpu));
+       per_cpu(threshold_banks, cpu) = NULL;
+       return 0;
+}
+
+/* create dir/files for all valid threshold banks */
+int mce_threshold_create_device(unsigned int cpu)
+{
+       unsigned int bank;
+       struct threshold_bank **bp;
+       int err = 0;
+
+       bp = per_cpu(threshold_banks, cpu);
+       if (bp)
+               return 0;
+
+       bp = kcalloc(mca_cfg.banks, sizeof(struct threshold_bank *),
+                    GFP_KERNEL);
+       if (!bp)
+               return -ENOMEM;
+
+       per_cpu(threshold_banks, cpu) = bp;
+
+       for (bank = 0; bank < mca_cfg.banks; ++bank) {
+               if (!(per_cpu(bank_map, cpu) & (1 << bank)))
+                       continue;
+               err = threshold_create_bank(cpu, bank);
+               if (err)
+                       goto err;
+       }
+       return err;
+err:
+       mce_threshold_remove_device(cpu);
+       return err;
+}
+
+static __init int threshold_init_device(void)
+{
+       unsigned lcpu = 0;
+
+       /* to hit CPUs online before the notifier is up */
+       for_each_online_cpu(lcpu) {
+               int err = mce_threshold_create_device(lcpu);
+
+               if (err)
+                       return err;
+       }
+
+       if (thresholding_irq_en)
+               mce_threshold_vector = amd_threshold_interrupt;
+
+       return 0;
+}
+/*
+ * there are 3 funcs which need to be _initcalled in a logical sequence:
+ * 1. xen_late_init_mcelog
+ * 2. mcheck_init_device
+ * 3. threshold_init_device
+ *
+ * xen_late_init_mcelog must register xen_mce_chrdev_device before
+ * native mce_chrdev_device registration if running under xen platform;
+ *
+ * mcheck_init_device should run before threshold_init_device to
+ * initialize mce_device, otherwise a NULL ptr dereference will cause a panic.
+ *
+ * so we use following _initcalls
+ * 1. device_initcall(xen_late_init_mcelog);
+ * 2. device_initcall_sync(mcheck_init_device);
+ * 3. late_initcall(threshold_init_device);
+ *
+ * when running under xen, the initcall order is 1,2,3;
+ * on baremetal, we skip 1 and we do only 2 and 3.
+ */
+late_initcall(threshold_init_device);
diff --git a/arch/x86/kernel/cpu/mce/apei.c b/arch/x86/kernel/cpu/mce/apei.c
new file mode 100644 (file)
index 0000000..1d9b3ce
--- /dev/null
@@ -0,0 +1,157 @@
+/*
+ * Bridge between MCE and APEI
+ *
+ * On some machines, corrected memory errors are reported via APEI
+ * generic hardware error source (GHES) instead of corrected Machine
+ * Check. These corrected memory errors can be reported to user space
+ * through /dev/mcelog via faking a corrected Machine Check, so that
+ * the error memory page can be offlined by /sbin/mcelog if the error
+ * count for one page is beyond the threshold.
+ *
+ * For fatal MCE, save MCE record into persistent storage via ERST, so
+ * that the MCE record can be logged after reboot via ERST.
+ *
+ * Copyright 2010 Intel Corp.
+ *   Author: Huang Ying <ying.huang@intel.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version
+ * 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+#include <linux/export.h>
+#include <linux/kernel.h>
+#include <linux/acpi.h>
+#include <linux/cper.h>
+#include <acpi/apei.h>
+#include <acpi/ghes.h>
+#include <asm/mce.h>
+
+#include "internal.h"
+
+void apei_mce_report_mem_error(int severity, struct cper_sec_mem_err *mem_err)
+{
+       struct mce m;
+
+       if (!(mem_err->validation_bits & CPER_MEM_VALID_PA))
+               return;
+
+       mce_setup(&m);
+       m.bank = -1;
+       /* Fake a memory read error with unknown channel */
+       m.status = MCI_STATUS_VAL | MCI_STATUS_EN | MCI_STATUS_ADDRV | 0x9f;
+
+       if (severity >= GHES_SEV_RECOVERABLE)
+               m.status |= MCI_STATUS_UC;
+
+       if (severity >= GHES_SEV_PANIC) {
+               m.status |= MCI_STATUS_PCC;
+               m.tsc = rdtsc();
+       }
+
+       m.addr = mem_err->physical_addr;
+       mce_log(&m);
+}
+EXPORT_SYMBOL_GPL(apei_mce_report_mem_error);
+
+#define CPER_CREATOR_MCE                                               \
+       UUID_LE(0x75a574e3, 0x5052, 0x4b29, 0x8a, 0x8e, 0xbe, 0x2c,     \
+               0x64, 0x90, 0xb8, 0x9d)
+#define CPER_SECTION_TYPE_MCE                                          \
+       UUID_LE(0xfe08ffbe, 0x95e4, 0x4be7, 0xbc, 0x73, 0x40, 0x96,     \
+               0x04, 0x4a, 0x38, 0xfc)
+
+/*
+ * CPER specification (in UEFI specification 2.3 appendix N) requires
+ * byte-packed.
+ */
+struct cper_mce_record {
+       struct cper_record_header hdr;
+       struct cper_section_descriptor sec_hdr;
+       struct mce mce;
+} __packed;
+
+int apei_write_mce(struct mce *m)
+{
+       struct cper_mce_record rcd;
+
+       memset(&rcd, 0, sizeof(rcd));
+       memcpy(rcd.hdr.signature, CPER_SIG_RECORD, CPER_SIG_SIZE);
+       rcd.hdr.revision = CPER_RECORD_REV;
+       rcd.hdr.signature_end = CPER_SIG_END;
+       rcd.hdr.section_count = 1;
+       rcd.hdr.error_severity = CPER_SEV_FATAL;
+       /* timestamp, platform_id, partition_id are all invalid */
+       rcd.hdr.validation_bits = 0;
+       rcd.hdr.record_length = sizeof(rcd);
+       rcd.hdr.creator_id = CPER_CREATOR_MCE;
+       rcd.hdr.notification_type = CPER_NOTIFY_MCE;
+       rcd.hdr.record_id = cper_next_record_id();
+       rcd.hdr.flags = CPER_HW_ERROR_FLAGS_PREVERR;
+
+       rcd.sec_hdr.section_offset = (void *)&rcd.mce - (void *)&rcd;
+       rcd.sec_hdr.section_length = sizeof(rcd.mce);
+       rcd.sec_hdr.revision = CPER_SEC_REV;
+       /* fru_id and fru_text are invalid */
+       rcd.sec_hdr.validation_bits = 0;
+       rcd.sec_hdr.flags = CPER_SEC_PRIMARY;
+       rcd.sec_hdr.section_type = CPER_SECTION_TYPE_MCE;
+       rcd.sec_hdr.section_severity = CPER_SEV_FATAL;
+
+       memcpy(&rcd.mce, m, sizeof(*m));
+
+       return erst_write(&rcd.hdr);
+}
+
+ssize_t apei_read_mce(struct mce *m, u64 *record_id)
+{
+       struct cper_mce_record rcd;
+       int rc, pos;
+
+       rc = erst_get_record_id_begin(&pos);
+       if (rc)
+               return rc;
+retry:
+       rc = erst_get_record_id_next(&pos, record_id);
+       if (rc)
+               goto out;
+       /* no more record */
+       if (*record_id == APEI_ERST_INVALID_RECORD_ID)
+               goto out;
+       rc = erst_read(*record_id, &rcd.hdr, sizeof(rcd));
+       /* someone else has cleared the record, try next one */
+       if (rc == -ENOENT)
+               goto retry;
+       else if (rc < 0)
+               goto out;
+       /* try to skip other type records in storage */
+       else if (rc != sizeof(rcd) ||
+                uuid_le_cmp(rcd.hdr.creator_id, CPER_CREATOR_MCE))
+               goto retry;
+       memcpy(m, &rcd.mce, sizeof(*m));
+       rc = sizeof(*m);
+out:
+       erst_get_record_id_end();
+
+       return rc;
+}
+
+/* Check whether there is record in ERST */
+int apei_check_mce(void)
+{
+       return erst_get_record_count();
+}
+
+int apei_clear_mce(u64 record_id)
+{
+       return erst_clear(record_id);
+}
diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c
new file mode 100644 (file)
index 0000000..b0ae12c
--- /dev/null
@@ -0,0 +1,2499 @@
+/*
+ * Machine check handler.
+ *
+ * K8 parts Copyright 2002,2003 Andi Kleen, SuSE Labs.
+ * Rest from unknown author(s).
+ * 2004 Andi Kleen. Rewrote most of it.
+ * Copyright 2008 Intel Corporation
+ * Author: Andi Kleen
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/thread_info.h>
+#include <linux/capability.h>
+#include <linux/miscdevice.h>
+#include <linux/ratelimit.h>
+#include <linux/rcupdate.h>
+#include <linux/kobject.h>
+#include <linux/uaccess.h>
+#include <linux/kdebug.h>
+#include <linux/kernel.h>
+#include <linux/percpu.h>
+#include <linux/string.h>
+#include <linux/device.h>
+#include <linux/syscore_ops.h>
+#include <linux/delay.h>
+#include <linux/ctype.h>
+#include <linux/sched.h>
+#include <linux/sysfs.h>
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/init.h>
+#include <linux/kmod.h>
+#include <linux/poll.h>
+#include <linux/nmi.h>
+#include <linux/cpu.h>
+#include <linux/ras.h>
+#include <linux/smp.h>
+#include <linux/fs.h>
+#include <linux/mm.h>
+#include <linux/debugfs.h>
+#include <linux/irq_work.h>
+#include <linux/export.h>
+#include <linux/jump_label.h>
+#include <linux/set_memory.h>
+
+#include <asm/intel-family.h>
+#include <asm/processor.h>
+#include <asm/traps.h>
+#include <asm/tlbflush.h>
+#include <asm/mce.h>
+#include <asm/msr.h>
+#include <asm/reboot.h>
+
+#include "internal.h"
+
+static DEFINE_MUTEX(mce_log_mutex);
+
+/* sysfs synchronization */
+static DEFINE_MUTEX(mce_sysfs_mutex);
+
+#define CREATE_TRACE_POINTS
+#include <trace/events/mce.h>
+
+#define SPINUNIT               100     /* 100ns */
+
+DEFINE_PER_CPU(unsigned, mce_exception_count);
+
+struct mce_bank *mce_banks __read_mostly;
+struct mce_vendor_flags mce_flags __read_mostly;
+
+struct mca_config mca_cfg __read_mostly = {
+       .bootlog  = -1,
+       /*
+        * Tolerant levels:
+        * 0: always panic on uncorrected errors, log corrected errors
+        * 1: panic or SIGBUS on uncorrected errors, log corrected errors
+        * 2: SIGBUS or log uncorrected errors (if possible), log corr. errors
+        * 3: never panic or SIGBUS, log all errors (for testing only)
+        */
+       .tolerant = 1,
+       .monarch_timeout = -1
+};
+
+static DEFINE_PER_CPU(struct mce, mces_seen);
+static unsigned long mce_need_notify;
+static int cpu_missing;
+
+/*
+ * MCA banks polled by the period polling timer for corrected events.
+ * With Intel CMCI, this only has MCA banks which do not support CMCI (if any).
+ */
+DEFINE_PER_CPU(mce_banks_t, mce_poll_banks) = {
+       [0 ... BITS_TO_LONGS(MAX_NR_BANKS)-1] = ~0UL
+};
+
+/*
+ * MCA banks controlled through firmware first for corrected errors.
+ * This is a global list of banks for which we won't enable CMCI and we
+ * won't poll. Firmware controls these banks and is responsible for
+ * reporting corrected errors through GHES. Uncorrected/recoverable
+ * errors are still notified through a machine check.
+ */
+mce_banks_t mce_banks_ce_disabled;
+
+static struct work_struct mce_work;
+static struct irq_work mce_irq_work;
+
+static void (*quirk_no_way_out)(int bank, struct mce *m, struct pt_regs *regs);
+
+/*
+ * CPU/chipset specific EDAC code can register a notifier call here to print
+ * MCE errors in a human-readable form.
+ */
+BLOCKING_NOTIFIER_HEAD(x86_mce_decoder_chain);
+
+/* Do initial initialization of a struct mce */
+void mce_setup(struct mce *m)
+{
+       memset(m, 0, sizeof(struct mce));
+       m->cpu = m->extcpu = smp_processor_id();
+       /* need the internal __ version to avoid deadlocks */
+       m->time = __ktime_get_real_seconds();
+       m->cpuvendor = boot_cpu_data.x86_vendor;
+       m->cpuid = cpuid_eax(1);
+       m->socketid = cpu_data(m->extcpu).phys_proc_id;
+       m->apicid = cpu_data(m->extcpu).initial_apicid;
+       rdmsrl(MSR_IA32_MCG_CAP, m->mcgcap);
+
+       if (this_cpu_has(X86_FEATURE_INTEL_PPIN))
+               rdmsrl(MSR_PPIN, m->ppin);
+
+       m->microcode = boot_cpu_data.microcode;
+}
+
+DEFINE_PER_CPU(struct mce, injectm);
+EXPORT_PER_CPU_SYMBOL_GPL(injectm);
+
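+/*
+ * Log an MCE by adding it to the lockless genpool; if that succeeds, kick
+ * the irq_work so the record gets processed outside of #MC context.
+ */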
+void mce_log(struct mce *m)
+{
+       if (!mce_gen_pool_add(m))
+               irq_work_queue(&mce_irq_work);
+}
+
+void mce_inject_log(struct mce *m)
+{
+       mutex_lock(&mce_log_mutex);
+       mce_log(m);
+       mutex_unlock(&mce_log_mutex);
+}
+EXPORT_SYMBOL_GPL(mce_inject_log);
+
+static struct notifier_block mce_srao_nb;
+
+/*
+ * We run the default notifier only if the SRAO, the first and the default
+ * notifiers are registered, i.e., only the mandatory NUM_DEFAULT_NOTIFIERS
+ * notifiers are registered on the chain.
+ */
+#define NUM_DEFAULT_NOTIFIERS  3
+static atomic_t num_notifiers;
+
+void mce_register_decode_chain(struct notifier_block *nb)
+{
+       if (WARN_ON(nb->priority > MCE_PRIO_MCELOG && nb->priority < MCE_PRIO_EDAC))
+               return;
+
+       atomic_inc(&num_notifiers);
+
+       blocking_notifier_chain_register(&x86_mce_decoder_chain, nb);
+}
+EXPORT_SYMBOL_GPL(mce_register_decode_chain);
+
+void mce_unregister_decode_chain(struct notifier_block *nb)
+{
+       atomic_dec(&num_notifiers);
+
+       blocking_notifier_chain_unregister(&x86_mce_decoder_chain, nb);
+}
+EXPORT_SYMBOL_GPL(mce_unregister_decode_chain);
+
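+/*
+ * MSR accessors: the plain *_reg() helpers return the legacy MCA register
+ * numbers, the smca_*_reg() ones return the Scalable MCA (SMCA) equivalents.
+ * msr_ops selects between them at boot, see __mcheck_cpu_init_early().
+ */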
+static inline u32 ctl_reg(int bank)
+{
+       return MSR_IA32_MCx_CTL(bank);
+}
+
+static inline u32 status_reg(int bank)
+{
+       return MSR_IA32_MCx_STATUS(bank);
+}
+
+static inline u32 addr_reg(int bank)
+{
+       return MSR_IA32_MCx_ADDR(bank);
+}
+
+static inline u32 misc_reg(int bank)
+{
+       return MSR_IA32_MCx_MISC(bank);
+}
+
+static inline u32 smca_ctl_reg(int bank)
+{
+       return MSR_AMD64_SMCA_MCx_CTL(bank);
+}
+
+static inline u32 smca_status_reg(int bank)
+{
+       return MSR_AMD64_SMCA_MCx_STATUS(bank);
+}
+
+static inline u32 smca_addr_reg(int bank)
+{
+       return MSR_AMD64_SMCA_MCx_ADDR(bank);
+}
+
+static inline u32 smca_misc_reg(int bank)
+{
+       return MSR_AMD64_SMCA_MCx_MISC(bank);
+}
+
+struct mca_msr_regs msr_ops = {
+       .ctl    = ctl_reg,
+       .status = status_reg,
+       .addr   = addr_reg,
+       .misc   = misc_reg
+};
+
+static void __print_mce(struct mce *m)
+{
+       pr_emerg(HW_ERR "CPU %d: Machine Check%s: %Lx Bank %d: %016Lx\n",
+                m->extcpu,
+                (m->mcgstatus & MCG_STATUS_MCIP ? " Exception" : ""),
+                m->mcgstatus, m->bank, m->status);
+
+       if (m->ip) {
+               pr_emerg(HW_ERR "RIP%s %02x:<%016Lx> ",
+                       !(m->mcgstatus & MCG_STATUS_EIPV) ? " !INEXACT!" : "",
+                       m->cs, m->ip);
+
+               if (m->cs == __KERNEL_CS)
+                       pr_cont("{%pS}", (void *)(unsigned long)m->ip);
+               pr_cont("\n");
+       }
+
+       pr_emerg(HW_ERR "TSC %llx ", m->tsc);
+       if (m->addr)
+               pr_cont("ADDR %llx ", m->addr);
+       if (m->misc)
+               pr_cont("MISC %llx ", m->misc);
+
+       if (mce_flags.smca) {
+               if (m->synd)
+                       pr_cont("SYND %llx ", m->synd);
+               if (m->ipid)
+                       pr_cont("IPID %llx ", m->ipid);
+       }
+
+       pr_cont("\n");
+       /*
+        * Note this output is parsed by external tools and old fields
+        * should not be changed.
+        */
+       pr_emerg(HW_ERR "PROCESSOR %u:%x TIME %llu SOCKET %u APIC %x microcode %x\n",
+               m->cpuvendor, m->cpuid, m->time, m->socketid, m->apicid,
+               m->microcode);
+}
+
+static void print_mce(struct mce *m)
+{
+       __print_mce(m);
+
+       if (m->cpuvendor != X86_VENDOR_AMD && m->cpuvendor != X86_VENDOR_HYGON)
+               pr_emerg_ratelimited(HW_ERR "Run the above through 'mcelog --ascii'\n");
+}
+
+#define PANIC_TIMEOUT 5 /* 5 seconds */
+
+static atomic_t mce_panicked;
+
+static int fake_panic;
+static atomic_t mce_fake_panicked;
+
+/* Panic in progress. Enable interrupts and wait for final IPI */
+static void wait_for_panic(void)
+{
+       long timeout = PANIC_TIMEOUT*USEC_PER_SEC;
+
+       preempt_disable();
+       local_irq_enable();
+       while (timeout-- > 0)
+               udelay(1);
+       if (panic_timeout == 0)
+               panic_timeout = mca_cfg.panic_timeout;
+       panic("Panicking machine check CPU died");
+}
+
+static void mce_panic(const char *msg, struct mce *final, char *exp)
+{
+       int apei_err = 0;
+       struct llist_node *pending;
+       struct mce_evt_llist *l;
+
+       if (!fake_panic) {
+               /*
+                * Make sure only one CPU runs in machine check panic
+                */
+               if (atomic_inc_return(&mce_panicked) > 1)
+                       wait_for_panic();
+               barrier();
+
+               bust_spinlocks(1);
+               console_verbose();
+       } else {
+               /* Don't log too much for fake panic */
+               if (atomic_inc_return(&mce_fake_panicked) > 1)
+                       return;
+       }
+       pending = mce_gen_pool_prepare_records();
+       /* First print corrected ones that are still unlogged */
+       llist_for_each_entry(l, pending, llnode) {
+               struct mce *m = &l->mce;
+               if (!(m->status & MCI_STATUS_UC)) {
+                       print_mce(m);
+                       if (!apei_err)
+                               apei_err = apei_write_mce(m);
+               }
+       }
+       /* Now print uncorrected but with the final one last */
+       llist_for_each_entry(l, pending, llnode) {
+               struct mce *m = &l->mce;
+               if (!(m->status & MCI_STATUS_UC))
+                       continue;
+               if (!final || mce_cmp(m, final)) {
+                       print_mce(m);
+                       if (!apei_err)
+                               apei_err = apei_write_mce(m);
+               }
+       }
+       if (final) {
+               print_mce(final);
+               if (!apei_err)
+                       apei_err = apei_write_mce(final);
+       }
+       if (cpu_missing)
+               pr_emerg(HW_ERR "Some CPUs didn't answer in synchronization\n");
+       if (exp)
+               pr_emerg(HW_ERR "Machine check: %s\n", exp);
+       if (!fake_panic) {
+               if (panic_timeout == 0)
+                       panic_timeout = mca_cfg.panic_timeout;
+               panic(msg);
+       } else
+               pr_emerg(HW_ERR "Fake kernel panic: %s\n", msg);
+}
+
+/* Support code for software error injection */
+
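+/*
+ * Map an MSR number to the offset of the corresponding field in struct mce,
+ * so that injected values can be read/written instead of the real MSRs.
+ * Returns -1 if the MSR is not emulated for injection.
+ */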
+static int msr_to_offset(u32 msr)
+{
+       unsigned bank = __this_cpu_read(injectm.bank);
+
+       if (msr == mca_cfg.rip_msr)
+               return offsetof(struct mce, ip);
+       if (msr == msr_ops.status(bank))
+               return offsetof(struct mce, status);
+       if (msr == msr_ops.addr(bank))
+               return offsetof(struct mce, addr);
+       if (msr == msr_ops.misc(bank))
+               return offsetof(struct mce, misc);
+       if (msr == MSR_IA32_MCG_STATUS)
+               return offsetof(struct mce, mcgstatus);
+       return -1;
+}
+
+/* MSR access wrappers used for error injection */
+static u64 mce_rdmsrl(u32 msr)
+{
+       u64 v;
+
+       if (__this_cpu_read(injectm.finished)) {
+               int offset = msr_to_offset(msr);
+
+               if (offset < 0)
+                       return 0;
+               return *(u64 *)((char *)this_cpu_ptr(&injectm) + offset);
+       }
+
+       if (rdmsrl_safe(msr, &v)) {
+               WARN_ONCE(1, "mce: Unable to read MSR 0x%x!\n", msr);
+               /*
+                * Return zero in case the access faulted. This should
+                * not happen normally but can happen if the CPU does
+                * something weird, or if the code is buggy.
+                */
+               v = 0;
+       }
+
+       return v;
+}
+
+static void mce_wrmsrl(u32 msr, u64 v)
+{
+       if (__this_cpu_read(injectm.finished)) {
+               int offset = msr_to_offset(msr);
+
+               if (offset >= 0)
+                       *(u64 *)((char *)this_cpu_ptr(&injectm) + offset) = v;
+               return;
+       }
+       wrmsrl(msr, v);
+}
+
+/*
+ * Collect all global (w.r.t. this processor) status about this machine
+ * check into our "mce" struct so that we can use it later to assess
+ * the severity of the problem as we read per-bank specific details.
+ */
+static inline void mce_gather_info(struct mce *m, struct pt_regs *regs)
+{
+       mce_setup(m);
+
+       m->mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS);
+       if (regs) {
+               /*
+                * Get the address of the instruction at the time of
+                * the machine check error.
+                */
+               if (m->mcgstatus & (MCG_STATUS_RIPV|MCG_STATUS_EIPV)) {
+                       m->ip = regs->ip;
+                       m->cs = regs->cs;
+
+                       /*
+                        * When in VM86 mode make the cs look like ring 3
+                        * always. This is a lie, but it's better than passing
+                        * the additional vm86 bit around everywhere.
+                        */
+                       if (v8086_mode(regs))
+                               m->cs |= 3;
+               }
+               /* Use accurate RIP reporting if available. */
+               if (mca_cfg.rip_msr)
+                       m->ip = mce_rdmsrl(mca_cfg.rip_msr);
+       }
+}
+
+int mce_available(struct cpuinfo_x86 *c)
+{
+       if (mca_cfg.disabled)
+               return 0;
+       return cpu_has(c, X86_FEATURE_MCE) && cpu_has(c, X86_FEATURE_MCA);
+}
+
+static void mce_schedule_work(void)
+{
+       if (!mce_gen_pool_empty())
+               schedule_work(&mce_work);
+}
+
+static void mce_irq_work_cb(struct irq_work *entry)
+{
+       mce_schedule_work();
+}
+
+static void mce_report_event(struct pt_regs *regs)
+{
+       if (regs->flags & (X86_VM_MASK|X86_EFLAGS_IF)) {
+               mce_notify_irq();
+               /*
+                * Triggering the work queue here is just an insurance
+                * policy in case the syscall exit notify handler
+                * doesn't run soon enough or ends up running on the
+                * wrong CPU (can happen when audit sleeps)
+                */
+               mce_schedule_work();
+               return;
+       }
+
+       irq_work_queue(&mce_irq_work);
+}
+
+/*
+ * Check if the address reported by the CPU is in a format we can parse.
+ * It would be possible to add code for most other cases, but all would
+ * be somewhat complicated (e.g. segment offset would require an instruction
+ * parser). So only support physical addresses up to page granularity for now.
+ */
+int mce_usable_address(struct mce *m)
+{
+       if (!(m->status & MCI_STATUS_ADDRV))
+               return 0;
+
+       /* Checks after this one are Intel-specific: */
+       if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL)
+               return 1;
+
+       if (!(m->status & MCI_STATUS_MISCV))
+               return 0;
+
+       if (MCI_MISC_ADDR_LSB(m->misc) > PAGE_SHIFT)
+               return 0;
+
+       if (MCI_MISC_ADDR_MODE(m->misc) != MCI_MISC_ADDR_PHYS)
+               return 0;
+
+       return 1;
+}
+EXPORT_SYMBOL_GPL(mce_usable_address);
+
+bool mce_is_memory_error(struct mce *m)
+{
+       if (m->cpuvendor == X86_VENDOR_AMD ||
+           m->cpuvendor == X86_VENDOR_HYGON) {
+               return amd_mce_is_memory_error(m);
+       } else if (m->cpuvendor == X86_VENDOR_INTEL) {
+               /*
+                * Intel SDM Volume 3B - 15.9.2 Compound Error Codes
+                *
+                * Bit 7 of the MCACOD field of IA32_MCi_STATUS is used for
+                * indicating a memory error. Bit 8 is used for indicating a
+                * cache hierarchy error. The combination of bit 2 and bit 3
+                * is used for indicating a `generic' cache hierarchy error.
+                * But we can't just blindly check the above bits, because if
+                * bit 11 is set, then it is a bus/interconnect error - and
+                * either way the above bits just give more detail on what
+                * bus/interconnect error happened. Note that bit 12 can be
+                * ignored, as it's the "filter" bit.
+                */
+               return (m->status & 0xef80) == BIT(7) ||
+                      (m->status & 0xef00) == BIT(8) ||
+                      (m->status & 0xeffc) == 0xc;
+       }
+
+       return false;
+}
+EXPORT_SYMBOL_GPL(mce_is_memory_error);
+
+bool mce_is_correctable(struct mce *m)
+{
+       if (m->cpuvendor == X86_VENDOR_AMD && m->status & MCI_STATUS_DEFERRED)
+               return false;
+
+       if (m->cpuvendor == X86_VENDOR_HYGON && m->status & MCI_STATUS_DEFERRED)
+               return false;
+
+       if (m->status & MCI_STATUS_UC)
+               return false;
+
+       return true;
+}
+EXPORT_SYMBOL_GPL(mce_is_correctable);
+
+static bool cec_add_mce(struct mce *m)
+{
+       if (!m)
+               return false;
+
+       /* We eat only correctable DRAM errors with usable addresses. */
+       if (mce_is_memory_error(m) &&
+           mce_is_correctable(m)  &&
+           mce_usable_address(m))
+               if (!cec_add_elem(m->addr >> PAGE_SHIFT))
+                       return true;
+
+       return false;
+}
+
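+/*
+ * First notifier on the decode chain: feed correctable memory errors to the
+ * CEC (which may consume the event), emit the mce trace record, and wake up
+ * consumers waiting for new MCE events.
+ */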
+static int mce_first_notifier(struct notifier_block *nb, unsigned long val,
+                             void *data)
+{
+       struct mce *m = (struct mce *)data;
+
+       if (!m)
+               return NOTIFY_DONE;
+
+       if (cec_add_mce(m))
+               return NOTIFY_STOP;
+
+       /* Emit the trace record: */
+       trace_mce_record(m);
+
+       set_bit(0, &mce_need_notify);
+
+       mce_notify_irq();
+
+       return NOTIFY_DONE;
+}
+
+static struct notifier_block first_nb = {
+       .notifier_call  = mce_first_notifier,
+       .priority       = MCE_PRIO_FIRST,
+};
+
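+/*
+ * SRAO (Software Recoverable Action Optional) notifier: for action-optional
+ * errors with a usable address, try to offline the affected page via
+ * memory_failure().
+ */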
+static int srao_decode_notifier(struct notifier_block *nb, unsigned long val,
+                               void *data)
+{
+       struct mce *mce = (struct mce *)data;
+       unsigned long pfn;
+
+       if (!mce)
+               return NOTIFY_DONE;
+
+       if (mce_usable_address(mce) && (mce->severity == MCE_AO_SEVERITY)) {
+               pfn = mce->addr >> PAGE_SHIFT;
+               if (!memory_failure(pfn, 0))
+                       set_mce_nospec(pfn);
+       }
+
+       return NOTIFY_OK;
+}
+static struct notifier_block mce_srao_nb = {
+       .notifier_call  = srao_decode_notifier,
+       .priority       = MCE_PRIO_SRAO,
+};
+
+static int mce_default_notifier(struct notifier_block *nb, unsigned long val,
+                               void *data)
+{
+       struct mce *m = (struct mce *)data;
+
+       if (!m)
+               return NOTIFY_DONE;
+
+       if (atomic_read(&num_notifiers) > NUM_DEFAULT_NOTIFIERS)
+               return NOTIFY_DONE;
+
+       __print_mce(m);
+
+       return NOTIFY_DONE;
+}
+
+static struct notifier_block mce_default_nb = {
+       .notifier_call  = mce_default_notifier,
+       /* lowest prio, we want it to run last. */
+       .priority       = MCE_PRIO_LOWEST,
+};
+
+/*
+ * Read ADDR and MISC registers.
+ */
+static void mce_read_aux(struct mce *m, int i)
+{
+       if (m->status & MCI_STATUS_MISCV)
+               m->misc = mce_rdmsrl(msr_ops.misc(i));
+
+       if (m->status & MCI_STATUS_ADDRV) {
+               m->addr = mce_rdmsrl(msr_ops.addr(i));
+
+               /*
+                * Mask the reported address by the reported granularity.
+                */
+               if (mca_cfg.ser && (m->status & MCI_STATUS_MISCV)) {
+                       u8 shift = MCI_MISC_ADDR_LSB(m->misc);
+                       m->addr >>= shift;
+                       m->addr <<= shift;
+               }
+
+               /*
+                * Extract [55:<lsb>] where lsb is the least significant
+                * *valid* bit of the address bits.
+                */
+               if (mce_flags.smca) {
+                       u8 lsb = (m->addr >> 56) & 0x3f;
+
+                       m->addr &= GENMASK_ULL(55, lsb);
+               }
+       }
+
+       if (mce_flags.smca) {
+               m->ipid = mce_rdmsrl(MSR_AMD64_SMCA_MCx_IPID(i));
+
+               if (m->status & MCI_STATUS_SYNDV)
+                       m->synd = mce_rdmsrl(MSR_AMD64_SMCA_MCx_SYND(i));
+       }
+}
+
+DEFINE_PER_CPU(unsigned, mce_poll_count);
+
+/*
+ * Poll for corrected events or events that happened before reset.
+ * Those are just logged through /dev/mcelog.
+ *
+ * This is executed in standard interrupt context.
+ *
+ * Note: the spec recommends panicking for fatal unsignalled
+ * errors here. However this would be quite problematic --
+ * we would need to reimplement the Monarch handling and
+ * it would mess up the exclusion between the exception handler
+ * and the poll handler -- so we skip this for now.
+ * These cases should not happen anyway, or only when the CPU
+ * is already totally confused. In this case it's likely it will
+ * not fully execute the machine check handler either.
+ */
+bool machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
+{
+       bool error_seen = false;
+       struct mce m;
+       int i;
+
+       this_cpu_inc(mce_poll_count);
+
+       mce_gather_info(&m, NULL);
+
+       if (flags & MCP_TIMESTAMP)
+               m.tsc = rdtsc();
+
+       for (i = 0; i < mca_cfg.banks; i++) {
+               if (!mce_banks[i].ctl || !test_bit(i, *b))
+                       continue;
+
+               m.misc = 0;
+               m.addr = 0;
+               m.bank = i;
+
+               barrier();
+               m.status = mce_rdmsrl(msr_ops.status(i));
+               if (!(m.status & MCI_STATUS_VAL))
+                       continue;
+
+               /*
+                * Uncorrected or signalled events are handled by the exception
+                * handler when it is enabled, so don't process those here.
+                *
+                * TBD do the same check for MCI_STATUS_EN here?
+                */
+               if (!(flags & MCP_UC) &&
+                   (m.status & (mca_cfg.ser ? MCI_STATUS_S : MCI_STATUS_UC)))
+                       continue;
+
+               error_seen = true;
+
+               mce_read_aux(&m, i);
+
+               m.severity = mce_severity(&m, mca_cfg.tolerant, NULL, false);
+
+               /*
+                * Don't get the IP here because it's unlikely to
+                * have anything to do with the actual error location.
+                */
+               if (!(flags & MCP_DONTLOG) && !mca_cfg.dont_log_ce)
+                       mce_log(&m);
+               else if (mce_usable_address(&m)) {
+                       /*
+                        * Although we skipped logging this, we still want
+                        * to take action. Add to the pool so the registered
+                        * notifiers will see it.
+                        */
+                       if (!mce_gen_pool_add(&m))
+                               mce_schedule_work();
+               }
+
+               /*
+                * Clear state for this bank.
+                */
+               mce_wrmsrl(msr_ops.status(i), 0);
+       }
+
+       /*
+        * Don't clear MCG_STATUS here because it's only defined for
+        * exceptions.
+        */
+
+       sync_core();
+
+       return error_seen;
+}
+EXPORT_SYMBOL_GPL(machine_check_poll);
+
+/*
+ * Do a quick check if any of the events requires a panic.
+ * This decides if we keep the events around or clear them.
+ */
+static int mce_no_way_out(struct mce *m, char **msg, unsigned long *validp,
+                         struct pt_regs *regs)
+{
+       char *tmp;
+       int i;
+
+       for (i = 0; i < mca_cfg.banks; i++) {
+               m->status = mce_rdmsrl(msr_ops.status(i));
+               if (!(m->status & MCI_STATUS_VAL))
+                       continue;
+
+               __set_bit(i, validp);
+               if (quirk_no_way_out)
+                       quirk_no_way_out(i, m, regs);
+
+               if (mce_severity(m, mca_cfg.tolerant, &tmp, true) >= MCE_PANIC_SEVERITY) {
+                       mce_read_aux(m, i);
+                       *msg = tmp;
+                       return 1;
+               }
+       }
+       return 0;
+}
+
+/*
+ * Variable to establish order between CPUs while scanning.
+ * Each CPU spins initially until mce_executing reaches its callin order.
+ */
+static atomic_t mce_executing;
+
+/*
+ * Defines order of CPUs on entry. First CPU becomes Monarch.
+ */
+static atomic_t mce_callin;
+
+/*
+ * Check if a timeout waiting for other CPUs happened.
+ */
+static int mce_timed_out(u64 *t, const char *msg)
+{
+       /*
+        * The others already did panic for some reason.
+        * Bail out like in a timeout.
+        * rmb() to tell the compiler that system_state
+        * might have been modified by someone else.
+        */
+       rmb();
+       if (atomic_read(&mce_panicked))
+               wait_for_panic();
+       if (!mca_cfg.monarch_timeout)
+               goto out;
+       if ((s64)*t < SPINUNIT) {
+               if (mca_cfg.tolerant <= 1)
+                       mce_panic(msg, NULL, NULL);
+               cpu_missing = 1;
+               return 1;
+       }
+       *t -= SPINUNIT;
+out:
+       touch_nmi_watchdog();
+       return 0;
+}
+
+/*
+ * The Monarch's reign.  The Monarch is the CPU who entered
+ * the machine check handler first. It waits for the others to
+ * raise the exception too and then grades them. If any error is
+ * fatal, it panics. Only then does it let the others continue.
+ *
+ * The other CPUs entering the MCE handler will be controlled by the
+ * Monarch. They are called Subjects.
+ *
+ * This way we prevent any potential data corruption in an unrecoverable case
+ * and also make sure that all CPUs' errors are always examined.
+ *
+ * Also this detects the case of a machine check event coming from outer
+ * space (not detected by any CPU). In this case some external agent wants
+ * us to shut down, so panic too.
+ *
+ * The other CPUs might still decide to panic if the handler happens
+ * in an unrecoverable place, but in this case the system is in a semi-stable
+ * state and won't corrupt anything by itself. It's ok to let the others
+ * continue for a bit first.
+ *
+ * All the spin loops have timeouts; when a timeout happens a CPU
+ * typically elects itself to be Monarch.
+ */
+static void mce_reign(void)
+{
+       int cpu;
+       struct mce *m = NULL;
+       int global_worst = 0;
+       char *msg = NULL;
+       char *nmsg = NULL;
+
+       /*
+        * This CPU is the Monarch and the other CPUs have run
+        * through their handlers.
+        * Grade the severity of the errors of all the CPUs.
+        */
+       for_each_possible_cpu(cpu) {
+               int severity = mce_severity(&per_cpu(mces_seen, cpu),
+                                           mca_cfg.tolerant,
+                                           &nmsg, true);
+               if (severity > global_worst) {
+                       msg = nmsg;
+                       global_worst = severity;
+                       m = &per_cpu(mces_seen, cpu);
+               }
+       }
+
+       /*
+        * Cannot recover? Panic here then.
+        * This dumps all the mces in the log buffer and stops the
+        * other CPUs.
+        */
+       if (m && global_worst >= MCE_PANIC_SEVERITY && mca_cfg.tolerant < 3)
+               mce_panic("Fatal machine check", m, msg);
+
+       /*
+        * For a UC error somewhere we let the CPU which detects it handle it.
+        * We must also let the others continue, otherwise the handling
+        * CPU could deadlock on a lock.
+        */
+
+       /*
+        * No machine check event found. Must be some external
+        * source or one CPU is hung. Panic.
+        */
+       if (global_worst <= MCE_KEEP_SEVERITY && mca_cfg.tolerant < 3)
+               mce_panic("Fatal machine check from unknown source", NULL, NULL);
+
+       /*
+        * Now clear all the mces_seen so that they don't reappear on
+        * the next mce.
+        */
+       for_each_possible_cpu(cpu)
+               memset(&per_cpu(mces_seen, cpu), 0, sizeof(struct mce));
+}
+
+static atomic_t global_nwo;
+
+/*
+ * Start of Monarch synchronization. This waits until all CPUs have
+ * entered the exception handler and then determines if any of them
+ * saw a fatal event that requires panic. Then it executes them
+ * in the entry order.
+ * TBD double check parallel CPU hotunplug
+ */
+static int mce_start(int *no_way_out)
+{
+       int order;
+       int cpus = num_online_cpus();
+       u64 timeout = (u64)mca_cfg.monarch_timeout * NSEC_PER_USEC;
+
+       if (!timeout)
+               return -1;
+
+       atomic_add(*no_way_out, &global_nwo);
+       /*
+        * Rely on the implied barrier below, such that global_nwo
+        * is updated before mce_callin.
+        */
+       order = atomic_inc_return(&mce_callin);
+
+       /*
+        * Wait for everyone.
+        */
+       while (atomic_read(&mce_callin) != cpus) {
+               if (mce_timed_out(&timeout,
+                                 "Timeout: Not all CPUs entered broadcast exception handler")) {
+                       atomic_set(&global_nwo, 0);
+                       return -1;
+               }
+               ndelay(SPINUNIT);
+       }
+
+       /*
+        * mce_callin should be read before global_nwo
+        */
+       smp_rmb();
+
+       if (order == 1) {
+               /*
+                * Monarch: Starts executing now, the others wait.
+                */
+               atomic_set(&mce_executing, 1);
+       } else {
+               /*
+                * Subject: Now start the scanning loop one by one in
+                * the original callin order.
+                * This way when there are any shared banks it will be
+                * only seen by one CPU before cleared, avoiding duplicates.
+                */
+               while (atomic_read(&mce_executing) < order) {
+                       if (mce_timed_out(&timeout,
+                                         "Timeout: Subject CPUs unable to finish machine check processing")) {
+                               atomic_set(&global_nwo, 0);
+                               return -1;
+                       }
+                       ndelay(SPINUNIT);
+               }
+       }
+
+       /*
+        * Cache the global no_way_out state.
+        */
+       *no_way_out = atomic_read(&global_nwo);
+
+       return order;
+}
+
+/*
+ * Synchronize between CPUs after main scanning loop.
+ * This invokes the bulk of the Monarch processing.
+ */
+static int mce_end(int order)
+{
+       int ret = -1;
+       u64 timeout = (u64)mca_cfg.monarch_timeout * NSEC_PER_USEC;
+
+       if (!timeout)
+               goto reset;
+       if (order < 0)
+               goto reset;
+
+       /*
+        * Allow others to run.
+        */
+       atomic_inc(&mce_executing);
+
+       if (order == 1) {
+               /* CHECKME: Can this race with a parallel hotplug? */
+               int cpus = num_online_cpus();
+
+               /*
+                * Monarch: Wait for everyone to go through their scanning
+                * loops.
+                */
+               while (atomic_read(&mce_executing) <= cpus) {
+                       if (mce_timed_out(&timeout,
+                                         "Timeout: Monarch CPU unable to finish machine check processing"))
+                               goto reset;
+                       ndelay(SPINUNIT);
+               }
+
+               mce_reign();
+               barrier();
+               ret = 0;
+       } else {
+               /*
+                * Subject: Wait for Monarch to finish.
+                */
+               while (atomic_read(&mce_executing) != 0) {
+                       if (mce_timed_out(&timeout,
+                                         "Timeout: Monarch CPU did not finish machine check processing"))
+                               goto reset;
+                       ndelay(SPINUNIT);
+               }
+
+               /*
+                * Don't reset anything. That's done by the Monarch.
+                */
+               return 0;
+       }
+
+       /*
+        * Reset all global state.
+        */
+reset:
+       atomic_set(&global_nwo, 0);
+       atomic_set(&mce_callin, 0);
+       barrier();
+
+       /*
+        * Let others run again.
+        */
+       atomic_set(&mce_executing, 0);
+       return ret;
+}
+
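+/* Clear the status registers of all banks marked in @toclear. */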
+static void mce_clear_state(unsigned long *toclear)
+{
+       int i;
+
+       for (i = 0; i < mca_cfg.banks; i++) {
+               if (test_bit(i, toclear))
+                       mce_wrmsrl(msr_ops.status(i), 0);
+       }
+}
+
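+/*
+ * Offline the page which took an uncorrected error in user space and, on
+ * success, mark it via set_mce_nospec() so the kernel does not touch the
+ * poisoned memory again.
+ */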
+static int do_memory_failure(struct mce *m)
+{
+       int flags = MF_ACTION_REQUIRED;
+       int ret;
+
+       pr_err("Uncorrected hardware memory error in user-access at %llx", m->addr);
+       if (!(m->mcgstatus & MCG_STATUS_RIPV))
+               flags |= MF_MUST_KILL;
+       ret = memory_failure(m->addr >> PAGE_SHIFT, flags);
+       if (ret)
+               pr_err("Memory error not recovered");
+       else
+               set_mce_nospec(m->addr >> PAGE_SHIFT);
+       return ret;
+}
+
+/*
+ * Cases where we avoid rendezvous handler timeout:
+ * 1) If this CPU is offline.
+ *
+ * 2) If crashing_cpu was set, e.g. we're entering kdump and we need to
+ *  skip those CPUs which remain looping in the 1st kernel - see
+ *  crash_nmi_callback().
+ *
+ * Note: there still is a small window between kexec-ing and the new,
+ * kdump kernel establishing a new #MC handler where a broadcasted MCE
+ * might not get handled properly.
+ */
+static bool __mc_check_crashing_cpu(int cpu)
+{
+       if (cpu_is_offline(cpu) ||
+           (crashing_cpu != -1 && crashing_cpu != cpu)) {
+               u64 mcgstatus;
+
+               mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS);
+               if (mcgstatus & MCG_STATUS_RIPV) {
+                       mce_wrmsrl(MSR_IA32_MCG_STATUS, 0);
+                       return true;
+               }
+       }
+       return false;
+}
+
+static void __mc_scan_banks(struct mce *m, struct mce *final,
+                           unsigned long *toclear, unsigned long *valid_banks,
+                           int no_way_out, int *worst)
+{
+       struct mca_config *cfg = &mca_cfg;
+       int severity, i;
+
+       for (i = 0; i < cfg->banks; i++) {
+               __clear_bit(i, toclear);
+               if (!test_bit(i, valid_banks))
+                       continue;
+
+               if (!mce_banks[i].ctl)
+                       continue;
+
+               m->misc = 0;
+               m->addr = 0;
+               m->bank = i;
+
+               m->status = mce_rdmsrl(msr_ops.status(i));
+               if (!(m->status & MCI_STATUS_VAL))
+                       continue;
+
+               /*
+                * Corrected or non-signaled errors are handled by
+                * machine_check_poll(). Leave them alone, unless this panics.
+                */
+               if (!(m->status & (cfg->ser ? MCI_STATUS_S : MCI_STATUS_UC)) &&
+                       !no_way_out)
+                       continue;
+
+               /* Set taint even when machine check was not enabled. */
+               add_taint(TAINT_MACHINE_CHECK, LOCKDEP_NOW_UNRELIABLE);
+
+               severity = mce_severity(m, cfg->tolerant, NULL, true);
+
+               /*
+                * If the machine check was for a corrected or deferred error,
+                * don't touch it here, unless we're panicking.
+                */
+               if ((severity == MCE_KEEP_SEVERITY ||
+                    severity == MCE_UCNA_SEVERITY) && !no_way_out)
+                       continue;
+
+               __set_bit(i, toclear);
+
+               /* Machine check event was not enabled. Clear, but ignore. */
+               if (severity == MCE_NO_SEVERITY)
+                       continue;
+
+               mce_read_aux(m, i);
+
+               /* assuming valid severity level != 0 */
+               m->severity = severity;
+
+               mce_log(m);
+
+               if (severity > *worst) {
+                       *final = *m;
+                       *worst = severity;
+               }
+       }
+
+       /* mce_clear_state will clear *final, save locally for use later */
+       *m = *final;
+}
+
+/*
+ * The actual machine check handler. This only handles real
+ * exceptions when something got corrupted coming in through int 18.
+ *
+ * This is executed in NMI context not subject to normal locking rules. This
+ * implies that most kernel services cannot be safely used. Don't even
+ * think about putting a printk in there!
+ *
+ * On Intel systems this is entered on all CPUs in parallel through
+ * MCE broadcast. However some CPUs might be broken beyond repair,
+ * so always be careful when synchronizing with others.
+ */
+void do_machine_check(struct pt_regs *regs, long error_code)
+{
+       DECLARE_BITMAP(valid_banks, MAX_NR_BANKS);
+       DECLARE_BITMAP(toclear, MAX_NR_BANKS);
+       struct mca_config *cfg = &mca_cfg;
+       int cpu = smp_processor_id();
+       char *msg = "Unknown";
+       struct mce m, *final;
+       int worst = 0;
+
+       /*
+        * Establish sequential order between the CPUs entering the machine
+        * check handler.
+        */
+       int order = -1;
+
+       /*
+        * If no_way_out gets set, there is no safe way to recover from this
+        * MCE.  If mca_cfg.tolerant is cranked up, we'll try anyway.
+        */
+       int no_way_out = 0;
+
+       /*
+        * If kill_it gets set, there might be a way to recover from this
+        * error.
+        */
+       int kill_it = 0;
+
+       /*
+        * MCEs are always local on AMD. On Intel, whether this MCE is local
+        * is determined by MCG_STATUS_LMCES.
+        */
+       int lmce = 1;
+
+       if (__mc_check_crashing_cpu(cpu))
+               return;
+
+       ist_enter(regs);
+
+       this_cpu_inc(mce_exception_count);
+
+       mce_gather_info(&m, regs);
+       m.tsc = rdtsc();
+
+       final = this_cpu_ptr(&mces_seen);
+       *final = m;
+
+       memset(valid_banks, 0, sizeof(valid_banks));
+       no_way_out = mce_no_way_out(&m, &msg, valid_banks, regs);
+
+       barrier();
+
+       /*
+        * If there is no restart IP we might need to kill the process or panic.
+        * Assume the worst for now, but if we find the
+        * severity is MCE_AR_SEVERITY we have other options.
+        */
+       if (!(m.mcgstatus & MCG_STATUS_RIPV))
+               kill_it = 1;
+
+       /*
+        * Check if this MCE is signaled to only this logical processor,
+        * on Intel only.
+        */
+       if (m.cpuvendor == X86_VENDOR_INTEL)
+               lmce = m.mcgstatus & MCG_STATUS_LMCES;
+
+       /*
+        * Local machine check may already know that we have to panic.
+        * Broadcast machine check begins rendezvous in mce_start()
+        * Go through all banks in exclusion of the other CPUs. This way we
+        * don't report duplicated events on shared banks because the first one
+        * to see it will clear it.
+        */
+       if (lmce) {
+               if (no_way_out)
+                       mce_panic("Fatal local machine check", &m, msg);
+       } else {
+               order = mce_start(&no_way_out);
+       }
+
+       __mc_scan_banks(&m, final, toclear, valid_banks, no_way_out, &worst);
+
+       if (!no_way_out)
+               mce_clear_state(toclear);
+
+       /*
+        * Do most of the synchronization with other CPUs.
+        * When there's any problem use only local no_way_out state.
+        */
+       if (!lmce) {
+               if (mce_end(order) < 0)
+                       no_way_out = worst >= MCE_PANIC_SEVERITY;
+       } else {
+               /*
+                * If there was a fatal machine check we should have
+                * already called mce_panic earlier in this function.
+                * Since we re-read the banks, we might have found
+                * something new. Check again to see if we found a
+                * fatal error. We call "mce_severity()" again to
+                * make sure we have the right "msg".
+                */
+               if (worst >= MCE_PANIC_SEVERITY && mca_cfg.tolerant < 3) {
+                       mce_severity(&m, cfg->tolerant, &msg, true);
+                       mce_panic("Local fatal machine check!", &m, msg);
+               }
+       }
+
+       /*
+        * If tolerant is at an insane level we drop requests to kill
+        * processes and continue even when there is no way out.
+        */
+       if (cfg->tolerant == 3)
+               kill_it = 0;
+       else if (no_way_out)
+               mce_panic("Fatal machine check on current CPU", &m, msg);
+
+       if (worst > 0)
+               mce_report_event(regs);
+       mce_wrmsrl(MSR_IA32_MCG_STATUS, 0);
+
+       sync_core();
+
+       if (worst != MCE_AR_SEVERITY && !kill_it)
+               goto out_ist;
+
+       /* Fault was in user mode and we need to take some action */
+       if ((m.cs & 3) == 3) {
+               ist_begin_non_atomic(regs);
+               local_irq_enable();
+
+               if (kill_it || do_memory_failure(&m))
+                       force_sig(SIGBUS, current);
+               local_irq_disable();
+               ist_end_non_atomic();
+       } else {
+               if (!fixup_exception(regs, X86_TRAP_MC, error_code, 0))
+                       mce_panic("Failed kernel mode recovery", &m, NULL);
+       }
+
+out_ist:
+       ist_exit(regs);
+}
+EXPORT_SYMBOL_GPL(do_machine_check);
+
+#ifndef CONFIG_MEMORY_FAILURE
+int memory_failure(unsigned long pfn, int flags)
+{
+       /* mce_severity() should not hand us an ACTION_REQUIRED error */
+       BUG_ON(flags & MF_ACTION_REQUIRED);
+       pr_err("Uncorrected memory error in page 0x%lx ignored\n"
+              "Rebuild kernel with CONFIG_MEMORY_FAILURE=y for smarter handling\n",
+              pfn);
+
+       return 0;
+}
+#endif
+
+/*
+ * Periodic polling timer for "silent" machine check errors.  If the
+ * poller finds an MCE, poll 2x faster.  When the poller finds no more
+ * errors, poll 2x slower (up to check_interval seconds).
+ */
+static unsigned long check_interval = INITIAL_CHECK_INTERVAL;
+
+static DEFINE_PER_CPU(unsigned long, mce_next_interval); /* in jiffies */
+static DEFINE_PER_CPU(struct timer_list, mce_timer);
+
+static unsigned long mce_adjust_timer_default(unsigned long interval)
+{
+       return interval;
+}
+
+static unsigned long (*mce_adjust_timer)(unsigned long interval) = mce_adjust_timer_default;
+
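+/*
+ * Arm the polling timer to fire in @interval jiffies, unless it is already
+ * pending and would fire earlier.
+ */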
+static void __start_timer(struct timer_list *t, unsigned long interval)
+{
+       unsigned long when = jiffies + interval;
+       unsigned long flags;
+
+       local_irq_save(flags);
+
+       if (!timer_pending(t) || time_before(when, t->expires))
+               mod_timer(t, round_jiffies(when));
+
+       local_irq_restore(flags);
+}
+
+static void mce_timer_fn(struct timer_list *t)
+{
+       struct timer_list *cpu_t = this_cpu_ptr(&mce_timer);
+       unsigned long iv;
+
+       WARN_ON(cpu_t != t);
+
+       iv = __this_cpu_read(mce_next_interval);
+
+       if (mce_available(this_cpu_ptr(&cpu_info))) {
+               machine_check_poll(0, this_cpu_ptr(&mce_poll_banks));
+
+               if (mce_intel_cmci_poll()) {
+                       iv = mce_adjust_timer(iv);
+                       goto done;
+               }
+       }
+
+       /*
+        * Alert userspace if needed. If we logged an MCE, reduce the polling
+        * interval, otherwise increase the polling interval.
+        */
+       if (mce_notify_irq())
+               iv = max(iv / 2, (unsigned long) HZ/100);
+       else
+               iv = min(iv * 2, round_jiffies_relative(check_interval * HZ));
+
+done:
+       __this_cpu_write(mce_next_interval, iv);
+       __start_timer(t, iv);
+}
+
+/*
+ * Ensure that the timer is firing in @interval from now.
+ */
+void mce_timer_kick(unsigned long interval)
+{
+       struct timer_list *t = this_cpu_ptr(&mce_timer);
+       unsigned long iv = __this_cpu_read(mce_next_interval);
+
+       __start_timer(t, interval);
+
+       if (interval < iv)
+               __this_cpu_write(mce_next_interval, interval);
+}
+
+/* Must not be called in IRQ context where del_timer_sync() can deadlock */
+static void mce_timer_delete_all(void)
+{
+       int cpu;
+
+       for_each_online_cpu(cpu)
+               del_timer_sync(&per_cpu(mce_timer, cpu));
+}
+
+/*
+ * Notify the user(s) about new machine check events.
+ * Can be called from interrupt context, but not from machine check/NMI
+ * context.
+ */
+int mce_notify_irq(void)
+{
+       /* Not more than two messages every minute */
+       static DEFINE_RATELIMIT_STATE(ratelimit, 60*HZ, 2);
+
+       if (test_and_clear_bit(0, &mce_need_notify)) {
+               mce_work_trigger();
+
+               if (__ratelimit(&ratelimit))
+                       pr_info(HW_ERR "Machine check events logged\n");
+
+               return 1;
+       }
+       return 0;
+}
+EXPORT_SYMBOL_GPL(mce_notify_irq);
+
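+/*
+ * Allocate the mce_banks[] array and initialize each bank with an all-ones
+ * control mask, i.e. all error types enabled by default.
+ */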
+static int __mcheck_cpu_mce_banks_init(void)
+{
+       int i;
+       u8 num_banks = mca_cfg.banks;
+
+       mce_banks = kcalloc(num_banks, sizeof(struct mce_bank), GFP_KERNEL);
+       if (!mce_banks)
+               return -ENOMEM;
+
+       for (i = 0; i < num_banks; i++) {
+               struct mce_bank *b = &mce_banks[i];
+
+               b->ctl = -1ULL;
+               b->init = 1;
+       }
+       return 0;
+}
+
+/*
+ * Initialize Machine Checks for a CPU.
+ */
+static int __mcheck_cpu_cap_init(void)
+{
+       unsigned b;
+       u64 cap;
+
+       rdmsrl(MSR_IA32_MCG_CAP, cap);
+
+       b = cap & MCG_BANKCNT_MASK;
+       if (!mca_cfg.banks)
+               pr_info("CPU supports %d MCE banks\n", b);
+
+       if (b > MAX_NR_BANKS) {
+               pr_warn("Using only %u machine check banks out of %u\n",
+                       MAX_NR_BANKS, b);
+               b = MAX_NR_BANKS;
+       }
+
+       /* Don't support asymmetric configurations today */
+       WARN_ON(mca_cfg.banks != 0 && b != mca_cfg.banks);
+       mca_cfg.banks = b;
+
+       if (!mce_banks) {
+               int err = __mcheck_cpu_mce_banks_init();
+
+               if (err)
+                       return err;
+       }
+
+       /* Use accurate RIP reporting if available. */
+       if ((cap & MCG_EXT_P) && MCG_EXT_CNT(cap) >= 9)
+               mca_cfg.rip_msr = MSR_IA32_MCG_EIP;
+
+       if (cap & MCG_SER_P)
+               mca_cfg.ser = 1;
+
+       return 0;
+}
+
+static void __mcheck_cpu_init_generic(void)
+{
+       enum mcp_flags m_fl = 0;
+       mce_banks_t all_banks;
+       u64 cap;
+
+       if (!mca_cfg.bootlog)
+               m_fl = MCP_DONTLOG;
+
+       /*
+        * Log the machine checks left over from the previous reset.
+        */
+       bitmap_fill(all_banks, MAX_NR_BANKS);
+       machine_check_poll(MCP_UC | m_fl, &all_banks);
+
+       cr4_set_bits(X86_CR4_MCE);
+
+       rdmsrl(MSR_IA32_MCG_CAP, cap);
+       if (cap & MCG_CTL_P)
+               wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff);
+}
+
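+/*
+ * Program the per-bank CTL registers with the configured masks and clear
+ * any stale status left in the banks.
+ */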
+static void __mcheck_cpu_init_clear_banks(void)
+{
+       int i;
+
+       for (i = 0; i < mca_cfg.banks; i++) {
+               struct mce_bank *b = &mce_banks[i];
+
+               if (!b->init)
+                       continue;
+               wrmsrl(msr_ops.ctl(i), b->ctl);
+               wrmsrl(msr_ops.status(i), 0);
+       }
+}
+
+/*
+ * During IFU recovery Sandy Bridge -EP4S processors set the RIPV and
+ * EIPV bits in MCG_STATUS to zero on the affected logical processor (SDM
+ * Vol 3B Table 15-20). But this confuses both the code that determines
+ * whether the machine check occurred in kernel or user mode, and also
+ * the severity assessment code. Pretend that EIPV was set, and take the
+ * ip/cs values from the pt_regs that mce_gather_info() ignored earlier.
+ */
+static void quirk_sandybridge_ifu(int bank, struct mce *m, struct pt_regs *regs)
+{
+       if (bank != 0)
+               return;
+       if ((m->mcgstatus & (MCG_STATUS_EIPV|MCG_STATUS_RIPV)) != 0)
+               return;
+       if ((m->status & (MCI_STATUS_OVER|MCI_STATUS_UC|
+                         MCI_STATUS_EN|MCI_STATUS_MISCV|MCI_STATUS_ADDRV|
+                         MCI_STATUS_PCC|MCI_STATUS_S|MCI_STATUS_AR|
+                         MCACOD)) !=
+                        (MCI_STATUS_UC|MCI_STATUS_EN|
+                         MCI_STATUS_MISCV|MCI_STATUS_ADDRV|MCI_STATUS_S|
+                         MCI_STATUS_AR|MCACOD_INSTR))
+               return;
+
+       m->mcgstatus |= MCG_STATUS_EIPV;
+       m->ip = regs->ip;
+       m->cs = regs->cs;
+}
+
+/* Add per CPU specific workarounds here */
+static int __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c)
+{
+       struct mca_config *cfg = &mca_cfg;
+
+       if (c->x86_vendor == X86_VENDOR_UNKNOWN) {
+               pr_info("unknown CPU type - not enabling MCE support\n");
+               return -EOPNOTSUPP;
+       }
+
+       /* This should be disabled by the BIOS, but isn't always */
+       if (c->x86_vendor == X86_VENDOR_AMD) {
+               if (c->x86 == 15 && cfg->banks > 4) {
+                       /*
+                        * disable GART TBL walk error reporting, which
+                        * trips off incorrectly with the IOMMU & 3ware
+                        * & Cerberus:
+                        */
+                       clear_bit(10, (unsigned long *)&mce_banks[4].ctl);
+               }
+               if (c->x86 < 0x11 && cfg->bootlog < 0) {
+                       /*
+                        * Lots of broken BIOSes around that don't clear them
+                        * by default and leave crap in there. Don't log:
+                        */
+                       cfg->bootlog = 0;
+               }
+               /*
+                * Various K7s with broken bank 0 are around. Always disable
+                * it by default.
+                */
+               if (c->x86 == 6 && cfg->banks > 0)
+                       mce_banks[0].ctl = 0;
+
+               /*
+                * overflow_recov is supported for F15h Models 00h-0fh
+                * even though we don't have a CPUID bit for it.
+                */
+               if (c->x86 == 0x15 && c->x86_model <= 0xf)
+                       mce_flags.overflow_recov = 1;
+
+               /*
+                * Turn off MC4_MISC thresholding banks on those models since
+                * they're not supported there.
+                */
+               if (c->x86 == 0x15 &&
+                   (c->x86_model >= 0x10 && c->x86_model <= 0x1f)) {
+                       int i;
+                       u64 hwcr;
+                       bool need_toggle;
+                       u32 msrs[] = {
+                               0x00000413, /* MC4_MISC0 */
+                               0xc0000408, /* MC4_MISC1 */
+                       };
+
+                       rdmsrl(MSR_K7_HWCR, hwcr);
+
+                       /* McStatusWrEn has to be set */
+                       need_toggle = !(hwcr & BIT(18));
+
+                       if (need_toggle)
+                               wrmsrl(MSR_K7_HWCR, hwcr | BIT(18));
+
+                       /* Clear CntP bit safely */
+                       for (i = 0; i < ARRAY_SIZE(msrs); i++)
+                               msr_clear_bit(msrs[i], 62);
+
+                       /* restore old settings */
+                       if (need_toggle)
+                               wrmsrl(MSR_K7_HWCR, hwcr);
+               }
+       }
+
+       if (c->x86_vendor == X86_VENDOR_INTEL) {
+               /*
+                * SDM documents that on family 6 bank 0 should not be written
+                * because it aliases to another special BIOS controlled
+                * register.
+                * But it's not aliased anymore on model 0x1a+.
+                * Don't ignore bank 0 completely because there could be a
+                * valid event later, merely don't write CTL0.
+                */
+
+               if (c->x86 == 6 && c->x86_model < 0x1A && cfg->banks > 0)
+                       mce_banks[0].init = 0;
+
+               /*
+                * All newer Intel systems support MCE broadcasting. Enable
+                * synchronization with a one second timeout.
+                */
+               if ((c->x86 > 6 || (c->x86 == 6 && c->x86_model >= 0xe)) &&
+                       cfg->monarch_timeout < 0)
+                       cfg->monarch_timeout = USEC_PER_SEC;
+
+               /*
+                * There are also broken BIOSes on some Pentium M and
+                * earlier systems:
+                */
+               if (c->x86 == 6 && c->x86_model <= 13 && cfg->bootlog < 0)
+                       cfg->bootlog = 0;
+
+               if (c->x86 == 6 && c->x86_model == 45)
+                       quirk_no_way_out = quirk_sandybridge_ifu;
+       }
+       if (cfg->monarch_timeout < 0)
+               cfg->monarch_timeout = 0;
+       if (cfg->bootlog != 0)
+               cfg->panic_timeout = 30;
+
+       return 0;
+}
+
+static int __mcheck_cpu_ancient_init(struct cpuinfo_x86 *c)
+{
+       if (c->x86 != 5)
+               return 0;
+
+       switch (c->x86_vendor) {
+       case X86_VENDOR_INTEL:
+               intel_p5_mcheck_init(c);
+               return 1;
+       case X86_VENDOR_CENTAUR:
+               winchip_mcheck_init(c);
+               return 1;
+       default:
+               return 0;
+       }
+
+       return 0;
+}
+
+/*
+ * Init basic CPU features needed for early decoding of MCEs.
+ */
+static void __mcheck_cpu_init_early(struct cpuinfo_x86 *c)
+{
+       if (c->x86_vendor == X86_VENDOR_AMD || c->x86_vendor == X86_VENDOR_HYGON) {
+               mce_flags.overflow_recov = !!cpu_has(c, X86_FEATURE_OVERFLOW_RECOV);
+               mce_flags.succor         = !!cpu_has(c, X86_FEATURE_SUCCOR);
+               mce_flags.smca           = !!cpu_has(c, X86_FEATURE_SMCA);
+
+               if (mce_flags.smca) {
+                       msr_ops.ctl     = smca_ctl_reg;
+                       msr_ops.status  = smca_status_reg;
+                       msr_ops.addr    = smca_addr_reg;
+                       msr_ops.misc    = smca_misc_reg;
+               }
+       }
+}
+
+static void mce_centaur_feature_init(struct cpuinfo_x86 *c)
+{
+       struct mca_config *cfg = &mca_cfg;
+
+       /*
+        * All newer Centaur CPUs support MCE broadcasting. Enable
+        * synchronization with a one second timeout.
+        */
+       if ((c->x86 == 6 && c->x86_model == 0xf && c->x86_stepping >= 0xe) ||
+            c->x86 > 6) {
+               if (cfg->monarch_timeout < 0)
+                       cfg->monarch_timeout = USEC_PER_SEC;
+       }
+}
+
+static void __mcheck_cpu_init_vendor(struct cpuinfo_x86 *c)
+{
+       switch (c->x86_vendor) {
+       case X86_VENDOR_INTEL:
+               mce_intel_feature_init(c);
+               mce_adjust_timer = cmci_intel_adjust_timer;
+               break;
+
+       case X86_VENDOR_AMD: {
+               mce_amd_feature_init(c);
+               break;
+               }
+
+       case X86_VENDOR_HYGON:
+               mce_hygon_feature_init(c);
+               break;
+
+       case X86_VENDOR_CENTAUR:
+               mce_centaur_feature_init(c);
+               break;
+
+       default:
+               break;
+       }
+}
+
+static void __mcheck_cpu_clear_vendor(struct cpuinfo_x86 *c)
+{
+       switch (c->x86_vendor) {
+       case X86_VENDOR_INTEL:
+               mce_intel_feature_clear(c);
+               break;
+       default:
+               break;
+       }
+}
+
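+/*
+ * Kick off periodic polling on this CPU unless corrected error handling is
+ * disabled (mce=ignore_ce) or the check interval is zero.
+ */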
+static void mce_start_timer(struct timer_list *t)
+{
+       unsigned long iv = check_interval * HZ;
+
+       if (mca_cfg.ignore_ce || !iv)
+               return;
+
+       this_cpu_write(mce_next_interval, iv);
+       __start_timer(t, iv);
+}
+
+static void __mcheck_cpu_setup_timer(void)
+{
+       struct timer_list *t = this_cpu_ptr(&mce_timer);
+
+       timer_setup(t, mce_timer_fn, TIMER_PINNED);
+}
+
+static void __mcheck_cpu_init_timer(void)
+{
+       struct timer_list *t = this_cpu_ptr(&mce_timer);
+
+       timer_setup(t, mce_timer_fn, TIMER_PINNED);
+       mce_start_timer(t);
+}
+
+/* Handle unconfigured int18 (should never happen) */
+static void unexpected_machine_check(struct pt_regs *regs, long error_code)
+{
+       pr_err("CPU#%d: Unexpected int18 (Machine Check)\n",
+              smp_processor_id());
+}
+
+/* Call the installed machine check handler for this CPU setup. */
+void (*machine_check_vector)(struct pt_regs *, long error_code) =
+                                               unexpected_machine_check;
+
+dotraplinkage void do_mce(struct pt_regs *regs, long error_code)
+{
+       machine_check_vector(regs, error_code);
+}
+
+/*
+ * Called for each booted CPU to set up machine checks.
+ * Must be called with preempt off:
+ */
+void mcheck_cpu_init(struct cpuinfo_x86 *c)
+{
+       if (mca_cfg.disabled)
+               return;
+
+       if (__mcheck_cpu_ancient_init(c))
+               return;
+
+       if (!mce_available(c))
+               return;
+
+       if (__mcheck_cpu_cap_init() < 0 || __mcheck_cpu_apply_quirks(c) < 0) {
+               mca_cfg.disabled = 1;
+               return;
+       }
+
+       if (mce_gen_pool_init()) {
+               mca_cfg.disabled = 1;
+               pr_emerg("Couldn't allocate MCE records pool!\n");
+               return;
+       }
+
+       machine_check_vector = do_machine_check;
+
+       __mcheck_cpu_init_early(c);
+       __mcheck_cpu_init_generic();
+       __mcheck_cpu_init_vendor(c);
+       __mcheck_cpu_init_clear_banks();
+       __mcheck_cpu_setup_timer();
+}
+
+/*
+ * Called for each booted CPU to clear some machine checks opt-ins
+ */
+void mcheck_cpu_clear(struct cpuinfo_x86 *c)
+{
+       if (mca_cfg.disabled)
+               return;
+
+       if (!mce_available(c))
+               return;
+
+       /*
+        * Possibly to clear general settings generic to x86
+        * __mcheck_cpu_clear_generic(c);
+        */
+       __mcheck_cpu_clear_vendor(c);
+}
+
+static void __mce_disable_bank(void *arg)
+{
+       int bank = *((int *)arg);
+       __clear_bit(bank, this_cpu_ptr(mce_poll_banks));
+       cmci_disable_bank(bank);
+}
+
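+/*
+ * Disable corrected error reporting for @bank on all CPUs: mark it in
+ * mce_banks_ce_disabled and stop polling/CMCI for it everywhere.
+ */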
+void mce_disable_bank(int bank)
+{
+       if (bank >= mca_cfg.banks) {
+               pr_warn(FW_BUG
+                       "Ignoring request to disable invalid MCA bank %d.\n",
+                       bank);
+               return;
+       }
+       set_bit(bank, mce_banks_ce_disabled);
+       on_each_cpu(__mce_disable_bank, &bank, 1);
+}
+
+/*
+ * mce=off Disables machine check
+ * mce=no_cmci Disables CMCI
+ * mce=no_lmce Disables LMCE
+ * mce=dont_log_ce Clears corrected events silently, no log created for CEs.
+ * mce=ignore_ce Disables polling and CMCI, corrected events are not cleared.
+ * mce=TOLERANCELEVEL[,monarchtimeout] (number, see above)
+ *     monarchtimeout is how long to wait for other CPUs on machine
+ *     check, or 0 to not wait
+ * mce=bootlog Log MCEs from before booting. Disabled by default on AMD Fam10h
+ *     and older.
+ * mce=nobootlog Don't log MCEs from before booting.
+ * mce=bios_cmci_threshold Don't program the CMCI threshold
+ * mce=recovery force enable memcpy_mcsafe()
+ */
+static int __init mcheck_enable(char *str)
+{
+       struct mca_config *cfg = &mca_cfg;
+
+       if (*str == 0) {
+               enable_p5_mce();
+               return 1;
+       }
+       if (*str == '=')
+               str++;
+       if (!strcmp(str, "off"))
+               cfg->disabled = 1;
+       else if (!strcmp(str, "no_cmci"))
+               cfg->cmci_disabled = true;
+       else if (!strcmp(str, "no_lmce"))
+               cfg->lmce_disabled = 1;
+       else if (!strcmp(str, "dont_log_ce"))
+               cfg->dont_log_ce = true;
+       else if (!strcmp(str, "ignore_ce"))
+               cfg->ignore_ce = true;
+       else if (!strcmp(str, "bootlog") || !strcmp(str, "nobootlog"))
+               cfg->bootlog = (str[0] == 'b');
+       else if (!strcmp(str, "bios_cmci_threshold"))
+               cfg->bios_cmci_threshold = 1;
+       else if (!strcmp(str, "recovery"))
+               cfg->recovery = 1;
+       else if (isdigit(str[0])) {
+               if (get_option(&str, &cfg->tolerant) == 2)
+                       get_option(&str, &(cfg->monarch_timeout));
+       } else {
+               pr_info("mce argument %s ignored. Please use /sys\n", str);
+               return 0;
+       }
+       return 1;
+}
+__setup("mce", mcheck_enable);
+
+int __init mcheck_init(void)
+{
+       mcheck_intel_therm_init();
+       mce_register_decode_chain(&first_nb);
+       mce_register_decode_chain(&mce_srao_nb);
+       mce_register_decode_chain(&mce_default_nb);
+       mcheck_vendor_init_severity();
+
+       INIT_WORK(&mce_work, mce_gen_pool_process);
+       init_irq_work(&mce_irq_work, mce_irq_work_cb);
+
+       return 0;
+}
+
+/*
+ * mce_syscore: PM support
+ */
+
+/*
+ * Disable machine checks on suspend and shutdown. We can't really handle
+ * them later.
+ */
+static void mce_disable_error_reporting(void)
+{
+       int i;
+
+       for (i = 0; i < mca_cfg.banks; i++) {
+               struct mce_bank *b = &mce_banks[i];
+
+               if (b->init)
+                       wrmsrl(msr_ops.ctl(i), 0);
+       }
+}
+
+static void vendor_disable_error_reporting(void)
+{
+       /*
+        * Don't clear on Intel, AMD or Hygon CPUs. Some of these MSRs
+        * are socket-wide.
+        * Disabling them for just a single offlined CPU is bad, since it will
+        * inhibit reporting for all shared resources on the socket like the
+        * last level cache (LLC), the integrated memory controller (iMC), etc.
+        */
+       if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL ||
+           boot_cpu_data.x86_vendor == X86_VENDOR_HYGON ||
+           boot_cpu_data.x86_vendor == X86_VENDOR_AMD)
+               return;
+
+       mce_disable_error_reporting();
+}
+
+static int mce_syscore_suspend(void)
+{
+       vendor_disable_error_reporting();
+       return 0;
+}
+
+static void mce_syscore_shutdown(void)
+{
+       vendor_disable_error_reporting();
+}
+
+/*
+ * On resume clear all MCE state. Don't want to see leftovers from the BIOS.
+ * Only one CPU is active at this time, the others get re-added later using
+ * CPU hotplug:
+ */
+static void mce_syscore_resume(void)
+{
+       __mcheck_cpu_init_generic();
+       __mcheck_cpu_init_vendor(raw_cpu_ptr(&cpu_info));
+       __mcheck_cpu_init_clear_banks();
+}
+
+static struct syscore_ops mce_syscore_ops = {
+       .suspend        = mce_syscore_suspend,
+       .shutdown       = mce_syscore_shutdown,
+       .resume         = mce_syscore_resume,
+};
+
+/*
+ * mce_device: Sysfs support
+ */
+
+static void mce_cpu_restart(void *data)
+{
+       if (!mce_available(raw_cpu_ptr(&cpu_info)))
+               return;
+       __mcheck_cpu_init_generic();
+       __mcheck_cpu_init_clear_banks();
+       __mcheck_cpu_init_timer();
+}
+
+/* Reinit MCEs after user configuration changes */
+static void mce_restart(void)
+{
+       mce_timer_delete_all();
+       on_each_cpu(mce_cpu_restart, NULL, 1);
+}
+
+/* Toggle features for corrected errors */
+static void mce_disable_cmci(void *data)
+{
+       if (!mce_available(raw_cpu_ptr(&cpu_info)))
+               return;
+       cmci_clear();
+}
+
+static void mce_enable_ce(void *all)
+{
+       if (!mce_available(raw_cpu_ptr(&cpu_info)))
+               return;
+       cmci_reenable();
+       cmci_recheck();
+       if (all)
+               __mcheck_cpu_init_timer();
+}
+
+static struct bus_type mce_subsys = {
+       .name           = "machinecheck",
+       .dev_name       = "machinecheck",
+};
+
+DEFINE_PER_CPU(struct device *, mce_device);
+
+static inline struct mce_bank *attr_to_bank(struct device_attribute *attr)
+{
+       return container_of(attr, struct mce_bank, attr);
+}
+
+static ssize_t show_bank(struct device *s, struct device_attribute *attr,
+                        char *buf)
+{
+       return sprintf(buf, "%llx\n", attr_to_bank(attr)->ctl);
+}
+
+static ssize_t set_bank(struct device *s, struct device_attribute *attr,
+                       const char *buf, size_t size)
+{
+       u64 new;
+
+       if (kstrtou64(buf, 0, &new) < 0)
+               return -EINVAL;
+
+       attr_to_bank(attr)->ctl = new;
+       mce_restart();
+
+       return size;
+}
+
+static ssize_t set_ignore_ce(struct device *s,
+                            struct device_attribute *attr,
+                            const char *buf, size_t size)
+{
+       u64 new;
+
+       if (kstrtou64(buf, 0, &new) < 0)
+               return -EINVAL;
+
+       mutex_lock(&mce_sysfs_mutex);
+       if (mca_cfg.ignore_ce ^ !!new) {
+               if (new) {
+                       /* disable ce features */
+                       mce_timer_delete_all();
+                       on_each_cpu(mce_disable_cmci, NULL, 1);
+                       mca_cfg.ignore_ce = true;
+               } else {
+                       /* enable ce features */
+                       mca_cfg.ignore_ce = false;
+                       on_each_cpu(mce_enable_ce, (void *)1, 1);
+               }
+       }
+       mutex_unlock(&mce_sysfs_mutex);
+
+       return size;
+}
+
+static ssize_t set_cmci_disabled(struct device *s,
+                                struct device_attribute *attr,
+                                const char *buf, size_t size)
+{
+       u64 new;
+
+       if (kstrtou64(buf, 0, &new) < 0)
+               return -EINVAL;
+
+       mutex_lock(&mce_sysfs_mutex);
+       if (mca_cfg.cmci_disabled ^ !!new) {
+               if (new) {
+                       /* disable cmci */
+                       on_each_cpu(mce_disable_cmci, NULL, 1);
+                       mca_cfg.cmci_disabled = true;
+               } else {
+                       /* enable cmci */
+                       mca_cfg.cmci_disabled = false;
+                       on_each_cpu(mce_enable_ce, NULL, 1);
+               }
+       }
+       mutex_unlock(&mce_sysfs_mutex);
+
+       return size;
+}
+
+static ssize_t store_int_with_restart(struct device *s,
+                                     struct device_attribute *attr,
+                                     const char *buf, size_t size)
+{
+       unsigned long old_check_interval = check_interval;
+       ssize_t ret = device_store_ulong(s, attr, buf, size);
+
+       if (check_interval == old_check_interval)
+               return ret;
+
+       mutex_lock(&mce_sysfs_mutex);
+       mce_restart();
+       mutex_unlock(&mce_sysfs_mutex);
+
+       return ret;
+}
+
+static DEVICE_INT_ATTR(tolerant, 0644, mca_cfg.tolerant);
+static DEVICE_INT_ATTR(monarch_timeout, 0644, mca_cfg.monarch_timeout);
+static DEVICE_BOOL_ATTR(dont_log_ce, 0644, mca_cfg.dont_log_ce);
+
+static struct dev_ext_attribute dev_attr_check_interval = {
+       __ATTR(check_interval, 0644, device_show_int, store_int_with_restart),
+       &check_interval
+};
+
+static struct dev_ext_attribute dev_attr_ignore_ce = {
+       __ATTR(ignore_ce, 0644, device_show_bool, set_ignore_ce),
+       &mca_cfg.ignore_ce
+};
+
+static struct dev_ext_attribute dev_attr_cmci_disabled = {
+       __ATTR(cmci_disabled, 0644, device_show_bool, set_cmci_disabled),
+       &mca_cfg.cmci_disabled
+};
+
+static struct device_attribute *mce_device_attrs[] = {
+       &dev_attr_tolerant.attr,
+       &dev_attr_check_interval.attr,
+#ifdef CONFIG_X86_MCELOG_LEGACY
+       &dev_attr_trigger,
+#endif
+       &dev_attr_monarch_timeout.attr,
+       &dev_attr_dont_log_ce.attr,
+       &dev_attr_ignore_ce.attr,
+       &dev_attr_cmci_disabled.attr,
+       NULL
+};
+
+static cpumask_var_t mce_device_initialized;
+
+static void mce_device_release(struct device *dev)
+{
+       kfree(dev);
+}
+
+/* Per cpu device init. All of the cpus still share the same ctrl bank: */
+static int mce_device_create(unsigned int cpu)
+{
+       struct device *dev;
+       int err;
+       int i, j;
+
+       if (!mce_available(&boot_cpu_data))
+               return -EIO;
+
+       dev = per_cpu(mce_device, cpu);
+       if (dev)
+               return 0;
+
+       dev = kzalloc(sizeof(*dev), GFP_KERNEL);
+       if (!dev)
+               return -ENOMEM;
+       dev->id  = cpu;
+       dev->bus = &mce_subsys;
+       dev->release = &mce_device_release;
+
+       err = device_register(dev);
+       if (err) {
+               put_device(dev);
+               return err;
+       }
+
+       for (i = 0; mce_device_attrs[i]; i++) {
+               err = device_create_file(dev, mce_device_attrs[i]);
+               if (err)
+                       goto error;
+       }
+       for (j = 0; j < mca_cfg.banks; j++) {
+               err = device_create_file(dev, &mce_banks[j].attr);
+               if (err)
+                       goto error2;
+       }
+       cpumask_set_cpu(cpu, mce_device_initialized);
+       per_cpu(mce_device, cpu) = dev;
+
+       return 0;
+error2:
+       while (--j >= 0)
+               device_remove_file(dev, &mce_banks[j].attr);
+error:
+       while (--i >= 0)
+               device_remove_file(dev, mce_device_attrs[i]);
+
+       device_unregister(dev);
+
+       return err;
+}
+
+static void mce_device_remove(unsigned int cpu)
+{
+       struct device *dev = per_cpu(mce_device, cpu);
+       int i;
+
+       if (!cpumask_test_cpu(cpu, mce_device_initialized))
+               return;
+
+       for (i = 0; mce_device_attrs[i]; i++)
+               device_remove_file(dev, mce_device_attrs[i]);
+
+       for (i = 0; i < mca_cfg.banks; i++)
+               device_remove_file(dev, &mce_banks[i].attr);
+
+       device_unregister(dev);
+       cpumask_clear_cpu(cpu, mce_device_initialized);
+       per_cpu(mce_device, cpu) = NULL;
+}
+
+/* Make sure there are no machine checks on offlined CPUs. */
+static void mce_disable_cpu(void)
+{
+       if (!mce_available(raw_cpu_ptr(&cpu_info)))
+               return;
+
+       if (!cpuhp_tasks_frozen)
+               cmci_clear();
+
+       vendor_disable_error_reporting();
+}
+
+static void mce_reenable_cpu(void)
+{
+       int i;
+
+       if (!mce_available(raw_cpu_ptr(&cpu_info)))
+               return;
+
+       if (!cpuhp_tasks_frozen)
+               cmci_reenable();
+       for (i = 0; i < mca_cfg.banks; i++) {
+               struct mce_bank *b = &mce_banks[i];
+
+               if (b->init)
+                       wrmsrl(msr_ops.ctl(i), b->ctl);
+       }
+}
+
+static int mce_cpu_dead(unsigned int cpu)
+{
+       mce_intel_hcpu_update(cpu);
+
+       /* intentionally ignoring frozen here */
+       if (!cpuhp_tasks_frozen)
+               cmci_rediscover();
+       return 0;
+}
+
+static int mce_cpu_online(unsigned int cpu)
+{
+       struct timer_list *t = this_cpu_ptr(&mce_timer);
+       int ret;
+
+       mce_device_create(cpu);
+
+       ret = mce_threshold_create_device(cpu);
+       if (ret) {
+               mce_device_remove(cpu);
+               return ret;
+       }
+       mce_reenable_cpu();
+       mce_start_timer(t);
+       return 0;
+}
+
+static int mce_cpu_pre_down(unsigned int cpu)
+{
+       struct timer_list *t = this_cpu_ptr(&mce_timer);
+
+       mce_disable_cpu();
+       del_timer_sync(t);
+       mce_threshold_remove_device(cpu);
+       mce_device_remove(cpu);
+       return 0;
+}
+
+static __init void mce_init_banks(void)
+{
+       int i;
+
+       for (i = 0; i < mca_cfg.banks; i++) {
+               struct mce_bank *b = &mce_banks[i];
+               struct device_attribute *a = &b->attr;
+
+               sysfs_attr_init(&a->attr);
+               a->attr.name    = b->attrname;
+               snprintf(b->attrname, ATTR_LEN, "bank%d", i);
+
+               a->attr.mode    = 0644;
+               a->show         = show_bank;
+               a->store        = set_bank;
+       }
+}
+
+static __init int mcheck_init_device(void)
+{
+       int err;
+
+       /*
+        * Check if we have a spare virtual bit. This will only become
+        * a problem if/when we move beyond 5-level page tables.
+        */
+       MAYBE_BUILD_BUG_ON(__VIRTUAL_MASK_SHIFT >= 63);
+
+       if (!mce_available(&boot_cpu_data)) {
+               err = -EIO;
+               goto err_out;
+       }
+
+       if (!zalloc_cpumask_var(&mce_device_initialized, GFP_KERNEL)) {
+               err = -ENOMEM;
+               goto err_out;
+       }
+
+       mce_init_banks();
+
+       err = subsys_system_register(&mce_subsys, NULL);
+       if (err)
+               goto err_out_mem;
+
+       err = cpuhp_setup_state(CPUHP_X86_MCE_DEAD, "x86/mce:dead", NULL,
+                               mce_cpu_dead);
+       if (err)
+               goto err_out_mem;
+
+       err = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "x86/mce:online",
+                               mce_cpu_online, mce_cpu_pre_down);
+       if (err < 0)
+               goto err_out_online;
+
+       register_syscore_ops(&mce_syscore_ops);
+
+       return 0;
+
+err_out_online:
+       cpuhp_remove_state(CPUHP_X86_MCE_DEAD);
+
+err_out_mem:
+       free_cpumask_var(mce_device_initialized);
+
+err_out:
+       pr_err("Unable to init MCE device (rc: %d)\n", err);
+
+       return err;
+}
+device_initcall_sync(mcheck_init_device);
+
+/*
+ * Old style boot options parsing. Only for compatibility.
+ */
+static int __init mcheck_disable(char *str)
+{
+       mca_cfg.disabled = 1;
+       return 1;
+}
+__setup("nomce", mcheck_disable);
+
+#ifdef CONFIG_DEBUG_FS
+struct dentry *mce_get_debugfs_dir(void)
+{
+       static struct dentry *dmce;
+
+       if (!dmce)
+               dmce = debugfs_create_dir("mce", NULL);
+
+       return dmce;
+}
+
+static void mce_reset(void)
+{
+       cpu_missing = 0;
+       atomic_set(&mce_fake_panicked, 0);
+       atomic_set(&mce_executing, 0);
+       atomic_set(&mce_callin, 0);
+       atomic_set(&global_nwo, 0);
+}
+
+static int fake_panic_get(void *data, u64 *val)
+{
+       *val = fake_panic;
+       return 0;
+}
+
+static int fake_panic_set(void *data, u64 val)
+{
+       mce_reset();
+       fake_panic = val;
+       return 0;
+}
+
+DEFINE_SIMPLE_ATTRIBUTE(fake_panic_fops, fake_panic_get,
+                       fake_panic_set, "%llu\n");
+
+static int __init mcheck_debugfs_init(void)
+{
+       struct dentry *dmce, *ffake_panic;
+
+       dmce = mce_get_debugfs_dir();
+       if (!dmce)
+               return -ENOMEM;
+       ffake_panic = debugfs_create_file("fake_panic", 0444, dmce, NULL,
+                                         &fake_panic_fops);
+       if (!ffake_panic)
+               return -ENOMEM;
+
+       return 0;
+}
+#else
+static int __init mcheck_debugfs_init(void) { return -EINVAL; }
+#endif
+
+DEFINE_STATIC_KEY_FALSE(mcsafe_key);
+EXPORT_SYMBOL_GPL(mcsafe_key);
+
+static int __init mcheck_late_init(void)
+{
+       if (mca_cfg.recovery)
+               static_branch_inc(&mcsafe_key);
+
+       mcheck_debugfs_init();
+       cec_init();
+
+       /*
+        * Flush out everything that has been logged during early boot, now that
+        * everything has been initialized (workqueues, decoders, ...).
+        */
+       mce_schedule_work();
+
+       return 0;
+}
+late_initcall(mcheck_late_init);
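
For context, the per-CPU devices registered by mce_device_create() above surface these attributes under /sys/devices/system/machinecheck/machinecheckN/. Below is a minimal userspace sketch, not part of the patch itself, that adjusts the polling interval; the path and value are illustrative only, and the write goes through store_int_with_restart(), which triggers mce_restart():

/* Sketch: adjust the MCE polling interval via sysfs (illustrative only). */
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	/* Attribute created per CPU by mce_device_create() above. */
	const char *path = "/sys/devices/system/machinecheck/machinecheck0/check_interval";
	const char *val  = "300";	/* seconds */
	int fd = open(path, O_WRONLY);

	if (fd < 0) {
		perror("open");
		return 1;
	}
	if (write(fd, val, strlen(val)) < 0)
		perror("write");
	close(fd);
	return 0;
}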
diff --git a/arch/x86/kernel/cpu/mce/dev-mcelog.c b/arch/x86/kernel/cpu/mce/dev-mcelog.c
new file mode 100644 (file)
index 0000000..41d9169
--- /dev/null
@@ -0,0 +1,360 @@
+/*
+ * /dev/mcelog driver
+ *
+ * K8 parts Copyright 2002,2003 Andi Kleen, SuSE Labs.
+ * Rest from unknown author(s).
+ * 2004 Andi Kleen. Rewrote most of it.
+ * Copyright 2008 Intel Corporation
+ * Author: Andi Kleen
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/miscdevice.h>
+#include <linux/slab.h>
+#include <linux/kmod.h>
+#include <linux/poll.h>
+
+#include "internal.h"
+
+static BLOCKING_NOTIFIER_HEAD(mce_injector_chain);
+
+static DEFINE_MUTEX(mce_chrdev_read_mutex);
+
+static char mce_helper[128];
+static char *mce_helper_argv[2] = { mce_helper, NULL };
+
+/*
+ * Lockless MCE logging infrastructure.
+ * This avoids deadlocks on printk locks without having to break locks. Also
+ * separate MCEs from kernel messages to avoid bogus bug reports.
+ */
+
+static struct mce_log_buffer mcelog = {
+       .signature      = MCE_LOG_SIGNATURE,
+       .len            = MCE_LOG_LEN,
+       .recordlen      = sizeof(struct mce),
+};
+
+static DECLARE_WAIT_QUEUE_HEAD(mce_chrdev_wait);
+
+static int dev_mce_log(struct notifier_block *nb, unsigned long val,
+                               void *data)
+{
+       struct mce *mce = (struct mce *)data;
+       unsigned int entry;
+
+       mutex_lock(&mce_chrdev_read_mutex);
+
+       entry = mcelog.next;
+
+       /*
+        * When the buffer fills up, discard new entries. Assume that the
+        * earlier errors are the more interesting ones:
+        */
+       if (entry >= MCE_LOG_LEN) {
+               set_bit(MCE_OVERFLOW, (unsigned long *)&mcelog.flags);
+               goto unlock;
+       }
+
+       mcelog.next = entry + 1;
+
+       memcpy(mcelog.entry + entry, mce, sizeof(struct mce));
+       mcelog.entry[entry].finished = 1;
+
+       /* wake processes polling /dev/mcelog */
+       wake_up_interruptible(&mce_chrdev_wait);
+
+unlock:
+       mutex_unlock(&mce_chrdev_read_mutex);
+
+       return NOTIFY_OK;
+}
+
+static struct notifier_block dev_mcelog_nb = {
+       .notifier_call  = dev_mce_log,
+       .priority       = MCE_PRIO_MCELOG,
+};
+
+static void mce_do_trigger(struct work_struct *work)
+{
+       call_usermodehelper(mce_helper, mce_helper_argv, NULL, UMH_NO_WAIT);
+}
+
+static DECLARE_WORK(mce_trigger_work, mce_do_trigger);
+
+
+void mce_work_trigger(void)
+{
+       if (mce_helper[0])
+               schedule_work(&mce_trigger_work);
+}
+
+static ssize_t
+show_trigger(struct device *s, struct device_attribute *attr, char *buf)
+{
+       strcpy(buf, mce_helper);
+       strcat(buf, "\n");
+       return strlen(mce_helper) + 1;
+}
+
+static ssize_t set_trigger(struct device *s, struct device_attribute *attr,
+                               const char *buf, size_t siz)
+{
+       char *p;
+
+       strncpy(mce_helper, buf, sizeof(mce_helper));
+       mce_helper[sizeof(mce_helper)-1] = 0;
+       p = strchr(mce_helper, '\n');
+
+       if (p)
+               *p = 0;
+
+       return strlen(mce_helper) + !!p;
+}
+
+DEVICE_ATTR(trigger, 0644, show_trigger, set_trigger);
+
+/*
+ * mce_chrdev: Character device /dev/mcelog to read and clear the MCE log.
+ */
+
+static DEFINE_SPINLOCK(mce_chrdev_state_lock);
+static int mce_chrdev_open_count;      /* #times opened */
+static int mce_chrdev_open_exclu;      /* already open exclusive? */
+
+static int mce_chrdev_open(struct inode *inode, struct file *file)
+{
+       spin_lock(&mce_chrdev_state_lock);
+
+       if (mce_chrdev_open_exclu ||
+           (mce_chrdev_open_count && (file->f_flags & O_EXCL))) {
+               spin_unlock(&mce_chrdev_state_lock);
+
+               return -EBUSY;
+       }
+
+       if (file->f_flags & O_EXCL)
+               mce_chrdev_open_exclu = 1;
+       mce_chrdev_open_count++;
+
+       spin_unlock(&mce_chrdev_state_lock);
+
+       return nonseekable_open(inode, file);
+}
+
+static int mce_chrdev_release(struct inode *inode, struct file *file)
+{
+       spin_lock(&mce_chrdev_state_lock);
+
+       mce_chrdev_open_count--;
+       mce_chrdev_open_exclu = 0;
+
+       spin_unlock(&mce_chrdev_state_lock);
+
+       return 0;
+}
+
+static int mce_apei_read_done;
+
+/* Collect MCE record of previous boot in persistent storage via APEI ERST. */
+static int __mce_read_apei(char __user **ubuf, size_t usize)
+{
+       int rc;
+       u64 record_id;
+       struct mce m;
+
+       if (usize < sizeof(struct mce))
+               return -EINVAL;
+
+       rc = apei_read_mce(&m, &record_id);
+       /* Error or no more MCE record */
+       if (rc <= 0) {
+               mce_apei_read_done = 1;
+               /*
+                * When ERST is disabled, mce_chrdev_read() should return
+                * "no record" instead of "no device."
+                */
+               if (rc == -ENODEV)
+                       return 0;
+               return rc;
+       }
+       rc = -EFAULT;
+       if (copy_to_user(*ubuf, &m, sizeof(struct mce)))
+               return rc;
+       /*
+        * In fact, we should have cleared the record after it has been
+        * flushed to disk or sent over the network by /sbin/mcelog, but
+        * there is no interface to support that yet, so just clear it
+        * here to avoid duplication.
+        */
+       rc = apei_clear_mce(record_id);
+       if (rc) {
+               mce_apei_read_done = 1;
+               return rc;
+       }
+       *ubuf += sizeof(struct mce);
+
+       return 0;
+}
+
+static ssize_t mce_chrdev_read(struct file *filp, char __user *ubuf,
+                               size_t usize, loff_t *off)
+{
+       char __user *buf = ubuf;
+       unsigned next;
+       int i, err;
+
+       mutex_lock(&mce_chrdev_read_mutex);
+
+       if (!mce_apei_read_done) {
+               err = __mce_read_apei(&buf, usize);
+               if (err || buf != ubuf)
+                       goto out;
+       }
+
+       /* Only supports full reads right now */
+       err = -EINVAL;
+       if (*off != 0 || usize < MCE_LOG_LEN*sizeof(struct mce))
+               goto out;
+
+       next = mcelog.next;
+       err = 0;
+
+       for (i = 0; i < next; i++) {
+               struct mce *m = &mcelog.entry[i];
+
+               err |= copy_to_user(buf, m, sizeof(*m));
+               buf += sizeof(*m);
+       }
+
+       memset(mcelog.entry, 0, next * sizeof(struct mce));
+       mcelog.next = 0;
+
+       if (err)
+               err = -EFAULT;
+
+out:
+       mutex_unlock(&mce_chrdev_read_mutex);
+
+       return err ? err : buf - ubuf;
+}
+
+static __poll_t mce_chrdev_poll(struct file *file, poll_table *wait)
+{
+       poll_wait(file, &mce_chrdev_wait, wait);
+       if (READ_ONCE(mcelog.next))
+               return EPOLLIN | EPOLLRDNORM;
+       if (!mce_apei_read_done && apei_check_mce())
+               return EPOLLIN | EPOLLRDNORM;
+       return 0;
+}
+
+static long mce_chrdev_ioctl(struct file *f, unsigned int cmd,
+                               unsigned long arg)
+{
+       int __user *p = (int __user *)arg;
+
+       if (!capable(CAP_SYS_ADMIN))
+               return -EPERM;
+
+       switch (cmd) {
+       case MCE_GET_RECORD_LEN:
+               return put_user(sizeof(struct mce), p);
+       case MCE_GET_LOG_LEN:
+               return put_user(MCE_LOG_LEN, p);
+       case MCE_GETCLEAR_FLAGS: {
+               unsigned flags;
+
+               do {
+                       flags = mcelog.flags;
+               } while (cmpxchg(&mcelog.flags, flags, 0) != flags);
+
+               return put_user(flags, p);
+       }
+       default:
+               return -ENOTTY;
+       }
+}
+
+void mce_register_injector_chain(struct notifier_block *nb)
+{
+       blocking_notifier_chain_register(&mce_injector_chain, nb);
+}
+EXPORT_SYMBOL_GPL(mce_register_injector_chain);
+
+void mce_unregister_injector_chain(struct notifier_block *nb)
+{
+       blocking_notifier_chain_unregister(&mce_injector_chain, nb);
+}
+EXPORT_SYMBOL_GPL(mce_unregister_injector_chain);
+
+static ssize_t mce_chrdev_write(struct file *filp, const char __user *ubuf,
+                               size_t usize, loff_t *off)
+{
+       struct mce m;
+
+       if (!capable(CAP_SYS_ADMIN))
+               return -EPERM;
+       /*
+        * There are some cases where real MSR reads could slip
+        * through.
+        */
+       if (!boot_cpu_has(X86_FEATURE_MCE) || !boot_cpu_has(X86_FEATURE_MCA))
+               return -EIO;
+
+       if ((unsigned long)usize > sizeof(struct mce))
+               usize = sizeof(struct mce);
+       if (copy_from_user(&m, ubuf, usize))
+               return -EFAULT;
+
+       if (m.extcpu >= num_possible_cpus() || !cpu_online(m.extcpu))
+               return -EINVAL;
+
+       /*
+        * Need to give user space some time to set everything up,
+        * so do it a jiffy or two later everywhere.
+        */
+       schedule_timeout(2);
+
+       blocking_notifier_call_chain(&mce_injector_chain, 0, &m);
+
+       return usize;
+}
+
+static const struct file_operations mce_chrdev_ops = {
+       .open                   = mce_chrdev_open,
+       .release                = mce_chrdev_release,
+       .read                   = mce_chrdev_read,
+       .write                  = mce_chrdev_write,
+       .poll                   = mce_chrdev_poll,
+       .unlocked_ioctl         = mce_chrdev_ioctl,
+       .llseek                 = no_llseek,
+};
+
+static struct miscdevice mce_chrdev_device = {
+       MISC_MCELOG_MINOR,
+       "mcelog",
+       &mce_chrdev_ops,
+};
+
+static __init int dev_mcelog_init_device(void)
+{
+       int err;
+
+       /* register character device /dev/mcelog */
+       err = misc_register(&mce_chrdev_device);
+       if (err) {
+               if (err == -EBUSY)
+                       /* Xen dom0 might have registered the device already. */
+                       pr_info("Unable to init device /dev/mcelog, already registered\n");
+               else
+                       pr_err("Unable to init device /dev/mcelog (rc: %d)\n", err);
+
+               return err;
+       }
+
+       mce_register_decode_chain(&dev_mcelog_nb);
+       return 0;
+}
+device_initcall_sync(dev_mcelog_init_device);
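
As a consumer-side reference (not part of the patch), here is a minimal sketch of how /dev/mcelog is read, along the lines of what mcelog(8) does. It assumes the MCE_GET_RECORD_LEN/MCE_GET_LOG_LEN ioctls exported through the uapi <asm/mce.h>, which are handled by mce_chrdev_ioctl() above, and it hands in a buffer large enough for the full read that mce_chrdev_read() requires:

/* Sketch: drain /dev/mcelog the way mcelog(8) does (illustrative only). */
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <asm/mce.h>		/* MCE_GET_RECORD_LEN, MCE_GET_LOG_LEN */

int main(void)
{
	int fd, rec_len, log_len;
	char *buf;
	ssize_t n;

	fd = open("/dev/mcelog", O_RDONLY);
	if (fd < 0) {
		perror("open /dev/mcelog");
		return 1;
	}

	if (ioctl(fd, MCE_GET_RECORD_LEN, &rec_len) ||
	    ioctl(fd, MCE_GET_LOG_LEN, &log_len)) {
		perror("ioctl");
		return 1;
	}

	/* mce_chrdev_read() only accepts full-buffer reads. */
	buf = malloc((size_t)rec_len * log_len);
	if (!buf)
		return 1;

	n = read(fd, buf, (size_t)rec_len * log_len);
	printf("read %zd bytes (%d-byte records, log length %d)\n", n, rec_len, log_len);

	free(buf);
	close(fd);
	return 0;
}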
diff --git a/arch/x86/kernel/cpu/mce/genpool.c b/arch/x86/kernel/cpu/mce/genpool.c
new file mode 100644 (file)
index 0000000..3395549
--- /dev/null
@@ -0,0 +1,145 @@
+/*
+ * MCE event pool management in MCE context
+ *
+ * Copyright (C) 2015 Intel Corp.
+ * Author: Chen, Gong <gong.chen@linux.intel.com>
+ *
+ * This file is licensed under GPLv2.
+ */
+#include <linux/smp.h>
+#include <linux/mm.h>
+#include <linux/genalloc.h>
+#include <linux/llist.h>
+#include "internal.h"
+
+/*
+ * printk() is not safe in MCE context. This is a lock-less memory allocator
+ * used to save error information organized in a lock-less list.
+ *
+ * This memory pool is only to be used to save MCE records in MCE context.
+ * MCE events are rare, so a fixed size memory pool should be enough. Use
+ * 2 pages to save MCE events for now (~80 MCE records at most).
+ */
+#define MCE_POOLSZ     (2 * PAGE_SIZE)
+
+static struct gen_pool *mce_evt_pool;
+static LLIST_HEAD(mce_event_llist);
+static char gen_pool_buf[MCE_POOLSZ];
+
+/*
+ * Compare the record "t" with each of the records on list "l" to see if
+ * an equivalent one is present in the list.
+ */
+static bool is_duplicate_mce_record(struct mce_evt_llist *t, struct mce_evt_llist *l)
+{
+       struct mce_evt_llist *node;
+       struct mce *m1, *m2;
+
+       m1 = &t->mce;
+
+       llist_for_each_entry(node, &l->llnode, llnode) {
+               m2 = &node->mce;
+
+               if (!mce_cmp(m1, m2))
+                       return true;
+       }
+       return false;
+}
+
+/*
+ * The system has panicked - we'd like to peruse the list of MCE records
+ * that have been queued, but not seen by anyone yet.  The list is in
+ * reverse time order, so we need to reverse it. While doing that we can
+ * also drop duplicate records (these were logged because some banks are
+ * shared between cores or by all threads on a socket).
+ */
+struct llist_node *mce_gen_pool_prepare_records(void)
+{
+       struct llist_node *head;
+       LLIST_HEAD(new_head);
+       struct mce_evt_llist *node, *t;
+
+       head = llist_del_all(&mce_event_llist);
+       if (!head)
+               return NULL;
+
+       /* squeeze out duplicates while reversing order */
+       llist_for_each_entry_safe(node, t, head, llnode) {
+               if (!is_duplicate_mce_record(node, t))
+                       llist_add(&node->llnode, &new_head);
+       }
+
+       return new_head.first;
+}
+
+void mce_gen_pool_process(struct work_struct *__unused)
+{
+       struct llist_node *head;
+       struct mce_evt_llist *node, *tmp;
+       struct mce *mce;
+
+       head = llist_del_all(&mce_event_llist);
+       if (!head)
+               return;
+
+       head = llist_reverse_order(head);
+       llist_for_each_entry_safe(node, tmp, head, llnode) {
+               mce = &node->mce;
+               blocking_notifier_call_chain(&x86_mce_decoder_chain, 0, mce);
+               gen_pool_free(mce_evt_pool, (unsigned long)node, sizeof(*node));
+       }
+}
+
+bool mce_gen_pool_empty(void)
+{
+       return llist_empty(&mce_event_llist);
+}
+
+int mce_gen_pool_add(struct mce *mce)
+{
+       struct mce_evt_llist *node;
+
+       if (!mce_evt_pool)
+               return -EINVAL;
+
+       node = (void *)gen_pool_alloc(mce_evt_pool, sizeof(*node));
+       if (!node) {
+               pr_warn_ratelimited("MCE records pool full!\n");
+               return -ENOMEM;
+       }
+
+       memcpy(&node->mce, mce, sizeof(*mce));
+       llist_add(&node->llnode, &mce_event_llist);
+
+       return 0;
+}
+
+static int mce_gen_pool_create(void)
+{
+       struct gen_pool *tmpp;
+       int ret = -ENOMEM;
+
+       tmpp = gen_pool_create(ilog2(sizeof(struct mce_evt_llist)), -1);
+       if (!tmpp)
+               goto out;
+
+       ret = gen_pool_add(tmpp, (unsigned long)gen_pool_buf, MCE_POOLSZ, -1);
+       if (ret) {
+               gen_pool_destroy(tmpp);
+               goto out;
+       }
+
+       mce_evt_pool = tmpp;
+
+out:
+       return ret;
+}
+
+int mce_gen_pool_init(void)
+{
+       /* Just init mce_gen_pool once. */
+       if (mce_evt_pool)
+               return 0;
+
+       return mce_gen_pool_create();
+}
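
The llist_add()/llist_del_all() pattern used above stays lock-less because producers only push and the consumer detaches the entire list in one shot. A rough userspace analog with C11 atomics follows, purely for illustration; this is not how the kernel's llist is implemented:

/* Illustrative userspace analog of the llist push / del_all pattern. */
#include <stdatomic.h>
#include <stddef.h>
#include <stdio.h>

struct node {
	struct node *next;
	int payload;
};

static _Atomic(struct node *) list_head;

/* Producer side: lock-free push, usable from any context. */
static void push(struct node *n)
{
	struct node *old = atomic_load(&list_head);

	do {
		n->next = old;
	} while (!atomic_compare_exchange_weak(&list_head, &old, n));
}

/* Consumer side: detach the whole list at once, like llist_del_all(). */
static struct node *del_all(void)
{
	return atomic_exchange(&list_head, NULL);
}

int main(void)
{
	struct node a = { .payload = 1 }, b = { .payload = 2 };
	struct node *n;

	push(&a);
	push(&b);

	/*
	 * The list comes back in reverse insertion order, hence the
	 * llist_reverse_order() call in mce_gen_pool_process() above.
	 */
	for (n = del_all(); n; n = n->next)
		printf("%d\n", n->payload);

	return 0;
}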
diff --git a/arch/x86/kernel/cpu/mce/inject.c b/arch/x86/kernel/cpu/mce/inject.c
new file mode 100644 (file)
index 0000000..8492ef7
--- /dev/null
@@ -0,0 +1,739 @@
+/*
+ * Machine check injection support.
+ * Copyright 2008 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; version 2
+ * of the License.
+ *
+ * Authors:
+ * Andi Kleen
+ * Ying Huang
+ *
+ * The AMD part (from mce_amd_inj.c): a simple MCE injection facility
+ * for testing different aspects of the RAS code. This driver should be
+ * built as module so that it can be loaded on production kernels for
+ * testing purposes.
+ *
+ * This file may be distributed under the terms of the GNU General Public
+ * License version 2.
+ *
+ * Copyright (c) 2010-17:  Borislav Petkov <bp@alien8.de>
+ *                        Advanced Micro Devices Inc.
+ */
+
+#include <linux/cpu.h>
+#include <linux/debugfs.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/notifier.h>
+#include <linux/pci.h>
+#include <linux/uaccess.h>
+
+#include <asm/amd_nb.h>
+#include <asm/apic.h>
+#include <asm/irq_vectors.h>
+#include <asm/mce.h>
+#include <asm/nmi.h>
+#include <asm/smp.h>
+
+#include "internal.h"
+
+/*
+ * Collect all the MCi_XXX settings
+ */
+static struct mce i_mce;
+static struct dentry *dfs_inj;
+
+static u8 n_banks;
+
+#define MAX_FLAG_OPT_SIZE      4
+#define NBCFG                  0x44
+
+enum injection_type {
+       SW_INJ = 0,     /* SW injection, simply decode the error */
+       HW_INJ,         /* Trigger a #MC */
+       DFR_INT_INJ,    /* Trigger Deferred error interrupt */
+       THR_INT_INJ,    /* Trigger threshold interrupt */
+       N_INJ_TYPES,
+};
+
+static const char * const flags_options[] = {
+       [SW_INJ] = "sw",
+       [HW_INJ] = "hw",
+       [DFR_INT_INJ] = "df",
+       [THR_INT_INJ] = "th",
+       NULL
+};
+
+/* Set default injection to SW_INJ */
+static enum injection_type inj_type = SW_INJ;
+
+#define MCE_INJECT_SET(reg)                                            \
+static int inj_##reg##_set(void *data, u64 val)                                \
+{                                                                      \
+       struct mce *m = (struct mce *)data;                             \
+                                                                       \
+       m->reg = val;                                                   \
+       return 0;                                                       \
+}
+
+MCE_INJECT_SET(status);
+MCE_INJECT_SET(misc);
+MCE_INJECT_SET(addr);
+MCE_INJECT_SET(synd);
+
+#define MCE_INJECT_GET(reg)                                            \
+static int inj_##reg##_get(void *data, u64 *val)                       \
+{                                                                      \
+       struct mce *m = (struct mce *)data;                             \
+                                                                       \
+       *val = m->reg;                                                  \
+       return 0;                                                       \
+}
+
+MCE_INJECT_GET(status);
+MCE_INJECT_GET(misc);
+MCE_INJECT_GET(addr);
+MCE_INJECT_GET(synd);
+
+DEFINE_SIMPLE_ATTRIBUTE(status_fops, inj_status_get, inj_status_set, "%llx\n");
+DEFINE_SIMPLE_ATTRIBUTE(misc_fops, inj_misc_get, inj_misc_set, "%llx\n");
+DEFINE_SIMPLE_ATTRIBUTE(addr_fops, inj_addr_get, inj_addr_set, "%llx\n");
+DEFINE_SIMPLE_ATTRIBUTE(synd_fops, inj_synd_get, inj_synd_set, "%llx\n");
+
+static void setup_inj_struct(struct mce *m)
+{
+       memset(m, 0, sizeof(struct mce));
+
+       m->cpuvendor = boot_cpu_data.x86_vendor;
+       m->time      = ktime_get_real_seconds();
+       m->cpuid     = cpuid_eax(1);
+       m->microcode = boot_cpu_data.microcode;
+}
+
+/* Update fake mce registers on current CPU. */
+static void inject_mce(struct mce *m)
+{
+       struct mce *i = &per_cpu(injectm, m->extcpu);
+
+       /* Make sure no one reads partially written injectm */
+       i->finished = 0;
+       mb();
+       m->finished = 0;
+       /* First set the fields after finished */
+       i->extcpu = m->extcpu;
+       mb();
+       /* Now write record in order, finished last (except above) */
+       memcpy(i, m, sizeof(struct mce));
+       /* Finally activate it */
+       mb();
+       i->finished = 1;
+}
+
+static void raise_poll(struct mce *m)
+{
+       unsigned long flags;
+       mce_banks_t b;
+
+       memset(&b, 0xff, sizeof(mce_banks_t));
+       local_irq_save(flags);
+       machine_check_poll(0, &b);
+       local_irq_restore(flags);
+       m->finished = 0;
+}
+
+static void raise_exception(struct mce *m, struct pt_regs *pregs)
+{
+       struct pt_regs regs;
+       unsigned long flags;
+
+       if (!pregs) {
+               memset(&regs, 0, sizeof(struct pt_regs));
+               regs.ip = m->ip;
+               regs.cs = m->cs;
+               pregs = &regs;
+       }
+       /* IRQs will be disabled in the mcheck exception handler */
+       local_irq_save(flags);
+       do_machine_check(pregs, 0);
+       local_irq_restore(flags);
+       m->finished = 0;
+}
+
+static cpumask_var_t mce_inject_cpumask;
+static DEFINE_MUTEX(mce_inject_mutex);
+
+static int mce_raise_notify(unsigned int cmd, struct pt_regs *regs)
+{
+       int cpu = smp_processor_id();
+       struct mce *m = this_cpu_ptr(&injectm);
+       if (!cpumask_test_cpu(cpu, mce_inject_cpumask))
+               return NMI_DONE;
+       cpumask_clear_cpu(cpu, mce_inject_cpumask);
+       if (m->inject_flags & MCJ_EXCEPTION)
+               raise_exception(m, regs);
+       else if (m->status)
+               raise_poll(m);
+       return NMI_HANDLED;
+}
+
+static void mce_irq_ipi(void *info)
+{
+       int cpu = smp_processor_id();
+       struct mce *m = this_cpu_ptr(&injectm);
+
+       if (cpumask_test_cpu(cpu, mce_inject_cpumask) &&
+                       m->inject_flags & MCJ_EXCEPTION) {
+               cpumask_clear_cpu(cpu, mce_inject_cpumask);
+               raise_exception(m, NULL);
+       }
+}
+
+/* Inject mce on current CPU */
+static int raise_local(void)
+{
+       struct mce *m = this_cpu_ptr(&injectm);
+       int context = MCJ_CTX(m->inject_flags);
+       int ret = 0;
+       int cpu = m->extcpu;
+
+       if (m->inject_flags & MCJ_EXCEPTION) {
+               pr_info("Triggering MCE exception on CPU %d\n", cpu);
+               switch (context) {
+               case MCJ_CTX_IRQ:
+                       /*
+                        * Could do more to fake interrupts like
+                        * calling irq_enter, but the necessary
+                        * machinery isn't exported currently.
+                        */
+                       /*FALL THROUGH*/
+               case MCJ_CTX_PROCESS:
+                       raise_exception(m, NULL);
+                       break;
+               default:
+                       pr_info("Invalid MCE context\n");
+                       ret = -EINVAL;
+               }
+               pr_info("MCE exception done on CPU %d\n", cpu);
+       } else if (m->status) {
+               pr_info("Starting machine check poll CPU %d\n", cpu);
+               raise_poll(m);
+               mce_notify_irq();
+               pr_info("Machine check poll done on CPU %d\n", cpu);
+       } else
+               m->finished = 0;
+
+       return ret;
+}
+
+static void __maybe_unused raise_mce(struct mce *m)
+{
+       int context = MCJ_CTX(m->inject_flags);
+
+       inject_mce(m);
+
+       if (context == MCJ_CTX_RANDOM)
+               return;
+
+       if (m->inject_flags & (MCJ_IRQ_BROADCAST | MCJ_NMI_BROADCAST)) {
+               unsigned long start;
+               int cpu;
+
+               get_online_cpus();
+               cpumask_copy(mce_inject_cpumask, cpu_online_mask);
+               cpumask_clear_cpu(get_cpu(), mce_inject_cpumask);
+               for_each_online_cpu(cpu) {
+                       struct mce *mcpu = &per_cpu(injectm, cpu);
+                       if (!mcpu->finished ||
+                           MCJ_CTX(mcpu->inject_flags) != MCJ_CTX_RANDOM)
+                               cpumask_clear_cpu(cpu, mce_inject_cpumask);
+               }
+               if (!cpumask_empty(mce_inject_cpumask)) {
+                       if (m->inject_flags & MCJ_IRQ_BROADCAST) {
+                               /*
+                                * Don't wait here: mce_irq_ipi() needs to run
+                                * in parallel with the raise_local() below.
+                                */
+                               preempt_disable();
+                               smp_call_function_many(mce_inject_cpumask,
+                                       mce_irq_ipi, NULL, 0);
+                               preempt_enable();
+                       } else if (m->inject_flags & MCJ_NMI_BROADCAST)
+                               apic->send_IPI_mask(mce_inject_cpumask,
+                                               NMI_VECTOR);
+               }
+               start = jiffies;
+               while (!cpumask_empty(mce_inject_cpumask)) {
+                       if (!time_before(jiffies, start + 2*HZ)) {
+                               pr_err("Timeout waiting for mce inject %lx\n",
+                                       *cpumask_bits(mce_inject_cpumask));
+                               break;
+                       }
+                       cpu_relax();
+               }
+               raise_local();
+               put_cpu();
+               put_online_cpus();
+       } else {
+               preempt_disable();
+               raise_local();
+               preempt_enable();
+       }
+}
+
+static int mce_inject_raise(struct notifier_block *nb, unsigned long val,
+                           void *data)
+{
+       struct mce *m = (struct mce *)data;
+
+       if (!m)
+               return NOTIFY_DONE;
+
+       mutex_lock(&mce_inject_mutex);
+       raise_mce(m);
+       mutex_unlock(&mce_inject_mutex);
+
+       return NOTIFY_DONE;
+}
+
+static struct notifier_block inject_nb = {
+       .notifier_call  = mce_inject_raise,
+};
+
+/*
+ * The caller needs to make sure this CPU doesn't disappear
+ * from under us, i.e.: get_cpu/put_cpu.
+ */
+static int toggle_hw_mce_inject(unsigned int cpu, bool enable)
+{
+       u32 l, h;
+       int err;
+
+       err = rdmsr_on_cpu(cpu, MSR_K7_HWCR, &l, &h);
+       if (err) {
+               pr_err("%s: error reading HWCR\n", __func__);
+               return err;
+       }
+
+       enable ? (l |= BIT(18)) : (l &= ~BIT(18));
+
+       err = wrmsr_on_cpu(cpu, MSR_K7_HWCR, l, h);
+       if (err)
+               pr_err("%s: error writing HWCR\n", __func__);
+
+       return err;
+}
+
+static int __set_inj(const char *buf)
+{
+       int i;
+
+       for (i = 0; i < N_INJ_TYPES; i++) {
+               if (!strncmp(flags_options[i], buf, strlen(flags_options[i]))) {
+                       inj_type = i;
+                       return 0;
+               }
+       }
+       return -EINVAL;
+}
+
+static ssize_t flags_read(struct file *filp, char __user *ubuf,
+                         size_t cnt, loff_t *ppos)
+{
+       char buf[MAX_FLAG_OPT_SIZE];
+       int n;
+
+       n = sprintf(buf, "%s\n", flags_options[inj_type]);
+
+       return simple_read_from_buffer(ubuf, cnt, ppos, buf, n);
+}
+
+static ssize_t flags_write(struct file *filp, const char __user *ubuf,
+                          size_t cnt, loff_t *ppos)
+{
+       char buf[MAX_FLAG_OPT_SIZE], *__buf;
+       int err;
+
+       if (cnt > MAX_FLAG_OPT_SIZE)
+               return -EINVAL;
+
+       if (copy_from_user(&buf, ubuf, cnt))
+               return -EFAULT;
+
+       buf[cnt - 1] = 0;
+
+       /* strip whitespace */
+       __buf = strstrip(buf);
+
+       err = __set_inj(__buf);
+       if (err) {
+               pr_err("%s: Invalid flags value: %s\n", __func__, __buf);
+               return err;
+       }
+
+       *ppos += cnt;
+
+       return cnt;
+}
+
+static const struct file_operations flags_fops = {
+       .read           = flags_read,
+       .write          = flags_write,
+       .llseek         = generic_file_llseek,
+};
+
+/*
+ * On which CPU to inject?
+ */
+MCE_INJECT_GET(extcpu);
+
+static int inj_extcpu_set(void *data, u64 val)
+{
+       struct mce *m = (struct mce *)data;
+
+       if (val >= nr_cpu_ids || !cpu_online(val)) {
+               pr_err("%s: Invalid CPU: %llu\n", __func__, val);
+               return -EINVAL;
+       }
+       m->extcpu = val;
+       return 0;
+}
+
+DEFINE_SIMPLE_ATTRIBUTE(extcpu_fops, inj_extcpu_get, inj_extcpu_set, "%llu\n");
+
+static void trigger_mce(void *info)
+{
+       asm volatile("int $18");
+}
+
+static void trigger_dfr_int(void *info)
+{
+       asm volatile("int %0" :: "i" (DEFERRED_ERROR_VECTOR));
+}
+
+static void trigger_thr_int(void *info)
+{
+       asm volatile("int %0" :: "i" (THRESHOLD_APIC_VECTOR));
+}
+
+static u32 get_nbc_for_node(int node_id)
+{
+       struct cpuinfo_x86 *c = &boot_cpu_data;
+       u32 cores_per_node;
+
+       cores_per_node = (c->x86_max_cores * smp_num_siblings) / amd_get_nodes_per_socket();
+
+       return cores_per_node * node_id;
+}
+
+static void toggle_nb_mca_mst_cpu(u16 nid)
+{
+       struct amd_northbridge *nb;
+       struct pci_dev *F3;
+       u32 val;
+       int err;
+
+       nb = node_to_amd_nb(nid);
+       if (!nb)
+               return;
+
+       F3 = nb->misc;
+       if (!F3)
+               return;
+
+       err = pci_read_config_dword(F3, NBCFG, &val);
+       if (err) {
+               pr_err("%s: Error reading F%dx%03x.\n",
+                      __func__, PCI_FUNC(F3->devfn), NBCFG);
+               return;
+       }
+
+       if (val & BIT(27))
+               return;
+
+       pr_err("%s: Set D18F3x44[NbMcaToMstCpuEn] which BIOS hasn't done.\n",
+              __func__);
+
+       val |= BIT(27);
+       err = pci_write_config_dword(F3, NBCFG, val);
+       if (err)
+               pr_err("%s: Error writing F%dx%03x.\n",
+                      __func__, PCI_FUNC(F3->devfn), NBCFG);
+}
+
+static void prepare_msrs(void *info)
+{
+       struct mce m = *(struct mce *)info;
+       u8 b = m.bank;
+
+       wrmsrl(MSR_IA32_MCG_STATUS, m.mcgstatus);
+
+       if (boot_cpu_has(X86_FEATURE_SMCA)) {
+               if (m.inject_flags == DFR_INT_INJ) {
+                       wrmsrl(MSR_AMD64_SMCA_MCx_DESTAT(b), m.status);
+                       wrmsrl(MSR_AMD64_SMCA_MCx_DEADDR(b), m.addr);
+               } else {
+                       wrmsrl(MSR_AMD64_SMCA_MCx_STATUS(b), m.status);
+                       wrmsrl(MSR_AMD64_SMCA_MCx_ADDR(b), m.addr);
+               }
+
+               wrmsrl(MSR_AMD64_SMCA_MCx_MISC(b), m.misc);
+               wrmsrl(MSR_AMD64_SMCA_MCx_SYND(b), m.synd);
+       } else {
+               wrmsrl(MSR_IA32_MCx_STATUS(b), m.status);
+               wrmsrl(MSR_IA32_MCx_ADDR(b), m.addr);
+               wrmsrl(MSR_IA32_MCx_MISC(b), m.misc);
+       }
+}
+
+static void do_inject(void)
+{
+       u64 mcg_status = 0;
+       unsigned int cpu = i_mce.extcpu;
+       u8 b = i_mce.bank;
+
+       i_mce.tsc = rdtsc_ordered();
+
+       if (i_mce.misc)
+               i_mce.status |= MCI_STATUS_MISCV;
+
+       if (i_mce.synd)
+               i_mce.status |= MCI_STATUS_SYNDV;
+
+       if (inj_type == SW_INJ) {
+               mce_inject_log(&i_mce);
+               return;
+       }
+
+       /* prep MCE global settings for the injection */
+       mcg_status = MCG_STATUS_MCIP | MCG_STATUS_EIPV;
+
+       if (!(i_mce.status & MCI_STATUS_PCC))
+               mcg_status |= MCG_STATUS_RIPV;
+
+       /*
+        * Ensure necessary status bits for deferred errors:
+        * - MCx_STATUS[Deferred]: make sure it is a deferred error
+        * - MCx_STATUS[UC] cleared: deferred errors are _not_ UC
+        */
+       if (inj_type == DFR_INT_INJ) {
+               i_mce.status |= MCI_STATUS_DEFERRED;
+               i_mce.status &= ~MCI_STATUS_UC;
+       }
+
+       /*
+        * For multi node CPUs, logging and reporting of bank 4 errors happens
+        * only on the node base core. Refer to D18F3x44[NbMcaToMstCpuEn] for
+        * Fam10h and later BKDGs.
+        */
+       if (static_cpu_has(X86_FEATURE_AMD_DCM) &&
+           b == 4 &&
+           boot_cpu_data.x86 < 0x17) {
+               toggle_nb_mca_mst_cpu(amd_get_nb_id(cpu));
+               cpu = get_nbc_for_node(amd_get_nb_id(cpu));
+       }
+
+       get_online_cpus();
+       if (!cpu_online(cpu))
+               goto err;
+
+       toggle_hw_mce_inject(cpu, true);
+
+       i_mce.mcgstatus = mcg_status;
+       i_mce.inject_flags = inj_type;
+       smp_call_function_single(cpu, prepare_msrs, &i_mce, 0);
+
+       toggle_hw_mce_inject(cpu, false);
+
+       switch (inj_type) {
+       case DFR_INT_INJ:
+               smp_call_function_single(cpu, trigger_dfr_int, NULL, 0);
+               break;
+       case THR_INT_INJ:
+               smp_call_function_single(cpu, trigger_thr_int, NULL, 0);
+               break;
+       default:
+               smp_call_function_single(cpu, trigger_mce, NULL, 0);
+       }
+
+err:
+       put_online_cpus();
+
+}
+
+/*
+ * This denotes into which bank we're injecting and, at the same time,
+ * triggers the injection.
+ */
+static int inj_bank_set(void *data, u64 val)
+{
+       struct mce *m = (struct mce *)data;
+
+       if (val >= n_banks) {
+               pr_err("Non-existent MCE bank: %llu\n", val);
+               return -EINVAL;
+       }
+
+       m->bank = val;
+       do_inject();
+
+       /* Reset injection struct */
+       setup_inj_struct(&i_mce);
+
+       return 0;
+}
+
+MCE_INJECT_GET(bank);
+
+DEFINE_SIMPLE_ATTRIBUTE(bank_fops, inj_bank_get, inj_bank_set, "%llu\n");
+
+static const char readme_msg[] =
+"Description of the files and their usages:\n"
+"\n"
+"Note1: i refers to the bank number below.\n"
+"Note2: See respective BKDGs for the exact bit definitions of the files below\n"
+"as they mirror the hardware registers.\n"
+"\n"
+"status:\t Set MCi_STATUS: the bits in that MSR control the error type and\n"
+"\t attributes of the error which caused the MCE.\n"
+"\n"
+"misc:\t Set MCi_MISC: provide auxiliary info about the error. It is mostly\n"
+"\t used for error thresholding purposes and its validity is indicated by\n"
+"\t MCi_STATUS[MiscV].\n"
+"\n"
+"synd:\t Set MCi_SYND: provide syndrome info about the error. Only valid on\n"
+"\t Scalable MCA systems, and its validity is indicated by MCi_STATUS[SyndV].\n"
+"\n"
+"addr:\t Error address value to be written to MCi_ADDR. Log address information\n"
+"\t associated with the error.\n"
+"\n"
+"cpu:\t The CPU to inject the error on.\n"
+"\n"
+"bank:\t Specify the bank you want to inject the error into: the number of\n"
+"\t banks in a processor varies and is family/model-specific, therefore, the\n"
+"\t supplied value is sanity-checked. Setting the bank value also triggers the\n"
+"\t injection.\n"
+"\n"
+"flags:\t Injection type to be performed. Writing to this file will trigger a\n"
+"\t real machine check, an APIC interrupt or invoke the error decoder routines\n"
+"\t for AMD processors.\n"
+"\n"
+"\t Allowed error injection types:\n"
+"\t  - \"sw\": Software error injection. Decode error to a human-readable \n"
+"\t    format only. Safe to use.\n"
+"\t  - \"hw\": Hardware error injection. Causes the #MC exception handler to \n"
+"\t    handle the error. Be warned: might cause system panic if MCi_STATUS[PCC] \n"
+"\t    is set. Therefore, consider setting (debugfs_mountpoint)/mce/fake_panic \n"
+"\t    before injecting.\n"
+"\t  - \"df\": Trigger APIC interrupt for Deferred error. Causes deferred \n"
+"\t    error APIC interrupt handler to handle the error if the feature is \n"
+"\t    is present in hardware. \n"
+"\t  - \"th\": Trigger APIC interrupt for Threshold errors. Causes threshold \n"
+"\t    APIC interrupt handler to handle the error. \n"
+"\n";
+
+static ssize_t
+inj_readme_read(struct file *filp, char __user *ubuf,
+                      size_t cnt, loff_t *ppos)
+{
+       return simple_read_from_buffer(ubuf, cnt, ppos,
+                                       readme_msg, strlen(readme_msg));
+}
+
+static const struct file_operations readme_fops = {
+       .read           = inj_readme_read,
+};
+
+static struct dfs_node {
+       char *name;
+       struct dentry *d;
+       const struct file_operations *fops;
+       umode_t perm;
+} dfs_fls[] = {
+       { .name = "status",     .fops = &status_fops, .perm = S_IRUSR | S_IWUSR },
+       { .name = "misc",       .fops = &misc_fops,   .perm = S_IRUSR | S_IWUSR },
+       { .name = "addr",       .fops = &addr_fops,   .perm = S_IRUSR | S_IWUSR },
+       { .name = "synd",       .fops = &synd_fops,   .perm = S_IRUSR | S_IWUSR },
+       { .name = "bank",       .fops = &bank_fops,   .perm = S_IRUSR | S_IWUSR },
+       { .name = "flags",      .fops = &flags_fops,  .perm = S_IRUSR | S_IWUSR },
+       { .name = "cpu",        .fops = &extcpu_fops, .perm = S_IRUSR | S_IWUSR },
+       { .name = "README",     .fops = &readme_fops, .perm = S_IRUSR | S_IRGRP | S_IROTH },
+};
+
+static int __init debugfs_init(void)
+{
+       unsigned int i;
+       u64 cap;
+
+       rdmsrl(MSR_IA32_MCG_CAP, cap);
+       n_banks = cap & MCG_BANKCNT_MASK;
+
+       dfs_inj = debugfs_create_dir("mce-inject", NULL);
+       if (!dfs_inj)
+               return -EINVAL;
+
+       for (i = 0; i < ARRAY_SIZE(dfs_fls); i++) {
+               dfs_fls[i].d = debugfs_create_file(dfs_fls[i].name,
+                                                   dfs_fls[i].perm,
+                                                   dfs_inj,
+                                                   &i_mce,
+                                                   dfs_fls[i].fops);
+
+               if (!dfs_fls[i].d)
+                       goto err_dfs_add;
+       }
+
+       return 0;
+
+err_dfs_add:
+       while (i-- > 0)
+               debugfs_remove(dfs_fls[i].d);
+
+       debugfs_remove(dfs_inj);
+       dfs_inj = NULL;
+
+       return -ENODEV;
+}
+
+static int __init inject_init(void)
+{
+       int err;
+
+       if (!alloc_cpumask_var(&mce_inject_cpumask, GFP_KERNEL))
+               return -ENOMEM;
+
+       err = debugfs_init();
+       if (err) {
+               free_cpumask_var(mce_inject_cpumask);
+               return err;
+       }
+
+       register_nmi_handler(NMI_LOCAL, mce_raise_notify, 0, "mce_notify");
+       mce_register_injector_chain(&inject_nb);
+
+       setup_inj_struct(&i_mce);
+
+       pr_info("Machine check injector initialized\n");
+
+       return 0;
+}
+
+static void __exit inject_exit(void)
+{
+
+       mce_unregister_injector_chain(&inject_nb);
+       unregister_nmi_handler(NMI_LOCAL, "mce_notify");
+
+       debugfs_remove_recursive(dfs_inj);
+       dfs_inj = NULL;
+
+       memset(&dfs_fls, 0, sizeof(dfs_fls));
+
+       free_cpumask_var(mce_inject_cpumask);
+}
+
+module_init(inject_init);
+module_exit(inject_exit);
+MODULE_LICENSE("GPL");
diff --git a/arch/x86/kernel/cpu/mce/intel.c b/arch/x86/kernel/cpu/mce/intel.c
new file mode 100644 (file)
index 0000000..e43eb67
--- /dev/null
@@ -0,0 +1,518 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Intel specific MCE features.
+ * Copyright 2004 Zwane Mwaikambo <zwane@linuxpower.ca>
+ * Copyright (C) 2008, 2009 Intel Corporation
+ * Author: Andi Kleen
+ */
+
+#include <linux/gfp.h>
+#include <linux/interrupt.h>
+#include <linux/percpu.h>
+#include <linux/sched.h>
+#include <linux/cpumask.h>
+#include <asm/apic.h>
+#include <asm/cpufeature.h>
+#include <asm/intel-family.h>
+#include <asm/processor.h>
+#include <asm/msr.h>
+#include <asm/mce.h>
+
+#include "internal.h"
+
+/*
+ * Support for Intel Corrected Machine Check Interrupts (CMCI). This allows
+ * the CPU to raise an interrupt when a corrected machine check happened.
+ * Normally we pick those up using a regular polling timer.
+ * Also supports reliable discovery of shared banks.
+ */
+
+/*
+ * CMCI can be delivered to multiple cpus that share a machine check bank
+ * so we need to designate a single cpu to process errors logged in each bank
+ * in the interrupt handler (otherwise we would have many races and potential
+ * double reporting of the same error).
+ * Note that this can change when a cpu is offlined or brought online since
+ * some MCA banks are shared across cpus. When a cpu is offlined, cmci_clear()
+ * disables CMCI on all banks owned by the cpu and clears this bitfield. At
+ * this point, cmci_rediscover() kicks in and a different cpu may end up
+ * taking ownership of some of the shared MCA banks that were previously
+ * owned by the offlined cpu.
+ */
+static DEFINE_PER_CPU(mce_banks_t, mce_banks_owned);
+
+/*
+ * CMCI storm detection backoff counter
+ *
+ * During a storm, we reset this counter to INITIAL_CHECK_INTERVAL if we've
+ * encountered an error during the last poll. If not, we decrement it by one.
+ * the CMCI storm when it reaches 0.
+ */
+static DEFINE_PER_CPU(int, cmci_backoff_cnt);
+
+/*
+ * cmci_discover_lock protects against parallel discovery attempts
+ * which could race against each other.
+ */
+static DEFINE_RAW_SPINLOCK(cmci_discover_lock);
+
+#define CMCI_THRESHOLD         1
+#define CMCI_POLL_INTERVAL     (30 * HZ)
+#define CMCI_STORM_INTERVAL    (HZ)
+#define CMCI_STORM_THRESHOLD   15
+
+static DEFINE_PER_CPU(unsigned long, cmci_time_stamp);
+static DEFINE_PER_CPU(unsigned int, cmci_storm_cnt);
+static DEFINE_PER_CPU(unsigned int, cmci_storm_state);
+
+enum {
+       CMCI_STORM_NONE,
+       CMCI_STORM_ACTIVE,
+       CMCI_STORM_SUBSIDED,
+};
+
+static atomic_t cmci_storm_on_cpus;
+
+static int cmci_supported(int *banks)
+{
+       u64 cap;
+
+       if (mca_cfg.cmci_disabled || mca_cfg.ignore_ce)
+               return 0;
+
+       /*
+        * The vendor check is not strictly needed, but the early
+        * initialization is vendor-keyed and this makes sure none of
+        * the vendor-specific code paths is entered otherwise.
+        */
+       if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL)
+               return 0;
+       if (!boot_cpu_has(X86_FEATURE_APIC) || lapic_get_maxlvt() < 6)
+               return 0;
+       rdmsrl(MSR_IA32_MCG_CAP, cap);
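+       /* MCG_CAP[7:0] is the bank count; MCG_CMCI_P flags CMCI capability. */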
+       *banks = min_t(unsigned, MAX_NR_BANKS, cap & 0xff);
+       return !!(cap & MCG_CMCI_P);
+}
+
+static bool lmce_supported(void)
+{
+       u64 tmp;
+
+       if (mca_cfg.lmce_disabled)
+               return false;
+
+       rdmsrl(MSR_IA32_MCG_CAP, tmp);
+
+       /*
+        * LMCE depends on recovery support in the processor. Hence both
+        * MCG_SER_P and MCG_LMCE_P should be present in MCG_CAP.
+        */
+       if ((tmp & (MCG_SER_P | MCG_LMCE_P)) !=
+                  (MCG_SER_P | MCG_LMCE_P))
+               return false;
+
+       /*
+        * The BIOS should indicate support for LMCE by setting bit 20 in
+        * IA32_FEATURE_CONTROL, without which touching MCG_EXT_CTL will
+        * generate a #GP fault.
+        */
+       rdmsrl(MSR_IA32_FEATURE_CONTROL, tmp);
+       if ((tmp & (FEATURE_CONTROL_LOCKED | FEATURE_CONTROL_LMCE)) ==
+                  (FEATURE_CONTROL_LOCKED | FEATURE_CONTROL_LMCE))
+               return true;
+
+       return false;
+}
+
+bool mce_intel_cmci_poll(void)
+{
+       if (__this_cpu_read(cmci_storm_state) == CMCI_STORM_NONE)
+               return false;
+
+       /*
+        * Reset the counter if we've logged an error in the last poll
+        * during the storm.
+        */
+       if (machine_check_poll(0, this_cpu_ptr(&mce_banks_owned)))
+               this_cpu_write(cmci_backoff_cnt, INITIAL_CHECK_INTERVAL);
+       else
+               this_cpu_dec(cmci_backoff_cnt);
+
+       return true;
+}
+
+void mce_intel_hcpu_update(unsigned long cpu)
+{
+       if (per_cpu(cmci_storm_state, cpu) == CMCI_STORM_ACTIVE)
+               atomic_dec(&cmci_storm_on_cpus);
+
+       per_cpu(cmci_storm_state, cpu) = CMCI_STORM_NONE;
+}
+
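+/* Switch CMCI on or off for every MCA bank owned by this CPU. */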
+static void cmci_toggle_interrupt_mode(bool on)
+{
+       unsigned long flags, *owned;
+       int bank;
+       u64 val;
+
+       raw_spin_lock_irqsave(&cmci_discover_lock, flags);
+       owned = this_cpu_ptr(mce_banks_owned);
+       for_each_set_bit(bank, owned, MAX_NR_BANKS) {
+               rdmsrl(MSR_IA32_MCx_CTL2(bank), val);
+
+               if (on)
+                       val |= MCI_CTL2_CMCI_EN;
+               else
+                       val &= ~MCI_CTL2_CMCI_EN;
+
+               wrmsrl(MSR_IA32_MCx_CTL2(bank), val);
+       }
+       raw_spin_unlock_irqrestore(&cmci_discover_lock, flags);
+}
+
+unsigned long cmci_intel_adjust_timer(unsigned long interval)
+{
+       if ((this_cpu_read(cmci_backoff_cnt) > 0) &&
+           (__this_cpu_read(cmci_storm_state) == CMCI_STORM_ACTIVE)) {
+               mce_notify_irq();
+               return CMCI_STORM_INTERVAL;
+       }
+
+       switch (__this_cpu_read(cmci_storm_state)) {
+       case CMCI_STORM_ACTIVE:
+
+               /*
+                * We switch back to interrupt mode once the poll timer has
+                * silenced itself. That means no events recorded and the timer
+                * interval is back to our poll interval.
+                */
+               __this_cpu_write(cmci_storm_state, CMCI_STORM_SUBSIDED);
+               if (!atomic_sub_return(1, &cmci_storm_on_cpus))
+                       pr_notice("CMCI storm subsided: switching to interrupt mode\n");
+
+               /* FALLTHROUGH */
+
+       case CMCI_STORM_SUBSIDED:
+               /*
+                * We wait for all CPUs to go back to SUBSIDED state. When that
+                * happens we switch back to interrupt mode.
+                */
+               if (!atomic_read(&cmci_storm_on_cpus)) {
+                       __this_cpu_write(cmci_storm_state, CMCI_STORM_NONE);
+                       cmci_toggle_interrupt_mode(true);
+                       cmci_recheck();
+               }
+               return CMCI_POLL_INTERVAL;
+       default:
+
+               /* We have shiny weather. Let the poll do whatever it thinks. */
+               return interval;
+       }
+}
+
+static bool cmci_storm_detect(void)
+{
+       unsigned int cnt = __this_cpu_read(cmci_storm_cnt);
+       unsigned long ts = __this_cpu_read(cmci_time_stamp);
+       unsigned long now = jiffies;
+       int r;
+
+       if (__this_cpu_read(cmci_storm_state) != CMCI_STORM_NONE)
+               return true;
+
+       if (time_before_eq(now, ts + CMCI_STORM_INTERVAL)) {
+               cnt++;
+       } else {
+               cnt = 1;
+               __this_cpu_write(cmci_time_stamp, now);
+       }
+       __this_cpu_write(cmci_storm_cnt, cnt);
+
+       if (cnt <= CMCI_STORM_THRESHOLD)
+               return false;
+
+       cmci_toggle_interrupt_mode(false);
+       __this_cpu_write(cmci_storm_state, CMCI_STORM_ACTIVE);
+       r = atomic_add_return(1, &cmci_storm_on_cpus);
+       mce_timer_kick(CMCI_STORM_INTERVAL);
+       this_cpu_write(cmci_backoff_cnt, INITIAL_CHECK_INTERVAL);
+
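+       /* Only the first CPU entering storm mode prints the notice. */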
+       if (r == 1)
+               pr_notice("CMCI storm detected: switching to poll mode\n");
+       return true;
+}
+
+/*
+ * The interrupt handler. This is called on every event.
+ * Just call the poller directly to log any events.
+ * This could in theory increase the threshold under high load,
+ * but doesn't for now.
+ */
+static void intel_threshold_interrupt(void)
+{
+       if (cmci_storm_detect())
+               return;
+
+       machine_check_poll(MCP_TIMESTAMP, this_cpu_ptr(&mce_banks_owned));
+}
+
+/*
+ * Enable CMCI (Corrected Machine Check Interrupt) for available MCE banks
+ * on this CPU. Use the algorithm recommended in the SDM to discover shared
+ * banks.
+ */
+static void cmci_discover(int banks)
+{
+       unsigned long *owned = (void *)this_cpu_ptr(&mce_banks_owned);
+       unsigned long flags;
+       int i;
+       int bios_wrong_thresh = 0;
+
+       raw_spin_lock_irqsave(&cmci_discover_lock, flags);
+       for (i = 0; i < banks; i++) {
+               u64 val;
+               int bios_zero_thresh = 0;
+
+               if (test_bit(i, owned))
+                       continue;
+
+               /* Skip banks in firmware first mode */
+               if (test_bit(i, mce_banks_ce_disabled))
+                       continue;
+
+               rdmsrl(MSR_IA32_MCx_CTL2(i), val);
+
+               /* Already owned by someone else? */
+               if (val & MCI_CTL2_CMCI_EN) {
+                       clear_bit(i, owned);
+                       __clear_bit(i, this_cpu_ptr(mce_poll_banks));
+                       continue;
+               }
+
+               if (!mca_cfg.bios_cmci_threshold) {
+                       val &= ~MCI_CTL2_CMCI_THRESHOLD_MASK;
+                       val |= CMCI_THRESHOLD;
+               } else if (!(val & MCI_CTL2_CMCI_THRESHOLD_MASK)) {
+                       /*
+                        * If bios_cmci_threshold boot option was specified
+                        * but the threshold is zero, we'll try to initialize
+                        * it to 1.
+                        */
+                       bios_zero_thresh = 1;
+                       val |= CMCI_THRESHOLD;
+               }
+
+               val |= MCI_CTL2_CMCI_EN;
+               wrmsrl(MSR_IA32_MCx_CTL2(i), val);
+               rdmsrl(MSR_IA32_MCx_CTL2(i), val);
+
+               /* Did the enable bit stick? -- the bank supports CMCI */
+               if (val & MCI_CTL2_CMCI_EN) {
+                       set_bit(i, owned);
+                       __clear_bit(i, this_cpu_ptr(mce_poll_banks));
+                       /*
+                        * We are able to set thresholds for some banks that
+                        * had a threshold of 0. This means the BIOS has not
+                        * set the thresholds properly or does not work with
+                        * this boot option. Note this down now and report it later.
+                        */
+                       if (mca_cfg.bios_cmci_threshold && bios_zero_thresh &&
+                                       (val & MCI_CTL2_CMCI_THRESHOLD_MASK))
+                               bios_wrong_thresh = 1;
+               } else {
+                       WARN_ON(!test_bit(i, this_cpu_ptr(mce_poll_banks)));
+               }
+       }
+       raw_spin_unlock_irqrestore(&cmci_discover_lock, flags);
+       if (mca_cfg.bios_cmci_threshold && bios_wrong_thresh) {
+               pr_info_once(
+                       "bios_cmci_threshold: Some banks do not have valid thresholds set\n");
+               pr_info_once(
+                       "bios_cmci_threshold: Make sure your BIOS supports this boot option\n");
+       }
+}
+
+/*
+ * Just in case we missed an event during initialization, check
+ * all the CMCI-owned banks.
+ */
+void cmci_recheck(void)
+{
+       unsigned long flags;
+       int banks;
+
+       if (!mce_available(raw_cpu_ptr(&cpu_info)) || !cmci_supported(&banks))
+               return;
+
+       local_irq_save(flags);
+       machine_check_poll(0, this_cpu_ptr(&mce_banks_owned));
+       local_irq_restore(flags);
+}
+
+/* Caller must hold the lock on cmci_discover_lock */
+static void __cmci_disable_bank(int bank)
+{
+       u64 val;
+
+       if (!test_bit(bank, this_cpu_ptr(mce_banks_owned)))
+               return;
+       rdmsrl(MSR_IA32_MCx_CTL2(bank), val);
+       val &= ~MCI_CTL2_CMCI_EN;
+       wrmsrl(MSR_IA32_MCx_CTL2(bank), val);
+       __clear_bit(bank, this_cpu_ptr(mce_banks_owned));
+}
+
+/*
+ * Disable CMCI on this CPU for all banks it owns when it goes down.
+ * This allows other CPUs to claim the banks on rediscovery.
+ */
+void cmci_clear(void)
+{
+       unsigned long flags;
+       int i;
+       int banks;
+
+       if (!cmci_supported(&banks))
+               return;
+       raw_spin_lock_irqsave(&cmci_discover_lock, flags);
+       for (i = 0; i < banks; i++)
+               __cmci_disable_bank(i);
+       raw_spin_unlock_irqrestore(&cmci_discover_lock, flags);
+}
+
+static void cmci_rediscover_work_func(void *arg)
+{
+       int banks;
+
+       /* Recheck banks in case CPUs don't all have the same banks */
+       if (cmci_supported(&banks))
+               cmci_discover(banks);
+}
+
+/* After a CPU went down, cycle through all the other CPUs and rediscover */
+void cmci_rediscover(void)
+{
+       int banks;
+
+       if (!cmci_supported(&banks))
+               return;
+
+       on_each_cpu(cmci_rediscover_work_func, NULL, 1);
+}
+
+/*
+ * Reenable CMCI on this CPU in case a CPU down failed.
+ */
+void cmci_reenable(void)
+{
+       int banks;
+       if (cmci_supported(&banks))
+               cmci_discover(banks);
+}
+
+void cmci_disable_bank(int bank)
+{
+       int banks;
+       unsigned long flags;
+
+       if (!cmci_supported(&banks))
+               return;
+
+       raw_spin_lock_irqsave(&cmci_discover_lock, flags);
+       __cmci_disable_bank(bank);
+       raw_spin_unlock_irqrestore(&cmci_discover_lock, flags);
+}
+
+static void intel_init_cmci(void)
+{
+       int banks;
+
+       if (!cmci_supported(&banks))
+               return;
+
+       mce_threshold_vector = intel_threshold_interrupt;
+       cmci_discover(banks);
+       /*
+        * For CPU #0 this runs with the APIC still disabled, but that's
+        * OK because only the vector is set up here. We still do another
+        * check of the banks later for CPU #0 just to make sure
+        * we don't miss any events.
+        */
+       apic_write(APIC_LVTCMCI, THRESHOLD_APIC_VECTOR|APIC_DM_FIXED);
+       cmci_recheck();
+}
+
+static void intel_init_lmce(void)
+{
+       u64 val;
+
+       if (!lmce_supported())
+               return;
+
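+       /*
+        * Opt in to local MCE delivery: it allows the processor to signal
+        * certain machine checks only to the affected logical CPU instead
+        * of broadcasting them to all CPUs.
+        */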
+       rdmsrl(MSR_IA32_MCG_EXT_CTL, val);
+
+       if (!(val & MCG_EXT_CTL_LMCE_EN))
+               wrmsrl(MSR_IA32_MCG_EXT_CTL, val | MCG_EXT_CTL_LMCE_EN);
+}
+
+static void intel_clear_lmce(void)
+{
+       u64 val;
+
+       if (!lmce_supported())
+               return;
+
+       rdmsrl(MSR_IA32_MCG_EXT_CTL, val);
+       val &= ~MCG_EXT_CTL_LMCE_EN;
+       wrmsrl(MSR_IA32_MCG_EXT_CTL, val);
+}
+
+static void intel_ppin_init(struct cpuinfo_x86 *c)
+{
+       unsigned long long val;
+
+       /*
+        * Even though testing for the presence of the MSR would be enough, we
+        * don't want to risk the situation where other models reuse this MSR for
+        * other purposes.
+        */
+       switch (c->x86_model) {
+       case INTEL_FAM6_IVYBRIDGE_X:
+       case INTEL_FAM6_HASWELL_X:
+       case INTEL_FAM6_BROADWELL_XEON_D:
+       case INTEL_FAM6_BROADWELL_X:
+       case INTEL_FAM6_SKYLAKE_X:
+       case INTEL_FAM6_XEON_PHI_KNL:
+       case INTEL_FAM6_XEON_PHI_KNM:
+
+               if (rdmsrl_safe(MSR_PPIN_CTL, &val))
+                       return;
+
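+               /*
+                * MSR_PPIN_CTL bit 0 is LockOut and bit 1 is Enable, so a
+                * value of 01b means PPIN is disabled and locked until reset.
+                */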
+               if ((val & 3UL) == 1UL) {
+                       /* PPIN available but disabled: */
+                       return;
+               }
+
+               /* If PPIN is disabled, but not locked, try to enable: */
+               if (!(val & 3UL)) {
+                       wrmsrl_safe(MSR_PPIN_CTL,  val | 2UL);
+                       rdmsrl_safe(MSR_PPIN_CTL, &val);
+               }
+
+               if ((val & 3UL) == 2UL)
+                       set_cpu_cap(c, X86_FEATURE_INTEL_PPIN);
+       }
+}
+
+void mce_intel_feature_init(struct cpuinfo_x86 *c)
+{
+       intel_init_thermal(c);
+       intel_init_cmci();
+       intel_init_lmce();
+       intel_ppin_init(c);
+}
+
+void mce_intel_feature_clear(struct cpuinfo_x86 *c)
+{
+       intel_clear_lmce();
+}
diff --git a/arch/x86/kernel/cpu/mce/internal.h b/arch/x86/kernel/cpu/mce/internal.h
new file mode 100644 (file)
index 0000000..ceb67cd
--- /dev/null
@@ -0,0 +1,173 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __X86_MCE_INTERNAL_H__
+#define __X86_MCE_INTERNAL_H__
+
+#include <linux/device.h>
+#include <asm/mce.h>
+
+enum severity_level {
+       MCE_NO_SEVERITY,
+       MCE_DEFERRED_SEVERITY,
+       MCE_UCNA_SEVERITY = MCE_DEFERRED_SEVERITY,
+       MCE_KEEP_SEVERITY,
+       MCE_SOME_SEVERITY,
+       MCE_AO_SEVERITY,
+       MCE_UC_SEVERITY,
+       MCE_AR_SEVERITY,
+       MCE_PANIC_SEVERITY,
+};
+
+extern struct blocking_notifier_head x86_mce_decoder_chain;
+
+#define ATTR_LEN               16
+#define INITIAL_CHECK_INTERVAL 5 * 60 /* 5 minutes */
+
+/* One object for each MCE bank, shared by all CPUs */
+struct mce_bank {
+       u64                     ctl;                    /* subevents to enable */
+       unsigned char init;                             /* initialise bank? */
+       struct device_attribute attr;                   /* device attribute */
+       char                    attrname[ATTR_LEN];     /* attribute name */
+};
+
+struct mce_evt_llist {
+       struct llist_node llnode;
+       struct mce mce;
+};
+
+void mce_gen_pool_process(struct work_struct *__unused);
+bool mce_gen_pool_empty(void);
+int mce_gen_pool_add(struct mce *mce);
+int mce_gen_pool_init(void);
+struct llist_node *mce_gen_pool_prepare_records(void);
+
+extern int (*mce_severity)(struct mce *a, int tolerant, char **msg, bool is_excp);
+struct dentry *mce_get_debugfs_dir(void);
+
+extern struct mce_bank *mce_banks;
+extern mce_banks_t mce_banks_ce_disabled;
+
+#ifdef CONFIG_X86_MCE_INTEL
+unsigned long cmci_intel_adjust_timer(unsigned long interval);
+bool mce_intel_cmci_poll(void);
+void mce_intel_hcpu_update(unsigned long cpu);
+void cmci_disable_bank(int bank);
+#else
+# define cmci_intel_adjust_timer mce_adjust_timer_default
+static inline bool mce_intel_cmci_poll(void) { return false; }
+static inline void mce_intel_hcpu_update(unsigned long cpu) { }
+static inline void cmci_disable_bank(int bank) { }
+#endif
+
+void mce_timer_kick(unsigned long interval);
+
+#ifdef CONFIG_ACPI_APEI
+int apei_write_mce(struct mce *m);
+ssize_t apei_read_mce(struct mce *m, u64 *record_id);
+int apei_check_mce(void);
+int apei_clear_mce(u64 record_id);
+#else
+static inline int apei_write_mce(struct mce *m)
+{
+       return -EINVAL;
+}
+static inline ssize_t apei_read_mce(struct mce *m, u64 *record_id)
+{
+       return 0;
+}
+static inline int apei_check_mce(void)
+{
+       return 0;
+}
+static inline int apei_clear_mce(u64 record_id)
+{
+       return -EINVAL;
+}
+#endif
+
+void mce_inject_log(struct mce *m);
+
+/*
+ * We consider records to be equivalent if bank+status+addr+misc all match.
+ * This is only used when the system is going down because of a fatal error
+ * to avoid cluttering the console log with essentially repeated information.
+ * In normal processing all errors seen are logged.
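+ * Returns true when the two records differ in any of these fields.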
+ */
+static inline bool mce_cmp(struct mce *m1, struct mce *m2)
+{
+       return m1->bank != m2->bank ||
+               m1->status != m2->status ||
+               m1->addr != m2->addr ||
+               m1->misc != m2->misc;
+}
+
+extern struct device_attribute dev_attr_trigger;
+
+#ifdef CONFIG_X86_MCELOG_LEGACY
+void mce_work_trigger(void);
+void mce_register_injector_chain(struct notifier_block *nb);
+void mce_unregister_injector_chain(struct notifier_block *nb);
+#else
+static inline void mce_work_trigger(void)      { }
+static inline void mce_register_injector_chain(struct notifier_block *nb)      { }
+static inline void mce_unregister_injector_chain(struct notifier_block *nb)    { }
+#endif
+
+struct mca_config {
+       bool dont_log_ce;
+       bool cmci_disabled;
+       bool ignore_ce;
+
+       __u64 lmce_disabled             : 1,
+             disabled                  : 1,
+             ser                       : 1,
+             recovery                  : 1,
+             bios_cmci_threshold       : 1,
+             __reserved                : 59;
+
+       u8 banks;
+       s8 bootlog;
+       int tolerant;
+       int monarch_timeout;
+       int panic_timeout;
+       u32 rip_msr;
+};
+
+extern struct mca_config mca_cfg;
+
+struct mce_vendor_flags {
+       /*
+        * Indicates that overflow conditions are not fatal, when set.
+        */
+       __u64 overflow_recov    : 1,
+
+       /*
+        * (AMD) SUCCOR stands for S/W UnCorrectable error COntainment and
+        * Recovery. It indicates support for data poisoning in HW and deferred
+        * error interrupts.
+        */
+             succor            : 1,
+
+       /*
+        * (AMD) SMCA: This bit indicates support for Scalable MCA which expands
+        * the register space for each MCA bank and also increases the number of
+        * banks. Also, to accommodate the new banks and registers, the MCA
+        * register space is moved to a new MSR range.
+        */
+             smca              : 1,
+
+             __reserved_0      : 61;
+};
+
+extern struct mce_vendor_flags mce_flags;
+
+struct mca_msr_regs {
+       u32 (*ctl)      (int bank);
+       u32 (*status)   (int bank);
+       u32 (*addr)     (int bank);
+       u32 (*misc)     (int bank);
+};
+
+extern struct mca_msr_regs msr_ops;
+
+#endif /* __X86_MCE_INTERNAL_H__ */
diff --git a/arch/x86/kernel/cpu/mce/p5.c b/arch/x86/kernel/cpu/mce/p5.c
new file mode 100644 (file)
index 0000000..5cddf83
--- /dev/null
@@ -0,0 +1,69 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * P5 specific Machine Check Exception Reporting
+ * (C) Copyright 2002 Alan Cox <alan@lxorguk.ukuu.org.uk>
+ */
+#include <linux/interrupt.h>
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/smp.h>
+
+#include <asm/processor.h>
+#include <asm/traps.h>
+#include <asm/tlbflush.h>
+#include <asm/mce.h>
+#include <asm/msr.h>
+
+/* By default disabled */
+int mce_p5_enabled __read_mostly;
+
+/* Machine check handler for Pentium class Intel CPUs: */
+static void pentium_machine_check(struct pt_regs *regs, long error_code)
+{
+       u32 loaddr, hi, lotype;
+
+       ist_enter(regs);
+
+       rdmsr(MSR_IA32_P5_MC_ADDR, loaddr, hi);
+       rdmsr(MSR_IA32_P5_MC_TYPE, lotype, hi);
+
+       pr_emerg("CPU#%d: Machine Check Exception:  0x%8X (type 0x%8X).\n",
+                smp_processor_id(), loaddr, lotype);
+
+       if (lotype & (1<<5)) {
+               pr_emerg("CPU#%d: Possible thermal failure (CPU on fire ?).\n",
+                        smp_processor_id());
+       }
+
+       add_taint(TAINT_MACHINE_CHECK, LOCKDEP_NOW_UNRELIABLE);
+
+       ist_exit(regs);
+}
+
+/* Set up machine check reporting for processors with Intel style MCE: */
+void intel_p5_mcheck_init(struct cpuinfo_x86 *c)
+{
+       u32 l, h;
+
+       /* Default P5 to off as it's often misconnected: */
+       if (!mce_p5_enabled)
+               return;
+
+       /* Check for MCE support: */
+       if (!cpu_has(c, X86_FEATURE_MCE))
+               return;
+
+       machine_check_vector = pentium_machine_check;
+       /* Make sure the vector pointer is visible before we enable MCEs: */
+       wmb();
+
+       /* Read registers before enabling: */
+       rdmsr(MSR_IA32_P5_MC_ADDR, l, h);
+       rdmsr(MSR_IA32_P5_MC_TYPE, l, h);
+       pr_info("Intel old style machine check architecture supported.\n");
+
+       /* Enable MCE: */
+       cr4_set_bits(X86_CR4_MCE);
+       pr_info("Intel old style machine check reporting enabled on CPU#%d.\n",
+               smp_processor_id());
+}
diff --git a/arch/x86/kernel/cpu/mce/severity.c b/arch/x86/kernel/cpu/mce/severity.c
new file mode 100644 (file)
index 0000000..dc3e26e
--- /dev/null
@@ -0,0 +1,419 @@
+/*
+ * MCE grading rules.
+ * Copyright 2008, 2009 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; version 2
+ * of the License.
+ *
+ * Author: Andi Kleen
+ */
+#include <linux/kernel.h>
+#include <linux/seq_file.h>
+#include <linux/init.h>
+#include <linux/debugfs.h>
+#include <asm/mce.h>
+#include <linux/uaccess.h>
+
+#include "internal.h"
+
+/*
+ * Grade an MCE by severity. In general the most severe ones are processed
+ * first. Since there are quite a lot of combinations, test the bits in a
+ * table-driven way. The rules are simply processed in order, first
+ * match wins.
+ *
+ * Note this is only used for machine check exceptions; the corrected
+ * errors use much simpler rules. The exceptions still check for the corrected
+ * errors, but only to leave them alone for the CMCI handler (except for
+ * panic situations).
+ */
+
+enum context { IN_KERNEL = 1, IN_USER = 2, IN_KERNEL_RECOV = 3 };
+enum ser { SER_REQUIRED = 1, NO_SER = 2 };
+enum exception { EXCP_CONTEXT = 1, NO_EXCP = 2 };
+
+static struct severity {
+       u64 mask;
+       u64 result;
+       unsigned char sev;
+       unsigned char mcgmask;
+       unsigned char mcgres;
+       unsigned char ser;
+       unsigned char context;
+       unsigned char excp;
+       unsigned char covered;
+       char *msg;
+} severities[] = {
+#define MCESEV(s, m, c...) { .sev = MCE_ ## s ## _SEVERITY, .msg = m, ## c }
+#define  KERNEL                .context = IN_KERNEL
+#define  USER          .context = IN_USER
+#define  KERNEL_RECOV  .context = IN_KERNEL_RECOV
+#define  SER           .ser = SER_REQUIRED
+#define  NOSER         .ser = NO_SER
+#define  EXCP          .excp = EXCP_CONTEXT
+#define  NOEXCP                .excp = NO_EXCP
+#define  BITCLR(x)     .mask = x, .result = 0
+#define  BITSET(x)     .mask = x, .result = x
+#define  MCGMASK(x, y) .mcgmask = x, .mcgres = y
+#define  MASK(x, y)    .mask = x, .result = y
+#define MCI_UC_S (MCI_STATUS_UC|MCI_STATUS_S)
+#define MCI_UC_AR (MCI_STATUS_UC|MCI_STATUS_AR)
+#define MCI_UC_SAR (MCI_STATUS_UC|MCI_STATUS_S|MCI_STATUS_AR)
+#define        MCI_ADDR (MCI_STATUS_ADDRV|MCI_STATUS_MISCV)
+
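+       /*
+        * A rule matches when (status & mask) == result and
+        * (mcgstatus & mcgmask) == mcgres; the ser, context and excp
+        * fields, when set, must match as well. The first match wins.
+        */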
+       MCESEV(
+               NO, "Invalid",
+               BITCLR(MCI_STATUS_VAL)
+               ),
+       MCESEV(
+               NO, "Not enabled",
+               EXCP, BITCLR(MCI_STATUS_EN)
+               ),
+       MCESEV(
+               PANIC, "Processor context corrupt",
+               BITSET(MCI_STATUS_PCC)
+               ),
+       /* When MCIP is not set, something is very confused */
+       MCESEV(
+               PANIC, "MCIP not set in MCA handler",
+               EXCP, MCGMASK(MCG_STATUS_MCIP, 0)
+               ),
+       /* Neither return nor error IP -- no chance to recover -> PANIC */
+       MCESEV(
+               PANIC, "Neither restart nor error IP",
+               EXCP, MCGMASK(MCG_STATUS_RIPV|MCG_STATUS_EIPV, 0)
+               ),
+       MCESEV(
+               PANIC, "In kernel and no restart IP",
+               EXCP, KERNEL, MCGMASK(MCG_STATUS_RIPV, 0)
+               ),
+       MCESEV(
+               PANIC, "In kernel and no restart IP",
+               EXCP, KERNEL_RECOV, MCGMASK(MCG_STATUS_RIPV, 0)
+               ),
+       MCESEV(
+               DEFERRED, "Deferred error",
+               NOSER, MASK(MCI_STATUS_UC|MCI_STATUS_DEFERRED|MCI_STATUS_POISON, MCI_STATUS_DEFERRED)
+               ),
+       MCESEV(
+               KEEP, "Corrected error",
+               NOSER, BITCLR(MCI_STATUS_UC)
+               ),
+
+       /*
+        * known AO MCACODs reported via MCE or CMC:
+        *
+        * SRAO could be signaled either via a machine check exception or
+        * CMCI, with the corresponding bit S set to 1 or 0 respectively.
+        * So we don't need to check bit S for SRAO.
+        */
+       MCESEV(
+               AO, "Action optional: memory scrubbing error",
+               SER, MASK(MCI_STATUS_OVER|MCI_UC_AR|MCACOD_SCRUBMSK, MCI_STATUS_UC|MCACOD_SCRUB)
+               ),
+       MCESEV(
+               AO, "Action optional: last level cache writeback error",
+               SER, MASK(MCI_STATUS_OVER|MCI_UC_AR|MCACOD, MCI_STATUS_UC|MCACOD_L3WB)
+               ),
+
+       /* ignore OVER for UCNA */
+       MCESEV(
+               UCNA, "Uncorrected no action required",
+               SER, MASK(MCI_UC_SAR, MCI_STATUS_UC)
+               ),
+       MCESEV(
+               PANIC, "Illegal combination (UCNA with AR=1)",
+               SER,
+               MASK(MCI_STATUS_OVER|MCI_UC_SAR, MCI_STATUS_UC|MCI_STATUS_AR)
+               ),
+       MCESEV(
+               KEEP, "Non signalled machine check",
+               SER, BITCLR(MCI_STATUS_S)
+               ),
+
+       MCESEV(
+               PANIC, "Action required with lost events",
+               SER, BITSET(MCI_STATUS_OVER|MCI_UC_SAR)
+               ),
+
+       /* known AR MCACODs: */
+#ifdef CONFIG_MEMORY_FAILURE
+       MCESEV(
+               KEEP, "Action required but unaffected thread is continuable",
+               SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCI_ADDR, MCI_UC_SAR|MCI_ADDR),
+               MCGMASK(MCG_STATUS_RIPV|MCG_STATUS_EIPV, MCG_STATUS_RIPV)
+               ),
+       MCESEV(
+               AR, "Action required: data load in error recoverable area of kernel",
+               SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCI_ADDR|MCACOD, MCI_UC_SAR|MCI_ADDR|MCACOD_DATA),
+               KERNEL_RECOV
+               ),
+       MCESEV(
+               AR, "Action required: data load error in a user process",
+               SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCI_ADDR|MCACOD, MCI_UC_SAR|MCI_ADDR|MCACOD_DATA),
+               USER
+               ),
+       MCESEV(
+               AR, "Action required: instruction fetch error in a user process",
+               SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCI_ADDR|MCACOD, MCI_UC_SAR|MCI_ADDR|MCACOD_INSTR),
+               USER
+               ),
+       MCESEV(
+               PANIC, "Data load in unrecoverable area of kernel",
+               SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCI_ADDR|MCACOD, MCI_UC_SAR|MCI_ADDR|MCACOD_DATA),
+               KERNEL
+               ),
+#endif
+       MCESEV(
+               PANIC, "Action required: unknown MCACOD",
+               SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR, MCI_UC_SAR)
+               ),
+
+       MCESEV(
+               SOME, "Action optional: unknown MCACOD",
+               SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR, MCI_UC_S)
+               ),
+       MCESEV(
+               SOME, "Action optional with lost events",
+               SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR, MCI_STATUS_OVER|MCI_UC_S)
+               ),
+
+       MCESEV(
+               PANIC, "Overflowed uncorrected",
+               BITSET(MCI_STATUS_OVER|MCI_STATUS_UC)
+               ),
+       MCESEV(
+               UC, "Uncorrected",
+               BITSET(MCI_STATUS_UC)
+               ),
+       MCESEV(
+               SOME, "No match",
+               BITSET(0)
+               )       /* always matches. keep at end */
+};
+
+#define mc_recoverable(mcg) (((mcg) & (MCG_STATUS_RIPV|MCG_STATUS_EIPV)) == \
+                               (MCG_STATUS_RIPV|MCG_STATUS_EIPV))
+
+/*
+ * If mcgstatus indicated that ip/cs on the stack were
+ * no good, then "m->cs" will be zero and we will have
+ * to assume the worst case (IN_KERNEL) as we actually
+ * have no idea what we were executing when the machine
+ * check hit.
+ * If we do have a good "m->cs" (or a faked one in the
+ * case we were executing in VM86 mode) we can use it to
+ * distinguish an exception taken in user mode from one
+ * taken in the kernel.
+ */
+static int error_context(struct mce *m)
+{
+       if ((m->cs & 3) == 3)
+               return IN_USER;
+       if (mc_recoverable(m->mcgstatus) && ex_has_fault_handler(m->ip))
+               return IN_KERNEL_RECOV;
+       return IN_KERNEL;
+}
+
+static int mce_severity_amd_smca(struct mce *m, enum context err_ctx)
+{
+       u32 addr = MSR_AMD64_SMCA_MCx_CONFIG(m->bank);
+       u32 low, high;
+
+       /*
+        * We need to look at the following bits:
+        * - "succor" feature bit (data poisoning support), and
+        * - TCC bit (Task Context Corrupt) in MCi_STATUS
+        * to determine error severity.
+        */
+       if (!mce_flags.succor)
+               return MCE_PANIC_SEVERITY;
+
+       if (rdmsr_safe(addr, &low, &high))
+               return MCE_PANIC_SEVERITY;
+
+       /* TCC (Task context corrupt). If set and if IN_KERNEL, panic. */
+       if ((low & MCI_CONFIG_MCAX) &&
+           (m->status & MCI_STATUS_TCC) &&
+           (err_ctx == IN_KERNEL))
+               return MCE_PANIC_SEVERITY;
+
+        /* ...otherwise invoke hwpoison handler. */
+       return MCE_AR_SEVERITY;
+}
+
+/*
+ * See AMD Error Scope Hierarchy table in a newer BKDG. For example
+ * 49125_15h_Models_30h-3Fh_BKDG.pdf, section "RAS Features"
+ */
+static int mce_severity_amd(struct mce *m, int tolerant, char **msg, bool is_excp)
+{
+       enum context ctx = error_context(m);
+
+       /* Processor Context Corrupt, no need to fumble too much, die! */
+       if (m->status & MCI_STATUS_PCC)
+               return MCE_PANIC_SEVERITY;
+
+       if (m->status & MCI_STATUS_UC) {
+
+               if (ctx == IN_KERNEL)
+                       return MCE_PANIC_SEVERITY;
+
+               /*
+                * On older systems where the overflow_recov flag is not present,
+                * we should simply panic if an error overflow occurs. If the
+                * overflow_recov flag is present and set, then software can try
+                * to at least kill the affected process to prolong system operation.
+                */
+               if (mce_flags.overflow_recov) {
+                       if (mce_flags.smca)
+                               return mce_severity_amd_smca(m, ctx);
+
+                       /* kill current process */
+                       return MCE_AR_SEVERITY;
+               } else {
+                       /* at least one error was not logged */
+                       if (m->status & MCI_STATUS_OVER)
+                               return MCE_PANIC_SEVERITY;
+               }
+
+               /*
+                * For any other case, return MCE_UC_SEVERITY so that we log the
+                * error and exit #MC handler.
+                */
+               return MCE_UC_SEVERITY;
+       }
+
+       /*
+        * deferred error: poll handler catches these and adds to mce_ring so
+        * memory-failure can take recovery actions.
+        */
+       if (m->status & MCI_STATUS_DEFERRED)
+               return MCE_DEFERRED_SEVERITY;
+
+       /*
+        * corrected error: poll handler catches these and passes responsibility
+        * of decoding the error to EDAC
+        */
+       return MCE_KEEP_SEVERITY;
+}
+
+static int mce_severity_intel(struct mce *m, int tolerant, char **msg, bool is_excp)
+{
+       enum exception excp = (is_excp ? EXCP_CONTEXT : NO_EXCP);
+       enum context ctx = error_context(m);
+       struct severity *s;
+
+       for (s = severities;; s++) {
+               if ((m->status & s->mask) != s->result)
+                       continue;
+               if ((m->mcgstatus & s->mcgmask) != s->mcgres)
+                       continue;
+               if (s->ser == SER_REQUIRED && !mca_cfg.ser)
+                       continue;
+               if (s->ser == NO_SER && mca_cfg.ser)
+                       continue;
+               if (s->context && ctx != s->context)
+                       continue;
+               if (s->excp && excp != s->excp)
+                       continue;
+               if (msg)
+                       *msg = s->msg;
+               s->covered = 1;
+               if (s->sev >= MCE_UC_SEVERITY && ctx == IN_KERNEL) {
+                       if (tolerant < 1)
+                               return MCE_PANIC_SEVERITY;
+               }
+               return s->sev;
+       }
+}
+
+/* Default to mce_severity_intel */
+int (*mce_severity)(struct mce *m, int tolerant, char **msg, bool is_excp) =
+                   mce_severity_intel;
+
+void __init mcheck_vendor_init_severity(void)
+{
+       if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD ||
+           boot_cpu_data.x86_vendor == X86_VENDOR_HYGON)
+               mce_severity = mce_severity_amd;
+}
+
+#ifdef CONFIG_DEBUG_FS
+static void *s_start(struct seq_file *f, loff_t *pos)
+{
+       if (*pos >= ARRAY_SIZE(severities))
+               return NULL;
+       return &severities[*pos];
+}
+
+static void *s_next(struct seq_file *f, void *data, loff_t *pos)
+{
+       if (++(*pos) >= ARRAY_SIZE(severities))
+               return NULL;
+       return &severities[*pos];
+}
+
+static void s_stop(struct seq_file *f, void *data)
+{
+}
+
+static int s_show(struct seq_file *f, void *data)
+{
+       struct severity *ser = data;
+       seq_printf(f, "%d\t%s\n", ser->covered, ser->msg);
+       return 0;
+}
+
+static const struct seq_operations severities_seq_ops = {
+       .start  = s_start,
+       .next   = s_next,
+       .stop   = s_stop,
+       .show   = s_show,
+};
+
+static int severities_coverage_open(struct inode *inode, struct file *file)
+{
+       return seq_open(file, &severities_seq_ops);
+}
+
+static ssize_t severities_coverage_write(struct file *file,
+                                        const char __user *ubuf,
+                                        size_t count, loff_t *ppos)
+{
+       int i;
+       for (i = 0; i < ARRAY_SIZE(severities); i++)
+               severities[i].covered = 0;
+       return count;
+}
+
+static const struct file_operations severities_coverage_fops = {
+       .open           = severities_coverage_open,
+       .release        = seq_release,
+       .read           = seq_read,
+       .write          = severities_coverage_write,
+       .llseek         = seq_lseek,
+};
+
+static int __init severities_debugfs_init(void)
+{
+       struct dentry *dmce, *fsev;
+
+       dmce = mce_get_debugfs_dir();
+       if (!dmce)
+               goto err_out;
+
+       fsev = debugfs_create_file("severities-coverage", 0444, dmce, NULL,
+                                  &severities_coverage_fops);
+       if (!fsev)
+               goto err_out;
+
+       return 0;
+
+err_out:
+       return -ENOMEM;
+}
+late_initcall(severities_debugfs_init);
+#endif /* CONFIG_DEBUG_FS */
diff --git a/arch/x86/kernel/cpu/mce/therm_throt.c b/arch/x86/kernel/cpu/mce/therm_throt.c
new file mode 100644 (file)
index 0000000..2da67b7
--- /dev/null
@@ -0,0 +1,520 @@
+/*
+ * Thermal throttle event support code (such as syslog messaging and rate
+ * limiting) that was factored out from x86_64 (mce_intel.c) and i386 (p4.c).
+ *
+ * This allows consistent reporting of CPU thermal throttle events.
+ *
+ * Maintains a counter in /sys that keeps track of the number of thermal
+ * events, such that the user knows how bad the thermal problem might be
+ * (since the logging to syslog is rate limited).
+ *
+ * Author: Dmitriy Zavin (dmitriyz@google.com)
+ *
+ * Credits: Adapted from Zwane Mwaikambo's original code in mce_intel.c.
+ *          Inspired by Ross Biro's and Al Borchers' counter code.
+ */
+#include <linux/interrupt.h>
+#include <linux/notifier.h>
+#include <linux/jiffies.h>
+#include <linux/kernel.h>
+#include <linux/percpu.h>
+#include <linux/export.h>
+#include <linux/types.h>
+#include <linux/init.h>
+#include <linux/smp.h>
+#include <linux/cpu.h>
+
+#include <asm/processor.h>
+#include <asm/apic.h>
+#include <asm/mce.h>
+#include <asm/msr.h>
+#include <asm/trace/irq_vectors.h>
+
+/* How long to wait between reporting thermal events */
+#define CHECK_INTERVAL         (300 * HZ)
+
+#define THERMAL_THROTTLING_EVENT       0
+#define POWER_LIMIT_EVENT              1
+
+/*
+ * Current thermal event state:
+ */
+struct _thermal_state {
+       bool                    new_event;
+       int                     event;
+       u64                     next_check;
+       unsigned long           count;
+       unsigned long           last_count;
+};
+
+struct thermal_state {
+       struct _thermal_state core_throttle;
+       struct _thermal_state core_power_limit;
+       struct _thermal_state package_throttle;
+       struct _thermal_state package_power_limit;
+       struct _thermal_state core_thresh0;
+       struct _thermal_state core_thresh1;
+       struct _thermal_state pkg_thresh0;
+       struct _thermal_state pkg_thresh1;
+};
+
+/* Callback to handle core threshold interrupts */
+int (*platform_thermal_notify)(__u64 msr_val);
+EXPORT_SYMBOL(platform_thermal_notify);
+
+/* Callback to handle package threshold interrupts */
+int (*platform_thermal_package_notify)(__u64 msr_val);
+EXPORT_SYMBOL_GPL(platform_thermal_package_notify);
+
+/* Callback support for rate control: return true if the
+ * callback implements its own rate control */
+bool (*platform_thermal_package_rate_control)(void);
+EXPORT_SYMBOL_GPL(platform_thermal_package_rate_control);
+
+
+static DEFINE_PER_CPU(struct thermal_state, thermal_state);
+
+static atomic_t therm_throt_en = ATOMIC_INIT(0);
+
+static u32 lvtthmr_init __read_mostly;
+
+#ifdef CONFIG_SYSFS
+#define define_therm_throt_device_one_ro(_name)                                \
+       static DEVICE_ATTR(_name, 0444,                                 \
+                          therm_throt_device_show_##_name,             \
+                                  NULL)                                \
+
+#define define_therm_throt_device_show_func(event, name)               \
+                                                                       \
+static ssize_t therm_throt_device_show_##event##_##name(               \
+                       struct device *dev,                             \
+                       struct device_attribute *attr,                  \
+                       char *buf)                                      \
+{                                                                      \
+       unsigned int cpu = dev->id;                                     \
+       ssize_t ret;                                                    \
+                                                                       \
+       preempt_disable();      /* CPU hotplug */                       \
+       if (cpu_online(cpu)) {                                          \
+               ret = sprintf(buf, "%lu\n",                             \
+                             per_cpu(thermal_state, cpu).event.name);  \
+       } else                                                          \
+               ret = 0;                                                \
+       preempt_enable();                                               \
+                                                                       \
+       return ret;                                                     \
+}
+
+define_therm_throt_device_show_func(core_throttle, count);
+define_therm_throt_device_one_ro(core_throttle_count);
+
+define_therm_throt_device_show_func(core_power_limit, count);
+define_therm_throt_device_one_ro(core_power_limit_count);
+
+define_therm_throt_device_show_func(package_throttle, count);
+define_therm_throt_device_one_ro(package_throttle_count);
+
+define_therm_throt_device_show_func(package_power_limit, count);
+define_therm_throt_device_one_ro(package_power_limit_count);
+
+static struct attribute *thermal_throttle_attrs[] = {
+       &dev_attr_core_throttle_count.attr,
+       NULL
+};
+
+static const struct attribute_group thermal_attr_group = {
+       .attrs  = thermal_throttle_attrs,
+       .name   = "thermal_throttle"
+};
+#endif /* CONFIG_SYSFS */
+
+#define CORE_LEVEL     0
+#define PACKAGE_LEVEL  1
+
+/***
+ * therm_throt_process - Process thermal throttling event from interrupt
+ * @new_event: Whether the event condition is asserted (boolean), since the
+ *             thermal interrupt normally gets called both when the thermal
+ *             event begins and once the event has ended.
+ *
+ * This function is called by the thermal interrupt after the
+ * IRQ has been acknowledged.
+ *
+ * It will take care of rate limiting and printing messages to the syslog.
+ */
+static void therm_throt_process(bool new_event, int event, int level)
+{
+       struct _thermal_state *state;
+       unsigned int this_cpu = smp_processor_id();
+       bool old_event;
+       u64 now;
+       struct thermal_state *pstate = &per_cpu(thermal_state, this_cpu);
+
+       now = get_jiffies_64();
+       if (level == CORE_LEVEL) {
+               if (event == THERMAL_THROTTLING_EVENT)
+                       state = &pstate->core_throttle;
+               else if (event == POWER_LIMIT_EVENT)
+                       state = &pstate->core_power_limit;
+               else
+                       return;
+       } else if (level == PACKAGE_LEVEL) {
+               if (event == THERMAL_THROTTLING_EVENT)
+                       state = &pstate->package_throttle;
+               else if (event == POWER_LIMIT_EVENT)
+                       state = &pstate->package_power_limit;
+               else
+                       return;
+       } else
+               return;
+
+       old_event = state->new_event;
+       state->new_event = new_event;
+
+       if (new_event)
+               state->count++;
+
+       if (time_before64(now, state->next_check) &&
+                       state->count != state->last_count)
+               return;
+
+       state->next_check = now + CHECK_INTERVAL;
+       state->last_count = state->count;
+
+       /* if we just entered the thermal event */
+       if (new_event) {
+               if (event == THERMAL_THROTTLING_EVENT)
+                       pr_crit("CPU%d: %s temperature above threshold, cpu clock throttled (total events = %lu)\n",
+                               this_cpu,
+                               level == CORE_LEVEL ? "Core" : "Package",
+                               state->count);
+               return;
+       }
+       if (old_event) {
+               if (event == THERMAL_THROTTLING_EVENT)
+                       pr_info("CPU%d: %s temperature/speed normal\n", this_cpu,
+                               level == CORE_LEVEL ? "Core" : "Package");
+               return;
+       }
+}
+
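+/*
+ * Rate-limit threshold notifications: allow at most one event per
+ * CHECK_INTERVAL for each per-CPU threshold state.
+ */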
+static int thresh_event_valid(int level, int event)
+{
+       struct _thermal_state *state;
+       unsigned int this_cpu = smp_processor_id();
+       struct thermal_state *pstate = &per_cpu(thermal_state, this_cpu);
+       u64 now = get_jiffies_64();
+
+       if (level == PACKAGE_LEVEL)
+               state = (event == 0) ? &pstate->pkg_thresh0 :
+                                               &pstate->pkg_thresh1;
+       else
+               state = (event == 0) ? &pstate->core_thresh0 :
+                                               &pstate->core_thresh1;
+
+       if (time_before64(now, state->next_check))
+               return 0;
+
+       state->next_check = now + CHECK_INTERVAL;
+
+       return 1;
+}
+
+static bool int_pln_enable;
+static int __init int_pln_enable_setup(char *s)
+{
+       int_pln_enable = true;
+
+       return 1;
+}
+__setup("int_pln_enable", int_pln_enable_setup);
+
+#ifdef CONFIG_SYSFS
+/* Add/Remove thermal_throttle interface for CPU device: */
+static int thermal_throttle_add_dev(struct device *dev, unsigned int cpu)
+{
+       int err;
+       struct cpuinfo_x86 *c = &cpu_data(cpu);
+
+       err = sysfs_create_group(&dev->kobj, &thermal_attr_group);
+       if (err)
+               return err;
+
+       if (cpu_has(c, X86_FEATURE_PLN) && int_pln_enable)
+               err = sysfs_add_file_to_group(&dev->kobj,
+                                             &dev_attr_core_power_limit_count.attr,
+                                             thermal_attr_group.name);
+       if (cpu_has(c, X86_FEATURE_PTS)) {
+               err = sysfs_add_file_to_group(&dev->kobj,
+                                             &dev_attr_package_throttle_count.attr,
+                                             thermal_attr_group.name);
+               if (cpu_has(c, X86_FEATURE_PLN) && int_pln_enable)
+                       err = sysfs_add_file_to_group(&dev->kobj,
+                                       &dev_attr_package_power_limit_count.attr,
+                                       thermal_attr_group.name);
+       }
+
+       return err;
+}
+
+static void thermal_throttle_remove_dev(struct device *dev)
+{
+       sysfs_remove_group(&dev->kobj, &thermal_attr_group);
+}
+
+/* Get notified when a cpu comes on/off. Be hotplug friendly. */
+static int thermal_throttle_online(unsigned int cpu)
+{
+       struct device *dev = get_cpu_device(cpu);
+
+       return thermal_throttle_add_dev(dev, cpu);
+}
+
+static int thermal_throttle_offline(unsigned int cpu)
+{
+       struct device *dev = get_cpu_device(cpu);
+
+       thermal_throttle_remove_dev(dev);
+       return 0;
+}
+
+static __init int thermal_throttle_init_device(void)
+{
+       int ret;
+
+       if (!atomic_read(&therm_throt_en))
+               return 0;
+
+       ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "x86/therm:online",
+                               thermal_throttle_online,
+                               thermal_throttle_offline);
+       return ret < 0 ? ret : 0;
+}
+device_initcall(thermal_throttle_init_device);
+
+#endif /* CONFIG_SYSFS */
+
+static void notify_package_thresholds(__u64 msr_val)
+{
+       bool notify_thres_0 = false;
+       bool notify_thres_1 = false;
+
+       if (!platform_thermal_package_notify)
+               return;
+
+       /* lower threshold check */
+       if (msr_val & THERM_LOG_THRESHOLD0)
+               notify_thres_0 = true;
+       /* higher threshold check */
+       if (msr_val & THERM_LOG_THRESHOLD1)
+               notify_thres_1 = true;
+
+       if (!notify_thres_0 && !notify_thres_1)
+               return;
+
+       if (platform_thermal_package_rate_control &&
+               platform_thermal_package_rate_control()) {
+               /* Rate control is implemented in callback */
+               platform_thermal_package_notify(msr_val);
+               return;
+       }
+
+       /* lower threshold reached */
+       if (notify_thres_0 && thresh_event_valid(PACKAGE_LEVEL, 0))
+               platform_thermal_package_notify(msr_val);
+       /* higher threshold reached */
+       if (notify_thres_1 && thresh_event_valid(PACKAGE_LEVEL, 1))
+               platform_thermal_package_notify(msr_val);
+}
+
+static void notify_thresholds(__u64 msr_val)
+{
+       /* check whether the interrupt handler is defined;
+        * otherwise simply return
+        */
+       if (!platform_thermal_notify)
+               return;
+
+       /* lower threshold reached */
+       if ((msr_val & THERM_LOG_THRESHOLD0) &&
+                       thresh_event_valid(CORE_LEVEL, 0))
+               platform_thermal_notify(msr_val);
+       /* higher threshold reached */
+       if ((msr_val & THERM_LOG_THRESHOLD1) &&
+                       thresh_event_valid(CORE_LEVEL, 1))
+               platform_thermal_notify(msr_val);
+}
+
+/* Thermal transition interrupt handler */
+static void intel_thermal_interrupt(void)
+{
+       __u64 msr_val;
+
+       if (static_cpu_has(X86_FEATURE_HWP))
+               wrmsrl_safe(MSR_HWP_STATUS, 0);
+
+       rdmsrl(MSR_IA32_THERM_STATUS, msr_val);
+
+       /* Check for violation of core thermal thresholds */
+       notify_thresholds(msr_val);
+
+       therm_throt_process(msr_val & THERM_STATUS_PROCHOT,
+                           THERMAL_THROTTLING_EVENT,
+                           CORE_LEVEL);
+
+       if (this_cpu_has(X86_FEATURE_PLN) && int_pln_enable)
+               therm_throt_process(msr_val & THERM_STATUS_POWER_LIMIT,
+                                       POWER_LIMIT_EVENT,
+                                       CORE_LEVEL);
+
+       if (this_cpu_has(X86_FEATURE_PTS)) {
+               rdmsrl(MSR_IA32_PACKAGE_THERM_STATUS, msr_val);
+               /* check violations of package thermal thresholds */
+               notify_package_thresholds(msr_val);
+               therm_throt_process(msr_val & PACKAGE_THERM_STATUS_PROCHOT,
+                                       THERMAL_THROTTLING_EVENT,
+                                       PACKAGE_LEVEL);
+               if (this_cpu_has(X86_FEATURE_PLN) && int_pln_enable)
+                       therm_throt_process(msr_val &
+                                       PACKAGE_THERM_STATUS_POWER_LIMIT,
+                                       POWER_LIMIT_EVENT,
+                                       PACKAGE_LEVEL);
+       }
+}
+
+static void unexpected_thermal_interrupt(void)
+{
+       pr_err("CPU%d: Unexpected LVT thermal interrupt!\n",
+               smp_processor_id());
+}
+
+static void (*smp_thermal_vector)(void) = unexpected_thermal_interrupt;
+
+asmlinkage __visible void __irq_entry smp_thermal_interrupt(struct pt_regs *r)
+{
+       entering_irq();
+       trace_thermal_apic_entry(THERMAL_APIC_VECTOR);
+       inc_irq_stat(irq_thermal_count);
+       smp_thermal_vector();
+       trace_thermal_apic_exit(THERMAL_APIC_VECTOR);
+       exiting_ack_irq();
+}
+
+/* Thermal monitoring depends on APIC, ACPI and clock modulation */
+static int intel_thermal_supported(struct cpuinfo_x86 *c)
+{
+       if (!boot_cpu_has(X86_FEATURE_APIC))
+               return 0;
+       if (!cpu_has(c, X86_FEATURE_ACPI) || !cpu_has(c, X86_FEATURE_ACC))
+               return 0;
+       return 1;
+}
+
+void __init mcheck_intel_therm_init(void)
+{
+       /*
+        * This function is only called on the boot CPU. Save the initial
+        * thermal LVT value on the BSP and use it later to restore the APs'
+        * thermal LVT entry that the BIOS programmed.
+        */
+       if (intel_thermal_supported(&boot_cpu_data))
+               lvtthmr_init = apic_read(APIC_LVTTHMR);
+}
+
+void intel_init_thermal(struct cpuinfo_x86 *c)
+{
+       unsigned int cpu = smp_processor_id();
+       int tm2 = 0;
+       u32 l, h;
+
+       if (!intel_thermal_supported(c))
+               return;
+
+       /*
+        * First check if it's enabled already, in which case there might
+        * be some SMM goo which handles it, so we can't even put a handler
+        * since it might be delivered via SMI already:
+        */
+       rdmsr(MSR_IA32_MISC_ENABLE, l, h);
+
+       h = lvtthmr_init;
+       /*
+        * The initial value of thermal LVT entries on all APs always reads
+        * 0x10000 because APs are woken up by BSP issuing INIT-SIPI-SIPI
+        * sequence to them and LVT registers are reset to 0s except for
+        * the mask bits which are set to 1s when APs receive INIT IPI.
+        * If the BIOS takes over the thermal interrupt and sets its
+        * delivery mode to SMI (not fixed), this code restores the value
+        * the BIOS programmed on the AP based on the BSP's saved value,
+        * since the BIOS always sets the same value for all threads/cores.
+        */
+       if ((h & APIC_DM_FIXED_MASK) != APIC_DM_FIXED)
+               apic_write(APIC_LVTTHMR, lvtthmr_init);
+
+
+       if ((l & MSR_IA32_MISC_ENABLE_TM1) && (h & APIC_DM_SMI)) {
+               if (system_state == SYSTEM_BOOTING)
+                       pr_debug("CPU%d: Thermal monitoring handled by SMI\n", cpu);
+               return;
+       }
+
+       /* Early Pentium M models use a different method for enabling TM2 */
+       if (cpu_has(c, X86_FEATURE_TM2)) {
+               if (c->x86 == 6 && (c->x86_model == 9 || c->x86_model == 13)) {
+                       rdmsr(MSR_THERM2_CTL, l, h);
+                       if (l & MSR_THERM2_CTL_TM_SELECT)
+                               tm2 = 1;
+               } else if (l & MSR_IA32_MISC_ENABLE_TM2)
+                       tm2 = 1;
+       }
+
+       /* We'll mask the thermal vector in the lapic till we're ready: */
+       h = THERMAL_APIC_VECTOR | APIC_DM_FIXED | APIC_LVT_MASKED;
+       apic_write(APIC_LVTTHMR, h);
+
+       rdmsr(MSR_IA32_THERM_INTERRUPT, l, h);
+       if (cpu_has(c, X86_FEATURE_PLN) && !int_pln_enable)
+               wrmsr(MSR_IA32_THERM_INTERRUPT,
+                       (l | (THERM_INT_LOW_ENABLE
+                       | THERM_INT_HIGH_ENABLE)) & ~THERM_INT_PLN_ENABLE, h);
+       else if (cpu_has(c, X86_FEATURE_PLN) && int_pln_enable)
+               wrmsr(MSR_IA32_THERM_INTERRUPT,
+                       l | (THERM_INT_LOW_ENABLE
+                       | THERM_INT_HIGH_ENABLE | THERM_INT_PLN_ENABLE), h);
+       else
+               wrmsr(MSR_IA32_THERM_INTERRUPT,
+                     l | (THERM_INT_LOW_ENABLE | THERM_INT_HIGH_ENABLE), h);
+
+       if (cpu_has(c, X86_FEATURE_PTS)) {
+               rdmsr(MSR_IA32_PACKAGE_THERM_INTERRUPT, l, h);
+               if (cpu_has(c, X86_FEATURE_PLN) && !int_pln_enable)
+                       wrmsr(MSR_IA32_PACKAGE_THERM_INTERRUPT,
+                               (l | (PACKAGE_THERM_INT_LOW_ENABLE
+                               | PACKAGE_THERM_INT_HIGH_ENABLE))
+                               & ~PACKAGE_THERM_INT_PLN_ENABLE, h);
+               else if (cpu_has(c, X86_FEATURE_PLN) && int_pln_enable)
+                       wrmsr(MSR_IA32_PACKAGE_THERM_INTERRUPT,
+                               l | (PACKAGE_THERM_INT_LOW_ENABLE
+                               | PACKAGE_THERM_INT_HIGH_ENABLE
+                               | PACKAGE_THERM_INT_PLN_ENABLE), h);
+               else
+                       wrmsr(MSR_IA32_PACKAGE_THERM_INTERRUPT,
+                             l | (PACKAGE_THERM_INT_LOW_ENABLE
+                               | PACKAGE_THERM_INT_HIGH_ENABLE), h);
+       }
+
+       smp_thermal_vector = intel_thermal_interrupt;
+
+       rdmsr(MSR_IA32_MISC_ENABLE, l, h);
+       wrmsr(MSR_IA32_MISC_ENABLE, l | MSR_IA32_MISC_ENABLE_TM1, h);
+
+       /* Unmask the thermal vector: */
+       l = apic_read(APIC_LVTTHMR);
+       apic_write(APIC_LVTTHMR, l & ~APIC_LVT_MASKED);
+
+       pr_info_once("CPU0: Thermal monitoring enabled (%s)\n",
+                     tm2 ? "TM2" : "TM1");
+
+       /* enable thermal throttle processing */
+       atomic_set(&therm_throt_en, 1);
+}
diff --git a/arch/x86/kernel/cpu/mce/threshold.c b/arch/x86/kernel/cpu/mce/threshold.c
new file mode 100644 (file)
index 0000000..2b584b3
--- /dev/null
@@ -0,0 +1,29 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Common corrected MCE threshold handler code:
+ */
+#include <linux/interrupt.h>
+#include <linux/kernel.h>
+
+#include <asm/irq_vectors.h>
+#include <asm/apic.h>
+#include <asm/mce.h>
+#include <asm/trace/irq_vectors.h>
+
+static void default_threshold_interrupt(void)
+{
+       pr_err("Unexpected threshold interrupt at vector %x\n",
+               THRESHOLD_APIC_VECTOR);
+}
+
+void (*mce_threshold_vector)(void) = default_threshold_interrupt;
+
+asmlinkage __visible void __irq_entry smp_threshold_interrupt(void)
+{
+       entering_irq();
+       trace_threshold_apic_entry(THRESHOLD_APIC_VECTOR);
+       inc_irq_stat(irq_threshold_count);
+       mce_threshold_vector();
+       trace_threshold_apic_exit(THRESHOLD_APIC_VECTOR);
+       exiting_ack_irq();
+}
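
threshold.c only provides the dispatch stub: smp_threshold_interrupt() calls through the mce_threshold_vector function pointer, which defaults to the "unexpected interrupt" handler above. A hedged sketch, not part of the patch, of how vendor code would install its own handler (handler and init names are hypothetical; only mce_threshold_vector comes from the file above):

static void my_threshold_interrupt(void)
{
	/* Read and reset the per-bank threshold counters, log events, ... */
}

static void __init my_threshold_init(void)
{
	/* Replace the default "unexpected interrupt" handler. */
	mce_threshold_vector = my_threshold_interrupt;
}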
diff --git a/arch/x86/kernel/cpu/mce/winchip.c b/arch/x86/kernel/cpu/mce/winchip.c
new file mode 100644 (file)
index 0000000..3b45b27
--- /dev/null
@@ -0,0 +1,44 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * IDT Winchip specific Machine Check Exception Reporting
+ * (C) Copyright 2002 Alan Cox <alan@lxorguk.ukuu.org.uk>
+ */
+#include <linux/interrupt.h>
+#include <linux/kernel.h>
+#include <linux/types.h>
+
+#include <asm/processor.h>
+#include <asm/traps.h>
+#include <asm/tlbflush.h>
+#include <asm/mce.h>
+#include <asm/msr.h>
+
+/* Machine check handler for WinChip C6: */
+static void winchip_machine_check(struct pt_regs *regs, long error_code)
+{
+       ist_enter(regs);
+
+       pr_emerg("CPU0: Machine Check Exception.\n");
+       add_taint(TAINT_MACHINE_CHECK, LOCKDEP_NOW_UNRELIABLE);
+
+       ist_exit(regs);
+}
+
+/* Set up machine check reporting on the Winchip C6 series */
+void winchip_mcheck_init(struct cpuinfo_x86 *c)
+{
+       u32 lo, hi;
+
+       machine_check_vector = winchip_machine_check;
+       /* Make sure the vector pointer is visible before we enable MCEs: */
+       wmb();
+
+       rdmsr(MSR_IDT_FCR1, lo, hi);
+       lo |= (1<<2);   /* Enable EIERRINT (int 18 MCE) */
+       lo &= ~(1<<4);  /* Enable MCE */
+       wrmsr(MSR_IDT_FCR1, lo, hi);
+
+       cr4_set_bits(X86_CR4_MCE);
+
+       pr_info("Winchip machine check reporting enabled on CPU#0.\n");
+}
diff --git a/arch/x86/kernel/cpu/mcheck/Makefile b/arch/x86/kernel/cpu/mcheck/Makefile
deleted file mode 100644 (file)
index bcc7c54..0000000
+++ /dev/null
@@ -1,14 +0,0 @@
-# SPDX-License-Identifier: GPL-2.0
-obj-y                          =  mce.o mce-severity.o mce-genpool.o
-
-obj-$(CONFIG_X86_ANCIENT_MCE)  += winchip.o p5.o
-obj-$(CONFIG_X86_MCE_INTEL)    += mce_intel.o
-obj-$(CONFIG_X86_MCE_AMD)      += mce_amd.o
-obj-$(CONFIG_X86_MCE_THRESHOLD) += threshold.o
-obj-$(CONFIG_X86_MCE_INJECT)   += mce-inject.o
-
-obj-$(CONFIG_X86_THERMAL_VECTOR) += therm_throt.o
-
-obj-$(CONFIG_ACPI_APEI)                += mce-apei.o
-
-obj-$(CONFIG_X86_MCELOG_LEGACY)        += dev-mcelog.o
diff --git a/arch/x86/kernel/cpu/mcheck/dev-mcelog.c b/arch/x86/kernel/cpu/mcheck/dev-mcelog.c
deleted file mode 100644 (file)
index 27f394a..0000000
+++ /dev/null
@@ -1,360 +0,0 @@
-/*
- * /dev/mcelog driver
- *
- * K8 parts Copyright 2002,2003 Andi Kleen, SuSE Labs.
- * Rest from unknown author(s).
- * 2004 Andi Kleen. Rewrote most of it.
- * Copyright 2008 Intel Corporation
- * Author: Andi Kleen
- */
-
-#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
-
-#include <linux/miscdevice.h>
-#include <linux/slab.h>
-#include <linux/kmod.h>
-#include <linux/poll.h>
-
-#include "mce-internal.h"
-
-static BLOCKING_NOTIFIER_HEAD(mce_injector_chain);
-
-static DEFINE_MUTEX(mce_chrdev_read_mutex);
-
-static char mce_helper[128];
-static char *mce_helper_argv[2] = { mce_helper, NULL };
-
-/*
- * Lockless MCE logging infrastructure.
- * This avoids deadlocks on printk locks without having to break them. It also
- * keeps MCEs separate from kernel messages to avoid bogus bug reports.
- */
-
-static struct mce_log_buffer mcelog = {
-       .signature      = MCE_LOG_SIGNATURE,
-       .len            = MCE_LOG_LEN,
-       .recordlen      = sizeof(struct mce),
-};
-
-static DECLARE_WAIT_QUEUE_HEAD(mce_chrdev_wait);
-
-static int dev_mce_log(struct notifier_block *nb, unsigned long val,
-                               void *data)
-{
-       struct mce *mce = (struct mce *)data;
-       unsigned int entry;
-
-       mutex_lock(&mce_chrdev_read_mutex);
-
-       entry = mcelog.next;
-
-       /*
-        * When the buffer fills up, discard new entries. Assume that the
-        * earlier errors are the more interesting ones:
-        */
-       if (entry >= MCE_LOG_LEN) {
-               set_bit(MCE_OVERFLOW, (unsigned long *)&mcelog.flags);
-               goto unlock;
-       }
-
-       mcelog.next = entry + 1;
-
-       memcpy(mcelog.entry + entry, mce, sizeof(struct mce));
-       mcelog.entry[entry].finished = 1;
-
-       /* wake processes polling /dev/mcelog */
-       wake_up_interruptible(&mce_chrdev_wait);
-
-unlock:
-       mutex_unlock(&mce_chrdev_read_mutex);
-
-       return NOTIFY_OK;
-}
-
-static struct notifier_block dev_mcelog_nb = {
-       .notifier_call  = dev_mce_log,
-       .priority       = MCE_PRIO_MCELOG,
-};
-
-static void mce_do_trigger(struct work_struct *work)
-{
-       call_usermodehelper(mce_helper, mce_helper_argv, NULL, UMH_NO_WAIT);
-}
-
-static DECLARE_WORK(mce_trigger_work, mce_do_trigger);
-
-
-void mce_work_trigger(void)
-{
-       if (mce_helper[0])
-               schedule_work(&mce_trigger_work);
-}
-
-static ssize_t
-show_trigger(struct device *s, struct device_attribute *attr, char *buf)
-{
-       strcpy(buf, mce_helper);
-       strcat(buf, "\n");
-       return strlen(mce_helper) + 1;
-}
-
-static ssize_t set_trigger(struct device *s, struct device_attribute *attr,
-                               const char *buf, size_t siz)
-{
-       char *p;
-
-       strncpy(mce_helper, buf, sizeof(mce_helper));
-       mce_helper[sizeof(mce_helper)-1] = 0;
-       p = strchr(mce_helper, '\n');
-
-       if (p)
-               *p = 0;
-
-       return strlen(mce_helper) + !!p;
-}
-
-DEVICE_ATTR(trigger, 0644, show_trigger, set_trigger);
-
-/*
- * mce_chrdev: Character device /dev/mcelog to read and clear the MCE log.
- */
-
-static DEFINE_SPINLOCK(mce_chrdev_state_lock);
-static int mce_chrdev_open_count;      /* #times opened */
-static int mce_chrdev_open_exclu;      /* already open exclusive? */
-
-static int mce_chrdev_open(struct inode *inode, struct file *file)
-{
-       spin_lock(&mce_chrdev_state_lock);
-
-       if (mce_chrdev_open_exclu ||
-           (mce_chrdev_open_count && (file->f_flags & O_EXCL))) {
-               spin_unlock(&mce_chrdev_state_lock);
-
-               return -EBUSY;
-       }
-
-       if (file->f_flags & O_EXCL)
-               mce_chrdev_open_exclu = 1;
-       mce_chrdev_open_count++;
-
-       spin_unlock(&mce_chrdev_state_lock);
-
-       return nonseekable_open(inode, file);
-}
-
-static int mce_chrdev_release(struct inode *inode, struct file *file)
-{
-       spin_lock(&mce_chrdev_state_lock);
-
-       mce_chrdev_open_count--;
-       mce_chrdev_open_exclu = 0;
-
-       spin_unlock(&mce_chrdev_state_lock);
-
-       return 0;
-}
-
-static int mce_apei_read_done;
-
-/* Collect MCE record of previous boot in persistent storage via APEI ERST. */
-static int __mce_read_apei(char __user **ubuf, size_t usize)
-{
-       int rc;
-       u64 record_id;
-       struct mce m;
-
-       if (usize < sizeof(struct mce))
-               return -EINVAL;
-
-       rc = apei_read_mce(&m, &record_id);
-       /* Error or no more MCE record */
-       if (rc <= 0) {
-               mce_apei_read_done = 1;
-               /*
-                * When ERST is disabled, mce_chrdev_read() should return
-                * "no record" instead of "no device."
-                */
-               if (rc == -ENODEV)
-                       return 0;
-               return rc;
-       }
-       rc = -EFAULT;
-       if (copy_to_user(*ubuf, &m, sizeof(struct mce)))
-               return rc;
-       /*
-        * Ideally the record would be cleared only after it has been
-        * flushed to disk or sent over the network by /sbin/mcelog,
-        * but there is no interface to support that yet,
-        * so just clear it to avoid duplication.
-        */
-       rc = apei_clear_mce(record_id);
-       if (rc) {
-               mce_apei_read_done = 1;
-               return rc;
-       }
-       *ubuf += sizeof(struct mce);
-
-       return 0;
-}
-
-static ssize_t mce_chrdev_read(struct file *filp, char __user *ubuf,
-                               size_t usize, loff_t *off)
-{
-       char __user *buf = ubuf;
-       unsigned next;
-       int i, err;
-
-       mutex_lock(&mce_chrdev_read_mutex);
-
-       if (!mce_apei_read_done) {
-               err = __mce_read_apei(&buf, usize);
-               if (err || buf != ubuf)
-                       goto out;
-       }
-
-       /* Only supports full reads right now */
-       err = -EINVAL;
-       if (*off != 0 || usize < MCE_LOG_LEN*sizeof(struct mce))
-               goto out;
-
-       next = mcelog.next;
-       err = 0;
-
-       for (i = 0; i < next; i++) {
-               struct mce *m = &mcelog.entry[i];
-
-               err |= copy_to_user(buf, m, sizeof(*m));
-               buf += sizeof(*m);
-       }
-
-       memset(mcelog.entry, 0, next * sizeof(struct mce));
-       mcelog.next = 0;
-
-       if (err)
-               err = -EFAULT;
-
-out:
-       mutex_unlock(&mce_chrdev_read_mutex);
-
-       return err ? err : buf - ubuf;
-}
-
-static __poll_t mce_chrdev_poll(struct file *file, poll_table *wait)
-{
-       poll_wait(file, &mce_chrdev_wait, wait);
-       if (READ_ONCE(mcelog.next))
-               return EPOLLIN | EPOLLRDNORM;
-       if (!mce_apei_read_done && apei_check_mce())
-               return EPOLLIN | EPOLLRDNORM;
-       return 0;
-}
-
-static long mce_chrdev_ioctl(struct file *f, unsigned int cmd,
-                               unsigned long arg)
-{
-       int __user *p = (int __user *)arg;
-
-       if (!capable(CAP_SYS_ADMIN))
-               return -EPERM;
-
-       switch (cmd) {
-       case MCE_GET_RECORD_LEN:
-               return put_user(sizeof(struct mce), p);
-       case MCE_GET_LOG_LEN:
-               return put_user(MCE_LOG_LEN, p);
-       case MCE_GETCLEAR_FLAGS: {
-               unsigned flags;
-
-               do {
-                       flags = mcelog.flags;
-               } while (cmpxchg(&mcelog.flags, flags, 0) != flags);
-
-               return put_user(flags, p);
-       }
-       default:
-               return -ENOTTY;
-       }
-}
-
-void mce_register_injector_chain(struct notifier_block *nb)
-{
-       blocking_notifier_chain_register(&mce_injector_chain, nb);
-}
-EXPORT_SYMBOL_GPL(mce_register_injector_chain);
-
-void mce_unregister_injector_chain(struct notifier_block *nb)
-{
-       blocking_notifier_chain_unregister(&mce_injector_chain, nb);
-}
-EXPORT_SYMBOL_GPL(mce_unregister_injector_chain);
-
-static ssize_t mce_chrdev_write(struct file *filp, const char __user *ubuf,
-                               size_t usize, loff_t *off)
-{
-       struct mce m;
-
-       if (!capable(CAP_SYS_ADMIN))
-               return -EPERM;
-       /*
-        * There are some cases where real MSR reads could slip
-        * through.
-        */
-       if (!boot_cpu_has(X86_FEATURE_MCE) || !boot_cpu_has(X86_FEATURE_MCA))
-               return -EIO;
-
-       if ((unsigned long)usize > sizeof(struct mce))
-               usize = sizeof(struct mce);
-       if (copy_from_user(&m, ubuf, usize))
-               return -EFAULT;
-
-       if (m.extcpu >= num_possible_cpus() || !cpu_online(m.extcpu))
-               return -EINVAL;
-
-       /*
-        * Need to give user space some time to set everything up,
-        * so do it a jiffie or two later everywhere.
-        */
-       schedule_timeout(2);
-
-       blocking_notifier_call_chain(&mce_injector_chain, 0, &m);
-
-       return usize;
-}
-
-static const struct file_operations mce_chrdev_ops = {
-       .open                   = mce_chrdev_open,
-       .release                = mce_chrdev_release,
-       .read                   = mce_chrdev_read,
-       .write                  = mce_chrdev_write,
-       .poll                   = mce_chrdev_poll,
-       .unlocked_ioctl         = mce_chrdev_ioctl,
-       .llseek                 = no_llseek,
-};
-
-static struct miscdevice mce_chrdev_device = {
-       MISC_MCELOG_MINOR,
-       "mcelog",
-       &mce_chrdev_ops,
-};
-
-static __init int dev_mcelog_init_device(void)
-{
-       int err;
-
-       /* register character device /dev/mcelog */
-       err = misc_register(&mce_chrdev_device);
-       if (err) {
-               if (err == -EBUSY)
-                       /* Xen dom0 might have registered the device already. */
-                       pr_info("Unable to init device /dev/mcelog, already registered");
-               else
-                       pr_err("Unable to init device /dev/mcelog (rc: %d)\n", err);
-
-               return err;
-       }
-
-       mce_register_decode_chain(&dev_mcelog_nb);
-       return 0;
-}
-device_initcall_sync(dev_mcelog_init_device);
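
The read path above accepts only full-buffer reads (anything smaller than MCE_LOG_LEN * sizeof(struct mce) is rejected with -EINVAL), and the record and log sizes are exported through the MCE_GET_RECORD_LEN and MCE_GET_LOG_LEN ioctls. A hedged userspace sketch of a /dev/mcelog consumer sized accordingly; the uapi header path and the minimal error handling are illustrative only:

/* Illustrative only: drain the legacy /dev/mcelog buffer in one read. */
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <asm/mce.h>	/* MCE_GET_RECORD_LEN, MCE_GET_LOG_LEN (assumed path) */

int main(void)
{
	int fd = open("/dev/mcelog", O_RDONLY);
	int rec_len, log_len;
	char *buf;
	ssize_t n;

	if (fd < 0 || ioctl(fd, MCE_GET_RECORD_LEN, &rec_len) ||
	    ioctl(fd, MCE_GET_LOG_LEN, &log_len))
		return 1;

	/* The driver rejects partial reads, so allocate the full log. */
	buf = malloc((size_t)rec_len * log_len);
	if (!buf)
		return 1;

	n = read(fd, buf, (size_t)rec_len * log_len);
	printf("read %zd bytes (%zd records)\n", n, n > 0 ? n / rec_len : 0);

	free(buf);
	close(fd);
	return 0;
}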
diff --git a/arch/x86/kernel/cpu/mcheck/mce-apei.c b/arch/x86/kernel/cpu/mcheck/mce-apei.c
deleted file mode 100644 (file)
index 2eee853..0000000
+++ /dev/null
@@ -1,157 +0,0 @@
-/*
- * Bridge between MCE and APEI
- *
- * On some machines, corrected memory errors are reported via APEI
- * generic hardware error source (GHES) instead of corrected Machine
- * Check. These corrected memory errors can be reported to user space
- * through /dev/mcelog via faking a corrected Machine Check, so that
- * the error memory page can be offlined by /sbin/mcelog if the error
- * count for one page is beyond the threshold.
- *
- * For fatal MCE, save MCE record into persistent storage via ERST, so
- * that the MCE record can be logged after reboot via ERST.
- *
- * Copyright 2010 Intel Corp.
- *   Author: Huang Ying <ying.huang@intel.com>
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License version
- * 2 as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
- */
-
-#include <linux/export.h>
-#include <linux/kernel.h>
-#include <linux/acpi.h>
-#include <linux/cper.h>
-#include <acpi/apei.h>
-#include <acpi/ghes.h>
-#include <asm/mce.h>
-
-#include "mce-internal.h"
-
-void apei_mce_report_mem_error(int severity, struct cper_sec_mem_err *mem_err)
-{
-       struct mce m;
-
-       if (!(mem_err->validation_bits & CPER_MEM_VALID_PA))
-               return;
-
-       mce_setup(&m);
-       m.bank = -1;
-       /* Fake a memory read error with unknown channel */
-       m.status = MCI_STATUS_VAL | MCI_STATUS_EN | MCI_STATUS_ADDRV | 0x9f;
-
-       if (severity >= GHES_SEV_RECOVERABLE)
-               m.status |= MCI_STATUS_UC;
-
-       if (severity >= GHES_SEV_PANIC) {
-               m.status |= MCI_STATUS_PCC;
-               m.tsc = rdtsc();
-       }
-
-       m.addr = mem_err->physical_addr;
-       mce_log(&m);
-}
-EXPORT_SYMBOL_GPL(apei_mce_report_mem_error);
-
-#define CPER_CREATOR_MCE                                               \
-       UUID_LE(0x75a574e3, 0x5052, 0x4b29, 0x8a, 0x8e, 0xbe, 0x2c,     \
-               0x64, 0x90, 0xb8, 0x9d)
-#define CPER_SECTION_TYPE_MCE                                          \
-       UUID_LE(0xfe08ffbe, 0x95e4, 0x4be7, 0xbc, 0x73, 0x40, 0x96,     \
-               0x04, 0x4a, 0x38, 0xfc)
-
-/*
- * CPER specification (in UEFI specification 2.3 appendix N) requires
- * byte-packed.
- */
-struct cper_mce_record {
-       struct cper_record_header hdr;
-       struct cper_section_descriptor sec_hdr;
-       struct mce mce;
-} __packed;
-
-int apei_write_mce(struct mce *m)
-{
-       struct cper_mce_record rcd;
-
-       memset(&rcd, 0, sizeof(rcd));
-       memcpy(rcd.hdr.signature, CPER_SIG_RECORD, CPER_SIG_SIZE);
-       rcd.hdr.revision = CPER_RECORD_REV;
-       rcd.hdr.signature_end = CPER_SIG_END;
-       rcd.hdr.section_count = 1;
-       rcd.hdr.error_severity = CPER_SEV_FATAL;
-       /* timestamp, platform_id, partition_id are all invalid */
-       rcd.hdr.validation_bits = 0;
-       rcd.hdr.record_length = sizeof(rcd);
-       rcd.hdr.creator_id = CPER_CREATOR_MCE;
-       rcd.hdr.notification_type = CPER_NOTIFY_MCE;
-       rcd.hdr.record_id = cper_next_record_id();
-       rcd.hdr.flags = CPER_HW_ERROR_FLAGS_PREVERR;
-
-       rcd.sec_hdr.section_offset = (void *)&rcd.mce - (void *)&rcd;
-       rcd.sec_hdr.section_length = sizeof(rcd.mce);
-       rcd.sec_hdr.revision = CPER_SEC_REV;
-       /* fru_id and fru_text are invalid */
-       rcd.sec_hdr.validation_bits = 0;
-       rcd.sec_hdr.flags = CPER_SEC_PRIMARY;
-       rcd.sec_hdr.section_type = CPER_SECTION_TYPE_MCE;
-       rcd.sec_hdr.section_severity = CPER_SEV_FATAL;
-
-       memcpy(&rcd.mce, m, sizeof(*m));
-
-       return erst_write(&rcd.hdr);
-}
-
-ssize_t apei_read_mce(struct mce *m, u64 *record_id)
-{
-       struct cper_mce_record rcd;
-       int rc, pos;
-
-       rc = erst_get_record_id_begin(&pos);
-       if (rc)
-               return rc;
-retry:
-       rc = erst_get_record_id_next(&pos, record_id);
-       if (rc)
-               goto out;
-       /* no more record */
-       if (*record_id == APEI_ERST_INVALID_RECORD_ID)
-               goto out;
-       rc = erst_read(*record_id, &rcd.hdr, sizeof(rcd));
-       /* someone else has cleared the record, try next one */
-       if (rc == -ENOENT)
-               goto retry;
-       else if (rc < 0)
-               goto out;
-       /* try to skip other type records in storage */
-       else if (rc != sizeof(rcd) ||
-                uuid_le_cmp(rcd.hdr.creator_id, CPER_CREATOR_MCE))
-               goto retry;
-       memcpy(m, &rcd.mce, sizeof(*m));
-       rc = sizeof(*m);
-out:
-       erst_get_record_id_end();
-
-       return rc;
-}
-
-/* Check whether there is record in ERST */
-int apei_check_mce(void)
-{
-       return erst_get_record_count();
-}
-
-int apei_clear_mce(u64 record_id)
-{
-       return erst_clear(record_id);
-}
diff --git a/arch/x86/kernel/cpu/mcheck/mce-genpool.c b/arch/x86/kernel/cpu/mcheck/mce-genpool.c
deleted file mode 100644 (file)
index 217cd44..0000000
+++ /dev/null
@@ -1,145 +0,0 @@
-/*
- * MCE event pool management in MCE context
- *
- * Copyright (C) 2015 Intel Corp.
- * Author: Chen, Gong <gong.chen@linux.intel.com>
- *
- * This file is licensed under GPLv2.
- */
-#include <linux/smp.h>
-#include <linux/mm.h>
-#include <linux/genalloc.h>
-#include <linux/llist.h>
-#include "mce-internal.h"
-
-/*
- * printk() is not safe in MCE context. This is a lock-less memory allocator
- * used to save error information organized in a lock-less list.
- *
- * This memory pool is only to be used to save MCE records in MCE context.
- * MCE events are rare, so a fixed size memory pool should be enough. Use
- * 2 pages to save MCE events for now (~80 MCE records at most).
- */
-#define MCE_POOLSZ     (2 * PAGE_SIZE)
-
-static struct gen_pool *mce_evt_pool;
-static LLIST_HEAD(mce_event_llist);
-static char gen_pool_buf[MCE_POOLSZ];
-
-/*
- * Compare the record "t" with each of the records on list "l" to see if
- * an equivalent one is present in the list.
- */
-static bool is_duplicate_mce_record(struct mce_evt_llist *t, struct mce_evt_llist *l)
-{
-       struct mce_evt_llist *node;
-       struct mce *m1, *m2;
-
-       m1 = &t->mce;
-
-       llist_for_each_entry(node, &l->llnode, llnode) {
-               m2 = &node->mce;
-
-               if (!mce_cmp(m1, m2))
-                       return true;
-       }
-       return false;
-}
-
-/*
- * The system has panicked - we'd like to peruse the list of MCE records
- * that have been queued, but not seen by anyone yet.  The list is in
- * reverse time order, so we need to reverse it. While doing that we can
- * also drop duplicate records (these were logged because some banks are
- * shared between cores or by all threads on a socket).
- */
-struct llist_node *mce_gen_pool_prepare_records(void)
-{
-       struct llist_node *head;
-       LLIST_HEAD(new_head);
-       struct mce_evt_llist *node, *t;
-
-       head = llist_del_all(&mce_event_llist);
-       if (!head)
-               return NULL;
-
-       /* squeeze out duplicates while reversing order */
-       llist_for_each_entry_safe(node, t, head, llnode) {
-               if (!is_duplicate_mce_record(node, t))
-                       llist_add(&node->llnode, &new_head);
-       }
-
-       return new_head.first;
-}
-
-void mce_gen_pool_process(struct work_struct *__unused)
-{
-       struct llist_node *head;
-       struct mce_evt_llist *node, *tmp;
-       struct mce *mce;
-
-       head = llist_del_all(&mce_event_llist);
-       if (!head)
-               return;
-
-       head = llist_reverse_order(head);
-       llist_for_each_entry_safe(node, tmp, head, llnode) {
-               mce = &node->mce;
-               blocking_notifier_call_chain(&x86_mce_decoder_chain, 0, mce);
-               gen_pool_free(mce_evt_pool, (unsigned long)node, sizeof(*node));
-       }
-}
-
-bool mce_gen_pool_empty(void)
-{
-       return llist_empty(&mce_event_llist);
-}
-
-int mce_gen_pool_add(struct mce *mce)
-{
-       struct mce_evt_llist *node;
-
-       if (!mce_evt_pool)
-               return -EINVAL;
-
-       node = (void *)gen_pool_alloc(mce_evt_pool, sizeof(*node));
-       if (!node) {
-               pr_warn_ratelimited("MCE records pool full!\n");
-               return -ENOMEM;
-       }
-
-       memcpy(&node->mce, mce, sizeof(*mce));
-       llist_add(&node->llnode, &mce_event_llist);
-
-       return 0;
-}
-
-static int mce_gen_pool_create(void)
-{
-       struct gen_pool *tmpp;
-       int ret = -ENOMEM;
-
-       tmpp = gen_pool_create(ilog2(sizeof(struct mce_evt_llist)), -1);
-       if (!tmpp)
-               goto out;
-
-       ret = gen_pool_add(tmpp, (unsigned long)gen_pool_buf, MCE_POOLSZ, -1);
-       if (ret) {
-               gen_pool_destroy(tmpp);
-               goto out;
-       }
-
-       mce_evt_pool = tmpp;
-
-out:
-       return ret;
-}
-
-int mce_gen_pool_init(void)
-{
-       /* Just init mce_gen_pool once. */
-       if (mce_evt_pool)
-               return 0;
-
-       return mce_gen_pool_create();
-}
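
The pool above exists because neither allocation nor locking is safe in #MC context: records are bumped into a pre-allocated gen_pool and chained on a lock-less llist, and only drained later from process context. A hedged sketch of that split, not part of the patch, using only the mce_gen_pool_* interfaces above (work item and function names are hypothetical):

/* Illustrative only: queue in #MC context, drain later from a safe context. */
static DECLARE_WORK(example_mce_work, mce_gen_pool_process);

/* #MC/NMI context: the lock-less add is the only safe operation here. */
static void example_queue_record(struct mce *m)
{
	mce_gen_pool_add(m);
}

/* Later, from IRQ or process context (e.g. kicked via irq_work). */
static void example_drain_records(void)
{
	if (!mce_gen_pool_empty())
		schedule_work(&example_mce_work);
}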
diff --git a/arch/x86/kernel/cpu/mcheck/mce-inject.c b/arch/x86/kernel/cpu/mcheck/mce-inject.c
deleted file mode 100644 (file)
index 1fc424c..0000000
+++ /dev/null
@@ -1,739 +0,0 @@
-/*
- * Machine check injection support.
- * Copyright 2008 Intel Corporation.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; version 2
- * of the License.
- *
- * Authors:
- * Andi Kleen
- * Ying Huang
- *
- * The AMD part (from mce_amd_inj.c): a simple MCE injection facility
- * for testing different aspects of the RAS code. This driver should be
- * built as module so that it can be loaded on production kernels for
- * testing purposes.
- *
- * This file may be distributed under the terms of the GNU General Public
- * License version 2.
- *
- * Copyright (c) 2010-17:  Borislav Petkov <bp@alien8.de>
- *                        Advanced Micro Devices Inc.
- */
-
-#include <linux/cpu.h>
-#include <linux/debugfs.h>
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/notifier.h>
-#include <linux/pci.h>
-#include <linux/uaccess.h>
-
-#include <asm/amd_nb.h>
-#include <asm/apic.h>
-#include <asm/irq_vectors.h>
-#include <asm/mce.h>
-#include <asm/nmi.h>
-#include <asm/smp.h>
-
-#include "mce-internal.h"
-
-/*
- * Collect all the MCi_XXX settings
- */
-static struct mce i_mce;
-static struct dentry *dfs_inj;
-
-static u8 n_banks;
-
-#define MAX_FLAG_OPT_SIZE      4
-#define NBCFG                  0x44
-
-enum injection_type {
-       SW_INJ = 0,     /* SW injection, simply decode the error */
-       HW_INJ,         /* Trigger a #MC */
-       DFR_INT_INJ,    /* Trigger Deferred error interrupt */
-       THR_INT_INJ,    /* Trigger threshold interrupt */
-       N_INJ_TYPES,
-};
-
-static const char * const flags_options[] = {
-       [SW_INJ] = "sw",
-       [HW_INJ] = "hw",
-       [DFR_INT_INJ] = "df",
-       [THR_INT_INJ] = "th",
-       NULL
-};
-
-/* Set default injection to SW_INJ */
-static enum injection_type inj_type = SW_INJ;
-
-#define MCE_INJECT_SET(reg)                                            \
-static int inj_##reg##_set(void *data, u64 val)                                \
-{                                                                      \
-       struct mce *m = (struct mce *)data;                             \
-                                                                       \
-       m->reg = val;                                                   \
-       return 0;                                                       \
-}
-
-MCE_INJECT_SET(status);
-MCE_INJECT_SET(misc);
-MCE_INJECT_SET(addr);
-MCE_INJECT_SET(synd);
-
-#define MCE_INJECT_GET(reg)                                            \
-static int inj_##reg##_get(void *data, u64 *val)                       \
-{                                                                      \
-       struct mce *m = (struct mce *)data;                             \
-                                                                       \
-       *val = m->reg;                                                  \
-       return 0;                                                       \
-}
-
-MCE_INJECT_GET(status);
-MCE_INJECT_GET(misc);
-MCE_INJECT_GET(addr);
-MCE_INJECT_GET(synd);
-
-DEFINE_SIMPLE_ATTRIBUTE(status_fops, inj_status_get, inj_status_set, "%llx\n");
-DEFINE_SIMPLE_ATTRIBUTE(misc_fops, inj_misc_get, inj_misc_set, "%llx\n");
-DEFINE_SIMPLE_ATTRIBUTE(addr_fops, inj_addr_get, inj_addr_set, "%llx\n");
-DEFINE_SIMPLE_ATTRIBUTE(synd_fops, inj_synd_get, inj_synd_set, "%llx\n");
-
-static void setup_inj_struct(struct mce *m)
-{
-       memset(m, 0, sizeof(struct mce));
-
-       m->cpuvendor = boot_cpu_data.x86_vendor;
-       m->time      = ktime_get_real_seconds();
-       m->cpuid     = cpuid_eax(1);
-       m->microcode = boot_cpu_data.microcode;
-}
-
-/* Update fake mce registers on current CPU. */
-static void inject_mce(struct mce *m)
-{
-       struct mce *i = &per_cpu(injectm, m->extcpu);
-
-       /* Make sure no one reads partially written injectm */
-       i->finished = 0;
-       mb();
-       m->finished = 0;
-       /* First set the fields after finished */
-       i->extcpu = m->extcpu;
-       mb();
-       /* Now write record in order, finished last (except above) */
-       memcpy(i, m, sizeof(struct mce));
-       /* Finally activate it */
-       mb();
-       i->finished = 1;
-}
-
-static void raise_poll(struct mce *m)
-{
-       unsigned long flags;
-       mce_banks_t b;
-
-       memset(&b, 0xff, sizeof(mce_banks_t));
-       local_irq_save(flags);
-       machine_check_poll(0, &b);
-       local_irq_restore(flags);
-       m->finished = 0;
-}
-
-static void raise_exception(struct mce *m, struct pt_regs *pregs)
-{
-       struct pt_regs regs;
-       unsigned long flags;
-
-       if (!pregs) {
-               memset(&regs, 0, sizeof(struct pt_regs));
-               regs.ip = m->ip;
-               regs.cs = m->cs;
-               pregs = &regs;
-       }
-       /* In the mcheck exception handler, IRQs will be disabled */
-       local_irq_save(flags);
-       do_machine_check(pregs, 0);
-       local_irq_restore(flags);
-       m->finished = 0;
-}
-
-static cpumask_var_t mce_inject_cpumask;
-static DEFINE_MUTEX(mce_inject_mutex);
-
-static int mce_raise_notify(unsigned int cmd, struct pt_regs *regs)
-{
-       int cpu = smp_processor_id();
-       struct mce *m = this_cpu_ptr(&injectm);
-       if (!cpumask_test_cpu(cpu, mce_inject_cpumask))
-               return NMI_DONE;
-       cpumask_clear_cpu(cpu, mce_inject_cpumask);
-       if (m->inject_flags & MCJ_EXCEPTION)
-               raise_exception(m, regs);
-       else if (m->status)
-               raise_poll(m);
-       return NMI_HANDLED;
-}
-
-static void mce_irq_ipi(void *info)
-{
-       int cpu = smp_processor_id();
-       struct mce *m = this_cpu_ptr(&injectm);
-
-       if (cpumask_test_cpu(cpu, mce_inject_cpumask) &&
-                       m->inject_flags & MCJ_EXCEPTION) {
-               cpumask_clear_cpu(cpu, mce_inject_cpumask);
-               raise_exception(m, NULL);
-       }
-}
-
-/* Inject mce on current CPU */
-static int raise_local(void)
-{
-       struct mce *m = this_cpu_ptr(&injectm);
-       int context = MCJ_CTX(m->inject_flags);
-       int ret = 0;
-       int cpu = m->extcpu;
-
-       if (m->inject_flags & MCJ_EXCEPTION) {
-               pr_info("Triggering MCE exception on CPU %d\n", cpu);
-               switch (context) {
-               case MCJ_CTX_IRQ:
-                       /*
-                        * Could do more to fake interrupts like
-                        * calling irq_enter, but the necessary
-                        * machinery isn't exported currently.
-                        */
-                       /*FALL THROUGH*/
-               case MCJ_CTX_PROCESS:
-                       raise_exception(m, NULL);
-                       break;
-               default:
-                       pr_info("Invalid MCE context\n");
-                       ret = -EINVAL;
-               }
-               pr_info("MCE exception done on CPU %d\n", cpu);
-       } else if (m->status) {
-               pr_info("Starting machine check poll CPU %d\n", cpu);
-               raise_poll(m);
-               mce_notify_irq();
-               pr_info("Machine check poll done on CPU %d\n", cpu);
-       } else
-               m->finished = 0;
-
-       return ret;
-}
-
-static void __maybe_unused raise_mce(struct mce *m)
-{
-       int context = MCJ_CTX(m->inject_flags);
-
-       inject_mce(m);
-
-       if (context == MCJ_CTX_RANDOM)
-               return;
-
-       if (m->inject_flags & (MCJ_IRQ_BROADCAST | MCJ_NMI_BROADCAST)) {
-               unsigned long start;
-               int cpu;
-
-               get_online_cpus();
-               cpumask_copy(mce_inject_cpumask, cpu_online_mask);
-               cpumask_clear_cpu(get_cpu(), mce_inject_cpumask);
-               for_each_online_cpu(cpu) {
-                       struct mce *mcpu = &per_cpu(injectm, cpu);
-                       if (!mcpu->finished ||
-                           MCJ_CTX(mcpu->inject_flags) != MCJ_CTX_RANDOM)
-                               cpumask_clear_cpu(cpu, mce_inject_cpumask);
-               }
-               if (!cpumask_empty(mce_inject_cpumask)) {
-                       if (m->inject_flags & MCJ_IRQ_BROADCAST) {
-                               /*
-                                * Don't wait: mce_irq_ipi must run synchronously
-                                * with the following raise_local().
-                                */
-                               preempt_disable();
-                               smp_call_function_many(mce_inject_cpumask,
-                                       mce_irq_ipi, NULL, 0);
-                               preempt_enable();
-                       } else if (m->inject_flags & MCJ_NMI_BROADCAST)
-                               apic->send_IPI_mask(mce_inject_cpumask,
-                                               NMI_VECTOR);
-               }
-               start = jiffies;
-               while (!cpumask_empty(mce_inject_cpumask)) {
-                       if (!time_before(jiffies, start + 2*HZ)) {
-                               pr_err("Timeout waiting for mce inject %lx\n",
-                                       *cpumask_bits(mce_inject_cpumask));
-                               break;
-                       }
-                       cpu_relax();
-               }
-               raise_local();
-               put_cpu();
-               put_online_cpus();
-       } else {
-               preempt_disable();
-               raise_local();
-               preempt_enable();
-       }
-}
-
-static int mce_inject_raise(struct notifier_block *nb, unsigned long val,
-                           void *data)
-{
-       struct mce *m = (struct mce *)data;
-
-       if (!m)
-               return NOTIFY_DONE;
-
-       mutex_lock(&mce_inject_mutex);
-       raise_mce(m);
-       mutex_unlock(&mce_inject_mutex);
-
-       return NOTIFY_DONE;
-}
-
-static struct notifier_block inject_nb = {
-       .notifier_call  = mce_inject_raise,
-};
-
-/*
- * Caller needs to make sure this CPU doesn't disappear
- * from under us, i.e.: get_cpu/put_cpu.
- */
-static int toggle_hw_mce_inject(unsigned int cpu, bool enable)
-{
-       u32 l, h;
-       int err;
-
-       err = rdmsr_on_cpu(cpu, MSR_K7_HWCR, &l, &h);
-       if (err) {
-               pr_err("%s: error reading HWCR\n", __func__);
-               return err;
-       }
-
-       enable ? (l |= BIT(18)) : (l &= ~BIT(18));
-
-       err = wrmsr_on_cpu(cpu, MSR_K7_HWCR, l, h);
-       if (err)
-               pr_err("%s: error writing HWCR\n", __func__);
-
-       return err;
-}
-
-static int __set_inj(const char *buf)
-{
-       int i;
-
-       for (i = 0; i < N_INJ_TYPES; i++) {
-               if (!strncmp(flags_options[i], buf, strlen(flags_options[i]))) {
-                       inj_type = i;
-                       return 0;
-               }
-       }
-       return -EINVAL;
-}
-
-static ssize_t flags_read(struct file *filp, char __user *ubuf,
-                         size_t cnt, loff_t *ppos)
-{
-       char buf[MAX_FLAG_OPT_SIZE];
-       int n;
-
-       n = sprintf(buf, "%s\n", flags_options[inj_type]);
-
-       return simple_read_from_buffer(ubuf, cnt, ppos, buf, n);
-}
-
-static ssize_t flags_write(struct file *filp, const char __user *ubuf,
-                          size_t cnt, loff_t *ppos)
-{
-       char buf[MAX_FLAG_OPT_SIZE], *__buf;
-       int err;
-
-       if (cnt > MAX_FLAG_OPT_SIZE)
-               return -EINVAL;
-
-       if (copy_from_user(&buf, ubuf, cnt))
-               return -EFAULT;
-
-       buf[cnt - 1] = 0;
-
-       /* strip whitespace */
-       __buf = strstrip(buf);
-
-       err = __set_inj(__buf);
-       if (err) {
-               pr_err("%s: Invalid flags value: %s\n", __func__, __buf);
-               return err;
-       }
-
-       *ppos += cnt;
-
-       return cnt;
-}
-
-static const struct file_operations flags_fops = {
-       .read           = flags_read,
-       .write          = flags_write,
-       .llseek         = generic_file_llseek,
-};
-
-/*
- * On which CPU to inject?
- */
-MCE_INJECT_GET(extcpu);
-
-static int inj_extcpu_set(void *data, u64 val)
-{
-       struct mce *m = (struct mce *)data;
-
-       if (val >= nr_cpu_ids || !cpu_online(val)) {
-               pr_err("%s: Invalid CPU: %llu\n", __func__, val);
-               return -EINVAL;
-       }
-       m->extcpu = val;
-       return 0;
-}
-
-DEFINE_SIMPLE_ATTRIBUTE(extcpu_fops, inj_extcpu_get, inj_extcpu_set, "%llu\n");
-
-static void trigger_mce(void *info)
-{
-       asm volatile("int $18");
-}
-
-static void trigger_dfr_int(void *info)
-{
-       asm volatile("int %0" :: "i" (DEFERRED_ERROR_VECTOR));
-}
-
-static void trigger_thr_int(void *info)
-{
-       asm volatile("int %0" :: "i" (THRESHOLD_APIC_VECTOR));
-}
-
-static u32 get_nbc_for_node(int node_id)
-{
-       struct cpuinfo_x86 *c = &boot_cpu_data;
-       u32 cores_per_node;
-
-       cores_per_node = (c->x86_max_cores * smp_num_siblings) / amd_get_nodes_per_socket();
-
-       return cores_per_node * node_id;
-}
-
-static void toggle_nb_mca_mst_cpu(u16 nid)
-{
-       struct amd_northbridge *nb;
-       struct pci_dev *F3;
-       u32 val;
-       int err;
-
-       nb = node_to_amd_nb(nid);
-       if (!nb)
-               return;
-
-       F3 = nb->misc;
-       if (!F3)
-               return;
-
-       err = pci_read_config_dword(F3, NBCFG, &val);
-       if (err) {
-               pr_err("%s: Error reading F%dx%03x.\n",
-                      __func__, PCI_FUNC(F3->devfn), NBCFG);
-               return;
-       }
-
-       if (val & BIT(27))
-               return;
-
-       pr_err("%s: Set D18F3x44[NbMcaToMstCpuEn] which BIOS hasn't done.\n",
-              __func__);
-
-       val |= BIT(27);
-       err = pci_write_config_dword(F3, NBCFG, val);
-       if (err)
-               pr_err("%s: Error writing F%dx%03x.\n",
-                      __func__, PCI_FUNC(F3->devfn), NBCFG);
-}
-
-static void prepare_msrs(void *info)
-{
-       struct mce m = *(struct mce *)info;
-       u8 b = m.bank;
-
-       wrmsrl(MSR_IA32_MCG_STATUS, m.mcgstatus);
-
-       if (boot_cpu_has(X86_FEATURE_SMCA)) {
-               if (m.inject_flags == DFR_INT_INJ) {
-                       wrmsrl(MSR_AMD64_SMCA_MCx_DESTAT(b), m.status);
-                       wrmsrl(MSR_AMD64_SMCA_MCx_DEADDR(b), m.addr);
-               } else {
-                       wrmsrl(MSR_AMD64_SMCA_MCx_STATUS(b), m.status);
-                       wrmsrl(MSR_AMD64_SMCA_MCx_ADDR(b), m.addr);
-               }
-
-               wrmsrl(MSR_AMD64_SMCA_MCx_MISC(b), m.misc);
-               wrmsrl(MSR_AMD64_SMCA_MCx_SYND(b), m.synd);
-       } else {
-               wrmsrl(MSR_IA32_MCx_STATUS(b), m.status);
-               wrmsrl(MSR_IA32_MCx_ADDR(b), m.addr);
-               wrmsrl(MSR_IA32_MCx_MISC(b), m.misc);
-       }
-}
-
-static void do_inject(void)
-{
-       u64 mcg_status = 0;
-       unsigned int cpu = i_mce.extcpu;
-       u8 b = i_mce.bank;
-
-       i_mce.tsc = rdtsc_ordered();
-
-       if (i_mce.misc)
-               i_mce.status |= MCI_STATUS_MISCV;
-
-       if (i_mce.synd)
-               i_mce.status |= MCI_STATUS_SYNDV;
-
-       if (inj_type == SW_INJ) {
-               mce_inject_log(&i_mce);
-               return;
-       }
-
-       /* prep MCE global settings for the injection */
-       mcg_status = MCG_STATUS_MCIP | MCG_STATUS_EIPV;
-
-       if (!(i_mce.status & MCI_STATUS_PCC))
-               mcg_status |= MCG_STATUS_RIPV;
-
-       /*
-        * Ensure necessary status bits for deferred errors:
-        * - MCx_STATUS[Deferred]: make sure it is a deferred error
-        * - MCx_STATUS[UC] cleared: deferred errors are _not_ UC
-        */
-       if (inj_type == DFR_INT_INJ) {
-               i_mce.status |= MCI_STATUS_DEFERRED;
-               i_mce.status &= ~MCI_STATUS_UC;
-       }
-
-       /*
-        * For multi node CPUs, logging and reporting of bank 4 errors happens
-        * only on the node base core. Refer to D18F3x44[NbMcaToMstCpuEn] for
-        * Fam10h and later BKDGs.
-        */
-       if (static_cpu_has(X86_FEATURE_AMD_DCM) &&
-           b == 4 &&
-           boot_cpu_data.x86 < 0x17) {
-               toggle_nb_mca_mst_cpu(amd_get_nb_id(cpu));
-               cpu = get_nbc_for_node(amd_get_nb_id(cpu));
-       }
-
-       get_online_cpus();
-       if (!cpu_online(cpu))
-               goto err;
-
-       toggle_hw_mce_inject(cpu, true);
-
-       i_mce.mcgstatus = mcg_status;
-       i_mce.inject_flags = inj_type;
-       smp_call_function_single(cpu, prepare_msrs, &i_mce, 0);
-
-       toggle_hw_mce_inject(cpu, false);
-
-       switch (inj_type) {
-       case DFR_INT_INJ:
-               smp_call_function_single(cpu, trigger_dfr_int, NULL, 0);
-               break;
-       case THR_INT_INJ:
-               smp_call_function_single(cpu, trigger_thr_int, NULL, 0);
-               break;
-       default:
-               smp_call_function_single(cpu, trigger_mce, NULL, 0);
-       }
-
-err:
-       put_online_cpus();
-
-}
-
-/*
- * This denotes into which bank we're injecting and, at the same
- * time, triggers the injection.
- */
-static int inj_bank_set(void *data, u64 val)
-{
-       struct mce *m = (struct mce *)data;
-
-       if (val >= n_banks) {
-               pr_err("Non-existent MCE bank: %llu\n", val);
-               return -EINVAL;
-       }
-
-       m->bank = val;
-       do_inject();
-
-       /* Reset injection struct */
-       setup_inj_struct(&i_mce);
-
-       return 0;
-}
-
-MCE_INJECT_GET(bank);
-
-DEFINE_SIMPLE_ATTRIBUTE(bank_fops, inj_bank_get, inj_bank_set, "%llu\n");
-
-static const char readme_msg[] =
-"Description of the files and their usages:\n"
-"\n"
-"Note1: i refers to the bank number below.\n"
-"Note2: See respective BKDGs for the exact bit definitions of the files below\n"
-"as they mirror the hardware registers.\n"
-"\n"
-"status:\t Set MCi_STATUS: the bits in that MSR control the error type and\n"
-"\t attributes of the error which caused the MCE.\n"
-"\n"
-"misc:\t Set MCi_MISC: provide auxiliary info about the error. It is mostly\n"
-"\t used for error thresholding purposes and its validity is indicated by\n"
-"\t MCi_STATUS[MiscV].\n"
-"\n"
-"synd:\t Set MCi_SYND: provide syndrome info about the error. Only valid on\n"
-"\t Scalable MCA systems, and its validity is indicated by MCi_STATUS[SyndV].\n"
-"\n"
-"addr:\t Error address value to be written to MCi_ADDR. Log address information\n"
-"\t associated with the error.\n"
-"\n"
-"cpu:\t The CPU to inject the error on.\n"
-"\n"
-"bank:\t Specify the bank you want to inject the error into: the number of\n"
-"\t banks in a processor varies and is family/model-specific, therefore, the\n"
-"\t supplied value is sanity-checked. Setting the bank value also triggers the\n"
-"\t injection.\n"
-"\n"
-"flags:\t Injection type to be performed. Writing to this file will trigger a\n"
-"\t real machine check, an APIC interrupt or invoke the error decoder routines\n"
-"\t for AMD processors.\n"
-"\n"
-"\t Allowed error injection types:\n"
-"\t  - \"sw\": Software error injection. Decode error to a human-readable \n"
-"\t    format only. Safe to use.\n"
-"\t  - \"hw\": Hardware error injection. Causes the #MC exception handler to \n"
-"\t    handle the error. Be warned: might cause system panic if MCi_STATUS[PCC] \n"
-"\t    is set. Therefore, consider setting (debugfs_mountpoint)/mce/fake_panic \n"
-"\t    before injecting.\n"
-"\t  - \"df\": Trigger APIC interrupt for Deferred error. Causes deferred \n"
-"\t    error APIC interrupt handler to handle the error if the feature is \n"
-"\t    is present in hardware. \n"
-"\t  - \"th\": Trigger APIC interrupt for Threshold errors. Causes threshold \n"
-"\t    APIC interrupt handler to handle the error. \n"
-"\n";
-
-static ssize_t
-inj_readme_read(struct file *filp, char __user *ubuf,
-                      size_t cnt, loff_t *ppos)
-{
-       return simple_read_from_buffer(ubuf, cnt, ppos,
-                                       readme_msg, strlen(readme_msg));
-}
-
-static const struct file_operations readme_fops = {
-       .read           = inj_readme_read,
-};
-
-static struct dfs_node {
-       char *name;
-       struct dentry *d;
-       const struct file_operations *fops;
-       umode_t perm;
-} dfs_fls[] = {
-       { .name = "status",     .fops = &status_fops, .perm = S_IRUSR | S_IWUSR },
-       { .name = "misc",       .fops = &misc_fops,   .perm = S_IRUSR | S_IWUSR },
-       { .name = "addr",       .fops = &addr_fops,   .perm = S_IRUSR | S_IWUSR },
-       { .name = "synd",       .fops = &synd_fops,   .perm = S_IRUSR | S_IWUSR },
-       { .name = "bank",       .fops = &bank_fops,   .perm = S_IRUSR | S_IWUSR },
-       { .name = "flags",      .fops = &flags_fops,  .perm = S_IRUSR | S_IWUSR },
-       { .name = "cpu",        .fops = &extcpu_fops, .perm = S_IRUSR | S_IWUSR },
-       { .name = "README",     .fops = &readme_fops, .perm = S_IRUSR | S_IRGRP | S_IROTH },
-};
-
-static int __init debugfs_init(void)
-{
-       unsigned int i;
-       u64 cap;
-
-       rdmsrl(MSR_IA32_MCG_CAP, cap);
-       n_banks = cap & MCG_BANKCNT_MASK;
-
-       dfs_inj = debugfs_create_dir("mce-inject", NULL);
-       if (!dfs_inj)
-               return -EINVAL;
-
-       for (i = 0; i < ARRAY_SIZE(dfs_fls); i++) {
-               dfs_fls[i].d = debugfs_create_file(dfs_fls[i].name,
-                                                   dfs_fls[i].perm,
-                                                   dfs_inj,
-                                                   &i_mce,
-                                                   dfs_fls[i].fops);
-
-               if (!dfs_fls[i].d)
-                       goto err_dfs_add;
-       }
-
-       return 0;
-
-err_dfs_add:
-       while (i-- > 0)
-               debugfs_remove(dfs_fls[i].d);
-
-       debugfs_remove(dfs_inj);
-       dfs_inj = NULL;
-
-       return -ENODEV;
-}
-
-static int __init inject_init(void)
-{
-       int err;
-
-       if (!alloc_cpumask_var(&mce_inject_cpumask, GFP_KERNEL))
-               return -ENOMEM;
-
-       err = debugfs_init();
-       if (err) {
-               free_cpumask_var(mce_inject_cpumask);
-               return err;
-       }
-
-       register_nmi_handler(NMI_LOCAL, mce_raise_notify, 0, "mce_notify");
-       mce_register_injector_chain(&inject_nb);
-
-       setup_inj_struct(&i_mce);
-
-       pr_info("Machine check injector initialized\n");
-
-       return 0;
-}
-
-static void __exit inject_exit(void)
-{
-
-       mce_unregister_injector_chain(&inject_nb);
-       unregister_nmi_handler(NMI_LOCAL, "mce_notify");
-
-       debugfs_remove_recursive(dfs_inj);
-       dfs_inj = NULL;
-
-       memset(&dfs_fls, 0, sizeof(dfs_fls));
-
-       free_cpumask_var(mce_inject_cpumask);
-}
-
-module_init(inject_init);
-module_exit(inject_exit);
-MODULE_LICENSE("GPL");
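
Per the README text and inj_bank_set() above, the debugfs interface is driven by filling in the register files first and writing "bank" last, since the bank write is what triggers the injection. A hedged userspace sketch, not part of the patch; the debugfs mount point and the example MCi_STATUS value are assumptions:

/* Illustrative only: drive the mce-inject debugfs files described above. */
#include <stdio.h>

static int put(const char *file, const char *val)
{
	char path[256];
	FILE *f;

	snprintf(path, sizeof(path), "/sys/kernel/debug/mce-inject/%s", file);
	f = fopen(path, "w");
	if (!f)
		return -1;
	fprintf(f, "%s\n", val);
	return fclose(f);
}

int main(void)
{
	put("flags", "sw");			/* software injection: decode only */
	put("status", "0x9c00400000000015");	/* arbitrary example MCi_STATUS */
	put("addr", "0x1234");
	put("cpu", "0");
	return put("bank", "1");		/* writing bank triggers injection */
}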
diff --git a/arch/x86/kernel/cpu/mcheck/mce-internal.h b/arch/x86/kernel/cpu/mcheck/mce-internal.h
deleted file mode 100644 (file)
index ceb67cd..0000000
+++ /dev/null
@@ -1,173 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef __X86_MCE_INTERNAL_H__
-#define __X86_MCE_INTERNAL_H__
-
-#include <linux/device.h>
-#include <asm/mce.h>
-
-enum severity_level {
-       MCE_NO_SEVERITY,
-       MCE_DEFERRED_SEVERITY,
-       MCE_UCNA_SEVERITY = MCE_DEFERRED_SEVERITY,
-       MCE_KEEP_SEVERITY,
-       MCE_SOME_SEVERITY,
-       MCE_AO_SEVERITY,
-       MCE_UC_SEVERITY,
-       MCE_AR_SEVERITY,
-       MCE_PANIC_SEVERITY,
-};
-
-extern struct blocking_notifier_head x86_mce_decoder_chain;
-
-#define ATTR_LEN               16
-#define INITIAL_CHECK_INTERVAL 5 * 60 /* 5 minutes */
-
-/* One object for each MCE bank, shared by all CPUs */
-struct mce_bank {
-       u64                     ctl;                    /* subevents to enable */
-       unsigned char init;                             /* initialise bank? */
-       struct device_attribute attr;                   /* device attribute */
-       char                    attrname[ATTR_LEN];     /* attribute name */
-};
-
-struct mce_evt_llist {
-       struct llist_node llnode;
-       struct mce mce;
-};
-
-void mce_gen_pool_process(struct work_struct *__unused);
-bool mce_gen_pool_empty(void);
-int mce_gen_pool_add(struct mce *mce);
-int mce_gen_pool_init(void);
-struct llist_node *mce_gen_pool_prepare_records(void);
-
-extern int (*mce_severity)(struct mce *a, int tolerant, char **msg, bool is_excp);
-struct dentry *mce_get_debugfs_dir(void);
-
-extern struct mce_bank *mce_banks;
-extern mce_banks_t mce_banks_ce_disabled;
-
-#ifdef CONFIG_X86_MCE_INTEL
-unsigned long cmci_intel_adjust_timer(unsigned long interval);
-bool mce_intel_cmci_poll(void);
-void mce_intel_hcpu_update(unsigned long cpu);
-void cmci_disable_bank(int bank);
-#else
-# define cmci_intel_adjust_timer mce_adjust_timer_default
-static inline bool mce_intel_cmci_poll(void) { return false; }
-static inline void mce_intel_hcpu_update(unsigned long cpu) { }
-static inline void cmci_disable_bank(int bank) { }
-#endif
-
-void mce_timer_kick(unsigned long interval);
-
-#ifdef CONFIG_ACPI_APEI
-int apei_write_mce(struct mce *m);
-ssize_t apei_read_mce(struct mce *m, u64 *record_id);
-int apei_check_mce(void);
-int apei_clear_mce(u64 record_id);
-#else
-static inline int apei_write_mce(struct mce *m)
-{
-       return -EINVAL;
-}
-static inline ssize_t apei_read_mce(struct mce *m, u64 *record_id)
-{
-       return 0;
-}
-static inline int apei_check_mce(void)
-{
-       return 0;
-}
-static inline int apei_clear_mce(u64 record_id)
-{
-       return -EINVAL;
-}
-#endif
-
-void mce_inject_log(struct mce *m);
-
-/*
- * We consider records to be equivalent if bank+status+addr+misc all match.
- * This is only used when the system is going down because of a fatal error
- * to avoid cluttering the console log with essentially repeated information.
- * In normal processing all errors seen are logged.
- */
-static inline bool mce_cmp(struct mce *m1, struct mce *m2)
-{
-       return m1->bank != m2->bank ||
-               m1->status != m2->status ||
-               m1->addr != m2->addr ||
-               m1->misc != m2->misc;
-}
-
-extern struct device_attribute dev_attr_trigger;
-
-#ifdef CONFIG_X86_MCELOG_LEGACY
-void mce_work_trigger(void);
-void mce_register_injector_chain(struct notifier_block *nb);
-void mce_unregister_injector_chain(struct notifier_block *nb);
-#else
-static inline void mce_work_trigger(void)      { }
-static inline void mce_register_injector_chain(struct notifier_block *nb)      { }
-static inline void mce_unregister_injector_chain(struct notifier_block *nb)    { }
-#endif
-
-struct mca_config {
-       bool dont_log_ce;
-       bool cmci_disabled;
-       bool ignore_ce;
-
-       __u64 lmce_disabled             : 1,
-             disabled                  : 1,
-             ser                       : 1,
-             recovery                  : 1,
-             bios_cmci_threshold       : 1,
-             __reserved                : 59;
-
-       u8 banks;
-       s8 bootlog;
-       int tolerant;
-       int monarch_timeout;
-       int panic_timeout;
-       u32 rip_msr;
-};
-
-extern struct mca_config mca_cfg;
-
-struct mce_vendor_flags {
-       /*
-        * Indicates that overflow conditions are not fatal, when set.
-        */
-       __u64 overflow_recov    : 1,
-
-       /*
-        * (AMD) SUCCOR stands for S/W UnCorrectable error COntainment and
-        * Recovery. It indicates support for data poisoning in HW and deferred
-        * error interrupts.
-        */
-             succor            : 1,
-
-       /*
-        * (AMD) SMCA: This bit indicates support for Scalable MCA which expands
-        * the register space for each MCA bank and also increases number of
-        * banks. Also, to accommodate the new banks and registers, the MCA
-        * register space is moved to a new MSR range.
-        */
-             smca              : 1,
-
-             __reserved_0      : 61;
-};
-
-extern struct mce_vendor_flags mce_flags;
-
-struct mca_msr_regs {
-       u32 (*ctl)      (int bank);
-       u32 (*status)   (int bank);
-       u32 (*addr)     (int bank);
-       u32 (*misc)     (int bank);
-};
-
-extern struct mca_msr_regs msr_ops;
-
-#endif /* __X86_MCE_INTERNAL_H__ */
diff --git a/arch/x86/kernel/cpu/mcheck/mce-severity.c b/arch/x86/kernel/cpu/mcheck/mce-severity.c
deleted file mode 100644 (file)
index 44396d5..0000000
+++ /dev/null
@@ -1,419 +0,0 @@
-/*
- * MCE grading rules.
- * Copyright 2008, 2009 Intel Corporation.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; version 2
- * of the License.
- *
- * Author: Andi Kleen
- */
-#include <linux/kernel.h>
-#include <linux/seq_file.h>
-#include <linux/init.h>
-#include <linux/debugfs.h>
-#include <asm/mce.h>
-#include <linux/uaccess.h>
-
-#include "mce-internal.h"
-
-/*
- * Grade an mce by severity. In general the most severe ones are processed
- * first. Since there are quite a lot of combinations test the bits in a
- * table-driven way. The rules are simply processed in order, first
- * match wins.
- *
- * Note this is only used for machine check exceptions, the corrected
- * errors use much simpler rules. The exceptions still check for the corrected
- * errors, but only to leave them alone for the CMCI handler (except for
- * panic situations)
- */
-
-enum context { IN_KERNEL = 1, IN_USER = 2, IN_KERNEL_RECOV = 3 };
-enum ser { SER_REQUIRED = 1, NO_SER = 2 };
-enum exception { EXCP_CONTEXT = 1, NO_EXCP = 2 };
-
-static struct severity {
-       u64 mask;
-       u64 result;
-       unsigned char sev;
-       unsigned char mcgmask;
-       unsigned char mcgres;
-       unsigned char ser;
-       unsigned char context;
-       unsigned char excp;
-       unsigned char covered;
-       char *msg;
-} severities[] = {
-#define MCESEV(s, m, c...) { .sev = MCE_ ## s ## _SEVERITY, .msg = m, ## c }
-#define  KERNEL                .context = IN_KERNEL
-#define  USER          .context = IN_USER
-#define  KERNEL_RECOV  .context = IN_KERNEL_RECOV
-#define  SER           .ser = SER_REQUIRED
-#define  NOSER         .ser = NO_SER
-#define  EXCP          .excp = EXCP_CONTEXT
-#define  NOEXCP                .excp = NO_EXCP
-#define  BITCLR(x)     .mask = x, .result = 0
-#define  BITSET(x)     .mask = x, .result = x
-#define  MCGMASK(x, y) .mcgmask = x, .mcgres = y
-#define  MASK(x, y)    .mask = x, .result = y
-#define MCI_UC_S (MCI_STATUS_UC|MCI_STATUS_S)
-#define MCI_UC_AR (MCI_STATUS_UC|MCI_STATUS_AR)
-#define MCI_UC_SAR (MCI_STATUS_UC|MCI_STATUS_S|MCI_STATUS_AR)
-#define        MCI_ADDR (MCI_STATUS_ADDRV|MCI_STATUS_MISCV)
-
-       MCESEV(
-               NO, "Invalid",
-               BITCLR(MCI_STATUS_VAL)
-               ),
-       MCESEV(
-               NO, "Not enabled",
-               EXCP, BITCLR(MCI_STATUS_EN)
-               ),
-       MCESEV(
-               PANIC, "Processor context corrupt",
-               BITSET(MCI_STATUS_PCC)
-               ),
-       /* When MCIP is not set something is very confused */
-       MCESEV(
-               PANIC, "MCIP not set in MCA handler",
-               EXCP, MCGMASK(MCG_STATUS_MCIP, 0)
-               ),
-       /* Neither return nor error IP -- no chance to recover -> PANIC */
-       MCESEV(
-               PANIC, "Neither restart nor error IP",
-               EXCP, MCGMASK(MCG_STATUS_RIPV|MCG_STATUS_EIPV, 0)
-               ),
-       MCESEV(
-               PANIC, "In kernel and no restart IP",
-               EXCP, KERNEL, MCGMASK(MCG_STATUS_RIPV, 0)
-               ),
-       MCESEV(
-               PANIC, "In kernel and no restart IP",
-               EXCP, KERNEL_RECOV, MCGMASK(MCG_STATUS_RIPV, 0)
-               ),
-       MCESEV(
-               DEFERRED, "Deferred error",
-               NOSER, MASK(MCI_STATUS_UC|MCI_STATUS_DEFERRED|MCI_STATUS_POISON, MCI_STATUS_DEFERRED)
-               ),
-       MCESEV(
-               KEEP, "Corrected error",
-               NOSER, BITCLR(MCI_STATUS_UC)
-               ),
-
-       /*
-        * known AO MCACODs reported via MCE or CMC:
-        *
-        * SRAO can be signaled either via a machine check exception or via
-        * CMCI, with bit S set to 1 or 0 respectively, so there is no need
-        * to check bit S for SRAO.
-        */
-       MCESEV(
-               AO, "Action optional: memory scrubbing error",
-               SER, MASK(MCI_STATUS_OVER|MCI_UC_AR|MCACOD_SCRUBMSK, MCI_STATUS_UC|MCACOD_SCRUB)
-               ),
-       MCESEV(
-               AO, "Action optional: last level cache writeback error",
-               SER, MASK(MCI_STATUS_OVER|MCI_UC_AR|MCACOD, MCI_STATUS_UC|MCACOD_L3WB)
-               ),
-
-       /* ignore OVER for UCNA */
-       MCESEV(
-               UCNA, "Uncorrected no action required",
-               SER, MASK(MCI_UC_SAR, MCI_STATUS_UC)
-               ),
-       MCESEV(
-               PANIC, "Illegal combination (UCNA with AR=1)",
-               SER,
-               MASK(MCI_STATUS_OVER|MCI_UC_SAR, MCI_STATUS_UC|MCI_STATUS_AR)
-               ),
-       MCESEV(
-               KEEP, "Non signalled machine check",
-               SER, BITCLR(MCI_STATUS_S)
-               ),
-
-       MCESEV(
-               PANIC, "Action required with lost events",
-               SER, BITSET(MCI_STATUS_OVER|MCI_UC_SAR)
-               ),
-
-       /* known AR MCACODs: */
-#ifdef CONFIG_MEMORY_FAILURE
-       MCESEV(
-               KEEP, "Action required but unaffected thread is continuable",
-               SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCI_ADDR, MCI_UC_SAR|MCI_ADDR),
-               MCGMASK(MCG_STATUS_RIPV|MCG_STATUS_EIPV, MCG_STATUS_RIPV)
-               ),
-       MCESEV(
-               AR, "Action required: data load in error recoverable area of kernel",
-               SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCI_ADDR|MCACOD, MCI_UC_SAR|MCI_ADDR|MCACOD_DATA),
-               KERNEL_RECOV
-               ),
-       MCESEV(
-               AR, "Action required: data load error in a user process",
-               SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCI_ADDR|MCACOD, MCI_UC_SAR|MCI_ADDR|MCACOD_DATA),
-               USER
-               ),
-       MCESEV(
-               AR, "Action required: instruction fetch error in a user process",
-               SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCI_ADDR|MCACOD, MCI_UC_SAR|MCI_ADDR|MCACOD_INSTR),
-               USER
-               ),
-       MCESEV(
-               PANIC, "Data load in unrecoverable area of kernel",
-               SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCI_ADDR|MCACOD, MCI_UC_SAR|MCI_ADDR|MCACOD_DATA),
-               KERNEL
-               ),
-#endif
-       MCESEV(
-               PANIC, "Action required: unknown MCACOD",
-               SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR, MCI_UC_SAR)
-               ),
-
-       MCESEV(
-               SOME, "Action optional: unknown MCACOD",
-               SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR, MCI_UC_S)
-               ),
-       MCESEV(
-               SOME, "Action optional with lost events",
-               SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR, MCI_STATUS_OVER|MCI_UC_S)
-               ),
-
-       MCESEV(
-               PANIC, "Overflowed uncorrected",
-               BITSET(MCI_STATUS_OVER|MCI_STATUS_UC)
-               ),
-       MCESEV(
-               UC, "Uncorrected",
-               BITSET(MCI_STATUS_UC)
-               ),
-       MCESEV(
-               SOME, "No match",
-               BITSET(0)
-               )       /* always matches. keep at end */
-};
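
A minimal standalone sketch (not kernel code) of how one severities[] entry is evaluated: the MCESEV()/BITSET()/BITCLR()/MASK()/MCGMASK() helpers defined earlier in this file reduce every rule to a mask/result pair, and the table walk in mce_severity_intel() below tests (status & mask) == result. MCI_STATUS_PCC is assumed here to be bit 57 of MCi_STATUS:

#include <stdbool.h>
#include <stdint.h>

#define MCI_STATUS_PCC (1ULL << 57)	/* "processor context corrupt" */

/* BITSET(x) boils down to { .mask = x, .result = x } */
static bool matches_pcc_rule(uint64_t status)
{
	uint64_t mask = MCI_STATUS_PCC, result = MCI_STATUS_PCC;

	return (status & mask) == result;	/* same test the table walk performs */
}
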
-
-#define mc_recoverable(mcg) (((mcg) & (MCG_STATUS_RIPV|MCG_STATUS_EIPV)) == \
-                               (MCG_STATUS_RIPV|MCG_STATUS_EIPV))
-
-/*
- * If mcgstatus indicated that ip/cs on the stack were
- * no good, then "m->cs" will be zero and we will have
- * to assume the worst case (IN_KERNEL) as we actually
- * have no idea what we were executing when the machine
- * check hit.
- * If we do have a good "m->cs" (or a faked one in the
- * case we were executing in VM86 mode) we can use it to
- * distinguish an exception taken in user mode from one
- * taken in the kernel.
- */
-static int error_context(struct mce *m)
-{
-       if ((m->cs & 3) == 3)
-               return IN_USER;
-       if (mc_recoverable(m->mcgstatus) && ex_has_fault_handler(m->ip))
-               return IN_KERNEL_RECOV;
-       return IN_KERNEL;
-}
-
-static int mce_severity_amd_smca(struct mce *m, enum context err_ctx)
-{
-       u32 addr = MSR_AMD64_SMCA_MCx_CONFIG(m->bank);
-       u32 low, high;
-
-       /*
-        * We need to look at the following bits to determine error severity:
-        * - the "succor" feature bit (error recovery/data poisoning support), and
-        * - the TCC bit (Task Context Corrupt) in MCi_STATUS.
-        */
-       if (!mce_flags.succor)
-               return MCE_PANIC_SEVERITY;
-
-       if (rdmsr_safe(addr, &low, &high))
-               return MCE_PANIC_SEVERITY;
-
-       /* TCC (Task context corrupt). If set and if IN_KERNEL, panic. */
-       if ((low & MCI_CONFIG_MCAX) &&
-           (m->status & MCI_STATUS_TCC) &&
-           (err_ctx == IN_KERNEL))
-               return MCE_PANIC_SEVERITY;
-
-        /* ...otherwise invoke hwpoison handler. */
-       return MCE_AR_SEVERITY;
-}
-
-/*
- * See AMD Error Scope Hierarchy table in a newer BKDG. For example
- * 49125_15h_Models_30h-3Fh_BKDG.pdf, section "RAS Features"
- */
-static int mce_severity_amd(struct mce *m, int tolerant, char **msg, bool is_excp)
-{
-       enum context ctx = error_context(m);
-
-       /* Processor Context Corrupt, no need to fumble too much, die! */
-       if (m->status & MCI_STATUS_PCC)
-               return MCE_PANIC_SEVERITY;
-
-       if (m->status & MCI_STATUS_UC) {
-
-               if (ctx == IN_KERNEL)
-                       return MCE_PANIC_SEVERITY;
-
-               /*
-                * On older systems where overflow_recov flag is not present, we
-                * should simply panic if an error overflow occurs. If
-                * overflow_recov flag is present and set, then software can try
-                * to at least kill the affected process to prolong system operation.
-                */
-               if (mce_flags.overflow_recov) {
-                       if (mce_flags.smca)
-                               return mce_severity_amd_smca(m, ctx);
-
-                       /* kill current process */
-                       return MCE_AR_SEVERITY;
-               } else {
-                       /* at least one error was not logged */
-                       if (m->status & MCI_STATUS_OVER)
-                               return MCE_PANIC_SEVERITY;
-               }
-
-               /*
-                * For any other case, return MCE_UC_SEVERITY so that we log the
-                * error and exit #MC handler.
-                */
-               return MCE_UC_SEVERITY;
-       }
-
-       /*
-        * deferred error: poll handler catches these and adds them to mce_ring so
-        * memory-failure can take recovery actions.
-        */
-       if (m->status & MCI_STATUS_DEFERRED)
-               return MCE_DEFERRED_SEVERITY;
-
-       /*
-        * corrected error: poll handler catches these and passes responsibility
-        * of decoding the error to EDAC
-        */
-       return MCE_KEEP_SEVERITY;
-}
-
-static int mce_severity_intel(struct mce *m, int tolerant, char **msg, bool is_excp)
-{
-       enum exception excp = (is_excp ? EXCP_CONTEXT : NO_EXCP);
-       enum context ctx = error_context(m);
-       struct severity *s;
-
-       for (s = severities;; s++) {
-               if ((m->status & s->mask) != s->result)
-                       continue;
-               if ((m->mcgstatus & s->mcgmask) != s->mcgres)
-                       continue;
-               if (s->ser == SER_REQUIRED && !mca_cfg.ser)
-                       continue;
-               if (s->ser == NO_SER && mca_cfg.ser)
-                       continue;
-               if (s->context && ctx != s->context)
-                       continue;
-               if (s->excp && excp != s->excp)
-                       continue;
-               if (msg)
-                       *msg = s->msg;
-               s->covered = 1;
-               if (s->sev >= MCE_UC_SEVERITY && ctx == IN_KERNEL) {
-                       if (tolerant < 1)
-                               return MCE_PANIC_SEVERITY;
-               }
-               return s->sev;
-       }
-}
-
-/* Default to mce_severity_intel */
-int (*mce_severity)(struct mce *m, int tolerant, char **msg, bool is_excp) =
-                   mce_severity_intel;
-
-void __init mcheck_vendor_init_severity(void)
-{
-       if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD ||
-           boot_cpu_data.x86_vendor == X86_VENDOR_HYGON)
-               mce_severity = mce_severity_amd;
-}
-
-#ifdef CONFIG_DEBUG_FS
-static void *s_start(struct seq_file *f, loff_t *pos)
-{
-       if (*pos >= ARRAY_SIZE(severities))
-               return NULL;
-       return &severities[*pos];
-}
-
-static void *s_next(struct seq_file *f, void *data, loff_t *pos)
-{
-       if (++(*pos) >= ARRAY_SIZE(severities))
-               return NULL;
-       return &severities[*pos];
-}
-
-static void s_stop(struct seq_file *f, void *data)
-{
-}
-
-static int s_show(struct seq_file *f, void *data)
-{
-       struct severity *ser = data;
-       seq_printf(f, "%d\t%s\n", ser->covered, ser->msg);
-       return 0;
-}
-
-static const struct seq_operations severities_seq_ops = {
-       .start  = s_start,
-       .next   = s_next,
-       .stop   = s_stop,
-       .show   = s_show,
-};
-
-static int severities_coverage_open(struct inode *inode, struct file *file)
-{
-       return seq_open(file, &severities_seq_ops);
-}
-
-static ssize_t severities_coverage_write(struct file *file,
-                                        const char __user *ubuf,
-                                        size_t count, loff_t *ppos)
-{
-       int i;
-       for (i = 0; i < ARRAY_SIZE(severities); i++)
-               severities[i].covered = 0;
-       return count;
-}
-
-static const struct file_operations severities_coverage_fops = {
-       .open           = severities_coverage_open,
-       .release        = seq_release,
-       .read           = seq_read,
-       .write          = severities_coverage_write,
-       .llseek         = seq_lseek,
-};
-
-static int __init severities_debugfs_init(void)
-{
-       struct dentry *dmce, *fsev;
-
-       dmce = mce_get_debugfs_dir();
-       if (!dmce)
-               goto err_out;
-
-       fsev = debugfs_create_file("severities-coverage", 0444, dmce, NULL,
-                                  &severities_coverage_fops);
-       if (!fsev)
-               goto err_out;
-
-       return 0;
-
-err_out:
-       return -ENOMEM;
-}
-late_initcall(severities_debugfs_init);
-#endif /* CONFIG_DEBUG_FS */
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
deleted file mode 100644 (file)
index 36d2696..0000000
+++ /dev/null
@@ -1,2499 +0,0 @@
-/*
- * Machine check handler.
- *
- * K8 parts Copyright 2002,2003 Andi Kleen, SuSE Labs.
- * Rest from unknown author(s).
- * 2004 Andi Kleen. Rewrote most of it.
- * Copyright 2008 Intel Corporation
- * Author: Andi Kleen
- */
-
-#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
-
-#include <linux/thread_info.h>
-#include <linux/capability.h>
-#include <linux/miscdevice.h>
-#include <linux/ratelimit.h>
-#include <linux/rcupdate.h>
-#include <linux/kobject.h>
-#include <linux/uaccess.h>
-#include <linux/kdebug.h>
-#include <linux/kernel.h>
-#include <linux/percpu.h>
-#include <linux/string.h>
-#include <linux/device.h>
-#include <linux/syscore_ops.h>
-#include <linux/delay.h>
-#include <linux/ctype.h>
-#include <linux/sched.h>
-#include <linux/sysfs.h>
-#include <linux/types.h>
-#include <linux/slab.h>
-#include <linux/init.h>
-#include <linux/kmod.h>
-#include <linux/poll.h>
-#include <linux/nmi.h>
-#include <linux/cpu.h>
-#include <linux/ras.h>
-#include <linux/smp.h>
-#include <linux/fs.h>
-#include <linux/mm.h>
-#include <linux/debugfs.h>
-#include <linux/irq_work.h>
-#include <linux/export.h>
-#include <linux/jump_label.h>
-#include <linux/set_memory.h>
-
-#include <asm/intel-family.h>
-#include <asm/processor.h>
-#include <asm/traps.h>
-#include <asm/tlbflush.h>
-#include <asm/mce.h>
-#include <asm/msr.h>
-#include <asm/reboot.h>
-
-#include "mce-internal.h"
-
-static DEFINE_MUTEX(mce_log_mutex);
-
-/* sysfs synchronization */
-static DEFINE_MUTEX(mce_sysfs_mutex);
-
-#define CREATE_TRACE_POINTS
-#include <trace/events/mce.h>
-
-#define SPINUNIT               100     /* 100ns */
-
-DEFINE_PER_CPU(unsigned, mce_exception_count);
-
-struct mce_bank *mce_banks __read_mostly;
-struct mce_vendor_flags mce_flags __read_mostly;
-
-struct mca_config mca_cfg __read_mostly = {
-       .bootlog  = -1,
-       /*
-        * Tolerant levels:
-        * 0: always panic on uncorrected errors, log corrected errors
-        * 1: panic or SIGBUS on uncorrected errors, log corrected errors
-        * 2: SIGBUS or log uncorrected errors (if possible), log corr. errors
-        * 3: never panic or SIGBUS, log all errors (for testing only)
-        */
-       .tolerant = 1,
-       .monarch_timeout = -1
-};
-
-static DEFINE_PER_CPU(struct mce, mces_seen);
-static unsigned long mce_need_notify;
-static int cpu_missing;
-
-/*
- * MCA banks polled by the periodic polling timer for corrected events.
- * With Intel CMCI, this only has MCA banks which do not support CMCI (if any).
- */
-DEFINE_PER_CPU(mce_banks_t, mce_poll_banks) = {
-       [0 ... BITS_TO_LONGS(MAX_NR_BANKS)-1] = ~0UL
-};
-
-/*
- * MCA banks controlled through firmware first for corrected errors.
- * This is a global list of banks for which we won't enable CMCI and we
- * won't poll. Firmware controls these banks and is responsible for
- * reporting corrected errors through GHES. Uncorrected/recoverable
- * errors are still notified through a machine check.
- */
-mce_banks_t mce_banks_ce_disabled;
-
-static struct work_struct mce_work;
-static struct irq_work mce_irq_work;
-
-static void (*quirk_no_way_out)(int bank, struct mce *m, struct pt_regs *regs);
-
-/*
- * CPU/chipset specific EDAC code can register a notifier call here to print
- * MCE errors in a human-readable form.
- */
-BLOCKING_NOTIFIER_HEAD(x86_mce_decoder_chain);
-
-/* Do initial initialization of a struct mce */
-void mce_setup(struct mce *m)
-{
-       memset(m, 0, sizeof(struct mce));
-       m->cpu = m->extcpu = smp_processor_id();
-       /* need the internal __ version to avoid deadlocks */
-       m->time = __ktime_get_real_seconds();
-       m->cpuvendor = boot_cpu_data.x86_vendor;
-       m->cpuid = cpuid_eax(1);
-       m->socketid = cpu_data(m->extcpu).phys_proc_id;
-       m->apicid = cpu_data(m->extcpu).initial_apicid;
-       rdmsrl(MSR_IA32_MCG_CAP, m->mcgcap);
-
-       if (this_cpu_has(X86_FEATURE_INTEL_PPIN))
-               rdmsrl(MSR_PPIN, m->ppin);
-
-       m->microcode = boot_cpu_data.microcode;
-}
-
-DEFINE_PER_CPU(struct mce, injectm);
-EXPORT_PER_CPU_SYMBOL_GPL(injectm);
-
-void mce_log(struct mce *m)
-{
-       if (!mce_gen_pool_add(m))
-               irq_work_queue(&mce_irq_work);
-}
-
-void mce_inject_log(struct mce *m)
-{
-       mutex_lock(&mce_log_mutex);
-       mce_log(m);
-       mutex_unlock(&mce_log_mutex);
-}
-EXPORT_SYMBOL_GPL(mce_inject_log);
-
-static struct notifier_block mce_srao_nb;
-
-/*
- * We run the default notifier only when the SRAO, the first and the default
- * notifiers are the only ones registered, i.e., only the mandatory
- * NUM_DEFAULT_NOTIFIERS notifiers are on the chain.
- */
-#define NUM_DEFAULT_NOTIFIERS  3
-static atomic_t num_notifiers;
-
-void mce_register_decode_chain(struct notifier_block *nb)
-{
-       if (WARN_ON(nb->priority > MCE_PRIO_MCELOG && nb->priority < MCE_PRIO_EDAC))
-               return;
-
-       atomic_inc(&num_notifiers);
-
-       blocking_notifier_chain_register(&x86_mce_decoder_chain, nb);
-}
-EXPORT_SYMBOL_GPL(mce_register_decode_chain);
-
-void mce_unregister_decode_chain(struct notifier_block *nb)
-{
-       atomic_dec(&num_notifiers);
-
-       blocking_notifier_chain_unregister(&x86_mce_decoder_chain, nb);
-}
-EXPORT_SYMBOL_GPL(mce_unregister_decode_chain);
-
-static inline u32 ctl_reg(int bank)
-{
-       return MSR_IA32_MCx_CTL(bank);
-}
-
-static inline u32 status_reg(int bank)
-{
-       return MSR_IA32_MCx_STATUS(bank);
-}
-
-static inline u32 addr_reg(int bank)
-{
-       return MSR_IA32_MCx_ADDR(bank);
-}
-
-static inline u32 misc_reg(int bank)
-{
-       return MSR_IA32_MCx_MISC(bank);
-}
-
-static inline u32 smca_ctl_reg(int bank)
-{
-       return MSR_AMD64_SMCA_MCx_CTL(bank);
-}
-
-static inline u32 smca_status_reg(int bank)
-{
-       return MSR_AMD64_SMCA_MCx_STATUS(bank);
-}
-
-static inline u32 smca_addr_reg(int bank)
-{
-       return MSR_AMD64_SMCA_MCx_ADDR(bank);
-}
-
-static inline u32 smca_misc_reg(int bank)
-{
-       return MSR_AMD64_SMCA_MCx_MISC(bank);
-}
-
-struct mca_msr_regs msr_ops = {
-       .ctl    = ctl_reg,
-       .status = status_reg,
-       .addr   = addr_reg,
-       .misc   = misc_reg
-};
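
The indirection through msr_ops exists so that, on AMD systems with Scalable MCA, the bank register accessors can be switched to the SMCA MSR space at init time. Roughly, a sketch of what the early-init code later in this file does when mce_flags.smca is set:

	if (mce_flags.smca) {
		msr_ops.ctl    = smca_ctl_reg;
		msr_ops.status = smca_status_reg;
		msr_ops.addr   = smca_addr_reg;
		msr_ops.misc   = smca_misc_reg;
	}
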
-
-static void __print_mce(struct mce *m)
-{
-       pr_emerg(HW_ERR "CPU %d: Machine Check%s: %Lx Bank %d: %016Lx\n",
-                m->extcpu,
-                (m->mcgstatus & MCG_STATUS_MCIP ? " Exception" : ""),
-                m->mcgstatus, m->bank, m->status);
-
-       if (m->ip) {
-               pr_emerg(HW_ERR "RIP%s %02x:<%016Lx> ",
-                       !(m->mcgstatus & MCG_STATUS_EIPV) ? " !INEXACT!" : "",
-                       m->cs, m->ip);
-
-               if (m->cs == __KERNEL_CS)
-                       pr_cont("{%pS}", (void *)(unsigned long)m->ip);
-               pr_cont("\n");
-       }
-
-       pr_emerg(HW_ERR "TSC %llx ", m->tsc);
-       if (m->addr)
-               pr_cont("ADDR %llx ", m->addr);
-       if (m->misc)
-               pr_cont("MISC %llx ", m->misc);
-
-       if (mce_flags.smca) {
-               if (m->synd)
-                       pr_cont("SYND %llx ", m->synd);
-               if (m->ipid)
-                       pr_cont("IPID %llx ", m->ipid);
-       }
-
-       pr_cont("\n");
-       /*
-        * Note this output is parsed by external tools and old fields
-        * should not be changed.
-        */
-       pr_emerg(HW_ERR "PROCESSOR %u:%x TIME %llu SOCKET %u APIC %x microcode %x\n",
-               m->cpuvendor, m->cpuid, m->time, m->socketid, m->apicid,
-               m->microcode);
-}
-
-static void print_mce(struct mce *m)
-{
-       __print_mce(m);
-
-       if (m->cpuvendor != X86_VENDOR_AMD && m->cpuvendor != X86_VENDOR_HYGON)
-               pr_emerg_ratelimited(HW_ERR "Run the above through 'mcelog --ascii'\n");
-}
-
-#define PANIC_TIMEOUT 5 /* 5 seconds */
-
-static atomic_t mce_panicked;
-
-static int fake_panic;
-static atomic_t mce_fake_panicked;
-
-/* Panic in progress. Enable interrupts and wait for final IPI */
-static void wait_for_panic(void)
-{
-       long timeout = PANIC_TIMEOUT*USEC_PER_SEC;
-
-       preempt_disable();
-       local_irq_enable();
-       while (timeout-- > 0)
-               udelay(1);
-       if (panic_timeout == 0)
-               panic_timeout = mca_cfg.panic_timeout;
-       panic("Panicing machine check CPU died");
-}
-
-static void mce_panic(const char *msg, struct mce *final, char *exp)
-{
-       int apei_err = 0;
-       struct llist_node *pending;
-       struct mce_evt_llist *l;
-
-       if (!fake_panic) {
-               /*
-                * Make sure only one CPU runs in machine check panic
-                */
-               if (atomic_inc_return(&mce_panicked) > 1)
-                       wait_for_panic();
-               barrier();
-
-               bust_spinlocks(1);
-               console_verbose();
-       } else {
-               /* Don't log too much for fake panic */
-               if (atomic_inc_return(&mce_fake_panicked) > 1)
-                       return;
-       }
-       pending = mce_gen_pool_prepare_records();
-       /* First print corrected ones that are still unlogged */
-       llist_for_each_entry(l, pending, llnode) {
-               struct mce *m = &l->mce;
-               if (!(m->status & MCI_STATUS_UC)) {
-                       print_mce(m);
-                       if (!apei_err)
-                               apei_err = apei_write_mce(m);
-               }
-       }
-       /* Now print uncorrected but with the final one last */
-       llist_for_each_entry(l, pending, llnode) {
-               struct mce *m = &l->mce;
-               if (!(m->status & MCI_STATUS_UC))
-                       continue;
-               if (!final || mce_cmp(m, final)) {
-                       print_mce(m);
-                       if (!apei_err)
-                               apei_err = apei_write_mce(m);
-               }
-       }
-       if (final) {
-               print_mce(final);
-               if (!apei_err)
-                       apei_err = apei_write_mce(final);
-       }
-       if (cpu_missing)
-               pr_emerg(HW_ERR "Some CPUs didn't answer in synchronization\n");
-       if (exp)
-               pr_emerg(HW_ERR "Machine check: %s\n", exp);
-       if (!fake_panic) {
-               if (panic_timeout == 0)
-                       panic_timeout = mca_cfg.panic_timeout;
-               panic(msg);
-       } else
-               pr_emerg(HW_ERR "Fake kernel panic: %s\n", msg);
-}
-
-/* Support code for software error injection */
-
-static int msr_to_offset(u32 msr)
-{
-       unsigned bank = __this_cpu_read(injectm.bank);
-
-       if (msr == mca_cfg.rip_msr)
-               return offsetof(struct mce, ip);
-       if (msr == msr_ops.status(bank))
-               return offsetof(struct mce, status);
-       if (msr == msr_ops.addr(bank))
-               return offsetof(struct mce, addr);
-       if (msr == msr_ops.misc(bank))
-               return offsetof(struct mce, misc);
-       if (msr == MSR_IA32_MCG_STATUS)
-               return offsetof(struct mce, mcgstatus);
-       return -1;
-}
-
-/* MSR access wrappers used for error injection */
-static u64 mce_rdmsrl(u32 msr)
-{
-       u64 v;
-
-       if (__this_cpu_read(injectm.finished)) {
-               int offset = msr_to_offset(msr);
-
-               if (offset < 0)
-                       return 0;
-               return *(u64 *)((char *)this_cpu_ptr(&injectm) + offset);
-       }
-
-       if (rdmsrl_safe(msr, &v)) {
-               WARN_ONCE(1, "mce: Unable to read MSR 0x%x!\n", msr);
-               /*
-                * Return zero in case the access faulted. This should
-                * not happen normally but can happen if the CPU does
-                * something weird, or if the code is buggy.
-                */
-               v = 0;
-       }
-
-       return v;
-}
-
-static void mce_wrmsrl(u32 msr, u64 v)
-{
-       if (__this_cpu_read(injectm.finished)) {
-               int offset = msr_to_offset(msr);
-
-               if (offset >= 0)
-                       *(u64 *)((char *)this_cpu_ptr(&injectm) + offset) = v;
-               return;
-       }
-       wrmsrl(msr, v);
-}
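
These wrappers are what makes software error injection work: the injector fills the per-CPU injectm record and marks it finished, after which reads and writes of the bank MSRs on that CPU are redirected to the struct fields instead of real hardware. Roughly (a sketch of an injection call site, not a verbatim copy of inject.c):

	struct mce *i = &per_cpu(injectm, m->extcpu);

	memcpy(i, m, sizeof(struct mce));
	i->finished = 1;	/* from now on, mce_rdmsrl()/mce_wrmsrl() hit injectm */
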
-
-/*
- * Collect all global (w.r.t. this processor) status about this machine
- * check into our "mce" struct so that we can use it later to assess
- * the severity of the problem as we read per-bank specific details.
- */
-static inline void mce_gather_info(struct mce *m, struct pt_regs *regs)
-{
-       mce_setup(m);
-
-       m->mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS);
-       if (regs) {
-               /*
-                * Get the address of the instruction at the time of
-                * the machine check error.
-                */
-               if (m->mcgstatus & (MCG_STATUS_RIPV|MCG_STATUS_EIPV)) {
-                       m->ip = regs->ip;
-                       m->cs = regs->cs;
-
-                       /*
-                        * When in VM86 mode make the cs look like ring 3
-                        * always. This is a lie, but it's better than passing
-                        * the additional vm86 bit around everywhere.
-                        */
-                       if (v8086_mode(regs))
-                               m->cs |= 3;
-               }
-               /* Use accurate RIP reporting if available. */
-               if (mca_cfg.rip_msr)
-                       m->ip = mce_rdmsrl(mca_cfg.rip_msr);
-       }
-}
-
-int mce_available(struct cpuinfo_x86 *c)
-{
-       if (mca_cfg.disabled)
-               return 0;
-       return cpu_has(c, X86_FEATURE_MCE) && cpu_has(c, X86_FEATURE_MCA);
-}
-
-static void mce_schedule_work(void)
-{
-       if (!mce_gen_pool_empty())
-               schedule_work(&mce_work);
-}
-
-static void mce_irq_work_cb(struct irq_work *entry)
-{
-       mce_schedule_work();
-}
-
-static void mce_report_event(struct pt_regs *regs)
-{
-       if (regs->flags & (X86_VM_MASK|X86_EFLAGS_IF)) {
-               mce_notify_irq();
-               /*
-                * Triggering the work queue here is just an insurance
-                * policy in case the syscall exit notify handler
-                * doesn't run soon enough or ends up running on the
-                * wrong CPU (can happen when audit sleeps)
-                */
-               mce_schedule_work();
-               return;
-       }
-
-       irq_work_queue(&mce_irq_work);
-}
-
-/*
- * Check if the address reported by the CPU is in a format we can parse.
- * It would be possible to add code for most other cases, but all would
- * be somewhat complicated (e.g. segment offset would require an instruction
- * parser). So only support physical addresses up to page granularity for now.
- */
-int mce_usable_address(struct mce *m)
-{
-       if (!(m->status & MCI_STATUS_ADDRV))
-               return 0;
-
-       /* Checks after this one are Intel-specific: */
-       if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL)
-               return 1;
-
-       if (!(m->status & MCI_STATUS_MISCV))
-               return 0;
-
-       if (MCI_MISC_ADDR_LSB(m->misc) > PAGE_SHIFT)
-               return 0;
-
-       if (MCI_MISC_ADDR_MODE(m->misc) != MCI_MISC_ADDR_PHYS)
-               return 0;
-
-       return 1;
-}
-EXPORT_SYMBOL_GPL(mce_usable_address);
-
-bool mce_is_memory_error(struct mce *m)
-{
-       if (m->cpuvendor == X86_VENDOR_AMD ||
-           m->cpuvendor == X86_VENDOR_HYGON) {
-               return amd_mce_is_memory_error(m);
-       } else if (m->cpuvendor == X86_VENDOR_INTEL) {
-               /*
-                * Intel SDM Volume 3B - 15.9.2 Compound Error Codes
-                *
-                * Bit 7 of the MCACOD field of IA32_MCi_STATUS is used for
-                * indicating a memory error. Bit 8 is used for indicating a
-                * cache hierarchy error. The combination of bit 2 and bit 3
-                * is used for indicating a `generic' cache hierarchy error.
-                * But we can't just blindly check the above bits, because if
-                * bit 11 is set, then it is a bus/interconnect error - and
-                * either way the above bits just give more detail on what
-                * bus/interconnect error happened. Note that bit 12 can be
-                * ignored, as it's the "filter" bit.
-                */
-               return (m->status & 0xef80) == BIT(7) ||
-                      (m->status & 0xef00) == BIT(8) ||
-                      (m->status & 0xeffc) == 0xc;
-       }
-
-       return false;
-}
-EXPORT_SYMBOL_GPL(mce_is_memory_error);
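
A standalone sketch of the Intel branch above, with a few worked MCACOD values; the labels follow the compound error code formats in the SDM and are illustrative assumptions rather than a decode table:

#include <assert.h>
#include <stdbool.h>
#include <stdint.h>

static bool intel_mcacod_is_memory(uint64_t status)
{
	return (status & 0xef80) == (1ULL << 7) ||	/* memory controller errors */
	       (status & 0xef00) == (1ULL << 8) ||	/* cache hierarchy errors   */
	       (status & 0xeffc) == 0xc;		/* generic cache hierarchy  */
}

int main(void)
{
	assert(intel_mcacod_is_memory(0x0090));		/* memory read error, channel 0 */
	assert(intel_mcacod_is_memory(0x0110));		/* a cache hierarchy error code */
	assert(!intel_mcacod_is_memory(0x0884));	/* bit 11 set: bus/interconnect */
	return 0;
}
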
-
-bool mce_is_correctable(struct mce *m)
-{
-       if (m->cpuvendor == X86_VENDOR_AMD && m->status & MCI_STATUS_DEFERRED)
-               return false;
-
-       if (m->cpuvendor == X86_VENDOR_HYGON && m->status & MCI_STATUS_DEFERRED)
-               return false;
-
-       if (m->status & MCI_STATUS_UC)
-               return false;
-
-       return true;
-}
-EXPORT_SYMBOL_GPL(mce_is_correctable);
-
-static bool cec_add_mce(struct mce *m)
-{
-       if (!m)
-               return false;
-
-       /* We eat only correctable DRAM errors with usable addresses. */
-       if (mce_is_memory_error(m) &&
-           mce_is_correctable(m)  &&
-           mce_usable_address(m))
-               if (!cec_add_elem(m->addr >> PAGE_SHIFT))
-                       return true;
-
-       return false;
-}
-
-static int mce_first_notifier(struct notifier_block *nb, unsigned long val,
-                             void *data)
-{
-       struct mce *m = (struct mce *)data;
-
-       if (!m)
-               return NOTIFY_DONE;
-
-       if (cec_add_mce(m))
-               return NOTIFY_STOP;
-
-       /* Emit the trace record: */
-       trace_mce_record(m);
-
-       set_bit(0, &mce_need_notify);
-
-       mce_notify_irq();
-
-       return NOTIFY_DONE;
-}
-
-static struct notifier_block first_nb = {
-       .notifier_call  = mce_first_notifier,
-       .priority       = MCE_PRIO_FIRST,
-};
-
-static int srao_decode_notifier(struct notifier_block *nb, unsigned long val,
-                               void *data)
-{
-       struct mce *mce = (struct mce *)data;
-       unsigned long pfn;
-
-       if (!mce)
-               return NOTIFY_DONE;
-
-       if (mce_usable_address(mce) && (mce->severity == MCE_AO_SEVERITY)) {
-               pfn = mce->addr >> PAGE_SHIFT;
-               if (!memory_failure(pfn, 0))
-                       set_mce_nospec(pfn);
-       }
-
-       return NOTIFY_OK;
-}
-static struct notifier_block mce_srao_nb = {
-       .notifier_call  = srao_decode_notifier,
-       .priority       = MCE_PRIO_SRAO,
-};
-
-static int mce_default_notifier(struct notifier_block *nb, unsigned long val,
-                               void *data)
-{
-       struct mce *m = (struct mce *)data;
-
-       if (!m)
-               return NOTIFY_DONE;
-
-       if (atomic_read(&num_notifiers) > NUM_DEFAULT_NOTIFIERS)
-               return NOTIFY_DONE;
-
-       __print_mce(m);
-
-       return NOTIFY_DONE;
-}
-
-static struct notifier_block mce_default_nb = {
-       .notifier_call  = mce_default_notifier,
-       /* lowest prio, we want it to run last. */
-       .priority       = MCE_PRIO_LOWEST,
-};
-
-/*
- * Read ADDR and MISC registers.
- */
-static void mce_read_aux(struct mce *m, int i)
-{
-       if (m->status & MCI_STATUS_MISCV)
-               m->misc = mce_rdmsrl(msr_ops.misc(i));
-
-       if (m->status & MCI_STATUS_ADDRV) {
-               m->addr = mce_rdmsrl(msr_ops.addr(i));
-
-               /*
-                * Mask the reported address by the reported granularity.
-                */
-               if (mca_cfg.ser && (m->status & MCI_STATUS_MISCV)) {
-                       u8 shift = MCI_MISC_ADDR_LSB(m->misc);
-                       m->addr >>= shift;
-                       m->addr <<= shift;
-               }
-
-               /*
-                * Extract [55:<lsb>] where lsb is the least significant
-                * *valid* bit of the address bits.
-                */
-               if (mce_flags.smca) {
-                       u8 lsb = (m->addr >> 56) & 0x3f;
-
-                       m->addr &= GENMASK_ULL(55, lsb);
-               }
-       }
-
-       if (mce_flags.smca) {
-               m->ipid = mce_rdmsrl(MSR_AMD64_SMCA_MCx_IPID(i));
-
-               if (m->status & MCI_STATUS_SYNDV)
-                       m->synd = mce_rdmsrl(MSR_AMD64_SMCA_MCx_SYND(i));
-       }
-}
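
A worked example of the granularity masking above, as a standalone sketch: if MCI_MISC reports an address LSB of 12, only a 4 KiB granule of the address is meaningful, so the low 12 bits are cleared:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t addr = 0x12345678abcULL;
	uint8_t shift = 12;			/* MCI_MISC_ADDR_LSB(m->misc) */

	addr >>= shift;
	addr <<= shift;
	printf("%#llx\n", (unsigned long long)addr);	/* 0x12345678000 */
	return 0;
}
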
-
-DEFINE_PER_CPU(unsigned, mce_poll_count);
-
-/*
- * Poll for corrected events or events that happened before reset.
- * Those are just logged through /dev/mcelog.
- *
- * This is executed in standard interrupt context.
- *
- * Note: the spec recommends panicking for fatal unsignalled
- * errors here. However this would be quite problematic --
- * we would need to reimplement the Monarch handling and
- * it would mess up the exclusion between the exception handler
- * and the poll handler -- so we skip this for now.
- * These cases should not happen anyway, or only when the CPU
- * is already totally confused. In this case it's likely it will
- * not fully execute the machine check handler either.
- */
-bool machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
-{
-       bool error_seen = false;
-       struct mce m;
-       int i;
-
-       this_cpu_inc(mce_poll_count);
-
-       mce_gather_info(&m, NULL);
-
-       if (flags & MCP_TIMESTAMP)
-               m.tsc = rdtsc();
-
-       for (i = 0; i < mca_cfg.banks; i++) {
-               if (!mce_banks[i].ctl || !test_bit(i, *b))
-                       continue;
-
-               m.misc = 0;
-               m.addr = 0;
-               m.bank = i;
-
-               barrier();
-               m.status = mce_rdmsrl(msr_ops.status(i));
-               if (!(m.status & MCI_STATUS_VAL))
-                       continue;
-
-               /*
-                * Uncorrected or signalled events are handled by the exception
-                * handler when it is enabled, so don't process those here.
-                *
-                * TBD do the same check for MCI_STATUS_EN here?
-                */
-               if (!(flags & MCP_UC) &&
-                   (m.status & (mca_cfg.ser ? MCI_STATUS_S : MCI_STATUS_UC)))
-                       continue;
-
-               error_seen = true;
-
-               mce_read_aux(&m, i);
-
-               m.severity = mce_severity(&m, mca_cfg.tolerant, NULL, false);
-
-               /*
-                * Don't get the IP here because it's unlikely to
-                * have anything to do with the actual error location.
-                */
-               if (!(flags & MCP_DONTLOG) && !mca_cfg.dont_log_ce)
-                       mce_log(&m);
-               else if (mce_usable_address(&m)) {
-                       /*
-                        * Although we skipped logging this, we still want
-                        * to take action. Add to the pool so the registered
-                        * notifiers will see it.
-                        */
-                       if (!mce_gen_pool_add(&m))
-                               mce_schedule_work();
-               }
-
-               /*
-                * Clear state for this bank.
-                */
-               mce_wrmsrl(msr_ops.status(i), 0);
-       }
-
-       /*
-        * Don't clear MCG_STATUS here because it's only defined for
-        * exceptions.
-        */
-
-       sync_core();
-
-       return error_seen;
-}
-EXPORT_SYMBOL_GPL(machine_check_poll);
-
-/*
- * Do a quick check if any of the events requires a panic.
- * This decides if we keep the events around or clear them.
- */
-static int mce_no_way_out(struct mce *m, char **msg, unsigned long *validp,
-                         struct pt_regs *regs)
-{
-       char *tmp;
-       int i;
-
-       for (i = 0; i < mca_cfg.banks; i++) {
-               m->status = mce_rdmsrl(msr_ops.status(i));
-               if (!(m->status & MCI_STATUS_VAL))
-                       continue;
-
-               __set_bit(i, validp);
-               if (quirk_no_way_out)
-                       quirk_no_way_out(i, m, regs);
-
-               if (mce_severity(m, mca_cfg.tolerant, &tmp, true) >= MCE_PANIC_SEVERITY) {
-                       mce_read_aux(m, i);
-                       *msg = tmp;
-                       return 1;
-               }
-       }
-       return 0;
-}
-
-/*
- * Variable to establish order between CPUs while scanning.
- * Each CPU spins initially until mce_executing equals its number.
- */
-static atomic_t mce_executing;
-
-/*
- * Defines order of CPUs on entry. First CPU becomes Monarch.
- */
-static atomic_t mce_callin;
-
-/*
- * Check if a timeout waiting for other CPUs happened.
- */
-static int mce_timed_out(u64 *t, const char *msg)
-{
-       /*
-        * The others already did panic for some reason.
-        * Bail out like in a timeout.
-        * rmb() to tell the compiler that system_state
-        * might have been modified by someone else.
-        */
-       rmb();
-       if (atomic_read(&mce_panicked))
-               wait_for_panic();
-       if (!mca_cfg.monarch_timeout)
-               goto out;
-       if ((s64)*t < SPINUNIT) {
-               if (mca_cfg.tolerant <= 1)
-                       mce_panic(msg, NULL, NULL);
-               cpu_missing = 1;
-               return 1;
-       }
-       *t -= SPINUNIT;
-out:
-       touch_nmi_watchdog();
-       return 0;
-}
-
-/*
- * The Monarch's reign.  The Monarch is the CPU who entered
- * the machine check handler first. It waits for the others to
- * raise the exception too and then grades them. If any error is
- * fatal, it panics. Only then does it let the others continue.
- *
- * The other CPUs entering the MCE handler will be controlled by the
- * Monarch. They are called Subjects.
- *
- * This way we prevent any potential data corruption in an unrecoverable case
- * and also make sure that all CPUs' errors are always examined.
- *
- * This also detects the case of a machine check event coming from outer
- * space (not detected by any CPU). In this case some external agent wants
- * us to shut down, so panic too.
- *
- * The other CPUs might still decide to panic if the handler happens
- * in an unrecoverable place, but in this case the system is in a semi-stable
- * state and won't corrupt anything by itself. It's OK to let the others
- * continue for a bit first.
- *
- * All the spin loops have timeouts; when a timeout happens a CPU
- * typically elects itself to be Monarch.
- */
-static void mce_reign(void)
-{
-       int cpu;
-       struct mce *m = NULL;
-       int global_worst = 0;
-       char *msg = NULL;
-       char *nmsg = NULL;
-
-       /*
-        * This CPU is the Monarch and the other CPUs have run
-        * through their handlers.
-        * Grade the severity of the errors of all the CPUs.
-        */
-       for_each_possible_cpu(cpu) {
-               int severity = mce_severity(&per_cpu(mces_seen, cpu),
-                                           mca_cfg.tolerant,
-                                           &nmsg, true);
-               if (severity > global_worst) {
-                       msg = nmsg;
-                       global_worst = severity;
-                       m = &per_cpu(mces_seen, cpu);
-               }
-       }
-
-       /*
-        * Cannot recover? Panic here then.
-        * This dumps all the mces in the log buffer and stops the
-        * other CPUs.
-        */
-       if (m && global_worst >= MCE_PANIC_SEVERITY && mca_cfg.tolerant < 3)
-               mce_panic("Fatal machine check", m, msg);
-
-       /*
-        * For a UC error somewhere, we let the CPU that detects it handle it.
-        * We must also let the others continue, otherwise the handling
-        * CPU could deadlock on a lock.
-        */
-
-       /*
-        * No machine check event found. Must be some external
-        * source or one CPU is hung. Panic.
-        */
-       if (global_worst <= MCE_KEEP_SEVERITY && mca_cfg.tolerant < 3)
-               mce_panic("Fatal machine check from unknown source", NULL, NULL);
-
-       /*
-        * Now clear all the mces_seen so that they don't reappear on
-        * the next mce.
-        */
-       for_each_possible_cpu(cpu)
-               memset(&per_cpu(mces_seen, cpu), 0, sizeof(struct mce));
-}
-
-static atomic_t global_nwo;
-
-/*
- * Start of Monarch synchronization. This waits until all CPUs have
- * entered the exception handler and then determines if any of them
- * saw a fatal event that requires a panic. Then the CPUs execute,
- * one by one, in the entry order.
- * TBD double check parallel CPU hotunplug
- */
-static int mce_start(int *no_way_out)
-{
-       int order;
-       int cpus = num_online_cpus();
-       u64 timeout = (u64)mca_cfg.monarch_timeout * NSEC_PER_USEC;
-
-       if (!timeout)
-               return -1;
-
-       atomic_add(*no_way_out, &global_nwo);
-       /*
-        * Rely on the implied barrier below, such that global_nwo
-        * is updated before mce_callin.
-        */
-       order = atomic_inc_return(&mce_callin);
-
-       /*
-        * Wait for everyone.
-        */
-       while (atomic_read(&mce_callin) != cpus) {
-               if (mce_timed_out(&timeout,
-                                 "Timeout: Not all CPUs entered broadcast exception handler")) {
-                       atomic_set(&global_nwo, 0);
-                       return -1;
-               }
-               ndelay(SPINUNIT);
-       }
-
-       /*
-        * mce_callin should be read before global_nwo
-        */
-       smp_rmb();
-
-       if (order == 1) {
-               /*
-                * Monarch: Starts executing now, the others wait.
-                */
-               atomic_set(&mce_executing, 1);
-       } else {
-               /*
-                * Subject: Now start the scanning loop one by one in
-                * the original callin order.
-                * This way, any shared bank is seen by only one CPU
-                * before it is cleared, avoiding duplicates.
-                */
-               while (atomic_read(&mce_executing) < order) {
-                       if (mce_timed_out(&timeout,
-                                         "Timeout: Subject CPUs unable to finish machine check processing")) {
-                               atomic_set(&global_nwo, 0);
-                               return -1;
-                       }
-                       ndelay(SPINUNIT);
-               }
-       }
-
-       /*
-        * Cache the global no_way_out state.
-        */
-       *no_way_out = atomic_read(&global_nwo);
-
-       return order;
-}
-
-/*
- * Synchronize between CPUs after main scanning loop.
- * This invokes the bulk of the Monarch processing.
- */
-static int mce_end(int order)
-{
-       int ret = -1;
-       u64 timeout = (u64)mca_cfg.monarch_timeout * NSEC_PER_USEC;
-
-       if (!timeout)
-               goto reset;
-       if (order < 0)
-               goto reset;
-
-       /*
-        * Allow others to run.
-        */
-       atomic_inc(&mce_executing);
-
-       if (order == 1) {
-               /* CHECKME: Can this race with a parallel hotplug? */
-               int cpus = num_online_cpus();
-
-               /*
-                * Monarch: Wait for everyone to go through their scanning
-                * loops.
-                */
-               while (atomic_read(&mce_executing) <= cpus) {
-                       if (mce_timed_out(&timeout,
-                                         "Timeout: Monarch CPU unable to finish machine check processing"))
-                               goto reset;
-                       ndelay(SPINUNIT);
-               }
-
-               mce_reign();
-               barrier();
-               ret = 0;
-       } else {
-               /*
-                * Subject: Wait for Monarch to finish.
-                */
-               while (atomic_read(&mce_executing) != 0) {
-                       if (mce_timed_out(&timeout,
-                                         "Timeout: Monarch CPU did not finish machine check processing"))
-                               goto reset;
-                       ndelay(SPINUNIT);
-               }
-
-               /*
-                * Don't reset anything. That's done by the Monarch.
-                */
-               return 0;
-       }
-
-       /*
-        * Reset all global state.
-        */
-reset:
-       atomic_set(&global_nwo, 0);
-       atomic_set(&mce_callin, 0);
-       barrier();
-
-       /*
-        * Let others run again.
-        */
-       atomic_set(&mce_executing, 0);
-       return ret;
-}
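
The two functions above implement the Monarch/Subject rendezvous. As a standalone illustration of the pattern (C11 atomics and threads, not kernel code, and without the timeout handling): each CPU takes a ticket from a callin counter, everyone waits until all have arrived, and then the bank scanning runs one CPU at a time in ticket order:

#include <stdatomic.h>
#include <stdio.h>
#include <threads.h>

#define NCPUS 4

static atomic_int callin, executing;

static int cpu_thread(void *arg)
{
	int order = atomic_fetch_add(&callin, 1) + 1;	/* 1-based callin ticket */

	(void)arg;

	while (atomic_load(&callin) != NCPUS)		/* wait for everyone     */
		thrd_yield();

	while (atomic_load(&executing) < order - 1)	/* run in callin order   */
		thrd_yield();

	printf("ticket %d: scanning banks\n", order);
	atomic_fetch_add(&executing, 1);		/* let the next CPU go   */
	return 0;
}

int main(void)
{
	thrd_t t[NCPUS];
	int i;

	for (i = 0; i < NCPUS; i++)
		thrd_create(&t[i], cpu_thread, NULL);
	for (i = 0; i < NCPUS; i++)
		thrd_join(t[i], NULL);
	return 0;
}
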
-
-static void mce_clear_state(unsigned long *toclear)
-{
-       int i;
-
-       for (i = 0; i < mca_cfg.banks; i++) {
-               if (test_bit(i, toclear))
-                       mce_wrmsrl(msr_ops.status(i), 0);
-       }
-}
-
-static int do_memory_failure(struct mce *m)
-{
-       int flags = MF_ACTION_REQUIRED;
-       int ret;
-
-       pr_err("Uncorrected hardware memory error in user-access at %llx", m->addr);
-       if (!(m->mcgstatus & MCG_STATUS_RIPV))
-               flags |= MF_MUST_KILL;
-       ret = memory_failure(m->addr >> PAGE_SHIFT, flags);
-       if (ret)
-               pr_err("Memory error not recovered");
-       else
-               set_mce_nospec(m->addr >> PAGE_SHIFT);
-       return ret;
-}
-
-
-/*
- * Cases where we avoid rendezvous handler timeout:
- * 1) If this CPU is offline.
- *
- * 2) If crashing_cpu was set, e.g. we're entering kdump and we need to
- *  skip those CPUs which remain looping in the 1st kernel - see
- *  crash_nmi_callback().
- *
- * Note: there still is a small window between kexec-ing and the new,
- * kdump kernel establishing a new #MC handler where a broadcasted MCE
- * might not get handled properly.
- */
-static bool __mc_check_crashing_cpu(int cpu)
-{
-       if (cpu_is_offline(cpu) ||
-           (crashing_cpu != -1 && crashing_cpu != cpu)) {
-               u64 mcgstatus;
-
-               mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS);
-               if (mcgstatus & MCG_STATUS_RIPV) {
-                       mce_wrmsrl(MSR_IA32_MCG_STATUS, 0);
-                       return true;
-               }
-       }
-       return false;
-}
-
-static void __mc_scan_banks(struct mce *m, struct mce *final,
-                           unsigned long *toclear, unsigned long *valid_banks,
-                           int no_way_out, int *worst)
-{
-       struct mca_config *cfg = &mca_cfg;
-       int severity, i;
-
-       for (i = 0; i < cfg->banks; i++) {
-               __clear_bit(i, toclear);
-               if (!test_bit(i, valid_banks))
-                       continue;
-
-               if (!mce_banks[i].ctl)
-                       continue;
-
-               m->misc = 0;
-               m->addr = 0;
-               m->bank = i;
-
-               m->status = mce_rdmsrl(msr_ops.status(i));
-               if (!(m->status & MCI_STATUS_VAL))
-                       continue;
-
-               /*
-                * Corrected or non-signaled errors are handled by
-                * machine_check_poll(). Leave them alone, unless this panics.
-                */
-               if (!(m->status & (cfg->ser ? MCI_STATUS_S : MCI_STATUS_UC)) &&
-                       !no_way_out)
-                       continue;
-
-               /* Set taint even when machine check was not enabled. */
-               add_taint(TAINT_MACHINE_CHECK, LOCKDEP_NOW_UNRELIABLE);
-
-               severity = mce_severity(m, cfg->tolerant, NULL, true);
-
-               /*
-                * When the machine check was for a corrected/deferred error,
-                * don't touch it here unless we're panicking.
-                */
-               if ((severity == MCE_KEEP_SEVERITY ||
-                    severity == MCE_UCNA_SEVERITY) && !no_way_out)
-                       continue;
-
-               __set_bit(i, toclear);
-
-               /* Machine check event was not enabled. Clear, but ignore. */
-               if (severity == MCE_NO_SEVERITY)
-                       continue;
-
-               mce_read_aux(m, i);
-
-               /* assuming valid severity level != 0 */
-               m->severity = severity;
-
-               mce_log(m);
-
-               if (severity > *worst) {
-                       *final = *m;
-                       *worst = severity;
-               }
-       }
-
-       /* mce_clear_state will clear *final, save locally for use later */
-       *m = *final;
-}
-
-/*
- * The actual machine check handler. This only handles real
- * exceptions when something got corrupted coming in through int 18.
- *
- * This is executed in NMI context not subject to normal locking rules. This
- * implies that most kernel services cannot be safely used. Don't even
- * think about putting a printk in there!
- *
- * On Intel systems this is entered on all CPUs in parallel through
- * MCE broadcast. However some CPUs might be broken beyond repair,
- * so be always careful when synchronizing with others.
- */
-void do_machine_check(struct pt_regs *regs, long error_code)
-{
-       DECLARE_BITMAP(valid_banks, MAX_NR_BANKS);
-       DECLARE_BITMAP(toclear, MAX_NR_BANKS);
-       struct mca_config *cfg = &mca_cfg;
-       int cpu = smp_processor_id();
-       char *msg = "Unknown";
-       struct mce m, *final;
-       int worst = 0;
-
-       /*
-        * Establish sequential order between the CPUs entering the machine
-        * check handler.
-        */
-       int order = -1;
-
-       /*
-        * If no_way_out gets set, there is no safe way to recover from this
-        * MCE.  If mca_cfg.tolerant is cranked up, we'll try anyway.
-        */
-       int no_way_out = 0;
-
-       /*
-        * If kill_it gets set, recovering from this error requires killing
-        * the affected process.
-        */
-       int kill_it = 0;
-
-       /*
-        * MCEs are always local on AMD. On Intel, whether an MCE is local
-        * is determined by MCG_STATUS_LMCES.
-        */
-       int lmce = 1;
-
-       if (__mc_check_crashing_cpu(cpu))
-               return;
-
-       ist_enter(regs);
-
-       this_cpu_inc(mce_exception_count);
-
-       mce_gather_info(&m, regs);
-       m.tsc = rdtsc();
-
-       final = this_cpu_ptr(&mces_seen);
-       *final = m;
-
-       memset(valid_banks, 0, sizeof(valid_banks));
-       no_way_out = mce_no_way_out(&m, &msg, valid_banks, regs);
-
-       barrier();
-
-       /*
-        * When there is no restart IP we might need to kill the process or panic.
-        * Assume the worst for now, but if we find the
-        * severity is MCE_AR_SEVERITY we have other options.
-        */
-       if (!(m.mcgstatus & MCG_STATUS_RIPV))
-               kill_it = 1;
-
-       /*
-        * Check if this MCE is signaled to only this logical processor,
-        * on Intel only.
-        */
-       if (m.cpuvendor == X86_VENDOR_INTEL)
-               lmce = m.mcgstatus & MCG_STATUS_LMCES;
-
-       /*
-        * A local machine check may already know that we have to panic.
-        * A broadcast machine check begins its rendezvous in mce_start().
-        * Go through all banks in exclusion of the other CPUs. This way we
-        * don't report duplicated events on shared banks because the first one
-        * to see it will clear it.
-        */
-       if (lmce) {
-               if (no_way_out)
-                       mce_panic("Fatal local machine check", &m, msg);
-       } else {
-               order = mce_start(&no_way_out);
-       }
-
-       __mc_scan_banks(&m, final, toclear, valid_banks, no_way_out, &worst);
-
-       if (!no_way_out)
-               mce_clear_state(toclear);
-
-       /*
-        * Do most of the synchronization with other CPUs.
-        * When there's any problem use only local no_way_out state.
-        */
-       if (!lmce) {
-               if (mce_end(order) < 0)
-                       no_way_out = worst >= MCE_PANIC_SEVERITY;
-       } else {
-               /*
-                * If there was a fatal machine check we should have
-                * already called mce_panic earlier in this function.
-                * Since we re-read the banks, we might have found
-                * something new. Check again to see if we found a
-                * fatal error. We call "mce_severity()" again to
-                * make sure we have the right "msg".
-                */
-               if (worst >= MCE_PANIC_SEVERITY && mca_cfg.tolerant < 3) {
-                       mce_severity(&m, cfg->tolerant, &msg, true);
-                       mce_panic("Local fatal machine check!", &m, msg);
-               }
-       }
-
-       /*
-        * If tolerant is at an insane level we drop requests to kill
-        * processes and continue even when there is no way out.
-        */
-       if (cfg->tolerant == 3)
-               kill_it = 0;
-       else if (no_way_out)
-               mce_panic("Fatal machine check on current CPU", &m, msg);
-
-       if (worst > 0)
-               mce_report_event(regs);
-       mce_wrmsrl(MSR_IA32_MCG_STATUS, 0);
-
-       sync_core();
-
-       if (worst != MCE_AR_SEVERITY && !kill_it)
-               goto out_ist;
-
-       /* Fault was in user mode and we need to take some action */
-       if ((m.cs & 3) == 3) {
-               ist_begin_non_atomic(regs);
-               local_irq_enable();
-
-               if (kill_it || do_memory_failure(&m))
-                       force_sig(SIGBUS, current);
-               local_irq_disable();
-               ist_end_non_atomic();
-       } else {
-               if (!fixup_exception(regs, X86_TRAP_MC, error_code, 0))
-                       mce_panic("Failed kernel mode recovery", &m, NULL);
-       }
-
-out_ist:
-       ist_exit(regs);
-}
-EXPORT_SYMBOL_GPL(do_machine_check);
-
-#ifndef CONFIG_MEMORY_FAILURE
-int memory_failure(unsigned long pfn, int flags)
-{
-       /* mce_severity() should not hand us an ACTION_REQUIRED error */
-       BUG_ON(flags & MF_ACTION_REQUIRED);
-       pr_err("Uncorrected memory error in page 0x%lx ignored\n"
-              "Rebuild kernel with CONFIG_MEMORY_FAILURE=y for smarter handling\n",
-              pfn);
-
-       return 0;
-}
-#endif
-
-/*
- * Periodic polling timer for "silent" machine check errors.  If the
- * poller finds an MCE, poll 2x faster.  When the poller finds no more
- * errors, poll 2x slower (up to check_interval seconds).
- */
-static unsigned long check_interval = INITIAL_CHECK_INTERVAL;
-
-static DEFINE_PER_CPU(unsigned long, mce_next_interval); /* in jiffies */
-static DEFINE_PER_CPU(struct timer_list, mce_timer);
-
-static unsigned long mce_adjust_timer_default(unsigned long interval)
-{
-       return interval;
-}
-
-static unsigned long (*mce_adjust_timer)(unsigned long interval) = mce_adjust_timer_default;
-
-static void __start_timer(struct timer_list *t, unsigned long interval)
-{
-       unsigned long when = jiffies + interval;
-       unsigned long flags;
-
-       local_irq_save(flags);
-
-       if (!timer_pending(t) || time_before(when, t->expires))
-               mod_timer(t, round_jiffies(when));
-
-       local_irq_restore(flags);
-}
-
-static void mce_timer_fn(struct timer_list *t)
-{
-       struct timer_list *cpu_t = this_cpu_ptr(&mce_timer);
-       unsigned long iv;
-
-       WARN_ON(cpu_t != t);
-
-       iv = __this_cpu_read(mce_next_interval);
-
-       if (mce_available(this_cpu_ptr(&cpu_info))) {
-               machine_check_poll(0, this_cpu_ptr(&mce_poll_banks));
-
-               if (mce_intel_cmci_poll()) {
-                       iv = mce_adjust_timer(iv);
-                       goto done;
-               }
-       }
-
-       /*
-        * Alert userspace if needed. If we logged an MCE, reduce the polling
-        * interval, otherwise increase the polling interval.
-        */
-       if (mce_notify_irq())
-               iv = max(iv / 2, (unsigned long) HZ/100);
-       else
-               iv = min(iv * 2, round_jiffies_relative(check_interval * HZ));
-
-done:
-       __this_cpu_write(mce_next_interval, iv);
-       __start_timer(t, iv);
-}
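
The effect of the halve/double logic, as a standalone sketch assuming HZ of 250 and the default check_interval of 5 minutes: the poll period shrinks toward roughly 10 ms while errors keep being logged and creeps back up to check_interval when the machine is quiet:

#include <stdio.h>

#define HZ 250				/* assumed tick rate for this sketch  */
#define CHECK_INTERVAL 300UL		/* default check_interval, in seconds */

static unsigned long adjust(unsigned long iv, int logged)
{
	unsigned long lo = HZ / 100;		/* ~10 ms floor   */
	unsigned long hi = CHECK_INTERVAL * HZ;	/* 5 min ceiling  */

	if (logged)
		return iv / 2 > lo ? iv / 2 : lo;
	return iv * 2 < hi ? iv * 2 : hi;
}

int main(void)
{
	unsigned long iv = CHECK_INTERVAL * HZ;

	iv = adjust(iv, 1);	/* MCE logged: 5 min -> 2.5 min        */
	iv = adjust(iv, 0);	/* quiet again: back to 5 min (capped) */
	printf("%lu jiffies\n", iv);
	return 0;
}
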
-
-/*
- * Ensure that the timer is firing in @interval from now.
- */
-void mce_timer_kick(unsigned long interval)
-{
-       struct timer_list *t = this_cpu_ptr(&mce_timer);
-       unsigned long iv = __this_cpu_read(mce_next_interval);
-
-       __start_timer(t, interval);
-
-       if (interval < iv)
-               __this_cpu_write(mce_next_interval, interval);
-}
-
-/* Must not be called in IRQ context where del_timer_sync() can deadlock */
-static void mce_timer_delete_all(void)
-{
-       int cpu;
-
-       for_each_online_cpu(cpu)
-               del_timer_sync(&per_cpu(mce_timer, cpu));
-}
-
-/*
- * Notify the user(s) about new machine check events.
- * Can be called from interrupt context, but not from machine check/NMI
- * context.
- */
-int mce_notify_irq(void)
-{
-       /* Not more than two messages every minute */
-       static DEFINE_RATELIMIT_STATE(ratelimit, 60*HZ, 2);
-
-       if (test_and_clear_bit(0, &mce_need_notify)) {
-               mce_work_trigger();
-
-               if (__ratelimit(&ratelimit))
-                       pr_info(HW_ERR "Machine check events logged\n");
-
-               return 1;
-       }
-       return 0;
-}
-EXPORT_SYMBOL_GPL(mce_notify_irq);
-
-static int __mcheck_cpu_mce_banks_init(void)
-{
-       int i;
-       u8 num_banks = mca_cfg.banks;
-
-       mce_banks = kcalloc(num_banks, sizeof(struct mce_bank), GFP_KERNEL);
-       if (!mce_banks)
-               return -ENOMEM;
-
-       for (i = 0; i < num_banks; i++) {
-               struct mce_bank *b = &mce_banks[i];
-
-               b->ctl = -1ULL;
-               b->init = 1;
-       }
-       return 0;
-}
-
-/*
- * Initialize Machine Checks for a CPU.
- */
-static int __mcheck_cpu_cap_init(void)
-{
-       unsigned b;
-       u64 cap;
-
-       rdmsrl(MSR_IA32_MCG_CAP, cap);
-
-       b = cap & MCG_BANKCNT_MASK;
-       if (!mca_cfg.banks)
-               pr_info("CPU supports %d MCE banks\n", b);
-
-       if (b > MAX_NR_BANKS) {
-               pr_warn("Using only %u machine check banks out of %u\n",
-                       MAX_NR_BANKS, b);
-               b = MAX_NR_BANKS;
-       }
-
-       /* Don't support asymmetric configurations today */
-       WARN_ON(mca_cfg.banks != 0 && b != mca_cfg.banks);
-       mca_cfg.banks = b;
-
-       if (!mce_banks) {
-               int err = __mcheck_cpu_mce_banks_init();
-
-               if (err)
-                       return err;
-       }
-
-       /* Use accurate RIP reporting if available. */
-       if ((cap & MCG_EXT_P) && MCG_EXT_CNT(cap) >= 9)
-               mca_cfg.rip_msr = MSR_IA32_MCG_EIP;
-
-       if (cap & MCG_SER_P)
-               mca_cfg.ser = 1;
-
-       return 0;
-}
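
A standalone decode of the MCG_CAP fields consumed above, assuming the architectural layout (bits 7:0 bank count, bit 9 MCG_EXT_P, bits 23:16 extended register count, bit 24 MCG_SER_P):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t cap = (1ULL << 24) | (10ULL << 16) | (1ULL << 9) | 22;

	printf("banks   : %llu\n", (unsigned long long)(cap & 0xff));		/* 22    */
	printf("ext regs: %llu (EXT_P=%d)\n",
	       (unsigned long long)((cap >> 16) & 0xff), !!(cap & (1ULL << 9)));/* 10, 1 */
	printf("ser     : %d\n", !!(cap & (1ULL << 24)));			/* 1     */
	return 0;
}

With EXT_P set and an extended register count of at least 9, the function above enables the accurate-RIP path (mca_cfg.rip_msr); with bit 24 set it enables software error recovery (mca_cfg.ser).
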
-
-static void __mcheck_cpu_init_generic(void)
-{
-       enum mcp_flags m_fl = 0;
-       mce_banks_t all_banks;
-       u64 cap;
-
-       if (!mca_cfg.bootlog)
-               m_fl = MCP_DONTLOG;
-
-       /*
-        * Log the machine checks left over from the previous reset.
-        */
-       bitmap_fill(all_banks, MAX_NR_BANKS);
-       machine_check_poll(MCP_UC | m_fl, &all_banks);
-
-       cr4_set_bits(X86_CR4_MCE);
-
-       rdmsrl(MSR_IA32_MCG_CAP, cap);
-       if (cap & MCG_CTL_P)
-               wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff);
-}
-
-static void __mcheck_cpu_init_clear_banks(void)
-{
-       int i;
-
-       for (i = 0; i < mca_cfg.banks; i++) {
-               struct mce_bank *b = &mce_banks[i];
-
-               if (!b->init)
-                       continue;
-               wrmsrl(msr_ops.ctl(i), b->ctl);
-               wrmsrl(msr_ops.status(i), 0);
-       }
-}
-
-/*
- * During IFU recovery Sandy Bridge -EP4S processors set the RIPV and
- * EIPV bits in MCG_STATUS to zero on the affected logical processor (SDM
- * Vol 3B Table 15-20). But this confuses both the code that determines
- * whether the machine check occurred in kernel or user mode, and also
- * the severity assessment code. Pretend that EIPV was set, and take the
- * ip/cs values from the pt_regs that mce_gather_info() ignored earlier.
- */
-static void quirk_sandybridge_ifu(int bank, struct mce *m, struct pt_regs *regs)
-{
-       if (bank != 0)
-               return;
-       if ((m->mcgstatus & (MCG_STATUS_EIPV|MCG_STATUS_RIPV)) != 0)
-               return;
-       if ((m->status & (MCI_STATUS_OVER|MCI_STATUS_UC|
-                         MCI_STATUS_EN|MCI_STATUS_MISCV|MCI_STATUS_ADDRV|
-                         MCI_STATUS_PCC|MCI_STATUS_S|MCI_STATUS_AR|
-                         MCACOD)) !=
-                        (MCI_STATUS_UC|MCI_STATUS_EN|
-                         MCI_STATUS_MISCV|MCI_STATUS_ADDRV|MCI_STATUS_S|
-                         MCI_STATUS_AR|MCACOD_INSTR))
-               return;
-
-       m->mcgstatus |= MCG_STATUS_EIPV;
-       m->ip = regs->ip;
-       m->cs = regs->cs;
-}
-
-/* Add per CPU specific workarounds here */
-static int __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c)
-{
-       struct mca_config *cfg = &mca_cfg;
-
-       if (c->x86_vendor == X86_VENDOR_UNKNOWN) {
-               pr_info("unknown CPU type - not enabling MCE support\n");
-               return -EOPNOTSUPP;
-       }
-
-       /* This should be disabled by the BIOS, but isn't always */
-       if (c->x86_vendor == X86_VENDOR_AMD) {
-               if (c->x86 == 15 && cfg->banks > 4) {
-                       /*
-                        * disable GART TBL walk error reporting, which
-                        * trips off incorrectly with the IOMMU & 3ware
-                        * & Cerberus:
-                        */
-                       clear_bit(10, (unsigned long *)&mce_banks[4].ctl);
-               }
-               if (c->x86 < 0x11 && cfg->bootlog < 0) {
-                       /*
-                        * Lots of broken BIOS around that don't clear them
-                        * by default and leave crap in there. Don't log:
-                        */
-                       cfg->bootlog = 0;
-               }
-               /*
-                * Various K7s with broken bank 0 around. Always disable
-                * by default.
-                */
-               if (c->x86 == 6 && cfg->banks > 0)
-                       mce_banks[0].ctl = 0;
-
-               /*
-                * overflow_recov is supported for F15h Models 00h-0fh
-                * even though we don't have a CPUID bit for it.
-                */
-               if (c->x86 == 0x15 && c->x86_model <= 0xf)
-                       mce_flags.overflow_recov = 1;
-
-               /*
-                * Turn off MC4_MISC thresholding banks on those models since
-                * they're not supported there.
-                */
-               if (c->x86 == 0x15 &&
-                   (c->x86_model >= 0x10 && c->x86_model <= 0x1f)) {
-                       int i;
-                       u64 hwcr;
-                       bool need_toggle;
-                       u32 msrs[] = {
-                               0x00000413, /* MC4_MISC0 */
-                               0xc0000408, /* MC4_MISC1 */
-                       };
-
-                       rdmsrl(MSR_K7_HWCR, hwcr);
-
-                       /* McStatusWrEn has to be set */
-                       need_toggle = !(hwcr & BIT(18));
-
-                       if (need_toggle)
-                               wrmsrl(MSR_K7_HWCR, hwcr | BIT(18));
-
-                       /* Clear CntP bit safely */
-                       for (i = 0; i < ARRAY_SIZE(msrs); i++)
-                               msr_clear_bit(msrs[i], 62);
-
-                       /* restore old settings */
-                       if (need_toggle)
-                               wrmsrl(MSR_K7_HWCR, hwcr);
-               }
-       }
-
-       if (c->x86_vendor == X86_VENDOR_INTEL) {
-               /*
-                * The SDM documents that on family 6, bank 0 should not be
-                * written because it aliases to another special BIOS-controlled
-                * register. However, it is no longer aliased on model 0x1a and
-                * later. Don't ignore bank 0 completely because there could be
-                * a valid event later; merely don't write CTL0.
-                */
-
-               if (c->x86 == 6 && c->x86_model < 0x1A && cfg->banks > 0)
-                       mce_banks[0].init = 0;
-
-               /*
-                * All newer Intel systems support MCE broadcasting. Enable
-                * synchronization with a one second timeout.
-                */
-               if ((c->x86 > 6 || (c->x86 == 6 && c->x86_model >= 0xe)) &&
-                       cfg->monarch_timeout < 0)
-                       cfg->monarch_timeout = USEC_PER_SEC;
-
-               /*
-                * There are also broken BIOSes on some Pentium M and
-                * earlier systems:
-                */
-               if (c->x86 == 6 && c->x86_model <= 13 && cfg->bootlog < 0)
-                       cfg->bootlog = 0;
-
-               if (c->x86 == 6 && c->x86_model == 45)
-                       quirk_no_way_out = quirk_sandybridge_ifu;
-       }
-       if (cfg->monarch_timeout < 0)
-               cfg->monarch_timeout = 0;
-       if (cfg->bootlog != 0)
-               cfg->panic_timeout = 30;
-
-       return 0;
-}
-
-static int __mcheck_cpu_ancient_init(struct cpuinfo_x86 *c)
-{
-       if (c->x86 != 5)
-               return 0;
-
-       switch (c->x86_vendor) {
-       case X86_VENDOR_INTEL:
-               intel_p5_mcheck_init(c);
-               return 1;
-               break;
-       case X86_VENDOR_CENTAUR:
-               winchip_mcheck_init(c);
-               return 1;
-               break;
-       default:
-               return 0;
-       }
-
-       return 0;
-}
-
-/*
- * Init basic CPU features needed for early decoding of MCEs.
- */
-static void __mcheck_cpu_init_early(struct cpuinfo_x86 *c)
-{
-       if (c->x86_vendor == X86_VENDOR_AMD || c->x86_vendor == X86_VENDOR_HYGON) {
-               mce_flags.overflow_recov = !!cpu_has(c, X86_FEATURE_OVERFLOW_RECOV);
-               mce_flags.succor         = !!cpu_has(c, X86_FEATURE_SUCCOR);
-               mce_flags.smca           = !!cpu_has(c, X86_FEATURE_SMCA);
-
-               if (mce_flags.smca) {
-                       msr_ops.ctl     = smca_ctl_reg;
-                       msr_ops.status  = smca_status_reg;
-                       msr_ops.addr    = smca_addr_reg;
-                       msr_ops.misc    = smca_misc_reg;
-               }
-       }
-}
-
-static void mce_centaur_feature_init(struct cpuinfo_x86 *c)
-{
-       struct mca_config *cfg = &mca_cfg;
-
-        /*
-         * All newer Centaur CPUs support MCE broadcasting. Enable
-         * synchronization with a one second timeout.
-         */
-       if ((c->x86 == 6 && c->x86_model == 0xf && c->x86_stepping >= 0xe) ||
-            c->x86 > 6) {
-               if (cfg->monarch_timeout < 0)
-                       cfg->monarch_timeout = USEC_PER_SEC;
-       }
-}
-
-static void __mcheck_cpu_init_vendor(struct cpuinfo_x86 *c)
-{
-       switch (c->x86_vendor) {
-       case X86_VENDOR_INTEL:
-               mce_intel_feature_init(c);
-               mce_adjust_timer = cmci_intel_adjust_timer;
-               break;
-
-       case X86_VENDOR_AMD: {
-               mce_amd_feature_init(c);
-               break;
-               }
-
-       case X86_VENDOR_HYGON:
-               mce_hygon_feature_init(c);
-               break;
-
-       case X86_VENDOR_CENTAUR:
-               mce_centaur_feature_init(c);
-               break;
-
-       default:
-               break;
-       }
-}
-
-static void __mcheck_cpu_clear_vendor(struct cpuinfo_x86 *c)
-{
-       switch (c->x86_vendor) {
-       case X86_VENDOR_INTEL:
-               mce_intel_feature_clear(c);
-               break;
-       default:
-               break;
-       }
-}
-
-static void mce_start_timer(struct timer_list *t)
-{
-       unsigned long iv = check_interval * HZ;
-
-       if (mca_cfg.ignore_ce || !iv)
-               return;
-
-       this_cpu_write(mce_next_interval, iv);
-       __start_timer(t, iv);
-}
-
-static void __mcheck_cpu_setup_timer(void)
-{
-       struct timer_list *t = this_cpu_ptr(&mce_timer);
-
-       timer_setup(t, mce_timer_fn, TIMER_PINNED);
-}
-
-static void __mcheck_cpu_init_timer(void)
-{
-       struct timer_list *t = this_cpu_ptr(&mce_timer);
-
-       timer_setup(t, mce_timer_fn, TIMER_PINNED);
-       mce_start_timer(t);
-}
-
-/* Handle unconfigured int18 (should never happen) */
-static void unexpected_machine_check(struct pt_regs *regs, long error_code)
-{
-       pr_err("CPU#%d: Unexpected int18 (Machine Check)\n",
-              smp_processor_id());
-}
-
-/* Call the installed machine check handler for this CPU setup. */
-void (*machine_check_vector)(struct pt_regs *, long error_code) =
-                                               unexpected_machine_check;
-
-dotraplinkage void do_mce(struct pt_regs *regs, long error_code)
-{
-       machine_check_vector(regs, error_code);
-}
-
-/*
- * Called for each booted CPU to set up machine checks.
- * Must be called with preempt off:
- */
-void mcheck_cpu_init(struct cpuinfo_x86 *c)
-{
-       if (mca_cfg.disabled)
-               return;
-
-       if (__mcheck_cpu_ancient_init(c))
-               return;
-
-       if (!mce_available(c))
-               return;
-
-       if (__mcheck_cpu_cap_init() < 0 || __mcheck_cpu_apply_quirks(c) < 0) {
-               mca_cfg.disabled = 1;
-               return;
-       }
-
-       if (mce_gen_pool_init()) {
-               mca_cfg.disabled = 1;
-               pr_emerg("Couldn't allocate MCE records pool!\n");
-               return;
-       }
-
-       machine_check_vector = do_machine_check;
-
-       __mcheck_cpu_init_early(c);
-       __mcheck_cpu_init_generic();
-       __mcheck_cpu_init_vendor(c);
-       __mcheck_cpu_init_clear_banks();
-       __mcheck_cpu_setup_timer();
-}
-
-/*
- * Called for each booted CPU to clear some machine check opt-ins
- */
-void mcheck_cpu_clear(struct cpuinfo_x86 *c)
-{
-       if (mca_cfg.disabled)
-               return;
-
-       if (!mce_available(c))
-               return;
-
-       /*
-        * Possibly clear general settings generic to x86 here:
-        * __mcheck_cpu_clear_generic(c);
-        */
-       __mcheck_cpu_clear_vendor(c);
-
-}
-
-static void __mce_disable_bank(void *arg)
-{
-       int bank = *((int *)arg);
-       __clear_bit(bank, this_cpu_ptr(mce_poll_banks));
-       cmci_disable_bank(bank);
-}
-
-void mce_disable_bank(int bank)
-{
-       if (bank >= mca_cfg.banks) {
-               pr_warn(FW_BUG
-                       "Ignoring request to disable invalid MCA bank %d.\n",
-                       bank);
-               return;
-       }
-       set_bit(bank, mce_banks_ce_disabled);
-       on_each_cpu(__mce_disable_bank, &bank, 1);
-}
-
-/*
- * mce=off Disables machine check
- * mce=no_cmci Disables CMCI
- * mce=no_lmce Disables LMCE
- * mce=dont_log_ce Clears corrected events silently, no log created for CEs.
- * mce=ignore_ce Disables polling and CMCI, corrected events are not cleared.
- * mce=TOLERANCELEVEL[,monarchtimeout] (number, see above)
- *     monarchtimeout is how long to wait for other CPUs on machine
- *     check, or 0 to not wait
- * mce=bootlog Log MCEs from before booting. Disabled by default on AMD Fam10h
- *     and older.
- * mce=nobootlog Don't log MCEs from before booting.
- * mce=bios_cmci_threshold Don't program the CMCI threshold
- * mce=recovery force enable memcpy_mcsafe()
- */
-static int __init mcheck_enable(char *str)
-{
-       struct mca_config *cfg = &mca_cfg;
-
-       if (*str == 0) {
-               enable_p5_mce();
-               return 1;
-       }
-       if (*str == '=')
-               str++;
-       if (!strcmp(str, "off"))
-               cfg->disabled = 1;
-       else if (!strcmp(str, "no_cmci"))
-               cfg->cmci_disabled = true;
-       else if (!strcmp(str, "no_lmce"))
-               cfg->lmce_disabled = 1;
-       else if (!strcmp(str, "dont_log_ce"))
-               cfg->dont_log_ce = true;
-       else if (!strcmp(str, "ignore_ce"))
-               cfg->ignore_ce = true;
-       else if (!strcmp(str, "bootlog") || !strcmp(str, "nobootlog"))
-               cfg->bootlog = (str[0] == 'b');
-       else if (!strcmp(str, "bios_cmci_threshold"))
-               cfg->bios_cmci_threshold = 1;
-       else if (!strcmp(str, "recovery"))
-               cfg->recovery = 1;
-       else if (isdigit(str[0])) {
-               if (get_option(&str, &cfg->tolerant) == 2)
-                       get_option(&str, &(cfg->monarch_timeout));
-       } else {
-               pr_info("mce argument %s ignored. Please use /sys\n", str);
-               return 0;
-       }
-       return 1;
-}
-__setup("mce", mcheck_enable);
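
The numeric form of the option, mce=TOLERANCELEVEL[,monarchtimeout], lands in cfg->tolerant and cfg->monarch_timeout via get_option(). A rough userspace approximation of just that branch, with sscanf() standing in for get_option() purely for illustration:

    #include <stdio.h>

    struct mca_config_sketch {
            int tolerant;
            int monarch_timeout;
    };

    /* Parse "<tolerant>[,<monarch_timeout>]"; sscanf() stands in for get_option(). */
    static void parse_mce_level(const char *str, struct mca_config_sketch *cfg)
    {
            if (sscanf(str, "%d,%d", &cfg->tolerant, &cfg->monarch_timeout) < 2)
                    cfg->monarch_timeout = -1;      /* timeout not given */
    }

    int main(void)
    {
            struct mca_config_sketch cfg = { 0 };

            parse_mce_level("3,100", &cfg);         /* as in mce=3,100 */
            printf("tolerant=%d monarch_timeout=%d\n", cfg.tolerant, cfg.monarch_timeout);

            return 0;
    }
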
-
-int __init mcheck_init(void)
-{
-       mcheck_intel_therm_init();
-       mce_register_decode_chain(&first_nb);
-       mce_register_decode_chain(&mce_srao_nb);
-       mce_register_decode_chain(&mce_default_nb);
-       mcheck_vendor_init_severity();
-
-       INIT_WORK(&mce_work, mce_gen_pool_process);
-       init_irq_work(&mce_irq_work, mce_irq_work_cb);
-
-       return 0;
-}
-
-/*
- * mce_syscore: PM support
- */
-
-/*
- * Disable machine checks on suspend and shutdown. We can't really handle
- * them later.
- */
-static void mce_disable_error_reporting(void)
-{
-       int i;
-
-       for (i = 0; i < mca_cfg.banks; i++) {
-               struct mce_bank *b = &mce_banks[i];
-
-               if (b->init)
-                       wrmsrl(msr_ops.ctl(i), 0);
-       }
-       return;
-}
-
-static void vendor_disable_error_reporting(void)
-{
-       /*
-        * Don't clear on Intel or AMD or Hygon CPUs. Some of these MSRs
-        * are socket-wide.
-        * Disabling them for just a single offlined CPU is bad, since it will
-        * inhibit reporting for all shared resources on the socket like the
-        * last level cache (LLC), the integrated memory controller (iMC), etc.
-        */
-       if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL ||
-           boot_cpu_data.x86_vendor == X86_VENDOR_HYGON ||
-           boot_cpu_data.x86_vendor == X86_VENDOR_AMD)
-               return;
-
-       mce_disable_error_reporting();
-}
-
-static int mce_syscore_suspend(void)
-{
-       vendor_disable_error_reporting();
-       return 0;
-}
-
-static void mce_syscore_shutdown(void)
-{
-       vendor_disable_error_reporting();
-}
-
-/*
- * On resume clear all MCE state. Don't want to see leftovers from the BIOS.
- * Only one CPU is active at this time, the others get re-added later using
- * CPU hotplug:
- */
-static void mce_syscore_resume(void)
-{
-       __mcheck_cpu_init_generic();
-       __mcheck_cpu_init_vendor(raw_cpu_ptr(&cpu_info));
-       __mcheck_cpu_init_clear_banks();
-}
-
-static struct syscore_ops mce_syscore_ops = {
-       .suspend        = mce_syscore_suspend,
-       .shutdown       = mce_syscore_shutdown,
-       .resume         = mce_syscore_resume,
-};
-
-/*
- * mce_device: Sysfs support
- */
-
-static void mce_cpu_restart(void *data)
-{
-       if (!mce_available(raw_cpu_ptr(&cpu_info)))
-               return;
-       __mcheck_cpu_init_generic();
-       __mcheck_cpu_init_clear_banks();
-       __mcheck_cpu_init_timer();
-}
-
-/* Reinit MCEs after user configuration changes */
-static void mce_restart(void)
-{
-       mce_timer_delete_all();
-       on_each_cpu(mce_cpu_restart, NULL, 1);
-}
-
-/* Toggle features for corrected errors */
-static void mce_disable_cmci(void *data)
-{
-       if (!mce_available(raw_cpu_ptr(&cpu_info)))
-               return;
-       cmci_clear();
-}
-
-static void mce_enable_ce(void *all)
-{
-       if (!mce_available(raw_cpu_ptr(&cpu_info)))
-               return;
-       cmci_reenable();
-       cmci_recheck();
-       if (all)
-               __mcheck_cpu_init_timer();
-}
-
-static struct bus_type mce_subsys = {
-       .name           = "machinecheck",
-       .dev_name       = "machinecheck",
-};
-
-DEFINE_PER_CPU(struct device *, mce_device);
-
-static inline struct mce_bank *attr_to_bank(struct device_attribute *attr)
-{
-       return container_of(attr, struct mce_bank, attr);
-}
-
-static ssize_t show_bank(struct device *s, struct device_attribute *attr,
-                        char *buf)
-{
-       return sprintf(buf, "%llx\n", attr_to_bank(attr)->ctl);
-}
-
-static ssize_t set_bank(struct device *s, struct device_attribute *attr,
-                       const char *buf, size_t size)
-{
-       u64 new;
-
-       if (kstrtou64(buf, 0, &new) < 0)
-               return -EINVAL;
-
-       attr_to_bank(attr)->ctl = new;
-       mce_restart();
-
-       return size;
-}
-
-static ssize_t set_ignore_ce(struct device *s,
-                            struct device_attribute *attr,
-                            const char *buf, size_t size)
-{
-       u64 new;
-
-       if (kstrtou64(buf, 0, &new) < 0)
-               return -EINVAL;
-
-       mutex_lock(&mce_sysfs_mutex);
-       if (mca_cfg.ignore_ce ^ !!new) {
-               if (new) {
-                       /* disable ce features */
-                       mce_timer_delete_all();
-                       on_each_cpu(mce_disable_cmci, NULL, 1);
-                       mca_cfg.ignore_ce = true;
-               } else {
-                       /* enable ce features */
-                       mca_cfg.ignore_ce = false;
-                       on_each_cpu(mce_enable_ce, (void *)1, 1);
-               }
-       }
-       mutex_unlock(&mce_sysfs_mutex);
-
-       return size;
-}
-
-static ssize_t set_cmci_disabled(struct device *s,
-                                struct device_attribute *attr,
-                                const char *buf, size_t size)
-{
-       u64 new;
-
-       if (kstrtou64(buf, 0, &new) < 0)
-               return -EINVAL;
-
-       mutex_lock(&mce_sysfs_mutex);
-       if (mca_cfg.cmci_disabled ^ !!new) {
-               if (new) {
-                       /* disable cmci */
-                       on_each_cpu(mce_disable_cmci, NULL, 1);
-                       mca_cfg.cmci_disabled = true;
-               } else {
-                       /* enable cmci */
-                       mca_cfg.cmci_disabled = false;
-                       on_each_cpu(mce_enable_ce, NULL, 1);
-               }
-       }
-       mutex_unlock(&mce_sysfs_mutex);
-
-       return size;
-}
-
-static ssize_t store_int_with_restart(struct device *s,
-                                     struct device_attribute *attr,
-                                     const char *buf, size_t size)
-{
-       unsigned long old_check_interval = check_interval;
-       ssize_t ret = device_store_ulong(s, attr, buf, size);
-
-       if (check_interval == old_check_interval)
-               return ret;
-
-       mutex_lock(&mce_sysfs_mutex);
-       mce_restart();
-       mutex_unlock(&mce_sysfs_mutex);
-
-       return ret;
-}
-
-static DEVICE_INT_ATTR(tolerant, 0644, mca_cfg.tolerant);
-static DEVICE_INT_ATTR(monarch_timeout, 0644, mca_cfg.monarch_timeout);
-static DEVICE_BOOL_ATTR(dont_log_ce, 0644, mca_cfg.dont_log_ce);
-
-static struct dev_ext_attribute dev_attr_check_interval = {
-       __ATTR(check_interval, 0644, device_show_int, store_int_with_restart),
-       &check_interval
-};
-
-static struct dev_ext_attribute dev_attr_ignore_ce = {
-       __ATTR(ignore_ce, 0644, device_show_bool, set_ignore_ce),
-       &mca_cfg.ignore_ce
-};
-
-static struct dev_ext_attribute dev_attr_cmci_disabled = {
-       __ATTR(cmci_disabled, 0644, device_show_bool, set_cmci_disabled),
-       &mca_cfg.cmci_disabled
-};
-
-static struct device_attribute *mce_device_attrs[] = {
-       &dev_attr_tolerant.attr,
-       &dev_attr_check_interval.attr,
-#ifdef CONFIG_X86_MCELOG_LEGACY
-       &dev_attr_trigger,
-#endif
-       &dev_attr_monarch_timeout.attr,
-       &dev_attr_dont_log_ce.attr,
-       &dev_attr_ignore_ce.attr,
-       &dev_attr_cmci_disabled.attr,
-       NULL
-};
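
These attributes appear per CPU under the bus registered below as "machinecheck", i.e. /sys/devices/system/machinecheck/machinecheckN/ on a typical system (the path is inferred from the subsys/dev names in this file). A small sketch of reading one of them from userspace:

    #include <stdio.h>

    int main(void)
    {
            const char *path =
                    "/sys/devices/system/machinecheck/machinecheck0/check_interval";
            char buf[64];
            FILE *f = fopen(path, "r");

            if (!f) {
                    perror("fopen");
                    return 1;
            }
            if (fgets(buf, sizeof(buf), f))
                    printf("check_interval: %s", buf);
            fclose(f);

            return 0;
    }

Writing a new value to the same check_interval file is handled by store_int_with_restart() above, which re-runs mce_restart() on all CPUs.
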
-
-static cpumask_var_t mce_device_initialized;
-
-static void mce_device_release(struct device *dev)
-{
-       kfree(dev);
-}
-
-/* Per cpu device init. All of the cpus still share the same ctrl bank: */
-static int mce_device_create(unsigned int cpu)
-{
-       struct device *dev;
-       int err;
-       int i, j;
-
-       if (!mce_available(&boot_cpu_data))
-               return -EIO;
-
-       dev = per_cpu(mce_device, cpu);
-       if (dev)
-               return 0;
-
-       dev = kzalloc(sizeof(*dev), GFP_KERNEL);
-       if (!dev)
-               return -ENOMEM;
-       dev->id  = cpu;
-       dev->bus = &mce_subsys;
-       dev->release = &mce_device_release;
-
-       err = device_register(dev);
-       if (err) {
-               put_device(dev);
-               return err;
-       }
-
-       for (i = 0; mce_device_attrs[i]; i++) {
-               err = device_create_file(dev, mce_device_attrs[i]);
-               if (err)
-                       goto error;
-       }
-       for (j = 0; j < mca_cfg.banks; j++) {
-               err = device_create_file(dev, &mce_banks[j].attr);
-               if (err)
-                       goto error2;
-       }
-       cpumask_set_cpu(cpu, mce_device_initialized);
-       per_cpu(mce_device, cpu) = dev;
-
-       return 0;
-error2:
-       while (--j >= 0)
-               device_remove_file(dev, &mce_banks[j].attr);
-error:
-       while (--i >= 0)
-               device_remove_file(dev, mce_device_attrs[i]);
-
-       device_unregister(dev);
-
-       return err;
-}
-
-static void mce_device_remove(unsigned int cpu)
-{
-       struct device *dev = per_cpu(mce_device, cpu);
-       int i;
-
-       if (!cpumask_test_cpu(cpu, mce_device_initialized))
-               return;
-
-       for (i = 0; mce_device_attrs[i]; i++)
-               device_remove_file(dev, mce_device_attrs[i]);
-
-       for (i = 0; i < mca_cfg.banks; i++)
-               device_remove_file(dev, &mce_banks[i].attr);
-
-       device_unregister(dev);
-       cpumask_clear_cpu(cpu, mce_device_initialized);
-       per_cpu(mce_device, cpu) = NULL;
-}
-
-/* Make sure there are no machine checks on offlined CPUs. */
-static void mce_disable_cpu(void)
-{
-       if (!mce_available(raw_cpu_ptr(&cpu_info)))
-               return;
-
-       if (!cpuhp_tasks_frozen)
-               cmci_clear();
-
-       vendor_disable_error_reporting();
-}
-
-static void mce_reenable_cpu(void)
-{
-       int i;
-
-       if (!mce_available(raw_cpu_ptr(&cpu_info)))
-               return;
-
-       if (!cpuhp_tasks_frozen)
-               cmci_reenable();
-       for (i = 0; i < mca_cfg.banks; i++) {
-               struct mce_bank *b = &mce_banks[i];
-
-               if (b->init)
-                       wrmsrl(msr_ops.ctl(i), b->ctl);
-       }
-}
-
-static int mce_cpu_dead(unsigned int cpu)
-{
-       mce_intel_hcpu_update(cpu);
-
-       /* intentionally ignoring frozen here */
-       if (!cpuhp_tasks_frozen)
-               cmci_rediscover();
-       return 0;
-}
-
-static int mce_cpu_online(unsigned int cpu)
-{
-       struct timer_list *t = this_cpu_ptr(&mce_timer);
-       int ret;
-
-       mce_device_create(cpu);
-
-       ret = mce_threshold_create_device(cpu);
-       if (ret) {
-               mce_device_remove(cpu);
-               return ret;
-       }
-       mce_reenable_cpu();
-       mce_start_timer(t);
-       return 0;
-}
-
-static int mce_cpu_pre_down(unsigned int cpu)
-{
-       struct timer_list *t = this_cpu_ptr(&mce_timer);
-
-       mce_disable_cpu();
-       del_timer_sync(t);
-       mce_threshold_remove_device(cpu);
-       mce_device_remove(cpu);
-       return 0;
-}
-
-static __init void mce_init_banks(void)
-{
-       int i;
-
-       for (i = 0; i < mca_cfg.banks; i++) {
-               struct mce_bank *b = &mce_banks[i];
-               struct device_attribute *a = &b->attr;
-
-               sysfs_attr_init(&a->attr);
-               a->attr.name    = b->attrname;
-               snprintf(b->attrname, ATTR_LEN, "bank%d", i);
-
-               a->attr.mode    = 0644;
-               a->show         = show_bank;
-               a->store        = set_bank;
-       }
-}
-
-static __init int mcheck_init_device(void)
-{
-       int err;
-
-       /*
-        * Check if we have a spare virtual bit. This will only become
-        * a problem if/when we move beyond 5-level page tables.
-        */
-       MAYBE_BUILD_BUG_ON(__VIRTUAL_MASK_SHIFT >= 63);
-
-       if (!mce_available(&boot_cpu_data)) {
-               err = -EIO;
-               goto err_out;
-       }
-
-       if (!zalloc_cpumask_var(&mce_device_initialized, GFP_KERNEL)) {
-               err = -ENOMEM;
-               goto err_out;
-       }
-
-       mce_init_banks();
-
-       err = subsys_system_register(&mce_subsys, NULL);
-       if (err)
-               goto err_out_mem;
-
-       err = cpuhp_setup_state(CPUHP_X86_MCE_DEAD, "x86/mce:dead", NULL,
-                               mce_cpu_dead);
-       if (err)
-               goto err_out_mem;
-
-       err = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "x86/mce:online",
-                               mce_cpu_online, mce_cpu_pre_down);
-       if (err < 0)
-               goto err_out_online;
-
-       register_syscore_ops(&mce_syscore_ops);
-
-       return 0;
-
-err_out_online:
-       cpuhp_remove_state(CPUHP_X86_MCE_DEAD);
-
-err_out_mem:
-       free_cpumask_var(mce_device_initialized);
-
-err_out:
-       pr_err("Unable to init MCE device (rc: %d)\n", err);
-
-       return err;
-}
-device_initcall_sync(mcheck_init_device);
-
-/*
- * Old style boot options parsing. Only for compatibility.
- */
-static int __init mcheck_disable(char *str)
-{
-       mca_cfg.disabled = 1;
-       return 1;
-}
-__setup("nomce", mcheck_disable);
-
-#ifdef CONFIG_DEBUG_FS
-struct dentry *mce_get_debugfs_dir(void)
-{
-       static struct dentry *dmce;
-
-       if (!dmce)
-               dmce = debugfs_create_dir("mce", NULL);
-
-       return dmce;
-}
-
-static void mce_reset(void)
-{
-       cpu_missing = 0;
-       atomic_set(&mce_fake_panicked, 0);
-       atomic_set(&mce_executing, 0);
-       atomic_set(&mce_callin, 0);
-       atomic_set(&global_nwo, 0);
-}
-
-static int fake_panic_get(void *data, u64 *val)
-{
-       *val = fake_panic;
-       return 0;
-}
-
-static int fake_panic_set(void *data, u64 val)
-{
-       mce_reset();
-       fake_panic = val;
-       return 0;
-}
-
-DEFINE_SIMPLE_ATTRIBUTE(fake_panic_fops, fake_panic_get,
-                       fake_panic_set, "%llu\n");
-
-static int __init mcheck_debugfs_init(void)
-{
-       struct dentry *dmce, *ffake_panic;
-
-       dmce = mce_get_debugfs_dir();
-       if (!dmce)
-               return -ENOMEM;
-       ffake_panic = debugfs_create_file("fake_panic", 0444, dmce, NULL,
-                                         &fake_panic_fops);
-       if (!ffake_panic)
-               return -ENOMEM;
-
-       return 0;
-}
-#else
-static int __init mcheck_debugfs_init(void) { return -EINVAL; }
-#endif
-
-DEFINE_STATIC_KEY_FALSE(mcsafe_key);
-EXPORT_SYMBOL_GPL(mcsafe_key);
-
-static int __init mcheck_late_init(void)
-{
-       if (mca_cfg.recovery)
-               static_branch_inc(&mcsafe_key);
-
-       mcheck_debugfs_init();
-       cec_init();
-
-       /*
-        * Flush out everything that has been logged during early boot, now that
-        * everything has been initialized (workqueues, decoders, ...).
-        */
-       mce_schedule_work();
-
-       return 0;
-}
-late_initcall(mcheck_late_init);
diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c
deleted file mode 100644 (file)
index e12454e..0000000
+++ /dev/null
@@ -1,1437 +0,0 @@
-/*
- *  (c) 2005-2016 Advanced Micro Devices, Inc.
- *  Your use of this code is subject to the terms and conditions of the
- *  GNU general public license version 2. See "COPYING" or
- *  http://www.gnu.org/licenses/gpl.html
- *
- *  Written by Jacob Shin - AMD, Inc.
- *  Maintained by: Borislav Petkov <bp@alien8.de>
- *
- *  All MC4_MISCi registers are shared between cores on a node.
- */
-#include <linux/interrupt.h>
-#include <linux/notifier.h>
-#include <linux/kobject.h>
-#include <linux/percpu.h>
-#include <linux/errno.h>
-#include <linux/sched.h>
-#include <linux/sysfs.h>
-#include <linux/slab.h>
-#include <linux/init.h>
-#include <linux/cpu.h>
-#include <linux/smp.h>
-#include <linux/string.h>
-
-#include <asm/amd_nb.h>
-#include <asm/apic.h>
-#include <asm/mce.h>
-#include <asm/msr.h>
-#include <asm/trace/irq_vectors.h>
-
-#include "mce-internal.h"
-
-#define NR_BLOCKS         5
-#define THRESHOLD_MAX     0xFFF
-#define INT_TYPE_APIC     0x00020000
-#define MASK_VALID_HI     0x80000000
-#define MASK_CNTP_HI      0x40000000
-#define MASK_LOCKED_HI    0x20000000
-#define MASK_LVTOFF_HI    0x00F00000
-#define MASK_COUNT_EN_HI  0x00080000
-#define MASK_INT_TYPE_HI  0x00060000
-#define MASK_OVERFLOW_HI  0x00010000
-#define MASK_ERR_COUNT_HI 0x00000FFF
-#define MASK_BLKPTR_LO    0xFF000000
-#define MCG_XBLK_ADDR     0xC0000400
-
-/* Deferred error settings */
-#define MSR_CU_DEF_ERR         0xC0000410
-#define MASK_DEF_LVTOFF                0x000000F0
-#define MASK_DEF_INT_TYPE      0x00000006
-#define DEF_LVT_OFF            0x2
-#define DEF_INT_TYPE_APIC      0x2
-
-/* Scalable MCA: */
-
-/* Threshold LVT offset is at MSR0xC0000410[15:12] */
-#define SMCA_THR_LVT_OFF       0xF000
-
-static bool thresholding_irq_en;
-
-static const char * const th_names[] = {
-       "load_store",
-       "insn_fetch",
-       "combined_unit",
-       "decode_unit",
-       "northbridge",
-       "execution_unit",
-};
-
-static const char * const smca_umc_block_names[] = {
-       "dram_ecc",
-       "misc_umc"
-};
-
-struct smca_bank_name {
-       const char *name;       /* Short name for sysfs */
-       const char *long_name;  /* Long name for pretty-printing */
-};
-
-static struct smca_bank_name smca_names[] = {
-       [SMCA_LS]       = { "load_store",       "Load Store Unit" },
-       [SMCA_IF]       = { "insn_fetch",       "Instruction Fetch Unit" },
-       [SMCA_L2_CACHE] = { "l2_cache",         "L2 Cache" },
-       [SMCA_DE]       = { "decode_unit",      "Decode Unit" },
-       [SMCA_RESERVED] = { "reserved",         "Reserved" },
-       [SMCA_EX]       = { "execution_unit",   "Execution Unit" },
-       [SMCA_FP]       = { "floating_point",   "Floating Point Unit" },
-       [SMCA_L3_CACHE] = { "l3_cache",         "L3 Cache" },
-       [SMCA_CS]       = { "coherent_slave",   "Coherent Slave" },
-       [SMCA_PIE]      = { "pie",              "Power, Interrupts, etc." },
-       [SMCA_UMC]      = { "umc",              "Unified Memory Controller" },
-       [SMCA_PB]       = { "param_block",      "Parameter Block" },
-       [SMCA_PSP]      = { "psp",              "Platform Security Processor" },
-       [SMCA_SMU]      = { "smu",              "System Management Unit" },
-};
-
-static u32 smca_bank_addrs[MAX_NR_BANKS][NR_BLOCKS] __ro_after_init =
-{
-       [0 ... MAX_NR_BANKS - 1] = { [0 ... NR_BLOCKS - 1] = -1 }
-};
-
-const char *smca_get_name(enum smca_bank_types t)
-{
-       if (t >= N_SMCA_BANK_TYPES)
-               return NULL;
-
-       return smca_names[t].name;
-}
-
-const char *smca_get_long_name(enum smca_bank_types t)
-{
-       if (t >= N_SMCA_BANK_TYPES)
-               return NULL;
-
-       return smca_names[t].long_name;
-}
-EXPORT_SYMBOL_GPL(smca_get_long_name);
-
-static enum smca_bank_types smca_get_bank_type(unsigned int bank)
-{
-       struct smca_bank *b;
-
-       if (bank >= MAX_NR_BANKS)
-               return N_SMCA_BANK_TYPES;
-
-       b = &smca_banks[bank];
-       if (!b->hwid)
-               return N_SMCA_BANK_TYPES;
-
-       return b->hwid->bank_type;
-}
-
-static struct smca_hwid smca_hwid_mcatypes[] = {
-       /* { bank_type, hwid_mcatype, xec_bitmap } */
-
-       /* Reserved type */
-       { SMCA_RESERVED, HWID_MCATYPE(0x00, 0x0), 0x0 },
-
-       /* ZN Core (HWID=0xB0) MCA types */
-       { SMCA_LS,       HWID_MCATYPE(0xB0, 0x0), 0x1FFFEF },
-       { SMCA_IF,       HWID_MCATYPE(0xB0, 0x1), 0x3FFF },
-       { SMCA_L2_CACHE, HWID_MCATYPE(0xB0, 0x2), 0xF },
-       { SMCA_DE,       HWID_MCATYPE(0xB0, 0x3), 0x1FF },
-       /* HWID 0xB0 MCATYPE 0x4 is Reserved */
-       { SMCA_EX,       HWID_MCATYPE(0xB0, 0x5), 0x7FF },
-       { SMCA_FP,       HWID_MCATYPE(0xB0, 0x6), 0x7F },
-       { SMCA_L3_CACHE, HWID_MCATYPE(0xB0, 0x7), 0xFF },
-
-       /* Data Fabric MCA types */
-       { SMCA_CS,       HWID_MCATYPE(0x2E, 0x0), 0x1FF },
-       { SMCA_PIE,      HWID_MCATYPE(0x2E, 0x1), 0xF },
-
-       /* Unified Memory Controller MCA type */
-       { SMCA_UMC,      HWID_MCATYPE(0x96, 0x0), 0x3F },
-
-       /* Parameter Block MCA type */
-       { SMCA_PB,       HWID_MCATYPE(0x05, 0x0), 0x1 },
-
-       /* Platform Security Processor MCA type */
-       { SMCA_PSP,      HWID_MCATYPE(0xFF, 0x0), 0x1 },
-
-       /* System Management Unit MCA type */
-       { SMCA_SMU,      HWID_MCATYPE(0x01, 0x0), 0x1 },
-};
-
-struct smca_bank smca_banks[MAX_NR_BANKS];
-EXPORT_SYMBOL_GPL(smca_banks);
-
-/*
- * In SMCA enabled processors, we can have multiple banks for a given IP type.
- * So to define a unique name for each bank, we use a temp c-string to append
- * the MCA_IPID[InstanceId] to type's name in get_name().
- *
- * InstanceId is 32 bits, which is 8 hex characters. Make sure MAX_MCATYPE_NAME_LEN
- * is greater than 8 plus 1 (for the underscore) plus the length of the longest type name.
- */
-#define MAX_MCATYPE_NAME_LEN   30
-static char buf_mcatype[MAX_MCATYPE_NAME_LEN];
-
-static DEFINE_PER_CPU(struct threshold_bank **, threshold_banks);
-static DEFINE_PER_CPU(unsigned int, bank_map); /* see which banks are on */
-
-static void amd_threshold_interrupt(void);
-static void amd_deferred_error_interrupt(void);
-
-static void default_deferred_error_interrupt(void)
-{
-       pr_err("Unexpected deferred interrupt at vector %x\n", DEFERRED_ERROR_VECTOR);
-}
-void (*deferred_error_int_vector)(void) = default_deferred_error_interrupt;
-
-static void smca_configure(unsigned int bank, unsigned int cpu)
-{
-       unsigned int i, hwid_mcatype;
-       struct smca_hwid *s_hwid;
-       u32 high, low;
-       u32 smca_config = MSR_AMD64_SMCA_MCx_CONFIG(bank);
-
-       /* Set appropriate bits in MCA_CONFIG */
-       if (!rdmsr_safe(smca_config, &low, &high)) {
-               /*
-                * OS is required to set the MCAX bit to acknowledge that it is
-                * now using the new MSR ranges and new registers under each
-                * bank. It also means that the OS will configure deferred
-                * errors in the new MCx_CONFIG register. If the bit is not set,
-                * uncorrectable errors will cause a system panic.
-                *
-                * MCA_CONFIG[MCAX] is bit 32 (0 in the high portion of the MSR.)
-                */
-               high |= BIT(0);
-
-               /*
-                * SMCA sets the Deferred Error Interrupt type per bank.
-                *
-                * MCA_CONFIG[DeferredIntTypeSupported] is bit 5, and tells us
-                * if the DeferredIntType bit field is available.
-                *
-                * MCA_CONFIG[DeferredIntType] is bits [38:37] ([6:5] in the
-                * high portion of the MSR). OS should set this to 0x1 to enable
-                * APIC based interrupt. First, check that no interrupt has been
-                * set.
-                */
-               if ((low & BIT(5)) && !((high >> 5) & 0x3))
-                       high |= BIT(5);
-
-               wrmsr(smca_config, low, high);
-       }
-
-       /* Return early if this bank was already initialized. */
-       if (smca_banks[bank].hwid)
-               return;
-
-       if (rdmsr_safe_on_cpu(cpu, MSR_AMD64_SMCA_MCx_IPID(bank), &low, &high)) {
-               pr_warn("Failed to read MCA_IPID for bank %d\n", bank);
-               return;
-       }
-
-       hwid_mcatype = HWID_MCATYPE(high & MCI_IPID_HWID,
-                                   (high & MCI_IPID_MCATYPE) >> 16);
-
-       for (i = 0; i < ARRAY_SIZE(smca_hwid_mcatypes); i++) {
-               s_hwid = &smca_hwid_mcatypes[i];
-               if (hwid_mcatype == s_hwid->hwid_mcatype) {
-                       smca_banks[bank].hwid = s_hwid;
-                       smca_banks[bank].id = low;
-                       smca_banks[bank].sysfs_id = s_hwid->count++;
-                       break;
-               }
-       }
-}
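
The loop at the end of smca_configure() matches the bank's (HWID, MCATYPE) pair against smca_hwid_mcatypes. Assuming HWID_MCATYPE(hwid, mcatype) expands to ((hwid) << 16) | (mcatype) and the MCI_IPID_HWID/MCI_IPID_MCATYPE masks are 0xFFF/0xFFFF0000 as in asm/mce.h, a standalone sketch of that lookup against a trimmed copy of the table:

    #include <stdint.h>
    #include <stdio.h>

    #define HWID_MCATYPE(hwid, mcatype)     (((hwid) << 16) | (mcatype))

    struct smca_entry {
            const char *name;
            uint32_t hwid_mcatype;
    };

    /* Trimmed copy of smca_hwid_mcatypes, sysfs names only. */
    static const struct smca_entry table[] = {
            { "load_store", HWID_MCATYPE(0xB0, 0x0) },
            { "insn_fetch", HWID_MCATYPE(0xB0, 0x1) },
            { "umc",        HWID_MCATYPE(0x96, 0x0) },
    };

    int main(void)
    {
            /* Pretend MCA_IPID[63:32] read back with HardwareID=0x96, McaType=0x0. */
            uint32_t ipid_hi = (0x0 << 16) | 0x96;
            uint32_t key = HWID_MCATYPE(ipid_hi & 0xFFF, (ipid_hi & 0xFFFF0000) >> 16);
            unsigned int i;

            for (i = 0; i < sizeof(table) / sizeof(table[0]); i++)
                    if (key == table[i].hwid_mcatype)
                            printf("bank type: %s\n", table[i].name);

            return 0;
    }
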
-
-struct thresh_restart {
-       struct threshold_block  *b;
-       int                     reset;
-       int                     set_lvt_off;
-       int                     lvt_off;
-       u16                     old_limit;
-};
-
-static inline bool is_shared_bank(int bank)
-{
-       /*
-        * Scalable MCA provides for only one core to have access to the MSRs of
-        * a shared bank.
-        */
-       if (mce_flags.smca)
-               return false;
-
-       /* Bank 4 is for northbridge reporting and is thus shared */
-       return (bank == 4);
-}
-
-static const char *bank4_names(const struct threshold_block *b)
-{
-       switch (b->address) {
-       /* MSR4_MISC0 */
-       case 0x00000413:
-               return "dram";
-
-       case 0xc0000408:
-               return "ht_links";
-
-       case 0xc0000409:
-               return "l3_cache";
-
-       default:
-               WARN(1, "Funny MSR: 0x%08x\n", b->address);
-               return "";
-       }
-};
-
-
-static bool lvt_interrupt_supported(unsigned int bank, u32 msr_high_bits)
-{
-       /*
-        * bank 4 supports APIC LVT interrupts implicitly since forever.
-        */
-       if (bank == 4)
-               return true;
-
-       /*
-        * IntP: interrupt present; if this bit is set, the thresholding
-        * bank can generate APIC LVT interrupts
-        */
-       return msr_high_bits & BIT(28);
-}
-
-static int lvt_off_valid(struct threshold_block *b, int apic, u32 lo, u32 hi)
-{
-       int msr = (hi & MASK_LVTOFF_HI) >> 20;
-
-       if (apic < 0) {
-               pr_err(FW_BUG "cpu %d, failed to setup threshold interrupt "
-                      "for bank %d, block %d (MSR%08X=0x%x%08x)\n", b->cpu,
-                      b->bank, b->block, b->address, hi, lo);
-               return 0;
-       }
-
-       if (apic != msr) {
-               /*
-                * On SMCA CPUs, LVT offset is programmed at a different MSR, and
-                * the BIOS provides the value. The original field where LVT offset
-                * was set is reserved. Return early here:
-                */
-               if (mce_flags.smca)
-                       return 0;
-
-               pr_err(FW_BUG "cpu %d, invalid threshold interrupt offset %d "
-                      "for bank %d, block %d (MSR%08X=0x%x%08x)\n",
-                      b->cpu, apic, b->bank, b->block, b->address, hi, lo);
-               return 0;
-       }
-
-       return 1;
-};
-
-/* Reprogram MCx_MISC MSR behind this threshold bank. */
-static void threshold_restart_bank(void *_tr)
-{
-       struct thresh_restart *tr = _tr;
-       u32 hi, lo;
-
-       rdmsr(tr->b->address, lo, hi);
-
-       if (tr->b->threshold_limit < (hi & THRESHOLD_MAX))
-               tr->reset = 1;  /* limit cannot be lower than err count */
-
-       if (tr->reset) {                /* reset err count and overflow bit */
-               hi =
-                   (hi & ~(MASK_ERR_COUNT_HI | MASK_OVERFLOW_HI)) |
-                   (THRESHOLD_MAX - tr->b->threshold_limit);
-       } else if (tr->old_limit) {     /* change limit w/o reset */
-               int new_count = (hi & THRESHOLD_MAX) +
-                   (tr->old_limit - tr->b->threshold_limit);
-
-               hi = (hi & ~MASK_ERR_COUNT_HI) |
-                   (new_count & THRESHOLD_MAX);
-       }
-
-       /* clear IntType */
-       hi &= ~MASK_INT_TYPE_HI;
-
-       if (!tr->b->interrupt_capable)
-               goto done;
-
-       if (tr->set_lvt_off) {
-               if (lvt_off_valid(tr->b, tr->lvt_off, lo, hi)) {
-                       /* set new lvt offset */
-                       hi &= ~MASK_LVTOFF_HI;
-                       hi |= tr->lvt_off << 20;
-               }
-       }
-
-       if (tr->b->interrupt_enable)
-               hi |= INT_TYPE_APIC;
-
- done:
-
-       hi |= MASK_COUNT_EN_HI;
-       wrmsr(tr->b->address, lo, hi);
-}
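
The counter field written by threshold_restart_bank() starts at THRESHOLD_MAX - threshold_limit, so the bank overflows (and raises the threshold interrupt) after exactly threshold_limit errors; a limit change without reset shifts the current count by the difference between the old and new limits. A tiny sketch of that arithmetic with made-up numbers:

    #include <stdio.h>

    #define THRESHOLD_MAX   0xFFF

    int main(void)
    {
            unsigned int old_limit = 0x20, new_limit = 0x10, seen = 5;

            /* Fresh programming: the bank overflows after old_limit errors. */
            unsigned int count = THRESHOLD_MAX - old_limit;
            printf("initial count:            0x%x\n", count);

            /* 'seen' errors have been counted since then. */
            count += seen;

            /* Limit change without reset keeps the already-seen errors accounted. */
            count = (count & THRESHOLD_MAX) + (old_limit - new_limit);
            printf("count after limit change: 0x%x\n", count & THRESHOLD_MAX);

            return 0;
    }
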
-
-static void mce_threshold_block_init(struct threshold_block *b, int offset)
-{
-       struct thresh_restart tr = {
-               .b                      = b,
-               .set_lvt_off            = 1,
-               .lvt_off                = offset,
-       };
-
-       b->threshold_limit              = THRESHOLD_MAX;
-       threshold_restart_bank(&tr);
-};
-
-static int setup_APIC_mce_threshold(int reserved, int new)
-{
-       if (reserved < 0 && !setup_APIC_eilvt(new, THRESHOLD_APIC_VECTOR,
-                                             APIC_EILVT_MSG_FIX, 0))
-               return new;
-
-       return reserved;
-}
-
-static int setup_APIC_deferred_error(int reserved, int new)
-{
-       if (reserved < 0 && !setup_APIC_eilvt(new, DEFERRED_ERROR_VECTOR,
-                                             APIC_EILVT_MSG_FIX, 0))
-               return new;
-
-       return reserved;
-}
-
-static void deferred_error_interrupt_enable(struct cpuinfo_x86 *c)
-{
-       u32 low = 0, high = 0;
-       int def_offset = -1, def_new;
-
-       if (rdmsr_safe(MSR_CU_DEF_ERR, &low, &high))
-               return;
-
-       def_new = (low & MASK_DEF_LVTOFF) >> 4;
-       if (!(low & MASK_DEF_LVTOFF)) {
-               pr_err(FW_BUG "Your BIOS is not setting up LVT offset 0x2 for deferred error IRQs correctly.\n");
-               def_new = DEF_LVT_OFF;
-               low = (low & ~MASK_DEF_LVTOFF) | (DEF_LVT_OFF << 4);
-       }
-
-       def_offset = setup_APIC_deferred_error(def_offset, def_new);
-       if ((def_offset == def_new) &&
-           (deferred_error_int_vector != amd_deferred_error_interrupt))
-               deferred_error_int_vector = amd_deferred_error_interrupt;
-
-       if (!mce_flags.smca)
-               low = (low & ~MASK_DEF_INT_TYPE) | DEF_INT_TYPE_APIC;
-
-       wrmsr(MSR_CU_DEF_ERR, low, high);
-}
-
-static u32 smca_get_block_address(unsigned int bank, unsigned int block)
-{
-       u32 low, high;
-       u32 addr = 0;
-
-       if (smca_get_bank_type(bank) == SMCA_RESERVED)
-               return addr;
-
-       if (!block)
-               return MSR_AMD64_SMCA_MCx_MISC(bank);
-
-       /* Check our cache first: */
-       if (smca_bank_addrs[bank][block] != -1)
-               return smca_bank_addrs[bank][block];
-
-       /*
-        * For SMCA enabled processors, the BLKPTR field of the first MISC
-        * register (MCx_MISC0) indicates the presence of the additional MISC
-        * registers (MISC1-4).
-        */
-       if (rdmsr_safe(MSR_AMD64_SMCA_MCx_CONFIG(bank), &low, &high))
-               goto out;
-
-       if (!(low & MCI_CONFIG_MCAX))
-               goto out;
-
-       if (!rdmsr_safe(MSR_AMD64_SMCA_MCx_MISC(bank), &low, &high) &&
-           (low & MASK_BLKPTR_LO))
-               addr = MSR_AMD64_SMCA_MCx_MISCy(bank, block - 1);
-
-out:
-       smca_bank_addrs[bank][block] = addr;
-       return addr;
-}
-
-static u32 get_block_address(u32 current_addr, u32 low, u32 high,
-                            unsigned int bank, unsigned int block)
-{
-       u32 addr = 0, offset = 0;
-
-       if ((bank >= mca_cfg.banks) || (block >= NR_BLOCKS))
-               return addr;
-
-       if (mce_flags.smca)
-               return smca_get_block_address(bank, block);
-
-       /* Fall back to method we used for older processors: */
-       switch (block) {
-       case 0:
-               addr = msr_ops.misc(bank);
-               break;
-       case 1:
-               offset = ((low & MASK_BLKPTR_LO) >> 21);
-               if (offset)
-                       addr = MCG_XBLK_ADDR + offset;
-               break;
-       default:
-               addr = ++current_addr;
-       }
-       return addr;
-}
-
-static int
-prepare_threshold_block(unsigned int bank, unsigned int block, u32 addr,
-                       int offset, u32 misc_high)
-{
-       unsigned int cpu = smp_processor_id();
-       u32 smca_low, smca_high;
-       struct threshold_block b;
-       int new;
-
-       if (!block)
-               per_cpu(bank_map, cpu) |= (1 << bank);
-
-       memset(&b, 0, sizeof(b));
-       b.cpu                   = cpu;
-       b.bank                  = bank;
-       b.block                 = block;
-       b.address               = addr;
-       b.interrupt_capable     = lvt_interrupt_supported(bank, misc_high);
-
-       if (!b.interrupt_capable)
-               goto done;
-
-       b.interrupt_enable = 1;
-
-       if (!mce_flags.smca) {
-               new = (misc_high & MASK_LVTOFF_HI) >> 20;
-               goto set_offset;
-       }
-
-       /* Gather LVT offset for thresholding: */
-       if (rdmsr_safe(MSR_CU_DEF_ERR, &smca_low, &smca_high))
-               goto out;
-
-       new = (smca_low & SMCA_THR_LVT_OFF) >> 12;
-
-set_offset:
-       offset = setup_APIC_mce_threshold(offset, new);
-       if (offset == new)
-               thresholding_irq_en = true;
-
-done:
-       mce_threshold_block_init(&b, offset);
-
-out:
-       return offset;
-}
-
-/* cpu init entry point, called from mce.c with preempt off */
-void mce_amd_feature_init(struct cpuinfo_x86 *c)
-{
-       u32 low = 0, high = 0, address = 0;
-       unsigned int bank, block, cpu = smp_processor_id();
-       int offset = -1;
-
-       for (bank = 0; bank < mca_cfg.banks; ++bank) {
-               if (mce_flags.smca)
-                       smca_configure(bank, cpu);
-
-               for (block = 0; block < NR_BLOCKS; ++block) {
-                       address = get_block_address(address, low, high, bank, block);
-                       if (!address)
-                               break;
-
-                       if (rdmsr_safe(address, &low, &high))
-                               break;
-
-                       if (!(high & MASK_VALID_HI))
-                               continue;
-
-                       if (!(high & MASK_CNTP_HI)  ||
-                            (high & MASK_LOCKED_HI))
-                               continue;
-
-                       offset = prepare_threshold_block(bank, block, address, offset, high);
-               }
-       }
-
-       if (mce_flags.succor)
-               deferred_error_interrupt_enable(c);
-}
-
-int umc_normaddr_to_sysaddr(u64 norm_addr, u16 nid, u8 umc, u64 *sys_addr)
-{
-       u64 dram_base_addr, dram_limit_addr, dram_hole_base;
-       /* We start from the normalized address */
-       u64 ret_addr = norm_addr;
-
-       u32 tmp;
-
-       u8 die_id_shift, die_id_mask, socket_id_shift, socket_id_mask;
-       u8 intlv_num_dies, intlv_num_chan, intlv_num_sockets;
-       u8 intlv_addr_sel, intlv_addr_bit;
-       u8 num_intlv_bits, hashed_bit;
-       u8 lgcy_mmio_hole_en, base = 0;
-       u8 cs_mask, cs_id = 0;
-       bool hash_enabled = false;
-
-       /* Read D18F0x1B4 (DramOffset), check if base 1 is used. */
-       if (amd_df_indirect_read(nid, 0, 0x1B4, umc, &tmp))
-               goto out_err;
-
-       /* Remove HiAddrOffset from normalized address, if enabled: */
-       if (tmp & BIT(0)) {
-               u64 hi_addr_offset = (tmp & GENMASK_ULL(31, 20)) << 8;
-
-               if (norm_addr >= hi_addr_offset) {
-                       ret_addr -= hi_addr_offset;
-                       base = 1;
-               }
-       }
-
-       /* Read D18F0x110 (DramBaseAddress). */
-       if (amd_df_indirect_read(nid, 0, 0x110 + (8 * base), umc, &tmp))
-               goto out_err;
-
-       /* Check if address range is valid. */
-       if (!(tmp & BIT(0))) {
-               pr_err("%s: Invalid DramBaseAddress range: 0x%x.\n",
-                       __func__, tmp);
-               goto out_err;
-       }
-
-       lgcy_mmio_hole_en = tmp & BIT(1);
-       intlv_num_chan    = (tmp >> 4) & 0xF;
-       intlv_addr_sel    = (tmp >> 8) & 0x7;
-       dram_base_addr    = (tmp & GENMASK_ULL(31, 12)) << 16;
-
-       /* {0, 1, 2, 3} map to address bits {8, 9, 10, 11} respectively */
-       if (intlv_addr_sel > 3) {
-               pr_err("%s: Invalid interleave address select %d.\n",
-                       __func__, intlv_addr_sel);
-               goto out_err;
-       }
-
-       /* Read D18F0x114 (DramLimitAddress). */
-       if (amd_df_indirect_read(nid, 0, 0x114 + (8 * base), umc, &tmp))
-               goto out_err;
-
-       intlv_num_sockets = (tmp >> 8) & 0x1;
-       intlv_num_dies    = (tmp >> 10) & 0x3;
-       dram_limit_addr   = ((tmp & GENMASK_ULL(31, 12)) << 16) | GENMASK_ULL(27, 0);
-
-       intlv_addr_bit = intlv_addr_sel + 8;
-
-       /* Re-use intlv_num_chan by setting it equal to log2(#channels) */
-       switch (intlv_num_chan) {
-       case 0: intlv_num_chan = 0; break;
-       case 1: intlv_num_chan = 1; break;
-       case 3: intlv_num_chan = 2; break;
-       case 5: intlv_num_chan = 3; break;
-       case 7: intlv_num_chan = 4; break;
-
-       case 8: intlv_num_chan = 1;
-               hash_enabled = true;
-               break;
-       default:
-               pr_err("%s: Invalid number of interleaved channels %d.\n",
-                       __func__, intlv_num_chan);
-               goto out_err;
-       }
-
-       num_intlv_bits = intlv_num_chan;
-
-       if (intlv_num_dies > 2) {
-               pr_err("%s: Invalid number of interleaved nodes/dies %d.\n",
-                       __func__, intlv_num_dies);
-               goto out_err;
-       }
-
-       num_intlv_bits += intlv_num_dies;
-
-       /* Add a bit if sockets are interleaved. */
-       num_intlv_bits += intlv_num_sockets;
-
-       /* Assert num_intlv_bits <= 4 */
-       if (num_intlv_bits > 4) {
-               pr_err("%s: Invalid interleave bits %d.\n",
-                       __func__, num_intlv_bits);
-               goto out_err;
-       }
-
-       if (num_intlv_bits > 0) {
-               u64 temp_addr_x, temp_addr_i, temp_addr_y;
-               u8 die_id_bit, sock_id_bit, cs_fabric_id;
-
-               /*
-                * Read FabricBlockInstanceInformation3_CS[BlockFabricID].
-                * This is the fabric id for this coherent slave. Use
-                * umc/channel# as instance id of the coherent slave
-                * for FICAA.
-                */
-               if (amd_df_indirect_read(nid, 0, 0x50, umc, &tmp))
-                       goto out_err;
-
-               cs_fabric_id = (tmp >> 8) & 0xFF;
-               die_id_bit   = 0;
-
-               /* If interleaved over more than 1 channel: */
-               if (intlv_num_chan) {
-                       die_id_bit = intlv_num_chan;
-                       cs_mask    = (1 << die_id_bit) - 1;
-                       cs_id      = cs_fabric_id & cs_mask;
-               }
-
-               sock_id_bit = die_id_bit;
-
-               /* Read D18F1x208 (SystemFabricIdMask). */
-               if (intlv_num_dies || intlv_num_sockets)
-                       if (amd_df_indirect_read(nid, 1, 0x208, umc, &tmp))
-                               goto out_err;
-
-               /* If interleaved over more than 1 die. */
-               if (intlv_num_dies) {
-                       sock_id_bit  = die_id_bit + intlv_num_dies;
-                       die_id_shift = (tmp >> 24) & 0xF;
-                       die_id_mask  = (tmp >> 8) & 0xFF;
-
-                       cs_id |= ((cs_fabric_id & die_id_mask) >> die_id_shift) << die_id_bit;
-               }
-
-               /* If interleaved over more than 1 socket. */
-               if (intlv_num_sockets) {
-                       socket_id_shift = (tmp >> 28) & 0xF;
-                       socket_id_mask  = (tmp >> 16) & 0xFF;
-
-                       cs_id |= ((cs_fabric_id & socket_id_mask) >> socket_id_shift) << sock_id_bit;
-               }
-
-               /*
-                * The pre-interleaved address consists of XXXXXXIIIYYYYY
-                * where III is the ID for this CS, and XXXXXXYYYYY are the
-                * address bits from the post-interleaved address.
-                * "num_intlv_bits" has been calculated to tell us how many "I"
-                * bits there are. "intlv_addr_bit" tells us how many "Y" bits
-                * there are (where "I" starts).
-                */
-               temp_addr_y = ret_addr & GENMASK_ULL(intlv_addr_bit-1, 0);
-               temp_addr_i = (cs_id << intlv_addr_bit);
-               temp_addr_x = (ret_addr & GENMASK_ULL(63, intlv_addr_bit)) << num_intlv_bits;
-               ret_addr    = temp_addr_x | temp_addr_i | temp_addr_y;
-       }
-
-       /* Add dram base address */
-       ret_addr += dram_base_addr;
-
-       /* If legacy MMIO hole enabled */
-       if (lgcy_mmio_hole_en) {
-               if (amd_df_indirect_read(nid, 0, 0x104, umc, &tmp))
-                       goto out_err;
-
-               dram_hole_base = tmp & GENMASK(31, 24);
-               if (ret_addr >= dram_hole_base)
-                       ret_addr += (BIT_ULL(32) - dram_hole_base);
-       }
-
-       if (hash_enabled) {
-               /* Save some parentheses and grab ls-bit at the end. */
-               hashed_bit =    (ret_addr >> 12) ^
-                               (ret_addr >> 18) ^
-                               (ret_addr >> 21) ^
-                               (ret_addr >> 30) ^
-                               cs_id;
-
-               hashed_bit &= BIT(0);
-
-               if (hashed_bit != ((ret_addr >> intlv_addr_bit) & BIT(0)))
-                       ret_addr ^= BIT(intlv_addr_bit);
-       }
-
-       /* Is the calculated system address above the DRAM limit address? */
-       if (ret_addr > dram_limit_addr)
-               goto out_err;
-
-       *sys_addr = ret_addr;
-       return 0;
-
-out_err:
-       return -EINVAL;
-}
-EXPORT_SYMBOL_GPL(umc_normaddr_to_sysaddr);
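
The de-interleaving step above splices the CS id back into the normalized address: the Y bits below intlv_addr_bit stay in place, the I bits carry cs_id, and the X bits above are shifted up by num_intlv_bits. A self-contained numeric sketch of just that splice, with made-up values and a local stand-in for the kernel's GENMASK_ULL():

    #include <stdint.h>
    #include <stdio.h>

    /* Local equivalent of the kernel's GENMASK_ULL(), for this example only. */
    #define GENMASK_ULL(h, l)       ((~0ULL << (l)) & (~0ULL >> (63 - (h))))

    int main(void)
    {
            uint64_t norm_addr          = 0x12345678ULL;    /* made-up normalized address */
            unsigned int intlv_addr_bit = 8;                /* interleave on address bit 8 */
            unsigned int num_intlv_bits = 2;                /* e.g. 4-way channel interleave */
            uint64_t cs_id              = 0x3;              /* id of this coherent slave */

            uint64_t y = norm_addr & GENMASK_ULL(intlv_addr_bit - 1, 0);
            uint64_t i = cs_id << intlv_addr_bit;
            uint64_t x = (norm_addr & GENMASK_ULL(63, intlv_addr_bit)) << num_intlv_bits;

            printf("pre-interleave address (before adding the DRAM base): 0x%llx\n",
                   (unsigned long long)(x | i | y));

            return 0;
    }
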
-
-bool amd_mce_is_memory_error(struct mce *m)
-{
-       /* ErrCodeExt[20:16] */
-       u8 xec = (m->status >> 16) & 0x1f;
-
-       if (mce_flags.smca)
-               return smca_get_bank_type(m->bank) == SMCA_UMC && xec == 0x0;
-
-       return m->bank == 4 && xec == 0x8;
-}
-
-static void __log_error(unsigned int bank, u64 status, u64 addr, u64 misc)
-{
-       struct mce m;
-
-       mce_setup(&m);
-
-       m.status = status;
-       m.misc   = misc;
-       m.bank   = bank;
-       m.tsc    = rdtsc();
-
-       if (m.status & MCI_STATUS_ADDRV) {
-               m.addr = addr;
-
-               /*
-                * Extract [55:<lsb>] where lsb is the least significant
-                * *valid* bit of the address bits.
-                */
-               if (mce_flags.smca) {
-                       u8 lsb = (m.addr >> 56) & 0x3f;
-
-                       m.addr &= GENMASK_ULL(55, lsb);
-               }
-       }
-
-       if (mce_flags.smca) {
-               rdmsrl(MSR_AMD64_SMCA_MCx_IPID(bank), m.ipid);
-
-               if (m.status & MCI_STATUS_SYNDV)
-                       rdmsrl(MSR_AMD64_SMCA_MCx_SYND(bank), m.synd);
-       }
-
-       mce_log(&m);
-}
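
On SMCA systems, bits 61:56 of the raw MCA_ADDR value encode the least significant valid address bit, and __log_error() keeps only bits [55:LSB]. A standalone sketch of that extraction with a made-up register value (GENMASK_ULL() again redefined locally for the example):

    #include <stdint.h>
    #include <stdio.h>

    /* Local equivalent of the kernel's GENMASK_ULL(), for this example only. */
    #define GENMASK_ULL(h, l)       ((~0ULL << (l)) & (~0ULL >> (63 - (h))))

    int main(void)
    {
            uint64_t addr = 0x0600000012345ac0ULL;  /* made-up MCA_ADDR; LSB field = 6 */
            unsigned int lsb = (addr >> 56) & 0x3f;

            printf("lsb=%u usable address=0x%llx\n", lsb,
                   (unsigned long long)(addr & GENMASK_ULL(55, lsb)));

            return 0;
    }
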
-
-asmlinkage __visible void __irq_entry smp_deferred_error_interrupt(void)
-{
-       entering_irq();
-       trace_deferred_error_apic_entry(DEFERRED_ERROR_VECTOR);
-       inc_irq_stat(irq_deferred_error_count);
-       deferred_error_int_vector();
-       trace_deferred_error_apic_exit(DEFERRED_ERROR_VECTOR);
-       exiting_ack_irq();
-}
-
-/*
 - * Returns true if the logged error is deferred, false otherwise.
- */
-static inline bool
-_log_error_bank(unsigned int bank, u32 msr_stat, u32 msr_addr, u64 misc)
-{
-       u64 status, addr = 0;
-
-       rdmsrl(msr_stat, status);
-       if (!(status & MCI_STATUS_VAL))
-               return false;
-
-       if (status & MCI_STATUS_ADDRV)
-               rdmsrl(msr_addr, addr);
-
-       __log_error(bank, status, addr, misc);
-
-       wrmsrl(msr_stat, 0);
-
-       return status & MCI_STATUS_DEFERRED;
-}
-
-/*
- * We have three scenarios for checking for Deferred errors:
- *
- * 1) Non-SMCA systems check MCA_STATUS and log error if found.
- * 2) SMCA systems check MCA_STATUS. If error is found then log it and also
- *    clear MCA_DESTAT.
- * 3) SMCA systems check MCA_DESTAT, if error was not found in MCA_STATUS, and
- *    log it.
- */
-static void log_error_deferred(unsigned int bank)
-{
-       bool defrd;
-
-       defrd = _log_error_bank(bank, msr_ops.status(bank),
-                                       msr_ops.addr(bank), 0);
-
-       if (!mce_flags.smca)
-               return;
-
-       /* Clear MCA_DESTAT if we logged the deferred error from MCA_STATUS. */
-       if (defrd) {
-               wrmsrl(MSR_AMD64_SMCA_MCx_DESTAT(bank), 0);
-               return;
-       }
-
-       /*
-        * Only deferred errors are logged in MCA_DE{STAT,ADDR} so just check
-        * for a valid error.
-        */
-       _log_error_bank(bank, MSR_AMD64_SMCA_MCx_DESTAT(bank),
-                             MSR_AMD64_SMCA_MCx_DEADDR(bank), 0);
-}
-
-/* APIC interrupt handler for deferred errors */
-static void amd_deferred_error_interrupt(void)
-{
-       unsigned int bank;
-
-       for (bank = 0; bank < mca_cfg.banks; ++bank)
-               log_error_deferred(bank);
-}
-
-static void log_error_thresholding(unsigned int bank, u64 misc)
-{
-       _log_error_bank(bank, msr_ops.status(bank), msr_ops.addr(bank), misc);
-}
-
-static void log_and_reset_block(struct threshold_block *block)
-{
-       struct thresh_restart tr;
-       u32 low = 0, high = 0;
-
-       if (!block)
-               return;
-
-       if (rdmsr_safe(block->address, &low, &high))
-               return;
-
-       if (!(high & MASK_OVERFLOW_HI))
-               return;
-
-       /* Log the MCE which caused the threshold event. */
-       log_error_thresholding(block->bank, ((u64)high << 32) | low);
-
-       /* Reset threshold block after logging error. */
-       memset(&tr, 0, sizeof(tr));
-       tr.b = block;
-       threshold_restart_bank(&tr);
-}
-
-/*
- * The threshold interrupt handler services THRESHOLD_APIC_VECTOR. The
- * interrupt fires when error_count reaches threshold_limit.
- */
-static void amd_threshold_interrupt(void)
-{
-       struct threshold_block *first_block = NULL, *block = NULL, *tmp = NULL;
-       unsigned int bank, cpu = smp_processor_id();
-
-       for (bank = 0; bank < mca_cfg.banks; ++bank) {
-               if (!(per_cpu(bank_map, cpu) & (1 << bank)))
-                       continue;
-
-               first_block = per_cpu(threshold_banks, cpu)[bank]->blocks;
-               if (!first_block)
-                       continue;
-
-               /*
-                * The first block is also the head of the list. Check it first
-                * before iterating over the rest.
-                */
-               log_and_reset_block(first_block);
-               list_for_each_entry_safe(block, tmp, &first_block->miscj, miscj)
-                       log_and_reset_block(block);
-       }
-}
-
-/*
- * Sysfs Interface
- */
-
-struct threshold_attr {
-       struct attribute attr;
-       ssize_t (*show) (struct threshold_block *, char *);
-       ssize_t (*store) (struct threshold_block *, const char *, size_t count);
-};
-
-#define SHOW_FIELDS(name)                                              \
-static ssize_t show_ ## name(struct threshold_block *b, char *buf)     \
-{                                                                      \
-       return sprintf(buf, "%lu\n", (unsigned long) b->name);          \
-}
-SHOW_FIELDS(interrupt_enable)
-SHOW_FIELDS(threshold_limit)
-
-static ssize_t
-store_interrupt_enable(struct threshold_block *b, const char *buf, size_t size)
-{
-       struct thresh_restart tr;
-       unsigned long new;
-
-       if (!b->interrupt_capable)
-               return -EINVAL;
-
-       if (kstrtoul(buf, 0, &new) < 0)
-               return -EINVAL;
-
-       b->interrupt_enable = !!new;
-
-       memset(&tr, 0, sizeof(tr));
-       tr.b            = b;
-
-       smp_call_function_single(b->cpu, threshold_restart_bank, &tr, 1);
-
-       return size;
-}
-
-static ssize_t
-store_threshold_limit(struct threshold_block *b, const char *buf, size_t size)
-{
-       struct thresh_restart tr;
-       unsigned long new;
-
-       if (kstrtoul(buf, 0, &new) < 0)
-               return -EINVAL;
-
-       if (new > THRESHOLD_MAX)
-               new = THRESHOLD_MAX;
-       if (new < 1)
-               new = 1;
-
-       memset(&tr, 0, sizeof(tr));
-       tr.old_limit = b->threshold_limit;
-       b->threshold_limit = new;
-       tr.b = b;
-
-       smp_call_function_single(b->cpu, threshold_restart_bank, &tr, 1);
-
-       return size;
-}
-
-static ssize_t show_error_count(struct threshold_block *b, char *buf)
-{
-       u32 lo, hi;
-
-       rdmsr_on_cpu(b->cpu, b->address, &lo, &hi);
-
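-       /*
-        * The hardware counter is preset to (THRESHOLD_MAX - threshold_limit)
-        * so that it overflows after threshold_limit errors; subtract that
-        * preset to report the number of errors seen since the limit was set.
-        */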
-       return sprintf(buf, "%u\n", ((hi & THRESHOLD_MAX) -
-                                    (THRESHOLD_MAX - b->threshold_limit)));
-}
-
-static struct threshold_attr error_count = {
-       .attr = {.name = __stringify(error_count), .mode = 0444 },
-       .show = show_error_count,
-};
-
-#define RW_ATTR(val)                                                   \
-static struct threshold_attr val = {                                   \
-       .attr   = {.name = __stringify(val), .mode = 0644 },            \
-       .show   = show_## val,                                          \
-       .store  = store_## val,                                         \
-};
-
-RW_ATTR(interrupt_enable);
-RW_ATTR(threshold_limit);
-
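As an aside, hand-expanding SHOW_FIELDS(threshold_limit) and RW_ATTR(threshold_limit) above gives roughly the code below (expansion reproduced here only for illustration); the show()/store() dispatchers further down then route sysfs reads and writes to these per-field callbacks via container_of():

static ssize_t show_threshold_limit(struct threshold_block *b, char *buf)
{
	return sprintf(buf, "%lu\n", (unsigned long) b->threshold_limit);
}

static struct threshold_attr threshold_limit = {
	.attr	= { .name = "threshold_limit", .mode = 0644 },
	.show	= show_threshold_limit,
	.store	= store_threshold_limit,
};
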
-static struct attribute *default_attrs[] = {
-       &threshold_limit.attr,
-       &error_count.attr,
-       NULL,   /* possibly interrupt_enable if supported, see below */
-       NULL,
-};
-
-#define to_block(k)    container_of(k, struct threshold_block, kobj)
-#define to_attr(a)     container_of(a, struct threshold_attr, attr)
-
-static ssize_t show(struct kobject *kobj, struct attribute *attr, char *buf)
-{
-       struct threshold_block *b = to_block(kobj);
-       struct threshold_attr *a = to_attr(attr);
-       ssize_t ret;
-
-       ret = a->show ? a->show(b, buf) : -EIO;
-
-       return ret;
-}
-
-static ssize_t store(struct kobject *kobj, struct attribute *attr,
-                    const char *buf, size_t count)
-{
-       struct threshold_block *b = to_block(kobj);
-       struct threshold_attr *a = to_attr(attr);
-       ssize_t ret;
-
-       ret = a->store ? a->store(b, buf, count) : -EIO;
-
-       return ret;
-}
-
-static const struct sysfs_ops threshold_ops = {
-       .show                   = show,
-       .store                  = store,
-};
-
-static struct kobj_type threshold_ktype = {
-       .sysfs_ops              = &threshold_ops,
-       .default_attrs          = default_attrs,
-};
-
-static const char *get_name(unsigned int bank, struct threshold_block *b)
-{
-       enum smca_bank_types bank_type;
-
-       if (!mce_flags.smca) {
-               if (b && bank == 4)
-                       return bank4_names(b);
-
-               return th_names[bank];
-       }
-
-       bank_type = smca_get_bank_type(bank);
-       if (bank_type >= N_SMCA_BANK_TYPES)
-               return NULL;
-
-       if (b && bank_type == SMCA_UMC) {
-               if (b->block < ARRAY_SIZE(smca_umc_block_names))
-                       return smca_umc_block_names[b->block];
-               return NULL;
-       }
-
-       if (smca_banks[bank].hwid->count == 1)
-               return smca_get_name(bank_type);
-
-       snprintf(buf_mcatype, MAX_MCATYPE_NAME_LEN,
-                "%s_%x", smca_get_name(bank_type),
-                         smca_banks[bank].sysfs_id);
-       return buf_mcatype;
-}
-
-static int allocate_threshold_blocks(unsigned int cpu, unsigned int bank,
-                                    unsigned int block, u32 address)
-{
-       struct threshold_block *b = NULL;
-       u32 low, high;
-       int err;
-
-       if ((bank >= mca_cfg.banks) || (block >= NR_BLOCKS))
-               return 0;
-
-       if (rdmsr_safe_on_cpu(cpu, address, &low, &high))
-               return 0;
-
-       if (!(high & MASK_VALID_HI)) {
-               if (block)
-                       goto recurse;
-               else
-                       return 0;
-       }
-
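-       /* Skip blocks with no counter present or which are locked by the BIOS. */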
-       if (!(high & MASK_CNTP_HI)  ||
-            (high & MASK_LOCKED_HI))
-               goto recurse;
-
-       b = kzalloc(sizeof(struct threshold_block), GFP_KERNEL);
-       if (!b)
-               return -ENOMEM;
-
-       b->block                = block;
-       b->bank                 = bank;
-       b->cpu                  = cpu;
-       b->address              = address;
-       b->interrupt_enable     = 0;
-       b->interrupt_capable    = lvt_interrupt_supported(bank, high);
-       b->threshold_limit      = THRESHOLD_MAX;
-
-       if (b->interrupt_capable) {
-               threshold_ktype.default_attrs[2] = &interrupt_enable.attr;
-               b->interrupt_enable = 1;
-       } else {
-               threshold_ktype.default_attrs[2] = NULL;
-       }
-
-       INIT_LIST_HEAD(&b->miscj);
-
-       if (per_cpu(threshold_banks, cpu)[bank]->blocks) {
-               list_add(&b->miscj,
-                        &per_cpu(threshold_banks, cpu)[bank]->blocks->miscj);
-       } else {
-               per_cpu(threshold_banks, cpu)[bank]->blocks = b;
-       }
-
-       err = kobject_init_and_add(&b->kobj, &threshold_ktype,
-                                  per_cpu(threshold_banks, cpu)[bank]->kobj,
-                                  get_name(bank, b));
-       if (err)
-               goto out_free;
-recurse:
-       address = get_block_address(address, low, high, bank, ++block);
-       if (!address)
-               return 0;
-
-       err = allocate_threshold_blocks(cpu, bank, block, address);
-       if (err)
-               goto out_free;
-
-       if (b)
-               kobject_uevent(&b->kobj, KOBJ_ADD);
-
-       return err;
-
-out_free:
-       if (b) {
-               kobject_put(&b->kobj);
-               list_del(&b->miscj);
-               kfree(b);
-       }
-       return err;
-}
-
-static int __threshold_add_blocks(struct threshold_bank *b)
-{
-       struct list_head *head = &b->blocks->miscj;
-       struct threshold_block *pos = NULL;
-       struct threshold_block *tmp = NULL;
-       int err = 0;
-
-       err = kobject_add(&b->blocks->kobj, b->kobj, b->blocks->kobj.name);
-       if (err)
-               return err;
-
-       list_for_each_entry_safe(pos, tmp, head, miscj) {
-
-               err = kobject_add(&pos->kobj, b->kobj, pos->kobj.name);
-               if (err) {
-                       list_for_each_entry_safe_reverse(pos, tmp, head, miscj)
-                               kobject_del(&pos->kobj);
-
-                       return err;
-               }
-       }
-       return err;
-}
-
-static int threshold_create_bank(unsigned int cpu, unsigned int bank)
-{
-       struct device *dev = per_cpu(mce_device, cpu);
-       struct amd_northbridge *nb = NULL;
-       struct threshold_bank *b = NULL;
-       const char *name = get_name(bank, NULL);
-       int err = 0;
-
-       if (!dev)
-               return -ENODEV;
-
-       if (is_shared_bank(bank)) {
-               nb = node_to_amd_nb(amd_get_nb_id(cpu));
-
-               /* threshold descriptor already initialized on this node? */
-               if (nb && nb->bank4) {
-                       /* yes, use it */
-                       b = nb->bank4;
-                       err = kobject_add(b->kobj, &dev->kobj, name);
-                       if (err)
-                               goto out;
-
-                       per_cpu(threshold_banks, cpu)[bank] = b;
-                       refcount_inc(&b->cpus);
-
-                       err = __threshold_add_blocks(b);
-
-                       goto out;
-               }
-       }
-
-       b = kzalloc(sizeof(struct threshold_bank), GFP_KERNEL);
-       if (!b) {
-               err = -ENOMEM;
-               goto out;
-       }
-
-       b->kobj = kobject_create_and_add(name, &dev->kobj);
-       if (!b->kobj) {
-               err = -EINVAL;
-               goto out_free;
-       }
-
-       per_cpu(threshold_banks, cpu)[bank] = b;
-
-       if (is_shared_bank(bank)) {
-               refcount_set(&b->cpus, 1);
-
-               /* nb is already initialized, see above */
-               if (nb) {
-                       WARN_ON(nb->bank4);
-                       nb->bank4 = b;
-               }
-       }
-
-       err = allocate_threshold_blocks(cpu, bank, 0, msr_ops.misc(bank));
-       if (!err)
-               goto out;
-
- out_free:
-       kfree(b);
-
- out:
-       return err;
-}
-
-static void deallocate_threshold_block(unsigned int cpu,
-                                                unsigned int bank)
-{
-       struct threshold_block *pos = NULL;
-       struct threshold_block *tmp = NULL;
-       struct threshold_bank *head = per_cpu(threshold_banks, cpu)[bank];
-
-       if (!head)
-               return;
-
-       list_for_each_entry_safe(pos, tmp, &head->blocks->miscj, miscj) {
-               kobject_put(&pos->kobj);
-               list_del(&pos->miscj);
-               kfree(pos);
-       }
-
-       kfree(per_cpu(threshold_banks, cpu)[bank]->blocks);
-       per_cpu(threshold_banks, cpu)[bank]->blocks = NULL;
-}
-
-static void __threshold_remove_blocks(struct threshold_bank *b)
-{
-       struct threshold_block *pos = NULL;
-       struct threshold_block *tmp = NULL;
-
-       kobject_del(b->kobj);
-
-       list_for_each_entry_safe(pos, tmp, &b->blocks->miscj, miscj)
-               kobject_del(&pos->kobj);
-}
-
-static void threshold_remove_bank(unsigned int cpu, int bank)
-{
-       struct amd_northbridge *nb;
-       struct threshold_bank *b;
-
-       b = per_cpu(threshold_banks, cpu)[bank];
-       if (!b)
-               return;
-
-       if (!b->blocks)
-               goto free_out;
-
-       if (is_shared_bank(bank)) {
-               if (!refcount_dec_and_test(&b->cpus)) {
-                       __threshold_remove_blocks(b);
-                       per_cpu(threshold_banks, cpu)[bank] = NULL;
-                       return;
-               } else {
-                       /*
-                        * the last CPU on this node using the shared bank is
-                        * going away, remove that bank now.
-                        */
-                       nb = node_to_amd_nb(amd_get_nb_id(cpu));
-                       nb->bank4 = NULL;
-               }
-       }
-
-       deallocate_threshold_block(cpu, bank);
-
-free_out:
-       kobject_del(b->kobj);
-       kobject_put(b->kobj);
-       kfree(b);
-       per_cpu(threshold_banks, cpu)[bank] = NULL;
-}
-
-int mce_threshold_remove_device(unsigned int cpu)
-{
-       unsigned int bank;
-
-       for (bank = 0; bank < mca_cfg.banks; ++bank) {
-               if (!(per_cpu(bank_map, cpu) & (1 << bank)))
-                       continue;
-               threshold_remove_bank(cpu, bank);
-       }
-       kfree(per_cpu(threshold_banks, cpu));
-       per_cpu(threshold_banks, cpu) = NULL;
-       return 0;
-}
-
-/* create dir/files for all valid threshold banks */
-int mce_threshold_create_device(unsigned int cpu)
-{
-       unsigned int bank;
-       struct threshold_bank **bp;
-       int err = 0;
-
-       bp = per_cpu(threshold_banks, cpu);
-       if (bp)
-               return 0;
-
-       bp = kcalloc(mca_cfg.banks, sizeof(struct threshold_bank *),
-                    GFP_KERNEL);
-       if (!bp)
-               return -ENOMEM;
-
-       per_cpu(threshold_banks, cpu) = bp;
-
-       for (bank = 0; bank < mca_cfg.banks; ++bank) {
-               if (!(per_cpu(bank_map, cpu) & (1 << bank)))
-                       continue;
-               err = threshold_create_bank(cpu, bank);
-               if (err)
-                       goto err;
-       }
-       return err;
-err:
-       mce_threshold_remove_device(cpu);
-       return err;
-}
-
-static __init int threshold_init_device(void)
-{
-       unsigned lcpu = 0;
-
-       /* to hit CPUs online before the notifier is up */
-       for_each_online_cpu(lcpu) {
-               int err = mce_threshold_create_device(lcpu);
-
-               if (err)
-                       return err;
-       }
-
-       if (thresholding_irq_en)
-               mce_threshold_vector = amd_threshold_interrupt;
-
-       return 0;
-}
-/*
- * There are three functions which need to be _initcall'ed in a logical
- * sequence:
- * 1. xen_late_init_mcelog
- * 2. mcheck_init_device
- * 3. threshold_init_device
- *
- * xen_late_init_mcelog must register xen_mce_chrdev_device before the
- * native mce_chrdev_device registration when running under Xen.
- *
- * mcheck_init_device must run before threshold_init_device so that
- * mce_device is initialized, otherwise a NULL pointer dereference will
- * cause a panic.
- *
- * So we use the following _initcalls:
- * 1. device_initcall(xen_late_init_mcelog);
- * 2. device_initcall_sync(mcheck_init_device);
- * 3. late_initcall(threshold_init_device);
- *
- * When running under Xen, the initcall order is 1, 2, 3; on bare metal,
- * we skip 1 and do only 2 and 3.
- */
-late_initcall(threshold_init_device);
diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel.c b/arch/x86/kernel/cpu/mcheck/mce_intel.c
deleted file mode 100644 (file)
index d05be30..0000000
+++ /dev/null
@@ -1,518 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * Intel specific MCE features.
- * Copyright 2004 Zwane Mwaikambo <zwane@linuxpower.ca>
- * Copyright (C) 2008, 2009 Intel Corporation
- * Author: Andi Kleen
- */
-
-#include <linux/gfp.h>
-#include <linux/interrupt.h>
-#include <linux/percpu.h>
-#include <linux/sched.h>
-#include <linux/cpumask.h>
-#include <asm/apic.h>
-#include <asm/cpufeature.h>
-#include <asm/intel-family.h>
-#include <asm/processor.h>
-#include <asm/msr.h>
-#include <asm/mce.h>
-
-#include "mce-internal.h"
-
-/*
- * Support for Intel Corrected Machine Check Interrupts. This allows
- * the CPU to raise an interrupt when a corrected machine check has happened.
- * Normally we pick those up using a regular polling timer.
- * Also supports reliable discovery of shared banks.
- */
-
-/*
- * CMCI can be delivered to multiple cpus that share a machine check bank
- * so we need to designate a single cpu to process errors logged in each bank
- * in the interrupt handler (otherwise we would have many races and potential
- * double reporting of the same error).
- * Note that this can change when a cpu is offlined or brought online since
- * some MCA banks are shared across cpus. When a cpu is offlined, cmci_clear()
- * disables CMCI on all banks owned by the cpu and clears this bitfield. At
- * this point, cmci_rediscover() kicks in and a different cpu may end up
- * taking ownership of some of the shared MCA banks that were previously
- * owned by the offlined cpu.
- */
-static DEFINE_PER_CPU(mce_banks_t, mce_banks_owned);
-
-/*
- * CMCI storm detection backoff counter
- *
- * During a storm, we reset this counter to INITIAL_CHECK_INTERVAL if we've
- * encountered an error. If not, we decrement it by one. We signal the end of
- * the CMCI storm when it reaches 0.
- */
-static DEFINE_PER_CPU(int, cmci_backoff_cnt);
-
-/*
- * cmci_discover_lock protects against parallel discovery attempts
- * which could race against each other.
- */
-static DEFINE_RAW_SPINLOCK(cmci_discover_lock);
-
-#define CMCI_THRESHOLD         1
-#define CMCI_POLL_INTERVAL     (30 * HZ)
-#define CMCI_STORM_INTERVAL    (HZ)
-#define CMCI_STORM_THRESHOLD   15
-
-static DEFINE_PER_CPU(unsigned long, cmci_time_stamp);
-static DEFINE_PER_CPU(unsigned int, cmci_storm_cnt);
-static DEFINE_PER_CPU(unsigned int, cmci_storm_state);
-
-enum {
-       CMCI_STORM_NONE,
-       CMCI_STORM_ACTIVE,
-       CMCI_STORM_SUBSIDED,
-};
-
-static atomic_t cmci_storm_on_cpus;
-
-static int cmci_supported(int *banks)
-{
-       u64 cap;
-
-       if (mca_cfg.cmci_disabled || mca_cfg.ignore_ce)
-               return 0;
-
-       /*
-        * The vendor check is not strictly needed, but the initialization is
-        * vendor-keyed and this makes sure none of the backdoors are entered
-        * otherwise.
-        */
-       if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL)
-               return 0;
-       if (!boot_cpu_has(X86_FEATURE_APIC) || lapic_get_maxlvt() < 6)
-               return 0;
-       rdmsrl(MSR_IA32_MCG_CAP, cap);
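-       /* MCG_CAP[7:0] is the number of machine check banks. */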
-       *banks = min_t(unsigned, MAX_NR_BANKS, cap & 0xff);
-       return !!(cap & MCG_CMCI_P);
-}
-
-static bool lmce_supported(void)
-{
-       u64 tmp;
-
-       if (mca_cfg.lmce_disabled)
-               return false;
-
-       rdmsrl(MSR_IA32_MCG_CAP, tmp);
-
-       /*
-        * LMCE depends on recovery support in the processor. Hence both
-        * MCG_SER_P and MCG_LMCE_P should be present in MCG_CAP.
-        */
-       if ((tmp & (MCG_SER_P | MCG_LMCE_P)) !=
-                  (MCG_SER_P | MCG_LMCE_P))
-               return false;
-
-       /*
-        * BIOS should indicate support for LMCE by setting bit 20 in
-        * IA32_FEATURE_CONTROL without which touching MCG_EXT_CTL will
-        * generate a #GP fault.
-        */
-       rdmsrl(MSR_IA32_FEATURE_CONTROL, tmp);
-       if ((tmp & (FEATURE_CONTROL_LOCKED | FEATURE_CONTROL_LMCE)) ==
-                  (FEATURE_CONTROL_LOCKED | FEATURE_CONTROL_LMCE))
-               return true;
-
-       return false;
-}
-
-bool mce_intel_cmci_poll(void)
-{
-       if (__this_cpu_read(cmci_storm_state) == CMCI_STORM_NONE)
-               return false;
-
-       /*
-        * Reset the counter if we've logged an error in the last poll
-        * during the storm.
-        */
-       if (machine_check_poll(0, this_cpu_ptr(&mce_banks_owned)))
-               this_cpu_write(cmci_backoff_cnt, INITIAL_CHECK_INTERVAL);
-       else
-               this_cpu_dec(cmci_backoff_cnt);
-
-       return true;
-}
-
-void mce_intel_hcpu_update(unsigned long cpu)
-{
-       if (per_cpu(cmci_storm_state, cpu) == CMCI_STORM_ACTIVE)
-               atomic_dec(&cmci_storm_on_cpus);
-
-       per_cpu(cmci_storm_state, cpu) = CMCI_STORM_NONE;
-}
-
-static void cmci_toggle_interrupt_mode(bool on)
-{
-       unsigned long flags, *owned;
-       int bank;
-       u64 val;
-
-       raw_spin_lock_irqsave(&cmci_discover_lock, flags);
-       owned = this_cpu_ptr(mce_banks_owned);
-       for_each_set_bit(bank, owned, MAX_NR_BANKS) {
-               rdmsrl(MSR_IA32_MCx_CTL2(bank), val);
-
-               if (on)
-                       val |= MCI_CTL2_CMCI_EN;
-               else
-                       val &= ~MCI_CTL2_CMCI_EN;
-
-               wrmsrl(MSR_IA32_MCx_CTL2(bank), val);
-       }
-       raw_spin_unlock_irqrestore(&cmci_discover_lock, flags);
-}
-
-unsigned long cmci_intel_adjust_timer(unsigned long interval)
-{
-       if ((this_cpu_read(cmci_backoff_cnt) > 0) &&
-           (__this_cpu_read(cmci_storm_state) == CMCI_STORM_ACTIVE)) {
-               mce_notify_irq();
-               return CMCI_STORM_INTERVAL;
-       }
-
-       switch (__this_cpu_read(cmci_storm_state)) {
-       case CMCI_STORM_ACTIVE:
-
-               /*
-                * We switch back to interrupt mode once the poll timer has
-                * silenced itself. That means no events recorded and the timer
-                * interval is back to our poll interval.
-                */
-               __this_cpu_write(cmci_storm_state, CMCI_STORM_SUBSIDED);
-               if (!atomic_sub_return(1, &cmci_storm_on_cpus))
-                       pr_notice("CMCI storm subsided: switching to interrupt mode\n");
-
-               /* FALLTHROUGH */
-
-       case CMCI_STORM_SUBSIDED:
-               /*
-                * We wait for all CPUs to go back to SUBSIDED state. When that
-                * happens we switch back to interrupt mode.
-                */
-               if (!atomic_read(&cmci_storm_on_cpus)) {
-                       __this_cpu_write(cmci_storm_state, CMCI_STORM_NONE);
-                       cmci_toggle_interrupt_mode(true);
-                       cmci_recheck();
-               }
-               return CMCI_POLL_INTERVAL;
-       default:
-
-               /* We have shiny weather. Let the poll do whatever it thinks. */
-               return interval;
-       }
-}
-
-static bool cmci_storm_detect(void)
-{
-       unsigned int cnt = __this_cpu_read(cmci_storm_cnt);
-       unsigned long ts = __this_cpu_read(cmci_time_stamp);
-       unsigned long now = jiffies;
-       int r;
-
-       if (__this_cpu_read(cmci_storm_state) != CMCI_STORM_NONE)
-               return true;
-
-       if (time_before_eq(now, ts + CMCI_STORM_INTERVAL)) {
-               cnt++;
-       } else {
-               cnt = 1;
-               __this_cpu_write(cmci_time_stamp, now);
-       }
-       __this_cpu_write(cmci_storm_cnt, cnt);
-
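-       /* More than CMCI_STORM_THRESHOLD CMCIs within CMCI_STORM_INTERVAL: storm. */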
-       if (cnt <= CMCI_STORM_THRESHOLD)
-               return false;
-
-       cmci_toggle_interrupt_mode(false);
-       __this_cpu_write(cmci_storm_state, CMCI_STORM_ACTIVE);
-       r = atomic_add_return(1, &cmci_storm_on_cpus);
-       mce_timer_kick(CMCI_STORM_INTERVAL);
-       this_cpu_write(cmci_backoff_cnt, INITIAL_CHECK_INTERVAL);
-
-       if (r == 1)
-               pr_notice("CMCI storm detected: switching to poll mode\n");
-       return true;
-}
-
-/*
- * The interrupt handler. This is called on every event.
- * Just call the poller directly to log any events.
- * This could in theory increase the threshold under high load,
- * but doesn't for now.
- */
-static void intel_threshold_interrupt(void)
-{
-       if (cmci_storm_detect())
-               return;
-
-       machine_check_poll(MCP_TIMESTAMP, this_cpu_ptr(&mce_banks_owned));
-}
-
-/*
- * Enable CMCI (Corrected Machine Check Interrupt) for available MCE banks
- * on this CPU. Use the algorithm recommended in the SDM to discover shared
- * banks.
- */
-static void cmci_discover(int banks)
-{
-       unsigned long *owned = (void *)this_cpu_ptr(&mce_banks_owned);
-       unsigned long flags;
-       int i;
-       int bios_wrong_thresh = 0;
-
-       raw_spin_lock_irqsave(&cmci_discover_lock, flags);
-       for (i = 0; i < banks; i++) {
-               u64 val;
-               int bios_zero_thresh = 0;
-
-               if (test_bit(i, owned))
-                       continue;
-
-               /* Skip banks in firmware first mode */
-               if (test_bit(i, mce_banks_ce_disabled))
-                       continue;
-
-               rdmsrl(MSR_IA32_MCx_CTL2(i), val);
-
-               /* Already owned by someone else? */
-               if (val & MCI_CTL2_CMCI_EN) {
-                       clear_bit(i, owned);
-                       __clear_bit(i, this_cpu_ptr(mce_poll_banks));
-                       continue;
-               }
-
-               if (!mca_cfg.bios_cmci_threshold) {
-                       val &= ~MCI_CTL2_CMCI_THRESHOLD_MASK;
-                       val |= CMCI_THRESHOLD;
-               } else if (!(val & MCI_CTL2_CMCI_THRESHOLD_MASK)) {
-                       /*
-                        * If the bios_cmci_threshold boot option was specified
-                        * but the threshold is zero, we'll try to initialize
-                        * it to 1.
-                        */
-                       bios_zero_thresh = 1;
-                       val |= CMCI_THRESHOLD;
-               }
-
-               val |= MCI_CTL2_CMCI_EN;
-               wrmsrl(MSR_IA32_MCx_CTL2(i), val);
-               rdmsrl(MSR_IA32_MCx_CTL2(i), val);
-
-               /* Did the enable bit stick? -- the bank supports CMCI */
-               if (val & MCI_CTL2_CMCI_EN) {
-                       set_bit(i, owned);
-                       __clear_bit(i, this_cpu_ptr(mce_poll_banks));
-                       /*
-                        * We are able to set thresholds for some banks that
-                        * had a threshold of 0. This means the BIOS has not
-                        * set the thresholds properly or does not work with
-                        * this boot option. Note down now and report later.
-                        */
-                       if (mca_cfg.bios_cmci_threshold && bios_zero_thresh &&
-                                       (val & MCI_CTL2_CMCI_THRESHOLD_MASK))
-                               bios_wrong_thresh = 1;
-               } else {
-                       WARN_ON(!test_bit(i, this_cpu_ptr(mce_poll_banks)));
-               }
-       }
-       raw_spin_unlock_irqrestore(&cmci_discover_lock, flags);
-       if (mca_cfg.bios_cmci_threshold && bios_wrong_thresh) {
-               pr_info_once(
-                       "bios_cmci_threshold: Some banks do not have valid thresholds set\n");
-               pr_info_once(
-                       "bios_cmci_threshold: Make sure your BIOS supports this boot option\n");
-       }
-}
-
-/*
- * Just in case we missed an event during initialization, check
- * all the CMCI-owned banks.
- */
-void cmci_recheck(void)
-{
-       unsigned long flags;
-       int banks;
-
-       if (!mce_available(raw_cpu_ptr(&cpu_info)) || !cmci_supported(&banks))
-               return;
-
-       local_irq_save(flags);
-       machine_check_poll(0, this_cpu_ptr(&mce_banks_owned));
-       local_irq_restore(flags);
-}
-
-/* Caller must hold the lock on cmci_discover_lock */
-static void __cmci_disable_bank(int bank)
-{
-       u64 val;
-
-       if (!test_bit(bank, this_cpu_ptr(mce_banks_owned)))
-               return;
-       rdmsrl(MSR_IA32_MCx_CTL2(bank), val);
-       val &= ~MCI_CTL2_CMCI_EN;
-       wrmsrl(MSR_IA32_MCx_CTL2(bank), val);
-       __clear_bit(bank, this_cpu_ptr(mce_banks_owned));
-}
-
-/*
- * Disable CMCI on this CPU for all banks it owns when it goes down.
- * This allows other CPUs to claim the banks on rediscovery.
- */
-void cmci_clear(void)
-{
-       unsigned long flags;
-       int i;
-       int banks;
-
-       if (!cmci_supported(&banks))
-               return;
-       raw_spin_lock_irqsave(&cmci_discover_lock, flags);
-       for (i = 0; i < banks; i++)
-               __cmci_disable_bank(i);
-       raw_spin_unlock_irqrestore(&cmci_discover_lock, flags);
-}
-
-static void cmci_rediscover_work_func(void *arg)
-{
-       int banks;
-
-       /* Recheck banks in case CPUs don't all have the same set of banks */
-       if (cmci_supported(&banks))
-               cmci_discover(banks);
-}
-
-/* After a CPU went down cycle through all the others and rediscover */
-void cmci_rediscover(void)
-{
-       int banks;
-
-       if (!cmci_supported(&banks))
-               return;
-
-       on_each_cpu(cmci_rediscover_work_func, NULL, 1);
-}
-
-/*
- * Reenable CMCI on this CPU in case a CPU down failed.
- */
-void cmci_reenable(void)
-{
-       int banks;
-       if (cmci_supported(&banks))
-               cmci_discover(banks);
-}
-
-void cmci_disable_bank(int bank)
-{
-       int banks;
-       unsigned long flags;
-
-       if (!cmci_supported(&banks))
-               return;
-
-       raw_spin_lock_irqsave(&cmci_discover_lock, flags);
-       __cmci_disable_bank(bank);
-       raw_spin_unlock_irqrestore(&cmci_discover_lock, flags);
-}
-
-static void intel_init_cmci(void)
-{
-       int banks;
-
-       if (!cmci_supported(&banks))
-               return;
-
-       mce_threshold_vector = intel_threshold_interrupt;
-       cmci_discover(banks);
-       /*
-        * For CPU #0 this runs with the APIC still disabled, but that's
-        * OK because only the vector is set up. We still do another
-        * check of the banks later for CPU #0 just to make sure we
-        * don't miss any events.
-        */
-       apic_write(APIC_LVTCMCI, THRESHOLD_APIC_VECTOR|APIC_DM_FIXED);
-       cmci_recheck();
-}
-
-static void intel_init_lmce(void)
-{
-       u64 val;
-
-       if (!lmce_supported())
-               return;
-
-       rdmsrl(MSR_IA32_MCG_EXT_CTL, val);
-
-       if (!(val & MCG_EXT_CTL_LMCE_EN))
-               wrmsrl(MSR_IA32_MCG_EXT_CTL, val | MCG_EXT_CTL_LMCE_EN);
-}
-
-static void intel_clear_lmce(void)
-{
-       u64 val;
-
-       if (!lmce_supported())
-               return;
-
-       rdmsrl(MSR_IA32_MCG_EXT_CTL, val);
-       val &= ~MCG_EXT_CTL_LMCE_EN;
-       wrmsrl(MSR_IA32_MCG_EXT_CTL, val);
-}
-
-static void intel_ppin_init(struct cpuinfo_x86 *c)
-{
-       unsigned long long val;
-
-       /*
-        * Even if testing the presence of the MSR would be enough, we don't
-        * want to risk the situation where other models reuse this MSR for
-        * other purposes.
-        */
-       switch (c->x86_model) {
-       case INTEL_FAM6_IVYBRIDGE_X:
-       case INTEL_FAM6_HASWELL_X:
-       case INTEL_FAM6_BROADWELL_XEON_D:
-       case INTEL_FAM6_BROADWELL_X:
-       case INTEL_FAM6_SKYLAKE_X:
-       case INTEL_FAM6_XEON_PHI_KNL:
-       case INTEL_FAM6_XEON_PHI_KNM:
-
-               if (rdmsrl_safe(MSR_PPIN_CTL, &val))
-                       return;
-
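-               /* MSR_PPIN_CTL: bit 0 is LockOut, bit 1 is Enable_PPIN. */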
-               if ((val & 3UL) == 1UL) {
-                       /* PPIN available but disabled: */
-                       return;
-               }
-
-               /* If PPIN is disabled, but not locked, try to enable: */
-               if (!(val & 3UL)) {
-                       wrmsrl_safe(MSR_PPIN_CTL,  val | 2UL);
-                       rdmsrl_safe(MSR_PPIN_CTL, &val);
-               }
-
-               if ((val & 3UL) == 2UL)
-                       set_cpu_cap(c, X86_FEATURE_INTEL_PPIN);
-       }
-}
-
-void mce_intel_feature_init(struct cpuinfo_x86 *c)
-{
-       intel_init_thermal(c);
-       intel_init_cmci();
-       intel_init_lmce();
-       intel_ppin_init(c);
-}
-
-void mce_intel_feature_clear(struct cpuinfo_x86 *c)
-{
-       intel_clear_lmce();
-}
diff --git a/arch/x86/kernel/cpu/mcheck/p5.c b/arch/x86/kernel/cpu/mcheck/p5.c
deleted file mode 100644 (file)
index 5cddf83..0000000
+++ /dev/null
@@ -1,69 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * P5 specific Machine Check Exception Reporting
- * (C) Copyright 2002 Alan Cox <alan@lxorguk.ukuu.org.uk>
- */
-#include <linux/interrupt.h>
-#include <linux/kernel.h>
-#include <linux/types.h>
-#include <linux/smp.h>
-
-#include <asm/processor.h>
-#include <asm/traps.h>
-#include <asm/tlbflush.h>
-#include <asm/mce.h>
-#include <asm/msr.h>
-
-/* By default disabled */
-int mce_p5_enabled __read_mostly;
-
-/* Machine check handler for Pentium class Intel CPUs: */
-static void pentium_machine_check(struct pt_regs *regs, long error_code)
-{
-       u32 loaddr, hi, lotype;
-
-       ist_enter(regs);
-
-       rdmsr(MSR_IA32_P5_MC_ADDR, loaddr, hi);
-       rdmsr(MSR_IA32_P5_MC_TYPE, lotype, hi);
-
-       pr_emerg("CPU#%d: Machine Check Exception:  0x%8X (type 0x%8X).\n",
-                smp_processor_id(), loaddr, lotype);
-
-       if (lotype & (1<<5)) {
-               pr_emerg("CPU#%d: Possible thermal failure (CPU on fire ?).\n",
-                        smp_processor_id());
-       }
-
-       add_taint(TAINT_MACHINE_CHECK, LOCKDEP_NOW_UNRELIABLE);
-
-       ist_exit(regs);
-}
-
-/* Set up machine check reporting for processors with Intel style MCE: */
-void intel_p5_mcheck_init(struct cpuinfo_x86 *c)
-{
-       u32 l, h;
-
-       /* Default P5 to off as it's often misconnected: */
-       if (!mce_p5_enabled)
-               return;
-
-       /* Check for MCE support: */
-       if (!cpu_has(c, X86_FEATURE_MCE))
-               return;
-
-       machine_check_vector = pentium_machine_check;
-       /* Make sure the vector pointer is visible before we enable MCEs: */
-       wmb();
-
-       /* Read registers before enabling: */
-       rdmsr(MSR_IA32_P5_MC_ADDR, l, h);
-       rdmsr(MSR_IA32_P5_MC_TYPE, l, h);
-       pr_info("Intel old style machine check architecture supported.\n");
-
-       /* Enable MCE: */
-       cr4_set_bits(X86_CR4_MCE);
-       pr_info("Intel old style machine check reporting enabled on CPU#%d.\n",
-               smp_processor_id());
-}
diff --git a/arch/x86/kernel/cpu/mcheck/therm_throt.c b/arch/x86/kernel/cpu/mcheck/therm_throt.c
deleted file mode 100644 (file)
index 2da67b7..0000000
+++ /dev/null
@@ -1,520 +0,0 @@
-/*
- * Thermal throttle event support code (such as syslog messaging and rate
- * limiting) that was factored out from x86_64 (mce_intel.c) and i386 (p4.c).
- *
- * This allows consistent reporting of CPU thermal throttle events.
- *
- * Maintains a counter in /sys that keeps track of the number of thermal
- * events, such that the user knows how bad the thermal problem might be
- * (since the logging to syslog is rate limited).
- *
- * Author: Dmitriy Zavin (dmitriyz@google.com)
- *
- * Credits: Adapted from Zwane Mwaikambo's original code in mce_intel.c.
- *          Inspired by Ross Biro's and Al Borchers' counter code.
- */
-#include <linux/interrupt.h>
-#include <linux/notifier.h>
-#include <linux/jiffies.h>
-#include <linux/kernel.h>
-#include <linux/percpu.h>
-#include <linux/export.h>
-#include <linux/types.h>
-#include <linux/init.h>
-#include <linux/smp.h>
-#include <linux/cpu.h>
-
-#include <asm/processor.h>
-#include <asm/apic.h>
-#include <asm/mce.h>
-#include <asm/msr.h>
-#include <asm/trace/irq_vectors.h>
-
-/* How long to wait between reporting thermal events */
-#define CHECK_INTERVAL         (300 * HZ)
-
-#define THERMAL_THROTTLING_EVENT       0
-#define POWER_LIMIT_EVENT              1
-
-/*
- * Current thermal event state:
- */
-struct _thermal_state {
-       bool                    new_event;
-       int                     event;
-       u64                     next_check;
-       unsigned long           count;
-       unsigned long           last_count;
-};
-
-struct thermal_state {
-       struct _thermal_state core_throttle;
-       struct _thermal_state core_power_limit;
-       struct _thermal_state package_throttle;
-       struct _thermal_state package_power_limit;
-       struct _thermal_state core_thresh0;
-       struct _thermal_state core_thresh1;
-       struct _thermal_state pkg_thresh0;
-       struct _thermal_state pkg_thresh1;
-};
-
-/* Callback to handle core threshold interrupts */
-int (*platform_thermal_notify)(__u64 msr_val);
-EXPORT_SYMBOL(platform_thermal_notify);
-
-/* Callback to handle package threshold interrupts */
-int (*platform_thermal_package_notify)(__u64 msr_val);
-EXPORT_SYMBOL_GPL(platform_thermal_package_notify);
-
-/* Callback support for rate control; returns true if the
- * callback implements its own rate control */
-bool (*platform_thermal_package_rate_control)(void);
-EXPORT_SYMBOL_GPL(platform_thermal_package_rate_control);
-
-
-static DEFINE_PER_CPU(struct thermal_state, thermal_state);
-
-static atomic_t therm_throt_en = ATOMIC_INIT(0);
-
-static u32 lvtthmr_init __read_mostly;
-
-#ifdef CONFIG_SYSFS
-#define define_therm_throt_device_one_ro(_name)                                \
-       static DEVICE_ATTR(_name, 0444,                                 \
-                          therm_throt_device_show_##_name,             \
-                                  NULL)                                \
-
-#define define_therm_throt_device_show_func(event, name)               \
-                                                                       \
-static ssize_t therm_throt_device_show_##event##_##name(               \
-                       struct device *dev,                             \
-                       struct device_attribute *attr,                  \
-                       char *buf)                                      \
-{                                                                      \
-       unsigned int cpu = dev->id;                                     \
-       ssize_t ret;                                                    \
-                                                                       \
-       preempt_disable();      /* CPU hotplug */                       \
-       if (cpu_online(cpu)) {                                          \
-               ret = sprintf(buf, "%lu\n",                             \
-                             per_cpu(thermal_state, cpu).event.name);  \
-       } else                                                          \
-               ret = 0;                                                \
-       preempt_enable();                                               \
-                                                                       \
-       return ret;                                                     \
-}
-
-define_therm_throt_device_show_func(core_throttle, count);
-define_therm_throt_device_one_ro(core_throttle_count);
-
-define_therm_throt_device_show_func(core_power_limit, count);
-define_therm_throt_device_one_ro(core_power_limit_count);
-
-define_therm_throt_device_show_func(package_throttle, count);
-define_therm_throt_device_one_ro(package_throttle_count);
-
-define_therm_throt_device_show_func(package_power_limit, count);
-define_therm_throt_device_one_ro(package_power_limit_count);
-
-static struct attribute *thermal_throttle_attrs[] = {
-       &dev_attr_core_throttle_count.attr,
-       NULL
-};
-
-static const struct attribute_group thermal_attr_group = {
-       .attrs  = thermal_throttle_attrs,
-       .name   = "thermal_throttle"
-};
-#endif /* CONFIG_SYSFS */
-
-#define CORE_LEVEL     0
-#define PACKAGE_LEVEL  1
-
-/**
- * therm_throt_process - Process thermal throttling event from interrupt
- * @new_event: Whether the condition is currently active (boolean), since the
- *             thermal interrupt normally gets called both when the thermal
- *             event begins and once the event has ended.
- * @event:     THERMAL_THROTTLING_EVENT or POWER_LIMIT_EVENT.
- * @level:     CORE_LEVEL or PACKAGE_LEVEL.
- *
- * This function is called by the thermal interrupt after the
- * IRQ has been acknowledged.
- *
- * It will take care of rate limiting and printing messages to the syslog.
- */
-static void therm_throt_process(bool new_event, int event, int level)
-{
-       struct _thermal_state *state;
-       unsigned int this_cpu = smp_processor_id();
-       bool old_event;
-       u64 now;
-       struct thermal_state *pstate = &per_cpu(thermal_state, this_cpu);
-
-       now = get_jiffies_64();
-       if (level == CORE_LEVEL) {
-               if (event == THERMAL_THROTTLING_EVENT)
-                       state = &pstate->core_throttle;
-               else if (event == POWER_LIMIT_EVENT)
-                       state = &pstate->core_power_limit;
-               else
-                       return;
-       } else if (level == PACKAGE_LEVEL) {
-               if (event == THERMAL_THROTTLING_EVENT)
-                       state = &pstate->package_throttle;
-               else if (event == POWER_LIMIT_EVENT)
-                       state = &pstate->package_power_limit;
-               else
-                       return;
-       } else
-               return;
-
-       old_event = state->new_event;
-       state->new_event = new_event;
-
-       if (new_event)
-               state->count++;
-
-       if (time_before64(now, state->next_check) &&
-                       state->count != state->last_count)
-               return;
-
-       state->next_check = now + CHECK_INTERVAL;
-       state->last_count = state->count;
-
-       /* if we just entered the thermal event */
-       if (new_event) {
-               if (event == THERMAL_THROTTLING_EVENT)
-                       pr_crit("CPU%d: %s temperature above threshold, cpu clock throttled (total events = %lu)\n",
-                               this_cpu,
-                               level == CORE_LEVEL ? "Core" : "Package",
-                               state->count);
-               return;
-       }
-       if (old_event) {
-               if (event == THERMAL_THROTTLING_EVENT)
-                       pr_info("CPU%d: %s temperature/speed normal\n", this_cpu,
-                               level == CORE_LEVEL ? "Core" : "Package");
-               return;
-       }
-}
-
-static int thresh_event_valid(int level, int event)
-{
-       struct _thermal_state *state;
-       unsigned int this_cpu = smp_processor_id();
-       struct thermal_state *pstate = &per_cpu(thermal_state, this_cpu);
-       u64 now = get_jiffies_64();
-
-       if (level == PACKAGE_LEVEL)
-               state = (event == 0) ? &pstate->pkg_thresh0 :
-                                               &pstate->pkg_thresh1;
-       else
-               state = (event == 0) ? &pstate->core_thresh0 :
-                                               &pstate->core_thresh1;
-
-       if (time_before64(now, state->next_check))
-               return 0;
-
-       state->next_check = now + CHECK_INTERVAL;
-
-       return 1;
-}
-
-static bool int_pln_enable;
-static int __init int_pln_enable_setup(char *s)
-{
-       int_pln_enable = true;
-
-       return 1;
-}
-__setup("int_pln_enable", int_pln_enable_setup);
-
-#ifdef CONFIG_SYSFS
-/* Add/Remove thermal_throttle interface for CPU device: */
-static int thermal_throttle_add_dev(struct device *dev, unsigned int cpu)
-{
-       int err;
-       struct cpuinfo_x86 *c = &cpu_data(cpu);
-
-       err = sysfs_create_group(&dev->kobj, &thermal_attr_group);
-       if (err)
-               return err;
-
-       if (cpu_has(c, X86_FEATURE_PLN) && int_pln_enable)
-               err = sysfs_add_file_to_group(&dev->kobj,
-                                             &dev_attr_core_power_limit_count.attr,
-                                             thermal_attr_group.name);
-       if (cpu_has(c, X86_FEATURE_PTS)) {
-               err = sysfs_add_file_to_group(&dev->kobj,
-                                             &dev_attr_package_throttle_count.attr,
-                                             thermal_attr_group.name);
-               if (cpu_has(c, X86_FEATURE_PLN) && int_pln_enable)
-                       err = sysfs_add_file_to_group(&dev->kobj,
-                                       &dev_attr_package_power_limit_count.attr,
-                                       thermal_attr_group.name);
-       }
-
-       return err;
-}
-
-static void thermal_throttle_remove_dev(struct device *dev)
-{
-       sysfs_remove_group(&dev->kobj, &thermal_attr_group);
-}
-
-/* Get notified when a cpu comes on/off. Be hotplug friendly. */
-static int thermal_throttle_online(unsigned int cpu)
-{
-       struct device *dev = get_cpu_device(cpu);
-
-       return thermal_throttle_add_dev(dev, cpu);
-}
-
-static int thermal_throttle_offline(unsigned int cpu)
-{
-       struct device *dev = get_cpu_device(cpu);
-
-       thermal_throttle_remove_dev(dev);
-       return 0;
-}
-
-static __init int thermal_throttle_init_device(void)
-{
-       int ret;
-
-       if (!atomic_read(&therm_throt_en))
-               return 0;
-
-       ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "x86/therm:online",
-                               thermal_throttle_online,
-                               thermal_throttle_offline);
-       return ret < 0 ? ret : 0;
-}
-device_initcall(thermal_throttle_init_device);
-
-#endif /* CONFIG_SYSFS */
-
-static void notify_package_thresholds(__u64 msr_val)
-{
-       bool notify_thres_0 = false;
-       bool notify_thres_1 = false;
-
-       if (!platform_thermal_package_notify)
-               return;
-
-       /* lower threshold check */
-       if (msr_val & THERM_LOG_THRESHOLD0)
-               notify_thres_0 = true;
-       /* higher threshold check */
-       if (msr_val & THERM_LOG_THRESHOLD1)
-               notify_thres_1 = true;
-
-       if (!notify_thres_0 && !notify_thres_1)
-               return;
-
-       if (platform_thermal_package_rate_control &&
-               platform_thermal_package_rate_control()) {
-               /* Rate control is implemented in callback */
-               platform_thermal_package_notify(msr_val);
-               return;
-       }
-
-       /* lower threshold reached */
-       if (notify_thres_0 && thresh_event_valid(PACKAGE_LEVEL, 0))
-               platform_thermal_package_notify(msr_val);
-       /* higher threshold reached */
-       if (notify_thres_1 && thresh_event_valid(PACKAGE_LEVEL, 1))
-               platform_thermal_package_notify(msr_val);
-}
-
-static void notify_thresholds(__u64 msr_val)
-{
-       /* check whether the interrupt handler is defined;
-        * otherwise simply return
-        */
-       if (!platform_thermal_notify)
-               return;
-
-       /* lower threshold reached */
-       if ((msr_val & THERM_LOG_THRESHOLD0) &&
-                       thresh_event_valid(CORE_LEVEL, 0))
-               platform_thermal_notify(msr_val);
-       /* higher threshold reached */
-       if ((msr_val & THERM_LOG_THRESHOLD1) &&
-                       thresh_event_valid(CORE_LEVEL, 1))
-               platform_thermal_notify(msr_val);
-}
-
-/* Thermal transition interrupt handler */
-static void intel_thermal_interrupt(void)
-{
-       __u64 msr_val;
-
-       if (static_cpu_has(X86_FEATURE_HWP))
-               wrmsrl_safe(MSR_HWP_STATUS, 0);
-
-       rdmsrl(MSR_IA32_THERM_STATUS, msr_val);
-
-       /* Check for violation of core thermal thresholds */
-       notify_thresholds(msr_val);
-
-       therm_throt_process(msr_val & THERM_STATUS_PROCHOT,
-                           THERMAL_THROTTLING_EVENT,
-                           CORE_LEVEL);
-
-       if (this_cpu_has(X86_FEATURE_PLN) && int_pln_enable)
-               therm_throt_process(msr_val & THERM_STATUS_POWER_LIMIT,
-                                       POWER_LIMIT_EVENT,
-                                       CORE_LEVEL);
-
-       if (this_cpu_has(X86_FEATURE_PTS)) {
-               rdmsrl(MSR_IA32_PACKAGE_THERM_STATUS, msr_val);
-               /* check violations of package thermal thresholds */
-               notify_package_thresholds(msr_val);
-               therm_throt_process(msr_val & PACKAGE_THERM_STATUS_PROCHOT,
-                                       THERMAL_THROTTLING_EVENT,
-                                       PACKAGE_LEVEL);
-               if (this_cpu_has(X86_FEATURE_PLN) && int_pln_enable)
-                       therm_throt_process(msr_val &
-                                       PACKAGE_THERM_STATUS_POWER_LIMIT,
-                                       POWER_LIMIT_EVENT,
-                                       PACKAGE_LEVEL);
-       }
-}
-
-static void unexpected_thermal_interrupt(void)
-{
-       pr_err("CPU%d: Unexpected LVT thermal interrupt!\n",
-               smp_processor_id());
-}
-
-static void (*smp_thermal_vector)(void) = unexpected_thermal_interrupt;
-
-asmlinkage __visible void __irq_entry smp_thermal_interrupt(struct pt_regs *r)
-{
-       entering_irq();
-       trace_thermal_apic_entry(THERMAL_APIC_VECTOR);
-       inc_irq_stat(irq_thermal_count);
-       smp_thermal_vector();
-       trace_thermal_apic_exit(THERMAL_APIC_VECTOR);
-       exiting_ack_irq();
-}
-
-/* Thermal monitoring depends on APIC, ACPI and clock modulation */
-static int intel_thermal_supported(struct cpuinfo_x86 *c)
-{
-       if (!boot_cpu_has(X86_FEATURE_APIC))
-               return 0;
-       if (!cpu_has(c, X86_FEATURE_ACPI) || !cpu_has(c, X86_FEATURE_ACC))
-               return 0;
-       return 1;
-}
-
-void __init mcheck_intel_therm_init(void)
-{
-       /*
-        * This function is only called on the boot CPU. Save the initial
-        * thermal LVT value on the BSP and use that value to restore the
-        * APs' thermal LVT entry that the BIOS programmed later.
-        */
-       if (intel_thermal_supported(&boot_cpu_data))
-               lvtthmr_init = apic_read(APIC_LVTTHMR);
-}
-
-void intel_init_thermal(struct cpuinfo_x86 *c)
-{
-       unsigned int cpu = smp_processor_id();
-       int tm2 = 0;
-       u32 l, h;
-
-       if (!intel_thermal_supported(c))
-               return;
-
-       /*
-        * First check if it's enabled already, in which case there might
-        * be some SMM goo which handles it, so we can't even put a handler
-        * since it might be delivered via SMI already:
-        */
-       rdmsr(MSR_IA32_MISC_ENABLE, l, h);
-
-       h = lvtthmr_init;
-       /*
-        * The initial value of thermal LVT entries on all APs always reads
-        * 0x10000 because APs are woken up by BSP issuing INIT-SIPI-SIPI
-        * sequence to them and LVT registers are reset to 0s except for
-        * the mask bits which are set to 1s when APs receive INIT IPI.
-        * If the BIOS takes over the thermal interrupt and sets its delivery
-        * mode to SMI (not fixed), we restore on the AP the value that the
-        * BIOS programmed, based on the BSP's value saved above, since the
-        * BIOS always sets the same value for all threads/cores.
-        */
-       if ((h & APIC_DM_FIXED_MASK) != APIC_DM_FIXED)
-               apic_write(APIC_LVTTHMR, lvtthmr_init);
-
-
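-       /* TM1 already enabled and routed to SMI: thermal handling is owned by the BIOS. */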
-       if ((l & MSR_IA32_MISC_ENABLE_TM1) && (h & APIC_DM_SMI)) {
-               if (system_state == SYSTEM_BOOTING)
-                       pr_debug("CPU%d: Thermal monitoring handled by SMI\n", cpu);
-               return;
-       }
-
-       /* Early Pentium M models use a different method for enabling TM2 */
-       if (cpu_has(c, X86_FEATURE_TM2)) {
-               if (c->x86 == 6 && (c->x86_model == 9 || c->x86_model == 13)) {
-                       rdmsr(MSR_THERM2_CTL, l, h);
-                       if (l & MSR_THERM2_CTL_TM_SELECT)
-                               tm2 = 1;
-               } else if (l & MSR_IA32_MISC_ENABLE_TM2)
-                       tm2 = 1;
-       }
-
-       /* We'll mask the thermal vector in the lapic till we're ready: */
-       h = THERMAL_APIC_VECTOR | APIC_DM_FIXED | APIC_LVT_MASKED;
-       apic_write(APIC_LVTTHMR, h);
-
-       rdmsr(MSR_IA32_THERM_INTERRUPT, l, h);
-       if (cpu_has(c, X86_FEATURE_PLN) && !int_pln_enable)
-               wrmsr(MSR_IA32_THERM_INTERRUPT,
-                       (l | (THERM_INT_LOW_ENABLE
-                       | THERM_INT_HIGH_ENABLE)) & ~THERM_INT_PLN_ENABLE, h);
-       else if (cpu_has(c, X86_FEATURE_PLN) && int_pln_enable)
-               wrmsr(MSR_IA32_THERM_INTERRUPT,
-                       l | (THERM_INT_LOW_ENABLE
-                       | THERM_INT_HIGH_ENABLE | THERM_INT_PLN_ENABLE), h);
-       else
-               wrmsr(MSR_IA32_THERM_INTERRUPT,
-                     l | (THERM_INT_LOW_ENABLE | THERM_INT_HIGH_ENABLE), h);
-
-       if (cpu_has(c, X86_FEATURE_PTS)) {
-               rdmsr(MSR_IA32_PACKAGE_THERM_INTERRUPT, l, h);
-               if (cpu_has(c, X86_FEATURE_PLN) && !int_pln_enable)
-                       wrmsr(MSR_IA32_PACKAGE_THERM_INTERRUPT,
-                               (l | (PACKAGE_THERM_INT_LOW_ENABLE
-                               | PACKAGE_THERM_INT_HIGH_ENABLE))
-                               & ~PACKAGE_THERM_INT_PLN_ENABLE, h);
-               else if (cpu_has(c, X86_FEATURE_PLN) && int_pln_enable)
-                       wrmsr(MSR_IA32_PACKAGE_THERM_INTERRUPT,
-                               l | (PACKAGE_THERM_INT_LOW_ENABLE
-                               | PACKAGE_THERM_INT_HIGH_ENABLE
-                               | PACKAGE_THERM_INT_PLN_ENABLE), h);
-               else
-                       wrmsr(MSR_IA32_PACKAGE_THERM_INTERRUPT,
-                             l | (PACKAGE_THERM_INT_LOW_ENABLE
-                               | PACKAGE_THERM_INT_HIGH_ENABLE), h);
-       }
-
-       smp_thermal_vector = intel_thermal_interrupt;
-
-       rdmsr(MSR_IA32_MISC_ENABLE, l, h);
-       wrmsr(MSR_IA32_MISC_ENABLE, l | MSR_IA32_MISC_ENABLE_TM1, h);
-
-       /* Unmask the thermal vector: */
-       l = apic_read(APIC_LVTTHMR);
-       apic_write(APIC_LVTTHMR, l & ~APIC_LVT_MASKED);
-
-       pr_info_once("CPU0: Thermal monitoring enabled (%s)\n",
-                     tm2 ? "TM2" : "TM1");
-
-       /* enable thermal throttle processing */
-       atomic_set(&therm_throt_en, 1);
-}
diff --git a/arch/x86/kernel/cpu/mcheck/threshold.c b/arch/x86/kernel/cpu/mcheck/threshold.c
deleted file mode 100644 (file)
index 2b584b3..0000000
+++ /dev/null
@@ -1,29 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * Common corrected MCE threshold handler code:
- */
-#include <linux/interrupt.h>
-#include <linux/kernel.h>
-
-#include <asm/irq_vectors.h>
-#include <asm/apic.h>
-#include <asm/mce.h>
-#include <asm/trace/irq_vectors.h>
-
-static void default_threshold_interrupt(void)
-{
-       pr_err("Unexpected threshold interrupt at vector %x\n",
-               THRESHOLD_APIC_VECTOR);
-}
-
-void (*mce_threshold_vector)(void) = default_threshold_interrupt;
-
-asmlinkage __visible void __irq_entry smp_threshold_interrupt(void)
-{
-       entering_irq();
-       trace_threshold_apic_entry(THRESHOLD_APIC_VECTOR);
-       inc_irq_stat(irq_threshold_count);
-       mce_threshold_vector();
-       trace_threshold_apic_exit(THRESHOLD_APIC_VECTOR);
-       exiting_ack_irq();
-}
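threshold.c above only provides the default handler and the mce_threshold_vector hook; the vendor code overrides it at init time, as in these excerpts from the AMD and Intel files earlier in this patch:

	/* AMD, in threshold_init_device(): */
	if (thresholding_irq_en)
		mce_threshold_vector = amd_threshold_interrupt;

	/* Intel, in intel_init_cmci(): */
	mce_threshold_vector = intel_threshold_interrupt;
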
diff --git a/arch/x86/kernel/cpu/mcheck/winchip.c b/arch/x86/kernel/cpu/mcheck/winchip.c
deleted file mode 100644 (file)
index 3b45b27..0000000
+++ /dev/null
@@ -1,44 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * IDT Winchip specific Machine Check Exception Reporting
- * (C) Copyright 2002 Alan Cox <alan@lxorguk.ukuu.org.uk>
- */
-#include <linux/interrupt.h>
-#include <linux/kernel.h>
-#include <linux/types.h>
-
-#include <asm/processor.h>
-#include <asm/traps.h>
-#include <asm/tlbflush.h>
-#include <asm/mce.h>
-#include <asm/msr.h>
-
-/* Machine check handler for WinChip C6: */
-static void winchip_machine_check(struct pt_regs *regs, long error_code)
-{
-       ist_enter(regs);
-
-       pr_emerg("CPU0: Machine Check Exception.\n");
-       add_taint(TAINT_MACHINE_CHECK, LOCKDEP_NOW_UNRELIABLE);
-
-       ist_exit(regs);
-}
-
-/* Set up machine check reporting on the Winchip C6 series */
-void winchip_mcheck_init(struct cpuinfo_x86 *c)
-{
-       u32 lo, hi;
-
-       machine_check_vector = winchip_machine_check;
-       /* Make sure the vector pointer is visible before we enable MCEs: */
-       wmb();
-
-       rdmsr(MSR_IDT_FCR1, lo, hi);
-       lo |= (1<<2);   /* Enable EIERRINT (int 18 MCE) */
-       lo &= ~(1<<4);  /* Enable MCE */
-       wrmsr(MSR_IDT_FCR1, lo, hi);
-
-       cr4_set_bits(X86_CR4_MCE);
-
-       pr_info("Winchip machine check reporting enabled on CPU#0.\n");
-}