x86/MCE/AMD: Add support for new MCA_SYND{1,2} registers
author Avadhut Naik <avadhut.naik@amd.com>
Tue, 22 Oct 2024 19:36:29 +0000 (19:36 +0000)
committer Borislav Petkov (AMD) <bp@alien8.de>
Thu, 31 Oct 2024 09:36:07 +0000 (10:36 +0100)
Starting with Zen4, AMD's Scalable MCA systems incorporate two new registers:
MCA_SYND1 and MCA_SYND2.

These registers will include supplemental error information in addition to the
existing MCA_SYND register. The data within these registers is considered
valid if MCA_STATUS[SyndV] is set.

Userspace error decoding tools like rasdaemon gather related hardware error
information through the tracepoints.

Therefore, export these two registers through the mce_record tracepoint so
that tools like rasdaemon can parse them and output the supplemental error
information like FRU text contained in them.

  [ bp: Massage. ]

Signed-off-by: Yazen Ghannam <yazen.ghannam@amd.com>
Signed-off-by: Avadhut Naik <avadhut.naik@amd.com>
Signed-off-by: Borislav Petkov (AMD) <bp@alien8.de>
Reviewed-by: Qiuxu Zhuo <qiuxu.zhuo@intel.com>
Link: https://lore.kernel.org/r/20241022194158.110073-4-avadhut.naik@amd.com
arch/x86/include/asm/mce.h
arch/x86/include/uapi/asm/mce.h
arch/x86/kernel/cpu/mce/amd.c
arch/x86/kernel/cpu/mce/core.c
drivers/edac/mce_amd.c
include/trace/events/mce.h

index 4e45f45673a3306a3d3544d69b32d84ba494c92e..4d936ee20e24eb32f4fedc4404b2ede1637f73ec 100644 (file)
 #define MSR_AMD64_SMCA_MC0_DESTAT      0xc0002008
 #define MSR_AMD64_SMCA_MC0_DEADDR      0xc0002009
 #define MSR_AMD64_SMCA_MC0_MISC1       0xc000200a
+/* Registers MISC2 to MISC4 are at offsets B to D. */
+#define MSR_AMD64_SMCA_MC0_SYND1       0xc000200e
+#define MSR_AMD64_SMCA_MC0_SYND2       0xc000200f
 #define MSR_AMD64_SMCA_MCx_CTL(x)      (MSR_AMD64_SMCA_MC0_CTL + 0x10*(x))
 #define MSR_AMD64_SMCA_MCx_STATUS(x)   (MSR_AMD64_SMCA_MC0_STATUS + 0x10*(x))
 #define MSR_AMD64_SMCA_MCx_ADDR(x)     (MSR_AMD64_SMCA_MC0_ADDR + 0x10*(x))
 #define MSR_AMD64_SMCA_MCx_DESTAT(x)   (MSR_AMD64_SMCA_MC0_DESTAT + 0x10*(x))
 #define MSR_AMD64_SMCA_MCx_DEADDR(x)   (MSR_AMD64_SMCA_MC0_DEADDR + 0x10*(x))
 #define MSR_AMD64_SMCA_MCx_MISCy(x, y) ((MSR_AMD64_SMCA_MC0_MISC1 + y) + (0x10*(x)))
+#define MSR_AMD64_SMCA_MCx_SYND1(x)    (MSR_AMD64_SMCA_MC0_SYND1 + 0x10*(x))
+#define MSR_AMD64_SMCA_MCx_SYND2(x)    (MSR_AMD64_SMCA_MC0_SYND2 + 0x10*(x))
 
 #define XEC(x, mask)                   (((x) >> 16) & mask)
 
@@ -190,9 +195,25 @@ enum mce_notifier_prios {
 /**
  * struct mce_hw_err - Hardware Error Record.
  * @m:         Machine Check record.
+ * @vendor:    Vendor-specific error information.
+ *
+ * Vendor-specific fields should not be added to struct mce. Instead, vendors
+ * should export their vendor-specific data through their structure in the
+ * vendor union below.
+ *
+ * AMD's vendor data is parsed by error decoding tools for supplemental error
+ * information. Thus, current offsets of existing fields must be maintained.
+ * Only add new fields at the end of AMD's vendor structure.
  */
 struct mce_hw_err {
        struct mce m;
+
+       union vendor_info {
+               struct {
+                       u64 synd1;              /* MCA_SYND1 MSR */
+                       u64 synd2;              /* MCA_SYND2 MSR */
+               } amd;
+       } vendor;
 };
 
 #define        to_mce_hw_err(mce) container_of(mce, struct mce_hw_err, m)
index db9adc081c5af37766483d0f9fe64b33fee211e1..cb6b48a7c22b41c643b88d7b3e8fb773066ccc76 100644 (file)
@@ -8,7 +8,8 @@
 /*
  * Fields are zero when not available. Also, this struct is shared with
  * userspace mcelog and thus must keep existing fields at current offsets.
- * Only add new fields to the end of the structure
+ * Only add new, shared fields to the end of the structure.
+ * Do not add vendor-specific fields.
  */
 struct mce {
        __u64 status;           /* Bank's MCi_STATUS MSR */
index 5b4d266500b22fb32a7ff166be72425ae519decf..6ca80fff1fea946df3310c8c473961cba4bea750 100644 (file)
@@ -797,8 +797,11 @@ static void __log_error(unsigned int bank, u64 status, u64 addr, u64 misc)
        if (mce_flags.smca) {
                rdmsrl(MSR_AMD64_SMCA_MCx_IPID(bank), m->ipid);
 
-               if (m->status & MCI_STATUS_SYNDV)
+               if (m->status & MCI_STATUS_SYNDV) {
                        rdmsrl(MSR_AMD64_SMCA_MCx_SYND(bank), m->synd);
+                       rdmsrl(MSR_AMD64_SMCA_MCx_SYND1(bank), err.vendor.amd.synd1);
+                       rdmsrl(MSR_AMD64_SMCA_MCx_SYND2(bank), err.vendor.amd.synd2);
+               }
        }
 
        mce_log(&err);
index 28e28b69d84d8c5d51500296ad4da0d0b5971d64..7fb5556a0b53d0ad8827b10ebdab6e39529494b8 100644 (file)
@@ -202,6 +202,10 @@ static void __print_mce(struct mce_hw_err *err)
        if (mce_flags.smca) {
                if (m->synd)
                        pr_cont("SYND %llx ", m->synd);
+               if (err->vendor.amd.synd1)
+                       pr_cont("SYND1 %llx ", err->vendor.amd.synd1);
+               if (err->vendor.amd.synd2)
+                       pr_cont("SYND2 %llx ", err->vendor.amd.synd2);
                if (m->ipid)
                        pr_cont("IPID %llx ", m->ipid);
        }
@@ -678,8 +682,11 @@ static noinstr void mce_read_aux(struct mce_hw_err *err, int i)
        if (mce_flags.smca) {
                m->ipid = mce_rdmsrl(MSR_AMD64_SMCA_MCx_IPID(i));
 
-               if (m->status & MCI_STATUS_SYNDV)
+               if (m->status & MCI_STATUS_SYNDV) {
                        m->synd = mce_rdmsrl(MSR_AMD64_SMCA_MCx_SYND(i));
+                       err->vendor.amd.synd1 = mce_rdmsrl(MSR_AMD64_SMCA_MCx_SYND1(i));
+                       err->vendor.amd.synd2 = mce_rdmsrl(MSR_AMD64_SMCA_MCx_SYND2(i));
+               }
        }
 }
 
index 8130c3dc64da56470ac60015897787a12005c5fc..194d9fd47d2073db9efd01aaad9f785d3be91523 100644 (file)
@@ -793,6 +793,7 @@ static int
 amd_decode_mce(struct notifier_block *nb, unsigned long val, void *data)
 {
        struct mce *m = (struct mce *)data;
+       struct mce_hw_err *err = to_mce_hw_err(m);
        unsigned int fam = x86_family(m->cpuid);
        int ecc;
 
@@ -850,8 +851,11 @@ amd_decode_mce(struct notifier_block *nb, unsigned long val, void *data)
        if (boot_cpu_has(X86_FEATURE_SMCA)) {
                pr_emerg(HW_ERR "IPID: 0x%016llx", m->ipid);
 
-               if (m->status & MCI_STATUS_SYNDV)
-                       pr_cont(", Syndrome: 0x%016llx", m->synd);
+               if (m->status & MCI_STATUS_SYNDV) {
+                       pr_cont(", Syndrome: 0x%016llx\n", m->synd);
+                       pr_emerg(HW_ERR "Syndrome1: 0x%016llx, Syndrome2: 0x%016llx",
+                                err->vendor.amd.synd1, err->vendor.amd.synd2);
+               }
 
                pr_cont("\n");
 
index 65aba1afcd070acd70b0eb7f809ddbbf53eacd4a..c1c50df9ecfd32496b372666e79a5dd519491e88 100644 (file)
@@ -43,6 +43,7 @@ TRACE_EVENT(mce_record,
                __field(        u8,             bank            )
                __field(        u8,             cpuvendor       )
                __field(        u32,            microcode       )
+               __dynamic_array(u8, v_data, sizeof(err->vendor))
        ),
 
        TP_fast_assign(
@@ -65,9 +66,10 @@ TRACE_EVENT(mce_record,
                __entry->bank           = err->m.bank;
                __entry->cpuvendor      = err->m.cpuvendor;
                __entry->microcode      = err->m.microcode;
+               memcpy(__get_dynamic_array(v_data), &err->vendor, sizeof(err->vendor));
        ),
 
-       TP_printk("CPU: %d, MCGc/s: %llx/%llx, MC%d: %016Lx, IPID: %016Lx, ADDR: %016Lx, MISC: %016Lx, SYND: %016Lx, RIP: %02x:<%016Lx>, TSC: %llx, PPIN: %llx, vendor: %u, CPUID: %x, time: %llu, socket: %u, APIC: %x, microcode: %x",
+       TP_printk("CPU: %d, MCGc/s: %llx/%llx, MC%d: %016llx, IPID: %016llx, ADDR: %016llx, MISC: %016llx, SYND: %016llx, RIP: %02x:<%016llx>, TSC: %llx, PPIN: %llx, vendor: %u, CPUID: %x, time: %llu, socket: %u, APIC: %x, microcode: %x, vendor data: %s",
                __entry->cpu,
                __entry->mcgcap, __entry->mcgstatus,
                __entry->bank, __entry->status,
@@ -83,7 +85,8 @@ TRACE_EVENT(mce_record,
                __entry->walltime,
                __entry->socketid,
                __entry->apicid,
-               __entry->microcode)
+               __entry->microcode,
+               __print_dynamic_array(v_data, sizeof(u8)))
 );
 
 #endif /* _TRACE_MCE_H */