sb_edac: Fix support for systems with two home agents per socket
author Tony Luck <tony.luck@intel.com>
Mon, 18 May 2015 20:50:42 +0000 (17:50 -0300)
committer Mauro Carvalho Chehab <mchehab@osg.samsung.com>
Wed, 3 Jun 2015 13:10:52 +0000 (10:10 -0300)
First noticed a problem on a four-socket machine where EDAC only reported
half the DIMMs.  Tracked this down to the code that assumes that systems
with two home agents only have two memory channels on each agent. This
is true on two-socket ("-EP") machines, but four-socket ("-EX") machines
have four memory channels on each home agent.

The old code would have had problems on two socket systems as it did
a shuffling trick to make the internals of the code think that the
channels from the first agent were '0' and '1', with the second agent
providing '2' and '3'. But the code didn't uniformly convert from
{ha,channel} tuples to this internal representation.

New code always considers up to eight channels.
On a machine with a single home agent these map easily to edac channels
0, 1, 2, 3. On machines with two home agents we map using:
  edac_channel = 4*ha# + channel
So on a -EP machine where each home agent supports only two channels
we'll fill in channels 0, 1, 4, 5, and on a -EX machine we use all of 0,
1, 2, 3, 4, 5, 6, 7.

[mchehab@osg.samsung.com: folded a fixup patch as per Tony's request and fixed
 a few CodingStyle issues]
Signed-off-by: Tony Luck <tony.luck@intel.com>
Acked-by: Aristeu Rozanski <aris@redhat.com>
Signed-off-by: Mauro Carvalho Chehab <mchehab@osg.samsung.com>
drivers/edac/sb_edac.c

index b9648323106b87b8de9242ca2957d248a57c82e5..be961d4ed4d150c97ef5a1bd974f7d22eca481d6 100644 (file)
@@ -34,7 +34,7 @@ static int probed;
 /*
  * Alter this version for the module when modifications are made
  */
-#define SBRIDGE_REVISION    " Ver: 1.1.0 "
+#define SBRIDGE_REVISION    " Ver: 1.1.1 "
 #define EDAC_MOD_STR      "sbridge_edac"
 
 /*
@@ -254,7 +254,7 @@ static const u32 correrrthrsld[] = {
  * sbridge structs
  */
 
-#define NUM_CHANNELS           4
+#define NUM_CHANNELS           8       /* 2MC per socket, four chan per MC */
 #define MAX_DIMMS              3       /* Max DIMMS per channel */
 #define CHANNEL_UNSPECIFIED    0xf     /* Intel IA32 SDM 15-14 */
 
@@ -393,6 +393,8 @@ static const struct pci_id_table pci_dev_descr_sbridge_table[] = {
 #define PCI_DEVICE_ID_INTEL_IBRIDGE_IMC_HA1_RAS                0x0e79
 #define PCI_DEVICE_ID_INTEL_IBRIDGE_IMC_HA1_TAD0       0x0e6a
 #define PCI_DEVICE_ID_INTEL_IBRIDGE_IMC_HA1_TAD1       0x0e6b
+#define PCI_DEVICE_ID_INTEL_IBRIDGE_IMC_HA1_TAD2       0x0e6c
+#define PCI_DEVICE_ID_INTEL_IBRIDGE_IMC_HA1_TAD3       0x0e6d
 
 static const struct pci_id_descr pci_dev_descr_ibridge[] = {
                /* Processor Home Agent */
@@ -421,6 +423,8 @@ static const struct pci_id_descr pci_dev_descr_ibridge[] = {
 #endif
        { PCI_DESCR(PCI_DEVICE_ID_INTEL_IBRIDGE_IMC_HA1_TAD0, 1)        },
        { PCI_DESCR(PCI_DEVICE_ID_INTEL_IBRIDGE_IMC_HA1_TAD1, 1)        },
+       { PCI_DESCR(PCI_DEVICE_ID_INTEL_IBRIDGE_IMC_HA1_TAD2, 1)        },
+       { PCI_DESCR(PCI_DEVICE_ID_INTEL_IBRIDGE_IMC_HA1_TAD3, 1)        },
 
        { PCI_DESCR(PCI_DEVICE_ID_INTEL_IBRIDGE_IMC_1HA_DDRIO0, 1)      },
        { PCI_DESCR(PCI_DEVICE_ID_INTEL_IBRIDGE_IMC_2HA_DDRIO0, 1)      },
@@ -909,6 +913,8 @@ static int get_dimm_config(struct mem_ctl_info *mci)
        for (i = 0; i < NUM_CHANNELS; i++) {
                u32 mtr;
 
+               if (!pvt->pci_tad[i])
+                       continue;
                for (j = 0; j < ARRAY_SIZE(mtr_regs); j++) {
                        dimm = EDAC_DIMM_PTR(mci->layers, mci->dimms, mci->n_layers,
                                       i, j, 0);
@@ -925,8 +931,8 @@ static int get_dimm_config(struct mem_ctl_info *mci)
                                size = ((u64)rows * cols * banks * ranks) >> (20 - 3);
                                npages = MiB_TO_PAGES(size);
 
-                               edac_dbg(0, "mc#%d: channel %d, dimm %d, %Ld Mb (%d pages) bank: %d, rank: %d, row: %#x, col: %#x\n",
-                                        pvt->sbridge_dev->mc, i, j,
+                               edac_dbg(0, "mc#%d: ha %d channel %d, dimm %d, %lld Mb (%d pages) bank: %d, rank: %d, row: %#x, col: %#x\n",
+                                        pvt->sbridge_dev->mc, i/4, i%4, j,
                                         size, npages,
                                         banks, ranks, rows, cols);
 
@@ -946,8 +952,8 @@ static int get_dimm_config(struct mem_ctl_info *mci)
                                dimm->mtype = mtype;
                                dimm->edac_mode = mode;
                                snprintf(dimm->label, sizeof(dimm->label),
-                                        "CPU_SrcID#%u_Channel#%u_DIMM#%u",
-                                        pvt->sbridge_dev->source_id, i, j);
+                                        "CPU_SrcID#%u_Ha#%u_Chan#%u_DIMM#%u",
+                                        pvt->sbridge_dev->source_id, i/4, i%4, j);
                        }
                }
        }
@@ -1128,7 +1134,7 @@ static struct mem_ctl_info *get_mci_for_node_id(u8 node_id)
 
 static int get_memory_error_data(struct mem_ctl_info *mci,
                                 u64 addr,
-                                u8 *socket,
+                                u8 *socket, u8 *ha,
                                 long *channel_mask,
                                 u8 *rank,
                                 char **area_type, char *msg)
@@ -1141,7 +1147,7 @@ static int get_memory_error_data(struct mem_ctl_info *mci,
        int                     interleave_mode, shiftup = 0;
        unsigned                sad_interleave[pvt->info.max_interleave];
        u32                     reg, dram_rule;
-       u8                      ch_way, sck_way, pkg, sad_ha = 0;
+       u8                      ch_way, sck_way, pkg, sad_ha = 0, ch_add = 0;
        u32                     tad_offset;
        u32                     rir_way;
        u32                     mb, gb;
@@ -1254,6 +1260,8 @@ static int get_memory_error_data(struct mem_ctl_info *mci,
                pkg = sad_pkg(pvt->info.interleave_pkg, reg, idx);
                *socket = sad_pkg_socket(pkg);
                sad_ha = sad_pkg_ha(pkg);
+               if (sad_ha)
+                       ch_add = 4;
 
                if (a7mode) {
                        /* MCChanShiftUpEnable */
@@ -1270,10 +1278,14 @@ static int get_memory_error_data(struct mem_ctl_info *mci,
                pkg = sad_pkg(pvt->info.interleave_pkg, reg, idx);
                *socket = sad_pkg_socket(pkg);
                sad_ha = sad_pkg_ha(pkg);
+               if (sad_ha)
+                       ch_add = 4;
                edac_dbg(0, "SAD interleave package: %d = CPU socket %d, HA %d\n",
                         idx, *socket, sad_ha);
        }
 
+       *ha = sad_ha;
+
        /*
         * Move to the proper node structure, in order to access the
         * right PCI registers
@@ -1346,7 +1358,7 @@ static int get_memory_error_data(struct mem_ctl_info *mci,
        }
        *channel_mask = 1 << base_ch;
 
-       pci_read_config_dword(pvt->pci_tad[base_ch],
+       pci_read_config_dword(pvt->pci_tad[ch_add + base_ch],
                                tad_ch_nilv_offset[n_tads],
                                &tad_offset);
 
@@ -1405,7 +1417,7 @@ static int get_memory_error_data(struct mem_ctl_info *mci,
         * Step 3) Decode rank
         */
        for (n_rir = 0; n_rir < MAX_RIR_RANGES; n_rir++) {
-               pci_read_config_dword(pvt->pci_tad[base_ch],
+               pci_read_config_dword(pvt->pci_tad[ch_add + base_ch],
                                      rir_way_limit[n_rir],
                                      &reg);
 
@@ -1435,7 +1447,7 @@ static int get_memory_error_data(struct mem_ctl_info *mci,
                idx = (ch_addr >> 13);  /* FIXME: Datasheet says to shift by 15 */
        idx %= 1 << rir_way;
 
-       pci_read_config_dword(pvt->pci_tad[base_ch],
+       pci_read_config_dword(pvt->pci_tad[ch_add + base_ch],
                              rir_offset[n_rir][idx],
                              &reg);
        *rank = RIR_RNK_TGT(reg);
@@ -1681,16 +1693,9 @@ static int ibridge_mci_bind_devs(struct mem_ctl_info *mci,
                                 struct sbridge_dev *sbridge_dev)
 {
        struct sbridge_pvt *pvt = mci->pvt_info;
-       struct pci_dev *pdev, *tmp;
+       struct pci_dev *pdev;
+       u8 saw_chan_mask = 0;
        int i;
-       bool mode_2ha = false;
-
-       tmp = pci_get_device(PCI_VENDOR_ID_INTEL,
-                            PCI_DEVICE_ID_INTEL_IBRIDGE_IMC_HA1, NULL);
-       if (tmp) {
-               mode_2ha = true;
-               pci_dev_put(tmp);
-       }
 
        for (i = 0; i < sbridge_dev->n_devs; i++) {
                pdev = sbridge_dev->pdev[i];
@@ -1706,26 +1711,21 @@ static int ibridge_mci_bind_devs(struct mem_ctl_info *mci,
                case PCI_DEVICE_ID_INTEL_IBRIDGE_IMC_HA0_RAS:
                        pvt->pci_ras = pdev;
                        break;
-               case PCI_DEVICE_ID_INTEL_IBRIDGE_IMC_HA0_TAD2:
-               case PCI_DEVICE_ID_INTEL_IBRIDGE_IMC_HA0_TAD3:
-                       /* if we have 2 HAs active, channels 2 and 3
-                        * are in other device */
-                       if (mode_2ha)
-                               break;
-                       /* fall through */
                case PCI_DEVICE_ID_INTEL_IBRIDGE_IMC_HA0_TAD0:
                case PCI_DEVICE_ID_INTEL_IBRIDGE_IMC_HA0_TAD1:
+               case PCI_DEVICE_ID_INTEL_IBRIDGE_IMC_HA0_TAD2:
+               case PCI_DEVICE_ID_INTEL_IBRIDGE_IMC_HA0_TAD3:
                {
                        int id = pdev->device - PCI_DEVICE_ID_INTEL_IBRIDGE_IMC_HA0_TAD0;
                        pvt->pci_tad[id] = pdev;
+                       saw_chan_mask |= 1 << id;
                }
                        break;
                case PCI_DEVICE_ID_INTEL_IBRIDGE_IMC_2HA_DDRIO0:
                        pvt->pci_ddrio = pdev;
                        break;
                case PCI_DEVICE_ID_INTEL_IBRIDGE_IMC_1HA_DDRIO0:
-                       if (!mode_2ha)
-                               pvt->pci_ddrio = pdev;
+                       pvt->pci_ddrio = pdev;
                        break;
                case PCI_DEVICE_ID_INTEL_IBRIDGE_SAD:
                        pvt->pci_sad0 = pdev;
@@ -1741,13 +1741,12 @@ static int ibridge_mci_bind_devs(struct mem_ctl_info *mci,
                        break;
                case PCI_DEVICE_ID_INTEL_IBRIDGE_IMC_HA1_TAD0:
                case PCI_DEVICE_ID_INTEL_IBRIDGE_IMC_HA1_TAD1:
+               case PCI_DEVICE_ID_INTEL_IBRIDGE_IMC_HA1_TAD2:
+               case PCI_DEVICE_ID_INTEL_IBRIDGE_IMC_HA1_TAD3:
                {
-                       int id = pdev->device - PCI_DEVICE_ID_INTEL_IBRIDGE_IMC_HA1_TAD0 + 2;
-
-                       /* we shouldn't have this device if we have just one
-                        * HA present */
-                       WARN_ON(!mode_2ha);
+                       int id = pdev->device - PCI_DEVICE_ID_INTEL_IBRIDGE_IMC_HA1_TAD0 + 4;
                        pvt->pci_tad[id] = pdev;
+                       saw_chan_mask |= 1 << id;
                }
                        break;
                default:
@@ -1766,10 +1765,10 @@ static int ibridge_mci_bind_devs(struct mem_ctl_info *mci,
            !pvt->pci_ta)
                goto enodev;
 
-       for (i = 0; i < NUM_CHANNELS; i++) {
-               if (!pvt->pci_tad[i])
-                       goto enodev;
-       }
+       if (saw_chan_mask != 0x0f && /* -EN */
+           saw_chan_mask != 0x33 && /* -EP */
+           saw_chan_mask != 0xff)   /* -EX */
+               goto enodev;
        return 0;
 
 enodev:
@@ -1787,16 +1786,9 @@ static int haswell_mci_bind_devs(struct mem_ctl_info *mci,
                                 struct sbridge_dev *sbridge_dev)
 {
        struct sbridge_pvt *pvt = mci->pvt_info;
-       struct pci_dev *pdev, *tmp;
+       struct pci_dev *pdev;
+       u8 saw_chan_mask = 0;
        int i;
-       bool mode_2ha = false;
-
-       tmp = pci_get_device(PCI_VENDOR_ID_INTEL,
-                            PCI_DEVICE_ID_INTEL_HASWELL_IMC_HA1, NULL);
-       if (tmp) {
-               mode_2ha = true;
-               pci_dev_put(tmp);
-       }
 
        /* there's only one device per system; not tied to any bus */
        if (pvt->info.pci_vtd == NULL)
@@ -1827,18 +1819,26 @@ static int haswell_mci_bind_devs(struct mem_ctl_info *mci,
                        pvt->pci_ras = pdev;
                        break;
                case PCI_DEVICE_ID_INTEL_HASWELL_IMC_HA0_TAD0:
-                       pvt->pci_tad[0] = pdev;
-                       break;
                case PCI_DEVICE_ID_INTEL_HASWELL_IMC_HA0_TAD1:
-                       pvt->pci_tad[1] = pdev;
-                       break;
                case PCI_DEVICE_ID_INTEL_HASWELL_IMC_HA0_TAD2:
-                       if (!mode_2ha)
-                               pvt->pci_tad[2] = pdev;
-                       break;
                case PCI_DEVICE_ID_INTEL_HASWELL_IMC_HA0_TAD3:
-                       if (!mode_2ha)
-                               pvt->pci_tad[3] = pdev;
+               {
+                       int id = pdev->device - PCI_DEVICE_ID_INTEL_HASWELL_IMC_HA0_TAD0;
+
+                       pvt->pci_tad[id] = pdev;
+                       saw_chan_mask |= 1 << id;
+               }
+                       break;
+               case PCI_DEVICE_ID_INTEL_HASWELL_IMC_HA1_TAD0:
+               case PCI_DEVICE_ID_INTEL_HASWELL_IMC_HA1_TAD1:
+               case PCI_DEVICE_ID_INTEL_HASWELL_IMC_HA1_TAD2:
+               case PCI_DEVICE_ID_INTEL_HASWELL_IMC_HA1_TAD3:
+               {
+                       int id = pdev->device - PCI_DEVICE_ID_INTEL_HASWELL_IMC_HA1_TAD0 + 4;
+
+                       pvt->pci_tad[id] = pdev;
+                       saw_chan_mask |= 1 << id;
+               }
                        break;
                case PCI_DEVICE_ID_INTEL_HASWELL_IMC_DDRIO0:
                        pvt->pci_ddrio = pdev;
@@ -1849,14 +1849,6 @@ static int haswell_mci_bind_devs(struct mem_ctl_info *mci,
                case PCI_DEVICE_ID_INTEL_HASWELL_IMC_HA1_TA:
                        pvt->pci_ha1_ta = pdev;
                        break;
-               case PCI_DEVICE_ID_INTEL_HASWELL_IMC_HA1_TAD0:
-                       if (mode_2ha)
-                               pvt->pci_tad[2] = pdev;
-                       break;
-               case PCI_DEVICE_ID_INTEL_HASWELL_IMC_HA1_TAD1:
-                       if (mode_2ha)
-                               pvt->pci_tad[3] = pdev;
-                       break;
                default:
                        break;
                }
@@ -1872,10 +1864,10 @@ static int haswell_mci_bind_devs(struct mem_ctl_info *mci,
            !pvt->pci_ras  || !pvt->pci_ta || !pvt->info.pci_vtd)
                goto enodev;
 
-       for (i = 0; i < NUM_CHANNELS; i++) {
-               if (!pvt->pci_tad[i])
-                       goto enodev;
-       }
+       if (saw_chan_mask != 0x0f && /* -EN */
+           saw_chan_mask != 0x33 && /* -EP */
+           saw_chan_mask != 0xff)   /* -EX */
+               goto enodev;
        return 0;
 
 enodev:
@@ -1986,7 +1978,7 @@ static void sbridge_mce_output_error(struct mem_ctl_info *mci,
        u32 channel = GET_BITFIELD(m->status, 0, 3);
        u32 optypenum = GET_BITFIELD(m->status, 4, 6);
        long channel_mask, first_channel;
-       u8  rank, socket;
+       u8  rank, socket, ha;
        int rc, dimm;
        char *area_type = NULL;
 
@@ -2048,7 +2040,7 @@ static void sbridge_mce_output_error(struct mem_ctl_info *mci,
        if (!GET_BITFIELD(m->status, 58, 58))
                return;
 
-       rc = get_memory_error_data(mci, m->addr, &socket,
+       rc = get_memory_error_data(mci, m->addr, &socket, &ha,
                                   &channel_mask, &rank, &area_type, msg);
        if (rc < 0)
                goto err_parsing;
@@ -2080,12 +2072,12 @@ static void sbridge_mce_output_error(struct mem_ctl_info *mci,
                channel = first_channel;
 
        snprintf(msg, sizeof(msg),
-                "%s%s area:%s err_code:%04x:%04x socket:%d channel_mask:%ld rank:%d",
+                "%s%s area:%s err_code:%04x:%04x socket:%d ha:%d channel_mask:%ld rank:%d",
                 overflow ? " OVERFLOW" : "",
                 (uncorrected_error && recoverable) ? " recoverable" : "",
                 area_type,
                 mscod, errcode,
-                socket,
+                socket, ha,
                 channel_mask,
                 rank);
 
@@ -2099,7 +2091,7 @@ static void sbridge_mce_output_error(struct mem_ctl_info *mci,
        /* Call the helper to output message */
        edac_mc_handle_error(tp_event, mci, core_err_cnt,
                             m->addr >> PAGE_SHIFT, m->addr & ~PAGE_MASK, 0,
-                            channel, dimm, -1,
+                            4*ha+channel, dimm, -1,
                             optype, msg);
        return;
 err_parsing: