drm/amdgpu: update algorithm of umc address conversion
author Tao Zhou <tao.zhou1@amd.com>
Wed, 19 Jan 2022 09:00:09 +0000 (17:00 +0800)
committer Alex Deucher <alexander.deucher@amd.com>
Thu, 27 Jan 2022 20:49:08 +0000 (15:49 -0500)
On ALDEBARAN, we need to traverse all column bits higher than
BIT11 (C4 C3 C2) in a row; the shift of the R14 bit should also be
taken into account. Retire all pages we find.

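For illustration only, a minimal stand-alone sketch of the page
enumeration this patch performs is shown below. It is not the kernel
code itself: the bit positions mirror the new UMC_V6_7_PA_C2_BIT and
UMC_V6_7_PA_R14_BIT defines, and the sample address in main() is
purely hypothetical.

#include <stdint.h>
#include <stdio.h>

#define PA_C2_BIT     17  /* lowest of the [C4 C3 C2] column bits */
#define PA_R14_BIT    34  /* row bit whose shift must also be covered */
#define NA_MAP_PA_NUM 8   /* 2^3 combinations of [C4 C3 C2] */

static void enumerate_retired_pages(uint64_t soc_pa)
{
	uint64_t retired_page, column;

	/* clear [C4 C3 C2] so every combination can be set below */
	soc_pa &= ~(0x7ULL << PA_C2_BIT);

	for (column = 0; column < NA_MAP_PA_NUM; column++) {
		/* page with this column pattern and R14 as reported */
		retired_page = soc_pa | (column << PA_C2_BIT);
		printf("retire 0x%llx\n", (unsigned long long)retired_page);

		/* same column pattern with R14 flipped */
		retired_page ^= 0x1ULL << PA_R14_BIT;
		printf("retire 0x%llx\n", (unsigned long long)retired_page);
	}
}

int main(void)
{
	/* hypothetical SOC physical address, for illustration only */
	enumerate_retired_pages(0x40002a000ULL);
	return 0;
}
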
Signed-off-by: Tao Zhou <tao.zhou1@amd.com>
Reviewed-by: Hawking Zhang <Hawking.Zhang@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
drivers/gpu/drm/amd/amdgpu/umc_v6_7.c
drivers/gpu/drm/amd/amdgpu/umc_v6_7.h

index 300dee9ec6b4ee21cb9a49a4fc2c4b38f485db1e..1ecba7b5df1c23c2ca508ed467ce62ade1ffd88c 100644 (file)
@@ -119,7 +119,7 @@ static void umc_v6_7_ecc_info_query_error_address(struct amdgpu_device *adev,
                                         uint32_t ch_inst,
                                         uint32_t umc_inst)
 {
-       uint64_t mc_umc_status, err_addr, retired_page;
+       uint64_t mc_umc_status, err_addr, soc_pa, retired_page, column;
        uint32_t channel_index;
        uint32_t eccinfo_table_idx;
        struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
@@ -145,15 +145,27 @@ static void umc_v6_7_ecc_info_query_error_address(struct amdgpu_device *adev,
                err_addr = REG_GET_FIELD(err_addr, MCA_UMC_UMC0_MCUMC_ADDRT0, ErrorAddr);
 
                /* translate umc channel address to soc pa, 3 parts are included */
-               retired_page = ADDR_OF_8KB_BLOCK(err_addr) |
+               soc_pa = ADDR_OF_8KB_BLOCK(err_addr) |
                                ADDR_OF_256B_BLOCK(channel_index) |
                                OFFSET_IN_256B_BLOCK(err_addr);
+               /* clear [C4 C3 C2] in soc physical address */
+               soc_pa &= ~(0x7ULL << UMC_V6_7_PA_C2_BIT);
 
                /* we only save ue error information currently, ce is skipped */
                if (REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, UECC)
-                               == 1)
-                       amdgpu_umc_fill_error_record(err_data, err_addr,
+                               == 1) {
+                       /* loop for all possibilities of [C4 C3 C2] */
+                       for (column = 0; column < UMC_V6_7_NA_MAP_PA_NUM; column++) {
+                               retired_page = soc_pa | (column << UMC_V6_7_PA_C2_BIT);
+                               amdgpu_umc_fill_error_record(err_data, err_addr,
                                        retired_page, channel_index, umc_inst);
+
+                               /* shift R14 bit */
+                               retired_page ^= (0x1ULL << UMC_V6_7_PA_R14_BIT);
+                               amdgpu_umc_fill_error_record(err_data, err_addr,
+                                       retired_page, channel_index, umc_inst);
+                       }
+               }
        }
 }
 
@@ -332,8 +344,9 @@ static void umc_v6_7_query_error_address(struct amdgpu_device *adev,
                                         uint32_t umc_inst)
 {
        uint32_t mc_umc_status_addr;
-       uint64_t mc_umc_status, err_addr, retired_page, mc_umc_addrt0;
        uint32_t channel_index;
+       uint64_t mc_umc_status, mc_umc_addrt0;
+       uint64_t err_addr, soc_pa, retired_page, column;
 
        mc_umc_status_addr =
                SOC15_REG_OFFSET(UMC, 0, regMCA_UMC_UMC0_MCUMC_STATUST0);
@@ -363,15 +376,27 @@ static void umc_v6_7_query_error_address(struct amdgpu_device *adev,
                err_addr = REG_GET_FIELD(err_addr, MCA_UMC_UMC0_MCUMC_ADDRT0, ErrorAddr);
 
                /* translate umc channel address to soc pa, 3 parts are included */
-               retired_page = ADDR_OF_8KB_BLOCK(err_addr) |
+               soc_pa = ADDR_OF_8KB_BLOCK(err_addr) |
                                ADDR_OF_256B_BLOCK(channel_index) |
                                OFFSET_IN_256B_BLOCK(err_addr);
+               /* clear [C4 C3 C2] in soc physical address */
+               soc_pa &= ~(0x7ULL << UMC_V6_7_PA_C2_BIT);
 
                /* we only save ue error information currently, ce is skipped */
                if (REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, UECC)
-                               == 1)
-                       amdgpu_umc_fill_error_record(err_data, err_addr,
+                               == 1) {
+                       /* loop for all possibilities of [C4 C3 C2] */
+                       for (column = 0; column < UMC_V6_7_NA_MAP_PA_NUM; column++) {
+                               retired_page = soc_pa | (column << UMC_V6_7_PA_C2_BIT);
+                               amdgpu_umc_fill_error_record(err_data, err_addr,
+                                       retired_page, channel_index, umc_inst);
+
+                               /* shift R14 bit */
+                               retired_page ^= (0x1ULL << UMC_V6_7_PA_R14_BIT);
+                               amdgpu_umc_fill_error_record(err_data, err_addr,
                                        retired_page, channel_index, umc_inst);
+                       }
+               }
        }
 
        /* clear umc status */
index 9adebcf98582c5807a7dccf3e3803753bd2f6fe1..b67677867b45b846686e433c86f15e7b6ee67046 100644 (file)
 #define UMC_V6_7_NA_MAP_PA_NUM 8
 /* R14 bit shift should be considered, double the number */
 #define UMC_V6_7_BAD_PAGE_NUM_PER_CHANNEL      (UMC_V6_7_NA_MAP_PA_NUM * 2)
+/* The C2 bit in SOC physical address */
+#define UMC_V6_7_PA_C2_BIT     17
+/* The R14 bit in SOC physical address */
+#define UMC_V6_7_PA_R14_BIT    34
 /* UMC register per channel offset */
 #define UMC_V6_7_PER_CHANNEL_OFFSET            0x400
 extern struct amdgpu_umc_ras umc_v6_7_ras;
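
With these defines, each reported error address expands to
UMC_V6_7_NA_MAP_PA_NUM * 2 = 16 retired pages (the 8 [C4 C3 C2] column
patterns, each taken with both values of the R14 bit), which is exactly
UMC_V6_7_BAD_PAGE_NUM_PER_CHANNEL.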