From 99cab331a4ee621e3604542ca88f9d76f2865aef Mon Sep 17 00:00:00 2001 From: YiPeng Chai Date: Tue, 12 Dec 2023 17:23:23 +0800 Subject: [PATCH] drm/amdgpu: Add umc page retirement for umc v12_0 Add umc page retirement for umc v12_0. V2: 1. Changed umc page retirement check condition to call umc_v12_0_is_uncorrectable_error. 2. Use memset to clear the contents of the umc error address structure. Signed-off-by: YiPeng Chai Reviewed-by: Hawking Zhang Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/umc_v12_0.c | 56 ++++++++++++++++++++++++++ drivers/gpu/drm/amd/amdgpu/umc_v12_0.h | 4 ++ 2 files changed, 60 insertions(+) diff --git a/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c b/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c index 8430888760ba..7458a218e89d 100644 --- a/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c +++ b/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c @@ -26,6 +26,7 @@ #include "amdgpu.h" #include "umc/umc_12_0_0_offset.h" #include "umc/umc_12_0_0_sh_mask.h" +#include "mp/mp_13_0_6_sh_mask.h" const uint32_t umc_v12_0_channel_idx_tbl[] @@ -370,6 +371,59 @@ static int umc_v12_0_err_cnt_init_per_channel(struct amdgpu_device *adev, return 0; } +static void umc_v12_0_ecc_info_query_ras_error_count(struct amdgpu_device *adev, + void *ras_error_status) +{ + amdgpu_mca_smu_log_ras_error(adev, + AMDGPU_RAS_BLOCK__UMC, AMDGPU_MCA_ERROR_TYPE_CE, ras_error_status); + amdgpu_mca_smu_log_ras_error(adev, + AMDGPU_RAS_BLOCK__UMC, AMDGPU_MCA_ERROR_TYPE_UE, ras_error_status); +} + +static void umc_v12_0_ecc_info_query_ras_error_address(struct amdgpu_device *adev, + void *ras_error_status) +{ + struct ras_err_node *err_node; + uint64_t mc_umc_status; + struct ras_err_data *err_data = (struct ras_err_data *)ras_error_status; + + for_each_ras_error(err_node, err_data) { + mc_umc_status = err_node->err_info.err_addr.err_status; + if (!mc_umc_status) + continue; + + if (umc_v12_0_is_uncorrectable_error(adev, mc_umc_status)) { + uint64_t mca_addr, err_addr, mca_ipid; + uint32_t InstanceIdLo; + struct amdgpu_smuio_mcm_config_info *mcm_info; + + mcm_info = &err_node->err_info.mcm_info; + mca_addr = err_node->err_info.err_addr.err_addr; + mca_ipid = err_node->err_info.err_addr.err_ipid; + + err_addr = REG_GET_FIELD(mca_addr, MCA_UMC_UMC0_MCUMC_ADDRT0, ErrorAddr); + InstanceIdLo = REG_GET_FIELD(mca_ipid, MCMP1_IPIDT0, InstanceIdLo); + + dev_info(adev->dev, "UMC:IPID:0x%llx, aid:%d, inst:%d, ch:%d, err_addr:0x%llx\n", + mca_ipid, + mcm_info->die_id, + MCA_IPID_LO_2_UMC_INST(InstanceIdLo), + MCA_IPID_LO_2_UMC_CH(InstanceIdLo), + err_addr); + + umc_v12_0_convert_error_address(adev, + err_data, err_addr, + MCA_IPID_LO_2_UMC_CH(InstanceIdLo), + MCA_IPID_LO_2_UMC_INST(InstanceIdLo), + mcm_info->die_id); + + /* Clear umc error address content */ + memset(&err_node->err_info.err_addr, + 0, sizeof(err_node->err_info.err_addr)); + } + } +} + static void umc_v12_0_err_cnt_init(struct amdgpu_device *adev) { amdgpu_umc_loop_channels(adev, @@ -396,4 +450,6 @@ struct amdgpu_umc_ras umc_v12_0_ras = { }, .err_cnt_init = umc_v12_0_err_cnt_init, .query_ras_poison_mode = umc_v12_0_query_ras_poison_mode, + .ecc_info_query_ras_error_count = umc_v12_0_ecc_info_query_ras_error_count, + .ecc_info_query_ras_error_address = umc_v12_0_ecc_info_query_ras_error_address, }; diff --git a/drivers/gpu/drm/amd/amdgpu/umc_v12_0.h b/drivers/gpu/drm/amd/amdgpu/umc_v12_0.h index 17b4b52d6f13..e8de3a92251a 100644 --- a/drivers/gpu/drm/amd/amdgpu/umc_v12_0.h +++ b/drivers/gpu/drm/amd/amdgpu/umc_v12_0.h @@ -117,6 +117,10 @@ (pa) |= (UMC_V12_0_CHANNEL_HASH_CH6(channel_idx, pa) << UMC_V12_0_PA_CH6_BIT); \ } while (0) +#define MCA_IPID_LO_2_UMC_CH(_ipid_lo) (((((_ipid_lo) >> 20) & 0x1) * 4) + \ + (((_ipid_lo) >> 12) & 0xF)) +#define MCA_IPID_LO_2_UMC_INST(_ipid_lo) (((_ipid_lo) >> 21) & 0x7) + bool umc_v12_0_is_uncorrectable_error(struct amdgpu_device *adev, uint64_t mc_umc_status); bool umc_v12_0_is_correctable_error(struct amdgpu_device *adev, uint64_t mc_umc_status); -- 2.25.1