drm/amdgpu: skip umc ras error count harvest
authorStanley.Yang <Stanley.Yang@amd.com>
Tue, 7 Dec 2021 06:28:58 +0000 (14:28 +0800)
committerAlex Deucher <alexander.deucher@amd.com>
Tue, 7 Dec 2021 18:12:19 +0000 (13:12 -0500)
Remove the in-recovery state check and skip the umc ras error count
harvest in amdgpu_ras_log_on_err_counter.

Signed-off-by: Stanley.Yang <Stanley.Yang@amd.com>
Reviewed-by: Hawking Zhang <Hawking.Zhang@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c

index 1043d41b6807777f05a6aeb0810ee95f64d72b0c..a95d200adff9674154e790874b4589bb5727fb8f 100644 (file)
@@ -897,11 +897,6 @@ static void amdgpu_ras_get_ecc_info(struct amdgpu_device *adev, struct ras_err_d
        struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
        int ret = 0;
 
-       /* skip get ecc info during gpu recovery */
-       if (atomic_read(&ras->in_recovery) == 1 &&
-               adev->ip_versions[MP1_HWIP][0] == IP_VERSION(13, 0, 2))
-               return;
-
        /*
         * choosing right query method according to
         * whether smu support query error information
@@ -1752,6 +1747,16 @@ static void amdgpu_ras_log_on_err_counter(struct amdgpu_device *adev)
                if (info.head.block == AMDGPU_RAS_BLOCK__PCIE_BIF)
                        continue;
 
+               /*
+                * This is a workaround for aldebaran: skip sending the msg
+                * to smu to get the ecc_info table, because smu temporarily
+                * fails to handle the get ecc_info table request.
+                * It should be removed once smu fixes ecc_info table handling.
+                */
+               if ((info.head.block == AMDGPU_RAS_BLOCK__UMC) &&
+                       (adev->ip_versions[MP1_HWIP][0] == IP_VERSION(13, 0, 2)))
+                       continue;
+
                amdgpu_ras_query_error_status(adev, &info);
        }
 }