drm/amdgpu: add reset_ras_error_count function for GFX
author Hawking Zhang <Hawking.Zhang@amd.com>
Mon, 2 Mar 2020 05:27:59 +0000 (13:27 +0800)
committer Alex Deucher <alexander.deucher@amd.com>
Thu, 5 Mar 2020 05:32:47 +0000 (00:32 -0500)
GFX RAS error counters hold dirty (stale) values after a cold reboot.
A read operation is needed to reset them to 0.

Signed-off-by: Hawking Zhang <Hawking.Zhang@amd.com>
Reviewed-by: Alex Deucher <alexander.deucher@amd.com>
Reviewed-by: Tao Zhou <tao.zhou1@amd.com>
Reviewed-by: Guchun Chen <guchun.chen@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
drivers/gpu/drm/amd/amdgpu/gfx_v9_4.c
drivers/gpu/drm/amd/amdgpu/gfx_v9_4.h

index 37ba05b63b2a17656576ad8e4562ecc47b675579..bb05cb7b3f085ddb0530da9baaea243ce14e0ca2 100644 (file)
@@ -206,6 +206,7 @@ struct amdgpu_gfx_funcs {
                                 u32 queue, u32 vmid);
        int (*ras_error_inject)(struct amdgpu_device *adev, void *inject_if);
        int (*query_ras_error_count) (struct amdgpu_device *adev, void *ras_error_status);
+       void (*reset_ras_error_count) (struct amdgpu_device *adev);
 };
 
 struct sq_work {
index 0ad35976ca7dc9ad74a60cda79bd406046a09a86..6e0817e86668c7e4f3e27357f32956566a084d8f 100644 (file)
@@ -738,9 +738,9 @@ static void gfx_v9_0_ring_emit_de_meta(struct amdgpu_ring *ring);
 static u64 gfx_v9_0_ring_get_rptr_compute(struct amdgpu_ring *ring);
 static int gfx_v9_0_query_ras_error_count(struct amdgpu_device *adev,
                                          void *ras_error_status);
-static void gfx_v9_0_clear_ras_edc_counter(struct amdgpu_device *adev);
 static int gfx_v9_0_ras_error_inject(struct amdgpu_device *adev,
                                     void *inject_if);
+static void gfx_v9_0_reset_ras_error_count(struct amdgpu_device *adev);
 
 static void gfx_v9_0_kiq_set_resources(struct amdgpu_ring *kiq_ring,
                                uint64_t queue_mask)
@@ -1997,7 +1997,8 @@ static const struct amdgpu_gfx_funcs gfx_v9_0_gfx_funcs = {
        .read_wave_vgprs = &gfx_v9_0_read_wave_vgprs,
        .select_me_pipe_q = &gfx_v9_0_select_me_pipe_q,
        .ras_error_inject = &gfx_v9_0_ras_error_inject,
-       .query_ras_error_count = &gfx_v9_0_query_ras_error_count
+       .query_ras_error_count = &gfx_v9_0_query_ras_error_count,
+       .reset_ras_error_count = &gfx_v9_0_reset_ras_error_count,
 };
 
 static const struct amdgpu_gfx_funcs gfx_v9_4_gfx_funcs = {
@@ -2008,7 +2009,8 @@ static const struct amdgpu_gfx_funcs gfx_v9_4_gfx_funcs = {
        .read_wave_vgprs = &gfx_v9_0_read_wave_vgprs,
        .select_me_pipe_q = &gfx_v9_0_select_me_pipe_q,
        .ras_error_inject = &gfx_v9_4_ras_error_inject,
-       .query_ras_error_count = &gfx_v9_4_query_ras_error_count
+       .query_ras_error_count = &gfx_v9_4_query_ras_error_count,
+       .reset_ras_error_count = &gfx_v9_4_reset_ras_error_count,
 };
 
 static int gfx_v9_0_gpu_early_init(struct amdgpu_device *adev)
@@ -4395,18 +4397,6 @@ static int gfx_v9_0_do_edc_gpr_workarounds(struct amdgpu_device *adev)
                goto fail;
        }
 
-       switch (adev->asic_type)
-       {
-       case CHIP_VEGA20:
-               gfx_v9_0_clear_ras_edc_counter(adev);
-               break;
-       case CHIP_ARCTURUS:
-               gfx_v9_4_clear_ras_edc_counter(adev);
-               break;
-       default:
-               break;
-       }
-
 fail:
        amdgpu_ib_free(adev, &ib, NULL);
        dma_fence_put(f);
@@ -4454,6 +4444,10 @@ static int gfx_v9_0_ecc_late_init(void *handle)
        if (r)
                return r;
 
+       if (adev->gfx.funcs &&
+           adev->gfx.funcs->reset_ras_error_count)
+               adev->gfx.funcs->reset_ras_error_count(adev);
+
        r = amdgpu_gfx_ras_late_init(adev);
        if (r)
                return r;
@@ -6388,7 +6382,7 @@ static int gfx_v9_0_ras_error_count(const struct soc15_reg_entry *reg,
        return 0;
 }
 
-static void gfx_v9_0_clear_ras_edc_counter(struct amdgpu_device *adev)
+static void gfx_v9_0_reset_ras_error_count(struct amdgpu_device *adev)
 {
        int i, j, k;
 
index f099f13d7f1e93d95a0fdcc847cb6b6c0b907270..17f1e7b69a60350c73a7474a5c1eb216c6e5e20a 100644 (file)
@@ -893,7 +893,7 @@ int gfx_v9_4_query_ras_error_count(struct amdgpu_device *adev,
        return 0;
 }
 
-void gfx_v9_4_clear_ras_edc_counter(struct amdgpu_device *adev)
+void gfx_v9_4_reset_ras_error_count(struct amdgpu_device *adev)
 {
        int i, j, k;
 
index 2e3f6f755ad47de7d9799bbe5a77aa7e2ac7f6da..1ffecc5c0f0a9c7818a8f78f65851f5417dfb834 100644 (file)
@@ -32,4 +32,6 @@ int gfx_v9_4_query_ras_error_count(struct amdgpu_device *adev,
 int gfx_v9_4_ras_error_inject(struct amdgpu_device *adev,
                                     void *inject_if);
 
+void gfx_v9_4_reset_ras_error_count(struct amdgpu_device *adev);
+
 #endif /* __GFX_V9_4_H__ */