drm/amdgpu: Fix RAS function interface
author Luben Tuikov <luben.tuikov@amd.com>
Wed, 19 May 2021 01:07:17 +0000 (21:07 -0400)
committer Alex Deucher <alexander.deucher@amd.com>
Thu, 27 May 2021 16:22:54 +0000 (12:22 -0400)
The correctable and uncorrectable errors
are calculated at each invocation of this
function. Therefore, it is highly inefficient to
return just one of them based on a Boolean
input. If the caller wants both, twice the work
would be done. (And this work is O(n^3) on
Vega20.)

Fix this "interface" to simply return what it had
calculated--both values. Let the caller choose
what it wants to record, inspect, use.

Cc: Alexander Deucher <Alexander.Deucher@amd.com>
Cc: John Clements <john.clements@amd.com>
Cc: Hawking Zhang <Hawking.Zhang@amd.com>
Signed-off-by: Luben Tuikov <luben.tuikov@amd.com>
Reviewed-by: Alexander Deucher <Alexander.Deucher@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h

index e3a4c3a7635ac050b64dc2249b8e63267054f021..ed3c43e8b0b547b28c56717c302494c572c990ba 100644 (file)
@@ -1043,29 +1043,36 @@ int amdgpu_ras_error_inject(struct amdgpu_device *adev,
 }
 
 /* get the total error counts on all IPs */
-unsigned long amdgpu_ras_query_error_count(struct amdgpu_device *adev,
-               bool is_ce)
+void amdgpu_ras_query_error_count(struct amdgpu_device *adev,
+                                 unsigned long *ce_count,
+                                 unsigned long *ue_count)
 {
        struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
        struct ras_manager *obj;
-       struct ras_err_data data = {0, 0};
+       unsigned long ce, ue;
 
        if (!adev->ras_enabled || !con)
-               return 0;
+               return;
 
+       ce = 0;
+       ue = 0;
        list_for_each_entry(obj, &con->head, node) {
                struct ras_query_if info = {
                        .head = obj->head,
                };
 
                if (amdgpu_ras_query_error_status(adev, &info))
-                       return 0;
+                       return;
 
-               data.ce_count += info.ce_count;
-               data.ue_count += info.ue_count;
+               ce += info.ce_count;
+               ue += info.ue_count;
        }
 
-       return is_ce ? data.ce_count : data.ue_count;
+       if (ce_count)
+               *ce_count = ce;
+
+       if (ue_count)
+               *ue_count = ue;
 }
 /* query/inject/cure end */
 
index bfa40c8ecc94e0b16546331f6e0dc168148aa00a..10fca03931068689600142443716ce7130dff603 100644 (file)
@@ -485,8 +485,9 @@ int amdgpu_ras_request_reset_on_boot(struct amdgpu_device *adev,
 void amdgpu_ras_resume(struct amdgpu_device *adev);
 void amdgpu_ras_suspend(struct amdgpu_device *adev);
 
-unsigned long amdgpu_ras_query_error_count(struct amdgpu_device *adev,
-               bool is_ce);
+void amdgpu_ras_query_error_count(struct amdgpu_device *adev,
+                                 unsigned long *ce_count,
+                                 unsigned long *ue_count);
 
 /* error handling functions */
 int amdgpu_ras_add_bad_pages(struct amdgpu_device *adev,