drm/amdgpu: protect RAS sysfs during GPU reset
author: John Clements <john.clements@amd.com>
Thu, 19 Mar 2020 06:41:55 +0000 (14:41 +0800)
committer: Alex Deucher <alexander.deucher@amd.com>
Fri, 20 Mar 2020 14:45:00 +0000 (10:45 -0400)
The MMHub EDC registers become dirty after a BACO reset.

The EDC registers should be cleared early in the reset phase.

Reviewed-by: Hawking Zhang <Hawking.Zhang@amd.com>
Signed-off-by: John Clements <john.clements@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c

index 6f469facabfbf84917ece89591213aeecbd72d89..faa3e7102156c755d8d32063d9d59f0b0cceed7b 100644 (file)
@@ -2742,6 +2742,9 @@ static void amdgpu_device_xgmi_reset_func(struct work_struct *__work)
 
                if (adev->asic_reset_res)
                        goto fail;
+
+               if (adev->mmhub.funcs && adev->mmhub.funcs->reset_ras_error_count)
+                       adev->mmhub.funcs->reset_ras_error_count(adev);
        } else {
 
                task_barrier_full(&hive->tb);
@@ -3910,8 +3913,15 @@ static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,
                }
        }
 
-       if (!r && amdgpu_ras_intr_triggered())
+       if (!r && amdgpu_ras_intr_triggered()) {
+               list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
+                       if (tmp_adev->mmhub.funcs &&
+                           tmp_adev->mmhub.funcs->reset_ras_error_count)
+                               tmp_adev->mmhub.funcs->reset_ras_error_count(tmp_adev);
+               }
+
                amdgpu_ras_intr_cleared();
+       }
 
        list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
                if (need_full_reset) {
index 43055a01f35e428487230e9007bab7aeda0942b6..3c32a94d24240a9adefd469264c5b014cd3821be 100644 (file)
@@ -281,6 +281,11 @@ static ssize_t amdgpu_ras_debugfs_ctrl_write(struct file *f, const char __user *
        struct ras_debug_if data;
        int ret = 0;
 
+       if (amdgpu_ras_intr_triggered()) {
+               DRM_WARN("RAS WARN: error injection currently inaccessible\n");
+               return size;
+       }
+
        ret = amdgpu_ras_debugfs_ctrl_parse_data(f, buf, size, pos, &data);
        if (ret)
                return -EINVAL;
@@ -394,6 +399,10 @@ static ssize_t amdgpu_ras_sysfs_read(struct device *dev,
                .head = obj->head,
        };
 
+       if (amdgpu_ras_intr_triggered())
+               return snprintf(buf, PAGE_SIZE,
+                               "Query currently inaccessible\n");
+
        if (amdgpu_ras_error_query(obj->adev, &info))
                return -EINVAL;