drm/amdgpu: optimize the gpu reset for XGMI setup V2

author Evan Quan <evan.quan@amd.com>

Thu, 16 Apr 2020 04:27:28 +0000 (12:27 +0800)

committer Alex Deucher <alexander.deucher@amd.com>

Wed, 22 Apr 2020 22:11:49 +0000 (18:11 -0400)
author Evan Quan <evan.quan@amd.com>
Thu, 16 Apr 2020 04:27:28 +0000 (12:27 +0800)
committer Alex Deucher <alexander.deucher@amd.com>
Wed, 22 Apr 2020 22:11:49 +0000 (18:11 -0400)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c

index 4be5187391c65a0028996e893506333e57346bf3..e75deb789e5674da3493a5f43d4f5c112a373ae1 100644 (file)
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -4152,16 +4152,11 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
         }
  
         need_full_reset = job_signaled = false;
-       INIT_LIST_HEAD(&device_list);
-
-       amdgpu_ras_set_error_query_ready(adev, false);
  
         dev_info(adev->dev, "GPU %s begin!\n",
                 (in_ras_intr && !use_baco) ? "jobs stop":"reset");
  
-       cancel_delayed_work_sync(&adev->delayed_init_work);
-
-       hive = amdgpu_get_xgmi_hive(adev, false);
+       hive = amdgpu_get_xgmi_hive(adev, true);
  
         /*
          * Here we trylock to avoid chain of resets executing from
@@ -4174,35 +4169,21 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
         if (hive && !mutex_trylock(&hive->reset_lock)) {
                 DRM_INFO("Bailing on TDR for s_job:%llx, hive: %llx as another already in progress",
                           job ? job->base.id : -1, hive->hive_id);
+               mutex_unlock(&hive->hive_lock);
                 return 0;
         }
  
-       /* Start with adev pre asic reset first for soft reset check.*/
-       if (!amdgpu_device_lock_adev(adev, !hive)) {
-               DRM_INFO("Bailing on TDR for s_job:%llx, as another already in progress",
-                         job ? job->base.id : -1);
-               return 0;
-       }
-
-       /* Block kfd: SRIOV would do it separately */
-       if (!amdgpu_sriov_vf(adev))
-                amdgpu_amdkfd_pre_reset(adev);
-
-       /* Build list of devices to reset */
-       if  (adev->gmc.xgmi.num_physical_nodes > 1) {
-               if (!hive) {
-                       /*unlock kfd: SRIOV would do it separately */
-                       if (!amdgpu_sriov_vf(adev))
-                               amdgpu_amdkfd_post_reset(adev);
-                       amdgpu_device_unlock_adev(adev);
+       /*
+        * Build list of devices to reset.
+        * In case we are in XGMI hive mode, resort the device list
+        * to put adev in the 1st position.
+        */
+       INIT_LIST_HEAD(&device_list);
+       if (adev->gmc.xgmi.num_physical_nodes > 1) {
+               if (!hive)
                         return -ENODEV;
-               }
-
-               /*
-                * In case we are in XGMI hive mode device reset is done for all the
-                * nodes in the hive to retrain all XGMI links and hence the reset
-                * sequence is executed in loop on all nodes.
-                */
+               if (!list_is_first(&adev->gmc.xgmi.head, &hive->device_list))
+                       list_rotate_to_front(&adev->gmc.xgmi.head, &hive->device_list);
                 device_list_handle = &hive->device_list;
         } else {
                 list_add_tail(&adev->gmc.xgmi.head, &device_list);
@@ -4211,15 +4192,20 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
  
         /* block all schedulers and reset given job's ring */
         list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
-               if (tmp_adev != adev) {
-                       amdgpu_ras_set_error_query_ready(tmp_adev, false);
-                       amdgpu_device_lock_adev(tmp_adev, false);
-                       if (!amdgpu_sriov_vf(tmp_adev))
-                                       amdgpu_amdkfd_pre_reset(tmp_adev);
+               if (!amdgpu_device_lock_adev(tmp_adev, !hive)) {
+                       DRM_INFO("Bailing on TDR for s_job:%llx, as another already in progress",
+                                 job ? job->base.id : -1);
+                       mutex_unlock(&hive->hive_lock);
+                       return 0;
                 }
  
+               amdgpu_ras_set_error_query_ready(tmp_adev, false);
+
                 cancel_delayed_work_sync(&tmp_adev->delayed_init_work);
  
+               if (!amdgpu_sriov_vf(tmp_adev))
+                       amdgpu_amdkfd_pre_reset(tmp_adev);
+
                 /*
                  * Mark these ASICs to be reseted as untracked first
                  * And add them back after reset completed
@@ -4265,22 +4251,8 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
                 goto skip_hw_reset;
         }
  
-
-       /* Guilty job will be freed after this*/
-       r = amdgpu_device_pre_asic_reset(adev, job, &need_full_reset);
-       if (r) {
-               /*TODO Should we stop ?*/
-               DRM_ERROR("GPU pre asic reset failed with err, %d for drm dev, %s ",
-                         r, adev->ddev->unique);
-               adev->asic_reset_res = r;
-       }
-
  retry: /* Rest of adevs pre asic reset from XGMI hive. */
         list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
-
-               if (tmp_adev == adev)
-                       continue;
-
                 r = amdgpu_device_pre_asic_reset(tmp_adev,
                                                  NULL,
                                                  &need_full_reset);
@@ -4345,8 +4317,10 @@ skip_sched_resume:
                 amdgpu_device_unlock_adev(tmp_adev);
         }
  
-       if (hive)
+       if (hive) {
                 mutex_unlock(&hive->reset_lock);
+               mutex_unlock(&hive->hive_lock);
+       }
  
         if (r)
                 dev_info(adev->dev, "GPU reset end with ret = %d\n", r);
author	Evan Quan <evan.quan@amd.com>
	Thu, 16 Apr 2020 04:27:28 +0000 (12:27 +0800)
committer	Alex Deucher <alexander.deucher@amd.com>
	Wed, 22 Apr 2020 22:11:49 +0000 (18:11 -0400)