drm/amdgpu: Add reset_context flag for host FLR
author Yunxiang Li <Yunxiang.Li@amd.com>
Mon, 22 Apr 2024 18:44:38 +0000 (14:44 -0400)
committer Alex Deucher <alexander.deucher@amd.com>
Thu, 2 May 2024 19:40:50 +0000 (15:40 -0400)
There are other reset sources that pass NULL as the job pointer, such as
amdgpu_amdkfd_reset_work. Therefore, using the job pointer to check if
the FLR comes from the host does not work.

Add a flag in reset_context to explicitly mark a host-triggered reset,
and set this flag when we receive a host reset notification.

Signed-off-by: Yunxiang Li <Yunxiang.Li@amd.com>
Reviewed-by: Emily Deng <Emily.Deng@amd.com>
Reviewed-by: Zhigang Luo <zhigang.luo@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c

index 8befd10bf00769e410d388b2690cf72b57477d82..33c889c027a5c0bae5e1fb70a7ec4db43cc9dcbd 100644 (file)
@@ -5055,13 +5055,13 @@ static int amdgpu_device_recover_vram(struct amdgpu_device *adev)
  * amdgpu_device_reset_sriov - reset ASIC for SR-IOV vf
  *
  * @adev: amdgpu_device pointer
- * @from_hypervisor: request from hypervisor
+ * @reset_context: amdgpu reset context pointer
  *
  * do VF FLR and reinitialize Asic
  * return 0 means succeeded otherwise failed
  */
 static int amdgpu_device_reset_sriov(struct amdgpu_device *adev,
-                                    bool from_hypervisor)
+                                    struct amdgpu_reset_context *reset_context)
 {
        int r;
        struct amdgpu_hive_info *hive = NULL;
@@ -5070,12 +5070,15 @@ static int amdgpu_device_reset_sriov(struct amdgpu_device *adev,
 retry:
        amdgpu_amdkfd_pre_reset(adev);
 
-       if (from_hypervisor)
+       if (test_bit(AMDGPU_HOST_FLR, &reset_context->flags)) {
+               clear_bit(AMDGPU_HOST_FLR, &reset_context->flags);
                r = amdgpu_virt_request_full_gpu(adev, true);
-       else
+       } else {
                r = amdgpu_virt_reset_gpu(adev);
+       }
        if (r)
                return r;
+
        amdgpu_ras_set_fed(adev, false);
        amdgpu_irq_gpu_reset_resume_helper(adev);
 
@@ -5826,7 +5829,7 @@ retry:    /* Rest of adevs pre asic reset from XGMI hive. */
        /* Actual ASIC resets if needed.*/
        /* Host driver will handle XGMI hive reset for SRIOV */
        if (amdgpu_sriov_vf(adev)) {
-               r = amdgpu_device_reset_sriov(adev, job ? false : true);
+               r = amdgpu_device_reset_sriov(adev, reset_context);
                if (r)
                        adev->asic_reset_res = r;
 
index b11d190ece53514c61c0cb81850b51e44e8182cc..5a9cc043b858386c1806465a342ea0e5ae9fb52c 100644 (file)
@@ -33,6 +33,7 @@ enum AMDGPU_RESET_FLAGS {
        AMDGPU_NEED_FULL_RESET = 0,
        AMDGPU_SKIP_HW_RESET = 1,
        AMDGPU_SKIP_COREDUMP = 2,
+       AMDGPU_HOST_FLR = 3,
 };
 
 struct amdgpu_reset_context {
index c5ba9c4757a82f82a371c3cc1d4b1075fb0efe43..f4c47492e0cd5859eddbe767b8a0d068b84a8243 100644 (file)
@@ -292,6 +292,7 @@ flr_done:
                reset_context.method = AMD_RESET_METHOD_NONE;
                reset_context.reset_req_dev = adev;
                clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
+               set_bit(AMDGPU_HOST_FLR, &reset_context.flags);
 
                amdgpu_device_gpu_recover(adev, NULL, &reset_context);
        }
index fb7cf4214e3a52e3d55c47f181b9896918d98327..37b49a5ed2a165c944286b6527c915941320be0b 100644 (file)
@@ -328,6 +328,7 @@ flr_done:
                reset_context.method = AMD_RESET_METHOD_NONE;
                reset_context.reset_req_dev = adev;
                clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
+               set_bit(AMDGPU_HOST_FLR, &reset_context.flags);
 
                amdgpu_device_gpu_recover(adev, NULL, &reset_context);
        }
index 14a065516ae4aa05b08534924cbdfe2aa516bfef..78cd07744ebe4a864bbf2063072922ae2051a70f 100644 (file)
@@ -529,6 +529,7 @@ static void xgpu_vi_mailbox_flr_work(struct work_struct *work)
                reset_context.method = AMD_RESET_METHOD_NONE;
                reset_context.reset_req_dev = adev;
                clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
+               set_bit(AMDGPU_HOST_FLR, &reset_context.flags);
 
                amdgpu_device_gpu_recover(adev, NULL, &reset_context);
        }