drm/amdgpu: handle eviction fence race
authorShashank Sharma <shashank.sharma@amd.com>
Wed, 20 Nov 2024 17:04:33 +0000 (18:04 +0100)
committerAlex Deucher <alexander.deucher@amd.com>
Tue, 8 Apr 2025 20:48:18 +0000 (16:48 -0400)
The eviction process can run into a race between the eviction fence
suspend work (which replaces the old fence with a new one) and
kms_close (which destroys the fence and does not expect a new one).

This patch:
- adds a flag to indicate that the fd is closing, so fence replacement
  is not required (evf_mgr->fd_closing)
- adds a flush_delayed_work() call in the eviction fence destroy
  routine, so any pending suspend work finishes before the fence is
  destroyed (see the sketch after this list)
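
To make the handshake concrete, here is a minimal sketch of the
pattern (names such as evf_mgr, suspend_worker and fence_destroy are
illustrative stand-ins for the driver's structures and functions;
fence signaling and locking are elided):

    #include <linux/workqueue.h>

    /* Hypothetical, simplified manager: only the fields relevant
     * to the race are shown.
     */
    struct evf_mgr {
            struct delayed_work suspend_work;
            bool fd_closing;
    };

    static void suspend_worker(struct work_struct *work)
    {
            struct evf_mgr *mgr = container_of(to_delayed_work(work),
                                               struct evf_mgr,
                                               suspend_work);

            /* signal_old_fence(mgr); -- placeholder for signaling */

            /* fd is closing; nobody expects a replacement fence */
            if (mgr->fd_closing)
                    return;

            /* install_new_fence(mgr); -- placeholder for replacement */
    }

    /* The caller sets mgr->fd_closing = true before calling this,
     * so the flush below guarantees no new fence can be installed
     * afterwards.
     */
    static void fence_destroy(struct evf_mgr *mgr)
    {
            flush_delayed_work(&mgr->suspend_work);
            /* drop the last fence reference here */
    }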

V2: Addressed review comments from Christian:
    - Do not use a mutex to sync
    - Use flush_work() and wait for the suspend_work to finish

V3: Fixed the state machine for queue->queue_active, which was racing
    between suspend/resume and queue map/unmap operations (see the
    sketch below)
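
A minimal sketch of the intended queue_active ownership (the type and
helper names are illustrative placeholders, not the mes_v11_0 code
itself):

    /* Hypothetical, simplified queue: only queue_active matters here */
    struct user_queue {
            bool queue_active;
    };

    /* map owns the transition to active: set only after the HW map
     * actually succeeded
     */
    static int userq_map(struct user_queue *q)
    {
            int r = 0; /* r = hw_map(q); -- placeholder */

            if (r)
                    return r;
            q->queue_active = true;
            return 0;
    }

    /* unmap owns the transition back to inactive */
    static void userq_unmap(struct user_queue *q)
    {
            /* hw_unmap(q); -- placeholder */
            q->queue_active = false;
    }

    /* destroy only unmaps a queue that is still active, so a queue
     * already unmapped by suspend is not unmapped twice
     */
    static void userq_destroy(struct user_queue *q)
    {
            if (q->queue_active)
                    userq_unmap(q);
            /* free MQD/context objects here */
    }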

Cc: Alex Deucher <alexander.deucher@amd.com>
Cc: Christian König <christian.koenig@amd.com>
Reviewed-by: Christian König <christian.koenig@amd.com>
Signed-off-by: Shashank Sharma <shashank.sharma@amd.com>
Signed-off-by: Arvind Yadav <arvind.yadav@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
drivers/gpu/drm/amd/amdgpu/amdgpu_eviction_fence.c
drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
drivers/gpu/drm/amd/amdgpu/amdgpu_userqueue.c
drivers/gpu/drm/amd/amdgpu/mes_v11_0_userqueue.c

index 189afb8727750ab03c29c137433b5b95dcc672a3..c22767a75348b0ef8f0b261edf36622c3ce6e2a8 100644 (file)
@@ -117,6 +117,10 @@ amdgpu_eviction_fence_suspend_worker(struct work_struct *work)
        /* Signal old eviction fence */
        amdgpu_eviction_fence_signal(evf_mgr);
 
+       /* Do not replace eviction fence if fd is getting closed */
+       if (evf_mgr->fd_closing)
+               return;
+
        /* Prepare the objects to replace eviction fence */
        drm_exec_init(&exec, DRM_EXEC_IGNORE_DUPLICATES, 0);
        drm_exec_until_all_locked(&exec) {
@@ -199,6 +203,9 @@ void amdgpu_eviction_fence_destroy(struct amdgpu_eviction_fence_mgr *evf_mgr)
 {
        struct amdgpu_eviction_fence *ev_fence;
 
+       /* Wait for any pending work to execute */
+       flush_delayed_work(&evf_mgr->suspend_work);
+
        spin_lock(&evf_mgr->ev_fence_lock);
        ev_fence = evf_mgr->ev_fence;
        spin_unlock(&evf_mgr->ev_fence_lock);
index 700442584f97f4e87168dd1d21cc090a28ea3eb6..f8370da080388a0c86f21eb1d93d773101e1cb3e 100644 (file)
@@ -1490,10 +1490,12 @@ void amdgpu_driver_postclose_kms(struct drm_device *dev,
                amdgpu_bo_unreserve(pd);
        }
 
+       fpriv->evf_mgr.fd_closing = true;
+       amdgpu_userq_mgr_fini(&fpriv->userq_mgr);
        amdgpu_eviction_fence_destroy(&fpriv->evf_mgr);
+
        amdgpu_ctx_mgr_fini(&fpriv->ctx_mgr);
        amdgpu_vm_fini(adev, &fpriv->vm);
-       amdgpu_userq_mgr_fini(&fpriv->userq_mgr);
 
        if (pasid)
                amdgpu_pasid_free_delayed(pd->tbo.base.resv, pasid);
index 57e502d014a12532fd7310e935e2588e20ebadd6..cba51bdf2e2c6e564261ce2ac50467a39ce7b2d8 100644 (file)
@@ -614,9 +614,10 @@ void amdgpu_userq_mgr_fini(struct amdgpu_userq_mgr *userq_mgr)
 
        cancel_delayed_work(&userq_mgr->resume_work);
 
+       mutex_lock(&userq_mgr->userq_mutex);
        idr_for_each_entry(&userq_mgr->userq_idr, queue, queue_id)
                amdgpu_userqueue_cleanup(userq_mgr, queue, queue_id);
-
        idr_destroy(&userq_mgr->userq_idr);
+       mutex_unlock(&userq_mgr->userq_mutex);
        mutex_destroy(&userq_mgr->userq_mutex);
 }
index ee0a757fcfa347815f39118a90cf8829fbbe522f..b1b7bc47d39f03cf3b940ce4de5362e0d10aa766 100644 (file)
@@ -139,6 +139,7 @@ static int mes_v11_0_userq_map(struct amdgpu_userq_mgr *uq_mgr,
                return r;
        }
 
+       queue->queue_active = true;
        DRM_DEBUG_DRIVER("Queue (doorbell:%d) mapped successfully\n", userq_props->doorbell_index);
        return 0;
 }
@@ -160,6 +161,7 @@ static void mes_v11_0_userq_unmap(struct amdgpu_userq_mgr *uq_mgr,
        amdgpu_mes_unlock(&adev->mes);
        if (r)
                DRM_ERROR("Failed to unmap queue in HW, err (%d)\n", r);
+       queue->queue_active = false;
 }
 
 static int mes_v11_0_userq_create_ctx_space(struct amdgpu_userq_mgr *uq_mgr,
@@ -331,7 +333,6 @@ static int mes_v11_0_userq_mqd_create(struct amdgpu_userq_mgr *uq_mgr,
                goto free_ctx;
        }
 
-       queue->queue_active = true;
        return 0;
 
 free_ctx:
@@ -350,12 +351,12 @@ static void
 mes_v11_0_userq_mqd_destroy(struct amdgpu_userq_mgr *uq_mgr,
                            struct amdgpu_usermode_queue *queue)
 {
-       mes_v11_0_userq_unmap(uq_mgr, queue);
-       amdgpu_bo_unref(&queue->wptr_obj.obj);
+       if (queue->queue_active)
+               mes_v11_0_userq_unmap(uq_mgr, queue);
+
        amdgpu_userqueue_destroy_object(uq_mgr, &queue->fw_obj);
        kfree(queue->userq_prop);
        amdgpu_userqueue_destroy_object(uq_mgr, &queue->mqd);
-       queue->queue_active = false;
 }
 
 static int mes_v11_0_userq_suspend(struct amdgpu_userq_mgr *uq_mgr,