From: Alex Deucher Date: Thu, 26 Jun 2025 12:58:21 +0000 (-0400) Subject: drm/amdgpu/sdma: allow caller to handle kernel rings in engine reset X-Git-Tag: io_uring-6.17-20250815~29^2~10^2~4 X-Git-Url: https://git.kernel.dk/?a=commitdiff_plain;h=0c3c2e334c4fd00ed7a8ddb9c163a8c1138af1f1;p=linux-block.git drm/amdgpu/sdma: allow caller to handle kernel rings in engine reset Add a parameter to amdgpu_sdma_reset_engine() to let the caller handle the kernel rings. This allows the kernel rings to back up their unprocessed state if the reset comes in via the drm scheduler rather than KFD. Reviewed-by: Jesse Zhang Signed-off-by: Alex Deucher --- diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c index 56939bb1d1a9..8b8a04138711 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c @@ -545,10 +545,13 @@ static int amdgpu_sdma_soft_reset(struct amdgpu_device *adev, u32 instance_id) * amdgpu_sdma_reset_engine - Reset a specific SDMA engine * @adev: Pointer to the AMDGPU device * @instance_id: Logical ID of the SDMA engine instance to reset + * @caller_handles_kernel_queues: Skip kernel queue processing. Caller + * will handle it. * * Returns: 0 on success, or a negative error code on failure. */ -int amdgpu_sdma_reset_engine(struct amdgpu_device *adev, uint32_t instance_id) +int amdgpu_sdma_reset_engine(struct amdgpu_device *adev, uint32_t instance_id, + bool caller_handles_kernel_queues) { int ret = 0; struct amdgpu_sdma_instance *sdma_instance = &adev->sdma.instance[instance_id]; @@ -556,14 +559,17 @@ int amdgpu_sdma_reset_engine(struct amdgpu_device *adev, uint32_t instance_id) struct amdgpu_ring *page_ring = &sdma_instance->page; mutex_lock(&sdma_instance->engine_reset_mutex); - /* Stop the scheduler's work queue for the GFX and page rings if they are running. - * This ensures that no new tasks are submitted to the queues while - * the reset is in progress. - */ - drm_sched_wqueue_stop(&gfx_ring->sched); - if (adev->sdma.has_page_queue) - drm_sched_wqueue_stop(&page_ring->sched); + if (!caller_handles_kernel_queues) { + /* Stop the scheduler's work queue for the GFX and page rings if they are running. + * This ensures that no new tasks are submitted to the queues while + * the reset is in progress. + */ + drm_sched_wqueue_stop(&gfx_ring->sched); + + if (adev->sdma.has_page_queue) + drm_sched_wqueue_stop(&page_ring->sched); + } if (sdma_instance->funcs->stop_kernel_queue) { sdma_instance->funcs->stop_kernel_queue(gfx_ring); @@ -585,16 +591,18 @@ int amdgpu_sdma_reset_engine(struct amdgpu_device *adev, uint32_t instance_id) } exit: - /* Restart the scheduler's work queue for the GFX and page rings - * if they were stopped by this function. This allows new tasks - * to be submitted to the queues after the reset is complete. - */ - if (!ret) { - amdgpu_fence_driver_force_completion(gfx_ring); - drm_sched_wqueue_start(&gfx_ring->sched); - if (adev->sdma.has_page_queue) { - amdgpu_fence_driver_force_completion(page_ring); - drm_sched_wqueue_start(&page_ring->sched); + if (!caller_handles_kernel_queues) { + /* Restart the scheduler's work queue for the GFX and page rings + * if they were stopped by this function. This allows new tasks + * to be submitted to the queues after the reset is complete. + */ + if (!ret) { + amdgpu_fence_driver_force_completion(gfx_ring); + drm_sched_wqueue_start(&gfx_ring->sched); + if (adev->sdma.has_page_queue) { + amdgpu_fence_driver_force_completion(page_ring); + drm_sched_wqueue_start(&page_ring->sched); + } } } mutex_unlock(&sdma_instance->engine_reset_mutex); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.h index e5f8951bbb6f..34311f32be4c 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.h @@ -172,7 +172,8 @@ struct amdgpu_buffer_funcs { uint32_t byte_count); }; -int amdgpu_sdma_reset_engine(struct amdgpu_device *adev, uint32_t instance_id); +int amdgpu_sdma_reset_engine(struct amdgpu_device *adev, uint32_t instance_id, + bool caller_handles_kernel_queues); #define amdgpu_emit_copy_buffer(adev, ib, s, d, b, t) (adev)->mman.buffer_funcs->emit_copy_buffer((ib), (s), (d), (b), (t)) #define amdgpu_emit_fill_buffer(adev, ib, s, d, b) (adev)->mman.buffer_funcs->emit_fill_buffer((ib), (s), (d), (b)) diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c b/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c index a7e1dbe03b29..20fad2525969 100644 --- a/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c +++ b/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c @@ -1668,7 +1668,7 @@ static int sdma_v4_4_2_reset_queue(struct amdgpu_ring *ring, return -EOPNOTSUPP; amdgpu_amdkfd_suspend(adev, true); - r = amdgpu_sdma_reset_engine(adev, id); + r = amdgpu_sdma_reset_engine(adev, id, false); amdgpu_amdkfd_resume(adev, true); return r; } diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c b/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c index ed1706da7dee..5a1098bdd825 100644 --- a/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c +++ b/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c @@ -1548,7 +1548,7 @@ static int sdma_v5_0_reset_queue(struct amdgpu_ring *ring, int r; amdgpu_amdkfd_suspend(adev, true); - r = amdgpu_sdma_reset_engine(adev, inst_id); + r = amdgpu_sdma_reset_engine(adev, inst_id, false); amdgpu_amdkfd_resume(adev, true); return r; diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v5_2.c b/drivers/gpu/drm/amd/amdgpu/sdma_v5_2.c index b87a4b44fa93..6843c2c3d71f 100644 --- a/drivers/gpu/drm/amd/amdgpu/sdma_v5_2.c +++ b/drivers/gpu/drm/amd/amdgpu/sdma_v5_2.c @@ -1461,7 +1461,7 @@ static int sdma_v5_2_reset_queue(struct amdgpu_ring *ring, int r; amdgpu_amdkfd_suspend(adev, true); - r = amdgpu_sdma_reset_engine(adev, inst_id); + r = amdgpu_sdma_reset_engine(adev, inst_id, false); amdgpu_amdkfd_resume(adev, true); return r; diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c index 500f51552038..2d91027e2a74 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c @@ -2312,7 +2312,7 @@ static int reset_hung_queues_sdma(struct device_queue_manager *dqm) continue; /* Reset engine and check. */ - if (amdgpu_sdma_reset_engine(dqm->dev->adev, i) || + if (amdgpu_sdma_reset_engine(dqm->dev->adev, i, false) || dqm->dev->kfd2kgd->hqd_sdma_get_doorbell(dqm->dev->adev, i, j) || !set_sdma_queue_as_reset(dqm, doorbell_off)) { r = -ENOTRECOVERABLE;