drm/amdkfd: implement per queue sdma reset for gfx 9.4+
author: Jonathan Kim <jonathan.kim@amd.com>
Wed, 15 Jan 2025 20:29:34 +0000 (15:29 -0500)
committer: Alex Deucher <alexander.deucher@amd.com>
Wed, 5 Mar 2025 15:47:26 +0000 (10:47 -0500)
To reset hung SDMA queues on GFX 9.4+ for the GFX9 family, a soft reset
must be issued through SMU.  Since soft resets will reset an entire SDMA
engine, use a common KGD call to do the reset as the KGD will handle
avoiding a reset of in flight GFX and paging queues on that engine.

In addition, create a common call for all reset types to simplify
the handling of module parameter settings that block gpu resets.

Signed-off-by: Jonathan Kim <jonathan.kim@amd.com>
Reviewed-by: Harish Kasiviswanathan <harish.kasiviswanathan@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
12 files changed:
drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_aldebaran.c
drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c
drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gc_9_4_3.c
drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c
drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.h
drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10_3.c
drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v11.c
drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v12.c
drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c
drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.h
drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
drivers/gpu/drm/amd/include/kgd_kfd_interface.h

index 8dfdb18197c497c2a7ee94c62415101569345f96..6e861d08d0441c5404d9d0fb87eb56ad42a12572 100644 (file)
@@ -193,4 +193,5 @@ const struct kfd2kgd_calls aldebaran_kfd2kgd = {
        .program_trap_handler_settings = kgd_gfx_v9_program_trap_handler_settings,
        .hqd_get_pq_addr = kgd_gfx_v9_hqd_get_pq_addr,
        .hqd_reset = kgd_gfx_v9_hqd_reset,
+       .hqd_sdma_get_doorbell = kgd_gfx_v9_hqd_sdma_get_doorbell
 };
index 9abf29b58ac75a9be2f7e1406b22e2040cf9b3f2..c820418e8ccdf9790caeca733e48eab708df211a 100644 (file)
@@ -419,5 +419,6 @@ const struct kfd2kgd_calls arcturus_kfd2kgd = {
        .get_cu_occupancy = kgd_gfx_v9_get_cu_occupancy,
        .program_trap_handler_settings = kgd_gfx_v9_program_trap_handler_settings,
        .hqd_get_pq_addr = kgd_gfx_v9_hqd_get_pq_addr,
-       .hqd_reset = kgd_gfx_v9_hqd_reset
+       .hqd_reset = kgd_gfx_v9_hqd_reset,
+       .hqd_sdma_get_doorbell = kgd_gfx_v9_hqd_sdma_get_doorbell
 };
index e2ae714a700f8514e5ead51f8d0db2d69e1a085a..0c09984775980dabe4d2ba58b6524be0d1bd7ced 100644 (file)
@@ -509,6 +509,17 @@ static uint32_t kgd_gfx_v9_4_3_clear_address_watch(struct amdgpu_device *adev,
        return 0;
 }
 
+/*
+ * Look up the doorbell of an SDMA RLC queue on GC 9.4.3.
+ *
+ * Reads the queue's CONTEXT_STATUS and DOORBELL_OFFSET registers.  If the
+ * SELECTED field of CONTEXT_STATUS is clear the queue is reported as
+ * inactive and 0 is returned; otherwise the DOORBELL_OFFSET register value
+ * shifted right by two bits is returned.
+ */
+static uint32_t kgd_gfx_v9_4_3_hqd_sdma_get_doorbell(struct amdgpu_device *adev,
+                                                    int engine, int queue)
+{
+       uint32_t rlc_base = get_sdma_rlc_reg_offset(adev, engine, queue);
+       uint32_t ctx_status = RREG32(regSDMA_RLC0_CONTEXT_STATUS + rlc_base);
+       uint32_t doorbell = RREG32(regSDMA_RLC0_DOORBELL_OFFSET + rlc_base);
+
+       /* Both registers are always read; only the result depends on SELECTED. */
+       if (!REG_GET_FIELD(ctx_status, SDMA_RLC0_CONTEXT_STATUS, SELECTED))
+               return 0;
+
+       return doorbell >> 2;
+}
+
 const struct kfd2kgd_calls gc_9_4_3_kfd2kgd = {
        .program_sh_mem_settings = kgd_gfx_v9_program_sh_mem_settings,
        .set_pasid_vmid_mapping = kgd_gfx_v9_4_3_set_pasid_vmid_mapping,
@@ -543,5 +554,6 @@ const struct kfd2kgd_calls gc_9_4_3_kfd2kgd = {
        .set_address_watch = kgd_gfx_v9_4_3_set_address_watch,
        .clear_address_watch = kgd_gfx_v9_4_3_clear_address_watch,
        .hqd_get_pq_addr = kgd_gfx_v9_hqd_get_pq_addr,
-       .hqd_reset = kgd_gfx_v9_hqd_reset
+       .hqd_reset = kgd_gfx_v9_hqd_reset,
+       .hqd_sdma_get_doorbell = kgd_gfx_v9_4_3_hqd_sdma_get_doorbell
 };
index 62176d607befada1f5b687c07bfb6277c822b472..2887b6f3eaa227bf50a6c5e036886aa3d9ff8751 100644 (file)
@@ -1084,6 +1084,12 @@ uint64_t kgd_gfx_v10_hqd_reset(struct amdgpu_device *adev,
        return 0;
 }
 
+/*
+ * hqd_sdma_get_doorbell hook shared by the gfx v10 kfd2kgd tables.
+ *
+ * This implementation reads no hardware state and ignores all of its
+ * arguments; it always returns 0.
+ */
+uint32_t kgd_gfx_v10_hqd_sdma_get_doorbell(struct amdgpu_device *adev,
+                                          int engine, int queue)
+{
+       return 0;
+}
+
 const struct kfd2kgd_calls gfx_v10_kfd2kgd = {
        .program_sh_mem_settings = kgd_program_sh_mem_settings,
        .set_pasid_vmid_mapping = kgd_set_pasid_vmid_mapping,
@@ -1112,5 +1118,6 @@ const struct kfd2kgd_calls gfx_v10_kfd2kgd = {
        .build_grace_period_packet_info = kgd_gfx_v10_build_grace_period_packet_info,
        .program_trap_handler_settings = program_trap_handler_settings,
        .hqd_get_pq_addr = kgd_gfx_v10_hqd_get_pq_addr,
-       .hqd_reset = kgd_gfx_v10_hqd_reset
+       .hqd_reset = kgd_gfx_v10_hqd_reset,
+       .hqd_sdma_get_doorbell = kgd_gfx_v10_hqd_sdma_get_doorbell
 };
index 9efd2dd4fdd70307f4c050e624be2bad53acb94b..db577c2a847abfcbfda29883ffe0df76929aba34 100644 (file)
@@ -65,3 +65,5 @@ uint64_t kgd_gfx_v10_hqd_reset(struct amdgpu_device *adev,
                              uint32_t queue_id,
                              uint32_t inst,
                              unsigned int utimeout);
+uint32_t kgd_gfx_v10_hqd_sdma_get_doorbell(struct amdgpu_device *adev,
+                                          int engine, int queue);
index c718bedda0cacd984e0cc7b6fa6862022d5a25f4..ac9ad505f9d720ae07492599d89e0e8244dc9a6e 100644 (file)
@@ -682,5 +682,6 @@ const struct kfd2kgd_calls gfx_v10_3_kfd2kgd = {
        .set_address_watch = kgd_gfx_v10_set_address_watch,
        .clear_address_watch = kgd_gfx_v10_clear_address_watch,
        .hqd_get_pq_addr = kgd_gfx_v10_hqd_get_pq_addr,
-       .hqd_reset = kgd_gfx_v10_hqd_reset
+       .hqd_reset = kgd_gfx_v10_hqd_reset,
+       .hqd_sdma_get_doorbell = kgd_gfx_v10_hqd_sdma_get_doorbell
 };
index a4ba49cb22db455476374692aea0eddc7c7ef97f..e0e6a6a49d900e7bac1e921831d17720846f70d3 100644 (file)
@@ -800,6 +800,12 @@ static uint64_t kgd_gfx_v11_hqd_reset(struct amdgpu_device *adev,
        return 0;
 }
 
+/*
+ * hqd_sdma_get_doorbell hook for the gfx v11 kfd2kgd table.
+ *
+ * This implementation reads no hardware state and ignores all of its
+ * arguments; it always returns 0.
+ */
+static uint32_t kgd_gfx_v11_hqd_sdma_get_doorbell(struct amdgpu_device *adev,
+                                                 int engine, int queue)
+{
+       return 0;
+}
+
 const struct kfd2kgd_calls gfx_v11_kfd2kgd = {
        .program_sh_mem_settings = program_sh_mem_settings_v11,
        .set_pasid_vmid_mapping = set_pasid_vmid_mapping_v11,
@@ -824,5 +830,6 @@ const struct kfd2kgd_calls gfx_v11_kfd2kgd = {
        .set_address_watch = kgd_gfx_v11_set_address_watch,
        .clear_address_watch = kgd_gfx_v11_clear_address_watch,
        .hqd_get_pq_addr = kgd_gfx_v11_hqd_get_pq_addr,
-       .hqd_reset = kgd_gfx_v11_hqd_reset
+       .hqd_reset = kgd_gfx_v11_hqd_reset,
+       .hqd_sdma_get_doorbell = kgd_gfx_v11_hqd_sdma_get_doorbell
 };
index 0dfe7093bd8a05fc0094d7a4304071705c119380..6f0dc23c901b8431228556128e790eec357374eb 100644 (file)
@@ -361,6 +361,12 @@ static uint32_t kgd_gfx_v12_clear_address_watch(struct amdgpu_device *adev,
        return 0;
 }
 
+/*
+ * hqd_sdma_get_doorbell hook for the gfx v12 kfd2kgd table.
+ *
+ * This implementation reads no hardware state and ignores all of its
+ * arguments; it always returns 0.
+ */
+static uint32_t kgd_gfx_v12_hqd_sdma_get_doorbell(struct amdgpu_device *adev,
+                                                int engine, int queue)
+{
+       return 0;
+}
+
 const struct kfd2kgd_calls gfx_v12_kfd2kgd = {
        .init_interrupts = init_interrupts_v12,
        .hqd_dump = hqd_dump_v12,
@@ -374,4 +380,5 @@ const struct kfd2kgd_calls gfx_v12_kfd2kgd = {
        .set_wave_launch_mode = kgd_gfx_v12_set_wave_launch_mode,
        .set_address_watch = kgd_gfx_v12_set_address_watch,
        .clear_address_watch = kgd_gfx_v12_clear_address_watch,
+       .hqd_sdma_get_doorbell = kgd_gfx_v12_hqd_sdma_get_doorbell
 };
index 441568163e20e404bcb9dc516cdd638eac867da5..84135eb906606b32917b9036187cb58fcbb1c380 100644 (file)
@@ -1131,9 +1131,6 @@ uint64_t kgd_gfx_v9_hqd_get_pq_addr(struct amdgpu_device *adev,
        uint32_t low, high;
        uint64_t queue_addr = 0;
 
-       if (!amdgpu_gpu_recovery)
-               return 0;
-
        kgd_gfx_v9_acquire_queue(adev, pipe_id, queue_id, inst);
        amdgpu_gfx_rlc_enter_safe_mode(adev, inst);
 
@@ -1182,9 +1179,6 @@ uint64_t kgd_gfx_v9_hqd_reset(struct amdgpu_device *adev,
        uint32_t low, high, pipe_reset_data = 0;
        uint64_t queue_addr = 0;
 
-       if (!amdgpu_gpu_recovery)
-               return 0;
-
        kgd_gfx_v9_acquire_queue(adev, pipe_id, queue_id, inst);
        amdgpu_gfx_rlc_enter_safe_mode(adev, inst);
 
@@ -1229,6 +1223,13 @@ unlock_out:
        return queue_addr;
 }
 
+/*
+ * Generic gfx v9 hqd_sdma_get_doorbell hook.
+ *
+ * This implementation reads no hardware state and ignores all of its
+ * arguments; it always returns 0.
+ */
+uint32_t kgd_gfx_v9_hqd_sdma_get_doorbell(struct amdgpu_device *adev,
+                                         int engine, int queue)
+{
+       return 0;
+}
+
 const struct kfd2kgd_calls gfx_v9_kfd2kgd = {
        .program_sh_mem_settings = kgd_gfx_v9_program_sh_mem_settings,
        .set_pasid_vmid_mapping = kgd_gfx_v9_set_pasid_vmid_mapping,
@@ -1258,5 +1259,6 @@ const struct kfd2kgd_calls gfx_v9_kfd2kgd = {
        .get_cu_occupancy = kgd_gfx_v9_get_cu_occupancy,
        .program_trap_handler_settings = kgd_gfx_v9_program_trap_handler_settings,
        .hqd_get_pq_addr = kgd_gfx_v9_hqd_get_pq_addr,
-       .hqd_reset = kgd_gfx_v9_hqd_reset
+       .hqd_reset = kgd_gfx_v9_hqd_reset,
+       .hqd_sdma_get_doorbell = kgd_gfx_v9_hqd_sdma_get_doorbell
 };
index b6a91a552aa431f9535d182d4833bd45167b484e..90c8fa13d519275fc407e79adef492994d3a4996 100644 (file)
@@ -111,3 +111,5 @@ uint64_t kgd_gfx_v9_hqd_reset(struct amdgpu_device *adev,
                              uint32_t queue_id,
                              uint32_t inst,
                              unsigned int utimeout);
+uint32_t kgd_gfx_v9_hqd_sdma_get_doorbell(struct amdgpu_device *adev,
+                                         int engine, int queue);
index 91e4988dc1e37afd7ac563d3fab555d4a3a8d6d3..f3f2fd6ee65c1aea9d22ad31dff65cfddc69063b 100644 (file)
@@ -36,6 +36,7 @@
 #include "kfd_kernel_queue.h"
 #include "amdgpu_amdkfd.h"
 #include "amdgpu_reset.h"
+#include "amdgpu_sdma.h"
 #include "mes_v11_api_def.h"
 #include "kfd_debug.h"
 
@@ -67,6 +68,8 @@ static int allocate_hqd(struct device_queue_manager *dqm, struct queue *q);
 static int allocate_sdma_queue(struct device_queue_manager *dqm,
                                struct queue *q, const uint32_t *restore_sdma_id);
 
+static int reset_queues_on_hws_hang(struct device_queue_manager *dqm, bool is_sdma);
+
 static inline
 enum KFD_MQD_TYPE get_mqd_type_from_queue_type(enum kfd_queue_type type)
 {
@@ -2205,8 +2208,7 @@ static struct queue *find_queue_by_address(struct device_queue_manager *dqm, uin
        return NULL;
 }
 
-/* only for compute queue */
-static int reset_queues_on_hws_hang(struct device_queue_manager *dqm)
+static int reset_hung_queues(struct device_queue_manager *dqm)
 {
        int r = 0, reset_count = 0, i;
 
@@ -2259,6 +2261,104 @@ reset_fail:
        return r;
 }
 
+/*
+ * Report whether any SDMA hardware queue belonging to this node looks hung.
+ *
+ * Walks the node's SDMA engines (starting at node_id * engines-per-node)
+ * and every queue slot on each engine, querying the ASIC-specific
+ * hqd_sdma_get_doorbell hook.  A non-zero return from the hook is treated
+ * as a hung queue.
+ *
+ * Returns true on the first such queue; false if none is found or if the
+ * ASIC's kfd2kgd table does not provide the hook.
+ */
+static bool sdma_has_hang(struct device_queue_manager *dqm)
+{
+       int engine_start = dqm->dev->node_id * get_num_all_sdma_engines(dqm);
+       int engine_end = engine_start + get_num_all_sdma_engines(dqm);
+       int num_queues_per_eng = dqm->dev->kfd->device_info.num_sdma_queues_per_engine;
+       int i, j;
+
+       /*
+        * hqd_sdma_get_doorbell is a per-ASIC hook; a kfd2kgd table that
+        * does not set it leaves the pointer NULL, so never call through it
+        * unchecked.  Without the hook there is nothing to detect.
+        */
+       if (!dqm->dev->kfd2kgd->hqd_sdma_get_doorbell)
+               return false;
+
+       for (i = engine_start; i < engine_end; i++) {
+               for (j = 0; j < num_queues_per_eng; j++) {
+                       if (dqm->dev->kfd2kgd->hqd_sdma_get_doorbell(dqm->dev->adev, i, j))
+                               return true;
+               }
+       }
+
+       return false;
+}
+
+/*
+ * Find the SDMA or SDMA XGMI queue whose doorbell offset equals
+ * @doorbell_off among all process queues tracked by @dqm and pass it to
+ * set_queue_as_reset().  Only the first match is handled.
+ *
+ * Returns true if a matching queue was found, false otherwise.
+ */
+static bool set_sdma_queue_as_reset(struct device_queue_manager *dqm,
+                                   uint32_t doorbell_off)
+{
+       struct device_process_node *node;
+       struct queue *q;
+
+       list_for_each_entry(node, &dqm->queues, list) {
+               struct qcm_process_device *qpd = node->qpd;
+
+               list_for_each_entry(q, &qpd->queues_list, list) {
+                       /* Skip anything that is not an SDMA-type queue. */
+                       if (q->properties.type != KFD_QUEUE_TYPE_SDMA &&
+                           q->properties.type != KFD_QUEUE_TYPE_SDMA_XGMI)
+                               continue;
+
+                       if (q->properties.doorbell_off != doorbell_off)
+                               continue;
+
+                       set_queue_as_reset(dqm, q, qpd);
+                       return true;
+               }
+       }
+
+       return false;
+}
+
+/*
+ * Reset the SDMA engines of this node that have a hung queue.
+ *
+ * For each engine of the node, the queue slots are scanned with the
+ * hqd_sdma_get_doorbell hook.  On the first slot that returns a non-zero
+ * doorbell the whole engine is reset, the slot is re-checked, and the
+ * matching KFD queue is marked as reset.  If at least one engine was
+ * handled, kfd_signal_reset_event() is called for the device.
+ *
+ * Returns 0 on success, -EIO if dqm->is_hws_hang is already set, and
+ * -ENOTRECOVERABLE if an engine reset fails, the slot still reports a
+ * doorbell after the reset, no KFD queue matches the doorbell, or no hung
+ * queue was found at all.
+ */
+static int reset_hung_queues_sdma(struct device_queue_manager *dqm)
+{
+       int engine_start = dqm->dev->node_id * get_num_all_sdma_engines(dqm);
+       int engine_end = engine_start + get_num_all_sdma_engines(dqm);
+       int num_queues_per_eng =  dqm->dev->kfd->device_info.num_sdma_queues_per_engine;
+       int r = 0, i, j;
+
+       if (dqm->is_hws_hang)
+               return -EIO;
+
+       /* Scan for hung HW queues and reset engine. */
+       /* detect_hang_count is used here as a count of engines reset so far. */
+       dqm->detect_hang_count = 0;
+       for (i = engine_start; i < engine_end; i++) {
+               for (j = 0; j < num_queues_per_eng; j++) {
+                       uint32_t doorbell_off =
+                               dqm->dev->kfd2kgd->hqd_sdma_get_doorbell(dqm->dev->adev, i, j);
+
+                       if (!doorbell_off)
+                               continue;
+
+                       /* Reset engine and check. */
+                       /*
+                        * The three conditions are evaluated in order and
+                        * short-circuit: the engine reset must succeed, the
+                        * slot must then report no doorbell, and a KFD queue
+                        * owning the doorbell read before the reset must exist.
+                        * NOTE(review): meaning of the 'false' argument to
+                        * amdgpu_sdma_reset_engine() is not visible here.
+                        */
+                       if (amdgpu_sdma_reset_engine(dqm->dev->adev, i, false) ||
+                           dqm->dev->kfd2kgd->hqd_sdma_get_doorbell(dqm->dev->adev, i, j) ||
+                           !set_sdma_queue_as_reset(dqm, doorbell_off)) {
+                               r = -ENOTRECOVERABLE;
+                               goto reset_fail;
+                       }
+
+                       /* Should only expect one queue active per engine */
+                       dqm->detect_hang_count++;
+                       break;
+               }
+       }
+
+       /* Signal process reset */
+       if (dqm->detect_hang_count)
+               kfd_signal_reset_event(dqm->dev);
+       else
+               r = -ENOTRECOVERABLE;
+
+reset_fail:
+       /* The counter is cleared on every exit path past this point. */
+       dqm->detect_hang_count = 0;
+
+       return r;
+}
+
+/*
+ * Common entry point for per-queue resets after a hang is detected.
+ *
+ * Dispatches to reset_hung_queues_sdma() when @is_sdma is true and to
+ * reset_hung_queues() otherwise, and returns that function's result.
+ * Returns -ENOTRECOVERABLE without attempting a reset when
+ * amdgpu_gpu_recovery is zero.
+ */
+static int reset_queues_on_hws_hang(struct device_queue_manager *dqm, bool is_sdma)
+{
+       /* Yield the CPU in a loop for as long as halt_if_hws_hang is set. */
+       while (halt_if_hws_hang)
+               schedule();
+
+       if (!amdgpu_gpu_recovery)
+               return -ENOTRECOVERABLE;
+
+       if (is_sdma)
+               return reset_hung_queues_sdma(dqm);
+
+       return reset_hung_queues(dqm);
+}
+
 /* dqm->lock mutex has to be locked before calling this function */
 static int unmap_queues_cpsch(struct device_queue_manager *dqm,
                                enum kfd_unmap_queues_filter filter,
@@ -2309,16 +2409,13 @@ static int unmap_queues_cpsch(struct device_queue_manager *dqm,
         * check those fields
         */
        mqd_mgr = dqm->mqd_mgrs[KFD_MQD_TYPE_HIQ];
-       if (mqd_mgr->check_preemption_failed(mqd_mgr, dqm->packet_mgr.priv_queue->queue->mqd)) {
-               while (halt_if_hws_hang)
-                       schedule();
-               if (reset_queues_on_hws_hang(dqm)) {
-                       dqm->is_hws_hang = true;
-                       kfd_hws_hang(dqm);
-                       retval = -ETIME;
-                       goto out;
-               }
-       }
+       if (mqd_mgr->check_preemption_failed(mqd_mgr, dqm->packet_mgr.priv_queue->queue->mqd) &&
+           reset_queues_on_hws_hang(dqm, false))
+               goto reset_fail;
+
+       /* Check for SDMA hang and attempt SDMA reset */
+       if (sdma_has_hang(dqm) && reset_queues_on_hws_hang(dqm, true))
+               goto reset_fail;
 
        /* We need to reset the grace period value for this device */
        if (grace_period != USE_DEFAULT_GRACE_PERIOD) {
@@ -2329,10 +2426,15 @@ static int unmap_queues_cpsch(struct device_queue_manager *dqm,
 
        pm_release_ib(&dqm->packet_mgr);
        dqm->active_runlist = false;
-
 out:
        up_read(&dqm->dev->adev->reset_domain->sem);
        return retval;
+
+reset_fail:
+       dqm->is_hws_hang = true;
+       kfd_hws_hang(dqm);
+       up_read(&dqm->dev->adev->reset_domain->sem);
+       return -ETIME;
 }
 
 /* only for compute queue */
index e3e635a31b8a4112b47b8469ae2d229b0d2b60e3..1e8dfa6c0dc82c1355a6c62d860b85c35f34f03a 100644 (file)
@@ -330,6 +330,8 @@ struct kfd2kgd_calls {
        uint64_t (*hqd_reset)(struct amdgpu_device *adev,
                              uint32_t pipe_id, uint32_t queue_id,
                              uint32_t inst, unsigned int utimeout);
+       uint32_t (*hqd_sdma_get_doorbell)(struct amdgpu_device *adev,
+                                         int engine, int queue);
 };
 
 #endif /* KGD_KFD_INTERFACE_H_INCLUDED */