drm/amdkfd: support per-queue reset on gfx9
author    Jonathan Kim <Jonathan.Kim@amd.com>
          Tue, 25 Jun 2024 15:22:50 +0000 (11:22 -0400)
committer Alex Deucher <alexander.deucher@amd.com>
          Tue, 6 Aug 2024 14:43:18 +0000 (10:43 -0400)
Support per-queue reset for GFX9.  The recommended approach is for the
driver to target-reset the hung HW queue via an SPI MMIO register write.
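
A condensed sketch of that sequence, distilled from kgd_gfx_v9_hqd_reset()
in the GFX9 hunk below (safe-mode bracketing and error paths omitted):

    /* select the target pipe/HW-queue slot, then pulse the reset bit */
    kgd_gfx_v9_acquire_queue(adev, pipe_id, queue_id, inst);
    WREG32_SOC15(GC, GET_INST(GC, inst), mmSPI_COMPUTE_QUEUE_RESET, 0x1);

    /* poll CP_HQD_ACTIVE until the queue deactivates or the timeout hits */
    end_jiffies = (utimeout * HZ / 1000) + jiffies;
    while (RREG32_SOC15(GC, GET_INST(GC, inst), mmCP_HQD_ACTIVE) &
           CP_HQD_ACTIVE__ACTIVE_MASK) {
            if (time_after(jiffies, end_jiffies))
                    break;  /* reset failed; caller falls back to GPU reset */
            usleep_range(500, 1000);
    }
    kgd_gfx_v9_release_queue(adev, inst);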

Since the reset requires pipe and HW queue info, and MEC FW is limited
to doorbell reports of hung queues after an unmap failure, first scan
the HW queue slots defined by SET_RESOURCES to identify the user queue
candidates for reset.
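
Condensed, that scan (detect_queue_hang() in the diff below) walks every
KFD-owned slot and treats a nonzero ring-buffer base as a mapped user
queue; record_candidate() here is a stand-in for filling the
dqm->detect_hang_info array:

    for (i = 0; i < AMDGPU_MAX_QUEUES; ++i) {
            if (!test_bit(i, cp_queue_bitmap))
                    continue;  /* not a KFD slot per SET_RESOURCES */
            amdgpu_queue_mask_bit_to_mec_queue(adev, i, &mec, &pipe, &queue);
            if (mec)
                    continue;  /* user compute queues sit on the first MEC */
            for_each_inst(xcc_id, xcc_mask)
                    if ((addr = hqd_get_pq_addr(adev, pipe, queue, xcc_id)))
                            record_candidate(pipe, queue, xcc_id, addr);
    }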

Only signal reset events to processes that have had a queue reset.
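
In kfd_signal_reset_event() this becomes a one-line filter over the
process table (see the kfd_events.c hunk below):

    if (dev->dqm->detect_hang_count && !pdd->has_reset_queue)
            continue;  /* no queue of this process was reset; skip it */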

If the per-queue reset fails, fall back to a full GPU reset.

Signed-off-by: Jonathan Kim <jonathan.kim@amd.com>
Reviewed-by: Felix Kuehling <felix.kuehling@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
16 files changed:
drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_aldebaran.c
drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c
drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gc_9_4_3.c
drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c
drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.h
drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10_3.c
drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v11.c
drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c
drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.h
drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
drivers/gpu/drm/amd/amdkfd/kfd_events.c
drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c
drivers/gpu/drm/amd/amdkfd/kfd_priv.h
drivers/gpu/drm/amd/amdkfd/kfd_process.c
drivers/gpu/drm/amd/include/kgd_kfd_interface.h

index aff08321e9763917fd1d855fc370b7e61b34de2c..8dfdb18197c497c2a7ee94c62415101569345f96 100644 (file)
@@ -191,4 +191,6 @@ const struct kfd2kgd_calls aldebaran_kfd2kgd = {
        .get_iq_wait_times = kgd_gfx_v9_get_iq_wait_times,
        .build_grace_period_packet_info = kgd_gfx_v9_build_grace_period_packet_info,
        .program_trap_handler_settings = kgd_gfx_v9_program_trap_handler_settings,
+       .hqd_get_pq_addr = kgd_gfx_v9_hqd_get_pq_addr,
+       .hqd_reset = kgd_gfx_v9_hqd_reset,
 };
index 3a3f3ce09f00dbe77f61455f24fed7bd0db0dec5..017e8a3013aaa408d261b62f12d54b1545d20712 100644 (file)
@@ -418,5 +418,7 @@ const struct kfd2kgd_calls arcturus_kfd2kgd = {
        .get_iq_wait_times = kgd_gfx_v9_get_iq_wait_times,
        .build_grace_period_packet_info = kgd_gfx_v9_build_grace_period_packet_info,
        .get_cu_occupancy = kgd_gfx_v9_get_cu_occupancy,
-       .program_trap_handler_settings = kgd_gfx_v9_program_trap_handler_settings
+       .program_trap_handler_settings = kgd_gfx_v9_program_trap_handler_settings,
+       .hqd_get_pq_addr = kgd_gfx_v9_hqd_get_pq_addr,
+       .hqd_reset = kgd_gfx_v9_hqd_reset
 };
index a5c7259cf2a3e858dd753fe782a13c529ddef536..e2ae714a700f8514e5ead51f8d0db2d69e1a085a 100644 (file)
@@ -541,5 +541,7 @@ const struct kfd2kgd_calls gc_9_4_3_kfd2kgd = {
                        kgd_gfx_v9_4_3_set_wave_launch_trap_override,
        .set_wave_launch_mode = kgd_aldebaran_set_wave_launch_mode,
        .set_address_watch = kgd_gfx_v9_4_3_set_address_watch,
-       .clear_address_watch = kgd_gfx_v9_4_3_clear_address_watch
+       .clear_address_watch = kgd_gfx_v9_4_3_clear_address_watch,
+       .hqd_get_pq_addr = kgd_gfx_v9_hqd_get_pq_addr,
+       .hqd_reset = kgd_gfx_v9_hqd_reset
 };
index 3ab6c3aa0ad1a96515741a6028f15e1130e49bdd..62176d607befada1f5b687c07bfb6277c822b472 100644 (file)
@@ -1070,6 +1070,20 @@ static void program_trap_handler_settings(struct amdgpu_device *adev,
        unlock_srbm(adev);
 }
 
+uint64_t kgd_gfx_v10_hqd_get_pq_addr(struct amdgpu_device *adev,
+                                    uint32_t pipe_id, uint32_t queue_id,
+                                    uint32_t inst)
+{
+       return 0;
+}
+
+uint64_t kgd_gfx_v10_hqd_reset(struct amdgpu_device *adev,
+                              uint32_t pipe_id, uint32_t queue_id,
+                              uint32_t inst, unsigned int utimeout)
+{
+       return 0;
+}
+
 const struct kfd2kgd_calls gfx_v10_kfd2kgd = {
        .program_sh_mem_settings = kgd_program_sh_mem_settings,
        .set_pasid_vmid_mapping = kgd_set_pasid_vmid_mapping,
@@ -1097,4 +1111,6 @@ const struct kfd2kgd_calls gfx_v10_kfd2kgd = {
        .get_iq_wait_times = kgd_gfx_v10_get_iq_wait_times,
        .build_grace_period_packet_info = kgd_gfx_v10_build_grace_period_packet_info,
        .program_trap_handler_settings = program_trap_handler_settings,
+       .hqd_get_pq_addr = kgd_gfx_v10_hqd_get_pq_addr,
+       .hqd_reset = kgd_gfx_v10_hqd_reset
 };
index 67bcaa3d4226410279a6bce1b62b7e0cd206259f..9efd2dd4fdd70307f4c050e624be2bad53acb94b 100644 (file)
@@ -56,3 +56,12 @@ void kgd_gfx_v10_build_grace_period_packet_info(struct amdgpu_device *adev,
                                               uint32_t grace_period,
                                               uint32_t *reg_offset,
                                               uint32_t *reg_data);
+uint64_t kgd_gfx_v10_hqd_get_pq_addr(struct amdgpu_device *adev,
+                                   uint32_t pipe_id,
+                                   uint32_t queue_id,
+                                   uint32_t inst);
+uint64_t kgd_gfx_v10_hqd_reset(struct amdgpu_device *adev,
+                             uint32_t pipe_id,
+                             uint32_t queue_id,
+                             uint32_t inst,
+                             unsigned int utimeout);
index 8c8437a4383f7bf0aa85883a20ce50cf2a332b0d..c718bedda0cacd984e0cc7b6fa6862022d5a25f4 100644 (file)
@@ -680,5 +680,7 @@ const struct kfd2kgd_calls gfx_v10_3_kfd2kgd = {
        .set_wave_launch_trap_override = kgd_gfx_v10_set_wave_launch_trap_override,
        .set_wave_launch_mode = kgd_gfx_v10_set_wave_launch_mode,
        .set_address_watch = kgd_gfx_v10_set_address_watch,
-       .clear_address_watch = kgd_gfx_v10_clear_address_watch
+       .clear_address_watch = kgd_gfx_v10_clear_address_watch,
+       .hqd_get_pq_addr = kgd_gfx_v10_hqd_get_pq_addr,
+       .hqd_reset = kgd_gfx_v10_hqd_reset
 };
index b61a32d6af4b8a889406d3f56a4f98d132e58261..a4ba49cb22db455476374692aea0eddc7c7ef97f 100644 (file)
@@ -786,6 +786,20 @@ static uint32_t kgd_gfx_v11_clear_address_watch(struct amdgpu_device *adev,
        return 0;
 }
 
+static uint64_t kgd_gfx_v11_hqd_get_pq_addr(struct amdgpu_device *adev,
+                                           uint32_t pipe_id, uint32_t queue_id,
+                                           uint32_t inst)
+{
+       return 0;
+}
+
+static uint64_t kgd_gfx_v11_hqd_reset(struct amdgpu_device *adev,
+                                     uint32_t pipe_id, uint32_t queue_id,
+                                     uint32_t inst, unsigned int utimeout)
+{
+       return 0;
+}
+
 const struct kfd2kgd_calls gfx_v11_kfd2kgd = {
        .program_sh_mem_settings = program_sh_mem_settings_v11,
        .set_pasid_vmid_mapping = set_pasid_vmid_mapping_v11,
@@ -808,5 +822,7 @@ const struct kfd2kgd_calls gfx_v11_kfd2kgd = {
        .set_wave_launch_trap_override = kgd_gfx_v11_set_wave_launch_trap_override,
        .set_wave_launch_mode = kgd_gfx_v11_set_wave_launch_mode,
        .set_address_watch = kgd_gfx_v11_set_address_watch,
-       .clear_address_watch = kgd_gfx_v11_clear_address_watch
+       .clear_address_watch = kgd_gfx_v11_clear_address_watch,
+       .hqd_get_pq_addr = kgd_gfx_v11_hqd_get_pq_addr,
+       .hqd_reset = kgd_gfx_v11_hqd_reset
 };
index 5a35a8ca89222bafc8f9c3ea12d5ee134b37ba52..32f28c12077b5c71fd8abade2e73bcf304cfc4e6 100644 (file)
@@ -1144,6 +1144,89 @@ void kgd_gfx_v9_program_trap_handler_settings(struct amdgpu_device *adev,
        kgd_gfx_v9_unlock_srbm(adev, inst);
 }
 
+uint64_t kgd_gfx_v9_hqd_get_pq_addr(struct amdgpu_device *adev,
+                                   uint32_t pipe_id, uint32_t queue_id,
+                                   uint32_t inst)
+{
+       uint32_t low, high;
+       uint64_t queue_addr = 0;
+
+       kgd_gfx_v9_acquire_queue(adev, pipe_id, queue_id, inst);
+       amdgpu_gfx_rlc_enter_safe_mode(adev, inst);
+
+       if (!RREG32_SOC15(GC, GET_INST(GC, inst), mmCP_HQD_ACTIVE))
+               goto unlock_out;
+
+       low = RREG32_SOC15(GC, GET_INST(GC, inst), mmCP_HQD_PQ_BASE);
+       high = RREG32_SOC15(GC, GET_INST(GC, inst), mmCP_HQD_PQ_BASE_HI);
+
+       /* only concerned with user queues. */
+       if (!high)
+               goto unlock_out;
+
+       queue_addr = (((uint64_t)high << 32) | low) << 8;
+
+unlock_out:
+       amdgpu_gfx_rlc_exit_safe_mode(adev, inst);
+       kgd_gfx_v9_release_queue(adev, inst);
+
+       return queue_addr;
+}
+
+uint64_t kgd_gfx_v9_hqd_reset(struct amdgpu_device *adev,
+                             uint32_t pipe_id, uint32_t queue_id,
+                             uint32_t inst, unsigned int utimeout)
+{
+       uint32_t low, high, temp;
+       unsigned long end_jiffies;
+       uint64_t queue_addr = 0;
+
+       kgd_gfx_v9_acquire_queue(adev, pipe_id, queue_id, inst);
+       amdgpu_gfx_rlc_enter_safe_mode(adev, inst);
+
+       if (!RREG32_SOC15(GC, GET_INST(GC, inst), mmCP_HQD_ACTIVE))
+               goto unlock_out;
+
+       low = RREG32_SOC15(GC, GET_INST(GC, inst), mmCP_HQD_PQ_BASE);
+       high = RREG32_SOC15(GC, GET_INST(GC, inst), mmCP_HQD_PQ_BASE_HI);
+
+       /* only concerned with user queues. */
+       if (!high)
+               goto unlock_out;
+
+       queue_addr = (((uint64_t)high << 32) | low) << 8;
+
+       pr_debug("Attempting queue reset on XCC %i pipe id %i queue id %i\n",
+                inst, pipe_id, queue_id);
+
+       /* assume the previously issued dequeue request will take effect after reset */
+       WREG32_SOC15(GC, GET_INST(GC, inst), mmSPI_COMPUTE_QUEUE_RESET, 0x1);
+
+       end_jiffies = (utimeout * HZ / 1000) + jiffies;
+       while (true) {
+               temp = RREG32_SOC15(GC, GET_INST(GC, inst), mmCP_HQD_ACTIVE);
+
+               if (!(temp & CP_HQD_ACTIVE__ACTIVE_MASK))
+                       break;
+
+               if (time_after(jiffies, end_jiffies)) {
+                       queue_addr = 0;
+                       break;
+               }
+
+               usleep_range(500, 1000);
+       }
+
+       pr_debug("queue reset on XCC %i pipe id %i queue id %i %s\n",
+                inst, pipe_id, queue_id, !!queue_addr ? "succeeded!" : "failed!");
+
+unlock_out:
+       amdgpu_gfx_rlc_exit_safe_mode(adev, inst);
+       kgd_gfx_v9_release_queue(adev, inst);
+
+       return queue_addr;
+}
+
 const struct kfd2kgd_calls gfx_v9_kfd2kgd = {
        .program_sh_mem_settings = kgd_gfx_v9_program_sh_mem_settings,
        .set_pasid_vmid_mapping = kgd_gfx_v9_set_pasid_vmid_mapping,
@@ -1172,4 +1255,6 @@ const struct kfd2kgd_calls gfx_v9_kfd2kgd = {
        .build_grace_period_packet_info = kgd_gfx_v9_build_grace_period_packet_info,
        .get_cu_occupancy = kgd_gfx_v9_get_cu_occupancy,
        .program_trap_handler_settings = kgd_gfx_v9_program_trap_handler_settings,
+       .hqd_get_pq_addr = kgd_gfx_v9_hqd_get_pq_addr,
+       .hqd_reset = kgd_gfx_v9_hqd_reset
 };
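
For reference, CP_HQD_PQ_BASE and CP_HQD_PQ_BASE_HI hold the ring-buffer
base address shifted right by 8 bits (the base is 256-byte aligned), which
is why the helpers above shift the reassembled value left by 8.  A
standalone illustration with made-up register values:

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
            uint32_t low  = 0x00345678;  /* hypothetical CP_HQD_PQ_BASE */
            uint32_t high = 0x00000012;  /* hypothetical CP_HQD_PQ_BASE_HI */

            /* same math as kgd_gfx_v9_hqd_get_pq_addr() above */
            uint64_t queue_addr = (((uint64_t)high << 32) | low) << 8;

            printf("queue base = 0x%llx\n", (unsigned long long)queue_addr);
            /* prints: queue base = 0x120034567800 */
            return 0;
    }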
index ce424615f59b5bc753c54b8dc83bf1161fca407e..988c50ac3be0138afbe27585aa3b013acd2b2b62 100644 (file)
@@ -101,3 +101,12 @@ void kgd_gfx_v9_build_grace_period_packet_info(struct amdgpu_device *adev,
                                               uint32_t grace_period,
                                               uint32_t *reg_offset,
                                               uint32_t *reg_data);
+uint64_t kgd_gfx_v9_hqd_get_pq_addr(struct amdgpu_device *adev,
+                                   uint32_t pipe_id,
+                                   uint32_t queue_id,
+                                   uint32_t inst);
+uint64_t kgd_gfx_v9_hqd_reset(struct amdgpu_device *adev,
+                             uint32_t pipe_id,
+                             uint32_t queue_id,
+                             uint32_t inst,
+                             unsigned int utimeout);
index f0bfeb35246ffac5a5acf31e5ac2dc9087fc693d..f6e2110702997ccb90ce8e1b233169ac94723f32 100644 (file)
@@ -153,6 +153,20 @@ void program_sh_mem_settings(struct device_queue_manager *dqm,
 
 static void kfd_hws_hang(struct device_queue_manager *dqm)
 {
+       struct device_process_node *cur;
+       struct qcm_process_device *qpd;
+       struct queue *q;
+
+       /* Mark all device queues as reset. */
+       list_for_each_entry(cur, &dqm->queues, list) {
+               qpd = cur->qpd;
+               list_for_each_entry(q, &qpd->queues_list, list) {
+                       struct kfd_process_device *pdd = qpd_to_pdd(qpd);
+
+                       pdd->has_reset_queue = true;
+               }
+       }
+
        /*
         * Issue a GPU reset if HWS is unresponsive
         */
@@ -878,6 +892,12 @@ static int update_queue(struct device_queue_manager *dqm, struct queue *q,
                else if (prev_active)
                        retval = remove_queue_mes(dqm, q, &pdd->qpd);
 
+               /* the queue was reset, so it is now inaccessible */
+               if (pdd->has_reset_queue) {
+                       retval = -EACCES;
+                       goto out_unlock;
+               }
+
                if (retval) {
                        dev_err(dev, "unmap queue failed\n");
                        goto out_unlock;
@@ -1662,7 +1682,7 @@ static int initialize_cpsch(struct device_queue_manager *dqm)
 static int start_cpsch(struct device_queue_manager *dqm)
 {
        struct device *dev = dqm->dev->adev->dev;
-       int retval;
+       int retval, num_hw_queue_slots;
 
        retval = 0;
 
@@ -1715,9 +1735,24 @@ static int start_cpsch(struct device_queue_manager *dqm)
                                        &dqm->wait_times);
        }
 
+       /* set up the per-queue reset detection buffer */
+       num_hw_queue_slots = dqm->dev->kfd->shared_resources.num_queue_per_pipe *
+                             dqm->dev->kfd->shared_resources.num_pipe_per_mec *
+                             NUM_XCC(dqm->dev->xcc_mask);
+
+       dqm->detect_hang_info_size = num_hw_queue_slots * sizeof(struct dqm_detect_hang_info);
+       dqm->detect_hang_info = kzalloc(dqm->detect_hang_info_size, GFP_KERNEL);
+
+       if (!dqm->detect_hang_info) {
+               retval = -ENOMEM;
+               goto fail_detect_hang_buffer;
+       }
+
        dqm_unlock(dqm);
 
        return 0;
+fail_detect_hang_buffer:
+       kfd_gtt_sa_free(dqm->dev, dqm->fence_mem);
 fail_allocate_vidmem:
 fail_set_sched_resources:
        if (!dqm->dev->kfd->shared_resources.enable_mes)
@@ -1748,6 +1783,8 @@ static int stop_cpsch(struct device_queue_manager *dqm)
        kfd_gtt_sa_free(dqm->dev, dqm->fence_mem);
        if (!dqm->dev->kfd->shared_resources.enable_mes)
                pm_uninit(&dqm->packet_mgr);
+       kfree(dqm->detect_hang_info);
+       dqm->detect_hang_info = NULL;
        dqm_unlock(dqm);
 
        return 0;
@@ -1965,6 +2002,135 @@ static int map_queues_cpsch(struct device_queue_manager *dqm)
        return retval;
 }
 
+static void set_queue_as_reset(struct device_queue_manager *dqm, struct queue *q,
+                              struct qcm_process_device *qpd)
+{
+       struct kfd_process_device *pdd = qpd_to_pdd(qpd);
+
+       dev_err(dqm->dev->adev->dev, "queue id 0x%x at pasid 0x%x is reset\n",
+               q->properties.queue_id, q->process->pasid);
+
+       pdd->has_reset_queue = true;
+       if (q->properties.is_active) {
+               q->properties.is_active = false;
+               decrement_queue_count(dqm, qpd, q);
+       }
+}
+
+static int detect_queue_hang(struct device_queue_manager *dqm)
+{
+       int i;
+
+       /* hang detection should only run under the dqm lock during a queue reset */
+       if (WARN_ON(dqm->detect_hang_count > 0))
+               return 0;
+
+       memset(dqm->detect_hang_info, 0, dqm->detect_hang_info_size);
+
+       for (i = 0; i < AMDGPU_MAX_QUEUES; ++i) {
+               uint32_t mec, pipe, queue;
+               int xcc_id;
+
+               mec = (i / dqm->dev->kfd->shared_resources.num_queue_per_pipe)
+                       / dqm->dev->kfd->shared_resources.num_pipe_per_mec;
+
+               if (mec || !test_bit(i, dqm->dev->kfd->shared_resources.cp_queue_bitmap))
+                       continue;
+
+               amdgpu_queue_mask_bit_to_mec_queue(dqm->dev->adev, i, &mec, &pipe, &queue);
+
+               for_each_inst(xcc_id, dqm->dev->xcc_mask) {
+                       uint64_t queue_addr = dqm->dev->kfd2kgd->hqd_get_pq_addr(
+                                               dqm->dev->adev, pipe, queue, xcc_id);
+                       struct dqm_detect_hang_info hang_info;
+
+                       if (!queue_addr)
+                               continue;
+
+                       hang_info.pipe_id = pipe;
+                       hang_info.queue_id = queue;
+                       hang_info.xcc_id = xcc_id;
+                       hang_info.queue_address = queue_addr;
+
+                       dqm->detect_hang_info[dqm->detect_hang_count] = hang_info;
+                       dqm->detect_hang_count++;
+               }
+       }
+
+       return dqm->detect_hang_count;
+}
+
+static struct queue *find_queue_by_address(struct device_queue_manager *dqm, uint64_t queue_address)
+{
+       struct device_process_node *cur;
+       struct qcm_process_device *qpd;
+       struct queue *q;
+
+       list_for_each_entry(cur, &dqm->queues, list) {
+               qpd = cur->qpd;
+               list_for_each_entry(q, &qpd->queues_list, list) {
+                       if (queue_address == q->properties.queue_address)
+                               return q;
+               }
+       }
+
+       return NULL;
+}
+
+/* only for compute queue */
+static int reset_queues_on_hws_hang(struct device_queue_manager *dqm)
+{
+       int r = 0, reset_count = 0, i;
+
+       if (!dqm->detect_hang_info || dqm->is_hws_hang)
+               return -EIO;
+
+       /* assume dqm locked. */
+       if (!detect_queue_hang(dqm))
+               return -ENOTRECOVERABLE;
+
+       for (i = 0; i < dqm->detect_hang_count; i++) {
+               struct dqm_detect_hang_info hang_info = dqm->detect_hang_info[i];
+               struct queue *q = find_queue_by_address(dqm, hang_info.queue_address);
+               struct kfd_process_device *pdd;
+               uint64_t queue_addr = 0;
+
+               if (!q) {
+                       r = -ENOTRECOVERABLE;
+                       goto reset_fail;
+               }
+
+               pdd = kfd_get_process_device_data(dqm->dev, q->process);
+               if (!pdd) {
+                       r = -ENOTRECOVERABLE;
+                       goto reset_fail;
+               }
+
+               queue_addr = dqm->dev->kfd2kgd->hqd_reset(dqm->dev->adev,
+                               hang_info.pipe_id, hang_info.queue_id, hang_info.xcc_id,
+                               KFD_UNMAP_LATENCY_MS);
+
+               /* either reset failed or we reset an unexpected queue. */
+               if (queue_addr != q->properties.queue_address) {
+                       r = -ENOTRECOVERABLE;
+                       goto reset_fail;
+               }
+
+               set_queue_as_reset(dqm, q, &pdd->qpd);
+               reset_count++;
+       }
+
+       if (reset_count == dqm->detect_hang_count)
+               kfd_signal_reset_event(dqm->dev);
+       else
+               r = -ENOTRECOVERABLE;
+
+reset_fail:
+       dqm->detect_hang_count = 0;
+
+       return r;
+}
+
 /* dqm->lock mutex has to be locked before calling this function */
 static int unmap_queues_cpsch(struct device_queue_manager *dqm,
                                enum kfd_unmap_queues_filter filter,
@@ -2015,11 +2181,14 @@ static int unmap_queues_cpsch(struct device_queue_manager *dqm,
         */
        mqd_mgr = dqm->mqd_mgrs[KFD_MQD_TYPE_HIQ];
        if (mqd_mgr->check_preemption_failed(mqd_mgr, dqm->packet_mgr.priv_queue->queue->mqd)) {
-               while (halt_if_hws_hang)
-                       schedule();
-               kfd_hws_hang(dqm);
-               retval = -ETIME;
-               goto out;
+               if (reset_queues_on_hws_hang(dqm)) {
+                       while (halt_if_hws_hang)
+                               schedule();
+                       dqm->is_hws_hang = true;
+                       kfd_hws_hang(dqm);
+                       retval = -ETIME;
+                       goto out;
+               }
        }
 
        /* We need to reset the grace period value for this device */
@@ -2038,8 +2207,7 @@ out:
 }
 
 /* only for compute queue */
-static int reset_queues_cpsch(struct device_queue_manager *dqm,
-                       uint16_t pasid)
+static int reset_queues_cpsch(struct device_queue_manager *dqm, uint16_t pasid)
 {
        int retval;
 
index 3b9b8eabaaccfa24e26102603e40923e5cbf06c6..dfb36a246637005a25ee6b3d8c3f7b4860fb5e5d 100644 (file)
@@ -210,6 +210,13 @@ struct device_queue_manager_asic_ops {
                                 struct kfd_node *dev);
 };
 
+struct dqm_detect_hang_info {
+       int pipe_id;
+       int queue_id;
+       int xcc_id;
+       uint64_t queue_address;
+};
+
 /**
  * struct device_queue_manager
  *
@@ -264,6 +271,11 @@ struct device_queue_manager {
        uint32_t                wait_times;
 
        wait_queue_head_t       destroy_wait;
+
+       /* for per-queue reset support */
+       struct dqm_detect_hang_info *detect_hang_info;
+       size_t detect_hang_info_size;
+       int detect_hang_count;
 };
 
 void device_queue_manager_init_cik(
index 9b33d9d2c9ad533827befe8d5a53a8c62af041cb..ea37922492093534d4018be7f6c28fe48b50d10b 100644 (file)
@@ -31,6 +31,7 @@
 #include <linux/memory.h>
 #include "kfd_priv.h"
 #include "kfd_events.h"
+#include "kfd_device_queue_manager.h"
 #include <linux/device.h>
 
 /*
@@ -1244,12 +1245,33 @@ void kfd_signal_reset_event(struct kfd_node *dev)
        idx = srcu_read_lock(&kfd_processes_srcu);
        hash_for_each_rcu(kfd_processes_table, temp, p, kfd_processes) {
                int user_gpu_id = kfd_process_get_user_gpu_id(p, dev->id);
+               struct kfd_process_device *pdd = kfd_get_process_device_data(dev, p);
 
                if (unlikely(user_gpu_id == -EINVAL)) {
                        WARN_ONCE(1, "Could not get user_gpu_id from dev->id:%x\n", dev->id);
                        continue;
                }
 
+               if (unlikely(!pdd)) {
+                       WARN_ONCE(1, "Could not get device data from pasid:0x%x\n", p->pasid);
+                       continue;
+               }
+
+               if (dev->dqm->detect_hang_count && !pdd->has_reset_queue)
+                       continue;
+
+               if (dev->dqm->detect_hang_count) {
+                       struct amdgpu_task_info *ti;
+
+                       ti = amdgpu_vm_get_task_info_pasid(dev->adev, p->pasid);
+                       if (ti) {
+                               dev_err(dev->adev->dev,
+                                       "Queues reset on process %s tid %d thread %s pid %d\n",
+                                       ti->process_name, ti->tgid, ti->task_name, ti->pid);
+                               amdgpu_vm_put_task_info(ti);
+                       }
+               }
+
                rcu_read_lock();
 
                id = KFD_FIRST_NONSIGNAL_EVENT_ID;
index 66c73825c0a04eec1beb1375cfa347e3b0f2f638..84e8ea3a8a0c940561c9f97eb62922d8d3311ecf 100644 (file)
@@ -321,8 +321,11 @@ static void update_mqd(struct mqd_manager *mm, void *mqd,
 static bool check_preemption_failed(struct mqd_manager *mm, void *mqd)
 {
        struct v9_mqd *m = (struct v9_mqd *)mqd;
+       uint32_t doorbell_id = m->queue_doorbell_id0;
 
-       return kfd_check_hiq_mqd_doorbell_id(mm->dev, m->queue_doorbell_id0, 0);
+       m->queue_doorbell_id0 = 0;
+
+       return kfd_check_hiq_mqd_doorbell_id(mm->dev, doorbell_id, 0);
 }
 
 static int get_wave_state(struct mqd_manager *mm, void *mqd,
@@ -624,6 +627,7 @@ static bool check_preemption_failed_v9_4_3(struct mqd_manager *mm, void *mqd)
                m = get_mqd(mqd + hiq_mqd_size * inst);
                ret |= kfd_check_hiq_mqd_doorbell_id(mm->dev,
                                        m->queue_doorbell_id0, inst);
+               m->queue_doorbell_id0 = 0;
                ++inst;
        }
 
index 4190fa3399138b66bed1726937ca5b4983b6e25b..a5d47048c1472e88cf142a77cba49f85bc9f7dcc 100644 (file)
@@ -846,6 +846,9 @@ struct kfd_process_device {
        void *proc_ctx_bo;
        uint64_t proc_ctx_gpu_addr;
        void *proc_ctx_cpu_ptr;
+
+       /* Tracks queue reset status */
+       bool has_reset_queue;
 };
 
 #define qpd_to_pdd(x) container_of(x, struct kfd_process_device, qpd)
index 9e29b92eb523d0eec2bc80f262da402151a561ce..a902950cc06016dcebcfcfb50499d419ba9c3412 100644 (file)
@@ -1851,6 +1851,8 @@ int kfd_process_evict_queues(struct kfd_process *p, uint32_t trigger)
                        goto fail;
                }
                n_evicted++;
+
+               pdd->dev->dqm->is_hws_hang = false;
        }
 
        return r;
index 6d094cf3587d64de6ee40c1a64987a60965ab4c9..7744ca3ef4b19b3e134b44ace4e6bb2cd95b86ce 100644 (file)
@@ -318,6 +318,12 @@ struct kfd2kgd_calls {
        void (*program_trap_handler_settings)(struct amdgpu_device *adev,
                        uint32_t vmid, uint64_t tba_addr, uint64_t tma_addr,
                        uint32_t inst);
+       uint64_t (*hqd_get_pq_addr)(struct amdgpu_device *adev,
+                                   uint32_t pipe_id, uint32_t queue_id,
+                                   uint32_t inst);
+       uint64_t (*hqd_reset)(struct amdgpu_device *adev,
+                             uint32_t pipe_id, uint32_t queue_id,
+                             uint32_t inst, unsigned int utimeout);
 };
 
 #endif /* KGD_KFD_INTERFACE_H_INCLUDED */