drm/amdkfd: Skip packet submission on fatal error
author Lijo Lazar <lijo.lazar@amd.com>
Thu, 22 Feb 2024 09:24:50 +0000 (14:54 +0530)
committer Alex Deucher <alexander.deucher@amd.com>
Mon, 26 Feb 2024 16:14:31 +0000 (11:14 -0500)
If a fatal error is detected, packet submission won't go through. Return
an error in such cases. Also, avoid waiting for the fence when a fatal
error is detected, since the fence response will never arrive.

Signed-off-by: Lijo Lazar <lijo.lazar@amd.com>
Reviewed-by: Asad Kamal <asad.kamal@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c
drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.h
drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
index 190039f14c30c676f080e6c55a8c8637901a74fd..f5f2945711be0c215d6a812d217c402616fc1cef 100644
@@ -742,6 +742,11 @@ void amdgpu_amdkfd_debug_mem_fence(struct amdgpu_device *adev)
        amdgpu_device_flush_hdp(adev, NULL);
 }
 
+bool amdgpu_amdkfd_is_fed(struct amdgpu_device *adev)
+{
+       return amdgpu_ras_get_fed_status(adev);
+}
+
 void amdgpu_amdkfd_ras_poison_consumption_handler(struct amdgpu_device *adev,
        enum amdgpu_ras_block block, bool reset)
 {
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
index e60f63ccf79a25a3aa01a987509a90988849f542..4fb32d86cd0e98efa5ad3ed7af861a56bb0e105a 100644
@@ -337,6 +337,7 @@ int amdgpu_amdkfd_get_tile_config(struct amdgpu_device *adev,
                                struct tile_config *config);
 void amdgpu_amdkfd_ras_poison_consumption_handler(struct amdgpu_device *adev,
                        enum amdgpu_ras_block block, bool reset);
+bool amdgpu_amdkfd_is_fed(struct amdgpu_device *adev);
 bool amdgpu_amdkfd_bo_mapped_to_dev(struct amdgpu_device *adev, struct kgd_mem *mem);
 void amdgpu_amdkfd_block_mmu_notifications(void *p);
 int amdgpu_amdkfd_criu_resume(void *p);
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
index c0e71543389a9b7c410a8072332004eb7dea4f08..f4d395e38683db7c85f3a7f5fc922e93b1222f88 100644
@@ -1903,6 +1903,10 @@ int amdkfd_fence_wait_timeout(struct device_queue_manager *dqm,
        uint64_t *fence_addr =  dqm->fence_addr;
 
        while (*fence_addr != fence_value) {
+               /* Fatal err detected, this response won't come */
+               if (amdgpu_amdkfd_is_fed(dqm->dev->adev))
+                       return -EIO;
+
                if (time_after(jiffies, end_jiffies)) {
                        dev_err(dev, "qcm fence wait loop timeout expired\n");
                        /* In HWS case, this is used to halt the driver thread
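
In simplified form, the resulting poll loop in amdkfd_fence_wait_timeout() now has two exit conditions rather than one. This sketch elides the dev_err() reporting and the HWS-hang handling of the real function:

	while (*fence_addr != fence_value) {
		/* Fatal error pending: the fence write will never land */
		if (amdgpu_amdkfd_is_fed(dqm->dev->adev))
			return -EIO;

		if (time_after(jiffies, end_jiffies))
			return -ETIME;	/* ordinary timeout */

		schedule();	/* yield between polls */
	}
	return 0;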
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c
index 1bea629c49ca027d7c11f80c6a9bb7105b7a47f7..32c926986dbbdbd23740b6c9916b9f4740852db1 100644
@@ -286,7 +286,7 @@ err_no_space:
        return -ENOMEM;
 }
 
-void kq_submit_packet(struct kernel_queue *kq)
+int kq_submit_packet(struct kernel_queue *kq)
 {
 #ifdef DEBUG
        int i;
@@ -298,6 +298,10 @@ void kq_submit_packet(struct kernel_queue *kq)
        }
        pr_debug("\n");
 #endif
+       /* Fatal err detected, packet submission won't go through */
+       if (amdgpu_amdkfd_is_fed(kq->dev->adev))
+               return -EIO;
+
        if (kq->dev->kfd->device_info.doorbell_size == 8) {
                *kq->wptr64_kernel = kq->pending_wptr64;
                write_kernel_doorbell64(kq->queue->properties.doorbell_ptr,
@@ -307,6 +311,8 @@ void kq_submit_packet(struct kernel_queue *kq)
                write_kernel_doorbell(kq->queue->properties.doorbell_ptr,
                                        kq->pending_wptr);
        }
+
+       return 0;
 }
 
 void kq_rollback_packet(struct kernel_queue *kq)
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.h b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.h
index 9a624443084543f4b294b377e7c15d46e0efb7a6..e24ee50acdf03773f530c9e587662a1f406390e9 100644
@@ -47,7 +47,7 @@
 int kq_acquire_packet_buffer(struct kernel_queue *kq,
                                size_t packet_size_in_dwords,
                                unsigned int **buffer_ptr);
-void kq_submit_packet(struct kernel_queue *kq);
+int kq_submit_packet(struct kernel_queue *kq);
 void kq_rollback_packet(struct kernel_queue *kq);
 
 
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c
index 401096c103b2f1e9d51d0cea56089eb94195eb5c..d6f65f39072bda6017fc8173e83b17e5828ebe27 100644
@@ -288,7 +288,7 @@ int pm_send_set_resources(struct packet_manager *pm,
 
        retval = pm->pmf->set_resources(pm, buffer, res);
        if (!retval)
-               kq_submit_packet(pm->priv_queue);
+               retval = kq_submit_packet(pm->priv_queue);
        else
                kq_rollback_packet(pm->priv_queue);
 
@@ -325,7 +325,7 @@ int pm_send_runlist(struct packet_manager *pm, struct list_head *dqm_queues)
        if (retval)
                goto fail_create_runlist;
 
-       kq_submit_packet(pm->priv_queue);
+       retval = kq_submit_packet(pm->priv_queue);
 
        mutex_unlock(&pm->lock);
 
@@ -361,7 +361,7 @@ int pm_send_query_status(struct packet_manager *pm, uint64_t fence_address,
 
        retval = pm->pmf->query_status(pm, buffer, fence_address, fence_value);
        if (!retval)
-               kq_submit_packet(pm->priv_queue);
+               retval = kq_submit_packet(pm->priv_queue);
        else
                kq_rollback_packet(pm->priv_queue);
 
@@ -392,7 +392,7 @@ int pm_update_grace_period(struct packet_manager *pm, uint32_t grace_period)
 
                retval = pm->pmf->set_grace_period(pm, buffer, grace_period);
                if (!retval)
-                       kq_submit_packet(pm->priv_queue);
+                       retval = kq_submit_packet(pm->priv_queue);
                else
                        kq_rollback_packet(pm->priv_queue);
        }
@@ -421,7 +421,7 @@ int pm_send_unmap_queue(struct packet_manager *pm,
 
        retval = pm->pmf->unmap_queues(pm, buffer, filter, filter_param, reset);
        if (!retval)
-               kq_submit_packet(pm->priv_queue);
+               retval = kq_submit_packet(pm->priv_queue);
        else
                kq_rollback_packet(pm->priv_queue);
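
All of the pm_send_*/pm_update_* call sites above follow the same shape, which is worth spelling out once. In this sketch, build_example_packet() is a hypothetical stand-in for the pm->pmf packet builders such as set_resources() and unmap_queues():

	static int pm_send_example(struct packet_manager *pm, uint32_t *buffer)
	{
		int retval;

		retval = build_example_packet(pm, buffer);	/* hypothetical builder */
		if (!retval)
			/* may now return -EIO if a fatal error was flagged */
			retval = kq_submit_packet(pm->priv_queue);
		else
			/* undo the reservation made by kq_acquire_packet_buffer() */
			kq_rollback_packet(pm->priv_queue);

		return retval;
	}

The one exception is pm_send_runlist(), which has no rollback branch at this point and simply propagates the error returned by kq_submit_packet().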