drm/amdkfd: reset queue which consumes RAS poison (v2)
author Tao Zhou <tao.zhou1@amd.com>
Mon, 6 Dec 2021 07:54:54 +0000 (15:54 +0800)
committer Alex Deucher <alexander.deucher@amd.com>
Tue, 28 Dec 2021 21:02:59 +0000 (16:02 -0500)
The CP supports unmapping a queue in reset mode, which destroys only the
specific queue without affecting others.
Replacing the whole gpu reset with the reset queue mode for RAS poison
consumption saves much time, and we can still fall back to the gpu reset
solution if the queue reset fails.

v2: Return directly if the process is NULL;
    Reset queue solution is not applicable to SDMA, fall back to the
    legacy way;
    Call kfd_unref_process after looking up the process.

Signed-off-by: Tao Zhou <tao.zhou1@amd.com>
Reviewed-by: Hawking Zhang <Hawking.Zhang@amd.com>
Acked-by: Felix Kuehling <Felix.Kuehling@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
drivers/gpu/drm/amd/amdkfd/kfd_priv.h

index 46cf48b3904ac11297f95e9f3890886e2113f453..0bf09a94d9441dbf47ca689b7c3862b331a3d914 100644 (file)
@@ -721,13 +721,13 @@ bool amdgpu_amdkfd_have_atomics_support(struct amdgpu_device *adev)
        return adev->have_atomics_support;
 }
 
-void amdgpu_amdkfd_ras_poison_consumption_handler(struct amdgpu_device *adev)
+void amdgpu_amdkfd_ras_poison_consumption_handler(struct amdgpu_device *adev, bool reset)
 {
        struct ras_err_data err_data = {0, 0, 0, NULL};
 
        /* CPU MCA will handle page retirement if connected_to_cpu is 1 */
        if (!adev->gmc.xgmi.connected_to_cpu)
-               amdgpu_umc_process_ras_data_cb(adev, &err_data, NULL);
-       else
+               amdgpu_umc_do_page_retirement(adev, &err_data, NULL, reset);
+       else if (reset)
                amdgpu_amdkfd_gpu_reset(adev);
 }
index fcbc8a9c9e06d7161a24d3ff03012792976a0fb8..61f899e54fd5f5e8cb48865ae215aaa29c4141d6 100644 (file)
@@ -296,7 +296,8 @@ int amdgpu_amdkfd_gpuvm_import_dmabuf(struct amdgpu_device *adev,
                                      uint64_t *mmap_offset);
 int amdgpu_amdkfd_get_tile_config(struct amdgpu_device *adev,
                                struct tile_config *config);
-void amdgpu_amdkfd_ras_poison_consumption_handler(struct amdgpu_device *adev);
+void amdgpu_amdkfd_ras_poison_consumption_handler(struct amdgpu_device *adev,
+                               bool reset);
 #if IS_ENABLED(CONFIG_HSA_AMD)
 void amdgpu_amdkfd_gpuvm_init_mem_limits(void);
 void amdgpu_amdkfd_gpuvm_destroy_cb(struct amdgpu_device *adev,
index deb64168c9e8b3b01a567f4b661a02ba3fbe9d43..b8ac28fb12315d935cd1aae1bf96edfc2832aa12 100644 (file)
@@ -89,6 +89,44 @@ enum SQ_INTERRUPT_ERROR_TYPE {
 #define KFD_SQ_INT_DATA__ERR_TYPE_MASK 0xF00000
 #define KFD_SQ_INT_DATA__ERR_TYPE__SHIFT 20
 
+static void event_interrupt_poison_consumption(struct kfd_dev *dev,
+                               uint16_t pasid, uint16_t source_id)
+{
+       int ret = -EINVAL;
+       struct kfd_process *p = kfd_lookup_process_by_pasid(pasid);
+
+       if (!p)
+               return;
+
+       /* all queues of a process will be unmapped at once */
+       if (atomic_read(&p->poison)) {
+               kfd_unref_process(p);
+               return;
+       }
+
+       atomic_set(&p->poison, 1);
+       kfd_unref_process(p);
+
+       switch (source_id) {
+       case SOC15_INTSRC_SQ_INTERRUPT_MSG:
+               if (dev->dqm->ops.reset_queues)
+                       ret = dev->dqm->ops.reset_queues(dev->dqm, pasid);
+               break;
+       case SOC15_INTSRC_SDMA_ECC:
+       default:
+               break;
+       }
+
+       kfd_signal_poison_consumed_event(dev, pasid);
+
+       /* if resetting the queue succeeds, do page retirement without gpu
+        * reset; if it fails, fall back to the gpu reset solution */
+       if (!ret)
+               amdgpu_amdkfd_ras_poison_consumption_handler(dev->adev, false);
+       else
+               amdgpu_amdkfd_ras_poison_consumption_handler(dev->adev, true);
+}
+
 static bool event_interrupt_isr_v9(struct kfd_dev *dev,
                                        const uint32_t *ih_ring_entry,
                                        uint32_t *patched_ihre,
@@ -230,8 +268,7 @@ static void event_interrupt_wq_v9(struct kfd_dev *dev,
                                        sq_intr_err);
                                if (sq_intr_err != SQ_INTERRUPT_ERROR_TYPE_ILLEGAL_INST &&
                                        sq_intr_err != SQ_INTERRUPT_ERROR_TYPE_MEMVIOL) {
-                                       kfd_signal_poison_consumed_event(dev, pasid);
-                                       amdgpu_amdkfd_ras_poison_consumption_handler(dev->adev);
+                                       event_interrupt_poison_consumption(dev, pasid, source_id);
                                        return;
                                }
                                break;
@@ -252,8 +289,7 @@ static void event_interrupt_wq_v9(struct kfd_dev *dev,
                if (source_id == SOC15_INTSRC_SDMA_TRAP) {
                        kfd_signal_event_interrupt(pasid, context_id0 & 0xfffffff, 28);
                } else if (source_id == SOC15_INTSRC_SDMA_ECC) {
-                       kfd_signal_poison_consumed_event(dev, pasid);
-                       amdgpu_amdkfd_ras_poison_consumption_handler(dev->adev);
+                       event_interrupt_poison_consumption(dev, pasid, source_id);
                        return;
                }
        } else if (client_id == SOC15_IH_CLIENTID_VMC ||
index 0c3f911e3bf4bbdafb4c253bf45a7bb1a5b48592..ea68f3b3a4e9cbb884becd4c799bd120f39cf3a6 100644 (file)
@@ -856,6 +856,8 @@ struct kfd_process {
        struct svm_range_list svms;
 
        bool xnack_enabled;
+
+       atomic_t poison;
 };
 
 #define KFD_PROCESS_TABLE_SIZE 5 /* bits: 32 entries */
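
For readers tracing the new control flow, below is a minimal standalone C
sketch of the decision logic described in the commit message: a poison
consumption event first attempts a per-queue reset and escalates to a whole
gpu reset only when that path is unavailable (SDMA) or fails. The
reset_queues, do_page_retirement and gpu_reset stubs are hypothetical
stand-ins for dqm->ops.reset_queues(), amdgpu_umc_do_page_retirement() and
amdgpu_amdkfd_gpu_reset(); this models the flow only and is not kernel code.

#include <stdio.h>

/* hypothetical stand-in for dqm->ops.reset_queues() */
static int reset_queues(int pasid)
{
	printf("resetting queues of pasid %d\n", pasid);
	return 0;	/* 0 on success, negative errno on failure */
}

/* hypothetical stand-in for amdgpu_umc_do_page_retirement() */
static void do_page_retirement(void)
{
	printf("retiring poisoned pages, no gpu reset\n");
}

/* hypothetical stand-in for amdgpu_amdkfd_gpu_reset() */
static void gpu_reset(void)
{
	printf("falling back to whole gpu reset\n");
}

/* simplified model of event_interrupt_poison_consumption() */
static void poison_consumption(int pasid, int is_sdma)
{
	int ret = -1;

	/* queue reset is only attempted for CP (SQ) sources; SDMA has no
	 * per-queue reset path, so ret stays negative and we escalate */
	if (!is_sdma)
		ret = reset_queues(pasid);

	if (!ret)
		do_page_retirement();	/* queue reset worked: cheap path */
	else
		gpu_reset();		/* unavailable or failed: full reset */
}

int main(void)
{
	poison_consumption(42, 0);	/* CP queue: per-queue reset path */
	poison_consumption(43, 1);	/* SDMA: legacy full gpu reset path */
	return 0;
}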