drm/amdkfd: add RAS ECC event support (v3)
author Eric Huang <JinhuiEric.Huang@amd.com>
Fri, 11 Jan 2019 19:38:51 +0000 (14:38 -0500)
committer Alex Deucher <alexander.deucher@amd.com>
Tue, 19 Mar 2019 20:36:51 +0000 (15:36 -0500)
The RAS ECC event is combined with the GPU reset event, because
ECC interrupts are caused by uncorrectable errors that trigger a
GPU reset.

v2: Fix misleading-indentation warning
v3: fix build with CONFIG_HSA_AMD disabled

Signed-off-by: Eric Huang <JinhuiEric.Huang@amd.com>
Reviewed-by: Felix Kuehling <Felix.Kuehling@amd.com>
Reviewed-by: Alex Deucher <alexander.deucher@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c
drivers/gpu/drm/amd/amdkfd/kfd_device.c
drivers/gpu/drm/amd/amdkfd/kfd_events.c
drivers/gpu/drm/amd/amdkfd/kfd_priv.h
include/uapi/linux/kfd_ioctl.h

index fe1d7368c1e666b89648fa0f05cf8e8c8af6246a..acf8ae0cee9a43fdc72caacaf541d7f36f263a27 100644 (file)
@@ -640,4 +640,8 @@ int kgd2kfd_post_reset(struct kfd_dev *kfd)
 void kgd2kfd_interrupt(struct kfd_dev *kfd, const void *ih_ring_entry)
 {
 }
+
+void kgd2kfd_set_sram_ecc_flag(struct kfd_dev *kfd)
+{
+}
 #endif
index 0e1711a75b687e60c387353b4de8cfb7d03731a3..e6a503760b628df4bba1bf1dd57be88d686f0bdf 100644 (file)
@@ -229,5 +229,6 @@ int kgd2kfd_quiesce_mm(struct mm_struct *mm);
 int kgd2kfd_resume_mm(struct mm_struct *mm);
 int kgd2kfd_schedule_evict_and_restore_process(struct mm_struct *mm,
                                               struct dma_fence *fence);
+void kgd2kfd_set_sram_ecc_flag(struct kfd_dev *kfd);
 
 #endif /* AMDGPU_AMDKFD_H_INCLUDED */
index 88c45f990f0586c0f8007b9ceba176b2ba6abe40..6bb71f6ee18e6ce48b54801b00861641832566bd 100644 (file)
@@ -4805,6 +4805,7 @@ static int gfx_v9_0_process_ras_data_cb(struct amdgpu_device *adev,
                struct amdgpu_iv_entry *entry)
 {
        /* TODO ue will trigger an interrupt. */
+       kgd2kfd_set_sram_ecc_flag(adev->kfd.dev);
        amdgpu_ras_reset_gpu(adev, 0);
        return AMDGPU_RAS_UE;
 }
index 2daa5ea1c2eaa5bb35c218c1bb9d89d753691dcd..0252345a1f08f51303e4ea2da63956ba84d52a2f 100644 (file)
@@ -354,6 +354,7 @@ static int gmc_v9_0_ecc_interrupt_state(struct amdgpu_device *adev,
 static int gmc_v9_0_process_ras_data_cb(struct amdgpu_device *adev,
                struct amdgpu_iv_entry *entry)
 {
+       kgd2kfd_set_sram_ecc_flag(adev->kfd.dev);
        amdgpu_ras_reset_gpu(adev, 0);
        return AMDGPU_RAS_UE;
 }
index 058b9daec514005a4056b77dfb8595b44d57e566..f7a6fafd70aeb9d1db2ac413206aa04b79a2f912 100644 (file)
@@ -1851,6 +1851,8 @@ static int sdma_v4_0_process_ras_data_cb(struct amdgpu_device *adev,
                return 0;
        }
 
+       kgd2kfd_set_sram_ecc_flag(adev->kfd.dev);
+
        amdgpu_ras_reset_gpu(adev, 0);
 
        return AMDGPU_RAS_UE;
index 8be9677c0c07dae65e3c69dafab241e3b16ff975..b3cdbf79f47b3d7ea3ebfa66a14aa0ee252210d4 100644 (file)
@@ -466,6 +466,8 @@ struct kfd_dev *kgd2kfd_probe(struct kgd_dev *kgd,
        memset(&kfd->doorbell_available_index, 0,
                sizeof(kfd->doorbell_available_index));
 
+       atomic_set(&kfd->sram_ecc_flag, 0);
+
        return kfd;
 }
 
@@ -661,6 +663,9 @@ int kgd2kfd_post_reset(struct kfd_dev *kfd)
                return ret;
        count = atomic_dec_return(&kfd_locked);
        WARN_ONCE(count != 0, "KFD reset ref. error");
+
+       atomic_set(&kfd->sram_ecc_flag, 0);
+
        return 0;
 }
 
@@ -1024,6 +1029,12 @@ int kfd_gtt_sa_free(struct kfd_dev *kfd, struct kfd_mem_obj *mem_obj)
        return 0;
 }
 
+void kgd2kfd_set_sram_ecc_flag(struct kfd_dev *kfd)
+{
+       if (kfd)
+               atomic_inc(&kfd->sram_ecc_flag);
+}
+
 #if defined(CONFIG_DEBUG_FS)
 
 /* This function will send a package to HIQ to hang the HWS
index e9f0e0a1b41c074204a69a7745e29bc8e53668ba..6e1d41c5bf86e0cbce517fbd5629e90919182e72 100644 (file)
@@ -1011,25 +1011,41 @@ void kfd_signal_vm_fault_event(struct kfd_dev *dev, unsigned int pasid,
 void kfd_signal_reset_event(struct kfd_dev *dev)
 {
        struct kfd_hsa_hw_exception_data hw_exception_data;
+       struct kfd_hsa_memory_exception_data memory_exception_data;
        struct kfd_process *p;
        struct kfd_event *ev;
        unsigned int temp;
        uint32_t id, idx;
+       int reset_cause = atomic_read(&dev->sram_ecc_flag) ?
+                       KFD_HW_EXCEPTION_ECC :
+                       KFD_HW_EXCEPTION_GPU_HANG;
 
        /* Whole gpu reset caused by GPU hang and memory is lost */
        memset(&hw_exception_data, 0, sizeof(hw_exception_data));
        hw_exception_data.gpu_id = dev->id;
        hw_exception_data.memory_lost = 1;
+       hw_exception_data.reset_cause = reset_cause;
+
+       memset(&memory_exception_data, 0, sizeof(memory_exception_data));
+       memory_exception_data.ErrorType = KFD_MEM_ERR_SRAM_ECC;
+       memory_exception_data.gpu_id = dev->id;
+       memory_exception_data.failure.imprecise = true;
 
        idx = srcu_read_lock(&kfd_processes_srcu);
        hash_for_each_rcu(kfd_processes_table, temp, p, kfd_processes) {
                mutex_lock(&p->event_mutex);
                id = KFD_FIRST_NONSIGNAL_EVENT_ID;
-               idr_for_each_entry_continue(&p->event_idr, ev, id)
+               idr_for_each_entry_continue(&p->event_idr, ev, id) {
                        if (ev->type == KFD_EVENT_TYPE_HW_EXCEPTION) {
                                ev->hw_exception_data = hw_exception_data;
                                set_event(ev);
                        }
+                       if (ev->type == KFD_EVENT_TYPE_MEMORY &&
+                           reset_cause == KFD_HW_EXCEPTION_ECC) {
+                               ev->memory_exception_data = memory_exception_data;
+                               set_event(ev);
+                       }
+               }
                mutex_unlock(&p->event_mutex);
        }
        srcu_read_unlock(&kfd_processes_srcu, idx);
index 0eeee3c6d6dcd067482354905a80cf52766a4fb9..9e02309656758761272c83c4535bac3392f67632 100644 (file)
@@ -276,6 +276,9 @@ struct kfd_dev {
        uint64_t hive_id;
 
        bool pci_atomic_requested;
+
+       /* SRAM ECC flag */
+       atomic_t sram_ecc_flag;
 };
 
 enum kfd_mempool {
index e622fd1fbd46399c967e448c762b69b8dd3220b2..dc067ed0b72d4c0bbf22b3af9a872100c50855a9 100644 (file)
@@ -211,6 +211,11 @@ struct kfd_ioctl_dbg_wave_control_args {
 #define KFD_HW_EXCEPTION_GPU_HANG      0
 #define KFD_HW_EXCEPTION_ECC           1
 
+/* For kfd_hsa_memory_exception_data.ErrorType */
+#define KFD_MEM_ERR_NO_RAS             0
+#define KFD_MEM_ERR_SRAM_ECC           1
+#define KFD_MEM_ERR_POISON_CONSUMED    2
+#define KFD_MEM_ERR_GPU_HANG           3
 
 struct kfd_ioctl_create_event_args {
        __u64 event_page_offset;        /* from KFD */
@@ -250,7 +255,12 @@ struct kfd_hsa_memory_exception_data {
        struct kfd_memory_exception_failure failure;
        __u64 va;
        __u32 gpu_id;
-       __u32 pad;
+       __u32 ErrorType; /* 0 = no RAS error,
+                         * 1 = ECC_SRAM,
+                         * 2 = Link_SYNFLOOD (poison),
+                         * 3 = GPU hang (not attributable to a specific cause),
+                         * other values reserved
+                         */
 };
 
 /* hw exception data */