return adev->have_atomics_support;
}
-void amdgpu_amdkfd_ras_poison_consumption_handler(struct amdgpu_device *adev)
+void amdgpu_amdkfd_ras_poison_consumption_handler(struct amdgpu_device *adev, bool reset)
{
struct ras_err_data err_data = {0, 0, 0, NULL};
/* CPU MCA will handle page retirement if connected_to_cpu is 1 */
if (!adev->gmc.xgmi.connected_to_cpu)
- amdgpu_umc_process_ras_data_cb(adev, &err_data, NULL);
- else
+ amdgpu_umc_do_page_retirement(adev, &err_data, NULL, reset);
+ else if (reset)
amdgpu_amdkfd_gpu_reset(adev);
}
uint64_t *mmap_offset);
int amdgpu_amdkfd_get_tile_config(struct amdgpu_device *adev,
struct tile_config *config);
-void amdgpu_amdkfd_ras_poison_consumption_handler(struct amdgpu_device *adev);
+void amdgpu_amdkfd_ras_poison_consumption_handler(struct amdgpu_device *adev,
+ bool reset);
#if IS_ENABLED(CONFIG_HSA_AMD)
void amdgpu_amdkfd_gpuvm_init_mem_limits(void);
void amdgpu_amdkfd_gpuvm_destroy_cb(struct amdgpu_device *adev,
#define KFD_SQ_INT_DATA__ERR_TYPE_MASK 0xF00000
#define KFD_SQ_INT_DATA__ERR_TYPE__SHIFT 20
+static void event_interrupt_poison_consumption(struct kfd_dev *dev,
+ uint16_t pasid, uint16_t source_id)
+{
+ int ret = -EINVAL;
+ struct kfd_process *p = kfd_lookup_process_by_pasid(pasid);
+
+ if (!p)
+ return;
+
+ /* all queues of a process will be unmapped in one time */
+ if (atomic_read(&p->poison)) {
+ kfd_unref_process(p);
+ return;
+ }
+
+ atomic_set(&p->poison, 1);
+ kfd_unref_process(p);
+
+ switch (source_id) {
+ case SOC15_INTSRC_SQ_INTERRUPT_MSG:
+ if (dev->dqm->ops.reset_queues)
+ ret = dev->dqm->ops.reset_queues(dev->dqm, pasid);
+ break;
+ case SOC15_INTSRC_SDMA_ECC:
+ default:
+ break;
+ }
+
+ kfd_signal_poison_consumed_event(dev, pasid);
+
+ /* resetting queue passes, do page retirement without gpu reset
+ resetting queue fails, fallback to gpu reset solution */
+ if (!ret)
+ amdgpu_amdkfd_ras_poison_consumption_handler(dev->adev, false);
+ else
+ amdgpu_amdkfd_ras_poison_consumption_handler(dev->adev, true);
+}
+
static bool event_interrupt_isr_v9(struct kfd_dev *dev,
const uint32_t *ih_ring_entry,
uint32_t *patched_ihre,
sq_intr_err);
if (sq_intr_err != SQ_INTERRUPT_ERROR_TYPE_ILLEGAL_INST &&
sq_intr_err != SQ_INTERRUPT_ERROR_TYPE_MEMVIOL) {
- kfd_signal_poison_consumed_event(dev, pasid);
- amdgpu_amdkfd_ras_poison_consumption_handler(dev->adev);
+ event_interrupt_poison_consumption(dev, pasid, source_id);
return;
}
break;
if (source_id == SOC15_INTSRC_SDMA_TRAP) {
kfd_signal_event_interrupt(pasid, context_id0 & 0xfffffff, 28);
} else if (source_id == SOC15_INTSRC_SDMA_ECC) {
- kfd_signal_poison_consumed_event(dev, pasid);
- amdgpu_amdkfd_ras_poison_consumption_handler(dev->adev);
+ event_interrupt_poison_consumption(dev, pasid, source_id);
return;
}
} else if (client_id == SOC15_IH_CLIENTID_VMC ||