drm/amdkfd: add debug query event operation
author: Jonathan Kim <jonathan.kim@amd.com>
Mon, 9 May 2022 15:10:32 +0000 (11:10 -0400)
committer: Alex Deucher <alexander.deucher@amd.com>
Fri, 9 Jun 2023 16:36:51 +0000 (12:36 -0400)
Allow the debugger to query a single queue, device, or process
exception.
The KFD should also return the GPU or queue ID of the exception.
The debugger also has the option of clearing exceptions as part of
the query.

Signed-off-by: Jonathan Kim <jonathan.kim@amd.com>
Reviewed-by: Felix Kuehling <felix.kuehling@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
drivers/gpu/drm/amd/amdkfd/kfd_debug.c
drivers/gpu/drm/amd/amdkfd/kfd_debug.h

index 5ee38614ed9b207100fb3830aefc3a586a4d9bf5..498859259b55620e01e85c9b3d39178b5a03f015 100644 (file)
@@ -3038,6 +3038,12 @@ static int kfd_ioctl_set_debug_trap(struct file *filep, struct kfd_process *p, v
                r = kfd_dbg_trap_set_flags(target, &args->set_flags.flags);
                break;
        case KFD_IOC_DBG_TRAP_QUERY_DEBUG_EVENT:
+               r = kfd_dbg_ev_query_debug_event(target,
+                               &args->query_debug_event.queue_id,
+                               &args->query_debug_event.gpu_id,
+                               args->query_debug_event.exception_mask,
+                               &args->query_debug_event.exception_mask);
+               break;
        case KFD_IOC_DBG_TRAP_QUERY_EXCEPTION_INFO:
        case KFD_IOC_DBG_TRAP_GET_QUEUE_SNAPSHOT:
        case KFD_IOC_DBG_TRAP_GET_DEVICE_SNAPSHOT:
index 43c3170998d38c20003a5f1e8976f7af09312748..e9530e682e85cba5aa1e33f1b98e1d1adc63c3c0 100644 (file)
 
 #define MAX_WATCH_ADDRESSES    4
 
+/*
+ * Report a single pending debug exception for @process.
+ *
+ * Sources are searched in a fixed order: the process' queues, then its
+ * per-device entries, then the process itself.  The first source whose
+ * exception status has at least one bit in common with
+ * process->exception_enable_mask is reported and the search stops.
+ *
+ * @process:              target process; NULL is tolerated
+ * @queue_id:             out - id of the reporting queue, 0 for device and
+ *                        process events or when nothing is pending
+ * @gpu_id:               out - id of the device tied to the reported queue
+ *                        or device event, 0 otherwise
+ * @exception_clear_mask: bits removed from the reported source's status
+ *                        after it has been copied out
+ * @event_status:         out - full status word of the reported source
+ *                        (not restricted to the enabled bits), 0 if none
+ *
+ * The outputs are only written once the initial check has passed.
+ *
+ * Return: 0 if an event was reported, -EAGAIN if nothing is pending,
+ * -ENODATA if @process is NULL or does not have the debug trap enabled.
+ */
+int kfd_dbg_ev_query_debug_event(struct kfd_process *process,
+                     unsigned int *queue_id,
+                     unsigned int *gpu_id,
+                     uint64_t exception_clear_mask,
+                     uint64_t *event_status)
+{
+       struct process_queue_node *node;
+       int idx;
+
+       if (!process || !process->debug_trap_enabled)
+               return -ENODATA;
+
+       /* Status words are inspected and cleared under the event mutex. */
+       mutex_lock(&process->event_mutex);
+
+       *gpu_id = 0;
+       *queue_id = 0;
+       *event_status = 0;
+
+       /* Queue events take priority. */
+       list_for_each_entry(node, &process->pqm.queues, process_queue_list) {
+               if (!node->q)
+                       continue;
+
+               if (!(node->q->properties.exception_status &
+                     process->exception_enable_mask))
+                       continue;
+
+               /* Copy the status out before the requested bits are dropped. */
+               *event_status = node->q->properties.exception_status;
+               *queue_id = node->q->properties.queue_id;
+               *gpu_id = node->q->device->id;
+               node->q->properties.exception_status &= ~exception_clear_mask;
+               goto unlock;
+       }
+
+       /* Device events come next. */
+       for (idx = 0; idx < process->n_pdds; idx++) {
+               struct kfd_process_device *pdd = process->pdds[idx];
+
+               if (pdd->exception_status & process->exception_enable_mask) {
+                       *event_status = pdd->exception_status;
+                       *gpu_id = pdd->dev->id;
+                       pdd->exception_status &= ~exception_clear_mask;
+                       goto unlock;
+               }
+       }
+
+       /* Process-wide events are the last resort. */
+       if (!(process->exception_status & process->exception_enable_mask))
+               goto unlock;
+
+       *event_status = process->exception_status;
+       process->exception_status &= ~exception_clear_mask;
+
+unlock:
+       mutex_unlock(&process->event_mutex);
+
+       if (!*event_status)
+               return -EAGAIN;
+
+       return 0;
+}
+
+
 void debug_event_write_work_handler(struct work_struct *work)
 {
        struct kfd_process *process;
index ef8e9f7f171657d65090e1dae2b9b5860b725e59..e78f954c06849a6a7fc9429dab199cf6e647e8ea 100644 (file)
 
 void kfd_dbg_trap_deactivate(struct kfd_process *target, bool unwind, int unwind_count);
 int kfd_dbg_trap_activate(struct kfd_process *target);
+int kfd_dbg_ev_query_debug_event(struct kfd_process *process,
+                       unsigned int *queue_id, /* out: id of the reporting queue, 0 if not a queue event */
+                       unsigned int *gpu_id, /* out: id of the device tied to the event, 0 if none */
+                       uint64_t exception_clear_mask, /* bits cleared from the reported status */
+                       uint64_t *event_status); /* out: reported status word, 0 if nothing pending */
 bool kfd_set_dbg_ev_from_interrupt(struct kfd_node *dev,
                                   unsigned int pasid,
                                   uint32_t doorbell_id,