amd/amdkfd: Trigger segfault for early userptr unmapping
author: Shane Xiao <shane.xiao@amd.com>
Wed, 23 Apr 2025 09:28:50 +0000 (17:28 +0800)
committer: Alex Deucher <alexander.deucher@amd.com>
Wed, 7 May 2025 21:45:09 +0000 (17:45 -0400)
If an application unmaps the memory before destroying the userptr, the
driver needs to trigger a segfault in VM debug mode to notify user space
so that it can correct the free sequence.

v2: Send gpu access fault to user space
v3: Report gpu address to user space, remove unnecessary params
v4: update pr_err into one line, remove userptr log info

Signed-off-by: Shane Xiao <shane.xiao@amd.com>
Acked-by: Christian König <christian.koenig@amd.com>
Reviewed-by: Felix Kuehling <Felix.Kuehling@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
drivers/gpu/drm/amd/amdkfd/kfd_events.c
drivers/gpu/drm/amd/amdkfd/kfd_priv.h

index d2ec4130a316ddb13fe4c59f1f84fefc337cbc81..260165bbe3736dc369d5bfa718020ec98d1c8698 100644 (file)
@@ -2559,6 +2559,18 @@ static int update_invalid_user_pages(struct amdkfd_process_info *process_info,
                        if (ret != -EFAULT)
                                return ret;
 
+                       /* If applications unmap memory before destroying the userptr
+                        * from the KFD, trigger a segmentation fault in VM debug mode.
+                        */
+                       if (amdgpu_ttm_adev(bo->tbo.bdev)->debug_vm_userptr) {
+                               pr_err("Pid %d unmapped memory before destroying userptr at GPU addr 0x%llx\n",
+                                                               pid_nr(process_info->pid), mem->va);
+
+                               // Send GPU VM fault to user space
+                               kfd_signal_vm_fault_event_with_userptr(kfd_lookup_process_by_pid(process_info->pid),
+                                                               mem->va);
+                       }
+
                        ret = 0;
                }
 
index fecdb679407503a8969de0d553b185c731424424..e54e708ed82de1d429ee7754d70e5841ed3afdbc 100644 (file)
@@ -1177,6 +1177,25 @@ void kfd_signal_hw_exception_event(u32 pasid)
        kfd_unref_process(p);
 }
 
+/**
+ * kfd_signal_vm_fault_event_with_userptr - report a userptr VM fault to
+ * every device of a KFD process
+ * @p: KFD process to notify; a NULL pointer is tolerated and ignored
+ * @gpu_va: GPU virtual address reported as the faulting address
+ *
+ * Builds a memory exception record for @gpu_va with the NotPresent failure
+ * bit set and, for each process device of @p, evicts that device and then
+ * signals a VM fault event carrying the device's user-visible GPU id.
+ *
+ * This function neither takes nor releases a reference on @p.
+ */
+void kfd_signal_vm_fault_event_with_userptr(struct kfd_process *p, uint64_t gpu_va)
+{
+       struct kfd_process_device *pdd;
+       struct kfd_hsa_memory_exception_data exception_data;
+       int i;
+
+       /* The caller may pass the result of a by-pid process lookup straight
+        * through; bail out instead of dereferencing a NULL process.
+        */
+       if (!p)
+               return;
+
+       /* Clear the whole record so only the fields set below are non-zero */
+       memset(&exception_data, 0, sizeof(exception_data));
+       exception_data.va = gpu_va;
+       exception_data.failure.NotPresent = 1;
+
+       /* Send the VM fault to every process device of this KFD process */
+       for (i = 0; i < p->n_pdds; i++) {
+               pdd = p->pdds[i];
+               /* The record is reused; only the GPU id changes per device */
+               exception_data.gpu_id = pdd->user_gpu_id;
+               kfd_evict_process_device(pdd);
+               kfd_signal_vm_fault_event(pdd, NULL, &exception_data);
+       }
+}
+
 void kfd_signal_vm_fault_event(struct kfd_process_device *pdd,
                                struct kfd_vm_fault_info *info,
                                struct kfd_hsa_memory_exception_data *data)
index 0ae794539bb0a6476af05953875bf16d6f739452..d221c58dccc3ccaa650ab535a5f97192fce5fef1 100644 (file)
@@ -1507,6 +1507,8 @@ int kfd_event_create(struct file *devkfd, struct kfd_process *p,
 int kfd_get_num_events(struct kfd_process *p);
 int kfd_event_destroy(struct kfd_process *p, uint32_t event_id);
 
+void kfd_signal_vm_fault_event_with_userptr(struct kfd_process *p, uint64_t gpu_va);
+
 void kfd_signal_vm_fault_event(struct kfd_process_device *pdd,
                                struct kfd_vm_fault_info *info,
                                struct kfd_hsa_memory_exception_data *data);