drm/amdkfd: Don't call mmput from MMU notifier callback
author Philip Yang <Philip.Yang@amd.com>
Fri, 20 Jun 2025 22:32:32 +0000 (18:32 -0400)
committer Alex Deucher <alexander.deucher@amd.com>
Mon, 30 Jun 2025 17:57:12 +0000 (13:57 -0400)
If the process is exiting, the mmput() inside the MMU notifier callback,
invoked from compaction, fork or NUMA balancing, could release the last
reference to the mm struct and call exit_mmap() and free_pgtables(). This
triggers a deadlock with the backtrace below.

The deadlock leaks the kfd process, because the MMU notifier release
callback is never called, and this in turn leaks VRAM.

The fix is to take the mm reference with mmget_not_zero() when adding a
prange to the deferred list, pairing with the mmput() in the deferred list
work.

If a prange is split and added to the pchild list, the pchild work_item.mm
is not used, so remove the mm parameter from svm_range_unmap_split and
svm_range_add_child.

The backtrace of hung task:

 INFO: task python:348105 blocked for more than 64512 seconds.
 Call Trace:
  __schedule+0x1c3/0x550
  schedule+0x46/0xb0
  rwsem_down_write_slowpath+0x24b/0x4c0
  unlink_anon_vmas+0xb1/0x1c0
  free_pgtables+0xa9/0x130
  exit_mmap+0xbc/0x1a0
  mmput+0x5a/0x140
  svm_range_cpu_invalidate_pagetables+0x2b/0x40 [amdgpu]
  mn_itree_invalidate+0x72/0xc0
  __mmu_notifier_invalidate_range_start+0x48/0x60
  try_to_unmap_one+0x10fa/0x1400
  rmap_walk_anon+0x196/0x460
  try_to_unmap+0xbb/0x210
  migrate_page_unmap+0x54d/0x7e0
  migrate_pages_batch+0x1c3/0xae0
  migrate_pages_sync+0x98/0x240
  migrate_pages+0x25c/0x520
  compact_zone+0x29d/0x590
  compact_zone_order+0xb6/0xf0
  try_to_compact_pages+0xbe/0x220
  __alloc_pages_direct_compact+0x96/0x1a0
  __alloc_pages_slowpath+0x410/0x930
  __alloc_pages_nodemask+0x3a9/0x3e0
  do_huge_pmd_anonymous_page+0xd7/0x3e0
  __handle_mm_fault+0x5e3/0x5f0
  handle_mm_fault+0xf7/0x2e0
  hmm_vma_fault.isra.0+0x4d/0xa0
  walk_pmd_range.isra.0+0xa8/0x310
  walk_pud_range+0x167/0x240
  walk_pgd_range+0x55/0x100
  __walk_page_range+0x87/0x90
  walk_page_range+0xf6/0x160
  hmm_range_fault+0x4f/0x90
  amdgpu_hmm_range_get_pages+0x123/0x230 [amdgpu]
  amdgpu_ttm_tt_get_user_pages+0xb1/0x150 [amdgpu]
  init_user_pages+0xb1/0x2a0 [amdgpu]
  amdgpu_amdkfd_gpuvm_alloc_memory_of_gpu+0x543/0x7d0 [amdgpu]
  kfd_ioctl_alloc_memory_of_gpu+0x24c/0x4e0 [amdgpu]
  kfd_ioctl+0x29d/0x500 [amdgpu]

Fixes: fa582c6f3684 ("drm/amdkfd: Use mmget_not_zero in MMU notifier")
Signed-off-by: Philip Yang <Philip.Yang@amd.com>
Reviewed-by: Felix Kuehling <felix.kuehling@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
(cherry picked from commit a29e067bd38946f752b0ef855f3dfff87e77bec7)
Cc: stable@vger.kernel.org
drivers/gpu/drm/amd/amdkfd/kfd_svm.c

index 7763e4742080b9e1c703fc2bccfea0dafd340b67..a0f22ea6d15af73b9c6cacb781a7354b2791f32a 100644 (file)
@@ -1171,13 +1171,12 @@ svm_range_split_head(struct svm_range *prange, uint64_t new_start,
 }
 
 static void
-svm_range_add_child(struct svm_range *prange, struct mm_struct *mm,
-                   struct svm_range *pchild, enum svm_work_list_ops op)
+svm_range_add_child(struct svm_range *prange, struct svm_range *pchild, enum svm_work_list_ops op)
 {
        pr_debug("add child 0x%p [0x%lx 0x%lx] to prange 0x%p child list %d\n",
                 pchild, pchild->start, pchild->last, prange, op);
 
-       pchild->work_item.mm = mm;
+       pchild->work_item.mm = NULL;
        pchild->work_item.op = op;
        list_add_tail(&pchild->child_list, &prange->child_list);
 }
@@ -2394,15 +2393,17 @@ svm_range_add_list_work(struct svm_range_list *svms, struct svm_range *prange,
                    prange->work_item.op != SVM_OP_UNMAP_RANGE)
                        prange->work_item.op = op;
        } else {
-               prange->work_item.op = op;
-
-               /* Pairs with mmput in deferred_list_work */
-               mmget(mm);
-               prange->work_item.mm = mm;
-               list_add_tail(&prange->deferred_list,
-                             &prange->svms->deferred_range_list);
-               pr_debug("add prange 0x%p [0x%lx 0x%lx] to work list op %d\n",
-                        prange, prange->start, prange->last, op);
+               /* Pairs with mmput in deferred_list_work.
+                * If process is exiting and mm is gone, don't update mmu notifier.
+                */
+               if (mmget_not_zero(mm)) {
+                       prange->work_item.mm = mm;
+                       prange->work_item.op = op;
+                       list_add_tail(&prange->deferred_list,
+                                     &prange->svms->deferred_range_list);
+                       pr_debug("add prange 0x%p [0x%lx 0x%lx] to work list op %d\n",
+                                prange, prange->start, prange->last, op);
+               }
        }
        spin_unlock(&svms->deferred_list_lock);
 }
@@ -2416,8 +2417,7 @@ void schedule_deferred_list_work(struct svm_range_list *svms)
 }
 
 static void
-svm_range_unmap_split(struct mm_struct *mm, struct svm_range *parent,
-                     struct svm_range *prange, unsigned long start,
+svm_range_unmap_split(struct svm_range *parent, struct svm_range *prange, unsigned long start,
                      unsigned long last)
 {
        struct svm_range *head;
@@ -2438,12 +2438,12 @@ svm_range_unmap_split(struct mm_struct *mm, struct svm_range *parent,
                svm_range_split(tail, last + 1, tail->last, &head);
 
        if (head != prange && tail != prange) {
-               svm_range_add_child(parent, mm, head, SVM_OP_UNMAP_RANGE);
-               svm_range_add_child(parent, mm, tail, SVM_OP_ADD_RANGE);
+               svm_range_add_child(parent, head, SVM_OP_UNMAP_RANGE);
+               svm_range_add_child(parent, tail, SVM_OP_ADD_RANGE);
        } else if (tail != prange) {
-               svm_range_add_child(parent, mm, tail, SVM_OP_UNMAP_RANGE);
+               svm_range_add_child(parent, tail, SVM_OP_UNMAP_RANGE);
        } else if (head != prange) {
-               svm_range_add_child(parent, mm, head, SVM_OP_UNMAP_RANGE);
+               svm_range_add_child(parent, head, SVM_OP_UNMAP_RANGE);
        } else if (parent != prange) {
                prange->work_item.op = SVM_OP_UNMAP_RANGE;
        }
@@ -2520,14 +2520,14 @@ svm_range_unmap_from_cpu(struct mm_struct *mm, struct svm_range *prange,
                l = min(last, pchild->last);
                if (l >= s)
                        svm_range_unmap_from_gpus(pchild, s, l, trigger);
-               svm_range_unmap_split(mm, prange, pchild, start, last);
+               svm_range_unmap_split(prange, pchild, start, last);
                mutex_unlock(&pchild->lock);
        }
        s = max(start, prange->start);
        l = min(last, prange->last);
        if (l >= s)
                svm_range_unmap_from_gpus(prange, s, l, trigger);
-       svm_range_unmap_split(mm, prange, prange, start, last);
+       svm_range_unmap_split(prange, prange, start, last);
 
        if (unmap_parent)
                svm_range_add_list_work(svms, prange, mm, SVM_OP_UNMAP_RANGE);
@@ -2570,8 +2570,6 @@ svm_range_cpu_invalidate_pagetables(struct mmu_interval_notifier *mni,
 
        if (range->event == MMU_NOTIFY_RELEASE)
                return true;
-       if (!mmget_not_zero(mni->mm))
-               return true;
 
        start = mni->interval_tree.start;
        last = mni->interval_tree.last;
@@ -2598,7 +2596,6 @@ svm_range_cpu_invalidate_pagetables(struct mmu_interval_notifier *mni,
        }
 
        svm_range_unlock(prange);
-       mmput(mni->mm);
 
        return true;
 }