drm/amdgpu: remove all KFD fences from the BO on release
author Christian König <christian.koenig@amd.com>
Wed, 29 Jan 2025 15:28:49 +0000 (16:28 +0100)
committer Alex Deucher <alexander.deucher@amd.com>
Fri, 21 Feb 2025 15:41:49 +0000 (10:41 -0500)
Remove all KFD fences from the private dma_resv object.

This prevents the KFD from being evicted unnecessarily when an exported BO
is released.

Signed-off-by: Christian König <christian.koenig@amd.com>
Signed-off-by: James Zhu <James.Zhu@amd.com>
Reviewed-by: Felix Kuehling <felix.kuehling@amd.com>
Reviewed-and-tested-by: James Zhu <James.Zhu@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
drivers/gpu/drm/amd/amdgpu/amdgpu_object.c

index 236b73e283e8222be36b4c01fd634da57a528abf..55d5399676951e049b99d5592b4d41cd9c4b525f 100644 (file)
@@ -193,7 +193,7 @@ int kfd_debugfs_kfd_mem_limits(struct seq_file *m, void *data);
 #if IS_ENABLED(CONFIG_HSA_AMD)
 bool amdkfd_fence_check_mm(struct dma_fence *f, struct mm_struct *mm);
 struct amdgpu_amdkfd_fence *to_amdgpu_amdkfd_fence(struct dma_fence *f);
-int amdgpu_amdkfd_remove_fence_on_pt_pd_bos(struct amdgpu_bo *bo);
+void amdgpu_amdkfd_remove_all_eviction_fences(struct amdgpu_bo *bo);
 int amdgpu_amdkfd_evict_userptr(struct mmu_interval_notifier *mni,
                                unsigned long cur_seq, struct kgd_mem *mem);
 int amdgpu_amdkfd_bo_validate_and_fence(struct amdgpu_bo *bo,
@@ -213,9 +213,8 @@ struct amdgpu_amdkfd_fence *to_amdgpu_amdkfd_fence(struct dma_fence *f)
 }
 
 static inline
-int amdgpu_amdkfd_remove_fence_on_pt_pd_bos(struct amdgpu_bo *bo)
+void amdgpu_amdkfd_remove_all_eviction_fences(struct amdgpu_bo *bo)
 {
-       return 0;
 }
 
 static inline
index ea3f7ee189235313a09d3dd1d0c30f0b00b956b0..62ca12e945810f0b6d76efb0424850e979d7b5f8 100644 (file)
@@ -370,40 +370,32 @@ static int amdgpu_amdkfd_remove_eviction_fence(struct amdgpu_bo *bo,
        return 0;
 }
 
-int amdgpu_amdkfd_remove_fence_on_pt_pd_bos(struct amdgpu_bo *bo)
+/**
+ * amdgpu_amdkfd_remove_all_eviction_fences - Remove all eviction fences
+ * @bo: the BO to remove the eviction fences from.
+ *
+ * This function should only be used on release when all references to the BO
+ * are already dropped. We remove the eviction fence from the private copy of
+ * the dma_resv object here since that is what is used during release to
+ * determine if the BO is idle or not.
+ */
+void amdgpu_amdkfd_remove_all_eviction_fences(struct amdgpu_bo *bo)
 {
-       struct amdgpu_bo *root = bo;
-       struct amdgpu_vm_bo_base *vm_bo;
-       struct amdgpu_vm *vm;
-       struct amdkfd_process_info *info;
-       struct amdgpu_amdkfd_fence *ef;
-       int ret;
-
-       /* we can always get vm_bo from root PD bo.*/
-       while (root->parent)
-               root = root->parent;
+       struct dma_resv *resv = &bo->tbo.base._resv;
+       struct dma_fence *fence, *stub;
+       struct dma_resv_iter cursor;
 
-       vm_bo = root->vm_bo;
-       if (!vm_bo)
-               return 0;
+       dma_resv_assert_held(resv);
 
-       vm = vm_bo->vm;
-       if (!vm)
-               return 0;
-
-       info = vm->process_info;
-       if (!info || !info->eviction_fence)
-               return 0;
-
-       ef = container_of(dma_fence_get(&info->eviction_fence->base),
-                       struct amdgpu_amdkfd_fence, base);
-
-       BUG_ON(!dma_resv_trylock(bo->tbo.base.resv));
-       ret = amdgpu_amdkfd_remove_eviction_fence(bo, ef);
-       dma_resv_unlock(bo->tbo.base.resv);
+       stub = dma_fence_get_stub();
+       dma_resv_for_each_fence(&cursor, resv, DMA_RESV_USAGE_BOOKKEEP, fence) {
+               if (!to_amdgpu_amdkfd_fence(fence))
+                       continue;
 
-       dma_fence_put(&ef->base);
-       return ret;
+               dma_resv_replace_fences(resv, fence->context, stub,
+                                       DMA_RESV_USAGE_BOOKKEEP);
+       }
+       dma_fence_put(stub);
 }
 
 static int amdgpu_amdkfd_bo_validate(struct amdgpu_bo *bo, uint32_t domain,
index 96f4b8904e9a6a651cab30e6ac60e086ea2442ac..80cd6f5273db3ab1c5224a5cfbcafdf7c90d7f4c 100644 (file)
@@ -1295,28 +1295,36 @@ void amdgpu_bo_release_notify(struct ttm_buffer_object *bo)
        if (abo->kfd_bo)
                amdgpu_amdkfd_release_notify(abo);
 
-       /* We only remove the fence if the resv has individualized. */
-       WARN_ON_ONCE(bo->type == ttm_bo_type_kernel
-                       && bo->base.resv != &bo->base._resv);
-       if (bo->base.resv == &bo->base._resv)
-               amdgpu_amdkfd_remove_fence_on_pt_pd_bos(abo);
+       /*
+        * We lock the private dma_resv object here and since the BO is about to
+        * be released nobody else should have a pointer to it.
+        * So when this locking here fails something is wrong with the reference
+        * counting.
+        */
+       if (WARN_ON_ONCE(!dma_resv_trylock(&bo->base._resv)))
+               return;
+
+       amdgpu_amdkfd_remove_all_eviction_fences(abo);
 
        if (!bo->resource || bo->resource->mem_type != TTM_PL_VRAM ||
            !(abo->flags & AMDGPU_GEM_CREATE_VRAM_WIPE_ON_RELEASE) ||
            adev->in_suspend || drm_dev_is_unplugged(adev_to_drm(adev)))
-               return;
+               goto out;
 
-       if (WARN_ON_ONCE(!dma_resv_trylock(bo->base.resv)))
-               return;
+       r = dma_resv_reserve_fences(&bo->base._resv, 1);
+       if (r)
+               goto out;
 
-       r = amdgpu_fill_buffer(abo, 0, bo->base.resv, &fence, true);
-       if (!WARN_ON(r)) {
-               amdgpu_vram_mgr_set_cleared(bo->resource);
-               amdgpu_bo_fence(abo, fence, false);
-               dma_fence_put(fence);
-       }
+       r = amdgpu_fill_buffer(abo, 0, &bo->base._resv, &fence, true);
+       if (WARN_ON(r))
+               goto out;
+
+       amdgpu_vram_mgr_set_cleared(bo->resource);
+       dma_resv_add_fence(&bo->base._resv, fence, DMA_RESV_USAGE_KERNEL);
+       dma_fence_put(fence);
 
-       dma_resv_unlock(bo->base.resv);
+out:
+       dma_resv_unlock(&bo->base._resv);
 }
 
 /**