drm/xe: Handle -EDEADLK case in preempt worker

author Niranjana Vishwanathapura <niranjana.vishwanathapura@intel.com>

Mon, 8 May 2023 05:22:23 +0000 (05:22 +0000)

committer Rodrigo Vivi <rodrigo.vivi@intel.com>

Tue, 19 Dec 2023 23:33:49 +0000 (18:33 -0500)
author Niranjana Vishwanathapura <niranjana.vishwanathapura@intel.com>
Mon, 8 May 2023 05:22:23 +0000 (05:22 +0000)
committer Rodrigo Vivi <rodrigo.vivi@intel.com>
Tue, 19 Dec 2023 23:33:49 +0000 (18:33 -0500)
diff --git a/drivers/gpu/drm/xe/xe_vm.c b/drivers/gpu/drm/xe/xe_vm.c

index 06b559ff80bf5689422249085c2e3756036b1746..d9579bf5002dbd0f43b7baf3f986d22298bc943f 100644 (file)
--- a/drivers/gpu/drm/xe/xe_vm.c
+++ b/drivers/gpu/drm/xe/xe_vm.c
@@ -10,6 +10,7 @@
  #include <drm/ttm/ttm_execbuf_util.h>
  #include <drm/ttm/ttm_tt.h>
  #include <drm/xe_drm.h>
+#include <linux/delay.h>
  #include <linux/kthread.h>
  #include <linux/mm.h>
  #include <linux/swap.h>
@@ -508,6 +509,8 @@ void xe_vm_unlock_dma_resv(struct xe_vm *vm,
                 kvfree(tv);
  }
  
+#define XE_VM_REBIND_RETRY_TIMEOUT_MS 1000
+
  static void preempt_rebind_work_func(struct work_struct *w)
  {
         struct xe_vm *vm = container_of(w, struct xe_vm, preempt.rebind_work);
@@ -519,6 +522,7 @@ static void preempt_rebind_work_func(struct work_struct *w)
         struct dma_fence *rebind_fence;
         unsigned int fence_count = 0;
         LIST_HEAD(preempt_fences);
+       ktime_t end = 0;
         int err;
         long wait;
         int __maybe_unused tries = 0;
@@ -637,6 +641,24 @@ out_unlock_outer:
                 trace_xe_vm_rebind_worker_retry(vm);
                 goto retry;
         }
+
+       /*
+        * With multiple active VMs, under memory pressure, it is possible that
+        * ttm_bo_validate() run into -EDEADLK and in such case returns -ENOMEM.
+        * Until ttm properly handles locking in such scenarios, best thing the
+        * driver can do is retry with a timeout. Killing the VM or putting it
+        * in error state after timeout or other error scenarios is still TBD.
+        */
+       if (err == -ENOMEM) {
+               ktime_t cur = ktime_get();
+
+               end = end ? : ktime_add_ms(cur, XE_VM_REBIND_RETRY_TIMEOUT_MS);
+               if (ktime_before(cur, end)) {
+                       msleep(20);
+                       trace_xe_vm_rebind_worker_retry(vm);
+                       goto retry;
+               }
+       }
         up_write(&vm->lock);
  
         free_preempt_fences(&preempt_fences);
author	Niranjana Vishwanathapura <niranjana.vishwanathapura@intel.com>
	Mon, 8 May 2023 05:22:23 +0000 (05:22 +0000)
committer	Rodrigo Vivi <rodrigo.vivi@intel.com>
	Tue, 19 Dec 2023 23:33:49 +0000 (18:33 -0500)