drm/xe: Add TDR for invalidation fence timeout cleanup
author Matthew Brost <matthew.brost@intel.com>
Tue, 24 Jan 2023 18:35:59 +0000 (10:35 -0800)
committer Rodrigo Vivi <rodrigo.vivi@intel.com>
Tue, 19 Dec 2023 23:27:46 +0000 (18:27 -0500)
Endless fences are not good; add a TDR (timeout detection and recovery)
to clean up any invalidation fences which have not received an
invalidation message within a timeout period.
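
As an illustrative sketch only (not part of this patch): with the TDR in
place, a hypothetical waiter on an invalidation fence observes -ETIME
once the timeout fires, instead of blocking forever:

	/* hypothetical caller; fence setup and refcounting omitted */
	dma_fence_wait(&fence->base, false);
	if (dma_fence_get_status(&fence->base) == -ETIME)
		pr_debug("invalidation timed out, fence signaled by TDR\n");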

Signed-off-by: Matthew Brost <matthew.brost@intel.com>
Signed-off-by: Rodrigo Vivi <rodrigo.vivi@intel.com>
Reviewed-by: Niranjana Vishwanathapura <niranjana.vishwanathapura@intel.com>
drivers/gpu/drm/xe/xe_gt_tlb_invalidation.c
drivers/gpu/drm/xe/xe_gt_tlb_invalidation_types.h
drivers/gpu/drm/xe/xe_gt_types.h
drivers/gpu/drm/xe/xe_trace.h

index 4d179357ce65dd92aaadd2b361782ac8abe69414..9e026fd0a45d9804a7eee43e5f52eb8ea170a436 100644
@@ -9,12 +9,45 @@
 #include "xe_guc_ct.h"
 #include "xe_trace.h"
 
+#define TLB_TIMEOUT    (HZ / 4)
+
 static struct xe_gt *
 guc_to_gt(struct xe_guc *guc)
 {
        return container_of(guc, struct xe_gt, uc.guc);
 }
 
+static void xe_gt_tlb_fence_timeout(struct work_struct *work)
+{
+       struct xe_gt *gt = container_of(work, struct xe_gt,
+                                       tlb_invalidation.fence_tdr.work);
+       struct xe_gt_tlb_invalidation_fence *fence, *next;
+
+       mutex_lock(&gt->uc.guc.ct.lock);
+       list_for_each_entry_safe(fence, next,
+                                &gt->tlb_invalidation.pending_fences, link) {
+               s64 since_inval_ms = ktime_ms_delta(ktime_get(),
+                                                   fence->invalidation_time);
+
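+               /*
+                * Fences are added in submission order, so the first
+                * fence still inside the timeout window means all later
+                * ones are too; stop the walk here.
+                */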
+               if (msecs_to_jiffies(since_inval_ms) < TLB_TIMEOUT)
+                       break;
+
+               trace_xe_gt_tlb_invalidation_fence_timeout(fence);
+               drm_err(&gt_to_xe(gt)->drm, "TLB invalidation fence timeout, seqno=%d\n",
+                       fence->seqno);
+
+               list_del(&fence->link);
+               fence->base.error = -ETIME;
+               dma_fence_signal(&fence->base);
+               dma_fence_put(&fence->base);
+       }
+       if (!list_empty(&gt->tlb_invalidation.pending_fences))
+               queue_delayed_work(system_wq,
+                                  &gt->tlb_invalidation.fence_tdr,
+                                  TLB_TIMEOUT);
+       mutex_unlock(&gt->uc.guc.ct.lock);
+}
+
 /**
  * xe_gt_tlb_invalidation_init - Initialize GT TLB invalidation state
  * @gt: graphics tile
@@ -30,6 +63,8 @@ int xe_gt_tlb_invalidation_init(struct xe_gt *gt)
        INIT_LIST_HEAD(&gt->tlb_invalidation.pending_fences);
        spin_lock_init(&gt->tlb_invalidation.lock);
        gt->tlb_invalidation.fence_context = dma_fence_context_alloc(1);
+       INIT_DELAYED_WORK(&gt->tlb_invalidation.fence_tdr,
+                         xe_gt_tlb_fence_timeout);
 
        return 0;
 }
@@ -44,6 +79,8 @@ int xe_gt_tlb_invalidation_init(struct xe_gt *gt)
 {
        struct xe_gt_tlb_invalidation_fence *fence, *next;
 
+       cancel_delayed_work(&gt->tlb_invalidation.fence_tdr);
+
        mutex_lock(&gt->uc.guc.ct.lock);
        list_for_each_entry_safe(fence, next,
                                 &gt->tlb_invalidation.pending_fences, link) {
@@ -67,6 +104,7 @@ static int send_tlb_invalidation(struct xe_guc *guc,
        };
        int seqno;
        int ret;
+       bool queue_work;
 
        /*
         * XXX: The seqno algorithm relies on TLB invalidation being processed
@@ -76,10 +114,7 @@ static int send_tlb_invalidation(struct xe_guc *guc,
        mutex_lock(&guc->ct.lock);
        seqno = gt->tlb_invalidation.seqno;
        if (fence) {
-               /*
-                * FIXME: How to deal TLB invalidation timeout, right now we
-                * just have an endless fence which isn't ideal.
-                */
+               queue_work = list_empty(&gt->tlb_invalidation.pending_fences);
                fence->seqno = seqno;
                list_add_tail(&fence->link,
                              &gt->tlb_invalidation.pending_fences);
@@ -92,6 +127,13 @@ static int send_tlb_invalidation(struct xe_guc *guc,
                gt->tlb_invalidation.seqno = 1;
        ret = xe_guc_ct_send_locked(&guc->ct, action, ARRAY_SIZE(action),
                                    G2H_LEN_DW_TLB_INVALIDATE, 1);
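+       /* Stamp the send time so the TDR can age this fence later */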
+       if (!ret && fence) {
+               fence->invalidation_time = ktime_get();
+               if (queue_work)
+                       queue_delayed_work(system_wq,
+                                          &gt->tlb_invalidation.fence_tdr,
+                                          TLB_TIMEOUT);
+       }
        if (!ret)
                ret = seqno;
        mutex_unlock(&guc->ct.lock);
@@ -152,7 +194,7 @@ int xe_gt_tlb_invalidation_wait(struct xe_gt *gt, int seqno)
         */
        ret = wait_event_timeout(guc->ct.wq,
                                 tlb_invalidation_seqno_past(gt, seqno),
-                                HZ / 5);
+                                TLB_TIMEOUT);
        if (!ret) {
                drm_err(&xe->drm, "TLB invalidation timed out, seqno=%d, recv=%d\n",
                        seqno, gt->tlb_invalidation.seqno_recv);
@@ -201,6 +243,12 @@ int xe_guc_tlb_invalidation_done_handler(struct xe_guc *guc, u32 *msg, u32 len)
        if (fence && tlb_invalidation_seqno_past(gt, fence->seqno)) {
                trace_xe_gt_tlb_invalidation_fence_signal(fence);
                list_del(&fence->link);
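+               /*
+                * Re-arm the TDR for the remaining fences, or stop it once
+                * the pending list is empty.
+                */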
+               if (!list_empty(&gt->tlb_invalidation.pending_fences))
+                       mod_delayed_work(system_wq,
+                                        &gt->tlb_invalidation.fence_tdr,
+                                        TLB_TIMEOUT);
+               else
+                       cancel_delayed_work(&gt->tlb_invalidation.fence_tdr);
                dma_fence_signal(&fence->base);
                dma_fence_put(&fence->base);
        }
index ab57c14c6d144b960467a65aa55a5c09106e3d0f..934c828efe31cb728d7e751555e9525233f30ae6 100644
@@ -21,6 +21,8 @@ struct xe_gt_tlb_invalidation_fence {
        struct list_head link;
        /** @seqno: seqno of TLB invalidation to signal fence on */
        int seqno;
+       /** @invalidation_time: time of TLB invalidation */
+       ktime_t invalidation_time;
 };
 
 #endif
index 3b2d9842add7bf86e5de0c5c5054ef8291998b90..a40fab262ac9e40ffde334aa91b764f8e4384563 100644
@@ -174,6 +174,11 @@ struct xe_gt {
                 * invalidations, protected by CT lock
                 */
                struct list_head pending_fences;
+               /**
+                * @fence_tdr: schedules a delayed call to
+                * xe_gt_tlb_fence_timeout after the timeout interval is over.
+                */
+               struct delayed_work fence_tdr;
                /** @fence_context: context for TLB invalidation fences */
                u64 fence_context;
                /**
index b5b0f1bff7ec87ce3bb2fe75e8255388a8cb0f86..d1cd4b57a9740a33cdb1fd548f0a0a03f973d689 100644
@@ -70,6 +70,11 @@ DEFINE_EVENT(xe_gt_tlb_invalidation_fence, xe_gt_tlb_invalidation_fence_signal,
             TP_ARGS(fence)
 );
 
+DEFINE_EVENT(xe_gt_tlb_invalidation_fence, xe_gt_tlb_invalidation_fence_timeout,
+            TP_PROTO(struct xe_gt_tlb_invalidation_fence *fence),
+            TP_ARGS(fence)
+);
+
 DECLARE_EVENT_CLASS(xe_bo,
                    TP_PROTO(struct xe_bo *bo),
                    TP_ARGS(bo),