drm/xe: Use ring ops TLB invalidation for rebinds
author Thomas Hellström <thomas.hellstrom@linux.intel.com>
Wed, 27 Mar 2024 09:11:33 +0000 (10:11 +0100)
committer Lucas De Marchi <lucas.demarchi@intel.com>
Thu, 4 Apr 2024 13:32:22 +0000 (08:32 -0500)
For each rebind we insert a GuC TLB invalidation and add a
corresponding unordered TLB invalidation fence. This might
add a huge number of TLB invalidation fences to wait for, so
rather than doing that, defer the TLB invalidation to the
next ring ops for each affected exec queue. Since the TLB
is invalidated on exec_queue switch, we need to invalidate
once for each affected exec_queue.

v2:
- Simplify if-statements around the tlb_flush_seqno.
  (Matthew Brost)
- Add some comments and asserts.

Fixes: 5387e865d90e ("drm/xe: Add TLB invalidation fence after rebinds issued from execs")
Cc: Matthew Brost <matthew.brost@intel.com>
Cc: <stable@vger.kernel.org> # v6.8+
Signed-off-by: Thomas Hellström <thomas.hellstrom@linux.intel.com>
Reviewed-by: Matthew Brost <matthew.brost@intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20240327091136.3271-2-thomas.hellstrom@linux.intel.com
(cherry picked from commit 4fc4899e86f7afbd09f4bcb899f0fc57e0296e62)
Signed-off-by: Lucas De Marchi <lucas.demarchi@intel.com>
drivers/gpu/drm/xe/xe_exec_queue_types.h
drivers/gpu/drm/xe/xe_pt.c
drivers/gpu/drm/xe/xe_ring_ops.c
drivers/gpu/drm/xe/xe_sched_job.c
drivers/gpu/drm/xe/xe_sched_job_types.h
drivers/gpu/drm/xe/xe_vm_types.h

index 62b3d9d1d7cdd4f2d65c55db414a00b7bd7fbd06..462b331950320c0e49901fb09c32a8cdcffc1745 100644 (file)
@@ -148,6 +148,11 @@ struct xe_exec_queue {
        const struct xe_ring_ops *ring_ops;
        /** @entity: DRM sched entity for this exec queue (1 to 1 relationship) */
        struct drm_sched_entity *entity;
+       /**
+        * @tlb_flush_seqno: The seqno of the last rebind tlb flush performed
+        * Protected by @vm's resv. Unused if @vm == NULL.
+        */
+       u64 tlb_flush_seqno;
        /** @lrc: logical ring context for this exec queue */
        struct xe_lrc lrc[];
 };
index 7f54bc3e389d58f8023f3a1092aa47d3e852a16b..9fd65f5d3d8036ec02866fb985da78a2b706da00 100644 (file)
@@ -1254,11 +1254,13 @@ __xe_pt_bind_vma(struct xe_tile *tile, struct xe_vma *vma, struct xe_exec_queue
         * non-faulting LR, in particular on user-space batch buffer chaining,
         * it needs to be done here.
         */
-       if ((rebind && !xe_vm_in_lr_mode(vm) && !vm->batch_invalidate_tlb) ||
-           (!rebind && xe_vm_has_scratch(vm) && xe_vm_in_preempt_fence_mode(vm))) {
+       if ((!rebind && xe_vm_has_scratch(vm) && xe_vm_in_preempt_fence_mode(vm))) {
                ifence = kzalloc(sizeof(*ifence), GFP_KERNEL);
                if (!ifence)
                        return ERR_PTR(-ENOMEM);
+       } else if (rebind && !xe_vm_in_lr_mode(vm)) {
+               /* We bump also if batch_invalidate_tlb is true */
+               vm->tlb_flush_seqno++;
        }
 
        rfence = kzalloc(sizeof(*rfence), GFP_KERNEL);
index c4edffcd4a320666d576d950ab15dc614545a053..5b2b37b598130ac464a2c344bad52b731e778e28 100644 (file)
@@ -219,10 +219,9 @@ static void __emit_job_gen12_simple(struct xe_sched_job *job, struct xe_lrc *lrc
 {
        u32 dw[MAX_JOB_SIZE_DW], i = 0;
        u32 ppgtt_flag = get_ppgtt_flag(job);
-       struct xe_vm *vm = job->q->vm;
        struct xe_gt *gt = job->q->gt;
 
-       if (vm && vm->batch_invalidate_tlb) {
+       if (job->ring_ops_flush_tlb) {
                dw[i++] = preparser_disable(true);
                i = emit_flush_imm_ggtt(xe_lrc_start_seqno_ggtt_addr(lrc),
                                        seqno, true, dw, i);
@@ -270,7 +269,6 @@ static void __emit_job_gen12_video(struct xe_sched_job *job, struct xe_lrc *lrc,
        struct xe_gt *gt = job->q->gt;
        struct xe_device *xe = gt_to_xe(gt);
        bool decode = job->q->class == XE_ENGINE_CLASS_VIDEO_DECODE;
-       struct xe_vm *vm = job->q->vm;
 
        dw[i++] = preparser_disable(true);
 
@@ -282,13 +280,13 @@ static void __emit_job_gen12_video(struct xe_sched_job *job, struct xe_lrc *lrc,
                        i = emit_aux_table_inv(gt, VE0_AUX_INV, dw, i);
        }
 
-       if (vm && vm->batch_invalidate_tlb)
+       if (job->ring_ops_flush_tlb)
                i = emit_flush_imm_ggtt(xe_lrc_start_seqno_ggtt_addr(lrc),
                                        seqno, true, dw, i);
 
        dw[i++] = preparser_disable(false);
 
-       if (!vm || !vm->batch_invalidate_tlb)
+       if (!job->ring_ops_flush_tlb)
                i = emit_store_imm_ggtt(xe_lrc_start_seqno_ggtt_addr(lrc),
                                        seqno, dw, i);
 
@@ -317,7 +315,6 @@ static void __emit_job_gen12_render_compute(struct xe_sched_job *job,
        struct xe_gt *gt = job->q->gt;
        struct xe_device *xe = gt_to_xe(gt);
        bool lacks_render = !(gt->info.engine_mask & XE_HW_ENGINE_RCS_MASK);
-       struct xe_vm *vm = job->q->vm;
        u32 mask_flags = 0;
 
        dw[i++] = preparser_disable(true);
@@ -327,7 +324,7 @@ static void __emit_job_gen12_render_compute(struct xe_sched_job *job,
                mask_flags = PIPE_CONTROL_3D_ENGINE_FLAGS;
 
        /* See __xe_pt_bind_vma() for a discussion on TLB invalidations. */
-       i = emit_pipe_invalidate(mask_flags, vm && vm->batch_invalidate_tlb, dw, i);
+       i = emit_pipe_invalidate(mask_flags, job->ring_ops_flush_tlb, dw, i);
 
        /* hsdes: 1809175790 */
        if (has_aux_ccs(xe))
index 8151ddafb940756d87dbca45e6d3407354535ce4..b0c7fa4693cfe4a999b93b3878cb72c6150ebcbd 100644 (file)
@@ -250,6 +250,16 @@ bool xe_sched_job_completed(struct xe_sched_job *job)
 
 void xe_sched_job_arm(struct xe_sched_job *job)
 {
+       struct xe_exec_queue *q = job->q;
+       struct xe_vm *vm = q->vm;
+
+       if (vm && !xe_sched_job_is_migration(q) && !xe_vm_in_lr_mode(vm) &&
+           (vm->batch_invalidate_tlb || vm->tlb_flush_seqno != q->tlb_flush_seqno)) {
+               xe_vm_assert_held(vm);
+               q->tlb_flush_seqno = vm->tlb_flush_seqno;
+               job->ring_ops_flush_tlb = true;
+       }
+
        drm_sched_job_arm(&job->drm);
 }
 
index b1d83da50a53da59b6d72af1bbd21c8d98ca3517..5e12724219fdd485f2b770bd4b31e78aa2ab42af 100644 (file)
@@ -39,6 +39,8 @@ struct xe_sched_job {
        } user_fence;
        /** @migrate_flush_flags: Additional flush flags for migration jobs */
        u32 migrate_flush_flags;
+       /** @ring_ops_flush_tlb: The ring ops need to flush TLB before payload. */
+       bool ring_ops_flush_tlb;
        /** @batch_addr: batch buffer address of job */
        u64 batch_addr[];
 };
index ae5fb565f6bf48d52e29c811a8333793e4e128fd..5747f136d24d1f3790f8ae928b86e52373c4cec7 100644 (file)
@@ -264,6 +264,11 @@ struct xe_vm {
                bool capture_once;
        } error_capture;
 
+       /**
+        * @tlb_flush_seqno: Required TLB flush seqno for the next exec.
+        * protected by the vm resv.
+        */
+       u64 tlb_flush_seqno;
        /** @batch_invalidate_tlb: Always invalidate TLB before batch start */
        bool batch_invalidate_tlb;
        /** @xef: XE file handle for tracking this VM's drm client */