// SPDX-License-Identifier: MIT
/*
 * Copyright © 2020 Intel Corporation
 */

#include "gen6_engine_cs.h"
#include "intel_engine.h"
#include "intel_engine_regs.h"
#include "intel_gpu_commands.h"
#include "intel_gt.h"
#include "intel_gt_irq.h"
#include "intel_gt_pm_irq.h"
#include "intel_ring.h"

#define HWS_SCRATCH_ADDR	(I915_GEM_HWS_SCRATCH * sizeof(u32))

/*
 * Emits a PIPE_CONTROL with a non-zero post-sync operation, for
 * implementing two workarounds on gen6. From section 1.4.7.1
 * "PIPE_CONTROL" of the Sandy Bridge PRM volume 2 part 1:
 *
 * [DevSNB-C+{W/A}] Before any depth stall flush (including those
 * produced by non-pipelined state commands), software needs to first
 * send a PIPE_CONTROL with no bits set except Post-Sync Operation !=
 * 0.
 *
 * [Dev-SNB{W/A}]: Before a PIPE_CONTROL with Write Cache Flush Enable
 * =1, a PIPE_CONTROL with any non-zero post-sync-op is required.
 *
 * And the workaround for these two requires this workaround first:
 *
 * [Dev-SNB{W/A}]: Pipe-control with CS-stall bit set must be sent
 * BEFORE the pipe-control with a post-sync op and no write-cache
 * flushes.
 *
 * And this last workaround is tricky because of the requirements on
 * that bit. From section 1.4.7.2.3 "Stall" of the Sandy Bridge PRM
 * volume 2 part 1:
 *
 *     "1 of the following must also be set:
 *      - Render Target Cache Flush Enable ([12] of DW1)
 *      - Depth Cache Flush Enable ([0] of DW1)
 *      - Stall at Pixel Scoreboard ([1] of DW1)
 *      - Depth Stall ([13] of DW1)
 *      - Post-Sync Operation ([13] of DW1)
 *      - Notify Enable ([8] of DW1)"
 *
 * The cache flushes require the workaround flush that triggered this
 * one, so we can't use it. Depth stall would trigger the same.
 * Post-sync nonzero is what triggered this second workaround, so we
 * can't use that one either. Notify enable is IRQs, which aren't
 * really our business. That leaves only stall at scoreboard.
 */
static int
gen6_emit_post_sync_nonzero_flush(struct i915_request *rq)
{
	u32 scratch_addr =
		intel_gt_scratch_offset(rq->engine->gt,
					INTEL_GT_SCRATCH_FIELD_RENDER_FLUSH);
	u32 *cs;

	cs = intel_ring_begin(rq, 6);
	if (IS_ERR(cs))
		return PTR_ERR(cs);

	*cs++ = GFX_OP_PIPE_CONTROL(5);
	*cs++ = PIPE_CONTROL_CS_STALL | PIPE_CONTROL_STALL_AT_SCOREBOARD;
	*cs++ = scratch_addr | PIPE_CONTROL_GLOBAL_GTT;
	*cs++ = 0; /* low dword */
	*cs++ = 0; /* high dword */
	*cs++ = MI_NOOP;
	intel_ring_advance(rq, cs);

	cs = intel_ring_begin(rq, 6);
	if (IS_ERR(cs))
		return PTR_ERR(cs);

	*cs++ = GFX_OP_PIPE_CONTROL(5);
	*cs++ = PIPE_CONTROL_QW_WRITE;
	*cs++ = scratch_addr | PIPE_CONTROL_GLOBAL_GTT;
	*cs++ = 0;
	*cs++ = 0;
	*cs++ = MI_NOOP;
	intel_ring_advance(rq, cs);

	return 0;
}
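
/*
 * gen6 render-ring flush: apply the SNB post-sync workaround above, then
 * emit a single PIPE_CONTROL carrying the requested flush/invalidate bits.
 */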
int gen6_emit_flush_rcs(struct i915_request *rq, u32 mode)
{
	u32 scratch_addr =
		intel_gt_scratch_offset(rq->engine->gt,
					INTEL_GT_SCRATCH_FIELD_RENDER_FLUSH);
	u32 *cs, flags = 0;
	int ret;

	/* Force SNB workarounds for PIPE_CONTROL flushes */
	ret = gen6_emit_post_sync_nonzero_flush(rq);
	if (ret)
		return ret;

	/*
	 * Just flush everything. Experiments have shown that reducing the
	 * number of bits based on the write domains has little performance
	 * impact. And when rearranging requests, the order of flushes is
	 * unknown.
	 */
	if (mode & EMIT_FLUSH) {
		flags |= PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH;
		flags |= PIPE_CONTROL_DEPTH_CACHE_FLUSH;
		/*
		 * Ensure that any following seqno writes only happen
		 * when the render cache is indeed flushed.
		 */
		flags |= PIPE_CONTROL_CS_STALL;
	}
	if (mode & EMIT_INVALIDATE) {
		flags |= PIPE_CONTROL_TLB_INVALIDATE;
		flags |= PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE;
		flags |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE;
		flags |= PIPE_CONTROL_VF_CACHE_INVALIDATE;
		flags |= PIPE_CONTROL_CONST_CACHE_INVALIDATE;
		flags |= PIPE_CONTROL_STATE_CACHE_INVALIDATE;
		/*
		 * TLB invalidate requires a post-sync write.
		 */
		flags |= PIPE_CONTROL_QW_WRITE | PIPE_CONTROL_CS_STALL;
	}

	cs = intel_ring_begin(rq, 4);
	if (IS_ERR(cs))
		return PTR_ERR(cs);

	*cs++ = GFX_OP_PIPE_CONTROL(4);
	*cs++ = flags;
	*cs++ = scratch_addr | PIPE_CONTROL_GLOBAL_GTT;
	*cs++ = 0;
	intel_ring_advance(rq, cs);

	return 0;
}
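
/*
 * gen6 render breadcrumb: replay the post-sync-nonzero workaround inline,
 * then flush and write the request seqno before raising the user interrupt.
 */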
u32 *gen6_emit_breadcrumb_rcs(struct i915_request *rq, u32 *cs)
{
	/* First we do the gen6_emit_post_sync_nonzero_flush w/a */
	*cs++ = GFX_OP_PIPE_CONTROL(4);
	*cs++ = PIPE_CONTROL_CS_STALL | PIPE_CONTROL_STALL_AT_SCOREBOARD;
	*cs++ = 0;
	*cs++ = 0;

	*cs++ = GFX_OP_PIPE_CONTROL(4);
	*cs++ = PIPE_CONTROL_QW_WRITE;
	*cs++ = intel_gt_scratch_offset(rq->engine->gt,
					INTEL_GT_SCRATCH_FIELD_DEFAULT) |
		PIPE_CONTROL_GLOBAL_GTT;
	*cs++ = 0;

	/* Finally we can flush and with it emit the breadcrumb */
	*cs++ = GFX_OP_PIPE_CONTROL(4);
	*cs++ = (PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH |
		 PIPE_CONTROL_DEPTH_CACHE_FLUSH |
		 PIPE_CONTROL_DC_FLUSH_ENABLE |
		 PIPE_CONTROL_QW_WRITE |
		 PIPE_CONTROL_CS_STALL);
	*cs++ = i915_request_active_seqno(rq) |
		PIPE_CONTROL_GLOBAL_GTT;
	*cs++ = rq->fence.seqno;

	*cs++ = MI_USER_INTERRUPT;
	*cs++ = MI_NOOP;

	rq->tail = intel_ring_offset(rq, cs);
	assert_ring_tail_valid(rq->ring, rq->tail);

	return cs;
}
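
/*
 * MI_FLUSH_DW based flush used by the non-render engines; the post-sync
 * store to the HWS scratch slot provides the required command barrier.
 */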
static int mi_flush_dw(struct i915_request *rq, u32 flags)
{
	u32 cmd, *cs;

	cs = intel_ring_begin(rq, 4);
	if (IS_ERR(cs))
		return PTR_ERR(cs);

	cmd = MI_FLUSH_DW;

	/*
	 * We always require a command barrier so that subsequent
	 * commands, such as breadcrumb interrupts, are strictly ordered
	 * wrt the contents of the write cache being flushed to memory
	 * (and thus being coherent from the CPU).
	 */
	cmd |= MI_FLUSH_DW_STORE_INDEX | MI_FLUSH_DW_OP_STOREDW;

	/*
	 * Bspec vol 1c.3 - blitter engine command streamer:
	 * "If ENABLED, all TLBs will be invalidated once the flush
	 * operation is complete. This bit is only valid when the
	 * Post-Sync Operation field is a value of 1h or 3h."
	 */
	cmd |= flags;

	*cs++ = cmd;
	*cs++ = HWS_SCRATCH_ADDR | MI_FLUSH_DW_USE_GTT;
	*cs++ = 0;
	*cs++ = MI_NOOP;

	intel_ring_advance(rq, cs);

	return 0;
}
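
/* Per-engine wrappers: only request invalidation when EMIT_INVALIDATE is set. */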
static int gen6_flush_dw(struct i915_request *rq, u32 mode, u32 invflags)
{
	return mi_flush_dw(rq, mode & EMIT_INVALIDATE ? invflags : 0);
}

int gen6_emit_flush_xcs(struct i915_request *rq, u32 mode)
{
	return gen6_flush_dw(rq, mode, MI_INVALIDATE_TLB);
}

int gen6_emit_flush_vcs(struct i915_request *rq, u32 mode)
{
	return gen6_flush_dw(rq, mode, MI_INVALIDATE_TLB | MI_INVALIDATE_BSD);
}
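
/*
 * Start a batch buffer. Batches run non-secure (non-privileged) unless
 * I915_DISPATCH_SECURE was requested.
 */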
int gen6_emit_bb_start(struct i915_request *rq,
		       u64 offset, u32 len,
		       unsigned int dispatch_flags)
{
	u32 security;
	u32 *cs;

	security = MI_BATCH_NON_SECURE_I965;
	if (dispatch_flags & I915_DISPATCH_SECURE)
		security = 0;

	cs = intel_ring_begin(rq, 2);
	if (IS_ERR(cs))
		return PTR_ERR(cs);

	cs = __gen6_emit_bb_start(cs, offset, security);
	intel_ring_advance(rq, cs);

	return 0;
}
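
/* Haswell variant: batches additionally run from the PPGTT. */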
int
hsw_emit_bb_start(struct i915_request *rq,
		  u64 offset, u32 len,
		  unsigned int dispatch_flags)
{
	u32 security;
	u32 *cs;

	security = MI_BATCH_PPGTT_HSW | MI_BATCH_NON_SECURE_HSW;
	if (dispatch_flags & I915_DISPATCH_SECURE)
		security = 0;

	cs = intel_ring_begin(rq, 2);
	if (IS_ERR(cs))
		return PTR_ERR(cs);

	cs = __gen6_emit_bb_start(cs, offset, security);
	intel_ring_advance(rq, cs);

	return 0;
}
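
/*
 * PIPE_CONTROL with CS stall, required before a PIPE_CONTROL that sets the
 * state-cache-invalidate bit (see gen7_emit_flush_rcs below).
 */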
static int gen7_stall_cs(struct i915_request *rq)
{
	u32 *cs;

	cs = intel_ring_begin(rq, 4);
	if (IS_ERR(cs))
		return PTR_ERR(cs);

	*cs++ = GFX_OP_PIPE_CONTROL(4);
	*cs++ = PIPE_CONTROL_CS_STALL | PIPE_CONTROL_STALL_AT_SCOREBOARD;
	*cs++ = 0;
	*cs++ = 0;
	intel_ring_advance(rq, cs);

	return 0;
}
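
/*
 * gen7 render-ring flush: a single PIPE_CONTROL, with the CS-stall
 * workarounds below applied unconditionally.
 */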
int gen7_emit_flush_rcs(struct i915_request *rq, u32 mode)
{
	u32 scratch_addr =
		intel_gt_scratch_offset(rq->engine->gt,
					INTEL_GT_SCRATCH_FIELD_RENDER_FLUSH);
	u32 *cs, flags = 0;

	/*
	 * Ensure that any following seqno writes only happen when the render
	 * cache is indeed flushed.
	 *
	 * Workaround: 4th PIPE_CONTROL command (except the ones with only
	 * read-cache invalidate bits set) must have the CS_STALL bit set. We
	 * don't try to be clever and just set it unconditionally.
	 */
	flags |= PIPE_CONTROL_CS_STALL;

	/*
	 * CS_STALL suggests at least a post-sync write.
	 */
	flags |= PIPE_CONTROL_QW_WRITE;
	flags |= PIPE_CONTROL_GLOBAL_GTT_IVB;

	/*
	 * Just flush everything. Experiments have shown that reducing the
	 * number of bits based on the write domains has little performance
	 * impact.
	 */
	if (mode & EMIT_FLUSH) {
		flags |= PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH;
		flags |= PIPE_CONTROL_DEPTH_CACHE_FLUSH;
		flags |= PIPE_CONTROL_DC_FLUSH_ENABLE;
		flags |= PIPE_CONTROL_FLUSH_ENABLE;
	}
	if (mode & EMIT_INVALIDATE) {
		flags |= PIPE_CONTROL_TLB_INVALIDATE;
		flags |= PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE;
		flags |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE;
		flags |= PIPE_CONTROL_VF_CACHE_INVALIDATE;
		flags |= PIPE_CONTROL_CONST_CACHE_INVALIDATE;
		flags |= PIPE_CONTROL_STATE_CACHE_INVALIDATE;
		flags |= PIPE_CONTROL_MEDIA_STATE_CLEAR;

		/*
		 * Workaround: we must issue a pipe_control with CS-stall bit
		 * set before a pipe_control command that has the state cache
		 * invalidate bit set.
		 */
		gen7_stall_cs(rq);
	}

	cs = intel_ring_begin(rq, 4);
	if (IS_ERR(cs))
		return PTR_ERR(cs);

	*cs++ = GFX_OP_PIPE_CONTROL(4);
	*cs++ = flags;
	*cs++ = scratch_addr;
	*cs++ = 0;
	intel_ring_advance(rq, cs);

	return 0;
}
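
/*
 * gen7 render breadcrumb: flush everything, write the request seqno via the
 * post-sync op and raise a user interrupt.
 */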
u32 *gen7_emit_breadcrumb_rcs(struct i915_request *rq, u32 *cs)
{
	*cs++ = GFX_OP_PIPE_CONTROL(4);
	*cs++ = (PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH |
		 PIPE_CONTROL_DEPTH_CACHE_FLUSH |
		 PIPE_CONTROL_DC_FLUSH_ENABLE |
		 PIPE_CONTROL_FLUSH_ENABLE |
		 PIPE_CONTROL_QW_WRITE |
		 PIPE_CONTROL_GLOBAL_GTT_IVB |
		 PIPE_CONTROL_CS_STALL);
	*cs++ = i915_request_active_seqno(rq);
	*cs++ = rq->fence.seqno;

	*cs++ = MI_USER_INTERRUPT;
	*cs++ = MI_NOOP;

	rq->tail = intel_ring_offset(rq, cs);
	assert_ring_tail_valid(rq->ring, rq->tail);

	return cs;
}
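
/*
 * gen6 breadcrumb for the non-render rings: MI_FLUSH_DW posts the seqno
 * into the hardware status page.
 */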
u32 *gen6_emit_breadcrumb_xcs(struct i915_request *rq, u32 *cs)
{
	GEM_BUG_ON(i915_request_active_timeline(rq)->hwsp_ggtt != rq->engine->status_page.vma);
	GEM_BUG_ON(offset_in_page(rq->hwsp_seqno) != I915_GEM_HWS_SEQNO_ADDR);

	*cs++ = MI_FLUSH_DW | MI_FLUSH_DW_OP_STOREDW | MI_FLUSH_DW_STORE_INDEX;
	*cs++ = I915_GEM_HWS_SEQNO_ADDR | MI_FLUSH_DW_USE_GTT;
	*cs++ = rq->fence.seqno;

	*cs++ = MI_USER_INTERRUPT;

	rq->tail = intel_ring_offset(rq, cs);
	assert_ring_tail_valid(rq->ring, rq->tail);

	return cs;
}
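
/*
 * gen7 variant: pad the breadcrumb with GEN7_XCS_WA extra seqno stores and a
 * final MI_FLUSH_DW, believed to ensure the seqno write has landed before
 * the user interrupt is raised.
 */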
#define GEN7_XCS_WA 32
u32 *gen7_emit_breadcrumb_xcs(struct i915_request *rq, u32 *cs)
{
	int i;

	GEM_BUG_ON(i915_request_active_timeline(rq)->hwsp_ggtt != rq->engine->status_page.vma);
	GEM_BUG_ON(offset_in_page(rq->hwsp_seqno) != I915_GEM_HWS_SEQNO_ADDR);

	*cs++ = MI_FLUSH_DW | MI_INVALIDATE_TLB |
		MI_FLUSH_DW_OP_STOREDW | MI_FLUSH_DW_STORE_INDEX;
	*cs++ = I915_GEM_HWS_SEQNO_ADDR | MI_FLUSH_DW_USE_GTT;
	*cs++ = rq->fence.seqno;

	for (i = 0; i < GEN7_XCS_WA; i++) {
		*cs++ = MI_STORE_DWORD_INDEX;
		*cs++ = I915_GEM_HWS_SEQNO_ADDR;
		*cs++ = rq->fence.seqno;
	}

	*cs++ = MI_FLUSH_DW;
	*cs++ = 0;
	*cs++ = 0;

	*cs++ = MI_USER_INTERRUPT;
	*cs++ = MI_NOOP;

	rq->tail = intel_ring_offset(rq, cs);
	assert_ring_tail_valid(rq->ring, rq->tail);

	return cs;
}
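
/*
 * Engine interrupt masking via RING_IMR; the GT-level IMR (PM IMR for the
 * Haswell VEBOX) is only updated after a posting read flushes the write.
 */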
void gen6_irq_enable(struct intel_engine_cs *engine)
{
	ENGINE_WRITE(engine, RING_IMR,
		     ~(engine->irq_enable_mask | engine->irq_keep_mask));

	/* Flush/delay to ensure the RING_IMR is active before the GT IMR */
	ENGINE_POSTING_READ(engine, RING_IMR);

	gen5_gt_enable_irq(engine->gt, engine->irq_enable_mask);
}

void gen6_irq_disable(struct intel_engine_cs *engine)
{
	ENGINE_WRITE(engine, RING_IMR, ~engine->irq_keep_mask);
	gen5_gt_disable_irq(engine->gt, engine->irq_enable_mask);
}

void hsw_irq_enable_vecs(struct intel_engine_cs *engine)
{
	ENGINE_WRITE(engine, RING_IMR, ~engine->irq_enable_mask);

	/* Flush/delay to ensure the RING_IMR is active before the GT IMR */
	ENGINE_POSTING_READ(engine, RING_IMR);

	gen6_gt_pm_unmask_irq(engine->gt, engine->irq_enable_mask);
}

void hsw_irq_disable_vecs(struct intel_engine_cs *engine)
{
	ENGINE_WRITE(engine, RING_IMR, ~0);
	gen6_gt_pm_mask_irq(engine->gt, engine->irq_enable_mask);
}