// SPDX-License-Identifier: MIT
/*
 * Copyright © 2020 Intel Corporation
 */

#include "gen6_engine_cs.h"
#include "intel_engine.h"
#include "intel_engine_regs.h"
#include "intel_gpu_commands.h"
#include "intel_gt.h"
#include "intel_gt_irq.h"
#include "intel_gt_pm_irq.h"
#include "intel_ring.h"

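/* Byte offset of the scratch slot (the I915_GEM_HWS_SCRATCH dword index) in the HWSP. */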
#define HWS_SCRATCH_ADDR        (I915_GEM_HWS_SCRATCH * sizeof(u32))

/*
 * Emits a PIPE_CONTROL with a non-zero post-sync operation, for
 * implementing two workarounds on gen6.  From section 1.4.7.1
 * "PIPE_CONTROL" of the Sandy Bridge PRM volume 2 part 1:
 *
 * [DevSNB-C+{W/A}] Before any depth stall flush (including those
 * produced by non-pipelined state commands), software needs to first
 * send a PIPE_CONTROL with no bits set except Post-Sync Operation !=
 * 0.
 *
 * [Dev-SNB{W/A}]: Before a PIPE_CONTROL with Write Cache Flush Enable
 * =1, a PIPE_CONTROL with any non-zero post-sync-op is required.
 *
 * And the workaround for these two requires this workaround first:
 *
 * [Dev-SNB{W/A}]: Pipe-control with CS-stall bit set must be sent
 * BEFORE the pipe-control with a post-sync op and no write-cache
 * flushes.
 *
 * And this last workaround is tricky because of the requirements on
 * that bit.  From section 1.4.7.2.3 "Stall" of the Sandy Bridge PRM
 * volume 2 part 1:
 *
 *     "1 of the following must also be set:
 *      - Render Target Cache Flush Enable ([12] of DW1)
 *      - Depth Cache Flush Enable ([0] of DW1)
 *      - Stall at Pixel Scoreboard ([1] of DW1)
 *      - Depth Stall ([13] of DW1)
 *      - Post-Sync Operation ([13] of DW1)
 *      - Notify Enable ([8] of DW1)"
 *
 * The cache flushes require the workaround flush that triggered this
 * one, so we can't use it.  Depth stall would trigger the same.
 * Post-sync nonzero is what triggered this second workaround, so we
 * can't use that one either.  Notify enable is IRQs, which aren't
 * really our business.  That leaves only stall at scoreboard.
 */
static int
gen6_emit_post_sync_nonzero_flush(struct i915_request *rq)
{
        u32 scratch_addr =
                intel_gt_scratch_offset(rq->engine->gt,
                                        INTEL_GT_SCRATCH_FIELD_RENDER_FLUSH);
        u32 *cs;

        cs = intel_ring_begin(rq, 6);
        if (IS_ERR(cs))
                return PTR_ERR(cs);

        *cs++ = GFX_OP_PIPE_CONTROL(5);
        *cs++ = PIPE_CONTROL_CS_STALL | PIPE_CONTROL_STALL_AT_SCOREBOARD;
        *cs++ = scratch_addr | PIPE_CONTROL_GLOBAL_GTT;
        *cs++ = 0; /* low dword */
        *cs++ = 0; /* high dword */
        *cs++ = MI_NOOP;
        intel_ring_advance(rq, cs);

        cs = intel_ring_begin(rq, 6);
        if (IS_ERR(cs))
                return PTR_ERR(cs);

        *cs++ = GFX_OP_PIPE_CONTROL(5);
        *cs++ = PIPE_CONTROL_QW_WRITE;
        *cs++ = scratch_addr | PIPE_CONTROL_GLOBAL_GTT;
        *cs++ = 0;
        *cs++ = 0;
        *cs++ = MI_NOOP;
        intel_ring_advance(rq, cs);

        return 0;
}

int gen6_emit_flush_rcs(struct i915_request *rq, u32 mode)
{
        u32 scratch_addr =
                intel_gt_scratch_offset(rq->engine->gt,
                                        INTEL_GT_SCRATCH_FIELD_RENDER_FLUSH);
        u32 *cs, flags = 0;
        int ret;

        /* Force SNB workarounds for PIPE_CONTROL flushes */
        ret = gen6_emit_post_sync_nonzero_flush(rq);
        if (ret)
                return ret;

        /*
         * Just flush everything.  Experiments have shown that reducing the
         * number of bits based on the write domains has little performance
         * impact. And when rearranging requests, the order of flushes is
         * unknown.
         */
        if (mode & EMIT_FLUSH) {
                flags |= PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH;
                flags |= PIPE_CONTROL_DEPTH_CACHE_FLUSH;
                /*
                 * Ensure that any following seqno writes only happen
                 * when the render cache is indeed flushed.
                 */
                flags |= PIPE_CONTROL_CS_STALL;
        }
        if (mode & EMIT_INVALIDATE) {
                flags |= PIPE_CONTROL_TLB_INVALIDATE;
                flags |= PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE;
                flags |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE;
                flags |= PIPE_CONTROL_VF_CACHE_INVALIDATE;
                flags |= PIPE_CONTROL_CONST_CACHE_INVALIDATE;
                flags |= PIPE_CONTROL_STATE_CACHE_INVALIDATE;
                /*
                 * TLB invalidate requires a post-sync write.
                 */
                flags |= PIPE_CONTROL_QW_WRITE | PIPE_CONTROL_CS_STALL;
        }

        cs = intel_ring_begin(rq, 4);
        if (IS_ERR(cs))
                return PTR_ERR(cs);

        *cs++ = GFX_OP_PIPE_CONTROL(4);
        *cs++ = flags;
        *cs++ = scratch_addr | PIPE_CONTROL_GLOBAL_GTT;
        *cs++ = 0;
        intel_ring_advance(rq, cs);

        return 0;
}

u32 *gen6_emit_breadcrumb_rcs(struct i915_request *rq, u32 *cs)
{
        /* First we do the gen6_emit_post_sync_nonzero_flush w/a */
        *cs++ = GFX_OP_PIPE_CONTROL(4);
        *cs++ = PIPE_CONTROL_CS_STALL | PIPE_CONTROL_STALL_AT_SCOREBOARD;
        *cs++ = 0;
        *cs++ = 0;

        *cs++ = GFX_OP_PIPE_CONTROL(4);
        *cs++ = PIPE_CONTROL_QW_WRITE;
        *cs++ = intel_gt_scratch_offset(rq->engine->gt,
                                        INTEL_GT_SCRATCH_FIELD_DEFAULT) |
                PIPE_CONTROL_GLOBAL_GTT;
        *cs++ = 0;

        /* Finally we can flush and with it emit the breadcrumb */
        *cs++ = GFX_OP_PIPE_CONTROL(4);
        *cs++ = (PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH |
                 PIPE_CONTROL_DEPTH_CACHE_FLUSH |
                 PIPE_CONTROL_DC_FLUSH_ENABLE |
                 PIPE_CONTROL_QW_WRITE |
                 PIPE_CONTROL_CS_STALL);
        *cs++ = i915_request_active_seqno(rq) |
                PIPE_CONTROL_GLOBAL_GTT;
        *cs++ = rq->fence.seqno;

        *cs++ = MI_USER_INTERRUPT;
        *cs++ = MI_NOOP;

        rq->tail = intel_ring_offset(rq, cs);
        assert_ring_tail_valid(rq->ring, rq->tail);

        return cs;
}

static int mi_flush_dw(struct i915_request *rq, u32 flags)
{
        u32 cmd, *cs;

        cs = intel_ring_begin(rq, 4);
        if (IS_ERR(cs))
                return PTR_ERR(cs);

        cmd = MI_FLUSH_DW;

        /*
         * We always require a command barrier so that subsequent
         * commands, such as breadcrumb interrupts, are strictly ordered
         * wrt the contents of the write cache being flushed to memory
         * (and thus being coherent from the CPU).
         */
        cmd |= MI_FLUSH_DW_STORE_INDEX | MI_FLUSH_DW_OP_STOREDW;

        /*
         * Bspec vol 1c.3 - blitter engine command streamer:
         * "If ENABLED, all TLBs will be invalidated once the flush
         * operation is complete. This bit is only valid when the
         * Post-Sync Operation field is a value of 1h or 3h."
         */
        cmd |= flags;

        *cs++ = cmd;
        *cs++ = HWS_SCRATCH_ADDR | MI_FLUSH_DW_USE_GTT;
        *cs++ = 0;
        *cs++ = MI_NOOP;

        intel_ring_advance(rq, cs);

        return 0;
}

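/* Apply the engine-specific invalidate bits only when the request asked for an invalidation. */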
static int gen6_flush_dw(struct i915_request *rq, u32 mode, u32 invflags)
{
        return mi_flush_dw(rq, mode & EMIT_INVALIDATE ? invflags : 0);
}

int gen6_emit_flush_xcs(struct i915_request *rq, u32 mode)
{
        return gen6_flush_dw(rq, mode, MI_INVALIDATE_TLB);
}

int gen6_emit_flush_vcs(struct i915_request *rq, u32 mode)
{
        return gen6_flush_dw(rq, mode, MI_INVALIDATE_TLB | MI_INVALIDATE_BSD);
}

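/*
 * Start a batch buffer. Unless the caller asked for a secure (privileged)
 * dispatch, the batch is marked non-secure so it executes without elevated
 * privileges.
 */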
int gen6_emit_bb_start(struct i915_request *rq,
                       u64 offset, u32 len,
                       unsigned int dispatch_flags)
{
        u32 security;
        u32 *cs;

        security = MI_BATCH_NON_SECURE_I965;
        if (dispatch_flags & I915_DISPATCH_SECURE)
                security = 0;

        cs = intel_ring_begin(rq, 2);
        if (IS_ERR(cs))
                return PTR_ERR(cs);

        cs = __gen6_emit_bb_start(cs, offset, security);
        intel_ring_advance(rq, cs);

        return 0;
}

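/*
 * Haswell variant: the default (non-secure) dispatch uses the HSW encoding
 * and also selects the PPGTT address space for the batch.
 */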
int
hsw_emit_bb_start(struct i915_request *rq,
                  u64 offset, u32 len,
                  unsigned int dispatch_flags)
{
        u32 security;
        u32 *cs;

        security = MI_BATCH_PPGTT_HSW | MI_BATCH_NON_SECURE_HSW;
        if (dispatch_flags & I915_DISPATCH_SECURE)
                security = 0;

        cs = intel_ring_begin(rq, 2);
        if (IS_ERR(cs))
                return PTR_ERR(cs);

        cs = __gen6_emit_bb_start(cs, offset, security);
        intel_ring_advance(rq, cs);

        return 0;
}

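/*
 * Emit a bare PIPE_CONTROL that only stalls the command streamer at the
 * pixel scoreboard; used as the workaround precursor to a flush that sets
 * the state cache invalidate bit.
 */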
static int gen7_stall_cs(struct i915_request *rq)
{
        u32 *cs;

        cs = intel_ring_begin(rq, 4);
        if (IS_ERR(cs))
                return PTR_ERR(cs);

        *cs++ = GFX_OP_PIPE_CONTROL(4);
        *cs++ = PIPE_CONTROL_CS_STALL | PIPE_CONTROL_STALL_AT_SCOREBOARD;
        *cs++ = 0;
        *cs++ = 0;
        intel_ring_advance(rq, cs);

        return 0;
}

int gen7_emit_flush_rcs(struct i915_request *rq, u32 mode)
{
        u32 scratch_addr =
                intel_gt_scratch_offset(rq->engine->gt,
                                        INTEL_GT_SCRATCH_FIELD_RENDER_FLUSH);
        u32 *cs, flags = 0;

        /*
         * Ensure that any following seqno writes only happen when the render
         * cache is indeed flushed.
         *
         * Workaround: 4th PIPE_CONTROL command (except the ones with only
         * read-cache invalidate bits set) must have the CS_STALL bit set. We
         * don't try to be clever and just set it unconditionally.
         */
        flags |= PIPE_CONTROL_CS_STALL;

        /*
         * CS_STALL suggests at least a post-sync write.
         */
        flags |= PIPE_CONTROL_QW_WRITE;
        flags |= PIPE_CONTROL_GLOBAL_GTT_IVB;

        /*
         * Just flush everything.  Experiments have shown that reducing the
         * number of bits based on the write domains has little performance
         * impact.
         */
        if (mode & EMIT_FLUSH) {
                flags |= PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH;
                flags |= PIPE_CONTROL_DEPTH_CACHE_FLUSH;
                flags |= PIPE_CONTROL_DC_FLUSH_ENABLE;
                flags |= PIPE_CONTROL_FLUSH_ENABLE;
        }
        if (mode & EMIT_INVALIDATE) {
                flags |= PIPE_CONTROL_TLB_INVALIDATE;
                flags |= PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE;
                flags |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE;
                flags |= PIPE_CONTROL_VF_CACHE_INVALIDATE;
                flags |= PIPE_CONTROL_CONST_CACHE_INVALIDATE;
                flags |= PIPE_CONTROL_STATE_CACHE_INVALIDATE;
                flags |= PIPE_CONTROL_MEDIA_STATE_CLEAR;

                /*
                 * Workaround: we must issue a pipe_control with CS-stall bit
                 * set before a pipe_control command that has the state cache
                 * invalidate bit set.
                 */
                gen7_stall_cs(rq);
        }

        cs = intel_ring_begin(rq, 4);
        if (IS_ERR(cs))
                return PTR_ERR(cs);

        *cs++ = GFX_OP_PIPE_CONTROL(4);
        *cs++ = flags;
        *cs++ = scratch_addr;
        *cs++ = 0;
        intel_ring_advance(rq, cs);

        return 0;
}

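/*
 * Flush all write caches, post-sync write the breadcrumb seqno, then raise
 * a user interrupt to signal completion.
 */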
u32 *gen7_emit_breadcrumb_rcs(struct i915_request *rq, u32 *cs)
{
        *cs++ = GFX_OP_PIPE_CONTROL(4);
        *cs++ = (PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH |
                 PIPE_CONTROL_DEPTH_CACHE_FLUSH |
                 PIPE_CONTROL_DC_FLUSH_ENABLE |
                 PIPE_CONTROL_FLUSH_ENABLE |
                 PIPE_CONTROL_QW_WRITE |
                 PIPE_CONTROL_GLOBAL_GTT_IVB |
                 PIPE_CONTROL_CS_STALL);
        *cs++ = i915_request_active_seqno(rq);
        *cs++ = rq->fence.seqno;

        *cs++ = MI_USER_INTERRUPT;
        *cs++ = MI_NOOP;

        rq->tail = intel_ring_offset(rq, cs);
        assert_ring_tail_valid(rq->ring, rq->tail);

        return cs;
}

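/*
 * Non-render engines: store the breadcrumb seqno into the status page via a
 * post-sync MI_FLUSH_DW and signal completion with a user interrupt.
 */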
u32 *gen6_emit_breadcrumb_xcs(struct i915_request *rq, u32 *cs)
{
        GEM_BUG_ON(i915_request_active_timeline(rq)->hwsp_ggtt != rq->engine->status_page.vma);
        GEM_BUG_ON(offset_in_page(rq->hwsp_seqno) != I915_GEM_HWS_SEQNO_ADDR);

        *cs++ = MI_FLUSH_DW | MI_FLUSH_DW_OP_STOREDW | MI_FLUSH_DW_STORE_INDEX;
        *cs++ = I915_GEM_HWS_SEQNO_ADDR | MI_FLUSH_DW_USE_GTT;
        *cs++ = rq->fence.seqno;

        *cs++ = MI_USER_INTERRUPT;

        rq->tail = intel_ring_offset(rq, cs);
        assert_ring_tail_valid(rq->ring, rq->tail);

        return cs;
}

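/*
 * On gen7 the xcs breadcrumb additionally repeats the seqno store
 * GEN7_XCS_WA times and follows it with a further MI_FLUSH_DW; this is a
 * workaround, presumably to guarantee the seqno write has landed in the
 * status page before the user interrupt is serviced.
 */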
#define GEN7_XCS_WA 32
u32 *gen7_emit_breadcrumb_xcs(struct i915_request *rq, u32 *cs)
{
        int i;

        GEM_BUG_ON(i915_request_active_timeline(rq)->hwsp_ggtt != rq->engine->status_page.vma);
        GEM_BUG_ON(offset_in_page(rq->hwsp_seqno) != I915_GEM_HWS_SEQNO_ADDR);

        *cs++ = MI_FLUSH_DW | MI_INVALIDATE_TLB |
                MI_FLUSH_DW_OP_STOREDW | MI_FLUSH_DW_STORE_INDEX;
        *cs++ = I915_GEM_HWS_SEQNO_ADDR | MI_FLUSH_DW_USE_GTT;
        *cs++ = rq->fence.seqno;

        for (i = 0; i < GEN7_XCS_WA; i++) {
                *cs++ = MI_STORE_DWORD_INDEX;
                *cs++ = I915_GEM_HWS_SEQNO_ADDR;
                *cs++ = rq->fence.seqno;
        }

        *cs++ = MI_FLUSH_DW;
        *cs++ = 0;
        *cs++ = 0;

        *cs++ = MI_USER_INTERRUPT;
        *cs++ = MI_NOOP;

        rq->tail = intel_ring_offset(rq, cs);
        assert_ring_tail_valid(rq->ring, rq->tail);

        return cs;
}
#undef GEN7_XCS_WA

void gen6_irq_enable(struct intel_engine_cs *engine)
{
        ENGINE_WRITE(engine, RING_IMR,
                     ~(engine->irq_enable_mask | engine->irq_keep_mask));

        /* Flush/delay to ensure the RING_IMR is active before the GT IMR */
        ENGINE_POSTING_READ(engine, RING_IMR);

        gen5_gt_enable_irq(engine->gt, engine->irq_enable_mask);
}

void gen6_irq_disable(struct intel_engine_cs *engine)
{
        ENGINE_WRITE(engine, RING_IMR, ~engine->irq_keep_mask);
        gen5_gt_disable_irq(engine->gt, engine->irq_enable_mask);
}

void hsw_irq_enable_vecs(struct intel_engine_cs *engine)
{
        ENGINE_WRITE(engine, RING_IMR, ~engine->irq_enable_mask);

        /* Flush/delay to ensure the RING_IMR is active before the GT IMR */
        ENGINE_POSTING_READ(engine, RING_IMR);

        gen6_gt_pm_unmask_irq(engine->gt, engine->irq_enable_mask);
}

void hsw_irq_disable_vecs(struct intel_engine_cs *engine)
{
        ENGINE_WRITE(engine, RING_IMR, ~0);
        gen6_gt_pm_mask_irq(engine->gt, engine->irq_enable_mask);
}