1 // SPDX-License-Identifier: MIT
2 /*
3  * Copyright © 2014-2018 Intel Corporation
4  */
5
6 #include "i915_drv.h"
7 #include "intel_context.h"
8 #include "intel_engine_pm.h"
9 #include "intel_engine_regs.h"
10 #include "intel_gpu_commands.h"
11 #include "intel_gt.h"
12 #include "intel_gt_mcr.h"
13 #include "intel_gt_regs.h"
14 #include "intel_ring.h"
15 #include "intel_workarounds.h"
16
17 /**
18  * DOC: Hardware workarounds
19  *
20  * This file is intended as a central place to implement most [1]_ of the
21  * required workarounds for hardware to work as originally intended. They fall
22  * into five basic categories depending on how/when they are applied:
23  *
24  * - Workarounds that touch registers that are saved/restored to/from the HW
25  *   context image. The list is emitted (via Load Register Immediate commands)
26  *   every time a new context is created.
27  * - GT workarounds. The list of these WAs is applied whenever these registers
28  *   revert to default values (on GPU reset, suspend/resume [2]_, etc.).
29  * - Display workarounds. The list is applied during display clock-gating
30  *   initialization.
31  * - Workarounds that whitelist a privileged register, so that UMDs can manage
32  *   them directly. This is just a special case of an MMIO workaround (as we
33  *   write the list of these to-be-whitelisted registers to some special HW
34  *   registers).
35  * - Workaround batchbuffers, that get executed automatically by the hardware
36  *   on every HW context restore.
37  *
38  * .. [1] Please notice that there are other WAs that, due to their nature,
39  *    cannot be applied from a central place. Those are peppered around the rest
40  *    of the code, as needed.
41  *
42  * .. [2] Technically, some registers are power context saved & restored, so they
43  *    survive a suspend/resume. In practice, writing them again is not too
44  *    costly and simplifies things. We can revisit this in the future.
45  *
46  * Layout
47  * ~~~~~~
48  *
49  * Keep things in this file ordered by WA type, as per the above (context, GT,
50  * display, register whitelist, batchbuffer). Then, inside each type, keep the
51  * following order:
52  *
53  * - Infrastructure functions and macros
54  * - WAs per platform in standard gen/chrono order
55  * - Public functions to init or apply the given workaround type.
56  */
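/*
 * Illustrative example (not taken from this file; "xyz" is a hypothetical
 * platform): a per-platform hook builds its list with the wa_*() helpers
 * defined below, and the public init/apply functions at the end of each
 * section handle emission and verification:
 *
 *	static void xyz_ctx_workarounds_init(struct intel_engine_cs *engine,
 *					     struct i915_wa_list *wal)
 *	{
 *		wa_masked_en(wal, INSTPM, INSTPM_FORCE_ORDERING);
 *	}
 */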
57
58 static void wa_init_start(struct i915_wa_list *wal, const char *name, const char *engine_name)
59 {
60         wal->name = name;
61         wal->engine_name = engine_name;
62 }
63
64 #define WA_LIST_CHUNK (1 << 4)
65
66 static void wa_init_finish(struct i915_wa_list *wal)
67 {
68         /* Trim unused entries. */
69         if (!IS_ALIGNED(wal->count, WA_LIST_CHUNK)) {
70                 struct i915_wa *list = kmemdup(wal->list,
71                                                wal->count * sizeof(*list),
72                                                GFP_KERNEL);
73
74                 if (list) {
75                         kfree(wal->list);
76                         wal->list = list;
77                 }
78         }
79
80         if (!wal->count)
81                 return;
82
83         DRM_DEBUG_DRIVER("Initialized %u %s workarounds on %s\n",
84                          wal->wa_count, wal->name, wal->engine_name);
85 }
86
87 static void _wa_add(struct i915_wa_list *wal, const struct i915_wa *wa)
88 {
89         unsigned int addr = i915_mmio_reg_offset(wa->reg);
90         unsigned int start = 0, end = wal->count;
91         const unsigned int grow = WA_LIST_CHUNK;
92         struct i915_wa *wa_;
93
94         GEM_BUG_ON(!is_power_of_2(grow));
95
96         if (IS_ALIGNED(wal->count, grow)) { /* Either uninitialized or full. */
97                 struct i915_wa *list;
98
99                 list = kmalloc_array(ALIGN(wal->count + 1, grow), sizeof(*wa),
100                                      GFP_KERNEL);
101                 if (!list) {
102                         DRM_ERROR("No space for workaround init!\n");
103                         return;
104                 }
105
106                 if (wal->list) {
107                         memcpy(list, wal->list, sizeof(*wa) * wal->count);
108                         kfree(wal->list);
109                 }
110
111                 wal->list = list;
112         }
113
114         while (start < end) {
115                 unsigned int mid = start + (end - start) / 2;
116
117                 if (i915_mmio_reg_offset(wal->list[mid].reg) < addr) {
118                         start = mid + 1;
119                 } else if (i915_mmio_reg_offset(wal->list[mid].reg) > addr) {
120                         end = mid;
121                 } else {
122                         wa_ = &wal->list[mid];
123
124                         if ((wa->clr | wa_->clr) && !(wa->clr & ~wa_->clr)) {
125                                 DRM_ERROR("Discarding overwritten w/a for reg %04x (clear: %08x, set: %08x)\n",
126                                           i915_mmio_reg_offset(wa_->reg),
127                                           wa_->clr, wa_->set);
128
129                                 wa_->set &= ~wa->clr;
130                         }
131
132                         wal->wa_count++;
133                         wa_->set |= wa->set;
134                         wa_->clr |= wa->clr;
135                         wa_->read |= wa->read;
136                         return;
137                 }
138         }
139
140         wal->wa_count++;
141         wa_ = &wal->list[wal->count++];
142         *wa_ = *wa;
143
144         while (wa_-- > wal->list) {
145                 GEM_BUG_ON(i915_mmio_reg_offset(wa_[0].reg) ==
146                            i915_mmio_reg_offset(wa_[1].reg));
147                 if (i915_mmio_reg_offset(wa_[1].reg) >
148                     i915_mmio_reg_offset(wa_[0].reg))
149                         break;
150
151                 swap(wa_[1], wa_[0]);
152         }
153 }
154
155 static void wa_add(struct i915_wa_list *wal, i915_reg_t reg,
156                    u32 clear, u32 set, u32 read_mask, bool masked_reg)
157 {
158         struct i915_wa wa = {
159                 .reg  = reg,
160                 .clr  = clear,
161                 .set  = set,
162                 .read = read_mask,
163                 .masked_reg = masked_reg,
164         };
165
166         _wa_add(wal, &wa);
167 }
168
169 static void wa_mcr_add(struct i915_wa_list *wal, i915_mcr_reg_t reg,
170                        u32 clear, u32 set, u32 read_mask, bool masked_reg)
171 {
172         struct i915_wa wa = {
173                 .mcr_reg = reg,
174                 .clr  = clear,
175                 .set  = set,
176                 .read = read_mask,
177                 .masked_reg = masked_reg,
178                 .is_mcr = 1,
179         };
180
181         _wa_add(wal, &wa);
182 }
183
184 static void
185 wa_write_clr_set(struct i915_wa_list *wal, i915_reg_t reg, u32 clear, u32 set)
186 {
187         wa_add(wal, reg, clear, set, clear, false);
188 }
189
190 static void
191 wa_mcr_write_clr_set(struct i915_wa_list *wal, i915_mcr_reg_t reg, u32 clear, u32 set)
192 {
193         wa_mcr_add(wal, reg, clear, set, clear, false);
194 }
195
196 static void
197 wa_write(struct i915_wa_list *wal, i915_reg_t reg, u32 set)
198 {
199         wa_write_clr_set(wal, reg, ~0, set);
200 }
201
202 static void
203 wa_write_or(struct i915_wa_list *wal, i915_reg_t reg, u32 set)
204 {
205         wa_write_clr_set(wal, reg, set, set);
206 }
207
208 static void
209 wa_mcr_write_or(struct i915_wa_list *wal, i915_mcr_reg_t reg, u32 set)
210 {
211         wa_mcr_write_clr_set(wal, reg, set, set);
212 }
213
214 static void
215 wa_write_clr(struct i915_wa_list *wal, i915_reg_t reg, u32 clr)
216 {
217         wa_write_clr_set(wal, reg, clr, 0);
218 }
219
220 static void
221 wa_mcr_write_clr(struct i915_wa_list *wal, i915_mcr_reg_t reg, u32 clr)
222 {
223         wa_mcr_write_clr_set(wal, reg, clr, 0);
224 }
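/*
 * Quick reference for the unmasked-write helpers above (derived from the
 * code): wa_write() overwrites the whole register, wa_write_or() only sets
 * bits, and wa_write_clr() only clears bits.  In all three cases the "clear"
 * mask is also recorded as the read mask used when the applied value is
 * later verified.
 */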
225
226 /*
227  * WA operations on "masked register". A masked register has the upper 16 bits
228  * documented as "masked" in b-spec. Its purpose is to allow writing to just a
229  * portion of the register without an RMW: you simply write in the upper 16 bits
230  * the mask of bits you are going to modify.
231  *
232  * The wa_masked_* family of functions already does the necessary operations to
233  * calculate the mask based on the parameters passed, so the user only has to
234  * provide the lower 16 bits of that register.
235  */
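/*
 * For illustration: _MASKED_BIT_ENABLE(BIT(3)) expands (ignoring the
 * type-checking wrapper) to (BIT(3) << 16) | BIT(3), i.e. the upper half
 * selects the bit being written and the lower half carries its new value,
 * leaving every other bit of the register untouched by the hardware.
 */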
236
237 static void
238 wa_masked_en(struct i915_wa_list *wal, i915_reg_t reg, u32 val)
239 {
240         wa_add(wal, reg, 0, _MASKED_BIT_ENABLE(val), val, true);
241 }
242
243 static void
244 wa_mcr_masked_en(struct i915_wa_list *wal, i915_mcr_reg_t reg, u32 val)
245 {
246         wa_mcr_add(wal, reg, 0, _MASKED_BIT_ENABLE(val), val, true);
247 }
248
249 static void
250 wa_masked_dis(struct i915_wa_list *wal, i915_reg_t reg, u32 val)
251 {
252         wa_add(wal, reg, 0, _MASKED_BIT_DISABLE(val), val, true);
253 }
254
255 static void
256 wa_mcr_masked_dis(struct i915_wa_list *wal, i915_mcr_reg_t reg, u32 val)
257 {
258         wa_mcr_add(wal, reg, 0, _MASKED_BIT_DISABLE(val), val, true);
259 }
260
261 static void
262 wa_masked_field_set(struct i915_wa_list *wal, i915_reg_t reg,
263                     u32 mask, u32 val)
264 {
265         wa_add(wal, reg, 0, _MASKED_FIELD(mask, val), mask, true);
266 }
267
268 static void
269 wa_mcr_masked_field_set(struct i915_wa_list *wal, i915_mcr_reg_t reg,
270                         u32 mask, u32 val)
271 {
272         wa_mcr_add(wal, reg, 0, _MASKED_FIELD(mask, val), mask, true);
273 }
274
275 static void gen6_ctx_workarounds_init(struct intel_engine_cs *engine,
276                                       struct i915_wa_list *wal)
277 {
278         wa_masked_en(wal, INSTPM, INSTPM_FORCE_ORDERING);
279 }
280
281 static void gen7_ctx_workarounds_init(struct intel_engine_cs *engine,
282                                       struct i915_wa_list *wal)
283 {
284         wa_masked_en(wal, INSTPM, INSTPM_FORCE_ORDERING);
285 }
286
287 static void gen8_ctx_workarounds_init(struct intel_engine_cs *engine,
288                                       struct i915_wa_list *wal)
289 {
290         wa_masked_en(wal, INSTPM, INSTPM_FORCE_ORDERING);
291
292         /* WaDisableAsyncFlipPerfMode:bdw,chv */
293         wa_masked_en(wal, RING_MI_MODE(RENDER_RING_BASE), ASYNC_FLIP_PERF_DISABLE);
294
295         /* WaDisablePartialInstShootdown:bdw,chv */
296         wa_mcr_masked_en(wal, GEN8_ROW_CHICKEN,
297                          PARTIAL_INSTRUCTION_SHOOTDOWN_DISABLE);
298
299         /* Use Force Non-Coherent whenever executing a 3D context. This is a
300          * workaround for a possible hang in the unlikely event a TLB
301          * invalidation occurs during a PSD flush.
302          */
303         /* WaForceEnableNonCoherent:bdw,chv */
304         /* WaHdcDisableFetchWhenMasked:bdw,chv */
305         wa_masked_en(wal, HDC_CHICKEN0,
306                      HDC_DONOT_FETCH_MEM_WHEN_MASKED |
307                      HDC_FORCE_NON_COHERENT);
308
309         /* From the Haswell PRM, Command Reference: Registers, CACHE_MODE_0:
310          * "The Hierarchical Z RAW Stall Optimization allows non-overlapping
311          *  polygons in the same 8x4 pixel/sample area to be processed without
312          *  stalling waiting for the earlier ones to write to Hierarchical Z
313          *  buffer."
314          *
315          * This optimization is off by default for BDW and CHV; turn it on.
316          */
317         wa_masked_dis(wal, CACHE_MODE_0_GEN7, HIZ_RAW_STALL_OPT_DISABLE);
318
319         /* Wa4x4STCOptimizationDisable:bdw,chv */
320         wa_masked_en(wal, CACHE_MODE_1, GEN8_4x4_STC_OPTIMIZATION_DISABLE);
321
322         /*
323          * BSpec recommends 8x4 when MSAA is used,
324          * however in practice 16x4 seems fastest.
325          *
326          * Note that PS/WM thread counts depend on the WIZ hashing
327          * disable bit, which we don't touch here, but it's good
328          * to keep in mind (see 3DSTATE_PS and 3DSTATE_WM).
329          */
330         wa_masked_field_set(wal, GEN7_GT_MODE,
331                             GEN6_WIZ_HASHING_MASK,
332                             GEN6_WIZ_HASHING_16x4);
333 }
334
335 static void bdw_ctx_workarounds_init(struct intel_engine_cs *engine,
336                                      struct i915_wa_list *wal)
337 {
338         struct drm_i915_private *i915 = engine->i915;
339
340         gen8_ctx_workarounds_init(engine, wal);
341
342         /* WaDisableThreadStallDopClockGating:bdw (pre-production) */
343         wa_mcr_masked_en(wal, GEN8_ROW_CHICKEN, STALL_DOP_GATING_DISABLE);
344
345         /* WaDisableDopClockGating:bdw
346          *
347          * Also see the related UCGTCL1 write in bdw_init_clock_gating()
348          * to disable EUTC clock gating.
349          */
350         wa_mcr_masked_en(wal, GEN8_ROW_CHICKEN2,
351                          DOP_CLOCK_GATING_DISABLE);
352
353         wa_mcr_masked_en(wal, GEN8_HALF_SLICE_CHICKEN3,
354                          GEN8_SAMPLER_POWER_BYPASS_DIS);
355
356         wa_masked_en(wal, HDC_CHICKEN0,
357                      /* WaForceContextSaveRestoreNonCoherent:bdw */
358                      HDC_FORCE_CONTEXT_SAVE_RESTORE_NON_COHERENT |
359                      /* WaDisableFenceDestinationToSLM:bdw (pre-prod) */
360                      (IS_BDW_GT3(i915) ? HDC_FENCE_DEST_SLM_DISABLE : 0));
361 }
362
363 static void chv_ctx_workarounds_init(struct intel_engine_cs *engine,
364                                      struct i915_wa_list *wal)
365 {
366         gen8_ctx_workarounds_init(engine, wal);
367
368         /* WaDisableThreadStallDopClockGating:chv */
369         wa_mcr_masked_en(wal, GEN8_ROW_CHICKEN, STALL_DOP_GATING_DISABLE);
370
371         /* Improve HiZ throughput on CHV. */
372         wa_masked_en(wal, HIZ_CHICKEN, CHV_HZ_8X8_MODE_IN_1X);
373 }
374
375 static void gen9_ctx_workarounds_init(struct intel_engine_cs *engine,
376                                       struct i915_wa_list *wal)
377 {
378         struct drm_i915_private *i915 = engine->i915;
379
380         if (HAS_LLC(i915)) {
381                 /* WaCompressedResourceSamplerPbeMediaNewHashMode:skl,kbl
382                  *
383                  * Must match Display Engine. See
384                  * WaCompressedResourceDisplayNewHashMode.
385                  */
386                 wa_masked_en(wal, COMMON_SLICE_CHICKEN2,
387                              GEN9_PBE_COMPRESSED_HASH_SELECTION);
388                 wa_mcr_masked_en(wal, GEN9_HALF_SLICE_CHICKEN7,
389                                  GEN9_SAMPLER_HASH_COMPRESSED_READ_ADDR);
390         }
391
392         /* WaClearFlowControlGpgpuContextSave:skl,bxt,kbl,glk,cfl */
393         /* WaDisablePartialInstShootdown:skl,bxt,kbl,glk,cfl */
394         wa_mcr_masked_en(wal, GEN8_ROW_CHICKEN,
395                          FLOW_CONTROL_ENABLE |
396                          PARTIAL_INSTRUCTION_SHOOTDOWN_DISABLE);
397
398         /* WaEnableYV12BugFixInHalfSliceChicken7:skl,bxt,kbl,glk,cfl */
399         /* WaEnableSamplerGPGPUPreemptionSupport:skl,bxt,kbl,cfl */
400         wa_mcr_masked_en(wal, GEN9_HALF_SLICE_CHICKEN7,
401                          GEN9_ENABLE_YV12_BUGFIX |
402                          GEN9_ENABLE_GPGPU_PREEMPTION);
403
404         /* Wa4x4STCOptimizationDisable:skl,bxt,kbl,glk,cfl */
405         /* WaDisablePartialResolveInVc:skl,bxt,kbl,cfl */
406         wa_masked_en(wal, CACHE_MODE_1,
407                      GEN8_4x4_STC_OPTIMIZATION_DISABLE |
408                      GEN9_PARTIAL_RESOLVE_IN_VC_DISABLE);
409
410         /* WaCcsTlbPrefetchDisable:skl,bxt,kbl,glk,cfl */
411         wa_mcr_masked_dis(wal, GEN9_HALF_SLICE_CHICKEN5,
412                           GEN9_CCS_TLB_PREFETCH_ENABLE);
413
414         /* WaForceContextSaveRestoreNonCoherent:skl,bxt,kbl,cfl */
415         wa_masked_en(wal, HDC_CHICKEN0,
416                      HDC_FORCE_CONTEXT_SAVE_RESTORE_NON_COHERENT |
417                      HDC_FORCE_CSR_NON_COHERENT_OVR_DISABLE);
418
419         /* WaForceEnableNonCoherent and WaDisableHDCInvalidation are
420          * both tied to WaForceContextSaveRestoreNonCoherent
421          * in some hsds for skl. We keep the tie for all gen9. The
422          * documentation is a bit hazy and so we want to get common behaviour,
423          * even though there is no clear evidence we would need both on kbl/bxt.
424          * This area has been a source of system hangs so we play it safe
425          * and mimic the skl regardless of what bspec says.
426          *
427          * Use Force Non-Coherent whenever executing a 3D context. This
428          * is a workaround for a possible hang in the unlikely event
429          * a TLB invalidation occurs during a PSD flush.
430          */
431
432         /* WaForceEnableNonCoherent:skl,bxt,kbl,cfl */
433         wa_masked_en(wal, HDC_CHICKEN0,
434                      HDC_FORCE_NON_COHERENT);
435
436         /* WaDisableSamplerPowerBypassForSOPingPong:skl,bxt,kbl,cfl */
437         if (IS_SKYLAKE(i915) ||
438             IS_KABYLAKE(i915) ||
439             IS_COFFEELAKE(i915) ||
440             IS_COMETLAKE(i915))
441                 wa_mcr_masked_en(wal, GEN8_HALF_SLICE_CHICKEN3,
442                                  GEN8_SAMPLER_POWER_BYPASS_DIS);
443
444         /* WaDisableSTUnitPowerOptimization:skl,bxt,kbl,glk,cfl */
445         wa_mcr_masked_en(wal, HALF_SLICE_CHICKEN2, GEN8_ST_PO_DISABLE);
446
447         /*
448          * Supporting preemption with fine-granularity requires changes in the
449          * batch buffer programming. Since we can't break old userspace, we
450          * need to set our default preemption level to a safe value. Userspace is
451          * still able to use more fine-grained preemption levels, since in
452          * WaEnablePreemptionGranularityControlByUMD we're whitelisting the
453          * per-ctx register. As such, WaDisable{3D,GPGPU}MidCmdPreemption are
454          * not real HW workarounds, but merely a way to start using preemption
455          * while maintaining old contract with userspace.
456          */
457
458         /* WaDisable3DMidCmdPreemption:skl,bxt,glk,cfl,[cnl] */
459         wa_masked_dis(wal, GEN8_CS_CHICKEN1, GEN9_PREEMPT_3D_OBJECT_LEVEL);
460
461         /* WaDisableGPGPUMidCmdPreemption:skl,bxt,glk,cfl,[cnl] */
462         wa_masked_field_set(wal, GEN8_CS_CHICKEN1,
463                             GEN9_PREEMPT_GPGPU_LEVEL_MASK,
464                             GEN9_PREEMPT_GPGPU_COMMAND_LEVEL);
465
466         /* WaClearHIZ_WM_CHICKEN3:bxt,glk */
467         if (IS_GEN9_LP(i915))
468                 wa_masked_en(wal, GEN9_WM_CHICKEN3, GEN9_FACTOR_IN_CLR_VAL_HIZ);
469 }
470
471 static void skl_tune_iz_hashing(struct intel_engine_cs *engine,
472                                 struct i915_wa_list *wal)
473 {
474         struct intel_gt *gt = engine->gt;
475         u8 vals[3] = { 0, 0, 0 };
476         unsigned int i;
477
478         for (i = 0; i < 3; i++) {
479                 u8 ss;
480
481                 /*
482                  * Only consider slices where one, and only one, subslice has 7
483                  * EUs
484                  */
485                 if (!is_power_of_2(gt->info.sseu.subslice_7eu[i]))
486                         continue;
487
488                 /*
489                  * subslice_7eu[i] != 0 (because of the check above) and
490                  * ss_max == 4 (maximum number of subslices possible per slice)
491                  *
492                  * ->    0 <= ss <= 3;
493                  */
494                 ss = ffs(gt->info.sseu.subslice_7eu[i]) - 1;
495                 vals[i] = 3 - ss;
496         }
497
498         if (vals[0] == 0 && vals[1] == 0 && vals[2] == 0)
499                 return;
500
501         /* Tune IZ hashing. See intel_device_info_runtime_init() */
502         wa_masked_field_set(wal, GEN7_GT_MODE,
503                             GEN9_IZ_HASHING_MASK(2) |
504                             GEN9_IZ_HASHING_MASK(1) |
505                             GEN9_IZ_HASHING_MASK(0),
506                             GEN9_IZ_HASHING(2, vals[2]) |
507                             GEN9_IZ_HASHING(1, vals[1]) |
508                             GEN9_IZ_HASHING(0, vals[0]));
509 }
510
511 static void skl_ctx_workarounds_init(struct intel_engine_cs *engine,
512                                      struct i915_wa_list *wal)
513 {
514         gen9_ctx_workarounds_init(engine, wal);
515         skl_tune_iz_hashing(engine, wal);
516 }
517
518 static void bxt_ctx_workarounds_init(struct intel_engine_cs *engine,
519                                      struct i915_wa_list *wal)
520 {
521         gen9_ctx_workarounds_init(engine, wal);
522
523         /* WaDisableThreadStallDopClockGating:bxt */
524         wa_mcr_masked_en(wal, GEN8_ROW_CHICKEN,
525                          STALL_DOP_GATING_DISABLE);
526
527         /* WaToEnableHwFixForPushConstHWBug:bxt */
528         wa_masked_en(wal, COMMON_SLICE_CHICKEN2,
529                      GEN8_SBE_DISABLE_REPLAY_BUF_OPTIMIZATION);
530 }
531
532 static void kbl_ctx_workarounds_init(struct intel_engine_cs *engine,
533                                      struct i915_wa_list *wal)
534 {
535         struct drm_i915_private *i915 = engine->i915;
536
537         gen9_ctx_workarounds_init(engine, wal);
538
539         /* WaToEnableHwFixForPushConstHWBug:kbl */
540         if (IS_KBL_GRAPHICS_STEP(i915, STEP_C0, STEP_FOREVER))
541                 wa_masked_en(wal, COMMON_SLICE_CHICKEN2,
542                              GEN8_SBE_DISABLE_REPLAY_BUF_OPTIMIZATION);
543
544         /* WaDisableSbeCacheDispatchPortSharing:kbl */
545         wa_mcr_masked_en(wal, GEN8_HALF_SLICE_CHICKEN1,
546                          GEN7_SBE_SS_CACHE_DISPATCH_PORT_SHARING_DISABLE);
547 }
548
549 static void glk_ctx_workarounds_init(struct intel_engine_cs *engine,
550                                      struct i915_wa_list *wal)
551 {
552         gen9_ctx_workarounds_init(engine, wal);
553
554         /* WaToEnableHwFixForPushConstHWBug:glk */
555         wa_masked_en(wal, COMMON_SLICE_CHICKEN2,
556                      GEN8_SBE_DISABLE_REPLAY_BUF_OPTIMIZATION);
557 }
558
559 static void cfl_ctx_workarounds_init(struct intel_engine_cs *engine,
560                                      struct i915_wa_list *wal)
561 {
562         gen9_ctx_workarounds_init(engine, wal);
563
564         /* WaToEnableHwFixForPushConstHWBug:cfl */
565         wa_masked_en(wal, COMMON_SLICE_CHICKEN2,
566                      GEN8_SBE_DISABLE_REPLAY_BUF_OPTIMIZATION);
567
568         /* WaDisableSbeCacheDispatchPortSharing:cfl */
569         wa_mcr_masked_en(wal, GEN8_HALF_SLICE_CHICKEN1,
570                          GEN7_SBE_SS_CACHE_DISPATCH_PORT_SHARING_DISABLE);
571 }
572
573 static void icl_ctx_workarounds_init(struct intel_engine_cs *engine,
574                                      struct i915_wa_list *wal)
575 {
576         /* Wa_1406697149 (WaDisableBankHangMode:icl) */
577         wa_write(wal,
578                  GEN8_L3CNTLREG,
579                  intel_uncore_read(engine->uncore, GEN8_L3CNTLREG) |
580                  GEN8_ERRDETBCTRL);
581
582         /* WaForceEnableNonCoherent:icl
583          * This is not the same workaround as in early Gen9 platforms, where
584          * lacking this could cause system hangs, but coherency performance
585          * overhead is high and only a few compute workloads really need it
586          * (the register is whitelisted in hardware now, so UMDs can opt in
587          * for coherency if they have a good reason).
588          */
589         wa_mcr_masked_en(wal, ICL_HDC_MODE, HDC_FORCE_NON_COHERENT);
590
591         /* WaEnableFloatBlendOptimization:icl */
592         wa_mcr_add(wal, GEN10_CACHE_MODE_SS, 0,
593                    _MASKED_BIT_ENABLE(FLOAT_BLEND_OPTIMIZATION_ENABLE),
594                    0 /* write-only, so skip validation */,
595                    true);
596
597         /* WaDisableGPGPUMidThreadPreemption:icl */
598         wa_masked_field_set(wal, GEN8_CS_CHICKEN1,
599                             GEN9_PREEMPT_GPGPU_LEVEL_MASK,
600                             GEN9_PREEMPT_GPGPU_THREAD_GROUP_LEVEL);
601
602         /* allow headerless messages for preemptible GPGPU context */
603         wa_mcr_masked_en(wal, GEN10_SAMPLER_MODE,
604                          GEN11_SAMPLER_ENABLE_HEADLESS_MSG);
605
606         /* Wa_1604278689:icl,ehl */
607         wa_write(wal, IVB_FBC_RT_BASE, 0xFFFFFFFF & ~ILK_FBC_RT_VALID);
608         wa_write_clr_set(wal, IVB_FBC_RT_BASE_UPPER,
609                          0, /* write-only register; skip validation */
610                          0xFFFFFFFF);
611
612         /* Wa_1406306137:icl,ehl */
613         wa_mcr_masked_en(wal, GEN9_ROW_CHICKEN4, GEN11_DIS_PICK_2ND_EU);
614 }
615
616 /*
617  * These settings aren't actually workarounds, but general tuning settings that
618  * need to be programmed on the dg2 platform.
619  */
620 static void dg2_ctx_gt_tuning_init(struct intel_engine_cs *engine,
621                                    struct i915_wa_list *wal)
622 {
623         wa_masked_en(wal, CHICKEN_RASTER_2, TBIMR_FAST_CLIP);
624         wa_mcr_write_clr_set(wal, XEHP_L3SQCREG5, L3_PWM_TIMER_INIT_VAL_MASK,
625                              REG_FIELD_PREP(L3_PWM_TIMER_INIT_VAL_MASK, 0x7f));
626         wa_mcr_add(wal,
627                    XEHP_FF_MODE2,
628                    FF_MODE2_TDS_TIMER_MASK,
629                    FF_MODE2_TDS_TIMER_128,
630                    0, false);
631 }
632
633 /*
634  * These settings aren't actually workarounds, but general tuning settings that
635  * need to be programmed on several platforms.
636  */
637 static void gen12_ctx_gt_tuning_init(struct intel_engine_cs *engine,
638                                      struct i915_wa_list *wal)
639 {
640         /*
641          * Although some platforms refer to it as Wa_1604555607, we need to
642          * program it even on those that don't explicitly list that
643          * workaround.
644          *
645          * Note that the programming of this register is further modified
646          * according to the FF_MODE2 guidance given by Wa_1608008084:gen12.
647          * Wa_1608008084 tells us the FF_MODE2 register will return the wrong
648          * value when read. The default value for this register is zero for all
649          * fields and there are no bit masks. So instead of doing an RMW we
650          * should just write the TDS timer value. For the same reason read
651          * verification is ignored.
652          */
653         wa_add(wal,
654                GEN12_FF_MODE2,
655                FF_MODE2_TDS_TIMER_MASK,
656                FF_MODE2_TDS_TIMER_128,
657                0, false);
658 }
659
660 static void gen12_ctx_workarounds_init(struct intel_engine_cs *engine,
661                                        struct i915_wa_list *wal)
662 {
663         struct drm_i915_private *i915 = engine->i915;
664
665         gen12_ctx_gt_tuning_init(engine, wal);
666
667         /*
668          * Wa_1409142259:tgl,dg1,adl-p
669          * Wa_1409347922:tgl,dg1,adl-p
670          * Wa_1409252684:tgl,dg1,adl-p
671          * Wa_1409217633:tgl,dg1,adl-p
672          * Wa_1409207793:tgl,dg1,adl-p
673          * Wa_1409178076:tgl,dg1,adl-p
674          * Wa_1408979724:tgl,dg1,adl-p
675          * Wa_14010443199:tgl,rkl,dg1,adl-p
676          * Wa_14010698770:tgl,rkl,dg1,adl-s,adl-p
677          * Wa_1409342910:tgl,rkl,dg1,adl-s,adl-p
678          */
679         wa_masked_en(wal, GEN11_COMMON_SLICE_CHICKEN3,
680                      GEN12_DISABLE_CPS_AWARE_COLOR_PIPE);
681
682         /* WaDisableGPGPUMidThreadPreemption:gen12 */
683         wa_masked_field_set(wal, GEN8_CS_CHICKEN1,
684                             GEN9_PREEMPT_GPGPU_LEVEL_MASK,
685                             GEN9_PREEMPT_GPGPU_THREAD_GROUP_LEVEL);
686
687         /*
688          * Wa_16011163337
689          *
690          * Like in gen12_ctx_gt_tuning_init(), read verification is ignored due
691          * to Wa_1608008084.
692          */
693         wa_add(wal,
694                GEN12_FF_MODE2,
695                FF_MODE2_GS_TIMER_MASK,
696                FF_MODE2_GS_TIMER_224,
697                0, false);
698
699         if (!IS_DG1(i915))
700                 /* Wa_1806527549 */
701                 wa_masked_en(wal, HIZ_CHICKEN, HZ_DEPTH_TEST_LE_GE_OPT_DISABLE);
702 }
703
704 static void dg1_ctx_workarounds_init(struct intel_engine_cs *engine,
705                                      struct i915_wa_list *wal)
706 {
707         gen12_ctx_workarounds_init(engine, wal);
708
709         /* Wa_1409044764 */
710         wa_masked_dis(wal, GEN11_COMMON_SLICE_CHICKEN3,
711                       DG1_FLOAT_POINT_BLEND_OPT_STRICT_MODE_EN);
712
713         /* Wa_22010493298 */
714         wa_masked_en(wal, HIZ_CHICKEN,
715                      DG1_HZ_READ_SUPPRESSION_OPTIMIZATION_DISABLE);
716 }
717
718 static void dg2_ctx_workarounds_init(struct intel_engine_cs *engine,
719                                      struct i915_wa_list *wal)
720 {
721         dg2_ctx_gt_tuning_init(engine, wal);
722
723         /* Wa_16011186671:dg2_g11 */
724         if (IS_DG2_GRAPHICS_STEP(engine->i915, G11, STEP_A0, STEP_B0)) {
725                 wa_mcr_masked_dis(wal, VFLSKPD, DIS_MULT_MISS_RD_SQUASH);
726                 wa_mcr_masked_en(wal, VFLSKPD, DIS_OVER_FETCH_CACHE);
727         }
728
729         if (IS_DG2_GRAPHICS_STEP(engine->i915, G10, STEP_A0, STEP_B0)) {
730                 /* Wa_14010469329:dg2_g10 */
731                 wa_mcr_masked_en(wal, XEHP_COMMON_SLICE_CHICKEN3,
732                                  XEHP_DUAL_SIMD8_SEQ_MERGE_DISABLE);
733
734                 /*
735                  * Wa_22010465075:dg2_g10
736                  * Wa_22010613112:dg2_g10
737                  * Wa_14010698770:dg2_g10
738                  */
739                 wa_mcr_masked_en(wal, XEHP_COMMON_SLICE_CHICKEN3,
740                                  GEN12_DISABLE_CPS_AWARE_COLOR_PIPE);
741         }
742
743         /* Wa_16013271637:dg2 */
744         wa_mcr_masked_en(wal, XEHP_SLICE_COMMON_ECO_CHICKEN1,
745                          MSC_MSAA_REODER_BUF_BYPASS_DISABLE);
746
747         /* Wa_14014947963:dg2 */
748         if (IS_DG2_GRAPHICS_STEP(engine->i915, G10, STEP_B0, STEP_FOREVER) ||
749                 IS_DG2_G11(engine->i915) || IS_DG2_G12(engine->i915))
750                 wa_masked_field_set(wal, VF_PREEMPTION, PREEMPTION_VERTEX_COUNT, 0x4000);
751
752         /* Wa_15010599737:dg2 */
753         wa_masked_en(wal, CHICKEN_RASTER_1, DIS_SF_ROUND_NEAREST_EVEN);
754 }
755
756 static void fakewa_disable_nestedbb_mode(struct intel_engine_cs *engine,
757                                          struct i915_wa_list *wal)
758 {
759         /*
760          * This is a "fake" workaround defined by software to ensure we
761          * maintain reliable, backward-compatible behavior for userspace with
762          * regards to how nested MI_BATCH_BUFFER_START commands are handled.
763          *
764          * The per-context setting of MI_MODE[12] determines whether the bits
765          * of a nested MI_BATCH_BUFFER_START instruction should be interpreted
766          * in the traditional manner or whether they should instead use a new
767          * tgl+ meaning that breaks backward compatibility, but allows nesting
768          * into 3rd-level batchbuffers.  When this new capability was first
769          * added in TGL, it remained off by default unless a context
770          * intentionally opted in to the new behavior.  However Xe_HPG now
771          * flips this on by default and requires that we explicitly opt out if
772          * we don't want the new behavior.
773          *
774          * From a SW perspective, we want to maintain the backward-compatible
775          * behavior for userspace, so we'll apply a fake workaround to set it
776          * back to the legacy behavior on platforms where the hardware default
777          * is to break compatibility.  At the moment there is no Linux
778          * userspace that utilizes third-level batchbuffers, so this will spare
779          * userspace from needing to make any changes; using the legacy meaning
780          * is the correct thing to do.  If/when we have userspace
781          * consumers that want to utilize third-level batch nesting, we can
782          * provide a context parameter to allow them to opt-in.
783          */
784         wa_masked_dis(wal, RING_MI_MODE(engine->mmio_base), TGL_NESTED_BB_EN);
785 }
786
787 static void gen12_ctx_gt_mocs_init(struct intel_engine_cs *engine,
788                                    struct i915_wa_list *wal)
789 {
790         u8 mocs;
791
792         /*
793          * Some blitter commands do not have a field for MOCS; those
794          * commands use the MOCS index pointed to by BLIT_CCTL, so the
795          * BLIT_CCTL registers need to be programmed to un-cached.
796          */
797         if (engine->class == COPY_ENGINE_CLASS) {
798                 mocs = engine->gt->mocs.uc_index;
799                 wa_write_clr_set(wal,
800                                  BLIT_CCTL(engine->mmio_base),
801                                  BLIT_CCTL_MASK,
802                                  BLIT_CCTL_MOCS(mocs, mocs));
803         }
804 }
805
806 /*
807  * gen12_ctx_gt_fake_wa_init() doesn't program an official workaround
808  * defined by the hardware team; it programs general context registers.
809  * Adding that context register programming to the context workaround list
810  * allows us to use the wa framework for proper application and validation.
811  */
812 static void
813 gen12_ctx_gt_fake_wa_init(struct intel_engine_cs *engine,
814                           struct i915_wa_list *wal)
815 {
816         if (GRAPHICS_VER_FULL(engine->i915) >= IP_VER(12, 55))
817                 fakewa_disable_nestedbb_mode(engine, wal);
818
819         gen12_ctx_gt_mocs_init(engine, wal);
820 }
821
822 static void
823 __intel_engine_init_ctx_wa(struct intel_engine_cs *engine,
824                            struct i915_wa_list *wal,
825                            const char *name)
826 {
827         struct drm_i915_private *i915 = engine->i915;
828
829         wa_init_start(wal, name, engine->name);
830
831         /* Applies to all engines */
832         /*
833          * Fake workarounds are not actual workarounds, but rather
834          * programming of context registers using the workaround framework.
835          */
836         if (GRAPHICS_VER(i915) >= 12)
837                 gen12_ctx_gt_fake_wa_init(engine, wal);
838
839         if (engine->class != RENDER_CLASS)
840                 goto done;
841
842         if (IS_PONTEVECCHIO(i915))
843                 ; /* noop; none at this time */
844         else if (IS_DG2(i915))
845                 dg2_ctx_workarounds_init(engine, wal);
846         else if (IS_XEHPSDV(i915))
847                 ; /* noop; none at this time */
848         else if (IS_DG1(i915))
849                 dg1_ctx_workarounds_init(engine, wal);
850         else if (GRAPHICS_VER(i915) == 12)
851                 gen12_ctx_workarounds_init(engine, wal);
852         else if (GRAPHICS_VER(i915) == 11)
853                 icl_ctx_workarounds_init(engine, wal);
854         else if (IS_COFFEELAKE(i915) || IS_COMETLAKE(i915))
855                 cfl_ctx_workarounds_init(engine, wal);
856         else if (IS_GEMINILAKE(i915))
857                 glk_ctx_workarounds_init(engine, wal);
858         else if (IS_KABYLAKE(i915))
859                 kbl_ctx_workarounds_init(engine, wal);
860         else if (IS_BROXTON(i915))
861                 bxt_ctx_workarounds_init(engine, wal);
862         else if (IS_SKYLAKE(i915))
863                 skl_ctx_workarounds_init(engine, wal);
864         else if (IS_CHERRYVIEW(i915))
865                 chv_ctx_workarounds_init(engine, wal);
866         else if (IS_BROADWELL(i915))
867                 bdw_ctx_workarounds_init(engine, wal);
868         else if (GRAPHICS_VER(i915) == 7)
869                 gen7_ctx_workarounds_init(engine, wal);
870         else if (GRAPHICS_VER(i915) == 6)
871                 gen6_ctx_workarounds_init(engine, wal);
872         else if (GRAPHICS_VER(i915) < 8)
873                 ;
874         else
875                 MISSING_CASE(GRAPHICS_VER(i915));
876
877 done:
878         wa_init_finish(wal);
879 }
880
881 void intel_engine_init_ctx_wa(struct intel_engine_cs *engine)
882 {
883         __intel_engine_init_ctx_wa(engine, &engine->ctx_wa_list, "context");
884 }
885
886 int intel_engine_emit_ctx_wa(struct i915_request *rq)
887 {
888         struct i915_wa_list *wal = &rq->engine->ctx_wa_list;
889         struct i915_wa *wa;
890         unsigned int i;
891         u32 *cs;
892         int ret;
893
894         if (wal->count == 0)
895                 return 0;
896
897         ret = rq->engine->emit_flush(rq, EMIT_BARRIER);
898         if (ret)
899                 return ret;
900
901         cs = intel_ring_begin(rq, (wal->count * 2 + 2));
902         if (IS_ERR(cs))
903                 return PTR_ERR(cs);
904
905         *cs++ = MI_LOAD_REGISTER_IMM(wal->count);
906         for (i = 0, wa = wal->list; i < wal->count; i++, wa++) {
907                 *cs++ = i915_mmio_reg_offset(wa->reg);
908                 *cs++ = wa->set;
909         }
910         *cs++ = MI_NOOP;
911
912         intel_ring_advance(rq, cs);
913
914         ret = rq->engine->emit_flush(rq, EMIT_BARRIER);
915         if (ret)
916                 return ret;
917
918         return 0;
919 }
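/*
 * For illustration, with a two-entry list the function above emits a stream
 * of the following shape (offsets/values are placeholders only):
 *
 *	MI_LOAD_REGISTER_IMM(2)
 *	<reg 0 offset> <reg 0 value>
 *	<reg 1 offset> <reg 1 value>
 *	MI_NOOP
 *
 * i.e. 2 * wal->count payload dwords plus the LRI header and a trailing NOOP,
 * keeping the total emission an even number of dwords.
 */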
920
921 static void
922 gen4_gt_workarounds_init(struct intel_gt *gt,
923                          struct i915_wa_list *wal)
924 {
925         /* WaDisable_RenderCache_OperationalFlush:gen4,ilk */
926         wa_masked_dis(wal, CACHE_MODE_0, RC_OP_FLUSH_ENABLE);
927 }
928
929 static void
930 g4x_gt_workarounds_init(struct intel_gt *gt, struct i915_wa_list *wal)
931 {
932         gen4_gt_workarounds_init(gt, wal);
933
934         /* WaDisableRenderCachePipelinedFlush:g4x,ilk */
935         wa_masked_en(wal, CACHE_MODE_0, CM0_PIPELINED_RENDER_FLUSH_DISABLE);
936 }
937
938 static void
939 ilk_gt_workarounds_init(struct intel_gt *gt, struct i915_wa_list *wal)
940 {
941         g4x_gt_workarounds_init(gt, wal);
942
943         wa_masked_en(wal, _3D_CHICKEN2, _3D_CHICKEN2_WM_READ_PIPELINED);
944 }
945
946 static void
947 snb_gt_workarounds_init(struct intel_gt *gt, struct i915_wa_list *wal)
948 {
949 }
950
951 static void
952 ivb_gt_workarounds_init(struct intel_gt *gt, struct i915_wa_list *wal)
953 {
954         /* Apply the WaDisableRHWOOptimizationForRenderHang:ivb workaround. */
955         wa_masked_dis(wal,
956                       GEN7_COMMON_SLICE_CHICKEN1,
957                       GEN7_CSC1_RHWO_OPT_DISABLE_IN_RCC);
958
959         /* WaApplyL3ControlAndL3ChickenMode:ivb */
960         wa_write(wal, GEN7_L3CNTLREG1, GEN7_WA_FOR_GEN7_L3_CONTROL);
961         wa_write(wal, GEN7_L3_CHICKEN_MODE_REGISTER, GEN7_WA_L3_CHICKEN_MODE);
962
963         /* WaForceL3Serialization:ivb */
964         wa_write_clr(wal, GEN7_L3SQCREG4, L3SQ_URB_READ_CAM_MATCH_DISABLE);
965 }
966
967 static void
968 vlv_gt_workarounds_init(struct intel_gt *gt, struct i915_wa_list *wal)
969 {
970         /* WaForceL3Serialization:vlv */
971         wa_write_clr(wal, GEN7_L3SQCREG4, L3SQ_URB_READ_CAM_MATCH_DISABLE);
972
973         /*
974          * WaIncreaseL3CreditsForVLVB0:vlv
975          * This is the hardware default actually.
976          */
977         wa_write(wal, GEN7_L3SQCREG1, VLV_B0_WA_L3SQCREG1_VALUE);
978 }
979
980 static void
981 hsw_gt_workarounds_init(struct intel_gt *gt, struct i915_wa_list *wal)
982 {
983         /* L3 caching of data atomics doesn't work -- disable it. */
984         wa_write(wal, HSW_SCRATCH1, HSW_SCRATCH1_L3_DATA_ATOMICS_DISABLE);
985
986         wa_add(wal,
987                HSW_ROW_CHICKEN3, 0,
988                _MASKED_BIT_ENABLE(HSW_ROW_CHICKEN3_L3_GLOBAL_ATOMICS_DISABLE),
989                0 /* XXX does this reg exist? */, true);
990
991         /* WaVSRefCountFullforceMissDisable:hsw */
992         wa_write_clr(wal, GEN7_FF_THREAD_MODE, GEN7_FF_VS_REF_CNT_FFME);
993 }
994
995 static void
996 gen9_wa_init_mcr(struct drm_i915_private *i915, struct i915_wa_list *wal)
997 {
998         const struct sseu_dev_info *sseu = &to_gt(i915)->info.sseu;
999         unsigned int slice, subslice;
1000         u32 mcr, mcr_mask;
1001
1002         GEM_BUG_ON(GRAPHICS_VER(i915) != 9);
1003
1004         /*
1005          * WaProgramMgsrForCorrectSliceSpecificMmioReads:gen9,glk,kbl,cml
1006          * Before any MMIO read into slice/subslice specific registers, the MCR
1007          * packet control register needs to be programmed to point to any
1008          * enabled s/ss pair. Otherwise, incorrect values will be returned.
1009          * This means each subsequent MMIO read will be forwarded to a
1010          * specific s/ss combination, but this is OK since these registers
1011          * are consistent across s/ss in almost all cases. On the rare
1012          * occasions, such as INSTDONE, where this value is dependent
1013          * on s/ss combo, the read should be done with read_subslice_reg.
1014          */
1015         slice = ffs(sseu->slice_mask) - 1;
1016         GEM_BUG_ON(slice >= ARRAY_SIZE(sseu->subslice_mask.hsw));
1017         subslice = ffs(intel_sseu_get_hsw_subslices(sseu, slice));
1018         GEM_BUG_ON(!subslice);
1019         subslice--;
1020
1021         /*
1022          * We use the GEN8_MCR_*() macros to calculate the |mcr| value for
1023          * Gen9 to address WaProgramMgsrForCorrectSliceSpecificMmioReads
1024          */
1025         mcr = GEN8_MCR_SLICE(slice) | GEN8_MCR_SUBSLICE(subslice);
1026         mcr_mask = GEN8_MCR_SLICE_MASK | GEN8_MCR_SUBSLICE_MASK;
1027
1028         drm_dbg(&i915->drm, "MCR slice:%d/subslice:%d = %x\n", slice, subslice, mcr);
1029
1030         wa_write_clr_set(wal, GEN8_MCR_SELECTOR, mcr_mask, mcr);
1031 }
1032
1033 static void
1034 gen9_gt_workarounds_init(struct intel_gt *gt, struct i915_wa_list *wal)
1035 {
1036         struct drm_i915_private *i915 = gt->i915;
1037
1038         /* WaProgramMgsrForCorrectSliceSpecificMmioReads:glk,kbl,cml,gen9 */
1039         gen9_wa_init_mcr(i915, wal);
1040
1041         /* WaDisableKillLogic:bxt,skl,kbl */
1042         if (!IS_COFFEELAKE(i915) && !IS_COMETLAKE(i915))
1043                 wa_write_or(wal,
1044                             GAM_ECOCHK,
1045                             ECOCHK_DIS_TLB);
1046
1047         if (HAS_LLC(i915)) {
1048                 /* WaCompressedResourceSamplerPbeMediaNewHashMode:skl,kbl
1049                  *
1050                  * Must match Display Engine. See
1051                  * WaCompressedResourceDisplayNewHashMode.
1052                  */
1053                 wa_write_or(wal,
1054                             MMCD_MISC_CTRL,
1055                             MMCD_PCLA | MMCD_HOTSPOT_EN);
1056         }
1057
1058         /* WaDisableHDCInvalidation:skl,bxt,kbl,cfl */
1059         wa_write_or(wal,
1060                     GAM_ECOCHK,
1061                     BDW_DISABLE_HDC_INVALIDATION);
1062 }
1063
1064 static void
1065 skl_gt_workarounds_init(struct intel_gt *gt, struct i915_wa_list *wal)
1066 {
1067         gen9_gt_workarounds_init(gt, wal);
1068
1069         /* WaDisableGafsUnitClkGating:skl */
1070         wa_write_or(wal,
1071                     GEN7_UCGCTL4,
1072                     GEN8_EU_GAUNIT_CLOCK_GATE_DISABLE);
1073
1074         /* WaInPlaceDecompressionHang:skl */
1075         if (IS_SKL_GRAPHICS_STEP(gt->i915, STEP_A0, STEP_H0))
1076                 wa_write_or(wal,
1077                             GEN9_GAMT_ECO_REG_RW_IA,
1078                             GAMT_ECO_ENABLE_IN_PLACE_DECOMPRESS);
1079 }
1080
1081 static void
1082 kbl_gt_workarounds_init(struct intel_gt *gt, struct i915_wa_list *wal)
1083 {
1084         gen9_gt_workarounds_init(gt, wal);
1085
1086         /* WaDisableDynamicCreditSharing:kbl */
1087         if (IS_KBL_GRAPHICS_STEP(gt->i915, 0, STEP_C0))
1088                 wa_write_or(wal,
1089                             GAMT_CHKN_BIT_REG,
1090                             GAMT_CHKN_DISABLE_DYNAMIC_CREDIT_SHARING);
1091
1092         /* WaDisableGafsUnitClkGating:kbl */
1093         wa_write_or(wal,
1094                     GEN7_UCGCTL4,
1095                     GEN8_EU_GAUNIT_CLOCK_GATE_DISABLE);
1096
1097         /* WaInPlaceDecompressionHang:kbl */
1098         wa_write_or(wal,
1099                     GEN9_GAMT_ECO_REG_RW_IA,
1100                     GAMT_ECO_ENABLE_IN_PLACE_DECOMPRESS);
1101 }
1102
1103 static void
1104 glk_gt_workarounds_init(struct intel_gt *gt, struct i915_wa_list *wal)
1105 {
1106         gen9_gt_workarounds_init(gt, wal);
1107 }
1108
1109 static void
1110 cfl_gt_workarounds_init(struct intel_gt *gt, struct i915_wa_list *wal)
1111 {
1112         gen9_gt_workarounds_init(gt, wal);
1113
1114         /* WaDisableGafsUnitClkGating:cfl */
1115         wa_write_or(wal,
1116                     GEN7_UCGCTL4,
1117                     GEN8_EU_GAUNIT_CLOCK_GATE_DISABLE);
1118
1119         /* WaInPlaceDecompressionHang:cfl */
1120         wa_write_or(wal,
1121                     GEN9_GAMT_ECO_REG_RW_IA,
1122                     GAMT_ECO_ENABLE_IN_PLACE_DECOMPRESS);
1123 }
1124
1125 static void __set_mcr_steering(struct i915_wa_list *wal,
1126                                i915_reg_t steering_reg,
1127                                unsigned int slice, unsigned int subslice)
1128 {
1129         u32 mcr, mcr_mask;
1130
1131         mcr = GEN11_MCR_SLICE(slice) | GEN11_MCR_SUBSLICE(subslice);
1132         mcr_mask = GEN11_MCR_SLICE_MASK | GEN11_MCR_SUBSLICE_MASK;
1133
1134         wa_write_clr_set(wal, steering_reg, mcr_mask, mcr);
1135 }
1136
1137 static void debug_dump_steering(struct intel_gt *gt)
1138 {
1139         struct drm_printer p = drm_debug_printer("MCR Steering:");
1140
1141         if (drm_debug_enabled(DRM_UT_DRIVER))
1142                 intel_gt_mcr_report_steering(&p, gt, false);
1143 }
1144
1145 static void __add_mcr_wa(struct intel_gt *gt, struct i915_wa_list *wal,
1146                          unsigned int slice, unsigned int subslice)
1147 {
1148         __set_mcr_steering(wal, GEN8_MCR_SELECTOR, slice, subslice);
1149
1150         gt->default_steering.groupid = slice;
1151         gt->default_steering.instanceid = subslice;
1152
1153         debug_dump_steering(gt);
1154 }
1155
1156 static void
1157 icl_wa_init_mcr(struct intel_gt *gt, struct i915_wa_list *wal)
1158 {
1159         const struct sseu_dev_info *sseu = &gt->info.sseu;
1160         unsigned int subslice;
1161
1162         GEM_BUG_ON(GRAPHICS_VER(gt->i915) < 11);
1163         GEM_BUG_ON(hweight8(sseu->slice_mask) > 1);
1164
1165         /*
1166          * Although a platform may have subslices, we need to always steer
1167          * reads to the lowest instance that isn't fused off.  When Render
1168          * Power Gating is enabled, grabbing forcewake will only power up a
1169          * single subslice (the "minconfig") if there isn't a real workload
1170          * that needs to be run; this means that if we steer register reads to
1171          * one of the higher subslices, we run the risk of reading back 0's or
1172          * random garbage.
1173          */
1174         subslice = __ffs(intel_sseu_get_hsw_subslices(sseu, 0));
1175
1176         /*
1177          * If the subslice we picked above also steers us to a valid L3 bank,
1178          * then we can just rely on the default steering and won't need to
1179          * worry about explicitly re-steering L3BANK reads later.
1180          */
1181         if (gt->info.l3bank_mask & BIT(subslice))
1182                 gt->steering_table[L3BANK] = NULL;
1183
1184         __add_mcr_wa(gt, wal, 0, subslice);
1185 }
1186
1187 static void
1188 xehp_init_mcr(struct intel_gt *gt, struct i915_wa_list *wal)
1189 {
1190         const struct sseu_dev_info *sseu = &gt->info.sseu;
1191         unsigned long slice, subslice = 0, slice_mask = 0;
1192         u32 lncf_mask = 0;
1193         int i;
1194
1195         /*
1196          * On Xe_HP the steering increases in complexity. There are now several
1197          * more units that require steering and we're not guaranteed to be able
1198          * to find a common setting for all of them. These are:
1199          * - GSLICE (fusable)
1200          * - DSS (sub-unit within gslice; fusable)
1201          * - L3 Bank (fusable)
1202          * - MSLICE (fusable)
1203          * - LNCF (sub-unit within mslice; always present if mslice is present)
1204          *
1205          * We'll do our default/implicit steering based on GSLICE (in the
1206          * sliceid field) and DSS (in the subsliceid field).  If we can
1207          * find overlap between the valid MSLICE and/or LNCF values with
1208          * a suitable GSLICE, then we can just re-use the default value and
1209          * skip any explicit steering at runtime.
1210          *
1211          * We only need to look for overlap between GSLICE/MSLICE/LNCF to find
1212          * a valid sliceid value.  DSS steering is the only type of steering
1213          * that utilizes the 'subsliceid' bits.
1214          *
1215          * Also note that, even though the steering domain is called "GSlice"
1216          * and it is encoded in the register using the gslice format, the spec
1217          * says that the combined (geometry | compute) fuse should be used to
1218          * select the steering.
1219          */
1220
1221         /* Find the potential gslice candidates */
1222         slice_mask = intel_slicemask_from_xehp_dssmask(sseu->subslice_mask,
1223                                                        GEN_DSS_PER_GSLICE);
1224
1225         /*
1226          * Find the potential LNCF candidates.  Either LNCF within a valid
1227          * mslice is fine.
1228          */
1229         for_each_set_bit(i, &gt->info.mslice_mask, GEN12_MAX_MSLICES)
1230                 lncf_mask |= (0x3 << (i * 2));
1231
1232         /*
1233          * Are there any sliceid values that work for both GSLICE and LNCF
1234          * steering?
1235          */
1236         if (slice_mask & lncf_mask) {
1237                 slice_mask &= lncf_mask;
1238                 gt->steering_table[LNCF] = NULL;
1239         }
1240
1241         /* How about sliceid values that also work for MSLICE steering? */
1242         if (slice_mask & gt->info.mslice_mask) {
1243                 slice_mask &= gt->info.mslice_mask;
1244                 gt->steering_table[MSLICE] = NULL;
1245         }
1246
1247         if (IS_XEHPSDV(gt->i915) && slice_mask & BIT(0))
1248                 gt->steering_table[GAM] = NULL;
1249
1250         slice = __ffs(slice_mask);
1251         subslice = intel_sseu_find_first_xehp_dss(sseu, GEN_DSS_PER_GSLICE, slice) %
1252                 GEN_DSS_PER_GSLICE;
1253
1254         __add_mcr_wa(gt, wal, slice, subslice);
1255
1256         /*
1257          * SQIDI ranges are special because they use different steering
1258          * registers than everything else we work with.  On XeHP SDV and
1259          * DG2-G10, any value in the steering registers will work fine since
1260          * all instances are present, but DG2-G11 only has SQIDI instances at
1261          * ID's 2 and 3, so we need to steer to one of those.  For simplicity
1262          * we'll just steer to a hardcoded "2" since that value will work
1263          * everywhere.
1264          */
1265         __set_mcr_steering(wal, MCFG_MCR_SELECTOR, 0, 2);
1266         __set_mcr_steering(wal, SF_MCR_SELECTOR, 0, 2);
1267
1268         /*
1269          * On DG2, GAM registers have a dedicated steering control register
1270          * and must always be programmed to a hardcoded groupid of "1."
1271          */
1272         if (IS_DG2(gt->i915))
1273                 __set_mcr_steering(wal, GAM_MCR_SELECTOR, 1, 0);
1274 }
1275
1276 static void
1277 pvc_init_mcr(struct intel_gt *gt, struct i915_wa_list *wal)
1278 {
1279         unsigned int dss;
1280
1281         /*
1282          * Set up implicit steering for COMPUTE and DSS ranges to the first
1283          * non-fused-off DSS.  All other types of MCR registers will be
1284          * explicitly steered.
1285          */
1286         dss = intel_sseu_find_first_xehp_dss(&gt->info.sseu, 0, 0);
1287         __add_mcr_wa(gt, wal, dss / GEN_DSS_PER_CSLICE, dss % GEN_DSS_PER_CSLICE);
1288 }
1289
1290 static void
1291 icl_gt_workarounds_init(struct intel_gt *gt, struct i915_wa_list *wal)
1292 {
1293         struct drm_i915_private *i915 = gt->i915;
1294
1295         icl_wa_init_mcr(gt, wal);
1296
1297         /* WaModifyGamTlbPartitioning:icl */
1298         wa_write_clr_set(wal,
1299                          GEN11_GACB_PERF_CTRL,
1300                          GEN11_HASH_CTRL_MASK,
1301                          GEN11_HASH_CTRL_BIT0 | GEN11_HASH_CTRL_BIT4);
1302
1303         /* Wa_1405766107:icl
1304          * Formerly known as WaCL2SFHalfMaxAlloc
1305          */
1306         wa_write_or(wal,
1307                     GEN11_LSN_UNSLCVC,
1308                     GEN11_LSN_UNSLCVC_GAFS_HALF_SF_MAXALLOC |
1309                     GEN11_LSN_UNSLCVC_GAFS_HALF_CL2_MAXALLOC);
1310
1311         /* Wa_220166154:icl
1312          * Formerly known as WaDisCtxReload
1313          */
1314         wa_write_or(wal,
1315                     GEN8_GAMW_ECO_DEV_RW_IA,
1316                     GAMW_ECO_DEV_CTX_RELOAD_DISABLE);
1317
1318         /* Wa_1406463099:icl
1319          * Formerly known as WaGamTlbPendError
1320          */
1321         wa_write_or(wal,
1322                     GAMT_CHKN_BIT_REG,
1323                     GAMT_CHKN_DISABLE_L3_COH_PIPE);
1324
1325         /* Wa_1407352427:icl,ehl */
1326         wa_write_or(wal, UNSLICE_UNIT_LEVEL_CLKGATE2,
1327                     PSDUNIT_CLKGATE_DIS);
1328
1329         /* Wa_1406680159:icl,ehl */
1330         wa_mcr_write_or(wal,
1331                         GEN11_SUBSLICE_UNIT_LEVEL_CLKGATE,
1332                         GWUNIT_CLKGATE_DIS);
1333
1334         /* Wa_1607087056:icl,ehl,jsl */
1335         if (IS_ICELAKE(i915) ||
1336             IS_JSL_EHL_GRAPHICS_STEP(i915, STEP_A0, STEP_B0))
1337                 wa_write_or(wal,
1338                             GEN11_SLICE_UNIT_LEVEL_CLKGATE,
1339                             L3_CLKGATE_DIS | L3_CR2X_CLKGATE_DIS);
1340
1341         /*
1342          * This is not a documented workaround, but rather an optimization
1343          * to reduce sampler power.
1344          */
1345         wa_mcr_write_clr(wal, GEN10_DFR_RATIO_EN_AND_CHICKEN, DFR_DISABLE);
1346 }
1347
1348 /*
1349  * Though there are per-engine instances of these registers,
1350  * they retain their value through engine resets and should
1351  * only be provided on the GT workaround list rather than
1352  * the engine-specific workaround list.
1353  */
1354 static void
1355 wa_14011060649(struct intel_gt *gt, struct i915_wa_list *wal)
1356 {
1357         struct intel_engine_cs *engine;
1358         int id;
1359
1360         for_each_engine(engine, gt, id) {
1361                 if (engine->class != VIDEO_DECODE_CLASS ||
1362                     (engine->instance % 2))
1363                         continue;
1364
1365                 wa_write_or(wal, VDBOX_CGCTL3F10(engine->mmio_base),
1366                             IECPUNIT_CLKGATE_DIS);
1367         }
1368 }
1369
1370 static void
1371 gen12_gt_workarounds_init(struct intel_gt *gt, struct i915_wa_list *wal)
1372 {
1373         icl_wa_init_mcr(gt, wal);
1374
1375         /* Wa_14011060649:tgl,rkl,dg1,adl-s,adl-p */
1376         wa_14011060649(gt, wal);
1377
1378         /* Wa_14011059788:tgl,rkl,adl-s,dg1,adl-p */
1379         wa_mcr_write_or(wal, GEN10_DFR_RATIO_EN_AND_CHICKEN, DFR_DISABLE);
1380 }
1381
1382 static void
1383 tgl_gt_workarounds_init(struct intel_gt *gt, struct i915_wa_list *wal)
1384 {
1385         struct drm_i915_private *i915 = gt->i915;
1386
1387         gen12_gt_workarounds_init(gt, wal);
1388
1389         /* Wa_1409420604:tgl */
1390         if (IS_TGL_UY_GRAPHICS_STEP(i915, STEP_A0, STEP_B0))
1391                 wa_mcr_write_or(wal,
1392                                 SUBSLICE_UNIT_LEVEL_CLKGATE2,
1393                                 CPSSUNIT_CLKGATE_DIS);
1394
1395         /* Wa_1607087056:tgl also known as BUG:1409180338 */
1396         if (IS_TGL_UY_GRAPHICS_STEP(i915, STEP_A0, STEP_B0))
1397                 wa_write_or(wal,
1398                             GEN11_SLICE_UNIT_LEVEL_CLKGATE,
1399                             L3_CLKGATE_DIS | L3_CR2X_CLKGATE_DIS);
1400
1401         /* Wa_1408615072:tgl[a0] */
1402         if (IS_TGL_UY_GRAPHICS_STEP(i915, STEP_A0, STEP_B0))
1403                 wa_write_or(wal, UNSLICE_UNIT_LEVEL_CLKGATE2,
1404                             VSUNIT_CLKGATE_DIS_TGL);
1405 }
1406
1407 static void
1408 dg1_gt_workarounds_init(struct intel_gt *gt, struct i915_wa_list *wal)
1409 {
1410         struct drm_i915_private *i915 = gt->i915;
1411
1412         gen12_gt_workarounds_init(gt, wal);
1413
1414         /* Wa_1607087056:dg1 */
1415         if (IS_DG1_GRAPHICS_STEP(i915, STEP_A0, STEP_B0))
1416                 wa_write_or(wal,
1417                             GEN11_SLICE_UNIT_LEVEL_CLKGATE,
1418                             L3_CLKGATE_DIS | L3_CR2X_CLKGATE_DIS);
1419
1420         /* Wa_1409420604:dg1 */
1421         if (IS_DG1(i915))
1422                 wa_mcr_write_or(wal,
1423                                 SUBSLICE_UNIT_LEVEL_CLKGATE2,
1424                                 CPSSUNIT_CLKGATE_DIS);
1425
1426         /* Wa_1408615072:dg1 */
1427         /* Empirical testing shows this register is unaffected by engine reset. */
1428         if (IS_DG1(i915))
1429                 wa_write_or(wal, UNSLICE_UNIT_LEVEL_CLKGATE2,
1430                             VSUNIT_CLKGATE_DIS_TGL);
1431 }
1432
1433 static void
1434 xehpsdv_gt_workarounds_init(struct intel_gt *gt, struct i915_wa_list *wal)
1435 {
1436         struct drm_i915_private *i915 = gt->i915;
1437
1438         xehp_init_mcr(gt, wal);
1439
1440         /* Wa_1409757795:xehpsdv */
1441         wa_mcr_write_or(wal, SCCGCTL94DC, CG3DDISURB);
1442
1443         /* Wa_16011155590:xehpsdv */
1444         if (IS_XEHPSDV_GRAPHICS_STEP(i915, STEP_A0, STEP_B0))
1445                 wa_write_or(wal, UNSLICE_UNIT_LEVEL_CLKGATE,
1446                             TSGUNIT_CLKGATE_DIS);
1447
1448         /* Wa_14011780169:xehpsdv */
1449         if (IS_XEHPSDV_GRAPHICS_STEP(i915, STEP_B0, STEP_FOREVER)) {
1450                 wa_write_or(wal, UNSLCGCTL9440, GAMTLBOACS_CLKGATE_DIS |
1451                             GAMTLBVDBOX7_CLKGATE_DIS |
1452                             GAMTLBVDBOX6_CLKGATE_DIS |
1453                             GAMTLBVDBOX5_CLKGATE_DIS |
1454                             GAMTLBVDBOX4_CLKGATE_DIS |
1455                             GAMTLBVDBOX3_CLKGATE_DIS |
1456                             GAMTLBVDBOX2_CLKGATE_DIS |
1457                             GAMTLBVDBOX1_CLKGATE_DIS |
1458                             GAMTLBVDBOX0_CLKGATE_DIS |
1459                             GAMTLBKCR_CLKGATE_DIS |
1460                             GAMTLBGUC_CLKGATE_DIS |
1461                             GAMTLBBLT_CLKGATE_DIS);
1462                 wa_write_or(wal, UNSLCGCTL9444, GAMTLBGFXA0_CLKGATE_DIS |
1463                             GAMTLBGFXA1_CLKGATE_DIS |
1464                             GAMTLBCOMPA0_CLKGATE_DIS |
1465                             GAMTLBCOMPA1_CLKGATE_DIS |
1466                             GAMTLBCOMPB0_CLKGATE_DIS |
1467                             GAMTLBCOMPB1_CLKGATE_DIS |
1468                             GAMTLBCOMPC0_CLKGATE_DIS |
1469                             GAMTLBCOMPC1_CLKGATE_DIS |
1470                             GAMTLBCOMPD0_CLKGATE_DIS |
1471                             GAMTLBCOMPD1_CLKGATE_DIS |
1472                             GAMTLBMERT_CLKGATE_DIS   |
1473                             GAMTLBVEBOX3_CLKGATE_DIS |
1474                             GAMTLBVEBOX2_CLKGATE_DIS |
1475                             GAMTLBVEBOX1_CLKGATE_DIS |
1476                             GAMTLBVEBOX0_CLKGATE_DIS);
1477         }
1478
1479         /* Wa_16012725990:xehpsdv */
1480         if (IS_XEHPSDV_GRAPHICS_STEP(i915, STEP_A1, STEP_FOREVER))
1481                 wa_write_or(wal, UNSLICE_UNIT_LEVEL_CLKGATE, VFUNIT_CLKGATE_DIS);
1482
1483         /* Wa_14011060649:xehpsdv */
1484         wa_14011060649(gt, wal);
1485 }
1486
1487 static void
1488 dg2_gt_workarounds_init(struct intel_gt *gt, struct i915_wa_list *wal)
1489 {
1490         struct intel_engine_cs *engine;
1491         int id;
1492
1493         xehp_init_mcr(gt, wal);
1494
1495         /* Wa_14011060649:dg2 */
1496         wa_14011060649(gt, wal);
1497
1498         /*
1499          * Although there are per-engine instances of these registers,
1500          * they technically exist outside the engine itself and are not
1501          * impacted by engine resets.  Furthermore, they're part of the
1502          * GuC blacklist so trying to treat them as engine workarounds
1503          * will result in GuC initialization failure and a wedged GPU.
1504          */
1505         for_each_engine(engine, gt, id) {
1506                 if (engine->class != VIDEO_DECODE_CLASS)
1507                         continue;
1508
1509                 /* Wa_16010515920:dg2_g10 */
1510                 if (IS_DG2_GRAPHICS_STEP(gt->i915, G10, STEP_A0, STEP_B0))
1511                         wa_write_or(wal, VDBOX_CGCTL3F18(engine->mmio_base),
1512                                     ALNUNIT_CLKGATE_DIS);
1513         }
1514
1515         if (IS_DG2_G10(gt->i915)) {
1516                 /* Wa_22010523718:dg2 */
1517                 wa_write_or(wal, UNSLICE_UNIT_LEVEL_CLKGATE,
1518                             CG3DDISCFEG_CLKGATE_DIS);
1519
1520                 /* Wa_14011006942:dg2 */
1521                 wa_mcr_write_or(wal, GEN11_SUBSLICE_UNIT_LEVEL_CLKGATE,
1522                                 DSS_ROUTER_CLKGATE_DIS);
1523         }
1524
1525         if (IS_DG2_GRAPHICS_STEP(gt->i915, G10, STEP_A0, STEP_B0)) {
1526                 /* Wa_14010948348:dg2_g10 */
1527                 wa_write_or(wal, UNSLCGCTL9430, MSQDUNIT_CLKGATE_DIS);
1528
1529                 /* Wa_14011037102:dg2_g10 */
1530                 wa_write_or(wal, UNSLCGCTL9444, LTCDD_CLKGATE_DIS);
1531
1532                 /* Wa_14011371254:dg2_g10 */
1533                 wa_mcr_write_or(wal, XEHP_SLICE_UNIT_LEVEL_CLKGATE, NODEDSS_CLKGATE_DIS);
1534
1535                 /* Wa_14011431319:dg2_g10 */
1536                 wa_write_or(wal, UNSLCGCTL9440, GAMTLBOACS_CLKGATE_DIS |
1537                             GAMTLBVDBOX7_CLKGATE_DIS |
1538                             GAMTLBVDBOX6_CLKGATE_DIS |
1539                             GAMTLBVDBOX5_CLKGATE_DIS |
1540                             GAMTLBVDBOX4_CLKGATE_DIS |
1541                             GAMTLBVDBOX3_CLKGATE_DIS |
1542                             GAMTLBVDBOX2_CLKGATE_DIS |
1543                             GAMTLBVDBOX1_CLKGATE_DIS |
1544                             GAMTLBVDBOX0_CLKGATE_DIS |
1545                             GAMTLBKCR_CLKGATE_DIS |
1546                             GAMTLBGUC_CLKGATE_DIS |
1547                             GAMTLBBLT_CLKGATE_DIS);
1548                 wa_write_or(wal, UNSLCGCTL9444, GAMTLBGFXA0_CLKGATE_DIS |
1549                             GAMTLBGFXA1_CLKGATE_DIS |
1550                             GAMTLBCOMPA0_CLKGATE_DIS |
1551                             GAMTLBCOMPA1_CLKGATE_DIS |
1552                             GAMTLBCOMPB0_CLKGATE_DIS |
1553                             GAMTLBCOMPB1_CLKGATE_DIS |
1554                             GAMTLBCOMPC0_CLKGATE_DIS |
1555                             GAMTLBCOMPC1_CLKGATE_DIS |
1556                             GAMTLBCOMPD0_CLKGATE_DIS |
1557                             GAMTLBCOMPD1_CLKGATE_DIS |
1558                             GAMTLBMERT_CLKGATE_DIS   |
1559                             GAMTLBVEBOX3_CLKGATE_DIS |
1560                             GAMTLBVEBOX2_CLKGATE_DIS |
1561                             GAMTLBVEBOX1_CLKGATE_DIS |
1562                             GAMTLBVEBOX0_CLKGATE_DIS);
1563
1564                 /* Wa_14010569222:dg2_g10 */
1565                 wa_write_or(wal, UNSLICE_UNIT_LEVEL_CLKGATE,
1566                             GAMEDIA_CLKGATE_DIS);
1567
1568                 /* Wa_14011028019:dg2_g10 */
1569                 wa_mcr_write_or(wal, SSMCGCTL9530, RTFUNIT_CLKGATE_DIS);
1570         }
1571
1572         /* Wa_14014830051:dg2 */
1573         wa_mcr_write_clr(wal, SARB_CHICKEN1, COMP_CKN_IN);
1574
1575         /*
1576          * The following are not actually "workarounds" but rather
1577          * recommended tuning settings documented in the bspec's
1578          * performance guide section.
1579          */
1580         wa_mcr_write_or(wal, XEHP_SQCM, EN_32B_ACCESS);
1581
1582         /* Wa_14015795083 */
1583         wa_mcr_write_clr(wal, GEN8_MISCCPCTL, GEN12_DOP_CLOCK_GATE_RENDER_ENABLE);
1584 }
1585
1586 static void
1587 pvc_gt_workarounds_init(struct intel_gt *gt, struct i915_wa_list *wal)
1588 {
1589         pvc_init_mcr(gt, wal);
1590
1591         /* Wa_14015795083 */
1592         wa_mcr_write_clr(wal, GEN8_MISCCPCTL, GEN12_DOP_CLOCK_GATE_RENDER_ENABLE);
1593 }
1594
1595 static void
1596 xelpg_gt_workarounds_init(struct intel_gt *gt, struct i915_wa_list *wal)
1597 {
1598         /* FIXME: Actual workarounds will be added in future patch(es) */
1599
1600         /*
1601          * Unlike older platforms, we no longer set up implicit steering here;
1602          * all MCR accesses are explicitly steered.
1603          */
1604         debug_dump_steering(gt);
1605 }
1606
1607 static void
1608 xelpmp_gt_workarounds_init(struct intel_gt *gt, struct i915_wa_list *wal)
1609 {
1610         /* FIXME: Actual workarounds will be added in future patch(es) */
1611
1612         debug_dump_steering(gt);
1613 }
1614
1615 static void
1616 gt_init_workarounds(struct intel_gt *gt, struct i915_wa_list *wal)
1617 {
1618         struct drm_i915_private *i915 = gt->i915;
1619
1620         if (gt->type == GT_MEDIA) {
1621                 if (MEDIA_VER(i915) >= 13)
1622                         xelpmp_gt_workarounds_init(gt, wal);
1623                 else
1624                         MISSING_CASE(MEDIA_VER(i915));
1625
1626                 return;
1627         }
1628
1629         if (GRAPHICS_VER_FULL(i915) >= IP_VER(12, 70))
1630                 xelpg_gt_workarounds_init(gt, wal);
1631         else if (IS_PONTEVECCHIO(i915))
1632                 pvc_gt_workarounds_init(gt, wal);
1633         else if (IS_DG2(i915))
1634                 dg2_gt_workarounds_init(gt, wal);
1635         else if (IS_XEHPSDV(i915))
1636                 xehpsdv_gt_workarounds_init(gt, wal);
1637         else if (IS_DG1(i915))
1638                 dg1_gt_workarounds_init(gt, wal);
1639         else if (IS_TIGERLAKE(i915))
1640                 tgl_gt_workarounds_init(gt, wal);
1641         else if (GRAPHICS_VER(i915) == 12)
1642                 gen12_gt_workarounds_init(gt, wal);
1643         else if (GRAPHICS_VER(i915) == 11)
1644                 icl_gt_workarounds_init(gt, wal);
1645         else if (IS_COFFEELAKE(i915) || IS_COMETLAKE(i915))
1646                 cfl_gt_workarounds_init(gt, wal);
1647         else if (IS_GEMINILAKE(i915))
1648                 glk_gt_workarounds_init(gt, wal);
1649         else if (IS_KABYLAKE(i915))
1650                 kbl_gt_workarounds_init(gt, wal);
1651         else if (IS_BROXTON(i915))
1652                 gen9_gt_workarounds_init(gt, wal);
1653         else if (IS_SKYLAKE(i915))
1654                 skl_gt_workarounds_init(gt, wal);
1655         else if (IS_HASWELL(i915))
1656                 hsw_gt_workarounds_init(gt, wal);
1657         else if (IS_VALLEYVIEW(i915))
1658                 vlv_gt_workarounds_init(gt, wal);
1659         else if (IS_IVYBRIDGE(i915))
1660                 ivb_gt_workarounds_init(gt, wal);
1661         else if (GRAPHICS_VER(i915) == 6)
1662                 snb_gt_workarounds_init(gt, wal);
1663         else if (GRAPHICS_VER(i915) == 5)
1664                 ilk_gt_workarounds_init(gt, wal);
1665         else if (IS_G4X(i915))
1666                 g4x_gt_workarounds_init(gt, wal);
1667         else if (GRAPHICS_VER(i915) == 4)
1668                 gen4_gt_workarounds_init(gt, wal);
1669         else if (GRAPHICS_VER(i915) <= 8)
1670                 ;
1671         else
1672                 MISSING_CASE(GRAPHICS_VER(i915));
1673 }
1674
1675 void intel_gt_init_workarounds(struct intel_gt *gt)
1676 {
1677         struct i915_wa_list *wal = &gt->wa_list;
1678
1679         wa_init_start(wal, "GT", "global");
1680         gt_init_workarounds(gt, wal);
1681         wa_init_finish(wal);
1682 }
1683
1684 static enum forcewake_domains
1685 wal_get_fw_for_rmw(struct intel_uncore *uncore, const struct i915_wa_list *wal)
1686 {
1687         enum forcewake_domains fw = 0;
1688         struct i915_wa *wa;
1689         unsigned int i;
1690
1691         for (i = 0, wa = wal->list; i < wal->count; i++, wa++)
1692                 fw |= intel_uncore_forcewake_for_reg(uncore,
1693                                                      wa->reg,
1694                                                      FW_REG_READ |
1695                                                      FW_REG_WRITE);
1696
1697         return fw;
1698 }
1699
1700 static bool
1701 wa_verify(const struct i915_wa *wa, u32 cur, const char *name, const char *from)
1702 {
1703         if ((cur ^ wa->set) & wa->read) {
1704                 DRM_ERROR("%s workaround lost on %s! (reg[%x]=0x%x, relevant bits were 0x%x vs expected 0x%x)\n",
1705                           name, from, i915_mmio_reg_offset(wa->reg),
1706                           cur, cur & wa->read, wa->set & wa->read);
1707
1708                 return false;
1709         }
1710
1711         return true;
1712 }
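
/*
 * Worked example of the check above, with purely illustrative values: for an
 * entry with wa->set = 0x4 and wa->read = 0x4, a readback of 0x0 gives
 * (0x0 ^ 0x4) & 0x4 == 0x4, i.e. a bit we care about no longer holds the
 * programmed value, so the workaround is reported as lost.  Bits outside
 * wa->read never trigger the warning.
 */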
1713
1714 static void
1715 wa_list_apply(struct intel_gt *gt, const struct i915_wa_list *wal)
1716 {
1717         struct intel_uncore *uncore = gt->uncore;
1718         enum forcewake_domains fw;
1719         unsigned long flags;
1720         struct i915_wa *wa;
1721         unsigned int i;
1722
1723         if (!wal->count)
1724                 return;
1725
1726         fw = wal_get_fw_for_rmw(uncore, wal);
1727
1728         spin_lock_irqsave(&uncore->lock, flags);
1729         intel_uncore_forcewake_get__locked(uncore, fw);
1730
1731         for (i = 0, wa = wal->list; i < wal->count; i++, wa++) {
1732                 u32 val, old = 0;
1733
1734                 /* open-coded rmw due to steering */
1735                 if (wa->clr)
1736                         old = wa->is_mcr ?
1737                                 intel_gt_mcr_read_any_fw(gt, wa->mcr_reg) :
1738                                 intel_uncore_read_fw(uncore, wa->reg);
1739                 val = (old & ~wa->clr) | wa->set;
1740                 if (val != old || !wa->clr) {
1741                         if (wa->is_mcr)
1742                                 intel_gt_mcr_multicast_write_fw(gt, wa->mcr_reg, val);
1743                         else
1744                                 intel_uncore_write_fw(uncore, wa->reg, val);
1745                 }
1746
1747                 if (IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM)) {
1748                         u32 val = wa->is_mcr ?
1749                                 intel_gt_mcr_read_any_fw(gt, wa->mcr_reg) :
1750                                 intel_uncore_read_fw(uncore, wa->reg);
1751
1752                         wa_verify(wa, val, wal->name, "application");
1753                 }
1754         }
1755
1756         intel_uncore_forcewake_put__locked(uncore, fw);
1757         spin_unlock_irqrestore(&uncore->lock, flags);
1758 }
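
/*
 * Worked example of the open-coded RMW above, with illustrative values: an
 * entry with wa->clr = 0xff00 and wa->set = 0x0400 turns a current value of
 * 0xab00 into (0xab00 & ~0xff00) | 0x0400 = 0x0400 before writing it back,
 * while an entry with wa->clr == 0 skips the read entirely and simply
 * writes wa->set.
 */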
1759
1760 void intel_gt_apply_workarounds(struct intel_gt *gt)
1761 {
1762         wa_list_apply(gt, &gt->wa_list);
1763 }
1764
1765 static bool wa_list_verify(struct intel_gt *gt,
1766                            const struct i915_wa_list *wal,
1767                            const char *from)
1768 {
1769         struct intel_uncore *uncore = gt->uncore;
1770         struct i915_wa *wa;
1771         enum forcewake_domains fw;
1772         unsigned long flags;
1773         unsigned int i;
1774         bool ok = true;
1775
1776         fw = wal_get_fw_for_rmw(uncore, wal);
1777
1778         spin_lock_irqsave(&uncore->lock, flags);
1779         intel_uncore_forcewake_get__locked(uncore, fw);
1780
1781         for (i = 0, wa = wal->list; i < wal->count; i++, wa++)
1782                 ok &= wa_verify(wa, wa->is_mcr ?
1783                                 intel_gt_mcr_read_any_fw(gt, wa->mcr_reg) :
1784                                 intel_uncore_read_fw(uncore, wa->reg),
1785                                 wal->name, from);
1786
1787         intel_uncore_forcewake_put__locked(uncore, fw);
1788         spin_unlock_irqrestore(&uncore->lock, flags);
1789
1790         return ok;
1791 }
1792
1793 bool intel_gt_verify_workarounds(struct intel_gt *gt, const char *from)
1794 {
1795         return wa_list_verify(gt, &gt->wa_list, from);
1796 }
1797
1798 __maybe_unused
1799 static bool is_nonpriv_flags_valid(u32 flags)
1800 {
1801         /* Check only valid flag bits are set */
1802         if (flags & ~RING_FORCE_TO_NONPRIV_MASK_VALID)
1803                 return false;
1804
1805         /* NB: Only 3 out of 4 enum values are valid for access field */
1806         if ((flags & RING_FORCE_TO_NONPRIV_ACCESS_MASK) ==
1807             RING_FORCE_TO_NONPRIV_ACCESS_INVALID)
1808                 return false;
1809
1810         return true;
1811 }
1812
1813 static void
1814 whitelist_reg_ext(struct i915_wa_list *wal, i915_reg_t reg, u32 flags)
1815 {
1816         struct i915_wa wa = {
1817                 .reg = reg
1818         };
1819
1820         if (GEM_DEBUG_WARN_ON(wal->count >= RING_MAX_NONPRIV_SLOTS))
1821                 return;
1822
1823         if (GEM_DEBUG_WARN_ON(!is_nonpriv_flags_valid(flags)))
1824                 return;
1825
1826         wa.reg.reg |= flags;
1827         _wa_add(wal, &wa);
1828 }
1829
1830 static void
1831 whitelist_mcr_reg_ext(struct i915_wa_list *wal, i915_mcr_reg_t reg, u32 flags)
1832 {
1833         struct i915_wa wa = {
1834                 .mcr_reg = reg,
1835                 .is_mcr = 1,
1836         };
1837
1838         if (GEM_DEBUG_WARN_ON(wal->count >= RING_MAX_NONPRIV_SLOTS))
1839                 return;
1840
1841         if (GEM_DEBUG_WARN_ON(!is_nonpriv_flags_valid(flags)))
1842                 return;
1843
1844         wa.mcr_reg.reg |= flags;
1845         _wa_add(wal, &wa);
1846 }
1847
1848 static void
1849 whitelist_reg(struct i915_wa_list *wal, i915_reg_t reg)
1850 {
1851         whitelist_reg_ext(wal, reg, RING_FORCE_TO_NONPRIV_ACCESS_RW);
1852 }
1853
1854 static void
1855 whitelist_mcr_reg(struct i915_wa_list *wal, i915_mcr_reg_t reg)
1856 {
1857         whitelist_mcr_reg_ext(wal, reg, RING_FORCE_TO_NONPRIV_ACCESS_RW);
1858 }
1859
1860 static void gen9_whitelist_build(struct i915_wa_list *w)
1861 {
1862         /* WaVFEStateAfterPipeControlwithMediaStateClear:skl,bxt,glk,cfl */
1863         whitelist_reg(w, GEN9_CTX_PREEMPT_REG);
1864
1865         /* WaEnablePreemptionGranularityControlByUMD:skl,bxt,kbl,cfl,[cnl] */
1866         whitelist_reg(w, GEN8_CS_CHICKEN1);
1867
1868         /* WaAllowUMDToModifyHDCChicken1:skl,bxt,kbl,glk,cfl */
1869         whitelist_reg(w, GEN8_HDC_CHICKEN1);
1870
1871         /* WaSendPushConstantsFromMMIO:skl,bxt */
1872         whitelist_reg(w, COMMON_SLICE_CHICKEN2);
1873 }
1874
1875 static void skl_whitelist_build(struct intel_engine_cs *engine)
1876 {
1877         struct i915_wa_list *w = &engine->whitelist;
1878
1879         if (engine->class != RENDER_CLASS)
1880                 return;
1881
1882         gen9_whitelist_build(w);
1883
1884         /* WaDisableLSQCROPERFforOCL:skl */
1885         whitelist_mcr_reg(w, GEN8_L3SQCREG4);
1886 }
1887
1888 static void bxt_whitelist_build(struct intel_engine_cs *engine)
1889 {
1890         if (engine->class != RENDER_CLASS)
1891                 return;
1892
1893         gen9_whitelist_build(&engine->whitelist);
1894 }
1895
1896 static void kbl_whitelist_build(struct intel_engine_cs *engine)
1897 {
1898         struct i915_wa_list *w = &engine->whitelist;
1899
1900         if (engine->class != RENDER_CLASS)
1901                 return;
1902
1903         gen9_whitelist_build(w);
1904
1905         /* WaDisableLSQCROPERFforOCL:kbl */
1906         whitelist_mcr_reg(w, GEN8_L3SQCREG4);
1907 }
1908
1909 static void glk_whitelist_build(struct intel_engine_cs *engine)
1910 {
1911         struct i915_wa_list *w = &engine->whitelist;
1912
1913         if (engine->class != RENDER_CLASS)
1914                 return;
1915
1916         gen9_whitelist_build(w);
1917
1918         /* WA #0862: Userspace has to set "Barrier Mode" to avoid hangs. */
1919         whitelist_reg(w, GEN9_SLICE_COMMON_ECO_CHICKEN1);
1920 }
1921
1922 static void cfl_whitelist_build(struct intel_engine_cs *engine)
1923 {
1924         struct i915_wa_list *w = &engine->whitelist;
1925
1926         if (engine->class != RENDER_CLASS)
1927                 return;
1928
1929         gen9_whitelist_build(w);
1930
1931         /*
1932          * WaAllowPMDepthAndInvocationCountAccessFromUMD:cfl,whl,cml,aml
1933          *
1934          * This covers 4 registers which are next to one another:
1935          *   - PS_INVOCATION_COUNT
1936          *   - PS_INVOCATION_COUNT_UDW
1937          *   - PS_DEPTH_COUNT
1938          *   - PS_DEPTH_COUNT_UDW
1939          */
1940         whitelist_reg_ext(w, PS_INVOCATION_COUNT,
1941                           RING_FORCE_TO_NONPRIV_ACCESS_RD |
1942                           RING_FORCE_TO_NONPRIV_RANGE_4);
1943 }
1944
1945 static void allow_read_ctx_timestamp(struct intel_engine_cs *engine)
1946 {
1947         struct i915_wa_list *w = &engine->whitelist;
1948
1949         if (engine->class != RENDER_CLASS)
1950                 whitelist_reg_ext(w,
1951                                   RING_CTX_TIMESTAMP(engine->mmio_base),
1952                                   RING_FORCE_TO_NONPRIV_ACCESS_RD);
1953 }
1954
1955 static void cml_whitelist_build(struct intel_engine_cs *engine)
1956 {
1957         allow_read_ctx_timestamp(engine);
1958
1959         cfl_whitelist_build(engine);
1960 }
1961
1962 static void icl_whitelist_build(struct intel_engine_cs *engine)
1963 {
1964         struct i915_wa_list *w = &engine->whitelist;
1965
1966         allow_read_ctx_timestamp(engine);
1967
1968         switch (engine->class) {
1969         case RENDER_CLASS:
1970                 /* WaAllowUMDToModifyHalfSliceChicken7:icl */
1971                 whitelist_mcr_reg(w, GEN9_HALF_SLICE_CHICKEN7);
1972
1973                 /* WaAllowUMDToModifySamplerMode:icl */
1974                 whitelist_mcr_reg(w, GEN10_SAMPLER_MODE);
1975
1976                 /* WaEnableStateCacheRedirectToCS:icl */
1977                 whitelist_reg(w, GEN9_SLICE_COMMON_ECO_CHICKEN1);
1978
1979                 /*
1980                  * WaAllowPMDepthAndInvocationCountAccessFromUMD:icl
1981                  *
1982                  * This covers 4 registers which are next to one another:
1983                  *   - PS_INVOCATION_COUNT
1984                  *   - PS_INVOCATION_COUNT_UDW
1985                  *   - PS_DEPTH_COUNT
1986                  *   - PS_DEPTH_COUNT_UDW
1987                  */
1988                 whitelist_reg_ext(w, PS_INVOCATION_COUNT,
1989                                   RING_FORCE_TO_NONPRIV_ACCESS_RD |
1990                                   RING_FORCE_TO_NONPRIV_RANGE_4);
1991                 break;
1992
1993         case VIDEO_DECODE_CLASS:
1994                 /* hucStatusRegOffset */
1995                 whitelist_reg_ext(w, _MMIO(0x2000 + engine->mmio_base),
1996                                   RING_FORCE_TO_NONPRIV_ACCESS_RD);
1997                 /* hucUKernelHdrInfoRegOffset */
1998                 whitelist_reg_ext(w, _MMIO(0x2014 + engine->mmio_base),
1999                                   RING_FORCE_TO_NONPRIV_ACCESS_RD);
2000                 /* hucStatus2RegOffset */
2001                 whitelist_reg_ext(w, _MMIO(0x23B0 + engine->mmio_base),
2002                                   RING_FORCE_TO_NONPRIV_ACCESS_RD);
2003                 break;
2004
2005         default:
2006                 break;
2007         }
2008 }
2009
2010 static void tgl_whitelist_build(struct intel_engine_cs *engine)
2011 {
2012         struct i915_wa_list *w = &engine->whitelist;
2013
2014         allow_read_ctx_timestamp(engine);
2015
2016         switch (engine->class) {
2017         case RENDER_CLASS:
2018                 /*
2019                  * WaAllowPMDepthAndInvocationCountAccessFromUMD:tgl
2020                  * Wa_1408556865:tgl
2021                  *
2022                  * This covers 4 registers which are next to one another:
2023                  *   - PS_INVOCATION_COUNT
2024                  *   - PS_INVOCATION_COUNT_UDW
2025                  *   - PS_DEPTH_COUNT
2026                  *   - PS_DEPTH_COUNT_UDW
2027                  */
2028                 whitelist_reg_ext(w, PS_INVOCATION_COUNT,
2029                                   RING_FORCE_TO_NONPRIV_ACCESS_RD |
2030                                   RING_FORCE_TO_NONPRIV_RANGE_4);
2031
2032                 /*
2033                  * Wa_1808121037:tgl
2034                  * Wa_14012131227:dg1
2035                  * Wa_1508744258:tgl,rkl,dg1,adl-s,adl-p
2036                  */
2037                 whitelist_reg(w, GEN7_COMMON_SLICE_CHICKEN1);
2038
2039                 /* Wa_1806527549:tgl */
2040                 whitelist_reg(w, HIZ_CHICKEN);
2041                 break;
2042         default:
2043                 break;
2044         }
2045 }
2046
2047 static void dg1_whitelist_build(struct intel_engine_cs *engine)
2048 {
2049         struct i915_wa_list *w = &engine->whitelist;
2050
2051         tgl_whitelist_build(engine);
2052
2053         /* GEN:BUG:1409280441:dg1 */
2054         if (IS_DG1_GRAPHICS_STEP(engine->i915, STEP_A0, STEP_B0) &&
2055             (engine->class == RENDER_CLASS ||
2056              engine->class == COPY_ENGINE_CLASS))
2057                 whitelist_reg_ext(w, RING_ID(engine->mmio_base),
2058                                   RING_FORCE_TO_NONPRIV_ACCESS_RD);
2059 }
2060
2061 static void xehpsdv_whitelist_build(struct intel_engine_cs *engine)
2062 {
2063         allow_read_ctx_timestamp(engine);
2064 }
2065
2066 static void dg2_whitelist_build(struct intel_engine_cs *engine)
2067 {
2068         struct i915_wa_list *w = &engine->whitelist;
2069
2070         allow_read_ctx_timestamp(engine);
2071
2072         switch (engine->class) {
2073         case RENDER_CLASS:
2074                 /*
2075                  * Wa_1507100340:dg2_g10
2076                  *
2077                  * This covers 4 registers which are next to one another:
2078                  *   - PS_INVOCATION_COUNT
2079                  *   - PS_INVOCATION_COUNT_UDW
2080                  *   - PS_DEPTH_COUNT
2081                  *   - PS_DEPTH_COUNT_UDW
2082                  */
2083                 if (IS_DG2_GRAPHICS_STEP(engine->i915, G10, STEP_A0, STEP_B0))
2084                         whitelist_reg_ext(w, PS_INVOCATION_COUNT,
2085                                           RING_FORCE_TO_NONPRIV_ACCESS_RD |
2086                                           RING_FORCE_TO_NONPRIV_RANGE_4);
2087
2088                 break;
2089         case COMPUTE_CLASS:
2090                 /* Wa_16011157294:dg2_g10 */
2091                 if (IS_DG2_GRAPHICS_STEP(engine->i915, G10, STEP_A0, STEP_B0))
2092                         whitelist_reg(w, GEN9_CTX_PREEMPT_REG);
2093                 break;
2094         default:
2095                 break;
2096         }
2097 }
2098
2099 static void blacklist_trtt(struct intel_engine_cs *engine)
2100 {
2101         struct i915_wa_list *w = &engine->whitelist;
2102
2103         /*
2104          * Prevent read/write access to [0x4400, 0x4600) which covers
2105          * the TRTT range across all engines. Note that normally userspace
2106          * cannot access the other engines' trtt control, but for simplicity
2107          * we cover the entire range on each engine.
2108          */
2109         whitelist_reg_ext(w, _MMIO(0x4400),
2110                           RING_FORCE_TO_NONPRIV_DENY |
2111                           RING_FORCE_TO_NONPRIV_RANGE_64);
2112         whitelist_reg_ext(w, _MMIO(0x4500),
2113                           RING_FORCE_TO_NONPRIV_DENY |
2114                           RING_FORCE_TO_NONPRIV_RANGE_64);
2115 }
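
/*
 * Range arithmetic for the two entries above: a RING_FORCE_TO_NONPRIV range
 * of 64 spans 64 dword registers, i.e. 0x100 bytes, so the entries based at
 * 0x4400 and 0x4500 deny 0x4400-0x44ff and 0x4500-0x45ff respectively,
 * which together is exactly the [0x4400, 0x4600) TRTT window noted above.
 */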
2116
2117 static void pvc_whitelist_build(struct intel_engine_cs *engine)
2118 {
2119         allow_read_ctx_timestamp(engine);
2120
2121         /* Wa_16014440446:pvc */
2122         blacklist_trtt(engine);
2123 }
2124
2125 void intel_engine_init_whitelist(struct intel_engine_cs *engine)
2126 {
2127         struct drm_i915_private *i915 = engine->i915;
2128         struct i915_wa_list *w = &engine->whitelist;
2129
2130         wa_init_start(w, "whitelist", engine->name);
2131
2132         if (IS_PONTEVECCHIO(i915))
2133                 pvc_whitelist_build(engine);
2134         else if (IS_DG2(i915))
2135                 dg2_whitelist_build(engine);
2136         else if (IS_XEHPSDV(i915))
2137                 xehpsdv_whitelist_build(engine);
2138         else if (IS_DG1(i915))
2139                 dg1_whitelist_build(engine);
2140         else if (GRAPHICS_VER(i915) == 12)
2141                 tgl_whitelist_build(engine);
2142         else if (GRAPHICS_VER(i915) == 11)
2143                 icl_whitelist_build(engine);
2144         else if (IS_COMETLAKE(i915))
2145                 cml_whitelist_build(engine);
2146         else if (IS_COFFEELAKE(i915))
2147                 cfl_whitelist_build(engine);
2148         else if (IS_GEMINILAKE(i915))
2149                 glk_whitelist_build(engine);
2150         else if (IS_KABYLAKE(i915))
2151                 kbl_whitelist_build(engine);
2152         else if (IS_BROXTON(i915))
2153                 bxt_whitelist_build(engine);
2154         else if (IS_SKYLAKE(i915))
2155                 skl_whitelist_build(engine);
2156         else if (GRAPHICS_VER(i915) <= 8)
2157                 ;
2158         else
2159                 MISSING_CASE(GRAPHICS_VER(i915));
2160
2161         wa_init_finish(w);
2162 }
2163
2164 void intel_engine_apply_whitelist(struct intel_engine_cs *engine)
2165 {
2166         const struct i915_wa_list *wal = &engine->whitelist;
2167         struct intel_uncore *uncore = engine->uncore;
2168         const u32 base = engine->mmio_base;
2169         struct i915_wa *wa;
2170         unsigned int i;
2171
2172         if (!wal->count)
2173                 return;
2174
2175         for (i = 0, wa = wal->list; i < wal->count; i++, wa++)
2176                 intel_uncore_write(uncore,
2177                                    RING_FORCE_TO_NONPRIV(base, i),
2178                                    i915_mmio_reg_offset(wa->reg));
2179
2180         /* And clear the rest just in case of garbage */
2181         for (; i < RING_MAX_NONPRIV_SLOTS; i++)
2182                 intel_uncore_write(uncore,
2183                                    RING_FORCE_TO_NONPRIV(base, i),
2184                                    i915_mmio_reg_offset(RING_NOPID(base)));
2185 }
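
/*
 * Minimal sketch, not used anywhere in the driver: since whitelist_reg_ext()
 * ORs the access/range flags straight into the register offset, the value
 * that lands in a RING_FORCE_TO_NONPRIV slot is simply "offset | flags".
 * For the PS_INVOCATION_COUNT entry added by cfl_whitelist_build(), for
 * example, that would be (the helper name below is only for illustration):
 */
__maybe_unused
static u32 example_nonpriv_slot_value(void)
{
        return i915_mmio_reg_offset(PS_INVOCATION_COUNT) |
               RING_FORCE_TO_NONPRIV_ACCESS_RD |
               RING_FORCE_TO_NONPRIV_RANGE_4;
}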
2186
2187 /*
2188  * engine_fake_wa_init() is a placeholder for programming registers
2189  * that are not part of an official workaround defined by the
2190  * hardware team.
2191  * Applying them through the workaround framework lets us reuse its
2192  * machinery for proper application and verification.
2193  */
2194 static void
2195 engine_fake_wa_init(struct intel_engine_cs *engine, struct i915_wa_list *wal)
2196 {
2197         u8 mocs_w, mocs_r;
2198
2199         /*
2200          * RING_CMD_CCTL specifies the default MOCS entry that will be used
2201          * by the command streamer when executing commands that don't have
2202          * a way to explicitly specify a MOCS setting.  The default should
2203          * usually reference whichever MOCS entry corresponds to uncached
2204          * behavior, although use of a WB cached entry is recommended by the
2205          * spec in certain circumstances on specific platforms.
2206          */
2207         if (GRAPHICS_VER(engine->i915) >= 12) {
2208                 mocs_r = engine->gt->mocs.uc_index;
2209                 mocs_w = engine->gt->mocs.uc_index;
2210
2211                 if (HAS_L3_CCS_READ(engine->i915) &&
2212                     engine->class == COMPUTE_CLASS) {
2213                         mocs_r = engine->gt->mocs.wb_index;
2214
2215                         /*
2216                          * Even on the few platforms where MOCS 0 is a
2217                          * legitimate table entry, it's never the correct
2218                          * setting to use here; we can assume the MOCS init
2219                          * just forgot to initialize wb_index.
2220                          */
2221                         drm_WARN_ON(&engine->i915->drm, mocs_r == 0);
2222                 }
2223
2224                 wa_masked_field_set(wal,
2225                                     RING_CMD_CCTL(engine->mmio_base),
2226                                     CMD_CCTL_MOCS_MASK,
2227                                     CMD_CCTL_MOCS_OVERRIDE(mocs_w, mocs_r));
2228         }
2229 }
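
/*
 * Worked example with hypothetical MOCS table indexes: on a gen12+ compute
 * engine with HAS_L3_CCS_READ(), if uc_index were 3 and wb_index were 2,
 * the code above would leave mocs_w = 3 (uncached writes) and select
 * mocs_r = 2 (WB reads), so CMD_CCTL_MOCS_OVERRIDE(3, 2) is what the
 * masked-field write programs into RING_CMD_CCTL.
 */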
2230
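/*
 * Wa_1308578152 only matters when the whole first gslice is fused off; if
 * the lowest enabled DSS index is already >= GEN_DSS_PER_GSLICE, no DSS in
 * gslice 0 is available.
 */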
2231 static bool needs_wa_1308578152(struct intel_engine_cs *engine)
2232 {
2233         return intel_sseu_find_first_xehp_dss(&engine->gt->info.sseu, 0, 0) >=
2234                 GEN_DSS_PER_GSLICE;
2235 }
2236
2237 static void
2238 rcs_engine_wa_init(struct intel_engine_cs *engine, struct i915_wa_list *wal)
2239 {
2240         struct drm_i915_private *i915 = engine->i915;
2241
2242         if (IS_DG2(i915)) {
2243                 /* Wa_1509235366:dg2 */
2244                 wa_write_or(wal, GEN12_GAMCNTRL_CTRL, INVALIDATION_BROADCAST_MODE_DIS |
2245                             GLOBAL_INVALIDATION_MODE);
2246         }
2247
2248         if (IS_DG2_GRAPHICS_STEP(i915, G11, STEP_A0, STEP_B0)) {
2249                 /* Wa_14013392000:dg2_g11 */
2250                 wa_mcr_masked_en(wal, GEN8_ROW_CHICKEN2, GEN12_ENABLE_LARGE_GRF_MODE);
2251         }
2252
2253         if (IS_DG2_GRAPHICS_STEP(i915, G10, STEP_B0, STEP_FOREVER) ||
2254             IS_DG2_G11(i915) || IS_DG2_G12(i915)) {
2255                 /* Wa_1509727124:dg2 */
2256                 wa_mcr_masked_en(wal, GEN10_SAMPLER_MODE,
2257                                  SC_DISABLE_POWER_OPTIMIZATION_EBB);
2258         }
2259
2260         if (IS_DG2_GRAPHICS_STEP(i915, G10, STEP_A0, STEP_B0) ||
2261             IS_DG2_GRAPHICS_STEP(i915, G11, STEP_A0, STEP_B0)) {
2262                 /* Wa_14012419201:dg2 */
2263                 wa_mcr_masked_en(wal, GEN9_ROW_CHICKEN4,
2264                                  GEN12_DISABLE_HDR_PAST_PAYLOAD_HOLD_FIX);
2265         }
2266
2267         if (IS_DG2_GRAPHICS_STEP(i915, G10, STEP_B0, STEP_C0) ||
2268             IS_DG2_G11(i915)) {
2269                 /*
2270                  * Wa_22012826095:dg2
2271                  * Wa_22013059131:dg2
2272                  */
2273                 wa_mcr_write_clr_set(wal, LSC_CHICKEN_BIT_0_UDW,
2274                                      MAXREQS_PER_BANK,
2275                                      REG_FIELD_PREP(MAXREQS_PER_BANK, 2));
2276
2277                 /* Wa_22013059131:dg2 */
2278                 wa_mcr_write_or(wal, LSC_CHICKEN_BIT_0,
2279                                 FORCE_1_SUB_MESSAGE_PER_FRAGMENT);
2280         }
2281
2282         /* Wa_1308578152:dg2_g10 when first gslice is fused off */
2283         if (IS_DG2_GRAPHICS_STEP(i915, G10, STEP_B0, STEP_C0) &&
2284             needs_wa_1308578152(engine)) {
2285                 wa_masked_dis(wal, GEN12_CS_DEBUG_MODE1_CCCSUNIT_BE_COMMON,
2286                               GEN12_REPLAY_MODE_GRANULARITY);
2287         }
2288
2289         if (IS_DG2_GRAPHICS_STEP(i915, G10, STEP_B0, STEP_FOREVER) ||
2290             IS_DG2_G11(i915) || IS_DG2_G12(i915)) {
2291                 /* Wa_22013037850:dg2 */
2292                 wa_mcr_write_or(wal, LSC_CHICKEN_BIT_0_UDW,
2293                                 DISABLE_128B_EVICTION_COMMAND_UDW);
2294
2295                 /* Wa_22012856258:dg2 */
2296                 wa_mcr_masked_en(wal, GEN8_ROW_CHICKEN2,
2297                                  GEN12_DISABLE_READ_SUPPRESSION);
2298
2299                 /*
2300                  * Wa_22010960976:dg2
2301                  * Wa_14013347512:dg2
2302                  */
2303                 wa_mcr_masked_dis(wal, XEHP_HDC_CHICKEN0,
2304                                   LSC_L1_FLUSH_CTL_3D_DATAPORT_FLUSH_EVENTS_MASK);
2305         }
2306
2307         if (IS_DG2_GRAPHICS_STEP(i915, G10, STEP_A0, STEP_B0)) {
2308                 /*
2309                  * Wa_1608949956:dg2_g10
2310                  * Wa_14010198302:dg2_g10
2311                  */
2312                 wa_mcr_masked_en(wal, GEN8_ROW_CHICKEN,
2313                                  MDQ_ARBITRATION_MODE | UGM_BACKUP_MODE);
2314
2315                 /*
2316                  * Wa_14010918519:dg2_g10
2317                  *
2318                  * LSC_CHICKEN_BIT_0 always reads back as 0 in this stepping,
2319                  * so we skip verification.
2320                  */
2321                 wa_mcr_add(wal, LSC_CHICKEN_BIT_0_UDW, 0,
2322                            FORCE_SLM_FENCE_SCOPE_TO_TILE | FORCE_UGM_FENCE_SCOPE_TO_TILE,
2323                            0, false);
2324         }
2325
2326         if (IS_DG2_GRAPHICS_STEP(i915, G10, STEP_A0, STEP_B0)) {
2327                 /* Wa_22010430635:dg2 */
2328                 wa_mcr_masked_en(wal,
2329                                  GEN9_ROW_CHICKEN4,
2330                                  GEN12_DISABLE_GRF_CLEAR);
2331
2332                 /* Wa_14010648519:dg2 */
2333                 wa_mcr_write_or(wal, XEHP_L3NODEARBCFG, XEHP_LNESPARE);
2334         }
2335
2336         /* Wa_14013202645:dg2 */
2337         if (IS_DG2_GRAPHICS_STEP(i915, G10, STEP_B0, STEP_C0) ||
2338             IS_DG2_GRAPHICS_STEP(i915, G11, STEP_A0, STEP_B0))
2339                 wa_mcr_write_or(wal, RT_CTRL, DIS_NULL_QUERY);
2340
2341         /* Wa_22012532006:dg2 */
2342         if (IS_DG2_GRAPHICS_STEP(engine->i915, G10, STEP_A0, STEP_C0) ||
2343             IS_DG2_GRAPHICS_STEP(engine->i915, G11, STEP_A0, STEP_B0))
2344                 wa_mcr_masked_en(wal, GEN9_HALF_SLICE_CHICKEN7,
2345                                  DG2_DISABLE_ROUND_ENABLE_ALLOW_FOR_SSLA);
2346
2347         if (IS_DG2_GRAPHICS_STEP(engine->i915, G10, STEP_A0, STEP_B0)) {
2348                 /* Wa_14010680813:dg2_g10 */
2349                 wa_write_or(wal, GEN12_GAMSTLB_CTRL, CONTROL_BLOCK_CLKGATE_DIS |
2350                             EGRESS_BLOCK_CLKGATE_DIS | TAG_BLOCK_CLKGATE_DIS);
2351         }
2352
2353         if (IS_DG2_GRAPHICS_STEP(engine->i915, G10, STEP_A0, STEP_B0) ||
2354             IS_DG2_GRAPHICS_STEP(engine->i915, G11, STEP_A0, STEP_B0)) {
2355                 /* Wa_14012362059:dg2 */
2356                 wa_mcr_write_or(wal, XEHP_MERT_MOD_CTRL, FORCE_MISS_FTLB);
2357         }
2358
2359         if (IS_DG2_GRAPHICS_STEP(i915, G11, STEP_B0, STEP_FOREVER) ||
2360             IS_DG2_G10(i915)) {
2361                 /* Wa_22014600077:dg2 */
2362                 wa_mcr_add(wal, GEN10_CACHE_MODE_SS, 0,
2363                            _MASKED_BIT_ENABLE(ENABLE_EU_COUNT_FOR_TDL_FLUSH),
2364                            0 /* Wa_14012342262 write-only reg, so skip verification */,
2365                            true);
2366         }
2367
2368         if (IS_DG1_GRAPHICS_STEP(i915, STEP_A0, STEP_B0) ||
2369             IS_TGL_UY_GRAPHICS_STEP(i915, STEP_A0, STEP_B0)) {
2370                 /*
2371                  * Wa_1607138336:tgl[a0],dg1[a0]
2372                  * Wa_1607063988:tgl[a0],dg1[a0]
2373                  */
2374                 wa_write_or(wal,
2375                             GEN9_CTX_PREEMPT_REG,
2376                             GEN12_DISABLE_POSH_BUSY_FF_DOP_CG);
2377         }
2378
2379         if (IS_TGL_UY_GRAPHICS_STEP(i915, STEP_A0, STEP_B0)) {
2380                 /*
2381                  * Wa_1606679103:tgl
2382                  * (see also Wa_1606682166:icl)
2383                  */
2384                 wa_write_or(wal,
2385                             GEN7_SARCHKMD,
2386                             GEN7_DISABLE_SAMPLER_PREFETCH);
2387         }
2388
2389         if (IS_ALDERLAKE_P(i915) || IS_ALDERLAKE_S(i915) || IS_DG1(i915) ||
2390             IS_ROCKETLAKE(i915) || IS_TIGERLAKE(i915)) {
2391                 /* Wa_1606931601:tgl,rkl,dg1,adl-s,adl-p */
2392                 wa_mcr_masked_en(wal, GEN8_ROW_CHICKEN2, GEN12_DISABLE_EARLY_READ);
2393
2394                 /*
2395                  * Wa_1407928979:tgl A*
2396                  * Wa_18011464164:tgl[B0+],dg1[B0+]
2397                  * Wa_22010931296:tgl[B0+],dg1[B0+]
2398                  * Wa_14010919138:rkl,dg1,adl-s,adl-p
2399                  */
2400                 wa_write_or(wal, GEN7_FF_THREAD_MODE,
2401                             GEN12_FF_TESSELATION_DOP_GATE_DISABLE);
2402         }
2403
2404         if (IS_ALDERLAKE_P(i915) || IS_DG2(i915) || IS_ALDERLAKE_S(i915) ||
2405             IS_DG1(i915) || IS_ROCKETLAKE(i915) || IS_TIGERLAKE(i915)) {
2406                 /*
2407                  * Wa_1606700617:tgl,dg1,adl-p
2408                  * Wa_22010271021:tgl,rkl,dg1,adl-s,adl-p
2409                  * Wa_14010826681:tgl,dg1,rkl,adl-p
2410                  * Wa_18019627453:dg2
2411                  */
2412                 wa_masked_en(wal,
2413                              GEN9_CS_DEBUG_MODE1,
2414                              FF_DOP_CLOCK_GATE_DISABLE);
2415         }
2416
2417         if (IS_ALDERLAKE_P(i915) || IS_ALDERLAKE_S(i915) ||
2418             IS_DG1_GRAPHICS_STEP(i915, STEP_A0, STEP_B0) ||
2419             IS_ROCKETLAKE(i915) || IS_TIGERLAKE(i915)) {
2420                 /* Wa_1409804808:tgl,rkl,dg1[a0],adl-s,adl-p */
2421                 wa_mcr_masked_en(wal, GEN8_ROW_CHICKEN2,
2422                                  GEN12_PUSH_CONST_DEREF_HOLD_DIS);
2423
2424                 /*
2425                  * Wa_1409085225:tgl
2426                  * Wa_14010229206:tgl,rkl,dg1[a0],adl-s,adl-p
2427                  */
2428                 wa_mcr_masked_en(wal, GEN9_ROW_CHICKEN4, GEN12_DISABLE_TDL_PUSH);
2429         }
2430
2431         if (IS_DG1_GRAPHICS_STEP(i915, STEP_A0, STEP_B0) ||
2432             IS_ROCKETLAKE(i915) || IS_TIGERLAKE(i915) || IS_ALDERLAKE_P(i915)) {
2433                 /*
2434                  * Wa_1607030317:tgl
2435                  * Wa_1607186500:tgl
2436                  * Wa_1607297627:tgl,rkl,dg1[a0],adlp
2437                  *
2438                  * On TGL and RKL there are multiple entries for this WA in the
2439                  * BSpec; some indicate this is an A0-only WA, others indicate
2440                  * it applies to all steppings so we trust the "all steppings."
2441          * it applies to all steppings, so we trust the "all steppings."
2442                  */
2443                 wa_masked_en(wal,
2444                              RING_PSMI_CTL(RENDER_RING_BASE),
2445                              GEN12_WAIT_FOR_EVENT_POWER_DOWN_DISABLE |
2446                              GEN8_RC_SEMA_IDLE_MSG_DISABLE);
2447         }
2448
2449         if (IS_DG1(i915) || IS_ROCKETLAKE(i915) || IS_TIGERLAKE(i915) ||
2450             IS_ALDERLAKE_S(i915) || IS_ALDERLAKE_P(i915)) {
2451                 /* Wa_1406941453:tgl,rkl,dg1,adl-s,adl-p */
2452                 wa_mcr_masked_en(wal,
2453                                  GEN10_SAMPLER_MODE,
2454                                  ENABLE_SMALLPL);
2455         }
2456
2457         if (GRAPHICS_VER(i915) == 11) {
2458                 /* This is not a Wa; enable it for better image quality. */
2459                 wa_masked_en(wal,
2460                              _3D_CHICKEN3,
2461                              _3D_CHICKEN3_AA_LINE_QUALITY_FIX_ENABLE);
2462
2463                 /*
2464                  * Wa_1405543622:icl
2465                  * Formerly known as WaGAPZPriorityScheme
2466                  */
2467                 wa_write_or(wal,
2468                             GEN8_GARBCNTL,
2469                             GEN11_ARBITRATION_PRIO_ORDER_MASK);
2470
2471                 /*
2472                  * Wa_1604223664:icl
2473                  * Formerly known as WaL3BankAddressHashing
2474                  */
2475                 wa_write_clr_set(wal,
2476                                  GEN8_GARBCNTL,
2477                                  GEN11_HASH_CTRL_EXCL_MASK,
2478                                  GEN11_HASH_CTRL_EXCL_BIT0);
2479                 wa_write_clr_set(wal,
2480                                  GEN11_GLBLINVL,
2481                                  GEN11_BANK_HASH_ADDR_EXCL_MASK,
2482                                  GEN11_BANK_HASH_ADDR_EXCL_BIT0);
2483
2484                 /*
2485                  * Wa_1405733216:icl
2486                  * Formerly known as WaDisableCleanEvicts
2487                  */
2488                 wa_mcr_write_or(wal,
2489                                 GEN8_L3SQCREG4,
2490                                 GEN11_LQSC_CLEAN_EVICT_DISABLE);
2491
2492                 /* Wa_1606682166:icl */
2493                 wa_write_or(wal,
2494                             GEN7_SARCHKMD,
2495                             GEN7_DISABLE_SAMPLER_PREFETCH);
2496
2497                 /* Wa_1409178092:icl */
2498                 wa_mcr_write_clr_set(wal,
2499                                      GEN11_SCRATCH2,
2500                                      GEN11_COHERENT_PARTIAL_WRITE_MERGE_ENABLE,
2501                                      0);
2502
2503                 /* WaEnable32PlaneMode:icl */
2504                 wa_masked_en(wal, GEN9_CSFE_CHICKEN1_RCS,
2505                              GEN11_ENABLE_32_PLANE_MODE);
2506
2507                 /*
2508                  * Wa_1408615072:icl,ehl  (vsunit)
2509                  * Wa_1407596294:icl,ehl  (hsunit)
2510                  */
2511                 wa_write_or(wal, UNSLICE_UNIT_LEVEL_CLKGATE,
2512                             VSUNIT_CLKGATE_DIS | HSUNIT_CLKGATE_DIS);
2513
2514                 /*
2515                  * Wa_1408767742:icl[a2..forever],ehl[all]
2516                  * Wa_1605460711:icl[a0..c0]
2517                  */
2518                 wa_write_or(wal,
2519                             GEN7_FF_THREAD_MODE,
2520                             GEN12_FF_TESSELATION_DOP_GATE_DISABLE);
2521
2522                 /* Wa_22010271021 */
2523                 wa_masked_en(wal,
2524                              GEN9_CS_DEBUG_MODE1,
2525                              FF_DOP_CLOCK_GATE_DISABLE);
2526         }
2527
2528         /*
2529          * Intel platforms that support fine-grained preemption (i.e., gen9 and
2530          * beyond) allow the kernel-mode driver to choose between two different
2531          * options for controlling preemption granularity and behavior.
2532          *
2533          * Option 1 (hardware default):
2534          *   Preemption settings are controlled in a global manner via
2535          *   kernel-only register CS_DEBUG_MODE1 (0x20EC).  Any granularity
2536          *   and settings chosen by the kernel-mode driver will apply to all
2537          *   userspace clients.
2538          *
2539          * Option 2:
2540          *   Preemption settings are controlled on a per-context basis via
2541          *   register CS_CHICKEN1 (0x2580).  CS_CHICKEN1 is saved/restored on
2542          *   context switch and is writable by userspace (e.g., via
2543          *   MI_LOAD_REGISTER_IMMEDIATE instructions placed in a batch buffer)
2544          *   which allows different userspace drivers/clients to select
2545          *   different settings, or to change those settings on the fly in
2546          *   response to runtime needs.  This option was known by name
2547          *   response to runtime needs.  This option was known by the name
2548          *   that name is somewhat misleading as other non-granularity
2549          *   preemption settings are also impacted by this decision.
2550          *
2551          * On Linux, our policy has always been to let userspace drivers
2552          * control preemption granularity/settings (Option 2).  This was
2553          * originally mandatory on gen9 to prevent ABI breakage (old gen9
2554          * userspace developed before object-level preemption was enabled would
2555          * not behave well if i915 were to go with Option 1 and enable that
2556          * preemption in a global manner).  On gen9 each context would have
2557          * object-level preemption disabled by default (see
2558          * WaDisable3DMidCmdPreemption in gen9_ctx_workarounds_init), but
2559          * userspace drivers could opt-in to object-level preemption as they
2560          * saw fit.  For post-gen9 platforms, we continue to utilize Option 2;
2561          * even though it is no longer necessary for ABI compatibility when
2562          * enabling a new platform, it does ensure that userspace will be able
2563          * to implement any workarounds that show up requiring temporary
2564          * adjustments to preemption behavior at runtime.
2565          *
2566          * Notes/Workarounds:
2567          *  - Wa_14015141709:  On DG2 and early steppings of MTL,
2568          *      CS_CHICKEN1[0] does not disable object-level preemption as
2569          *      it is supposed to (nor does CS_DEBUG_MODE1[0] if we had been
2570          *      using Option 1).  Effectively this means userspace is unable
2571          *      to disable object-level preemption on these platforms/steppings
2572          *      despite the setting here.
2573          *
2574          *  - Wa_16013994831:  May require that userspace program
2575          *      CS_CHICKEN1[10] when certain runtime conditions are true.
2576          *      Userspace requires Option 2 to be in effect for their update of
2577          *      CS_CHICKEN1[10] to be effective.
2578          *
2579          * Other workarounds may appear in the future that will also require
2580          * Option 2 behavior to allow proper userspace implementation.
2581          */
2582         if (GRAPHICS_VER(i915) >= 9)
2583                 wa_masked_en(wal,
2584                              GEN7_FF_SLICE_CS_CHICKEN1,
2585                              GEN9_FFSC_PERCTX_PREEMPT_CTRL);
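
        /*
         * Purely illustrative sketch (not something i915 itself emits): with
         * Option 2 in effect, a userspace driver that needs to adjust its own
         * preemption behaviour could do so from a batch buffer roughly like
         * so, where "value" stands for whatever platform-specific bit
         * encoding (possibly a masked write) the relevant workaround asks
         * for:
         *
         *      batch[n + 0] = MI_LOAD_REGISTER_IMM(1);
         *      batch[n + 1] = 0x2580;  (CS_CHICKEN1)
         *      batch[n + 2] = value;
         */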
2586
2587         if (IS_SKYLAKE(i915) ||
2588             IS_KABYLAKE(i915) ||
2589             IS_COFFEELAKE(i915) ||
2590             IS_COMETLAKE(i915)) {
2591                 /* WaEnableGapsTsvCreditFix:skl,kbl,cfl */
2592                 wa_write_or(wal,
2593                             GEN8_GARBCNTL,
2594                             GEN9_GAPS_TSV_CREDIT_DISABLE);
2595         }
2596
2597         if (IS_BROXTON(i915)) {
2598                 /* WaDisablePooledEuLoadBalancingFix:bxt */
2599                 wa_masked_en(wal,
2600                              FF_SLICE_CS_CHICKEN2,
2601                              GEN9_POOLED_EU_LOAD_BALANCING_FIX_DISABLE);
2602         }
2603
2604         if (GRAPHICS_VER(i915) == 9) {
2605                 /* WaContextSwitchWithConcurrentTLBInvalidate:skl,bxt,kbl,glk,cfl */
2606                 wa_masked_en(wal,
2607                              GEN9_CSFE_CHICKEN1_RCS,
2608                              GEN9_PREEMPT_GPGPU_SYNC_SWITCH_DISABLE);
2609
2610                 /* WaEnableLbsSlaRetryTimerDecrement:skl,bxt,kbl,glk,cfl */
2611                 wa_mcr_write_or(wal,
2612                                 BDW_SCRATCH1,
2613                                 GEN9_LBS_SLA_RETRY_TIMER_DECREMENT_ENABLE);
2614
2615                 /* WaProgramL3SqcReg1DefaultForPerf:bxt,glk */
2616                 if (IS_GEN9_LP(i915))
2617                         wa_mcr_write_clr_set(wal,
2618                                              GEN8_L3SQCREG1,
2619                                              L3_PRIO_CREDITS_MASK,
2620                                              L3_GENERAL_PRIO_CREDITS(62) |
2621                                              L3_HIGH_PRIO_CREDITS(2));
2622
2623                 /* WaOCLCoherentLineFlush:skl,bxt,kbl,cfl */
2624                 wa_mcr_write_or(wal,
2625                                 GEN8_L3SQCREG4,
2626                                 GEN8_LQSC_FLUSH_COHERENT_LINES);
2627
2628                 /* Disable atomics in L3 to prevent unrecoverable hangs */
2629                 wa_write_clr_set(wal, GEN9_SCRATCH_LNCF1,
2630                                  GEN9_LNCF_NONIA_COHERENT_ATOMICS_ENABLE, 0);
2631                 wa_mcr_write_clr_set(wal, GEN8_L3SQCREG4,
2632                                      GEN8_LQSQ_NONIA_COHERENT_ATOMICS_ENABLE, 0);
2633                 wa_mcr_write_clr_set(wal, GEN9_SCRATCH1,
2634                                      EVICTION_PERF_FIX_ENABLE, 0);
2635         }
2636
2637         if (IS_HASWELL(i915)) {
2638                 /* WaSampleCChickenBitEnable:hsw */
2639                 wa_masked_en(wal,
2640                              HSW_HALF_SLICE_CHICKEN3, HSW_SAMPLE_C_PERFORMANCE);
2641
2642                 wa_masked_dis(wal,
2643                               CACHE_MODE_0_GEN7,
2644                               /* enable HiZ Raw Stall Optimization */
2645                               HIZ_RAW_STALL_OPT_DISABLE);
2646         }
2647
2648         if (IS_VALLEYVIEW(i915)) {
2649                 /* WaDisableEarlyCull:vlv */
2650                 wa_masked_en(wal,
2651                              _3D_CHICKEN3,
2652                              _3D_CHICKEN_SF_DISABLE_OBJEND_CULL);
2653
2654                 /*
2655                  * WaVSThreadDispatchOverride:ivb,vlv
2656                  *
2657                  * This actually overrides the dispatch
2658                  * mode for all thread types.
2659                  */
2660                 wa_write_clr_set(wal,
2661                                  GEN7_FF_THREAD_MODE,
2662                                  GEN7_FF_SCHED_MASK,
2663                                  GEN7_FF_TS_SCHED_HW |
2664                                  GEN7_FF_VS_SCHED_HW |
2665                                  GEN7_FF_DS_SCHED_HW);
2666
2667                 /* WaPsdDispatchEnable:vlv */
2668                 /* WaDisablePSDDualDispatchEnable:vlv */
2669                 wa_masked_en(wal,
2670                              GEN7_HALF_SLICE_CHICKEN1,
2671                              GEN7_MAX_PS_THREAD_DEP |
2672                              GEN7_PSD_SINGLE_PORT_DISPATCH_ENABLE);
2673         }
2674
2675         if (IS_IVYBRIDGE(i915)) {
2676                 /* WaDisableEarlyCull:ivb */
2677                 wa_masked_en(wal,
2678                              _3D_CHICKEN3,
2679                              _3D_CHICKEN_SF_DISABLE_OBJEND_CULL);
2680
2681                 if (0) { /* causes HiZ corruption on ivb:gt1 */
2682                         /* enable HiZ Raw Stall Optimization */
2683                         wa_masked_dis(wal,
2684                                       CACHE_MODE_0_GEN7,
2685                                       HIZ_RAW_STALL_OPT_DISABLE);
2686                 }
2687
2688                 /*
2689                  * WaVSThreadDispatchOverride:ivb,vlv
2690                  *
2691                  * This actually overrides the dispatch
2692                  * mode for all thread types.
2693                  */
2694                 wa_write_clr_set(wal,
2695                                  GEN7_FF_THREAD_MODE,
2696                                  GEN7_FF_SCHED_MASK,
2697                                  GEN7_FF_TS_SCHED_HW |
2698                                  GEN7_FF_VS_SCHED_HW |
2699                                  GEN7_FF_DS_SCHED_HW);
2700
2701                 /* WaDisablePSDDualDispatchEnable:ivb */
2702                 if (IS_IVB_GT1(i915))
2703                         wa_masked_en(wal,
2704                                      GEN7_HALF_SLICE_CHICKEN1,
2705                                      GEN7_PSD_SINGLE_PORT_DISPATCH_ENABLE);
2706         }
2707
2708         if (GRAPHICS_VER(i915) == 7) {
2709                 /* WaBCSVCSTlbInvalidationMode:ivb,vlv,hsw */
2710                 wa_masked_en(wal,
2711                              RING_MODE_GEN7(RENDER_RING_BASE),
2712                              GFX_TLB_INVALIDATE_EXPLICIT | GFX_REPLAY_MODE);
2713
2714                 /* WaDisable_RenderCache_OperationalFlush:ivb,vlv,hsw */
2715                 wa_masked_dis(wal, CACHE_MODE_0_GEN7, RC_OP_FLUSH_ENABLE);
2716
2717                 /*
2718                  * WaDisable4x2SubspanOptimization:ivb,hsw
2719                  * BSpec says this must be set, even though the
2720                  * workaround isn't listed for VLV.
2721                  */
2722                 wa_masked_en(wal,
2723                              CACHE_MODE_1,
2724                              PIXEL_SUBSPAN_COLLECT_OPT_DISABLE);
2725
2726                 /*
2727                  * BSpec recommends 8x4 when MSAA is used,
2728                  * however in practice 16x4 seems fastest.
2729                  *
2730                  * Note that PS/WM thread counts depend on the WIZ hashing
2731                  * disable bit, which we don't touch here, but it's good
2732                  * to keep in mind (see 3DSTATE_PS and 3DSTATE_WM).
2733                  */
2734                 wa_masked_field_set(wal,
2735                                     GEN7_GT_MODE,
2736                                     GEN6_WIZ_HASHING_MASK,
2737                                     GEN6_WIZ_HASHING_16x4);
2738         }
2739
2740         if (IS_GRAPHICS_VER(i915, 6, 7))
2741                 /*
2742                  * We need to disable the AsyncFlip performance optimisations in
2743                  * order to use MI_WAIT_FOR_EVENT within the CS. It should
2744                  * already be programmed to '1' on all products.
2745                  *
2746                  * WaDisableAsyncFlipPerfMode:snb,ivb,hsw,vlv
2747                  */
2748                 wa_masked_en(wal,
2749                              RING_MI_MODE(RENDER_RING_BASE),
2750                              ASYNC_FLIP_PERF_DISABLE);
2751
2752         if (GRAPHICS_VER(i915) == 6) {
2753                 /*
2754                  * Required for the hardware to program scanline values for
2755                  * waiting.
2756                  * WaEnableFlushTlbInvalidationMode:snb
2757                  */
2758                 wa_masked_en(wal,
2759                              GFX_MODE,
2760                              GFX_TLB_INVALIDATE_EXPLICIT);
2761
2762                 /* WaDisableHiZPlanesWhenMSAAEnabled:snb */
2763                 wa_masked_en(wal,
2764                              _3D_CHICKEN,
2765                              _3D_CHICKEN_HIZ_PLANE_DISABLE_MSAA_4X_SNB);
2766
2767                 wa_masked_en(wal,
2768                              _3D_CHICKEN3,
2769                              /* WaStripsFansDisableFastClipPerformanceFix:snb */
2770                              _3D_CHICKEN3_SF_DISABLE_FASTCLIP_CULL |
2771                              /*
2772                               * Bspec says:
2773                               * "This bit must be set if 3DSTATE_CLIP clip mode is set
2774                               * to normal and 3DSTATE_SF number of SF output attributes
2775                               * is more than 16."
2776                               */
2777                              _3D_CHICKEN3_SF_DISABLE_PIPELINED_ATTR_FETCH);
2778
2779                 /*
2780                  * BSpec recommends 8x4 when MSAA is used,
2781                  * however in practice 16x4 seems fastest.
2782                  *
2783                  * Note that PS/WM thread counts depend on the WIZ hashing
2784                  * disable bit, which we don't touch here, but it's good
2785                  * to keep in mind (see 3DSTATE_PS and 3DSTATE_WM).
2786                  */
2787                 wa_masked_field_set(wal,
2788                                     GEN6_GT_MODE,
2789                                     GEN6_WIZ_HASHING_MASK,
2790                                     GEN6_WIZ_HASHING_16x4);
2791
2792                 /* WaDisable_RenderCache_OperationalFlush:snb */
2793                 wa_masked_dis(wal, CACHE_MODE_0, RC_OP_FLUSH_ENABLE);
2794
2795                 /*
2796                  * From the Sandybridge PRM, volume 1 part 3, page 24:
2797                  * "If this bit is set, STCunit will have LRA as replacement
2798                  *  policy. [...] This bit must be reset. LRA replacement
2799                  *  policy is not supported."
2800                  */
2801                 wa_masked_dis(wal,
2802                               CACHE_MODE_0,
2803                               CM0_STC_EVICT_DISABLE_LRA_SNB);
2804         }
2805
2806         if (IS_GRAPHICS_VER(i915, 4, 6))
2807                 /* WaTimedSingleVertexDispatch:cl,bw,ctg,elk,ilk,snb */
2808                 wa_add(wal, RING_MI_MODE(RENDER_RING_BASE),
2809                        0, _MASKED_BIT_ENABLE(VS_TIMER_DISPATCH),
2810                        /* XXX bit doesn't stick on Broadwater */
2811                        IS_I965G(i915) ? 0 : VS_TIMER_DISPATCH, true);
2812
2813         if (GRAPHICS_VER(i915) == 4)
2814                 /*
2815                  * Disable CONSTANT_BUFFER before it is loaded from the context
2816                  * image. As soon as it is loaded, it is executed and the stored
2817                  * address may no longer be valid, leading to a GPU hang.
2818                  *
2819                  * This imposes the requirement that userspace reload their
2820                  * CONSTANT_BUFFER on every batch, fortunately a requirement
2821                  * they are already accustomed to from before contexts were
2822                  * enabled.
2823                  */
2824                 wa_add(wal, ECOSKPD(RENDER_RING_BASE),
2825                        0, _MASKED_BIT_ENABLE(ECO_CONSTANT_BUFFER_SR_DISABLE),
2826                        0 /* XXX bit doesn't stick on Broadwater */,
2827                        true);
2828 }
2829
2830 static void
2831 xcs_engine_wa_init(struct intel_engine_cs *engine, struct i915_wa_list *wal)
2832 {
2833         struct drm_i915_private *i915 = engine->i915;
2834
2835         /* WaKBLVECSSemaphoreWaitPoll:kbl */
2836         if (IS_KBL_GRAPHICS_STEP(i915, STEP_A0, STEP_F0)) {
2837                 wa_write(wal,
2838                          RING_SEMA_WAIT_POLL(engine->mmio_base),
2839                          1);
2840         }
2841 }
2842
2843 static void
2844 ccs_engine_wa_init(struct intel_engine_cs *engine, struct i915_wa_list *wal)
2845 {
2846         if (IS_PVC_CT_STEP(engine->i915, STEP_A0, STEP_C0)) {
2847                 /* Wa_14014999345:pvc */
2848                 wa_mcr_masked_en(wal, GEN10_CACHE_MODE_SS, DISABLE_ECC);
2849         }
2850 }
2851
2852 /*
2853  * The bspec performance guide has recommended MMIO tuning settings.  These
2854  * aren't truly "workarounds" but we want to program them with the same
2855  * workaround infrastructure to ensure that they're automatically added to
2856  * the GuC save/restore lists, re-applied at the right times, and checked for
2857  * any conflicting programming requested by real workarounds.
2858  *
2859  * Programming settings should be added here only if their registers are not
2860  * part of an engine's register state context.  If a register is part of a
2861  * context, then any tuning settings should be programmed in an appropriate
2862  * function invoked by __intel_engine_init_ctx_wa().
2863  */
2864 static void
2865 add_render_compute_tuning_settings(struct drm_i915_private *i915,
2866                                    struct i915_wa_list *wal)
2867 {
2868         if (IS_PONTEVECCHIO(i915)) {
2869                 wa_write(wal, XEHPC_L3SCRUB,
2870                          SCRUB_CL_DWNGRADE_SHARED | SCRUB_RATE_4B_PER_CLK);
2871         }
2872
2873         if (IS_DG2(i915)) {
2874                 wa_mcr_write_or(wal, XEHP_L3SCQREG7, BLEND_FILL_CACHING_OPT_DIS);
2875                 wa_mcr_write_clr_set(wal, RT_CTRL, STACKID_CTRL, STACKID_CTRL_512);
2876
2877                 /*
2878                  * This is also listed as Wa_22012654132 for certain DG2
2879                  * steppings, but the tuning setting programming is a superset
2880                  * since it applies to all DG2 variants and steppings.
2881                  *
2882                  * Note that register 0xE420 is write-only and cannot be read
2883                  * back for verification on DG2 (due to Wa_14012342262), so
2884                  * we need to explicitly skip the readback.
2885                  */
2886                 wa_mcr_add(wal, GEN10_CACHE_MODE_SS, 0,
2887                            _MASKED_BIT_ENABLE(ENABLE_PREFETCH_INTO_IC),
2888                            0 /* write-only, so skip validation */,
2889                            true);
2890         }
2891
2892         /*
2893          * This tuning setting proves beneficial only on ATS-M designs; the
2894          * default "age based" setting is optimal on regular DG2 and other
2895          * platforms.
2896          */
2897         if (INTEL_INFO(i915)->tuning_thread_rr_after_dep)
2898                 wa_mcr_masked_field_set(wal, GEN9_ROW_CHICKEN4, THREAD_EX_ARB_MODE,
2899                                         THREAD_EX_ARB_MODE_RR_AFTER_DEP);
2900 }
2901
2902 /*
2903  * The workarounds in this function apply to shared registers in
2904  * the general render reset domain that aren't tied to a
2905  * specific engine.  Since all render+compute engines get reset
2906  * together, and the contents of these registers are lost during
2907  * the shared render domain reset, we'll define such workarounds
2908  * here and then add them to just a single RCS or CCS engine's
2909  * workaround list (whichever engine has the XXXX flag).
2910  */
2911 static void
2912 general_render_compute_wa_init(struct intel_engine_cs *engine, struct i915_wa_list *wal)
2913 {
2914         struct drm_i915_private *i915 = engine->i915;
2915
2916         add_render_compute_tuning_settings(i915, wal);
2917
2918         if (IS_PONTEVECCHIO(i915)) {
2919                 /* Wa_16016694945 */
2920                 wa_masked_en(wal, XEHPC_LNCFMISCCFGREG0, XEHPC_OVRLSCCC);
2921         }
2922
2923         if (IS_XEHPSDV(i915)) {
2924                 /* Wa_1409954639 */
2925                 wa_mcr_masked_en(wal,
2926                                  GEN8_ROW_CHICKEN,
2927                                  SYSTOLIC_DOP_CLOCK_GATING_DIS);
2928
2929                 /* Wa_1607196519 */
2930                 wa_mcr_masked_en(wal,
2931                                  GEN9_ROW_CHICKEN4,
2932                                  GEN12_DISABLE_GRF_CLEAR);
2933
2934                 /* Wa_14010670810:xehpsdv */
2935                 wa_mcr_write_or(wal, XEHP_L3NODEARBCFG, XEHP_LNESPARE);
2936
2937                 /* Wa_14010449647:xehpsdv */
2938                 wa_mcr_masked_en(wal, GEN8_HALF_SLICE_CHICKEN1,
2939                                  GEN7_PSD_SINGLE_PORT_DISPATCH_ENABLE);
2940
2941                 /* Wa_18011725039:xehpsdv */
2942                 if (IS_XEHPSDV_GRAPHICS_STEP(i915, STEP_A1, STEP_B0)) {
2943                         wa_mcr_masked_dis(wal, MLTICTXCTL, TDONRENDER);
2944                         wa_mcr_write_or(wal, L3SQCREG1_CCS0, FLUSHALLNONCOH);
2945                 }
2946
2947                 /* Wa_14012362059:xehpsdv */
2948                 wa_mcr_write_or(wal, XEHP_MERT_MOD_CTRL, FORCE_MISS_FTLB);
2949
2950                 /* Wa_14014368820:xehpsdv */
2951                 wa_write_or(wal, GEN12_GAMCNTRL_CTRL, INVALIDATION_BROADCAST_MODE_DIS |
2952                                 GLOBAL_INVALIDATION_MODE);
2953         }
2954
2955         if (IS_DG2(i915) || IS_PONTEVECCHIO(i915)) {
2956                 /* Wa_14015227452:dg2,pvc */
2957                 wa_mcr_masked_en(wal, GEN9_ROW_CHICKEN4, XEHP_DIS_BBL_SYSPIPE);
2958
2959                 /* Wa_22014226127:dg2,pvc */
2960                 wa_mcr_write_or(wal, LSC_CHICKEN_BIT_0, DISABLE_D8_D16_COASLESCE);
2961
2962                 /* Wa_16015675438:dg2,pvc */
2963                 wa_masked_en(wal, FF_SLICE_CS_CHICKEN2, GEN12_PERF_FIX_BALANCING_CFE_DISABLE);
2964
2965                 /* Wa_18018781329:dg2,pvc */
2966                 wa_mcr_write_or(wal, RENDER_MOD_CTRL, FORCE_MISS_FTLB);
2967                 wa_mcr_write_or(wal, COMP_MOD_CTRL, FORCE_MISS_FTLB);
2968                 wa_mcr_write_or(wal, VDBX_MOD_CTRL, FORCE_MISS_FTLB);
2969                 wa_mcr_write_or(wal, VEBX_MOD_CTRL, FORCE_MISS_FTLB);
2970         }
2971
2972         if (IS_DG2(i915)) {
2973                 /*
2974                  * Wa_16011620976:dg2_g11
2975                  * Wa_22015475538:dg2
2976                  */
2977                 wa_mcr_write_or(wal, LSC_CHICKEN_BIT_0_UDW, DIS_CHAIN_2XSIMD8);
2978
2979                 /* Wa_18017747507:dg2 */
2980                 wa_masked_en(wal, VFG_PREEMPTION_CHICKEN, POLYGON_TRIFAN_LINELOOP_DISABLE);
2981         }
2982 }
2983
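/*
 * Dispatcher for the per-engine workaround lists: fake workarounds are added
 * first, then the shared render/compute domain workarounds (only on the engine
 * flagged I915_ENGINE_FIRST_RENDER_COMPUTE), and finally the class-specific
 * RCS/CCS/XCS workarounds.
 */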
2984 static void
2985 engine_init_workarounds(struct intel_engine_cs *engine, struct i915_wa_list *wal)
2986 {
2987         if (I915_SELFTEST_ONLY(GRAPHICS_VER(engine->i915) < 4))
2988                 return;
2989
2990         engine_fake_wa_init(engine, wal);
2991
2992         /*
2993                  * These are common workarounds that just need to be applied
2994          * to a single RCS/CCS engine's workaround list since
2995          * they're reset as part of the general render domain reset.
2996          */
2997         if (engine->flags & I915_ENGINE_FIRST_RENDER_COMPUTE)
2998                 general_render_compute_wa_init(engine, wal);
2999
3000         if (engine->class == COMPUTE_CLASS)
3001                 ccs_engine_wa_init(engine, wal);
3002         else if (engine->class == RENDER_CLASS)
3003                 rcs_engine_wa_init(engine, wal);
3004         else
3005                 xcs_engine_wa_init(engine, wal);
3006 }
3007
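/*
 * Build the engine's workaround list; nothing is defined for graphics
 * version < 4, so those platforms return early with an empty list.
 */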
3008 void intel_engine_init_workarounds(struct intel_engine_cs *engine)
3009 {
3010         struct i915_wa_list *wal = &engine->wa_list;
3011
3012         if (GRAPHICS_VER(engine->i915) < 4)
3013                 return;
3014
3015         wa_init_start(wal, "engine", engine->name);
3016         engine_init_workarounds(engine, wal);
3017         wa_init_finish(wal);
3018 }
3019
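/* Write the engine's workaround list to the hardware via MMIO. */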
3020 void intel_engine_apply_workarounds(struct intel_engine_cs *engine)
3021 {
3022         wa_list_apply(engine->gt, &engine->wa_list);
3023 }
3024
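/*
 * Per-platform tables of register ranges that sit behind the MCR steering
 * control.  mcr_range() below uses these to decide which workaround entries
 * cannot be read back for verification from the command streamer.
 */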
3025 static const struct i915_range mcr_ranges_gen8[] = {
3026         { .start = 0x5500, .end = 0x55ff },
3027         { .start = 0x7000, .end = 0x7fff },
3028         { .start = 0x9400, .end = 0x97ff },
3029         { .start = 0xb000, .end = 0xb3ff },
3030         { .start = 0xe000, .end = 0xe7ff },
3031         {},
3032 };
3033
3034 static const struct i915_range mcr_ranges_gen12[] = {
3035         { .start =  0x8150, .end =  0x815f },
3036         { .start =  0x9520, .end =  0x955f },
3037         { .start =  0xb100, .end =  0xb3ff },
3038         { .start =  0xde80, .end =  0xe8ff },
3039         { .start = 0x24a00, .end = 0x24a7f },
3040         {},
3041 };
3042
3043 static const struct i915_range mcr_ranges_xehp[] = {
3044         { .start =  0x4000, .end =  0x4aff },
3045         { .start =  0x5200, .end =  0x52ff },
3046         { .start =  0x5400, .end =  0x7fff },
3047         { .start =  0x8140, .end =  0x815f },
3048         { .start =  0x8c80, .end =  0x8dff },
3049         { .start =  0x94d0, .end =  0x955f },
3050         { .start =  0x9680, .end =  0x96ff },
3051         { .start =  0xb000, .end =  0xb3ff },
3052         { .start =  0xc800, .end =  0xcfff },
3053         { .start =  0xd800, .end =  0xd8ff },
3054         { .start =  0xdc00, .end =  0xffff },
3055         { .start = 0x17000, .end = 0x17fff },
3056         { .start = 0x24a00, .end = 0x24a7f },
3057         {},
3058 };
3059
3060 static bool mcr_range(struct drm_i915_private *i915, u32 offset)
3061 {
3062         const struct i915_range *mcr_ranges;
3063         int i;
3064
3065         if (GRAPHICS_VER_FULL(i915) >= IP_VER(12, 50))
3066                 mcr_ranges = mcr_ranges_xehp;
3067         else if (GRAPHICS_VER(i915) >= 12)
3068                 mcr_ranges = mcr_ranges_gen12;
3069         else if (GRAPHICS_VER(i915) >= 8)
3070                 mcr_ranges = mcr_ranges_gen8;
3071         else
3072                 return false;
3073
3074         /*
3075          * Registers in these ranges are affected by the MCR selector
3076          * which only controls CPU initiated MMIO. Routing does not
3077          * work for CS access so we cannot verify them on this path.
3078          */
3079         for (i = 0; mcr_ranges[i].start; i++)
3080                 if (offset >= mcr_ranges[i].start &&
3081                     offset <= mcr_ranges[i].end)
3082                         return true;
3083
3084         return false;
3085 }
3086
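/*
 * Emit one MI_STORE_REGISTER_MEM per readable workaround register so the
 * command streamer dumps the current register values into the scratch vma,
 * indexed by each entry's position in the list.  MCR-steered registers are
 * skipped since they cannot be sampled reliably from the CS (see mcr_range()).
 */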
3087 static int
3088 wa_list_srm(struct i915_request *rq,
3089             const struct i915_wa_list *wal,
3090             struct i915_vma *vma)
3091 {
3092         struct drm_i915_private *i915 = rq->engine->i915;
3093         unsigned int i, count = 0;
3094         const struct i915_wa *wa;
3095         u32 srm, *cs;
3096
3097         srm = MI_STORE_REGISTER_MEM | MI_SRM_LRM_GLOBAL_GTT;
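        /* Gen8+ SRM takes a 64-bit address, i.e. one extra dword of payload. */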
3098         if (GRAPHICS_VER(i915) >= 8)
3099                 srm++;
3100
3101         for (i = 0, wa = wal->list; i < wal->count; i++, wa++) {
3102                 if (!mcr_range(i915, i915_mmio_reg_offset(wa->reg)))
3103                         count++;
3104         }
3105
3106         cs = intel_ring_begin(rq, 4 * count);
3107         if (IS_ERR(cs))
3108                 return PTR_ERR(cs);
3109
3110         for (i = 0, wa = wal->list; i < wal->count; i++, wa++) {
3111                 u32 offset = i915_mmio_reg_offset(wa->reg);
3112
3113                 if (mcr_range(i915, offset))
3114                         continue;
3115
3116                 *cs++ = srm;
3117                 *cs++ = offset;
3118                 *cs++ = i915_ggtt_offset(vma) + sizeof(u32) * i;
3119                 *cs++ = 0;
3120         }
3121         intel_ring_advance(rq, cs);
3122
3123         return 0;
3124 }
3125
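/*
 * Read back every readable workaround register from the GPU's point of view
 * (via a request on the given context) and check the captured values against
 * the expected masks.  Returns -ENXIO if any entry has been lost or clobbered.
 */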
3126 static int engine_wa_list_verify(struct intel_context *ce,
3127                                  const struct i915_wa_list * const wal,
3128                                  const char *from)
3129 {
3130         const struct i915_wa *wa;
3131         struct i915_request *rq;
3132         struct i915_vma *vma;
3133         struct i915_gem_ww_ctx ww;
3134         unsigned int i;
3135         u32 *results;
3136         int err;
3137
3138         if (!wal->count)
3139                 return 0;
3140
3141         vma = __vm_create_scratch_for_read(&ce->engine->gt->ggtt->vm,
3142                                            wal->count * sizeof(u32));
3143         if (IS_ERR(vma))
3144                 return PTR_ERR(vma);
3145
3146         intel_engine_pm_get(ce->engine);
3147         i915_gem_ww_ctx_init(&ww, false);
3148 retry:
3149         err = i915_gem_object_lock(vma->obj, &ww);
3150         if (err == 0)
3151                 err = intel_context_pin_ww(ce, &ww);
3152         if (err)
3153                 goto err_pm;
3154
3155         err = i915_vma_pin_ww(vma, &ww, 0, 0,
3156                            i915_vma_is_ggtt(vma) ? PIN_GLOBAL : PIN_USER);
3157         if (err)
3158                 goto err_unpin;
3159
3160         rq = i915_request_create(ce);
3161         if (IS_ERR(rq)) {
3162                 err = PTR_ERR(rq);
3163                 goto err_vma;
3164         }
3165
3166         err = i915_request_await_object(rq, vma->obj, true);
3167         if (err == 0)
3168                 err = i915_vma_move_to_active(vma, rq, EXEC_OBJECT_WRITE);
3169         if (err == 0)
3170                 err = wa_list_srm(rq, wal, vma);
3171
3172         i915_request_get(rq);
3173         if (err)
3174                 i915_request_set_error_once(rq, err);
3175         i915_request_add(rq);
3176
3177         if (err)
3178                 goto err_rq;
3179
3180         if (i915_request_wait(rq, 0, HZ / 5) < 0) {
3181                 err = -ETIME;
3182                 goto err_rq;
3183         }
3184
3185         results = i915_gem_object_pin_map(vma->obj, I915_MAP_WB);
3186         if (IS_ERR(results)) {
3187                 err = PTR_ERR(results);
3188                 goto err_rq;
3189         }
3190
3191         err = 0;
3192         for (i = 0, wa = wal->list; i < wal->count; i++, wa++) {
3193                 if (mcr_range(rq->engine->i915, i915_mmio_reg_offset(wa->reg)))
3194                         continue;
3195
3196                 if (!wa_verify(wa, results[i], wal->name, from))
3197                         err = -ENXIO;
3198         }
3199
3200         i915_gem_object_unpin_map(vma->obj);
3201
3202 err_rq:
3203         i915_request_put(rq);
3204 err_vma:
3205         i915_vma_unpin(vma);
3206 err_unpin:
3207         intel_context_unpin(ce);
3208 err_pm:
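        /* -EDEADLK from the ww locking means back off and retry the whole sequence. */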
3209         if (err == -EDEADLK) {
3210                 err = i915_gem_ww_ctx_backoff(&ww);
3211                 if (!err)
3212                         goto retry;
3213         }
3214         i915_gem_ww_ctx_fini(&ww);
3215         intel_engine_pm_put(ce->engine);
3216         i915_vma_put(vma);
3217         return err;
3218 }
3219
3220 int intel_engine_verify_workarounds(struct intel_engine_cs *engine,
3221                                     const char *from)
3222 {
3223         return engine_wa_list_verify(engine->kernel_context,
3224                                      &engine->wa_list,
3225                                      from);
3226 }
3227
3228 #if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
3229 #include "selftest_workarounds.c"
3230 #endif