[linux-2.6-block.git] drivers/gpu/drm/i915/gt/intel_reset.c
1 /*
2  * SPDX-License-Identifier: MIT
3  *
4  * Copyright © 2008-2018 Intel Corporation
5  */
6
7 #include <linux/sched/mm.h>
8 #include <linux/stop_machine.h>
9
10 #include "display/intel_overlay.h"
11
12 #include "gem/i915_gem_context.h"
13
14 #include "i915_drv.h"
15 #include "i915_gpu_error.h"
16 #include "i915_irq.h"
17 #include "intel_engine_pm.h"
18 #include "intel_gt.h"
19 #include "intel_gt_pm.h"
20 #include "intel_reset.h"
21
22 #include "uc/intel_guc.h"
23
24 #define RESET_MAX_RETRIES 3
25
26 /* XXX How to handle concurrent GGTT updates using tiling registers? */
27 #define RESET_UNDER_STOP_MACHINE 0
28
29 static void rmw_set_fw(struct intel_uncore *uncore, i915_reg_t reg, u32 set)
30 {
31         intel_uncore_rmw_fw(uncore, reg, 0, set);
32 }
33
34 static void rmw_clear_fw(struct intel_uncore *uncore, i915_reg_t reg, u32 clr)
35 {
36         intel_uncore_rmw_fw(uncore, reg, clr, 0);
37 }
38
39 static void engine_skip_context(struct i915_request *rq)
40 {
41         struct intel_engine_cs *engine = rq->engine;
42         struct i915_gem_context *hung_ctx = rq->gem_context;
43
44         lockdep_assert_held(&engine->active.lock);
45
46         if (!i915_request_is_active(rq))
47                 return;
48
49         list_for_each_entry_continue(rq, &engine->active.requests, sched.link)
50                 if (rq->gem_context == hung_ctx)
51                         i915_request_skip(rq, -EIO);
52 }
53
54 static void client_mark_guilty(struct drm_i915_file_private *file_priv,
55                                const struct i915_gem_context *ctx)
56 {
57         unsigned int score;
58         unsigned long prev_hang;
59
60         if (i915_gem_context_is_banned(ctx))
61                 score = I915_CLIENT_SCORE_CONTEXT_BAN;
62         else
63                 score = 0;
64
65         prev_hang = xchg(&file_priv->hang_timestamp, jiffies);
66         if (time_before(jiffies, prev_hang + I915_CLIENT_FAST_HANG_JIFFIES))
67                 score += I915_CLIENT_SCORE_HANG_FAST;
68
69         if (score) {
70                 atomic_add(score, &file_priv->ban_score);
71
72                 DRM_DEBUG_DRIVER("client %s: gained %u ban score, now %u\n",
73                                  ctx->name, score,
74                                  atomic_read(&file_priv->ban_score));
75         }
76 }
77
78 static bool context_mark_guilty(struct i915_gem_context *ctx)
79 {
80         unsigned long prev_hang;
81         bool banned;
82         int i;
83
84         atomic_inc(&ctx->guilty_count);
85
86         /* Cool contexts are too cool to be banned! (Used for reset testing.) */
87         if (!i915_gem_context_is_bannable(ctx))
88                 return false;
89
90         /* Record the timestamp for the last N hangs */
91         prev_hang = ctx->hang_timestamp[0];
92         for (i = 0; i < ARRAY_SIZE(ctx->hang_timestamp) - 1; i++)
93                 ctx->hang_timestamp[i] = ctx->hang_timestamp[i + 1];
94         ctx->hang_timestamp[i] = jiffies;
95
96         /* If we have hung N+1 times in rapid succession, we ban the context! */
97         banned = !i915_gem_context_is_recoverable(ctx);
98         if (time_before(jiffies, prev_hang + CONTEXT_FAST_HANG_JIFFIES))
99                 banned = true;
100         if (banned) {
101                 DRM_DEBUG_DRIVER("context %s: guilty %d, banned\n",
102                                  ctx->name, atomic_read(&ctx->guilty_count));
103                 i915_gem_context_set_banned(ctx);
104         }
105
106         if (!IS_ERR_OR_NULL(ctx->file_priv))
107                 client_mark_guilty(ctx->file_priv, ctx);
108
109         return banned;
110 }
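
/*
 * For illustration only: hang_timestamp[] acts as a small shift register of
 * the most recent hang times, so with two recorded timestamps (the array
 * size assumed here just for the example) prev_hang is the hang from two
 * hangs ago, and a bannable context is banned once three hangs land inside
 * CONTEXT_FAST_HANG_JIFFIES of each other:
 *
 *	t = 0s   hang #1
 *	t = 5s   hang #2
 *	t = 9s   hang #3 -> prev_hang = 0s, still inside the fast-hang
 *	                    window (assuming the window exceeds 9s), banned
 */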
111
112 static void context_mark_innocent(struct i915_gem_context *ctx)
113 {
114         atomic_inc(&ctx->active_count);
115 }
116
117 void __i915_request_reset(struct i915_request *rq, bool guilty)
118 {
119         GEM_TRACE("%s rq=%llx:%lld, guilty? %s\n",
120                   rq->engine->name,
121                   rq->fence.context,
122                   rq->fence.seqno,
123                   yesno(guilty));
124
125         lockdep_assert_held(&rq->engine->active.lock);
126         GEM_BUG_ON(i915_request_completed(rq));
127
128         if (guilty) {
129                 i915_request_skip(rq, -EIO);
130                 if (context_mark_guilty(rq->gem_context))
131                         engine_skip_context(rq);
132         } else {
133                 dma_fence_set_error(&rq->fence, -EAGAIN);
134                 context_mark_innocent(rq->gem_context);
135         }
136 }
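
/*
 * A compact summary of the two branches above, purely for orientation:
 *
 *	__i915_request_reset(rq, true);		guilty: rq is skipped and
 *						completes with -EIO
 *	__i915_request_reset(rq, false);	innocent: only -EAGAIN is
 *						recorded on the fence
 */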
137
138 static bool i915_in_reset(struct pci_dev *pdev)
139 {
140         u8 gdrst;
141
142         pci_read_config_byte(pdev, I915_GDRST, &gdrst);
143         return gdrst & GRDOM_RESET_STATUS;
144 }
145
146 static int i915_do_reset(struct intel_gt *gt,
147                          intel_engine_mask_t engine_mask,
148                          unsigned int retry)
149 {
150         struct pci_dev *pdev = gt->i915->drm.pdev;
151         int err;
152
153         /* Assert reset for at least 20 usec, and wait for acknowledgement. */
154         pci_write_config_byte(pdev, I915_GDRST, GRDOM_RESET_ENABLE);
155         udelay(50);
156         err = wait_for_atomic(i915_in_reset(pdev), 50);
157
158         /* Clear the reset request. */
159         pci_write_config_byte(pdev, I915_GDRST, 0);
160         udelay(50);
161         if (!err)
162                 err = wait_for_atomic(!i915_in_reset(pdev), 50);
163
164         return err;
165 }
166
167 static bool g4x_reset_complete(struct pci_dev *pdev)
168 {
169         u8 gdrst;
170
171         pci_read_config_byte(pdev, I915_GDRST, &gdrst);
172         return (gdrst & GRDOM_RESET_ENABLE) == 0;
173 }
174
175 static int g33_do_reset(struct intel_gt *gt,
176                         intel_engine_mask_t engine_mask,
177                         unsigned int retry)
178 {
179         struct pci_dev *pdev = gt->i915->drm.pdev;
180
181         pci_write_config_byte(pdev, I915_GDRST, GRDOM_RESET_ENABLE);
182         return wait_for_atomic(g4x_reset_complete(pdev), 50);
183 }
184
185 static int g4x_do_reset(struct intel_gt *gt,
186                         intel_engine_mask_t engine_mask,
187                         unsigned int retry)
188 {
189         struct pci_dev *pdev = gt->i915->drm.pdev;
190         struct intel_uncore *uncore = gt->uncore;
191         int ret;
192
193         /* WaVcpClkGateDisableForMediaReset:ctg,elk */
194         rmw_set_fw(uncore, VDECCLK_GATE_D, VCP_UNIT_CLOCK_GATE_DISABLE);
195         intel_uncore_posting_read_fw(uncore, VDECCLK_GATE_D);
196
197         pci_write_config_byte(pdev, I915_GDRST,
198                               GRDOM_MEDIA | GRDOM_RESET_ENABLE);
199         ret = wait_for_atomic(g4x_reset_complete(pdev), 50);
200         if (ret) {
201                 DRM_DEBUG_DRIVER("Wait for media reset failed\n");
202                 goto out;
203         }
204
205         pci_write_config_byte(pdev, I915_GDRST,
206                               GRDOM_RENDER | GRDOM_RESET_ENABLE);
207         ret = wait_for_atomic(g4x_reset_complete(pdev), 50);
208         if (ret) {
209                 DRM_DEBUG_DRIVER("Wait for render reset failed\n");
210                 goto out;
211         }
212
213 out:
214         pci_write_config_byte(pdev, I915_GDRST, 0);
215
216         rmw_clear_fw(uncore, VDECCLK_GATE_D, VCP_UNIT_CLOCK_GATE_DISABLE);
217         intel_uncore_posting_read_fw(uncore, VDECCLK_GATE_D);
218
219         return ret;
220 }
221
222 static int ironlake_do_reset(struct intel_gt *gt,
223                              intel_engine_mask_t engine_mask,
224                              unsigned int retry)
225 {
226         struct intel_uncore *uncore = gt->uncore;
227         int ret;
228
229         intel_uncore_write_fw(uncore, ILK_GDSR,
230                               ILK_GRDOM_RENDER | ILK_GRDOM_RESET_ENABLE);
231         ret = __intel_wait_for_register_fw(uncore, ILK_GDSR,
232                                            ILK_GRDOM_RESET_ENABLE, 0,
233                                            5000, 0,
234                                            NULL);
235         if (ret) {
236                 DRM_DEBUG_DRIVER("Wait for render reset failed\n");
237                 goto out;
238         }
239
240         intel_uncore_write_fw(uncore, ILK_GDSR,
241                               ILK_GRDOM_MEDIA | ILK_GRDOM_RESET_ENABLE);
242         ret = __intel_wait_for_register_fw(uncore, ILK_GDSR,
243                                            ILK_GRDOM_RESET_ENABLE, 0,
244                                            5000, 0,
245                                            NULL);
246         if (ret) {
247                 DRM_DEBUG_DRIVER("Wait for media reset failed\n");
248                 goto out;
249         }
250
251 out:
252         intel_uncore_write_fw(uncore, ILK_GDSR, 0);
253         intel_uncore_posting_read_fw(uncore, ILK_GDSR);
254         return ret;
255 }
256
257 /* Reset the hardware domains (GENX_GRDOM_*) specified by mask */
258 static int gen6_hw_domain_reset(struct intel_gt *gt, u32 hw_domain_mask)
259 {
260         struct intel_uncore *uncore = gt->uncore;
261         int err;
262
263         /*
264          * GEN6_GDRST is not in the gt power well, no need to check
265          * for fifo space for the write or forcewake the chip for
266          * the read
267          */
268         intel_uncore_write_fw(uncore, GEN6_GDRST, hw_domain_mask);
269
270         /* Wait for the device to ack the reset requests */
271         err = __intel_wait_for_register_fw(uncore,
272                                            GEN6_GDRST, hw_domain_mask, 0,
273                                            500, 0,
274                                            NULL);
275         if (err)
276                 DRM_DEBUG_DRIVER("Wait for 0x%08x engines reset failed\n",
277                                  hw_domain_mask);
278
279         return err;
280 }
281
282 static int gen6_reset_engines(struct intel_gt *gt,
283                               intel_engine_mask_t engine_mask,
284                               unsigned int retry)
285 {
286         struct intel_engine_cs *engine;
287         const u32 hw_engine_mask[] = {
288                 [RCS0]  = GEN6_GRDOM_RENDER,
289                 [BCS0]  = GEN6_GRDOM_BLT,
290                 [VCS0]  = GEN6_GRDOM_MEDIA,
291                 [VCS1]  = GEN8_GRDOM_MEDIA2,
292                 [VECS0] = GEN6_GRDOM_VECS,
293         };
294         u32 hw_mask;
295
296         if (engine_mask == ALL_ENGINES) {
297                 hw_mask = GEN6_GRDOM_FULL;
298         } else {
299                 intel_engine_mask_t tmp;
300
301                 hw_mask = 0;
302                 for_each_engine_masked(engine, gt->i915, engine_mask, tmp) {
303                         GEM_BUG_ON(engine->id >= ARRAY_SIZE(hw_engine_mask));
304                         hw_mask |= hw_engine_mask[engine->id];
305                 }
306         }
307
308         return gen6_hw_domain_reset(gt, hw_mask);
309 }
310
311 static u32 gen11_lock_sfc(struct intel_engine_cs *engine)
312 {
313         struct intel_uncore *uncore = engine->uncore;
314         u8 vdbox_sfc_access = RUNTIME_INFO(engine->i915)->vdbox_sfc_access;
315         i915_reg_t sfc_forced_lock, sfc_forced_lock_ack;
316         u32 sfc_forced_lock_bit, sfc_forced_lock_ack_bit;
317         i915_reg_t sfc_usage;
318         u32 sfc_usage_bit;
319         u32 sfc_reset_bit;
320
321         switch (engine->class) {
322         case VIDEO_DECODE_CLASS:
323                 if ((BIT(engine->instance) & vdbox_sfc_access) == 0)
324                         return 0;
325
326                 sfc_forced_lock = GEN11_VCS_SFC_FORCED_LOCK(engine);
327                 sfc_forced_lock_bit = GEN11_VCS_SFC_FORCED_LOCK_BIT;
328
329                 sfc_forced_lock_ack = GEN11_VCS_SFC_LOCK_STATUS(engine);
330                 sfc_forced_lock_ack_bit = GEN11_VCS_SFC_LOCK_ACK_BIT;
331
332                 sfc_usage = GEN11_VCS_SFC_LOCK_STATUS(engine);
333                 sfc_usage_bit = GEN11_VCS_SFC_USAGE_BIT;
334                 sfc_reset_bit = GEN11_VCS_SFC_RESET_BIT(engine->instance);
335                 break;
336
337         case VIDEO_ENHANCEMENT_CLASS:
338                 sfc_forced_lock = GEN11_VECS_SFC_FORCED_LOCK(engine);
339                 sfc_forced_lock_bit = GEN11_VECS_SFC_FORCED_LOCK_BIT;
340
341                 sfc_forced_lock_ack = GEN11_VECS_SFC_LOCK_ACK(engine);
342                 sfc_forced_lock_ack_bit = GEN11_VECS_SFC_LOCK_ACK_BIT;
343
344                 sfc_usage = GEN11_VECS_SFC_USAGE(engine);
345                 sfc_usage_bit = GEN11_VECS_SFC_USAGE_BIT;
346                 sfc_reset_bit = GEN11_VECS_SFC_RESET_BIT(engine->instance);
347                 break;
348
349         default:
350                 return 0;
351         }
352
353         /*
354          * Tell the engine that a software reset is going to happen. The engine
355          * will then try to force lock the SFC (if currently locked, it will
356          * remain so until we tell the engine it is safe to unlock; if currently
357          * unlocked, it will ignore this and all new lock requests). If SFC
358          * ends up being locked to the engine we want to reset, we have to reset
359          * it as well (we will unlock it once the reset sequence is completed).
360          */
361         rmw_set_fw(uncore, sfc_forced_lock, sfc_forced_lock_bit);
362
363         if (__intel_wait_for_register_fw(uncore,
364                                          sfc_forced_lock_ack,
365                                          sfc_forced_lock_ack_bit,
366                                          sfc_forced_lock_ack_bit,
367                                          1000, 0, NULL)) {
368                 DRM_DEBUG_DRIVER("Wait for SFC forced lock ack failed\n");
369                 return 0;
370         }
371
372         if (intel_uncore_read_fw(uncore, sfc_usage) & sfc_usage_bit)
373                 return sfc_reset_bit;
374
375         return 0;
376 }
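
/*
 * A minimal sketch of how the SFC lock is intended to be used (this mirrors
 * gen11_reset_engines() below): the returned bit, if any, is OR'ed into the
 * GDRST domain mask so the shared SFC is reset along with the engine, and
 * the forced lock is always dropped again once the reset has been issued.
 *
 *	hw_mask = hw_engine_mask[engine->id];
 *	hw_mask |= gen11_lock_sfc(engine);
 *	ret = gen6_hw_domain_reset(gt, hw_mask);
 *	gen11_unlock_sfc(engine);
 */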
377
378 static void gen11_unlock_sfc(struct intel_engine_cs *engine)
379 {
380         struct intel_uncore *uncore = engine->uncore;
381         u8 vdbox_sfc_access = RUNTIME_INFO(engine->i915)->vdbox_sfc_access;
382         i915_reg_t sfc_forced_lock;
383         u32 sfc_forced_lock_bit;
384
385         switch (engine->class) {
386         case VIDEO_DECODE_CLASS:
387                 if ((BIT(engine->instance) & vdbox_sfc_access) == 0)
388                         return;
389
390                 sfc_forced_lock = GEN11_VCS_SFC_FORCED_LOCK(engine);
391                 sfc_forced_lock_bit = GEN11_VCS_SFC_FORCED_LOCK_BIT;
392                 break;
393
394         case VIDEO_ENHANCEMENT_CLASS:
395                 sfc_forced_lock = GEN11_VECS_SFC_FORCED_LOCK(engine);
396                 sfc_forced_lock_bit = GEN11_VECS_SFC_FORCED_LOCK_BIT;
397                 break;
398
399         default:
400                 return;
401         }
402
403         rmw_clear_fw(uncore, sfc_forced_lock, sfc_forced_lock_bit);
404 }
405
406 static int gen11_reset_engines(struct intel_gt *gt,
407                                intel_engine_mask_t engine_mask,
408                                unsigned int retry)
409 {
410         const u32 hw_engine_mask[] = {
411                 [RCS0]  = GEN11_GRDOM_RENDER,
412                 [BCS0]  = GEN11_GRDOM_BLT,
413                 [VCS0]  = GEN11_GRDOM_MEDIA,
414                 [VCS1]  = GEN11_GRDOM_MEDIA2,
415                 [VCS2]  = GEN11_GRDOM_MEDIA3,
416                 [VCS3]  = GEN11_GRDOM_MEDIA4,
417                 [VECS0] = GEN11_GRDOM_VECS,
418                 [VECS1] = GEN11_GRDOM_VECS2,
419         };
420         struct intel_engine_cs *engine;
421         intel_engine_mask_t tmp;
422         u32 hw_mask;
423         int ret;
424
425         if (engine_mask == ALL_ENGINES) {
426                 hw_mask = GEN11_GRDOM_FULL;
427         } else {
428                 hw_mask = 0;
429                 for_each_engine_masked(engine, gt->i915, engine_mask, tmp) {
430                         GEM_BUG_ON(engine->id >= ARRAY_SIZE(hw_engine_mask));
431                         hw_mask |= hw_engine_mask[engine->id];
432                         hw_mask |= gen11_lock_sfc(engine);
433                 }
434         }
435
436         ret = gen6_hw_domain_reset(gt, hw_mask);
437
438         if (engine_mask != ALL_ENGINES)
439                 for_each_engine_masked(engine, gt->i915, engine_mask, tmp)
440                         gen11_unlock_sfc(engine);
441
442         return ret;
443 }
444
445 static int gen8_engine_reset_prepare(struct intel_engine_cs *engine)
446 {
447         struct intel_uncore *uncore = engine->uncore;
448         const i915_reg_t reg = RING_RESET_CTL(engine->mmio_base);
449         u32 request, mask, ack;
450         int ret;
451
452         ack = intel_uncore_read_fw(uncore, reg);
453         if (ack & RESET_CTL_CAT_ERROR) {
454                 /*
455                  * For catastrophic errors, ready-for-reset sequence
456                  * needs to be bypassed: HAS#396813
457                  */
458                 request = RESET_CTL_CAT_ERROR;
459                 mask = RESET_CTL_CAT_ERROR;
460
461                 /* Catastrophic errors need to be cleared by HW */
462                 ack = 0;
463         } else if (!(ack & RESET_CTL_READY_TO_RESET)) {
464                 request = RESET_CTL_REQUEST_RESET;
465                 mask = RESET_CTL_READY_TO_RESET;
466                 ack = RESET_CTL_READY_TO_RESET;
467         } else {
468                 return 0;
469         }
470
471         intel_uncore_write_fw(uncore, reg, _MASKED_BIT_ENABLE(request));
472         ret = __intel_wait_for_register_fw(uncore, reg, mask, ack,
473                                            700, 0, NULL);
474         if (ret)
475                 DRM_ERROR("%s reset request timed out: {request: %08x, RESET_CTL: %08x}\n",
476                           engine->name, request,
477                           intel_uncore_read_fw(uncore, reg));
478
479         return ret;
480 }
481
482 static void gen8_engine_reset_cancel(struct intel_engine_cs *engine)
483 {
484         intel_uncore_write_fw(engine->uncore,
485                               RING_RESET_CTL(engine->mmio_base),
486                               _MASKED_BIT_DISABLE(RESET_CTL_REQUEST_RESET));
487 }
488
489 static int gen8_reset_engines(struct intel_gt *gt,
490                               intel_engine_mask_t engine_mask,
491                               unsigned int retry)
492 {
493         struct intel_engine_cs *engine;
494         const bool reset_non_ready = retry >= 1;
495         intel_engine_mask_t tmp;
496         int ret;
497
498         for_each_engine_masked(engine, gt->i915, engine_mask, tmp) {
499                 ret = gen8_engine_reset_prepare(engine);
500                 if (ret && !reset_non_ready)
501                         goto skip_reset;
502
503                 /*
504                  * If this is not the first failed attempt to prepare,
505                  * we decide to proceed anyway.
506                  *
507                  * By doing so we risk context corruption and with
508                  * some gens (kbl), possible system hang if reset
509                  * happens during active bb execution.
510                  *
511                  * We would rather risk context corruption than a
512                  * failed reset with a wedged driver/gpu. The active
513                  * bb execution case should be covered by the
514                  * stop_engines() we perform before the reset.
515                  */
516         }
517
518         if (INTEL_GEN(gt->i915) >= 11)
519                 ret = gen11_reset_engines(gt, engine_mask, retry);
520         else
521                 ret = gen6_reset_engines(gt, engine_mask, retry);
522
523 skip_reset:
524         for_each_engine_masked(engine, gt->i915, engine_mask, tmp)
525                 gen8_engine_reset_cancel(engine);
526
527         return ret;
528 }
529
530 typedef int (*reset_func)(struct intel_gt *,
531                           intel_engine_mask_t engine_mask,
532                           unsigned int retry);
533
534 static reset_func intel_get_gpu_reset(struct drm_i915_private *i915)
535 {
536         if (INTEL_GEN(i915) >= 8)
537                 return gen8_reset_engines;
538         else if (INTEL_GEN(i915) >= 6)
539                 return gen6_reset_engines;
540         else if (INTEL_GEN(i915) >= 5)
541                 return ironlake_do_reset;
542         else if (IS_G4X(i915))
543                 return g4x_do_reset;
544         else if (IS_G33(i915) || IS_PINEVIEW(i915))
545                 return g33_do_reset;
546         else if (INTEL_GEN(i915) >= 3)
547                 return i915_do_reset;
548         else
549                 return NULL;
550 }
551
552 int __intel_gt_reset(struct intel_gt *gt, intel_engine_mask_t engine_mask)
553 {
554         const int retries = engine_mask == ALL_ENGINES ? RESET_MAX_RETRIES : 1;
555         reset_func reset;
556         int ret = -ETIMEDOUT;
557         int retry;
558
559         reset = intel_get_gpu_reset(gt->i915);
560         if (!reset)
561                 return -ENODEV;
562
563         /*
564          * If the power well sleeps during the reset, the reset
565          * request may be dropped and never completes (causing -EIO).
566          */
567         intel_uncore_forcewake_get(gt->uncore, FORCEWAKE_ALL);
568         for (retry = 0; ret == -ETIMEDOUT && retry < retries; retry++) {
569                 GEM_TRACE("engine_mask=%x\n", engine_mask);
570                 preempt_disable();
571                 ret = reset(gt, engine_mask, retry);
572                 preempt_enable();
573         }
574         intel_uncore_forcewake_put(gt->uncore, FORCEWAKE_ALL);
575
576         return ret;
577 }
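
/*
 * For reference, both call styles used later in this file; a full reset is
 * retried up to RESET_MAX_RETRIES times, a single-engine reset only once:
 *
 *	err = __intel_gt_reset(gt, ALL_ENGINES);		full GPU reset
 *	err = __intel_gt_reset(engine->gt, engine->mask);	one engine
 */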
578
579 bool intel_has_gpu_reset(struct drm_i915_private *i915)
580 {
581         if (!i915_modparams.reset)
582                 return false;
583
584         return intel_get_gpu_reset(i915);
585 }
586
587 bool intel_has_reset_engine(struct drm_i915_private *i915)
588 {
589         return INTEL_INFO(i915)->has_reset_engine && i915_modparams.reset >= 2;
590 }
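
/*
 * Putting the two predicates together, a rough sketch of the fallback order
 * that intel_gt_handle_error() implements further down: per-engine reset
 * requires i915.reset >= 2 plus hardware support, otherwise we escalate to
 * a full GPU reset, which itself requires i915.reset to be non-zero.
 *
 *	if (intel_has_reset_engine(i915))
 *		... try intel_engine_reset() first ...
 *	else if (intel_has_gpu_reset(i915))
 *		... fall back to a global intel_gt_reset() ...
 */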
591
592 int intel_reset_guc(struct intel_gt *gt)
593 {
594         u32 guc_domain =
595                 INTEL_GEN(gt->i915) >= 11 ? GEN11_GRDOM_GUC : GEN9_GRDOM_GUC;
596         int ret;
597
598         GEM_BUG_ON(!HAS_GT_UC(gt->i915));
599
600         intel_uncore_forcewake_get(gt->uncore, FORCEWAKE_ALL);
601         ret = gen6_hw_domain_reset(gt, guc_domain);
602         intel_uncore_forcewake_put(gt->uncore, FORCEWAKE_ALL);
603
604         return ret;
605 }
606
607 /*
608  * Ensure the irq handler finishes and is not run again while we prepare
609  * the engine for reset.
610  */
611 static void reset_prepare_engine(struct intel_engine_cs *engine)
612 {
613         /*
614          * During the reset sequence, we must prevent the engine from
615          * entering RC6. As the context state is undefined until we restart
616          * the engine, if it does enter RC6 during the reset, the state
617          * written to the powercontext is undefined and so we may lose
618          * GPU state upon resume, i.e. fail to restart after a reset.
619          */
620         intel_uncore_forcewake_get(engine->uncore, FORCEWAKE_ALL);
621         engine->reset.prepare(engine);
622 }
623
624 static void revoke_mmaps(struct intel_gt *gt)
625 {
626         int i;
627
628         for (i = 0; i < gt->ggtt->num_fences; i++) {
629                 struct drm_vma_offset_node *node;
630                 struct i915_vma *vma;
631                 u64 vma_offset;
632
633                 vma = READ_ONCE(gt->ggtt->fence_regs[i].vma);
634                 if (!vma)
635                         continue;
636
637                 if (!i915_vma_has_userfault(vma))
638                         continue;
639
640                 GEM_BUG_ON(vma->fence != &gt->ggtt->fence_regs[i]);
641                 node = &vma->obj->base.vma_node;
642                 vma_offset = vma->ggtt_view.partial.offset << PAGE_SHIFT;
643                 unmap_mapping_range(gt->i915->drm.anon_inode->i_mapping,
644                                     drm_vma_node_offset_addr(node) + vma_offset,
645                                     vma->size,
646                                     1);
647         }
648 }
649
650 static intel_engine_mask_t reset_prepare(struct intel_gt *gt)
651 {
652         struct intel_engine_cs *engine;
653         intel_engine_mask_t awake = 0;
654         enum intel_engine_id id;
655
656         for_each_engine(engine, gt->i915, id) {
657                 if (intel_engine_pm_get_if_awake(engine))
658                         awake |= engine->mask;
659                 reset_prepare_engine(engine);
660         }
661
662         intel_uc_reset_prepare(&gt->uc);
663
664         return awake;
665 }
666
667 static void gt_revoke(struct intel_gt *gt)
668 {
669         revoke_mmaps(gt);
670 }
671
672 static int gt_reset(struct intel_gt *gt, intel_engine_mask_t stalled_mask)
673 {
674         struct intel_engine_cs *engine;
675         enum intel_engine_id id;
676         int err;
677
678         /*
679          * Everything depends on having the GTT running, so we need to start
680          * there.
681          */
682         err = i915_ggtt_enable_hw(gt->i915);
683         if (err)
684                 return err;
685
686         for_each_engine(engine, gt->i915, id)
687                 __intel_engine_reset(engine, stalled_mask & engine->mask);
688
689         i915_gem_restore_fences(gt->i915);
690
691         return err;
692 }
693
694 static void reset_finish_engine(struct intel_engine_cs *engine)
695 {
696         engine->reset.finish(engine);
697         intel_uncore_forcewake_put(engine->uncore, FORCEWAKE_ALL);
698
699         intel_engine_signal_breadcrumbs(engine);
700 }
701
702 static void reset_finish(struct intel_gt *gt, intel_engine_mask_t awake)
703 {
704         struct intel_engine_cs *engine;
705         enum intel_engine_id id;
706
707         for_each_engine(engine, gt->i915, id) {
708                 reset_finish_engine(engine);
709                 if (awake & engine->mask)
710                         intel_engine_pm_put(engine);
711         }
712 }
713
714 static void nop_submit_request(struct i915_request *request)
715 {
716         struct intel_engine_cs *engine = request->engine;
717         unsigned long flags;
718
719         GEM_TRACE("%s fence %llx:%lld -> -EIO\n",
720                   engine->name, request->fence.context, request->fence.seqno);
721         dma_fence_set_error(&request->fence, -EIO);
722
723         spin_lock_irqsave(&engine->active.lock, flags);
724         __i915_request_submit(request);
725         i915_request_mark_complete(request);
726         spin_unlock_irqrestore(&engine->active.lock, flags);
727
728         intel_engine_queue_breadcrumbs(engine);
729 }
730
731 static void __intel_gt_set_wedged(struct intel_gt *gt)
732 {
733         struct intel_engine_cs *engine;
734         intel_engine_mask_t awake;
735         enum intel_engine_id id;
736
737         if (test_bit(I915_WEDGED, &gt->reset.flags))
738                 return;
739
740         if (GEM_SHOW_DEBUG() && !intel_engines_are_idle(gt)) {
741                 struct drm_printer p = drm_debug_printer(__func__);
742
743                 for_each_engine(engine, gt->i915, id)
744                         intel_engine_dump(engine, &p, "%s\n", engine->name);
745         }
746
747         GEM_TRACE("start\n");
748
749         /*
750          * First, stop submission to hw, but do not yet complete requests by
751          * rolling the global seqno forward (since this would complete requests
752          * for which we haven't set the fence error to EIO yet).
753          */
754         awake = reset_prepare(gt);
755
756         /* Even if the GPU reset fails, it should still stop the engines */
757         if (!INTEL_INFO(gt->i915)->gpu_reset_clobbers_display)
758                 __intel_gt_reset(gt, ALL_ENGINES);
759
760         for_each_engine(engine, gt->i915, id) {
761                 engine->submit_request = nop_submit_request;
762                 engine->schedule = NULL;
763         }
764         gt->i915->caps.scheduler = 0;
765
766         /*
767          * Make sure no request can slip through without getting completed by
768          * either the nop_submit_request() installed above or the
769          * cancel_requests() below.
770          */
771         synchronize_rcu_expedited();
772         set_bit(I915_WEDGED, &gt->reset.flags);
773
774         /* Mark all executing requests as skipped */
775         for_each_engine(engine, gt->i915, id)
776                 engine->cancel_requests(engine);
777
778         reset_finish(gt, awake);
779
780         GEM_TRACE("end\n");
781 }
782
783 void intel_gt_set_wedged(struct intel_gt *gt)
784 {
785         intel_wakeref_t wakeref;
786
787         mutex_lock(&gt->reset.mutex);
788         with_intel_runtime_pm(&gt->i915->runtime_pm, wakeref)
789                 __intel_gt_set_wedged(gt);
790         mutex_unlock(&gt->reset.mutex);
791 }
792
793 static bool __intel_gt_unset_wedged(struct intel_gt *gt)
794 {
795         struct intel_gt_timelines *timelines = &gt->timelines;
796         struct intel_timeline *tl;
797
798         if (!test_bit(I915_WEDGED, &gt->reset.flags))
799                 return true;
800
801         if (!gt->scratch) /* Never full initialised, recovery impossible */
802                 return false;
803
804         GEM_TRACE("start\n");
805
806         /*
807          * Before unwedging, make sure that all pending operations
808          * are flushed and errored out - we may have requests waiting upon
809          * third party fences. We marked all inflight requests as EIO, and
810          * every execbuf since has returned EIO; for consistency we want all
811          * the currently pending requests to also be marked as EIO, which
812          * is done inside our nop_submit_request - and so we must wait.
813          *
814          * No more can be submitted until we reset the wedged bit.
815          */
816         mutex_lock(&timelines->mutex);
817         list_for_each_entry(tl, &timelines->active_list, link) {
818                 struct i915_request *rq;
819
820                 rq = i915_active_request_get_unlocked(&tl->last_request);
821                 if (!rq)
822                         continue;
823
824                 /*
825                  * All internal dependencies (i915_requests) will have
826                  * been flushed by the set-wedge, but we may be stuck waiting
827                  * for external fences. These should all be capped to 10s
828                  * (I915_FENCE_TIMEOUT) so this wait should not be unbounded
829                  * in the worst case.
830                  */
831                 dma_fence_default_wait(&rq->fence, false, MAX_SCHEDULE_TIMEOUT);
832                 i915_request_put(rq);
833         }
834         mutex_unlock(&timelines->mutex);
835
836         intel_gt_sanitize(gt, false);
837
838         /*
839          * Undo nop_submit_request. We prevent all new i915 requests from
840          * being queued (by disallowing execbuf whilst wedged) so having
841          * waited for all active requests above, we know the system is idle
842          * and do not have to worry about a thread being inside
843          * engine->submit_request() as we swap over. So unlike installing
844          * the nop_submit_request on reset, we can do this from normal
845          * context and do not require stop_machine().
846          */
847         intel_engines_reset_default_submission(gt);
848
849         GEM_TRACE("end\n");
850
851         smp_mb__before_atomic(); /* complete takeover before enabling execbuf */
852         clear_bit(I915_WEDGED, &gt->reset.flags);
853
854         return true;
855 }
856
857 bool intel_gt_unset_wedged(struct intel_gt *gt)
858 {
859         bool result;
860
861         mutex_lock(&gt->reset.mutex);
862         result = __intel_gt_unset_wedged(gt);
863         mutex_unlock(&gt->reset.mutex);
864
865         return result;
866 }
867
868 static int do_reset(struct intel_gt *gt, intel_engine_mask_t stalled_mask)
869 {
870         int err, i;
871
872         gt_revoke(gt);
873
874         err = __intel_gt_reset(gt, ALL_ENGINES);
875         for (i = 0; err && i < RESET_MAX_RETRIES; i++) {
876                 msleep(10 * (i + 1));
877                 err = __intel_gt_reset(gt, ALL_ENGINES);
878         }
879         if (err)
880                 return err;
881
882         return gt_reset(gt, stalled_mask);
883 }
884
885 static int resume(struct intel_gt *gt)
886 {
887         struct intel_engine_cs *engine;
888         enum intel_engine_id id;
889         int ret;
890
891         for_each_engine(engine, gt->i915, id) {
892                 ret = engine->resume(engine);
893                 if (ret)
894                         return ret;
895         }
896
897         return 0;
898 }
899
900 /**
901  * intel_gt_reset - reset chip after a hang
902  * @gt: #intel_gt to reset
903  * @stalled_mask: mask of the stalled engines with the guilty requests
904  * @reason: user error message for why we are resetting
905  *
906  * Reset the chip.  Useful if a hang is detected. Marks the device as wedged
907  * on failure.
908  *
909  * Procedure is fairly simple:
910  *   - reset the chip using the reset reg
911  *   - re-init context state
912  *   - re-init hardware status page
913  *   - re-init ring buffer
914  *   - re-init interrupt state
915  *   - re-init display
916  */
917 void intel_gt_reset(struct intel_gt *gt,
918                     intel_engine_mask_t stalled_mask,
919                     const char *reason)
920 {
921         intel_engine_mask_t awake;
922         int ret;
923
924         GEM_TRACE("flags=%lx\n", gt->reset.flags);
925
926         might_sleep();
927         GEM_BUG_ON(!test_bit(I915_RESET_BACKOFF, &gt->reset.flags));
928         mutex_lock(&gt->reset.mutex);
929
930         /* Clear any previous failed attempts at recovery. Time to try again. */
931         if (!__intel_gt_unset_wedged(gt))
932                 goto unlock;
933
934         if (reason)
935                 dev_notice(gt->i915->drm.dev,
936                            "Resetting chip for %s\n", reason);
937         atomic_inc(&gt->i915->gpu_error.reset_count);
938
939         awake = reset_prepare(gt);
940
941         if (!intel_has_gpu_reset(gt->i915)) {
942                 if (i915_modparams.reset)
943                         dev_err(gt->i915->drm.dev, "GPU reset not supported\n");
944                 else
945                         DRM_DEBUG_DRIVER("GPU reset disabled\n");
946                 goto error;
947         }
948
949         if (INTEL_INFO(gt->i915)->gpu_reset_clobbers_display)
950                 intel_runtime_pm_disable_interrupts(gt->i915);
951
952         if (do_reset(gt, stalled_mask)) {
953                 dev_err(gt->i915->drm.dev, "Failed to reset chip\n");
954                 goto taint;
955         }
956
957         if (INTEL_INFO(gt->i915)->gpu_reset_clobbers_display)
958                 intel_runtime_pm_enable_interrupts(gt->i915);
959
960         intel_overlay_reset(gt->i915);
961
962         /*
963          * Next we need to restore the context, but we don't use those
964          * yet either...
965          *
966          * Ring buffer needs to be re-initialized in the KMS case, or if X
967          * was running at the time of the reset (i.e. we weren't VT
968          * switched away).
969          */
970         ret = i915_gem_init_hw(gt->i915);
971         if (ret) {
972                 DRM_ERROR("Failed to initialise HW following reset (%d)\n",
973                           ret);
974                 goto taint;
975         }
976
977         ret = resume(gt);
978         if (ret)
979                 goto taint;
980
981         intel_gt_queue_hangcheck(gt);
982
983 finish:
984         reset_finish(gt, awake);
985 unlock:
986         mutex_unlock(&gt->reset.mutex);
987         return;
988
989 taint:
990         /*
991          * History tells us that if we cannot reset the GPU now, we
992          * never will. This then impacts everything that is run
993          * subsequently. On failing the reset, we mark the driver
994          * as wedged, preventing further execution on the GPU.
995          * We also want to go one step further and add a taint to the
996          * kernel so that any subsequent faults can be traced back to
997          * this failure. This is important for CI, where if the
998          * GPU/driver fails we would like to reboot and restart testing
999          * rather than continue on into oblivion. For everyone else,
1000          * the system should still plod along, but they have been warned!
1001          */
1002         add_taint_for_CI(TAINT_WARN);
1003 error:
1004         __intel_gt_set_wedged(gt);
1005         goto finish;
1006 }
1007
1008 static inline int intel_gt_reset_engine(struct intel_engine_cs *engine)
1009 {
1010         return __intel_gt_reset(engine->gt, engine->mask);
1011 }
1012
1013 /**
1014  * intel_engine_reset - reset GPU engine to recover from a hang
1015  * @engine: engine to reset
1016  * @msg: reason for GPU reset; or NULL for no dev_notice()
1017  *
1018  * Reset a specific GPU engine. Useful if a hang is detected.
1019  * Returns zero on successful reset or otherwise an error code.
1020  *
1021  * Procedure is:
1022  *  - identify the request that caused the hang and drop it
1023  *  - reset engine (which will force the engine to idle)
1024  *  - re-init/configure engine
1025  */
1026 int intel_engine_reset(struct intel_engine_cs *engine, const char *msg)
1027 {
1028         struct intel_gt *gt = engine->gt;
1029         int ret;
1030
1031         GEM_TRACE("%s flags=%lx\n", engine->name, gt->reset.flags);
1032         GEM_BUG_ON(!test_bit(I915_RESET_ENGINE + engine->id, &gt->reset.flags));
1033
1034         if (!intel_engine_pm_get_if_awake(engine))
1035                 return 0;
1036
1037         reset_prepare_engine(engine);
1038
1039         if (msg)
1040                 dev_notice(engine->i915->drm.dev,
1041                            "Resetting %s for %s\n", engine->name, msg);
1042         atomic_inc(&engine->i915->gpu_error.reset_engine_count[engine->uabi_class]);
1043
1044         if (!engine->gt->uc.guc.execbuf_client)
1045                 ret = intel_gt_reset_engine(engine);
1046         else
1047                 ret = intel_guc_reset_engine(&engine->gt->uc.guc, engine);
1048         if (ret) {
1049                 /* If we fail here, we expect to fallback to a global reset */
1050                 DRM_DEBUG_DRIVER("%sFailed to reset %s, ret=%d\n",
1051                                  engine->gt->uc.guc.execbuf_client ? "GuC " : "",
1052                                  engine->name, ret);
1053                 goto out;
1054         }
1055
1056         /*
1057          * The request that caused the hang is stuck on elsp; we know the
1058          * active request and can drop it, adjusting the head to skip the
1059          * offending request and resume executing the rest of the queue.
1060          */
1061         __intel_engine_reset(engine, true);
1062
1063         /*
1064          * The engine and its registers (and workarounds in case of render)
1065          * have been reset to their default values. Follow the init_ring
1066          * process to program RING_MODE, HWSP and re-enable submission.
1067          */
1068         ret = engine->resume(engine);
1069
1070 out:
1071         intel_engine_cancel_stop_cs(engine);
1072         reset_finish_engine(engine);
1073         intel_engine_pm_put(engine);
1074         return ret;
1075 }
1076
1077 static void intel_gt_reset_global(struct intel_gt *gt,
1078                                   u32 engine_mask,
1079                                   const char *reason)
1080 {
1081         struct kobject *kobj = &gt->i915->drm.primary->kdev->kobj;
1082         char *error_event[] = { I915_ERROR_UEVENT "=1", NULL };
1083         char *reset_event[] = { I915_RESET_UEVENT "=1", NULL };
1084         char *reset_done_event[] = { I915_ERROR_UEVENT "=0", NULL };
1085         struct intel_wedge_me w;
1086
1087         kobject_uevent_env(kobj, KOBJ_CHANGE, error_event);
1088
1089         DRM_DEBUG_DRIVER("resetting chip\n");
1090         kobject_uevent_env(kobj, KOBJ_CHANGE, reset_event);
1091
1092         /* Use a watchdog to ensure that our reset completes */
1093         intel_wedge_on_timeout(&w, gt, 5 * HZ) {
1094                 intel_prepare_reset(gt->i915);
1095
1096                 /* Flush everyone using a resource about to be clobbered */
1097                 synchronize_srcu_expedited(&gt->reset.backoff_srcu);
1098
1099                 intel_gt_reset(gt, engine_mask, reason);
1100
1101                 intel_finish_reset(gt->i915);
1102         }
1103
1104         if (!test_bit(I915_WEDGED, &gt->reset.flags))
1105                 kobject_uevent_env(kobj, KOBJ_CHANGE, reset_done_event);
1106 }
1107
1108 /**
1109  * intel_gt_handle_error - handle a gpu error
1110  * @gt: the intel_gt
1111  * @engine_mask: mask representing engines that are hung
1112  * @flags: control flags
1113  * @fmt: Error message format string
1114  *
1115  * Do some basic checking of register state at error time and
1116  * dump it to the syslog.  Also call i915_capture_error_state() to make
1117  * sure we get a record and make it available in debugfs.  Fire a uevent
1118  * so userspace knows something bad happened (should trigger collection
1119  * of a ring dump etc.).
1120  */
1121 void intel_gt_handle_error(struct intel_gt *gt,
1122                            intel_engine_mask_t engine_mask,
1123                            unsigned long flags,
1124                            const char *fmt, ...)
1125 {
1126         struct intel_engine_cs *engine;
1127         intel_wakeref_t wakeref;
1128         intel_engine_mask_t tmp;
1129         char error_msg[80];
1130         char *msg = NULL;
1131
1132         if (fmt) {
1133                 va_list args;
1134
1135                 va_start(args, fmt);
1136                 vscnprintf(error_msg, sizeof(error_msg), fmt, args);
1137                 va_end(args);
1138
1139                 msg = error_msg;
1140         }
1141
1142         /*
1143          * In most cases it's guaranteed that we get here with an RPM
1144          * reference held, for example because there is a pending GPU
1145          * request that won't finish until the reset is done. This
1146          * isn't the case at least when we get here by doing a
1147          * simulated reset via debugfs, so get an RPM reference.
1148          */
1149         wakeref = intel_runtime_pm_get(&gt->i915->runtime_pm);
1150
1151         engine_mask &= INTEL_INFO(gt->i915)->engine_mask;
1152
1153         if (flags & I915_ERROR_CAPTURE) {
1154                 i915_capture_error_state(gt->i915, engine_mask, msg);
1155                 intel_gt_clear_error_registers(gt, engine_mask);
1156         }
1157
1158         /*
1159          * Try engine reset when available. We fall back to full reset if
1160          * single reset fails.
1161          */
1162         if (intel_has_reset_engine(gt->i915) && !intel_gt_is_wedged(gt)) {
1163                 for_each_engine_masked(engine, gt->i915, engine_mask, tmp) {
1164                         BUILD_BUG_ON(I915_RESET_MODESET >= I915_RESET_ENGINE);
1165                         if (test_and_set_bit(I915_RESET_ENGINE + engine->id,
1166                                              &gt->reset.flags))
1167                                 continue;
1168
1169                         if (intel_engine_reset(engine, msg) == 0)
1170                                 engine_mask &= ~engine->mask;
1171
1172                         clear_and_wake_up_bit(I915_RESET_ENGINE + engine->id,
1173                                               &gt->reset.flags);
1174                 }
1175         }
1176
1177         if (!engine_mask)
1178                 goto out;
1179
1180         /* Full reset needs the mutex, stop any other user trying to do so. */
1181         if (test_and_set_bit(I915_RESET_BACKOFF, &gt->reset.flags)) {
1182                 wait_event(gt->reset.queue,
1183                            !test_bit(I915_RESET_BACKOFF, &gt->reset.flags));
1184                 goto out; /* piggy-back on the other reset */
1185         }
1186
1187         /* Make sure intel_gt_reset_trylock() sees the I915_RESET_BACKOFF */
1188         synchronize_rcu_expedited();
1189
1190         /* Prevent any other reset-engine attempt. */
1191         for_each_engine(engine, gt->i915, tmp) {
1192                 while (test_and_set_bit(I915_RESET_ENGINE + engine->id,
1193                                         &gt->reset.flags))
1194                         wait_on_bit(&gt->reset.flags,
1195                                     I915_RESET_ENGINE + engine->id,
1196                                     TASK_UNINTERRUPTIBLE);
1197         }
1198
1199         intel_gt_reset_global(gt, engine_mask, msg);
1200
1201         for_each_engine(engine, gt->i915, tmp)
1202                 clear_bit_unlock(I915_RESET_ENGINE + engine->id,
1203                                  &gt->reset.flags);
1204         clear_bit_unlock(I915_RESET_BACKOFF, &gt->reset.flags);
1205         smp_mb__after_atomic();
1206         wake_up_all(&gt->reset.queue);
1207
1208 out:
1209         intel_runtime_pm_put(&gt->i915->runtime_pm, wakeref);
1210 }
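
/*
 * A hypothetical caller (e.g. a hang detector) reports a stuck engine
 * roughly like this; engine_mask selects the engines to try to reset and
 * I915_ERROR_CAPTURE requests an error-state snapshot first:
 *
 *	intel_gt_handle_error(engine->gt, engine->mask, I915_ERROR_CAPTURE,
 *			      "no progress on %s", engine->name);
 */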
1211
1212 int intel_gt_reset_trylock(struct intel_gt *gt)
1213 {
1214         int srcu;
1215
1216         might_lock(&gt->reset.backoff_srcu);
1217         might_sleep();
1218
1219         rcu_read_lock();
1220         while (test_bit(I915_RESET_BACKOFF, &gt->reset.flags)) {
1221                 rcu_read_unlock();
1222
1223                 if (wait_event_interruptible(gt->reset.queue,
1224                                              !test_bit(I915_RESET_BACKOFF,
1225                                                        &gt->reset.flags)))
1226                         return -EINTR;
1227
1228                 rcu_read_lock();
1229         }
1230         srcu = srcu_read_lock(&gt->reset.backoff_srcu);
1231         rcu_read_unlock();
1232
1233         return srcu;
1234 }
1235
1236 void intel_gt_reset_unlock(struct intel_gt *gt, int tag)
1237 __releases(&gt->reset.backoff_srcu)
1238 {
1239         srcu_read_unlock(&gt->reset.backoff_srcu, tag);
1240 }
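
/*
 * The trylock/unlock pair above forms an SRCU read-side section; a sketch
 * of the expected usage from code that must not race an ongoing reset (the
 * returned tag must be passed back to intel_gt_reset_unlock()):
 *
 *	int tag;
 *
 *	tag = intel_gt_reset_trylock(gt);
 *	if (tag < 0)
 *		return tag;
 *	... touch state that a concurrent reset would revoke ...
 *	intel_gt_reset_unlock(gt, tag);
 */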
1241
1242 int intel_gt_terminally_wedged(struct intel_gt *gt)
1243 {
1244         might_sleep();
1245
1246         if (!intel_gt_is_wedged(gt))
1247                 return 0;
1248
1249         /* Reset still in progress? Maybe we will recover? */
1250         if (!test_bit(I915_RESET_BACKOFF, &gt->reset.flags))
1251                 return -EIO;
1252
1253         /* XXX intel_reset_finish() still takes struct_mutex!!! */
1254         if (mutex_is_locked(&gt->i915->drm.struct_mutex))
1255                 return -EAGAIN;
1256
1257         if (wait_event_interruptible(gt->reset.queue,
1258                                      !test_bit(I915_RESET_BACKOFF,
1259                                                &gt->reset.flags)))
1260                 return -EINTR;
1261
1262         return intel_gt_is_wedged(gt) ? -EIO : 0;
1263 }
1264
1265 void intel_gt_init_reset(struct intel_gt *gt)
1266 {
1267         init_waitqueue_head(&gt->reset.queue);
1268         mutex_init(&gt->reset.mutex);
1269         init_srcu_struct(&gt->reset.backoff_srcu);
1270 }
1271
1272 void intel_gt_fini_reset(struct intel_gt *gt)
1273 {
1274         cleanup_srcu_struct(&gt->reset.backoff_srcu);
1275 }
1276
1277 static void intel_wedge_me(struct work_struct *work)
1278 {
1279         struct intel_wedge_me *w = container_of(work, typeof(*w), work.work);
1280
1281         dev_err(w->gt->i915->drm.dev,
1282                 "%s timed out, cancelling all in-flight rendering.\n",
1283                 w->name);
1284         intel_gt_set_wedged(w->gt);
1285 }
1286
1287 void __intel_init_wedge(struct intel_wedge_me *w,
1288                         struct intel_gt *gt,
1289                         long timeout,
1290                         const char *name)
1291 {
1292         w->gt = gt;
1293         w->name = name;
1294
1295         INIT_DELAYED_WORK_ONSTACK(&w->work, intel_wedge_me);
1296         schedule_delayed_work(&w->work, timeout);
1297 }
1298
1299 void __intel_fini_wedge(struct intel_wedge_me *w)
1300 {
1301         cancel_delayed_work_sync(&w->work);
1302         destroy_delayed_work_on_stack(&w->work);
1303         w->gt = NULL;
1304 }
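
/*
 * A short sketch of the watchdog pattern built on these helpers, as used by
 * intel_gt_reset_global() above via the intel_wedge_on_timeout() macro: if
 * the body does not complete within the timeout, intel_wedge_me() fires and
 * the GT is declared wedged.
 *
 *	struct intel_wedge_me w;
 *
 *	intel_wedge_on_timeout(&w, gt, 5 * HZ) {
 *		... reset work that must complete in time ...
 *	}
 */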
1305
1306 #if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
1307 #include "selftest_reset.c"
1308 #endif