drm/i915: Compute the HWS offsets explicitly
linux-2.6-block.git: drivers/gpu/drm/i915/i915_reset.c
1/*
2 * SPDX-License-Identifier: MIT
3 *
4 * Copyright © 2008-2018 Intel Corporation
5 */
6
7#include <linux/sched/mm.h>
8
9#include "i915_drv.h"
10#include "i915_gpu_error.h"
11#include "i915_reset.h"
12
13#include "intel_guc.h"
14
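/*
 * When a context has been banned, cancel the rest of its workload: skip
 * (with -EIO) every later request from that context already on the
 * engine's timeline, plus everything still queued on the context's own
 * timeline.
 */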
15static void engine_skip_context(struct i915_request *rq)
16{
17 struct intel_engine_cs *engine = rq->engine;
18 struct i915_gem_context *hung_ctx = rq->gem_context;
19 struct i915_timeline *timeline = rq->timeline;
20 unsigned long flags;
21
22 GEM_BUG_ON(timeline == &engine->timeline);
23
24 spin_lock_irqsave(&engine->timeline.lock, flags);
25 spin_lock(&timeline->lock);
26
27 list_for_each_entry_continue(rq, &engine->timeline.requests, link)
28 if (rq->gem_context == hung_ctx)
29 i915_request_skip(rq, -EIO);
30
31 list_for_each_entry(rq, &timeline->requests, link)
32 i915_request_skip(rq, -EIO);
33
34 spin_unlock(&timeline->lock);
35 spin_unlock_irqrestore(&engine->timeline.lock, flags);
36}
37
38static void client_mark_guilty(struct drm_i915_file_private *file_priv,
39 const struct i915_gem_context *ctx)
40{
41 unsigned int score;
42 unsigned long prev_hang;
43
44 if (i915_gem_context_is_banned(ctx))
45 score = I915_CLIENT_SCORE_CONTEXT_BAN;
46 else
47 score = 0;
48
49 prev_hang = xchg(&file_priv->hang_timestamp, jiffies);
50 if (time_before(jiffies, prev_hang + I915_CLIENT_FAST_HANG_JIFFIES))
51 score += I915_CLIENT_SCORE_HANG_FAST;
52
53 if (score) {
54 atomic_add(score, &file_priv->ban_score);
55
56 DRM_DEBUG_DRIVER("client %s: gained %u ban score, now %u\n",
57 ctx->name, score,
58 atomic_read(&file_priv->ban_score));
59 }
60}
61
62static void context_mark_guilty(struct i915_gem_context *ctx)
63{
64 unsigned int score;
65 bool banned, bannable;
66
67 atomic_inc(&ctx->guilty_count);
68
69 bannable = i915_gem_context_is_bannable(ctx);
70 score = atomic_add_return(CONTEXT_SCORE_GUILTY, &ctx->ban_score);
71 banned = score >= CONTEXT_SCORE_BAN_THRESHOLD;
72
73 /* Cool contexts don't accumulate client ban score */
74 if (!bannable)
75 return;
76
77 if (banned) {
78 DRM_DEBUG_DRIVER("context %s: guilty %d, score %u, banned\n",
79 ctx->name, atomic_read(&ctx->guilty_count),
80 score);
81 i915_gem_context_set_banned(ctx);
82 }
83
84 if (!IS_ERR_OR_NULL(ctx->file_priv))
85 client_mark_guilty(ctx->file_priv, ctx);
86}
87
88static void context_mark_innocent(struct i915_gem_context *ctx)
89{
90 atomic_inc(&ctx->active_count);
91}
92
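/*
 * Park the ring before asserting reset: ask the CS to stop, fold HEAD onto
 * TAIL so the ring appears empty, then clear HEAD/TAIL and disable the
 * ring via RING_CTL. The final HEAD read doubles as a posting read and a
 * sanity check that the ring really did park.
 */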
93static void gen3_stop_engine(struct intel_engine_cs *engine)
94{
95 struct drm_i915_private *dev_priv = engine->i915;
96 const u32 base = engine->mmio_base;
97
98 if (intel_engine_stop_cs(engine))
99 DRM_DEBUG_DRIVER("%s: timed out on STOP_RING\n", engine->name);
100
101 I915_WRITE_FW(RING_HEAD(base), I915_READ_FW(RING_TAIL(base)));
102 POSTING_READ_FW(RING_HEAD(base)); /* paranoia */
103
104 I915_WRITE_FW(RING_HEAD(base), 0);
105 I915_WRITE_FW(RING_TAIL(base), 0);
106 POSTING_READ_FW(RING_TAIL(base));
107
108 /* The ring must be empty before it is disabled */
109 I915_WRITE_FW(RING_CTL(base), 0);
110
111 /* Check acts as a post */
112 if (I915_READ_FW(RING_HEAD(base)) != 0)
113 DRM_DEBUG_DRIVER("%s: ring head not parked\n",
114 engine->name);
115}
116
117static void i915_stop_engines(struct drm_i915_private *i915,
118 unsigned int engine_mask)
119{
120 struct intel_engine_cs *engine;
121 enum intel_engine_id id;
122
123 if (INTEL_GEN(i915) < 3)
124 return;
125
126 for_each_engine_masked(engine, i915, engine_mask, id)
127 gen3_stop_engine(engine);
128}
129
130static bool i915_in_reset(struct pci_dev *pdev)
131{
132 u8 gdrst;
133
134 pci_read_config_byte(pdev, I915_GDRST, &gdrst);
135 return gdrst & GRDOM_RESET_STATUS;
136}
137
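/*
 * Original full-chip reset: request the reset through the GDRST byte in
 * PCI config space, wait for the status bit to report the reset as
 * active, then clear the request and wait for it to complete.
 */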
138static int i915_do_reset(struct drm_i915_private *i915,
139 unsigned int engine_mask,
140 unsigned int retry)
141{
142 struct pci_dev *pdev = i915->drm.pdev;
143 int err;
144
145 /* Assert reset for at least 20 usec, and wait for acknowledgement. */
146 pci_write_config_byte(pdev, I915_GDRST, GRDOM_RESET_ENABLE);
147 usleep_range(50, 200);
148 err = wait_for(i915_in_reset(pdev), 500);
149
150 /* Clear the reset request. */
151 pci_write_config_byte(pdev, I915_GDRST, 0);
152 usleep_range(50, 200);
153 if (!err)
154 err = wait_for(!i915_in_reset(pdev), 500);
155
156 return err;
157}
158
159static bool g4x_reset_complete(struct pci_dev *pdev)
160{
161 u8 gdrst;
162
163 pci_read_config_byte(pdev, I915_GDRST, &gdrst);
164 return (gdrst & GRDOM_RESET_ENABLE) == 0;
165}
166
167static int g33_do_reset(struct drm_i915_private *i915,
168 unsigned int engine_mask,
169 unsigned int retry)
170{
171 struct pci_dev *pdev = i915->drm.pdev;
172
173 pci_write_config_byte(pdev, I915_GDRST, GRDOM_RESET_ENABLE);
174 return wait_for(g4x_reset_complete(pdev), 500);
175}
176
177static int g4x_do_reset(struct drm_i915_private *dev_priv,
178 unsigned int engine_mask,
179 unsigned int retry)
180{
181 struct pci_dev *pdev = dev_priv->drm.pdev;
182 int ret;
183
184 /* WaVcpClkGateDisableForMediaReset:ctg,elk */
185 I915_WRITE(VDECCLK_GATE_D,
186 I915_READ(VDECCLK_GATE_D) | VCP_UNIT_CLOCK_GATE_DISABLE);
187 POSTING_READ(VDECCLK_GATE_D);
188
189 pci_write_config_byte(pdev, I915_GDRST,
190 GRDOM_MEDIA | GRDOM_RESET_ENABLE);
191 ret = wait_for(g4x_reset_complete(pdev), 500);
192 if (ret) {
193 DRM_DEBUG_DRIVER("Wait for media reset failed\n");
194 goto out;
195 }
196
197 pci_write_config_byte(pdev, I915_GDRST,
198 GRDOM_RENDER | GRDOM_RESET_ENABLE);
199 ret = wait_for(g4x_reset_complete(pdev), 500);
200 if (ret) {
201 DRM_DEBUG_DRIVER("Wait for render reset failed\n");
202 goto out;
203 }
204
205out:
206 pci_write_config_byte(pdev, I915_GDRST, 0);
207
208 I915_WRITE(VDECCLK_GATE_D,
209 I915_READ(VDECCLK_GATE_D) & ~VCP_UNIT_CLOCK_GATE_DISABLE);
210 POSTING_READ(VDECCLK_GATE_D);
211
212 return ret;
213}
214
215static int ironlake_do_reset(struct drm_i915_private *dev_priv,
216 unsigned int engine_mask,
217 unsigned int retry)
218{
219 int ret;
220
221 I915_WRITE(ILK_GDSR, ILK_GRDOM_RENDER | ILK_GRDOM_RESET_ENABLE);
222 ret = intel_wait_for_register(dev_priv,
223 ILK_GDSR, ILK_GRDOM_RESET_ENABLE, 0,
224 500);
225 if (ret) {
226 DRM_DEBUG_DRIVER("Wait for render reset failed\n");
227 goto out;
228 }
229
230 I915_WRITE(ILK_GDSR, ILK_GRDOM_MEDIA | ILK_GRDOM_RESET_ENABLE);
231 ret = intel_wait_for_register(dev_priv,
232 ILK_GDSR, ILK_GRDOM_RESET_ENABLE, 0,
233 500);
234 if (ret) {
235 DRM_DEBUG_DRIVER("Wait for media reset failed\n");
236 goto out;
237 }
238
239out:
240 I915_WRITE(ILK_GDSR, 0);
241 POSTING_READ(ILK_GDSR);
242 return ret;
243}
244
245/* Reset the hardware domains (GENX_GRDOM_*) specified by mask */
246static int gen6_hw_domain_reset(struct drm_i915_private *dev_priv,
247 u32 hw_domain_mask)
248{
249 int err;
250
251 /*
252 * GEN6_GDRST is not in the gt power well, no need to check
253 * for fifo space for the write or forcewake the chip for
254 * the read
255 */
256 I915_WRITE_FW(GEN6_GDRST, hw_domain_mask);
257
258 /* Wait for the device to ack the reset requests */
259 err = __intel_wait_for_register_fw(dev_priv,
260 GEN6_GDRST, hw_domain_mask, 0,
261 500, 0,
262 NULL);
263 if (err)
264 DRM_DEBUG_DRIVER("Wait for 0x%08x engines reset failed\n",
265 hw_domain_mask);
266
267 return err;
268}
269
270static int gen6_reset_engines(struct drm_i915_private *i915,
271 unsigned int engine_mask,
272 unsigned int retry)
273{
274 struct intel_engine_cs *engine;
275 const u32 hw_engine_mask[I915_NUM_ENGINES] = {
276 [RCS] = GEN6_GRDOM_RENDER,
277 [BCS] = GEN6_GRDOM_BLT,
278 [VCS] = GEN6_GRDOM_MEDIA,
279 [VCS2] = GEN8_GRDOM_MEDIA2,
280 [VECS] = GEN6_GRDOM_VECS,
281 };
282 u32 hw_mask;
283
284 if (engine_mask == ALL_ENGINES) {
285 hw_mask = GEN6_GRDOM_FULL;
286 } else {
287 unsigned int tmp;
288
289 hw_mask = 0;
290 for_each_engine_masked(engine, i915, engine_mask, tmp)
291 hw_mask |= hw_engine_mask[engine->id];
292 }
293
294 return gen6_hw_domain_reset(i915, hw_mask);
295}
296
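/*
 * On gen11 the SFC (scaler and format converter) units are shared between
 * media engines. Before resetting a VCS/VECS engine, force-lock the SFC it
 * may be using so ownership cannot change mid-reset; if the SFC is in use
 * by this engine, return the extra GDRST bit needed to reset the SFC as
 * well, otherwise return 0.
 */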
297static u32 gen11_lock_sfc(struct drm_i915_private *dev_priv,
298 struct intel_engine_cs *engine)
299{
300 u8 vdbox_sfc_access = RUNTIME_INFO(dev_priv)->vdbox_sfc_access;
301 i915_reg_t sfc_forced_lock, sfc_forced_lock_ack;
302 u32 sfc_forced_lock_bit, sfc_forced_lock_ack_bit;
303 i915_reg_t sfc_usage;
304 u32 sfc_usage_bit;
305 u32 sfc_reset_bit;
306
307 switch (engine->class) {
308 case VIDEO_DECODE_CLASS:
309 if ((BIT(engine->instance) & vdbox_sfc_access) == 0)
310 return 0;
311
312 sfc_forced_lock = GEN11_VCS_SFC_FORCED_LOCK(engine);
313 sfc_forced_lock_bit = GEN11_VCS_SFC_FORCED_LOCK_BIT;
314
315 sfc_forced_lock_ack = GEN11_VCS_SFC_LOCK_STATUS(engine);
316 sfc_forced_lock_ack_bit = GEN11_VCS_SFC_LOCK_ACK_BIT;
317
318 sfc_usage = GEN11_VCS_SFC_LOCK_STATUS(engine);
319 sfc_usage_bit = GEN11_VCS_SFC_USAGE_BIT;
320 sfc_reset_bit = GEN11_VCS_SFC_RESET_BIT(engine->instance);
321 break;
322
323 case VIDEO_ENHANCEMENT_CLASS:
324 sfc_forced_lock = GEN11_VECS_SFC_FORCED_LOCK(engine);
325 sfc_forced_lock_bit = GEN11_VECS_SFC_FORCED_LOCK_BIT;
326
327 sfc_forced_lock_ack = GEN11_VECS_SFC_LOCK_ACK(engine);
328 sfc_forced_lock_ack_bit = GEN11_VECS_SFC_LOCK_ACK_BIT;
329
330 sfc_usage = GEN11_VECS_SFC_USAGE(engine);
331 sfc_usage_bit = GEN11_VECS_SFC_USAGE_BIT;
332 sfc_reset_bit = GEN11_VECS_SFC_RESET_BIT(engine->instance);
333 break;
334
335 default:
336 return 0;
337 }
338
339 /*
340 * Tell the engine that a software reset is going to happen. The engine
341 * will then try to force lock the SFC (if currently locked, it will
342 * remain so until we tell the engine it is safe to unlock; if currently
343 * unlocked, it will ignore this and all new lock requests). If SFC
344 * ends up being locked to the engine we want to reset, we have to reset
345 * it as well (we will unlock it once the reset sequence is completed).
346 */
347 I915_WRITE_FW(sfc_forced_lock,
348 I915_READ_FW(sfc_forced_lock) | sfc_forced_lock_bit);
349
350 if (__intel_wait_for_register_fw(dev_priv,
351 sfc_forced_lock_ack,
352 sfc_forced_lock_ack_bit,
353 sfc_forced_lock_ack_bit,
354 1000, 0, NULL)) {
355 DRM_DEBUG_DRIVER("Wait for SFC forced lock ack failed\n");
356 return 0;
357 }
358
359 if (I915_READ_FW(sfc_usage) & sfc_usage_bit)
360 return sfc_reset_bit;
361
362 return 0;
363}
364
365static void gen11_unlock_sfc(struct drm_i915_private *dev_priv,
366 struct intel_engine_cs *engine)
367{
368 u8 vdbox_sfc_access = RUNTIME_INFO(dev_priv)->vdbox_sfc_access;
369 i915_reg_t sfc_forced_lock;
370 u32 sfc_forced_lock_bit;
371
372 switch (engine->class) {
373 case VIDEO_DECODE_CLASS:
374 if ((BIT(engine->instance) & vdbox_sfc_access) == 0)
375 return;
376
377 sfc_forced_lock = GEN11_VCS_SFC_FORCED_LOCK(engine);
378 sfc_forced_lock_bit = GEN11_VCS_SFC_FORCED_LOCK_BIT;
379 break;
380
381 case VIDEO_ENHANCEMENT_CLASS:
382 sfc_forced_lock = GEN11_VECS_SFC_FORCED_LOCK(engine);
383 sfc_forced_lock_bit = GEN11_VECS_SFC_FORCED_LOCK_BIT;
384 break;
385
386 default:
387 return;
388 }
389
390 I915_WRITE_FW(sfc_forced_lock,
391 I915_READ_FW(sfc_forced_lock) & ~sfc_forced_lock_bit);
392}
393
394static int gen11_reset_engines(struct drm_i915_private *i915,
395 unsigned int engine_mask,
396 unsigned int retry)
397{
398 const u32 hw_engine_mask[I915_NUM_ENGINES] = {
399 [RCS] = GEN11_GRDOM_RENDER,
400 [BCS] = GEN11_GRDOM_BLT,
401 [VCS] = GEN11_GRDOM_MEDIA,
402 [VCS2] = GEN11_GRDOM_MEDIA2,
403 [VCS3] = GEN11_GRDOM_MEDIA3,
404 [VCS4] = GEN11_GRDOM_MEDIA4,
405 [VECS] = GEN11_GRDOM_VECS,
406 [VECS2] = GEN11_GRDOM_VECS2,
407 };
408 struct intel_engine_cs *engine;
409 unsigned int tmp;
410 u32 hw_mask;
411 int ret;
412
413 BUILD_BUG_ON(VECS2 + 1 != I915_NUM_ENGINES);
414
415 if (engine_mask == ALL_ENGINES) {
416 hw_mask = GEN11_GRDOM_FULL;
417 } else {
418 hw_mask = 0;
419 for_each_engine_masked(engine, i915, engine_mask, tmp) {
420 hw_mask |= hw_engine_mask[engine->id];
421 hw_mask |= gen11_lock_sfc(i915, engine);
422 }
423 }
424
425 ret = gen6_hw_domain_reset(i915, hw_mask);
426
427 if (engine_mask != ALL_ENGINES)
428 for_each_engine_masked(engine, i915, engine_mask, tmp)
429 gen11_unlock_sfc(i915, engine);
430
431 return ret;
432}
433
434static int gen8_engine_reset_prepare(struct intel_engine_cs *engine)
435{
436 struct drm_i915_private *dev_priv = engine->i915;
437 int ret;
438
439 I915_WRITE_FW(RING_RESET_CTL(engine->mmio_base),
440 _MASKED_BIT_ENABLE(RESET_CTL_REQUEST_RESET));
441
442 ret = __intel_wait_for_register_fw(dev_priv,
443 RING_RESET_CTL(engine->mmio_base),
444 RESET_CTL_READY_TO_RESET,
445 RESET_CTL_READY_TO_RESET,
446 700, 0,
447 NULL);
448 if (ret)
449 DRM_ERROR("%s: reset request timeout\n", engine->name);
450
451 return ret;
452}
453
454static void gen8_engine_reset_cancel(struct intel_engine_cs *engine)
455{
456 struct drm_i915_private *dev_priv = engine->i915;
457
458 I915_WRITE_FW(RING_RESET_CTL(engine->mmio_base),
459 _MASKED_BIT_DISABLE(RESET_CTL_REQUEST_RESET));
460}
461
462static int gen8_reset_engines(struct drm_i915_private *i915,
463 unsigned int engine_mask,
464 unsigned int retry)
465{
466 struct intel_engine_cs *engine;
467 const bool reset_non_ready = retry >= 1;
468 unsigned int tmp;
469 int ret;
470
471 for_each_engine_masked(engine, i915, engine_mask, tmp) {
472 ret = gen8_engine_reset_prepare(engine);
473 if (ret && !reset_non_ready)
474 goto skip_reset;
475
476 /*
477 * If this is not the first failed attempt to prepare,
478 * we decide to proceed anyway.
479 *
480 * By doing so we risk context corruption and, on
481 * some gens (kbl), a possible system hang if the reset
482 * happens during active bb execution.
483 *
484 * We would rather take context corruption than a
485 * failed reset that leaves the driver/gpu wedged. And
486 * the active bb execution case should be covered by
487 * the i915_stop_engines() call made before the reset.
488 */
489 }
490
491 if (INTEL_GEN(i915) >= 11)
492 ret = gen11_reset_engines(i915, engine_mask, retry);
493 else
494 ret = gen6_reset_engines(i915, engine_mask, retry);
495
496skip_reset:
497 for_each_engine_masked(engine, i915, engine_mask, tmp)
498 gen8_engine_reset_cancel(engine);
499
500 return ret;
501}
502
503typedef int (*reset_func)(struct drm_i915_private *,
504 unsigned int engine_mask,
505 unsigned int retry);
506
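/*
 * Pick the platform-appropriate reset routine, or NULL if reset has been
 * disabled via the i915.reset modparam or is simply not supported.
 */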
507static reset_func intel_get_gpu_reset(struct drm_i915_private *i915)
508{
509 if (!i915_modparams.reset)
510 return NULL;
511
512 if (INTEL_GEN(i915) >= 8)
513 return gen8_reset_engines;
514 else if (INTEL_GEN(i915) >= 6)
515 return gen6_reset_engines;
516 else if (INTEL_GEN(i915) >= 5)
517 return ironlake_do_reset;
518 else if (IS_G4X(i915))
519 return g4x_do_reset;
520 else if (IS_G33(i915) || IS_PINEVIEW(i915))
521 return g33_do_reset;
522 else if (INTEL_GEN(i915) >= 3)
523 return i915_do_reset;
524 else
525 return NULL;
526}
527
528int intel_gpu_reset(struct drm_i915_private *i915, unsigned int engine_mask)
529{
530 reset_func reset = intel_get_gpu_reset(i915);
531 int retry;
532 int ret;
533
534 /*
535 * We want to perform per-engine reset from atomic context (e.g.
536 * softirq), which imposes the constraint that we cannot sleep.
537 * However, experience suggests that spending a bit of time waiting
538 * for a reset helps in various cases, so for a full-device reset
539 * we apply the opposite rule and wait if we want to. As we should
540 * always follow up a failed per-engine reset with a full device reset,
541 * being a little faster, stricter and more error prone for the
542 * atomic case seems an acceptable compromise.
543 *
544 * Unfortunately this leads to a bimodal routine, when the goal was
545 * to have a single reset function that worked for resetting any
546 * number of engines simultaneously.
547 */
548 might_sleep_if(engine_mask == ALL_ENGINES);
549
550 /*
551 * If the power well sleeps during the reset, the reset
552 * request may be dropped and never completes (causing -EIO).
553 */
554 intel_uncore_forcewake_get(i915, FORCEWAKE_ALL);
555 for (retry = 0; retry < 3; retry++) {
556 /*
557 * We stop the engines first, otherwise we might get a failed
558 * reset and a dead gpu (on elk). Also, a gpu as modern as kbl
559 * can suffer a system hang if a batchbuffer is in progress when
560 * the reset is issued, regardless of the READY_TO_RESET ack.
561 * Thus assume it is best to stop the engines on all gens
562 * where we have a gpu reset.
563 *
564 * WaKBLVECSSemaphoreWaitPoll:kbl (on ALL_ENGINES)
565 *
566 * WaMediaResetMainRingCleanup:ctg,elk (presumably)
567 *
568 * FIXME: Wa for more modern gens needs to be validated
569 */
570 i915_stop_engines(i915, engine_mask);
571
572 ret = -ENODEV;
573 if (reset) {
574 GEM_TRACE("engine_mask=%x\n", engine_mask);
575 ret = reset(i915, engine_mask, retry);
576 }
577 if (ret != -ETIMEDOUT || engine_mask != ALL_ENGINES)
578 break;
579
580 cond_resched();
581 }
582 intel_uncore_forcewake_put(i915, FORCEWAKE_ALL);
583
584 return ret;
585}
586
587bool intel_has_gpu_reset(struct drm_i915_private *i915)
588{
589 return intel_get_gpu_reset(i915);
590}
591
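/*
 * Per-engine reset additionally requires i915.reset >= 2 (the modparam is
 * described as: 0 = reset disabled, 1 = full GPU reset only, 2 = also
 * allow per-engine resets).
 */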
592bool intel_has_reset_engine(struct drm_i915_private *i915)
593{
594 return INTEL_INFO(i915)->has_reset_engine && i915_modparams.reset >= 2;
595}
596
597int intel_reset_guc(struct drm_i915_private *i915)
598{
599 u32 guc_domain =
600 INTEL_GEN(i915) >= 11 ? GEN11_GRDOM_GUC : GEN9_GRDOM_GUC;
601 int ret;
602
603 GEM_BUG_ON(!HAS_GUC(i915));
604
605 intel_uncore_forcewake_get(i915, FORCEWAKE_ALL);
606 ret = gen6_hw_domain_reset(i915, guc_domain);
607 intel_uncore_forcewake_put(i915, FORCEWAKE_ALL);
608
609 return ret;
610}
611
612/*
613 * Ensure the irq handler finishes, and is not run again.
614 * Also return the active request so that we only search for it once.
615 */
616static struct i915_request *
617reset_prepare_engine(struct intel_engine_cs *engine)
618{
619 struct i915_request *rq;
620
621 /*
622 * During the reset sequence, we must prevent the engine from
623 * entering RC6. As the context state is undefined until we restart
624 * the engine, if it does enter RC6 during the reset, the state
625 * written to the powercontext is undefined and so we may lose
626 * GPU state upon resume, i.e. fail to restart after a reset.
627 */
628 intel_uncore_forcewake_get(engine->i915, FORCEWAKE_ALL);
629
630 rq = engine->reset.prepare(engine);
631 if (rq && rq->fence.error == -EIO)
632 rq = ERR_PTR(-EIO); /* Previous reset failed! */
633
634 return rq;
635}
636
637static int reset_prepare(struct drm_i915_private *i915)
638{
639 struct intel_engine_cs *engine;
640 struct i915_request *rq;
641 enum intel_engine_id id;
642 int err = 0;
643
644 for_each_engine(engine, i915, id) {
645 rq = reset_prepare_engine(engine);
646 if (IS_ERR(rq)) {
647 err = PTR_ERR(rq);
648 continue;
649 }
650
651 engine->hangcheck.active_request = rq;
652 }
653
654 i915_gem_revoke_fences(i915);
655 intel_uc_sanitize(i915);
656
657 return err;
658}
659
660/* Returns the request if it was guilty of the hang */
661static struct i915_request *
662reset_request(struct intel_engine_cs *engine,
663 struct i915_request *rq,
664 bool stalled)
665{
666 /*
667 * The guilty request will get skipped on a hung engine.
668 *
669 * Users of client default contexts do not rely on logical
670 * state preserved between batches so it is safe to execute
671 * queued requests following the hang. Non default contexts
672 * rely on preserved state, so skipping a batch loses the
673 * evolution of the state and it needs to be considered corrupted.
674 * Executing more queued batches on top of corrupted state is
675 * risky. But we take the risk by trying to advance through
676 * the queued requests in order to make the client behaviour
677 * more predictable around resets, by not throwing away a random
678 * number of batches it has prepared for execution. Sophisticated
679 * clients can use gem_reset_stats_ioctl and dma fence status
680 * (exported via the sync_file info ioctl on explicit fences) to observe
681 * when they lose the context state and should rebuild accordingly.
682 *
683 * The context ban, and ultimately the client ban, mechanism are safety
684 * valves if client submission ends up resulting in nothing more than
685 * subsequent hangs.
686 */
687
688 if (i915_request_completed(rq)) {
689 GEM_TRACE("%s pardoned global=%d (fence %llx:%lld), current %d\n",
690 engine->name, rq->global_seqno,
691 rq->fence.context, rq->fence.seqno,
692 intel_engine_get_seqno(engine));
693 stalled = false;
694 }
695
696 if (stalled) {
697 context_mark_guilty(rq->gem_context);
698 i915_request_skip(rq, -EIO);
699
700 /* If this context is now banned, skip all pending requests. */
701 if (i915_gem_context_is_banned(rq->gem_context))
702 engine_skip_context(rq);
703 } else {
704 /*
705 * Since this is not the hung engine, it may have advanced
706 * since the hang declaration. Double check by refinding
707 * the active request at the time of the reset.
708 */
709 rq = i915_gem_find_active_request(engine);
710 if (rq) {
711 unsigned long flags;
712
713 context_mark_innocent(rq->gem_context);
714 dma_fence_set_error(&rq->fence, -EAGAIN);
715
716 /* Rewind the engine to replay the incomplete rq */
717 spin_lock_irqsave(&engine->timeline.lock, flags);
718 rq = list_prev_entry(rq, link);
719 if (&rq->link == &engine->timeline.requests)
720 rq = NULL;
721 spin_unlock_irqrestore(&engine->timeline.lock, flags);
722 }
723 }
724
725 return rq;
726}
727
728static void reset_engine(struct intel_engine_cs *engine,
729 struct i915_request *rq,
730 bool stalled)
731{
732 if (rq)
733 rq = reset_request(engine, rq, stalled);
734
735 /* Setup the CS to resume from the breadcrumb of the hung request */
736 engine->reset.reset(engine, rq);
737}
738
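/*
 * Post-reset fixup for each engine: hand it back its active request (to
 * be replayed or skipped), drop the stale last-retired context reference,
 * and make sure an idle engine gets a kernel context request loaded again.
 */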
739static void gt_reset(struct drm_i915_private *i915, unsigned int stalled_mask)
740{
741 struct intel_engine_cs *engine;
742 enum intel_engine_id id;
743
744 lockdep_assert_held(&i915->drm.struct_mutex);
745
746 i915_retire_requests(i915);
747
748 for_each_engine(engine, i915, id) {
749 struct intel_context *ce;
750
751 reset_engine(engine,
752 engine->hangcheck.active_request,
753 stalled_mask & ENGINE_MASK(id));
754 ce = fetch_and_zero(&engine->last_retired_context);
755 if (ce)
756 intel_context_unpin(ce);
757
758 /*
759 * Ostensibly, we always want a context loaded for powersaving,
760 * so if the engine is idle after the reset, send a request
761 * to load our scratch kernel_context.
762 *
763 * More mysteriously, if we leave the engine idle after a reset,
764 * the next userspace batch may hang, with what appears to be
765 * an incoherent read by the CS (presumably stale TLB). An
766 * empty request appears sufficient to paper over the glitch.
767 */
768 if (intel_engine_is_idle(engine)) {
769 struct i915_request *rq;
770
771 rq = i915_request_alloc(engine, i915->kernel_context);
772 if (!IS_ERR(rq))
773 i915_request_add(rq);
774 }
775 }
776
777 i915_gem_restore_fences(i915);
778}
779
780static void reset_finish_engine(struct intel_engine_cs *engine)
781{
782 engine->reset.finish(engine);
783
784 intel_uncore_forcewake_put(engine->i915, FORCEWAKE_ALL);
785}
786
787static void reset_finish(struct drm_i915_private *i915)
788{
789 struct intel_engine_cs *engine;
790 enum intel_engine_id id;
791
792 lockdep_assert_held(&i915->drm.struct_mutex);
793
794 for_each_engine(engine, i915, id) {
795 engine->hangcheck.active_request = NULL;
796 reset_finish_engine(engine);
797 }
798}
799
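/*
 * Once the device is wedged, requests are "submitted" by marking them as
 * failed with -EIO and writing the global seqno up to the request, so
 * that waiters see the request complete instead of hanging forever.
 */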
800static void nop_submit_request(struct i915_request *request)
801{
802 unsigned long flags;
803
804 GEM_TRACE("%s fence %llx:%lld -> -EIO\n",
805 request->engine->name,
806 request->fence.context, request->fence.seqno);
807 dma_fence_set_error(&request->fence, -EIO);
808
809 spin_lock_irqsave(&request->engine->timeline.lock, flags);
810 __i915_request_submit(request);
811 intel_engine_write_global_seqno(request->engine, request->global_seqno);
812 spin_unlock_irqrestore(&request->engine->timeline.lock, flags);
813}
814
815void i915_gem_set_wedged(struct drm_i915_private *i915)
816{
817 struct i915_gpu_error *error = &i915->gpu_error;
818 struct intel_engine_cs *engine;
819 enum intel_engine_id id;
820
821 mutex_lock(&error->wedge_mutex);
822 if (test_bit(I915_WEDGED, &error->flags)) {
823 mutex_unlock(&error->wedge_mutex);
824 return;
825 }
826
827 if (GEM_SHOW_DEBUG() && !intel_engines_are_idle(i915)) {
828 struct drm_printer p = drm_debug_printer(__func__);
829
830 for_each_engine(engine, i915, id)
831 intel_engine_dump(engine, &p, "%s\n", engine->name);
832 }
833
834 GEM_TRACE("start\n");
835
836 /*
837 * First, stop submission to hw, but do not yet complete requests by
838 * rolling the global seqno forward (since this would complete requests
839 * for which we haven't set the fence error to EIO yet).
840 */
841 for_each_engine(engine, i915, id)
842 reset_prepare_engine(engine);
843
844 /* Even if the GPU reset fails, it should still stop the engines */
845 if (INTEL_GEN(i915) >= 5)
846 intel_gpu_reset(i915, ALL_ENGINES);
847
848 for_each_engine(engine, i915, id) {
849 engine->submit_request = nop_submit_request;
850 engine->schedule = NULL;
851 }
852 i915->caps.scheduler = 0;
853
854 /*
855 * Make sure no request can slip through without getting completed by
856 * either this call here to intel_engine_write_global_seqno, or the one
857 * in nop_submit_request.
858 */
859 synchronize_rcu();
860
861 /* Mark all executing requests as skipped */
862 for_each_engine(engine, i915, id)
863 engine->cancel_requests(engine);
864
865 for_each_engine(engine, i915, id) {
866 reset_finish_engine(engine);
867 intel_engine_wakeup(engine);
868 }
869
870 smp_mb__before_atomic();
871 set_bit(I915_WEDGED, &error->flags);
872
873 GEM_TRACE("end\n");
874 mutex_unlock(&error->wedge_mutex);
875
876 wake_up_all(&error->reset_queue);
877}
878
879bool i915_gem_unset_wedged(struct drm_i915_private *i915)
880{
881 struct i915_gpu_error *error = &i915->gpu_error;
882 struct i915_timeline *tl;
883 bool ret = false;
884
885 lockdep_assert_held(&i915->drm.struct_mutex);
886
887 if (!test_bit(I915_WEDGED, &error->flags))
888 return true;
889
890	if (!i915->gt.scratch) /* Never fully initialised, recovery impossible */
891 return false;
892
893 mutex_lock(&error->wedge_mutex);
894
895 GEM_TRACE("start\n");
896
897 /*
898 * Before unwedging, make sure that all pending operations
899 * are flushed and errored out - we may have requests waiting upon
900 * third party fences. We marked all inflight requests as EIO, and
901 * every execbuf since has returned EIO; for consistency we want all
902 * the currently pending requests to also be marked as EIO, which
903 * is done inside our nop_submit_request - and so we must wait.
904 *
905 * No more can be submitted until we reset the wedged bit.
906 */
907 list_for_each_entry(tl, &i915->gt.timelines, link) {
908 struct i915_request *rq;
909
910 rq = i915_gem_active_peek(&tl->last_request,
911 &i915->drm.struct_mutex);
912 if (!rq)
913 continue;
914
915 /*
916 * We can't use our normal waiter as we want to
917 * avoid recursively trying to handle the current
918 * reset. The basic dma_fence_default_wait() installs
919 * a callback for dma_fence_signal(), which is
920 * triggered by our nop handler (indirectly, the
921 * callback enables the signaler thread which is
922 * woken by the nop_submit_request() advancing the seqno
923 * and when the seqno passes the fence, the signaler
924 * then signals the fence waking us up).
925 */
926 if (dma_fence_default_wait(&rq->fence, true,
927 MAX_SCHEDULE_TIMEOUT) < 0)
928 goto unlock;
929 }
930 i915_retire_requests(i915);
931 GEM_BUG_ON(i915->gt.active_requests);
932
933 intel_engines_sanitize(i915, false);
934
935 /*
936 * Undo nop_submit_request. We prevent all new i915 requests from
937 * being queued (by disallowing execbuf whilst wedged) so having
938 * waited for all active requests above, we know the system is idle
939 * and do not have to worry about a thread being inside
940 * engine->submit_request() as we swap over. So unlike installing
941 * the nop_submit_request on reset, we can do this from normal
942 * context and do not require stop_machine().
943 */
944 intel_engines_reset_default_submission(i915);
945 i915_gem_contexts_lost(i915);
946
947 GEM_TRACE("end\n");
948
949 smp_mb__before_atomic(); /* complete takeover before enabling execbuf */
950 clear_bit(I915_WEDGED, &i915->gpu_error.flags);
951 ret = true;
952unlock:
953 mutex_unlock(&i915->gpu_error.wedge_mutex);
954
955 return ret;
956}
957
958/**
959 * i915_reset - reset chip after a hang
960 * @i915: #drm_i915_private to reset
961 * @stalled_mask: mask of the stalled engines with the guilty requests
962 * @reason: user error message for why we are resetting
963 *
964 * Reset the chip. Useful if a hang is detected. Marks the device as wedged
965 * on failure.
966 *
967 * Caller must hold the struct_mutex.
968 *
969 * Procedure is fairly simple:
970 * - reset the chip using the reset reg
971 * - re-init context state
972 * - re-init hardware status page
973 * - re-init ring buffer
974 * - re-init interrupt state
975 * - re-init display
976 */
977void i915_reset(struct drm_i915_private *i915,
978 unsigned int stalled_mask,
979 const char *reason)
980{
981 struct i915_gpu_error *error = &i915->gpu_error;
982 int ret;
983 int i;
984
985 GEM_TRACE("flags=%lx\n", error->flags);
986
987 might_sleep();
988 lockdep_assert_held(&i915->drm.struct_mutex);
989 assert_rpm_wakelock_held(i915);
990 GEM_BUG_ON(!test_bit(I915_RESET_BACKOFF, &error->flags));
991
992 if (!test_bit(I915_RESET_HANDOFF, &error->flags))
993 return;
994
995 /* Clear any previous failed attempts at recovery. Time to try again. */
996 if (!i915_gem_unset_wedged(i915))
997 goto wakeup;
998
999 if (reason)
1000 dev_notice(i915->drm.dev, "Resetting chip for %s\n", reason);
1001 error->reset_count++;
1002
1003 ret = reset_prepare(i915);
1004 if (ret) {
1005 dev_err(i915->drm.dev, "GPU recovery failed\n");
1006 goto taint;
1007 }
1008
1009 if (!intel_has_gpu_reset(i915)) {
1010 if (i915_modparams.reset)
1011 dev_err(i915->drm.dev, "GPU reset not supported\n");
1012 else
1013 DRM_DEBUG_DRIVER("GPU reset disabled\n");
1014 goto error;
1015 }
1016
1017 for (i = 0; i < 3; i++) {
1018 ret = intel_gpu_reset(i915, ALL_ENGINES);
1019 if (ret == 0)
1020 break;
1021
1022 msleep(100);
1023 }
1024 if (ret) {
1025 dev_err(i915->drm.dev, "Failed to reset chip\n");
1026 goto taint;
1027 }
1028
1029 /* Ok, now get things going again... */
1030
1031 /*
1032 * Everything depends on having the GTT running, so we need to start
1033 * there.
1034 */
1035 ret = i915_ggtt_enable_hw(i915);
1036 if (ret) {
1037 DRM_ERROR("Failed to re-enable GGTT following reset (%d)\n",
1038 ret);
1039 goto error;
1040 }
1041
1042 gt_reset(i915, stalled_mask);
1043 intel_overlay_reset(i915);
1044
1045 /*
1046 * Next we need to restore the context, but we don't use those
1047 * yet either...
1048 *
1049 * Ring buffer needs to be re-initialized in the KMS case, or if X
1050 * was running at the time of the reset (i.e. we weren't VT
1051 * switched away).
1052 */
1053 ret = i915_gem_init_hw(i915);
1054 if (ret) {
1055 DRM_ERROR("Failed to initialise HW following reset (%d)\n",
1056 ret);
1057 goto error;
1058 }
1059
1060 i915_queue_hangcheck(i915);
1061
1062finish:
1063 reset_finish(i915);
1064wakeup:
1065 clear_bit(I915_RESET_HANDOFF, &error->flags);
1066 wake_up_bit(&error->flags, I915_RESET_HANDOFF);
1067 return;
1068
1069taint:
1070 /*
1071 * History tells us that if we cannot reset the GPU now, we
1072 * never will. This then impacts everything that is run
1073 * subsequently. On failing the reset, we mark the driver
1074 * as wedged, preventing further execution on the GPU.
1075 * We also want to go one step further and add a taint to the
1076 * kernel so that any subsequent faults can be traced back to
1077 * this failure. This is important for CI, where if the
1078 * GPU/driver fails we would like to reboot and restart testing
1079 * rather than continue on into oblivion. For everyone else,
1080 * the system should still plod along, but they have been warned!
1081 */
1082 add_taint(TAINT_WARN, LOCKDEP_STILL_OK);
1083error:
1084 i915_gem_set_wedged(i915);
1085 i915_retire_requests(i915);
1086 goto finish;
1087}
1088
1089static inline int intel_gt_reset_engine(struct drm_i915_private *i915,
1090 struct intel_engine_cs *engine)
1091{
1092 return intel_gpu_reset(i915, intel_engine_flag(engine));
1093}
1094
1095/**
1096 * i915_reset_engine - reset GPU engine to recover from a hang
1097 * @engine: engine to reset
1098 * @msg: reason for GPU reset; or NULL for no dev_notice()
1099 *
1100 * Reset a specific GPU engine. Useful if a hang is detected.
1101 * Returns zero on successful reset or otherwise an error code.
1102 *
1103 * Procedure is:
1104 * - identify the request that caused the hang and drop it
1105 * - reset engine (which will force the engine to idle)
1106 * - re-init/configure engine
1107 */
1108int i915_reset_engine(struct intel_engine_cs *engine, const char *msg)
1109{
1110 struct i915_gpu_error *error = &engine->i915->gpu_error;
1111 struct i915_request *active_request;
1112 int ret;
1113
1114 GEM_TRACE("%s flags=%lx\n", engine->name, error->flags);
1115 GEM_BUG_ON(!test_bit(I915_RESET_ENGINE + engine->id, &error->flags));
1116
1117 active_request = reset_prepare_engine(engine);
1118 if (IS_ERR_OR_NULL(active_request)) {
1119 /* Either the previous reset failed, or we pardon the reset. */
1120 ret = PTR_ERR(active_request);
1121 goto out;
1122 }
1123
1124 if (msg)
1125 dev_notice(engine->i915->drm.dev,
1126 "Resetting %s for %s\n", engine->name, msg);
1127 error->reset_engine_count[engine->id]++;
1128
1129 if (!engine->i915->guc.execbuf_client)
1130 ret = intel_gt_reset_engine(engine->i915, engine);
1131 else
1132 ret = intel_guc_reset_engine(&engine->i915->guc, engine);
1133 if (ret) {
1134	/* If we fail here, we expect to fall back to a global reset */
1135 DRM_DEBUG_DRIVER("%sFailed to reset %s, ret=%d\n",
1136 engine->i915->guc.execbuf_client ? "GuC " : "",
1137 engine->name, ret);
1138 goto out;
1139 }
1140
1141 /*
1142 * The request that caused the hang is stuck on elsp, we know the
1143 * active request and can drop it, adjust head to skip the offending
1144 * request to resume executing remaining requests in the queue.
1145 */
1146 reset_engine(engine, active_request, true);
1147
1148 /*
1149 * The engine and its registers (and workarounds in case of render)
1150 * have been reset to their default values. Follow the init_ring
1151 * process to program RING_MODE, HWSP and re-enable submission.
1152 */
1153 ret = engine->init_hw(engine);
1154 if (ret)
1155 goto out;
1156
1157out:
1158 intel_engine_cancel_stop_cs(engine);
1159 reset_finish_engine(engine);
1160 return ret;
1161}
1162
1163static void i915_reset_device(struct drm_i915_private *i915,
1164 u32 engine_mask,
1165 const char *reason)
1166{
1167 struct i915_gpu_error *error = &i915->gpu_error;
1168 struct kobject *kobj = &i915->drm.primary->kdev->kobj;
1169 char *error_event[] = { I915_ERROR_UEVENT "=1", NULL };
1170 char *reset_event[] = { I915_RESET_UEVENT "=1", NULL };
1171 char *reset_done_event[] = { I915_ERROR_UEVENT "=0", NULL };
1172 struct i915_wedge_me w;
1173
1174 kobject_uevent_env(kobj, KOBJ_CHANGE, error_event);
1175
1176 DRM_DEBUG_DRIVER("resetting chip\n");
1177 kobject_uevent_env(kobj, KOBJ_CHANGE, reset_event);
1178
1179 /* Use a watchdog to ensure that our reset completes */
1180 i915_wedge_on_timeout(&w, i915, 5 * HZ) {
1181 intel_prepare_reset(i915);
1182
1183 error->reason = reason;
1184 error->stalled_mask = engine_mask;
1185
1186 /* Signal that locked waiters should reset the GPU */
1187 smp_mb__before_atomic();
1188 set_bit(I915_RESET_HANDOFF, &error->flags);
1189 wake_up_all(&error->wait_queue);
1190
1191 /*
1192 * Wait for anyone holding the lock to wakeup, without
1193 * blocking indefinitely on struct_mutex.
1194 */
1195 do {
1196 if (mutex_trylock(&i915->drm.struct_mutex)) {
1197 i915_reset(i915, engine_mask, reason);
1198 mutex_unlock(&i915->drm.struct_mutex);
1199 }
1200 } while (wait_on_bit_timeout(&error->flags,
1201 I915_RESET_HANDOFF,
1202 TASK_UNINTERRUPTIBLE,
1203 1));
1204
1205 error->stalled_mask = 0;
1206 error->reason = NULL;
1207
1208 intel_finish_reset(i915);
1209 }
1210
1211 if (!test_bit(I915_WEDGED, &error->flags))
1212 kobject_uevent_env(kobj, KOBJ_CHANGE, reset_done_event);
1213}
1214
1215void i915_clear_error_registers(struct drm_i915_private *dev_priv)
1216{
1217 u32 eir;
1218
1219 if (!IS_GEN(dev_priv, 2))
1220 I915_WRITE(PGTBL_ER, I915_READ(PGTBL_ER));
1221
1222 if (INTEL_GEN(dev_priv) < 4)
1223 I915_WRITE(IPEIR, I915_READ(IPEIR));
1224 else
1225 I915_WRITE(IPEIR_I965, I915_READ(IPEIR_I965));
1226
1227 I915_WRITE(EIR, I915_READ(EIR));
1228 eir = I915_READ(EIR);
1229 if (eir) {
1230 /*
1231 * some errors might have become stuck,
1232 * mask them.
1233 */
1234 DRM_DEBUG_DRIVER("EIR stuck: 0x%08x, masking\n", eir);
1235 I915_WRITE(EMR, I915_READ(EMR) | eir);
1236 I915_WRITE(IIR, I915_MASTER_ERROR_INTERRUPT);
1237 }
1238
1239 if (INTEL_GEN(dev_priv) >= 8) {
1240 I915_WRITE(GEN8_RING_FAULT_REG,
1241 I915_READ(GEN8_RING_FAULT_REG) & ~RING_FAULT_VALID);
1242 POSTING_READ(GEN8_RING_FAULT_REG);
1243 } else if (INTEL_GEN(dev_priv) >= 6) {
1244 struct intel_engine_cs *engine;
1245 enum intel_engine_id id;
1246
1247 for_each_engine(engine, dev_priv, id) {
1248 I915_WRITE(RING_FAULT_REG(engine),
1249 I915_READ(RING_FAULT_REG(engine)) &
1250 ~RING_FAULT_VALID);
1251 }
1252 POSTING_READ(RING_FAULT_REG(dev_priv->engine[RCS]));
1253 }
1254}
1255
1256/**
1257 * i915_handle_error - handle a gpu error
1258 * @i915: i915 device private
1259 * @engine_mask: mask representing engines that are hung
1260 * @flags: control flags
1261 * @fmt: Error message format string
1262 *
1263 * Do some basic checking of register state at error time and
1264 * dump it to the syslog. Also call i915_capture_error_state() to make
1265 * sure we get a record and make it available in debugfs. Fire a uevent
1266 * so userspace knows something bad happened (should trigger collection
1267 * of a ring dump etc.).
1268 */
1269void i915_handle_error(struct drm_i915_private *i915,
1270 u32 engine_mask,
1271 unsigned long flags,
1272 const char *fmt, ...)
1273{
1274 struct intel_engine_cs *engine;
1275 intel_wakeref_t wakeref;
1276 unsigned int tmp;
1277 char error_msg[80];
1278 char *msg = NULL;
1279
1280 if (fmt) {
1281 va_list args;
1282
1283 va_start(args, fmt);
1284 vscnprintf(error_msg, sizeof(error_msg), fmt, args);
1285 va_end(args);
1286
1287 msg = error_msg;
1288 }
1289
1290 /*
1291 * In most cases it's guaranteed that we get here with an RPM
1292 * reference held, for example because there is a pending GPU
1293 * request that won't finish until the reset is done. This
1294 * isn't the case at least when we get here by doing a
1295 * simulated reset via debugfs, so get an RPM reference.
1296 */
1297 wakeref = intel_runtime_pm_get(i915);
1298
1299 engine_mask &= INTEL_INFO(i915)->ring_mask;
1300
1301 if (flags & I915_ERROR_CAPTURE) {
1302 i915_capture_error_state(i915, engine_mask, msg);
1303 i915_clear_error_registers(i915);
1304 }
1305
1306 /*
1307 * Try engine reset when available. We fall back to full reset if
1308 * single reset fails.
1309 */
1310 if (intel_has_reset_engine(i915) &&
1311 !i915_terminally_wedged(&i915->gpu_error)) {
1312 for_each_engine_masked(engine, i915, engine_mask, tmp) {
1313 BUILD_BUG_ON(I915_RESET_MODESET >= I915_RESET_ENGINE);
1314 if (test_and_set_bit(I915_RESET_ENGINE + engine->id,
1315 &i915->gpu_error.flags))
1316 continue;
1317
1318 if (i915_reset_engine(engine, msg) == 0)
1319 engine_mask &= ~intel_engine_flag(engine);
1320
1321 clear_bit(I915_RESET_ENGINE + engine->id,
1322 &i915->gpu_error.flags);
1323 wake_up_bit(&i915->gpu_error.flags,
1324 I915_RESET_ENGINE + engine->id);
1325 }
1326 }
1327
1328 if (!engine_mask)
1329 goto out;
1330
1331 /* Full reset needs the mutex, stop any other user trying to do so. */
1332 if (test_and_set_bit(I915_RESET_BACKOFF, &i915->gpu_error.flags)) {
1333 wait_event(i915->gpu_error.reset_queue,
1334 !test_bit(I915_RESET_BACKOFF,
1335 &i915->gpu_error.flags));
1336 goto out;
1337 }
1338
1339 /* Prevent any other reset-engine attempt. */
1340 for_each_engine(engine, i915, tmp) {
1341 while (test_and_set_bit(I915_RESET_ENGINE + engine->id,
1342 &i915->gpu_error.flags))
1343 wait_on_bit(&i915->gpu_error.flags,
1344 I915_RESET_ENGINE + engine->id,
1345 TASK_UNINTERRUPTIBLE);
1346 }
1347
1348 i915_reset_device(i915, engine_mask, msg);
1349
1350 for_each_engine(engine, i915, tmp) {
1351 clear_bit(I915_RESET_ENGINE + engine->id,
1352 &i915->gpu_error.flags);
1353 }
1354
1355 clear_bit(I915_RESET_BACKOFF, &i915->gpu_error.flags);
1356 wake_up_all(&i915->gpu_error.reset_queue);
1357
1358out:
1359 intel_runtime_pm_put(i915, wakeref);
1360}
1361
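/*
 * Illustrative sketch only (not part of the original file): a
 * hangcheck-style caller is assumed to report a stuck engine roughly as
 * below, with capture enabled so an error state is recorded before any
 * reset is attempted. The helper name is hypothetical.
 */
#if 0
static void example_report_hung_engine(struct intel_engine_cs *engine)
{
	i915_handle_error(engine->i915, intel_engine_flag(engine),
			  I915_ERROR_CAPTURE,
			  "%s appears stuck", engine->name);
}
#endif
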
1362static void i915_wedge_me(struct work_struct *work)
1363{
1364 struct i915_wedge_me *w = container_of(work, typeof(*w), work.work);
1365
1366 dev_err(w->i915->drm.dev,
1367 "%s timed out, cancelling all in-flight rendering.\n",
1368 w->name);
1369 i915_gem_set_wedged(w->i915);
1370}
1371
1372void __i915_init_wedge(struct i915_wedge_me *w,
1373 struct drm_i915_private *i915,
1374 long timeout,
1375 const char *name)
1376{
1377 w->i915 = i915;
1378 w->name = name;
1379
1380 INIT_DELAYED_WORK_ONSTACK(&w->work, i915_wedge_me);
1381 schedule_delayed_work(&w->work, timeout);
1382}
1383
1384void __i915_fini_wedge(struct i915_wedge_me *w)
1385{
1386 cancel_delayed_work_sync(&w->work);
1387 destroy_delayed_work_on_stack(&w->work);
1388 w->i915 = NULL;
1389}
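
/*
 * Illustrative sketch only (not part of the original file): the
 * i915_wedge_on_timeout() macro used in i915_reset_device() is assumed to
 * tie the two helpers above together as a scoped for-loop guard, roughly
 * as below (the real definition lives in i915_reset.h):
 */
#if 0
#define i915_wedge_on_timeout(W, DEV, TIMEOUT)				\
	for (__i915_init_wedge((W), (DEV), (TIMEOUT), __func__);	\
	     (W)->i915;							\
	     __i915_fini_wedge((W)))
#endif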