Commit | Line | Data |
---|---|---|
9f58892e CW |
1 | /* |
2 | * SPDX-License-Identifier: MIT | |
3 | * | |
4 | * Copyright © 2008-2018 Intel Corporation | |
5 | */ | |
6 | ||
7 | #include <linux/sched/mm.h> | |
8 | ||
9 | #include "i915_drv.h" | |
10 | #include "i915_gpu_error.h" | |
11 | #include "i915_reset.h" | |
12 | ||
13 | #include "intel_guc.h" | |
14 | ||
/*
 * Cancel (-EIO) all requests belonging to the hung context.
 *
 * Walks two lists under both timeline locks:
 *  1. the remainder of the engine's timeline after @rq (requests already
 *     submitted to hw), skipping only those from the guilty context;
 *  2. the context's own timeline, where every queued request is skipped.
 */
static void engine_skip_context(struct i915_request *rq)
{
	struct intel_engine_cs *engine = rq->engine;
	struct i915_gem_context *hung_ctx = rq->gem_context;
	struct i915_timeline *timeline = rq->timeline;
	unsigned long flags;

	/* The context's timeline must be distinct from the engine's. */
	GEM_BUG_ON(timeline == &engine->timeline);

	/* Lock order: engine timeline (irqsave) first, then the context's. */
	spin_lock_irqsave(&engine->timeline.lock, flags);
	spin_lock(&timeline->lock);

	/* Skip already-submitted requests from the guilty context. */
	list_for_each_entry_continue(rq, &engine->timeline.requests, link)
		if (rq->gem_context == hung_ctx)
			i915_request_skip(rq, -EIO);

	/* Skip everything still queued on the context's own timeline. */
	list_for_each_entry(rq, &timeline->requests, link)
		i915_request_skip(rq, -EIO);

	spin_unlock(&timeline->lock);
	spin_unlock_irqrestore(&engine->timeline.lock, flags);
}
37 | ||
/*
 * Accumulate ban score against the client (file) owning a guilty context.
 * A banned context contributes a fixed score, and a hang arriving within
 * I915_CLIENT_FAST_HANG_JIFFIES of the previous one adds a further penalty.
 */
static void client_mark_guilty(struct drm_i915_file_private *file_priv,
			       const struct i915_gem_context *ctx)
{
	unsigned int score;
	unsigned long prev_hang;

	if (i915_gem_context_is_banned(ctx))
		score = I915_CLIENT_SCORE_CONTEXT_BAN;
	else
		score = 0;

	/* Record this hang's timestamp while fetching the previous one. */
	prev_hang = xchg(&file_priv->hang_timestamp, jiffies);
	if (time_before(jiffies, prev_hang + I915_CLIENT_FAST_HANG_JIFFIES))
		score += I915_CLIENT_SCORE_HANG_FAST;

	if (score) {
		atomic_add(score, &file_priv->ban_score);

		DRM_DEBUG_DRIVER("client %s: gained %u ban score, now %u\n",
				 ctx->name, score,
				 atomic_read(&file_priv->ban_score));
	}
}
61 | ||
/*
 * Blame a context for a hang: bump its guilty count and ban score, and
 * ban it once the score reaches CONTEXT_SCORE_BAN_THRESHOLD. Unbannable
 * contexts still record guilt but never propagate score to the client.
 */
static void context_mark_guilty(struct i915_gem_context *ctx)
{
	unsigned int score;
	bool banned, bannable;

	atomic_inc(&ctx->guilty_count);

	bannable = i915_gem_context_is_bannable(ctx);
	score = atomic_add_return(CONTEXT_SCORE_GUILTY, &ctx->ban_score);
	banned = score >= CONTEXT_SCORE_BAN_THRESHOLD;

	/* Cool contexts don't accumulate client ban score */
	if (!bannable)
		return;

	if (banned) {
		DRM_DEBUG_DRIVER("context %s: guilty %d, score %u, banned\n",
				 ctx->name, atomic_read(&ctx->guilty_count),
				 score);
		i915_gem_context_set_banned(ctx);
	}

	/* file_priv may be NULL or an error pointer (no owning client). */
	if (!IS_ERR_OR_NULL(ctx->file_priv))
		client_mark_guilty(ctx->file_priv, ctx);
}
87 | ||
/* Record that the context was active, but innocent, during a hang. */
static void context_mark_innocent(struct i915_gem_context *ctx)
{
	atomic_inc(&ctx->active_count);
}
92 | ||
/*
 * Park a single engine's ring before reset (gen3+): stop the command
 * streamer, fold TAIL into HEAD so the ring appears empty, then zero
 * HEAD/TAIL and disable the ring via RING_CTL. Register read-backs post
 * the writes and detect a ring that refuses to park.
 */
static void gen3_stop_engine(struct intel_engine_cs *engine)
{
	struct drm_i915_private *dev_priv = engine->i915;
	const u32 base = engine->mmio_base;

	if (intel_engine_stop_cs(engine))
		DRM_DEBUG_DRIVER("%s: timed out on STOP_RING\n", engine->name);

	/* HEAD = TAIL marks the ring as drained. */
	I915_WRITE_FW(RING_HEAD(base), I915_READ_FW(RING_TAIL(base)));
	POSTING_READ_FW(RING_HEAD(base)); /* paranoia */

	I915_WRITE_FW(RING_HEAD(base), 0);
	I915_WRITE_FW(RING_TAIL(base), 0);
	POSTING_READ_FW(RING_TAIL(base));

	/* The ring must be empty before it is disabled */
	I915_WRITE_FW(RING_CTL(base), 0);

	/* Check acts as a post */
	if (I915_READ_FW(RING_HEAD(base)) != 0)
		DRM_DEBUG_DRIVER("%s: ring head not parked\n",
				 engine->name);
}
116 | ||
117 | static void i915_stop_engines(struct drm_i915_private *i915, | |
118 | unsigned int engine_mask) | |
119 | { | |
120 | struct intel_engine_cs *engine; | |
121 | enum intel_engine_id id; | |
122 | ||
123 | if (INTEL_GEN(i915) < 3) | |
124 | return; | |
125 | ||
126 | for_each_engine_masked(engine, i915, engine_mask, id) | |
127 | gen3_stop_engine(engine); | |
128 | } | |
129 | ||
130 | static bool i915_in_reset(struct pci_dev *pdev) | |
131 | { | |
132 | u8 gdrst; | |
133 | ||
134 | pci_read_config_byte(pdev, I915_GDRST, &gdrst); | |
135 | return gdrst & GRDOM_RESET_STATUS; | |
136 | } | |
137 | ||
/*
 * gen3/4 device reset via the GDRST byte in PCI config space: assert the
 * reset request, wait for the status bit to appear, then deassert and
 * wait for the status bit to clear. Returns 0 or a wait_for() error.
 */
static int i915_do_reset(struct drm_i915_private *i915,
			 unsigned int engine_mask,
			 unsigned int retry)
{
	struct pci_dev *pdev = i915->drm.pdev;
	int err;

	/* Assert reset for at least 20 usec, and wait for acknowledgement. */
	pci_write_config_byte(pdev, I915_GDRST, GRDOM_RESET_ENABLE);
	usleep_range(50, 200);
	err = wait_for(i915_in_reset(pdev), 500);

	/* Clear the reset request. */
	pci_write_config_byte(pdev, I915_GDRST, 0);
	usleep_range(50, 200);
	/* Only wait for deassert if the assert did not already time out. */
	if (!err)
		err = wait_for(!i915_in_reset(pdev), 500);

	return err;
}
158 | ||
159 | static bool g4x_reset_complete(struct pci_dev *pdev) | |
160 | { | |
161 | u8 gdrst; | |
162 | ||
163 | pci_read_config_byte(pdev, I915_GDRST, &gdrst); | |
164 | return (gdrst & GRDOM_RESET_ENABLE) == 0; | |
165 | } | |
166 | ||
167 | static int g33_do_reset(struct drm_i915_private *i915, | |
168 | unsigned int engine_mask, | |
169 | unsigned int retry) | |
170 | { | |
171 | struct pci_dev *pdev = i915->drm.pdev; | |
172 | ||
173 | pci_write_config_byte(pdev, I915_GDRST, GRDOM_RESET_ENABLE); | |
174 | return wait_for(g4x_reset_complete(pdev), 500); | |
175 | } | |
176 | ||
/*
 * g4x (ctg/elk): reset the media then render domains via GDRST, with VCP
 * unit clock gating disabled for the duration (WaVcpClkGateDisableForMediaReset).
 * On any failure we fall through to the same cleanup: clear GDRST and
 * re-enable clock gating.
 */
static int g4x_do_reset(struct drm_i915_private *dev_priv,
			unsigned int engine_mask,
			unsigned int retry)
{
	struct pci_dev *pdev = dev_priv->drm.pdev;
	int ret;

	/* WaVcpClkGateDisableForMediaReset:ctg,elk */
	I915_WRITE(VDECCLK_GATE_D,
		   I915_READ(VDECCLK_GATE_D) | VCP_UNIT_CLOCK_GATE_DISABLE);
	POSTING_READ(VDECCLK_GATE_D);

	/* Media domain first... */
	pci_write_config_byte(pdev, I915_GDRST,
			      GRDOM_MEDIA | GRDOM_RESET_ENABLE);
	ret = wait_for(g4x_reset_complete(pdev), 500);
	if (ret) {
		DRM_DEBUG_DRIVER("Wait for media reset failed\n");
		goto out;
	}

	/* ...then render. */
	pci_write_config_byte(pdev, I915_GDRST,
			      GRDOM_RENDER | GRDOM_RESET_ENABLE);
	ret = wait_for(g4x_reset_complete(pdev), 500);
	if (ret) {
		DRM_DEBUG_DRIVER("Wait for render reset failed\n");
		goto out;
	}

out:
	pci_write_config_byte(pdev, I915_GDRST, 0);

	/* Restore VCP unit clock gating. */
	I915_WRITE(VDECCLK_GATE_D,
		   I915_READ(VDECCLK_GATE_D) & ~VCP_UNIT_CLOCK_GATE_DISABLE);
	POSTING_READ(VDECCLK_GATE_D);

	return ret;
}
214 | ||
/*
 * ilk: reset the render then media domains via ILK_GDSR, waiting for the
 * hardware to clear the reset-enable bit after each. The register is
 * always cleared (and posted) on the way out, success or failure.
 */
static int ironlake_do_reset(struct drm_i915_private *dev_priv,
			     unsigned int engine_mask,
			     unsigned int retry)
{
	int ret;

	I915_WRITE(ILK_GDSR, ILK_GRDOM_RENDER | ILK_GRDOM_RESET_ENABLE);
	ret = intel_wait_for_register(dev_priv,
				      ILK_GDSR, ILK_GRDOM_RESET_ENABLE, 0,
				      500);
	if (ret) {
		DRM_DEBUG_DRIVER("Wait for render reset failed\n");
		goto out;
	}

	I915_WRITE(ILK_GDSR, ILK_GRDOM_MEDIA | ILK_GRDOM_RESET_ENABLE);
	ret = intel_wait_for_register(dev_priv,
				      ILK_GDSR, ILK_GRDOM_RESET_ENABLE, 0,
				      500);
	if (ret) {
		DRM_DEBUG_DRIVER("Wait for media reset failed\n");
		goto out;
	}

out:
	I915_WRITE(ILK_GDSR, 0);
	POSTING_READ(ILK_GDSR);
	return ret;
}
244 | ||
/*
 * Reset the hardware domains (GENX_GRDOM_*) specified by mask.
 * Returns 0 on success, or the error from waiting for the hardware
 * to acknowledge (clear) the requested reset bits.
 */
static int gen6_hw_domain_reset(struct drm_i915_private *dev_priv,
				u32 hw_domain_mask)
{
	int err;

	/*
	 * GEN6_GDRST is not in the gt power well, no need to check
	 * for fifo space for the write or forcewake the chip for
	 * the read
	 */
	I915_WRITE_FW(GEN6_GDRST, hw_domain_mask);

	/* Wait for the device to ack the reset requests */
	err = __intel_wait_for_register_fw(dev_priv,
					   GEN6_GDRST, hw_domain_mask, 0,
					   500, 0,
					   NULL);
	if (err)
		DRM_DEBUG_DRIVER("Wait for 0x%08x engines reset failed\n",
				 hw_domain_mask);

	return err;
}
269 | ||
270 | static int gen6_reset_engines(struct drm_i915_private *i915, | |
271 | unsigned int engine_mask, | |
272 | unsigned int retry) | |
273 | { | |
274 | struct intel_engine_cs *engine; | |
275 | const u32 hw_engine_mask[I915_NUM_ENGINES] = { | |
276 | [RCS] = GEN6_GRDOM_RENDER, | |
277 | [BCS] = GEN6_GRDOM_BLT, | |
278 | [VCS] = GEN6_GRDOM_MEDIA, | |
279 | [VCS2] = GEN8_GRDOM_MEDIA2, | |
280 | [VECS] = GEN6_GRDOM_VECS, | |
281 | }; | |
282 | u32 hw_mask; | |
283 | ||
284 | if (engine_mask == ALL_ENGINES) { | |
285 | hw_mask = GEN6_GRDOM_FULL; | |
286 | } else { | |
287 | unsigned int tmp; | |
288 | ||
289 | hw_mask = 0; | |
290 | for_each_engine_masked(engine, i915, engine_mask, tmp) | |
291 | hw_mask |= hw_engine_mask[engine->id]; | |
292 | } | |
293 | ||
294 | return gen6_hw_domain_reset(i915, hw_mask); | |
295 | } | |
296 | ||
/*
 * Before resetting a VD/VE engine on gen11 we must force-lock its shared
 * SFC unit. Returns the extra GDRST reset bit to OR into the reset mask
 * when the SFC is in use by this engine (so it is reset too), or 0 if
 * there is no SFC to worry about (no access, wrong class, or lock ack
 * timed out).
 */
static u32 gen11_lock_sfc(struct drm_i915_private *dev_priv,
			  struct intel_engine_cs *engine)
{
	u8 vdbox_sfc_access = RUNTIME_INFO(dev_priv)->vdbox_sfc_access;
	i915_reg_t sfc_forced_lock, sfc_forced_lock_ack;
	u32 sfc_forced_lock_bit, sfc_forced_lock_ack_bit;
	i915_reg_t sfc_usage;
	u32 sfc_usage_bit;
	u32 sfc_reset_bit;

	/* Pick the per-class register set for this engine instance. */
	switch (engine->class) {
	case VIDEO_DECODE_CLASS:
		/* Not every vdbox has SFC access. */
		if ((BIT(engine->instance) & vdbox_sfc_access) == 0)
			return 0;

		sfc_forced_lock = GEN11_VCS_SFC_FORCED_LOCK(engine);
		sfc_forced_lock_bit = GEN11_VCS_SFC_FORCED_LOCK_BIT;

		sfc_forced_lock_ack = GEN11_VCS_SFC_LOCK_STATUS(engine);
		sfc_forced_lock_ack_bit  = GEN11_VCS_SFC_LOCK_ACK_BIT;

		sfc_usage = GEN11_VCS_SFC_LOCK_STATUS(engine);
		sfc_usage_bit = GEN11_VCS_SFC_USAGE_BIT;
		sfc_reset_bit = GEN11_VCS_SFC_RESET_BIT(engine->instance);
		break;

	case VIDEO_ENHANCEMENT_CLASS:
		sfc_forced_lock = GEN11_VECS_SFC_FORCED_LOCK(engine);
		sfc_forced_lock_bit = GEN11_VECS_SFC_FORCED_LOCK_BIT;

		sfc_forced_lock_ack = GEN11_VECS_SFC_LOCK_ACK(engine);
		sfc_forced_lock_ack_bit  = GEN11_VECS_SFC_LOCK_ACK_BIT;

		sfc_usage = GEN11_VECS_SFC_USAGE(engine);
		sfc_usage_bit = GEN11_VECS_SFC_USAGE_BIT;
		sfc_reset_bit = GEN11_VECS_SFC_RESET_BIT(engine->instance);
		break;

	default:
		return 0;
	}

	/*
	 * Tell the engine that a software reset is going to happen. The engine
	 * will then try to force lock the SFC (if currently locked, it will
	 * remain so until we tell the engine it is safe to unlock; if currently
	 * unlocked, it will ignore this and all new lock requests). If SFC
	 * ends up being locked to the engine we want to reset, we have to reset
	 * it as well (we will unlock it once the reset sequence is completed).
	 */
	I915_WRITE_FW(sfc_forced_lock,
		      I915_READ_FW(sfc_forced_lock) | sfc_forced_lock_bit);

	if (__intel_wait_for_register_fw(dev_priv,
					 sfc_forced_lock_ack,
					 sfc_forced_lock_ack_bit,
					 sfc_forced_lock_ack_bit,
					 1000, 0, NULL)) {
		DRM_DEBUG_DRIVER("Wait for SFC forced lock ack failed\n");
		return 0;
	}

	/* Only reset the SFC if this engine actually has it locked in use. */
	if (I915_READ_FW(sfc_usage) & sfc_usage_bit)
		return sfc_reset_bit;

	return 0;
}
364 | ||
/*
 * Release the SFC forced lock taken by gen11_lock_sfc(). Harmless to
 * call for an engine that never took the lock (the bit is just cleared).
 */
static void gen11_unlock_sfc(struct drm_i915_private *dev_priv,
			     struct intel_engine_cs *engine)
{
	u8 vdbox_sfc_access = RUNTIME_INFO(dev_priv)->vdbox_sfc_access;
	i915_reg_t sfc_forced_lock;
	u32 sfc_forced_lock_bit;

	switch (engine->class) {
	case VIDEO_DECODE_CLASS:
		/* Not every vdbox has SFC access. */
		if ((BIT(engine->instance) & vdbox_sfc_access) == 0)
			return;

		sfc_forced_lock = GEN11_VCS_SFC_FORCED_LOCK(engine);
		sfc_forced_lock_bit = GEN11_VCS_SFC_FORCED_LOCK_BIT;
		break;

	case VIDEO_ENHANCEMENT_CLASS:
		sfc_forced_lock = GEN11_VECS_SFC_FORCED_LOCK(engine);
		sfc_forced_lock_bit = GEN11_VECS_SFC_FORCED_LOCK_BIT;
		break;

	default:
		return;
	}

	I915_WRITE_FW(sfc_forced_lock,
		      I915_READ_FW(sfc_forced_lock) & ~sfc_forced_lock_bit);
}
393 | ||
394 | static int gen11_reset_engines(struct drm_i915_private *i915, | |
395 | unsigned int engine_mask, | |
396 | unsigned int retry) | |
397 | { | |
398 | const u32 hw_engine_mask[I915_NUM_ENGINES] = { | |
399 | [RCS] = GEN11_GRDOM_RENDER, | |
400 | [BCS] = GEN11_GRDOM_BLT, | |
401 | [VCS] = GEN11_GRDOM_MEDIA, | |
402 | [VCS2] = GEN11_GRDOM_MEDIA2, | |
403 | [VCS3] = GEN11_GRDOM_MEDIA3, | |
404 | [VCS4] = GEN11_GRDOM_MEDIA4, | |
405 | [VECS] = GEN11_GRDOM_VECS, | |
406 | [VECS2] = GEN11_GRDOM_VECS2, | |
407 | }; | |
408 | struct intel_engine_cs *engine; | |
409 | unsigned int tmp; | |
410 | u32 hw_mask; | |
411 | int ret; | |
412 | ||
413 | BUILD_BUG_ON(VECS2 + 1 != I915_NUM_ENGINES); | |
414 | ||
415 | if (engine_mask == ALL_ENGINES) { | |
416 | hw_mask = GEN11_GRDOM_FULL; | |
417 | } else { | |
418 | hw_mask = 0; | |
419 | for_each_engine_masked(engine, i915, engine_mask, tmp) { | |
420 | hw_mask |= hw_engine_mask[engine->id]; | |
421 | hw_mask |= gen11_lock_sfc(i915, engine); | |
422 | } | |
423 | } | |
424 | ||
425 | ret = gen6_hw_domain_reset(i915, hw_mask); | |
426 | ||
427 | if (engine_mask != ALL_ENGINES) | |
428 | for_each_engine_masked(engine, i915, engine_mask, tmp) | |
429 | gen11_unlock_sfc(i915, engine); | |
430 | ||
431 | return ret; | |
432 | } | |
433 | ||
/*
 * Ask the engine to quiesce for reset via RING_RESET_CTL and wait for it
 * to acknowledge READY_TO_RESET. Returns 0 once the engine is ready, or
 * the wait error (logged) on timeout.
 */
static int gen8_engine_reset_prepare(struct intel_engine_cs *engine)
{
	struct drm_i915_private *dev_priv = engine->i915;
	int ret;

	I915_WRITE_FW(RING_RESET_CTL(engine->mmio_base),
		      _MASKED_BIT_ENABLE(RESET_CTL_REQUEST_RESET));

	ret = __intel_wait_for_register_fw(dev_priv,
					   RING_RESET_CTL(engine->mmio_base),
					   RESET_CTL_READY_TO_RESET,
					   RESET_CTL_READY_TO_RESET,
					   700, 0,
					   NULL);
	if (ret)
		DRM_ERROR("%s: reset request timeout\n", engine->name);

	return ret;
}
453 | ||
/* Withdraw the reset request raised by gen8_engine_reset_prepare(). */
static void gen8_engine_reset_cancel(struct intel_engine_cs *engine)
{
	struct drm_i915_private *dev_priv = engine->i915;

	I915_WRITE_FW(RING_RESET_CTL(engine->mmio_base),
		      _MASKED_BIT_DISABLE(RESET_CTL_REQUEST_RESET));
}
461 | ||
/*
 * gen8+: request each engine to quiesce, then perform the domain reset
 * (gen11 or gen6 flavour). On the first attempt (retry == 0) an engine
 * that fails to become ready aborts the reset; on later retries we
 * proceed regardless (see comment below). The reset request is always
 * cancelled on every engine before returning.
 */
static int gen8_reset_engines(struct drm_i915_private *i915,
			      unsigned int engine_mask,
			      unsigned int retry)
{
	struct intel_engine_cs *engine;
	const bool reset_non_ready = retry >= 1;
	unsigned int tmp;
	int ret;

	for_each_engine_masked(engine, i915, engine_mask, tmp) {
		ret = gen8_engine_reset_prepare(engine);
		if (ret && !reset_non_ready)
			goto skip_reset;

		/*
		 * If this is not the first failed attempt to prepare,
		 * we decide to proceed anyway.
		 *
		 * By doing so we risk context corruption and with
		 * some gens (kbl), possible system hang if reset
		 * happens during active bb execution.
		 *
		 * We rather take context corruption instead of
		 * failed reset with a wedged driver/gpu. And
		 * active bb execution case should be covered by
		 * i915_stop_engines we have before the reset.
		 */
	}

	if (INTEL_GEN(i915) >= 11)
		ret = gen11_reset_engines(i915, engine_mask, retry);
	else
		ret = gen6_reset_engines(i915, engine_mask, retry);

skip_reset:
	/* Always cancel the reset request, even after a failed prepare. */
	for_each_engine_masked(engine, i915, engine_mask, tmp)
		gen8_engine_reset_cancel(engine);

	return ret;
}
502 | ||
503 | typedef int (*reset_func)(struct drm_i915_private *, | |
504 | unsigned int engine_mask, | |
505 | unsigned int retry); | |
506 | ||
/*
 * Select the platform-appropriate reset backend, or NULL if reset is
 * disabled via the i915.reset modparam or unsupported by the hardware.
 * Checked in order of decreasing gen so each platform gets the most
 * specific handler.
 */
static reset_func intel_get_gpu_reset(struct drm_i915_private *i915)
{
	if (!i915_modparams.reset)
		return NULL;

	if (INTEL_GEN(i915) >= 8)
		return gen8_reset_engines;
	else if (INTEL_GEN(i915) >= 6)
		return gen6_reset_engines;
	else if (INTEL_GEN(i915) >= 5)
		return ironlake_do_reset;
	else if (IS_G4X(i915))
		return g4x_do_reset;
	else if (IS_G33(i915) || IS_PINEVIEW(i915))
		return g33_do_reset;
	else if (INTEL_GEN(i915) >= 3)
		return i915_do_reset;
	else
		return NULL;
}
527 | ||
/*
 * intel_gpu_reset - reset the engines named in @engine_mask
 * @i915: device private
 * @engine_mask: engines to reset, or ALL_ENGINES for a full device reset
 *
 * Engines are stopped before each attempt; a full-device reset is retried
 * up to 3 times on -ETIMEDOUT (per-engine resets are not retried here, as
 * they must stay atomic-safe). Returns 0 on success, -ENODEV when no
 * reset backend is available, or the backend's error.
 */
int intel_gpu_reset(struct drm_i915_private *i915, unsigned int engine_mask)
{
	reset_func reset = intel_get_gpu_reset(i915);
	int retry;
	int ret;

	/*
	 * We want to perform per-engine reset from atomic context (e.g.
	 * softirq), which imposes the constraint that we cannot sleep.
	 * However, experience suggests that spending a bit of time waiting
	 * for a reset helps in various cases, so for a full-device reset
	 * we apply the opposite rule and wait if we want to. As we should
	 * always follow up a failed per-engine reset with a full device reset,
	 * being a little faster, stricter and more error prone for the
	 * atomic case seems an acceptable compromise.
	 *
	 * Unfortunately this leads to a bimodal routine, when the goal was
	 * to have a single reset function that worked for resetting any
	 * number of engines simultaneously.
	 */
	might_sleep_if(engine_mask == ALL_ENGINES);

	/*
	 * If the power well sleeps during the reset, the reset
	 * request may be dropped and never completes (causing -EIO).
	 */
	intel_uncore_forcewake_get(i915, FORCEWAKE_ALL);
	for (retry = 0; retry < 3; retry++) {
		/*
		 * We stop engines, otherwise we might get failed reset and a
		 * dead gpu (on elk). Also as modern gpu as kbl can suffer
		 * from system hang if batchbuffer is progressing when
		 * the reset is issued, regardless of READY_TO_RESET ack.
		 * Thus assume it is best to stop engines on all gens
		 * where we have a gpu reset.
		 *
		 * WaKBLVECSSemaphoreWaitPoll:kbl (on ALL_ENGINES)
		 *
		 * WaMediaResetMainRingCleanup:ctg,elk (presumably)
		 *
		 * FIXME: Wa for more modern gens needs to be validated
		 */
		i915_stop_engines(i915, engine_mask);

		ret = -ENODEV;
		if (reset) {
			GEM_TRACE("engine_mask=%x\n", engine_mask);
			ret = reset(i915, engine_mask, retry);
		}
		/* Only a timed-out full-device reset is worth retrying. */
		if (ret != -ETIMEDOUT || engine_mask != ALL_ENGINES)
			break;

		cond_resched();
	}
	intel_uncore_forcewake_put(i915, FORCEWAKE_ALL);

	return ret;
}
586 | ||
587 | bool intel_has_gpu_reset(struct drm_i915_private *i915) | |
588 | { | |
589 | return intel_get_gpu_reset(i915); | |
590 | } | |
591 | ||
592 | bool intel_has_reset_engine(struct drm_i915_private *i915) | |
593 | { | |
594 | return INTEL_INFO(i915)->has_reset_engine && i915_modparams.reset >= 2; | |
595 | } | |
596 | ||
/*
 * intel_reset_guc - reset just the GuC hardware domain
 * @i915: device private (must have GuC)
 *
 * Performs a GDRST domain reset of the GuC (gen11 or gen9 domain bit)
 * under full forcewake. Returns the gen6_hw_domain_reset() result.
 */
int intel_reset_guc(struct drm_i915_private *i915)
{
	u32 guc_domain =
		INTEL_GEN(i915) >= 11 ? GEN11_GRDOM_GUC : GEN9_GRDOM_GUC;
	int ret;

	GEM_BUG_ON(!HAS_GUC(i915));

	intel_uncore_forcewake_get(i915, FORCEWAKE_ALL);
	ret = gen6_hw_domain_reset(i915, guc_domain);
	intel_uncore_forcewake_put(i915, FORCEWAKE_ALL);

	return ret;
}
611 | ||
/*
 * Ensure irq handler finishes, and not run again.
 * Also return the active request so that we only search for it once.
 * Returns ERR_PTR(-EIO) if the active request already carries a fence
 * error from a previous failed reset.
 */
static struct i915_request *
reset_prepare_engine(struct intel_engine_cs *engine)
{
	struct i915_request *rq;

	/*
	 * During the reset sequence, we must prevent the engine from
	 * entering RC6. As the context state is undefined until we restart
	 * the engine, if it does enter RC6 during the reset, the state
	 * written to the powercontext is undefined and so we may lose
	 * GPU state upon resume, i.e. fail to restart after a reset.
	 */
	intel_uncore_forcewake_get(engine->i915, FORCEWAKE_ALL);

	rq = engine->reset.prepare(engine);
	if (rq && rq->fence.error == -EIO)
		rq = ERR_PTR(-EIO); /* Previous reset failed! */

	return rq;
}
636 | ||
/*
 * Prepare every engine for reset: stash its active request (adjudicated
 * later in gt_reset()), then revoke fences and sanitize the GuC.
 * Returns the last error reported by per-engine preparation (i.e. a
 * previous failed reset); other engines are still prepared.
 */
static int reset_prepare(struct drm_i915_private *i915)
{
	struct intel_engine_cs *engine;
	struct i915_request *rq;
	enum intel_engine_id id;
	int err = 0;

	for_each_engine(engine, i915, id) {
		rq = reset_prepare_engine(engine);
		if (IS_ERR(rq)) {
			/* Remember the failure but keep preparing. */
			err = PTR_ERR(rq);
			continue;
		}

		engine->hangcheck.active_request = rq;
	}

	i915_gem_revoke_fences(i915);
	intel_uc_sanitize(i915);

	return err;
}
659 | ||
/*
 * Returns the request if it was guilty of the hang, or NULL/a rewound
 * request for the CS to replay from (see below).
 */
static struct i915_request *
reset_request(struct intel_engine_cs *engine,
	      struct i915_request *rq,
	      bool stalled)
{
	/*
	 * The guilty request will get skipped on a hung engine.
	 *
	 * Users of client default contexts do not rely on logical
	 * state preserved between batches so it is safe to execute
	 * queued requests following the hang. Non default contexts
	 * rely on preserved state, so skipping a batch loses the
	 * evolution of the state and it needs to be considered corrupted.
	 * Executing more queued batches on top of corrupted state is
	 * risky. But we take the risk by trying to advance through
	 * the queued requests in order to make the client behaviour
	 * more predictable around resets, by not throwing away random
	 * amount of batches it has prepared for execution. Sophisticated
	 * clients can use gem_reset_stats_ioctl and dma fence status
	 * (exported via sync_file info ioctl on explicit fences) to observe
	 * when it loses the context state and should rebuild accordingly.
	 *
	 * The context ban, and ultimately the client ban, mechanism are safety
	 * valves if client submission ends up resulting in nothing more than
	 * subsequent hangs.
	 */

	/* A request that completed by itself cannot have stalled the hw. */
	if (i915_request_completed(rq)) {
		GEM_TRACE("%s pardoned global=%d (fence %llx:%lld), current %d\n",
			  engine->name, rq->global_seqno,
			  rq->fence.context, rq->fence.seqno,
			  intel_engine_get_seqno(engine));
		stalled = false;
	}

	if (stalled) {
		/* Guilty: blame the context and cancel the request. */
		context_mark_guilty(rq->gem_context);
		i915_request_skip(rq, -EIO);

		/* If this context is now banned, skip all pending requests. */
		if (i915_gem_context_is_banned(rq->gem_context))
			engine_skip_context(rq);
	} else {
		/*
		 * Since this is not the hung engine, it may have advanced
		 * since the hang declaration. Double check by refinding
		 * the active request at the time of the reset.
		 */
		rq = i915_gem_find_active_request(engine);
		if (rq) {
			context_mark_innocent(rq->gem_context);
			dma_fence_set_error(&rq->fence, -EAGAIN);

			/* Rewind the engine to replay the incomplete rq */
			spin_lock_irq(&engine->timeline.lock);
			rq = list_prev_entry(rq, link);
			if (&rq->link == &engine->timeline.requests)
				rq = NULL;
			spin_unlock_irq(&engine->timeline.lock);
		}
	}

	return rq;
}
727 | ||
728 | static void reset_engine(struct intel_engine_cs *engine, | |
729 | struct i915_request *rq, | |
730 | bool stalled) | |
731 | { | |
732 | if (rq) | |
733 | rq = reset_request(engine, rq, stalled); | |
734 | ||
735 | /* Setup the CS to resume from the breadcrumb of the hung request */ | |
736 | engine->reset.reset(engine, rq); | |
737 | } | |
738 | ||
/*
 * After the hardware reset, adjudicate and replay each engine's active
 * request (stalled engines per @stalled_mask blame their request), drop
 * each engine's last retired context, keep idle engines loaded with the
 * kernel context, and restore fence registers.
 */
static void gt_reset(struct drm_i915_private *i915, unsigned int stalled_mask)
{
	struct intel_engine_cs *engine;
	enum intel_engine_id id;

	lockdep_assert_held(&i915->drm.struct_mutex);

	i915_retire_requests(i915);

	for_each_engine(engine, i915, id) {
		struct intel_context *ce;

		reset_engine(engine,
			     engine->hangcheck.active_request,
			     stalled_mask & ENGINE_MASK(id));
		/* Release the context pinned by the last retirement. */
		ce = fetch_and_zero(&engine->last_retired_context);
		if (ce)
			intel_context_unpin(ce);

		/*
		 * Ostensibily, we always want a context loaded for powersaving,
		 * so if the engine is idle after the reset, send a request
		 * to load our scratch kernel_context.
		 *
		 * More mysteriously, if we leave the engine idle after a reset,
		 * the next userspace batch may hang, with what appears to be
		 * an incoherent read by the CS (presumably stale TLB). An
		 * empty request appears sufficient to paper over the glitch.
		 */
		if (intel_engine_is_idle(engine)) {
			struct i915_request *rq;

			rq = i915_request_alloc(engine, i915->kernel_context);
			if (!IS_ERR(rq))
				i915_request_add(rq);
		}
	}

	i915_gem_restore_fences(i915);
}
779 | ||
780 | static void reset_finish_engine(struct intel_engine_cs *engine) | |
781 | { | |
782 | engine->reset.finish(engine); | |
783 | ||
784 | intel_uncore_forcewake_put(engine->i915, FORCEWAKE_ALL); | |
785 | } | |
786 | ||
/* Per-engine cleanup after a device reset; pairs with reset_prepare(). */
static void reset_finish(struct drm_i915_private *i915)
{
	struct intel_engine_cs *engine;
	enum intel_engine_id id;

	lockdep_assert_held(&i915->drm.struct_mutex);

	for_each_engine(engine, i915, id) {
		/* Drop the request stashed by reset_prepare(). */
		engine->hangcheck.active_request = NULL;
		reset_finish_engine(engine);
	}
}
799 | ||
/*
 * Fake submission installed while wedged: fail the fence with -EIO,
 * submit the request and advance the hardware seqno so existing waiters
 * complete instead of hanging.
 */
static void nop_submit_request(struct i915_request *request)
{
	unsigned long flags;

	GEM_TRACE("%s fence %llx:%lld -> -EIO\n",
		  request->engine->name,
		  request->fence.context, request->fence.seqno);
	dma_fence_set_error(&request->fence, -EIO);

	spin_lock_irqsave(&request->engine->timeline.lock, flags);
	__i915_request_submit(request);
	intel_engine_write_global_seqno(request->engine, request->global_seqno);
	spin_unlock_irqrestore(&request->engine->timeline.lock, flags);
}
814 | ||
/*
 * i915_gem_set_wedged - declare the GPU terminally wedged
 * @i915: device private
 *
 * Stop all engines (resetting the hardware where possible), replace
 * submission with nop_submit_request() so every request fails with -EIO,
 * cancel outstanding requests and set I915_WEDGED. Serialised against
 * concurrent wedge/unwedge by error->wedge_mutex; a no-op if already
 * wedged.
 */
void i915_gem_set_wedged(struct drm_i915_private *i915)
{
	struct i915_gpu_error *error = &i915->gpu_error;
	struct intel_engine_cs *engine;
	enum intel_engine_id id;

	mutex_lock(&error->wedge_mutex);
	if (test_bit(I915_WEDGED, &error->flags)) {
		mutex_unlock(&error->wedge_mutex);
		return;
	}

	/* Dump engine state to aid debugging if anything is still busy. */
	if (GEM_SHOW_DEBUG() && !intel_engines_are_idle(i915)) {
		struct drm_printer p = drm_debug_printer(__func__);

		for_each_engine(engine, i915, id)
			intel_engine_dump(engine, &p, "%s\n", engine->name);
	}

	GEM_TRACE("start\n");

	/*
	 * First, stop submission to hw, but do not yet complete requests by
	 * rolling the global seqno forward (since this would complete requests
	 * for which we haven't set the fence error to EIO yet).
	 */
	for_each_engine(engine, i915, id)
		reset_prepare_engine(engine);

	/* Even if the GPU reset fails, it should still stop the engines */
	if (INTEL_GEN(i915) >= 5)
		intel_gpu_reset(i915, ALL_ENGINES);

	/* Redirect all future submission into the -EIO nop handler. */
	for_each_engine(engine, i915, id) {
		engine->submit_request = nop_submit_request;
		engine->schedule = NULL;
	}
	i915->caps.scheduler = 0;

	/*
	 * Make sure no request can slip through without getting completed by
	 * either this call here to intel_engine_write_global_seqno, or the one
	 * in nop_submit_request.
	 */
	synchronize_rcu();

	/* Mark all executing requests as skipped */
	for_each_engine(engine, i915, id)
		engine->cancel_requests(engine);

	for_each_engine(engine, i915, id) {
		reset_finish_engine(engine);
		intel_engine_wakeup(engine);
	}

	smp_mb__before_atomic();
	set_bit(I915_WEDGED, &error->flags);

	GEM_TRACE("end\n");
	mutex_unlock(&error->wedge_mutex);

	wake_up_all(&error->reset_queue);
}
878 | ||
/*
 * i915_gem_unset_wedged - try to recover a previously wedged device
 * @i915: device private
 *
 * Wait for all outstanding requests (which were completed with -EIO by
 * the nop submission backend while wedged) to be flushed, then restore
 * the normal submission backends so new requests may execute again.
 *
 * Caller must hold struct_mutex.
 *
 * Returns: true if the wedged status was cleared (or was never set);
 * false if recovery is impossible, e.g. the device never finished
 * initialisation or a wait on a pending request was interrupted.
 */
bool i915_gem_unset_wedged(struct drm_i915_private *i915)
{
	struct i915_gpu_error *error = &i915->gpu_error;
	struct i915_timeline *tl;
	bool ret = false;

	lockdep_assert_held(&i915->drm.struct_mutex);

	/* Nothing to do if we are not wedged. */
	if (!test_bit(I915_WEDGED, &error->flags))
		return true;

	if (!i915->gt.scratch) /* Never fully initialised, recovery impossible */
		return false;

	/* Serialise against concurrent wedge/unwedge attempts. */
	mutex_lock(&error->wedge_mutex);

	GEM_TRACE("start\n");

	/*
	 * Before unwedging, make sure that all pending operations
	 * are flushed and errored out - we may have requests waiting upon
	 * third party fences. We marked all inflight requests as EIO, and
	 * every execbuf since returned EIO, for consistency we want all
	 * the currently pending requests to also be marked as EIO, which
	 * is done inside our nop_submit_request - and so we must wait.
	 *
	 * No more can be submitted until we reset the wedged bit.
	 */
	list_for_each_entry(tl, &i915->gt.timelines, link) {
		struct i915_request *rq;

		/* Only the last request per timeline needs to be waited on. */
		rq = i915_gem_active_peek(&tl->last_request,
					  &i915->drm.struct_mutex);
		if (!rq)
			continue;

		/*
		 * We can't use our normal waiter as we want to
		 * avoid recursively trying to handle the current
		 * reset. The basic dma_fence_default_wait() installs
		 * a callback for dma_fence_signal(), which is
		 * triggered by our nop handler (indirectly, the
		 * callback enables the signaler thread which is
		 * woken by the nop_submit_request() advancing the seqno
		 * and when the seqno passes the fence, the signaler
		 * then signals the fence waking us up).
		 */
		if (dma_fence_default_wait(&rq->fence, true,
					   MAX_SCHEDULE_TIMEOUT) < 0)
			goto unlock; /* interrupted; report failure (ret == false) */
	}
	i915_retire_requests(i915);
	GEM_BUG_ON(i915->gt.active_requests);

	/* Scrub engine state before handing the GPU back to userspace. */
	intel_engines_sanitize(i915, false);

	/*
	 * Undo nop_submit_request. We prevent all new i915 requests from
	 * being queued (by disallowing execbuf whilst wedged) so having
	 * waited for all active requests above, we know the system is idle
	 * and do not have to worry about a thread being inside
	 * engine->submit_request() as we swap over. So unlike installing
	 * the nop_submit_request on reset, we can do this from normal
	 * context and do not require stop_machine().
	 */
	intel_engines_reset_default_submission(i915);
	i915_gem_contexts_lost(i915);

	GEM_TRACE("end\n");

	smp_mb__before_atomic(); /* complete takeover before enabling execbuf */
	clear_bit(I915_WEDGED, &i915->gpu_error.flags);
	ret = true;
unlock:
	mutex_unlock(&i915->gpu_error.wedge_mutex);

	return ret;
}
957 | ||
/**
 * i915_reset - reset chip after a hang
 * @i915: #drm_i915_private to reset
 * @stalled_mask: mask of the stalled engines with the guilty requests
 * @reason: user error message for why we are resetting
 *
 * Reset the chip. Useful if a hang is detected. Marks the device as wedged
 * on failure.
 *
 * Caller must hold the struct_mutex.
 *
 * Procedure is fairly simple:
 *   - reset the chip using the reset reg
 *   - re-init context state
 *   - re-init hardware status page
 *   - re-init ring buffer
 *   - re-init interrupt state
 *   - re-init display
 */
void i915_reset(struct drm_i915_private *i915,
		unsigned int stalled_mask,
		const char *reason)
{
	struct i915_gpu_error *error = &i915->gpu_error;
	int ret;
	int i;

	GEM_TRACE("flags=%lx\n", error->flags);

	might_sleep();
	lockdep_assert_held(&i915->drm.struct_mutex);
	assert_rpm_wakelock_held(i915);
	GEM_BUG_ON(!test_bit(I915_RESET_BACKOFF, &error->flags));

	/* Only the thread that won the handoff performs the reset. */
	if (!test_bit(I915_RESET_HANDOFF, &error->flags))
		return;

	/* Clear any previous failed attempts at recovery. Time to try again. */
	if (!i915_gem_unset_wedged(i915))
		goto wakeup; /* terminally wedged; skip straight to the handoff release */

	if (reason)
		dev_notice(i915->drm.dev, "Resetting chip for %s\n", reason);
	error->reset_count++;

	ret = reset_prepare(i915);
	if (ret) {
		dev_err(i915->drm.dev, "GPU recovery failed\n");
		goto taint;
	}

	if (!intel_has_gpu_reset(i915)) {
		if (i915_modparams.reset)
			dev_err(i915->drm.dev, "GPU reset not supported\n");
		else
			DRM_DEBUG_DRIVER("GPU reset disabled\n");
		goto error;
	}

	/* Try the reset up to three times, pausing 100ms between attempts. */
	for (i = 0; i < 3; i++) {
		ret = intel_gpu_reset(i915, ALL_ENGINES);
		if (ret == 0)
			break;

		msleep(100);
	}
	if (ret) {
		dev_err(i915->drm.dev, "Failed to reset chip\n");
		goto taint;
	}

	/* Ok, now get things going again... */

	/*
	 * Everything depends on having the GTT running, so we need to start
	 * there.
	 */
	ret = i915_ggtt_enable_hw(i915);
	if (ret) {
		DRM_ERROR("Failed to re-enable GGTT following reset (%d)\n",
			  ret);
		goto error;
	}

	gt_reset(i915, stalled_mask);
	intel_overlay_reset(i915);

	/*
	 * Next we need to restore the context, but we don't use those
	 * yet either...
	 *
	 * Ring buffer needs to be re-initialized in the KMS case, or if X
	 * was running at the time of the reset (i.e. we weren't VT
	 * switched away).
	 */
	ret = i915_gem_init_hw(i915);
	if (ret) {
		DRM_ERROR("Failed to initialise HW following reset (%d)\n",
			  ret);
		goto error;
	}

	i915_queue_hangcheck(i915);

finish:
	reset_finish(i915);
wakeup:
	/* Release the handoff and wake anyone waiting for the reset. */
	clear_bit(I915_RESET_HANDOFF, &error->flags);
	wake_up_bit(&error->flags, I915_RESET_HANDOFF);
	return;

taint:
	/*
	 * History tells us that if we cannot reset the GPU now, we
	 * never will. This then impacts everything that is run
	 * subsequently. On failing the reset, we mark the driver
	 * as wedged, preventing further execution on the GPU.
	 * We also want to go one step further and add a taint to the
	 * kernel so that any subsequent faults can be traced back to
	 * this failure. This is important for CI, where if the
	 * GPU/driver fails we would like to reboot and restart testing
	 * rather than continue on into oblivion. For everyone else,
	 * the system should still plod along, but they have been warned!
	 */
	add_taint(TAINT_WARN, LOCKDEP_STILL_OK);
error:
	/* Declare the device wedged; all further execution returns -EIO. */
	i915_gem_set_wedged(i915);
	i915_retire_requests(i915);
	goto finish;
}
1088 | ||
/* Restrict the hardware reset to a single engine's reset domain. */
static inline int intel_gt_reset_engine(struct drm_i915_private *i915,
					struct intel_engine_cs *engine)
{
	unsigned int mask = intel_engine_flag(engine);

	return intel_gpu_reset(i915, mask);
}
1094 | ||
1095 | /** | |
1096 | * i915_reset_engine - reset GPU engine to recover from a hang | |
1097 | * @engine: engine to reset | |
1098 | * @msg: reason for GPU reset; or NULL for no dev_notice() | |
1099 | * | |
1100 | * Reset a specific GPU engine. Useful if a hang is detected. | |
1101 | * Returns zero on successful reset or otherwise an error code. | |
1102 | * | |
1103 | * Procedure is: | |
1104 | * - identifies the request that caused the hang and it is dropped | |
1105 | * - reset engine (which will force the engine to idle) | |
1106 | * - re-init/configure engine | |
1107 | */ | |
1108 | int i915_reset_engine(struct intel_engine_cs *engine, const char *msg) | |
1109 | { | |
1110 | struct i915_gpu_error *error = &engine->i915->gpu_error; | |
1111 | struct i915_request *active_request; | |
1112 | int ret; | |
1113 | ||
1114 | GEM_TRACE("%s flags=%lx\n", engine->name, error->flags); | |
1115 | GEM_BUG_ON(!test_bit(I915_RESET_ENGINE + engine->id, &error->flags)); | |
1116 | ||
1117 | active_request = reset_prepare_engine(engine); | |
1118 | if (IS_ERR_OR_NULL(active_request)) { | |
1119 | /* Either the previous reset failed, or we pardon the reset. */ | |
1120 | ret = PTR_ERR(active_request); | |
1121 | goto out; | |
1122 | } | |
1123 | ||
1124 | if (msg) | |
1125 | dev_notice(engine->i915->drm.dev, | |
1126 | "Resetting %s for %s\n", engine->name, msg); | |
1127 | error->reset_engine_count[engine->id]++; | |
1128 | ||
1129 | if (!engine->i915->guc.execbuf_client) | |
1130 | ret = intel_gt_reset_engine(engine->i915, engine); | |
1131 | else | |
1132 | ret = intel_guc_reset_engine(&engine->i915->guc, engine); | |
1133 | if (ret) { | |
1134 | /* If we fail here, we expect to fallback to a global reset */ | |
1135 | DRM_DEBUG_DRIVER("%sFailed to reset %s, ret=%d\n", | |
1136 | engine->i915->guc.execbuf_client ? "GuC " : "", | |
1137 | engine->name, ret); | |
1138 | goto out; | |
1139 | } | |
1140 | ||
1141 | /* | |
1142 | * The request that caused the hang is stuck on elsp, we know the | |
1143 | * active request and can drop it, adjust head to skip the offending | |
1144 | * request to resume executing remaining requests in the queue. | |
1145 | */ | |
1146 | reset_engine(engine, active_request, true); | |
1147 | ||
1148 | /* | |
1149 | * The engine and its registers (and workarounds in case of render) | |
1150 | * have been reset to their default values. Follow the init_ring | |
1151 | * process to program RING_MODE, HWSP and re-enable submission. | |
1152 | */ | |
1153 | ret = engine->init_hw(engine); | |
1154 | if (ret) | |
1155 | goto out; | |
1156 | ||
1157 | out: | |
1158 | intel_engine_cancel_stop_cs(engine); | |
1159 | reset_finish_engine(engine); | |
1160 | return ret; | |
1161 | } | |
1162 | ||
/*
 * i915_reset_device - perform a full device reset, notifying userspace
 * @i915: device private
 * @engine_mask: mask of the stalled engines with the guilty requests
 * @reason: human readable cause of the reset, may be NULL
 *
 * Emits uevents around the reset, hands the reset off to whichever thread
 * can take struct_mutex (via the I915_RESET_HANDOFF bit), and guards the
 * whole sequence with a watchdog that wedges the GPU if the reset does not
 * complete within 5 seconds.
 */
static void i915_reset_device(struct drm_i915_private *i915,
			      u32 engine_mask,
			      const char *reason)
{
	struct i915_gpu_error *error = &i915->gpu_error;
	struct kobject *kobj = &i915->drm.primary->kdev->kobj;
	char *error_event[] = { I915_ERROR_UEVENT "=1", NULL };
	char *reset_event[] = { I915_RESET_UEVENT "=1", NULL };
	char *reset_done_event[] = { I915_ERROR_UEVENT "=0", NULL };
	struct i915_wedge_me w;

	/* Tell userspace an error occurred before we start resetting. */
	kobject_uevent_env(kobj, KOBJ_CHANGE, error_event);

	DRM_DEBUG_DRIVER("resetting chip\n");
	kobject_uevent_env(kobj, KOBJ_CHANGE, reset_event);

	/* Use a watchdog to ensure that our reset completes */
	i915_wedge_on_timeout(&w, i915, 5 * HZ) {
		intel_prepare_reset(i915);

		/* Published for i915_reset() to consume under the handoff. */
		error->reason = reason;
		error->stalled_mask = engine_mask;

		/* Signal that locked waiters should reset the GPU */
		smp_mb__before_atomic();
		set_bit(I915_RESET_HANDOFF, &error->flags);
		wake_up_all(&error->wait_queue);

		/*
		 * Wait for anyone holding the lock to wakeup, without
		 * blocking indefinitely on struct_mutex.
		 */
		do {
			if (mutex_trylock(&i915->drm.struct_mutex)) {
				i915_reset(i915, engine_mask, reason);
				mutex_unlock(&i915->drm.struct_mutex);
			}
		} while (wait_on_bit_timeout(&error->flags,
					     I915_RESET_HANDOFF,
					     TASK_UNINTERRUPTIBLE,
					     1));

		/* Reset handed off and performed; scrub the shared state. */
		error->stalled_mask = 0;
		error->reason = NULL;

		intel_finish_reset(i915);
	}

	/* Only report completion if the reset did not leave us wedged. */
	if (!test_bit(I915_WEDGED, &error->flags))
		kobject_uevent_env(kobj, KOBJ_CHANGE, reset_done_event);
}
1214 | ||
/*
 * i915_clear_error_registers - scrub latched GPU error state
 * @dev_priv: device private
 *
 * Clear the error state recorded by the hardware (PGTBL_ER, IPEIR, EIR,
 * ring fault registers) so that stale errors are not reported after a
 * reset. Register selection depends on hardware generation.
 */
void i915_clear_error_registers(struct drm_i915_private *dev_priv)
{
	u32 eir;

	if (!IS_GEN(dev_priv, 2))
		I915_WRITE(PGTBL_ER, I915_READ(PGTBL_ER));

	if (INTEL_GEN(dev_priv) < 4)
		I915_WRITE(IPEIR, I915_READ(IPEIR));
	else
		I915_WRITE(IPEIR_I965, I915_READ(IPEIR_I965));

	/* Write back the current value, then re-read to see what remains. */
	I915_WRITE(EIR, I915_READ(EIR));
	eir = I915_READ(EIR);
	if (eir) {
		/*
		 * some errors might have become stuck,
		 * mask them.
		 */
		DRM_DEBUG_DRIVER("EIR stuck: 0x%08x, masking\n", eir);
		I915_WRITE(EMR, I915_READ(EMR) | eir);
		I915_WRITE(IIR, I915_MASTER_ERROR_INTERRUPT);
	}

	/* Clear any valid page-fault records left in the ring fault regs. */
	if (INTEL_GEN(dev_priv) >= 8) {
		I915_WRITE(GEN8_RING_FAULT_REG,
			   I915_READ(GEN8_RING_FAULT_REG) & ~RING_FAULT_VALID);
		POSTING_READ(GEN8_RING_FAULT_REG);
	} else if (INTEL_GEN(dev_priv) >= 6) {
		struct intel_engine_cs *engine;
		enum intel_engine_id id;

		for_each_engine(engine, dev_priv, id) {
			I915_WRITE(RING_FAULT_REG(engine),
				   I915_READ(RING_FAULT_REG(engine)) &
				   ~RING_FAULT_VALID);
		}
		POSTING_READ(RING_FAULT_REG(dev_priv->engine[RCS]));
	}
}
1255 | ||
/**
 * i915_handle_error - handle a gpu error
 * @i915: i915 device private
 * @engine_mask: mask representing engines that are hung
 * @flags: control flags
 * @fmt: Error message format string
 *
 * Do some basic checking of register state at error time and
 * dump it to the syslog. Also call i915_capture_error_state() to make
 * sure we get a record and make it available in debugfs. Fire a uevent
 * so userspace knows something bad happened (should trigger collection
 * of a ring dump etc.).
 */
void i915_handle_error(struct drm_i915_private *i915,
		       u32 engine_mask,
		       unsigned long flags,
		       const char *fmt, ...)
{
	struct intel_engine_cs *engine;
	intel_wakeref_t wakeref;
	unsigned int tmp;
	char error_msg[80];
	char *msg = NULL;

	/* Format the optional user message once, reuse it below. */
	if (fmt) {
		va_list args;

		va_start(args, fmt);
		vscnprintf(error_msg, sizeof(error_msg), fmt, args);
		va_end(args);

		msg = error_msg;
	}

	/*
	 * In most cases it's guaranteed that we get here with an RPM
	 * reference held, for example because there is a pending GPU
	 * request that won't finish until the reset is done. This
	 * isn't the case at least when we get here by doing a
	 * simulated reset via debugfs, so get an RPM reference.
	 */
	wakeref = intel_runtime_pm_get(i915);

	/* Ignore requests for engines this device does not have. */
	engine_mask &= INTEL_INFO(i915)->ring_mask;

	if (flags & I915_ERROR_CAPTURE) {
		i915_capture_error_state(i915, engine_mask, msg);
		i915_clear_error_registers(i915);
	}

	/*
	 * Try engine reset when available. We fall back to full reset if
	 * single reset fails.
	 */
	if (intel_has_reset_engine(i915) &&
	    !i915_terminally_wedged(&i915->gpu_error)) {
		for_each_engine_masked(engine, i915, engine_mask, tmp) {
			BUILD_BUG_ON(I915_RESET_MODESET >= I915_RESET_ENGINE);
			/* Skip engines already being reset by someone else. */
			if (test_and_set_bit(I915_RESET_ENGINE + engine->id,
					     &i915->gpu_error.flags))
				continue;

			/* Engines reset successfully drop out of the mask. */
			if (i915_reset_engine(engine, msg) == 0)
				engine_mask &= ~intel_engine_flag(engine);

			clear_bit(I915_RESET_ENGINE + engine->id,
				  &i915->gpu_error.flags);
			wake_up_bit(&i915->gpu_error.flags,
				    I915_RESET_ENGINE + engine->id);
		}
	}

	/* All hung engines recovered individually; no full reset needed. */
	if (!engine_mask)
		goto out;

	/* Full reset needs the mutex, stop any other user trying to do so. */
	if (test_and_set_bit(I915_RESET_BACKOFF, &i915->gpu_error.flags)) {
		/* Another full reset is in flight; wait for it instead. */
		wait_event(i915->gpu_error.reset_queue,
			   !test_bit(I915_RESET_BACKOFF,
				     &i915->gpu_error.flags));
		goto out;
	}

	/* Prevent any other reset-engine attempt. */
	for_each_engine(engine, i915, tmp) {
		while (test_and_set_bit(I915_RESET_ENGINE + engine->id,
					&i915->gpu_error.flags))
			wait_on_bit(&i915->gpu_error.flags,
				    I915_RESET_ENGINE + engine->id,
				    TASK_UNINTERRUPTIBLE);
	}

	i915_reset_device(i915, engine_mask, msg);

	/* Release the per-engine and device-level reset locks. */
	for_each_engine(engine, i915, tmp) {
		clear_bit(I915_RESET_ENGINE + engine->id,
			  &i915->gpu_error.flags);
	}

	clear_bit(I915_RESET_BACKOFF, &i915->gpu_error.flags);
	wake_up_all(&i915->gpu_error.reset_queue);

out:
	intel_runtime_pm_put(i915, wakeref);
}
1361 | ||
1362 | static void i915_wedge_me(struct work_struct *work) | |
1363 | { | |
1364 | struct i915_wedge_me *w = container_of(work, typeof(*w), work.work); | |
1365 | ||
1366 | dev_err(w->i915->drm.dev, | |
1367 | "%s timed out, cancelling all in-flight rendering.\n", | |
1368 | w->name); | |
1369 | i915_gem_set_wedged(w->i915); | |
1370 | } | |
1371 | ||
1372 | void __i915_init_wedge(struct i915_wedge_me *w, | |
1373 | struct drm_i915_private *i915, | |
1374 | long timeout, | |
1375 | const char *name) | |
1376 | { | |
1377 | w->i915 = i915; | |
1378 | w->name = name; | |
1379 | ||
1380 | INIT_DELAYED_WORK_ONSTACK(&w->work, i915_wedge_me); | |
1381 | schedule_delayed_work(&w->work, timeout); | |
1382 | } | |
1383 | ||
1384 | void __i915_fini_wedge(struct i915_wedge_me *w) | |
1385 | { | |
1386 | cancel_delayed_work_sync(&w->work); | |
1387 | destroy_delayed_work_on_stack(&w->work); | |
1388 | w->i915 = NULL; | |
1389 | } |