/*
 * SPDX-License-Identifier: MIT
 *
 * Copyright © 2008-2018 Intel Corporation
 */

#include <linux/sched/mm.h>

#include "i915_drv.h"
#include "i915_gpu_error.h"
#include "i915_reset.h"

#include "intel_guc.h"

#define RESET_MAX_RETRIES 3

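/*
 * Cancel further execution of a hung context: skip (complete with -EIO)
 * every later request on the engine timeline belonging to the hung
 * context, and everything still queued on the context's own timeline.
 */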
static void engine_skip_context(struct i915_request *rq)
{
	struct intel_engine_cs *engine = rq->engine;
	struct i915_gem_context *hung_ctx = rq->gem_context;
	struct i915_timeline *timeline = rq->timeline;
	unsigned long flags;

	GEM_BUG_ON(timeline == &engine->timeline);

	spin_lock_irqsave(&engine->timeline.lock, flags);
	spin_lock(&timeline->lock);

	list_for_each_entry_continue(rq, &engine->timeline.requests, link)
		if (rq->gem_context == hung_ctx)
			i915_request_skip(rq, -EIO);

	list_for_each_entry(rq, &timeline->requests, link)
		i915_request_skip(rq, -EIO);

	spin_unlock(&timeline->lock);
	spin_unlock_irqrestore(&engine->timeline.lock, flags);
}

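/*
 * Charge the hang to the owning client: a banned context, or hangs in
 * rapid succession from the same file, raise the client's ban score.
 */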
static void client_mark_guilty(struct drm_i915_file_private *file_priv,
			       const struct i915_gem_context *ctx)
{
	unsigned int score;
	unsigned long prev_hang;

	if (i915_gem_context_is_banned(ctx))
		score = I915_CLIENT_SCORE_CONTEXT_BAN;
	else
		score = 0;

	prev_hang = xchg(&file_priv->hang_timestamp, jiffies);
	if (time_before(jiffies, prev_hang + I915_CLIENT_FAST_HANG_JIFFIES))
		score += I915_CLIENT_SCORE_HANG_FAST;

	if (score) {
		atomic_add(score, &file_priv->ban_score);

		DRM_DEBUG_DRIVER("client %s: gained %u ban score, now %u\n",
				 ctx->name, score,
				 atomic_read(&file_priv->ban_score));
	}
}

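/*
 * Blame the context for the hang: bump its guilty count and ban score,
 * banning the context once the score reaches CONTEXT_SCORE_BAN_THRESHOLD
 * (if it is bannable), then pass the blame on to the owning client.
 */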
static void context_mark_guilty(struct i915_gem_context *ctx)
{
	unsigned int score;
	bool banned, bannable;

	atomic_inc(&ctx->guilty_count);

	bannable = i915_gem_context_is_bannable(ctx);
	score = atomic_add_return(CONTEXT_SCORE_GUILTY, &ctx->ban_score);
	banned = score >= CONTEXT_SCORE_BAN_THRESHOLD;

	/* Cool contexts don't accumulate client ban score */
	if (!bannable)
		return;

	if (banned) {
		DRM_DEBUG_DRIVER("context %s: guilty %d, score %u, banned\n",
				 ctx->name, atomic_read(&ctx->guilty_count),
				 score);
		i915_gem_context_set_banned(ctx);
	}

	if (!IS_ERR_OR_NULL(ctx->file_priv))
		client_mark_guilty(ctx->file_priv, ctx);
}

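/* An innocent context was caught up in the reset; note that it survived. */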
static void context_mark_innocent(struct i915_gem_context *ctx)
{
	atomic_inc(&ctx->active_count);
}

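/*
 * Force the engine into a quiescent state before reset: stop the command
 * streamer, then make the ring empty (HEAD == TAIL) and disable it, as
 * the ring must be idle before RING_CTL is cleared.
 */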
static void gen3_stop_engine(struct intel_engine_cs *engine)
{
	struct drm_i915_private *dev_priv = engine->i915;
	const u32 base = engine->mmio_base;

	if (intel_engine_stop_cs(engine))
		DRM_DEBUG_DRIVER("%s: timed out on STOP_RING\n", engine->name);

	I915_WRITE_FW(RING_HEAD(base), I915_READ_FW(RING_TAIL(base)));
	POSTING_READ_FW(RING_HEAD(base)); /* paranoia */

	I915_WRITE_FW(RING_HEAD(base), 0);
	I915_WRITE_FW(RING_TAIL(base), 0);
	POSTING_READ_FW(RING_TAIL(base));

	/* The ring must be empty before it is disabled */
	I915_WRITE_FW(RING_CTL(base), 0);

	/* Check acts as a post */
	if (I915_READ_FW(RING_HEAD(base)) != 0)
		DRM_DEBUG_DRIVER("%s: ring head not parked\n",
				 engine->name);
}

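/* Park every engine in @engine_mask before reset; nothing to do on gen2. */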
static void i915_stop_engines(struct drm_i915_private *i915,
			      unsigned int engine_mask)
{
	struct intel_engine_cs *engine;
	enum intel_engine_id id;

	if (INTEL_GEN(i915) < 3)
		return;

	for_each_engine_masked(engine, i915, engine_mask, id)
		gen3_stop_engine(engine);
}

static bool i915_in_reset(struct pci_dev *pdev)
{
	u8 gdrst;

	pci_read_config_byte(pdev, I915_GDRST, &gdrst);
	return gdrst & GRDOM_RESET_STATUS;
}

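/* Original full-chip reset, driven through the GDRST byte in PCI config space. */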
static int i915_do_reset(struct drm_i915_private *i915,
			 unsigned int engine_mask,
			 unsigned int retry)
{
	struct pci_dev *pdev = i915->drm.pdev;
	int err;

	/* Assert reset for at least 20 usec, and wait for acknowledgement. */
	pci_write_config_byte(pdev, I915_GDRST, GRDOM_RESET_ENABLE);
	udelay(50);
	err = wait_for_atomic(i915_in_reset(pdev), 50);

	/* Clear the reset request. */
	pci_write_config_byte(pdev, I915_GDRST, 0);
	udelay(50);
	if (!err)
		err = wait_for_atomic(!i915_in_reset(pdev), 50);

	return err;
}

static bool g4x_reset_complete(struct pci_dev *pdev)
{
	u8 gdrst;

	pci_read_config_byte(pdev, I915_GDRST, &gdrst);
	return (gdrst & GRDOM_RESET_ENABLE) == 0;
}

static int g33_do_reset(struct drm_i915_private *i915,
			unsigned int engine_mask,
			unsigned int retry)
{
	struct pci_dev *pdev = i915->drm.pdev;

	pci_write_config_byte(pdev, I915_GDRST, GRDOM_RESET_ENABLE);
	return wait_for_atomic(g4x_reset_complete(pdev), 50);
}

static int g4x_do_reset(struct drm_i915_private *dev_priv,
			unsigned int engine_mask,
			unsigned int retry)
{
	struct pci_dev *pdev = dev_priv->drm.pdev;
	int ret;

	/* WaVcpClkGateDisableForMediaReset:ctg,elk */
	I915_WRITE_FW(VDECCLK_GATE_D,
		      I915_READ(VDECCLK_GATE_D) | VCP_UNIT_CLOCK_GATE_DISABLE);
	POSTING_READ_FW(VDECCLK_GATE_D);

	pci_write_config_byte(pdev, I915_GDRST,
			      GRDOM_MEDIA | GRDOM_RESET_ENABLE);
	ret = wait_for_atomic(g4x_reset_complete(pdev), 50);
	if (ret) {
		DRM_DEBUG_DRIVER("Wait for media reset failed\n");
		goto out;
	}

	pci_write_config_byte(pdev, I915_GDRST,
			      GRDOM_RENDER | GRDOM_RESET_ENABLE);
	ret = wait_for_atomic(g4x_reset_complete(pdev), 50);
	if (ret) {
		DRM_DEBUG_DRIVER("Wait for render reset failed\n");
		goto out;
	}

out:
	pci_write_config_byte(pdev, I915_GDRST, 0);

	I915_WRITE_FW(VDECCLK_GATE_D,
		      I915_READ(VDECCLK_GATE_D) & ~VCP_UNIT_CLOCK_GATE_DISABLE);
	POSTING_READ_FW(VDECCLK_GATE_D);

	return ret;
}

static int ironlake_do_reset(struct drm_i915_private *dev_priv,
			     unsigned int engine_mask,
			     unsigned int retry)
{
	int ret;

	I915_WRITE_FW(ILK_GDSR, ILK_GRDOM_RENDER | ILK_GRDOM_RESET_ENABLE);
	ret = __intel_wait_for_register_fw(dev_priv, ILK_GDSR,
					   ILK_GRDOM_RESET_ENABLE, 0,
					   5000, 0,
					   NULL);
	if (ret) {
		DRM_DEBUG_DRIVER("Wait for render reset failed\n");
		goto out;
	}

	I915_WRITE_FW(ILK_GDSR, ILK_GRDOM_MEDIA | ILK_GRDOM_RESET_ENABLE);
	ret = __intel_wait_for_register_fw(dev_priv, ILK_GDSR,
					   ILK_GRDOM_RESET_ENABLE, 0,
					   5000, 0,
					   NULL);
	if (ret) {
		DRM_DEBUG_DRIVER("Wait for media reset failed\n");
		goto out;
	}

out:
	I915_WRITE_FW(ILK_GDSR, 0);
	POSTING_READ_FW(ILK_GDSR);
	return ret;
}

/* Reset the hardware domains (GENX_GRDOM_*) specified by mask */
static int gen6_hw_domain_reset(struct drm_i915_private *dev_priv,
				u32 hw_domain_mask)
{
	int err;

	/*
	 * GEN6_GDRST is not in the gt power well, no need to check
	 * for fifo space for the write or forcewake the chip for
	 * the read
	 */
	I915_WRITE_FW(GEN6_GDRST, hw_domain_mask);

	/* Wait for the device to ack the reset requests */
	err = __intel_wait_for_register_fw(dev_priv,
					   GEN6_GDRST, hw_domain_mask, 0,
					   500, 0,
					   NULL);
	if (err)
		DRM_DEBUG_DRIVER("Wait for 0x%08x engines reset failed\n",
				 hw_domain_mask);

	return err;
}

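/*
 * gen6+: convert the engine mask into the matching per-engine GDRST
 * domain bits, or use GEN6_GRDOM_FULL when all engines are selected.
 */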
static int gen6_reset_engines(struct drm_i915_private *i915,
			      unsigned int engine_mask,
			      unsigned int retry)
{
	struct intel_engine_cs *engine;
	const u32 hw_engine_mask[I915_NUM_ENGINES] = {
		[RCS] = GEN6_GRDOM_RENDER,
		[BCS] = GEN6_GRDOM_BLT,
		[VCS] = GEN6_GRDOM_MEDIA,
		[VCS2] = GEN8_GRDOM_MEDIA2,
		[VECS] = GEN6_GRDOM_VECS,
	};
	u32 hw_mask;

	if (engine_mask == ALL_ENGINES) {
		hw_mask = GEN6_GRDOM_FULL;
	} else {
		unsigned int tmp;

		hw_mask = 0;
		for_each_engine_masked(engine, i915, engine_mask, tmp)
			hw_mask |= hw_engine_mask[engine->id];
	}

	return gen6_hw_domain_reset(i915, hw_mask);
}

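/*
 * On gen11 the SFC unit is shared between media engines. Before resetting
 * an engine, force-lock its SFC and, if the SFC was in use by that engine,
 * return the extra GDRST bit so the SFC is reset along with it.
 */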
static u32 gen11_lock_sfc(struct drm_i915_private *dev_priv,
			  struct intel_engine_cs *engine)
{
	u8 vdbox_sfc_access = RUNTIME_INFO(dev_priv)->vdbox_sfc_access;
	i915_reg_t sfc_forced_lock, sfc_forced_lock_ack;
	u32 sfc_forced_lock_bit, sfc_forced_lock_ack_bit;
	i915_reg_t sfc_usage;
	u32 sfc_usage_bit;
	u32 sfc_reset_bit;

	switch (engine->class) {
	case VIDEO_DECODE_CLASS:
		if ((BIT(engine->instance) & vdbox_sfc_access) == 0)
			return 0;

		sfc_forced_lock = GEN11_VCS_SFC_FORCED_LOCK(engine);
		sfc_forced_lock_bit = GEN11_VCS_SFC_FORCED_LOCK_BIT;

		sfc_forced_lock_ack = GEN11_VCS_SFC_LOCK_STATUS(engine);
		sfc_forced_lock_ack_bit = GEN11_VCS_SFC_LOCK_ACK_BIT;

		sfc_usage = GEN11_VCS_SFC_LOCK_STATUS(engine);
		sfc_usage_bit = GEN11_VCS_SFC_USAGE_BIT;
		sfc_reset_bit = GEN11_VCS_SFC_RESET_BIT(engine->instance);
		break;

	case VIDEO_ENHANCEMENT_CLASS:
		sfc_forced_lock = GEN11_VECS_SFC_FORCED_LOCK(engine);
		sfc_forced_lock_bit = GEN11_VECS_SFC_FORCED_LOCK_BIT;

		sfc_forced_lock_ack = GEN11_VECS_SFC_LOCK_ACK(engine);
		sfc_forced_lock_ack_bit = GEN11_VECS_SFC_LOCK_ACK_BIT;

		sfc_usage = GEN11_VECS_SFC_USAGE(engine);
		sfc_usage_bit = GEN11_VECS_SFC_USAGE_BIT;
		sfc_reset_bit = GEN11_VECS_SFC_RESET_BIT(engine->instance);
		break;

	default:
		return 0;
	}

	/*
	 * Tell the engine that a software reset is going to happen. The engine
	 * will then try to force lock the SFC (if currently locked, it will
	 * remain so until we tell the engine it is safe to unlock; if currently
	 * unlocked, it will ignore this and all new lock requests). If SFC
	 * ends up being locked to the engine we want to reset, we have to reset
	 * it as well (we will unlock it once the reset sequence is completed).
	 */
	I915_WRITE_FW(sfc_forced_lock,
		      I915_READ_FW(sfc_forced_lock) | sfc_forced_lock_bit);

	if (__intel_wait_for_register_fw(dev_priv,
					 sfc_forced_lock_ack,
					 sfc_forced_lock_ack_bit,
					 sfc_forced_lock_ack_bit,
					 1000, 0, NULL)) {
		DRM_DEBUG_DRIVER("Wait for SFC forced lock ack failed\n");
		return 0;
	}

	if (I915_READ_FW(sfc_usage) & sfc_usage_bit)
		return sfc_reset_bit;

	return 0;
}

static void gen11_unlock_sfc(struct drm_i915_private *dev_priv,
			     struct intel_engine_cs *engine)
{
	u8 vdbox_sfc_access = RUNTIME_INFO(dev_priv)->vdbox_sfc_access;
	i915_reg_t sfc_forced_lock;
	u32 sfc_forced_lock_bit;

	switch (engine->class) {
	case VIDEO_DECODE_CLASS:
		if ((BIT(engine->instance) & vdbox_sfc_access) == 0)
			return;

		sfc_forced_lock = GEN11_VCS_SFC_FORCED_LOCK(engine);
		sfc_forced_lock_bit = GEN11_VCS_SFC_FORCED_LOCK_BIT;
		break;

	case VIDEO_ENHANCEMENT_CLASS:
		sfc_forced_lock = GEN11_VECS_SFC_FORCED_LOCK(engine);
		sfc_forced_lock_bit = GEN11_VECS_SFC_FORCED_LOCK_BIT;
		break;

	default:
		return;
	}

	I915_WRITE_FW(sfc_forced_lock,
		      I915_READ_FW(sfc_forced_lock) & ~sfc_forced_lock_bit);
}

static int gen11_reset_engines(struct drm_i915_private *i915,
			       unsigned int engine_mask,
			       unsigned int retry)
{
	const u32 hw_engine_mask[I915_NUM_ENGINES] = {
		[RCS] = GEN11_GRDOM_RENDER,
		[BCS] = GEN11_GRDOM_BLT,
		[VCS] = GEN11_GRDOM_MEDIA,
		[VCS2] = GEN11_GRDOM_MEDIA2,
		[VCS3] = GEN11_GRDOM_MEDIA3,
		[VCS4] = GEN11_GRDOM_MEDIA4,
		[VECS] = GEN11_GRDOM_VECS,
		[VECS2] = GEN11_GRDOM_VECS2,
	};
	struct intel_engine_cs *engine;
	unsigned int tmp;
	u32 hw_mask;
	int ret;

	BUILD_BUG_ON(VECS2 + 1 != I915_NUM_ENGINES);

	if (engine_mask == ALL_ENGINES) {
		hw_mask = GEN11_GRDOM_FULL;
	} else {
		hw_mask = 0;
		for_each_engine_masked(engine, i915, engine_mask, tmp) {
			hw_mask |= hw_engine_mask[engine->id];
			hw_mask |= gen11_lock_sfc(i915, engine);
		}
	}

	ret = gen6_hw_domain_reset(i915, hw_mask);

	if (engine_mask != ALL_ENGINES)
		for_each_engine_masked(engine, i915, engine_mask, tmp)
			gen11_unlock_sfc(i915, engine);

	return ret;
}

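/*
 * gen8+: request the engine to quiesce and wait for it to report
 * READY_TO_RESET before the reset is actually asserted.
 */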
static int gen8_engine_reset_prepare(struct intel_engine_cs *engine)
{
	struct drm_i915_private *dev_priv = engine->i915;
	int ret;

	I915_WRITE_FW(RING_RESET_CTL(engine->mmio_base),
		      _MASKED_BIT_ENABLE(RESET_CTL_REQUEST_RESET));

	ret = __intel_wait_for_register_fw(dev_priv,
					   RING_RESET_CTL(engine->mmio_base),
					   RESET_CTL_READY_TO_RESET,
					   RESET_CTL_READY_TO_RESET,
					   700, 0,
					   NULL);
	if (ret)
		DRM_ERROR("%s: reset request timeout\n", engine->name);

	return ret;
}

static void gen8_engine_reset_cancel(struct intel_engine_cs *engine)
{
	struct drm_i915_private *dev_priv = engine->i915;

	I915_WRITE_FW(RING_RESET_CTL(engine->mmio_base),
		      _MASKED_BIT_DISABLE(RESET_CTL_REQUEST_RESET));
}

static int gen8_reset_engines(struct drm_i915_private *i915,
			      unsigned int engine_mask,
			      unsigned int retry)
{
	struct intel_engine_cs *engine;
	const bool reset_non_ready = retry >= 1;
	unsigned int tmp;
	int ret;

	for_each_engine_masked(engine, i915, engine_mask, tmp) {
		ret = gen8_engine_reset_prepare(engine);
		if (ret && !reset_non_ready)
			goto skip_reset;

		/*
		 * If this is not the first failed attempt to prepare,
		 * we decide to proceed anyway.
		 *
		 * By doing so we risk context corruption and with
		 * some gens (kbl), possible system hang if reset
		 * happens during active bb execution.
		 *
		 * We prefer to risk context corruption over a failed
		 * reset with a wedged driver/gpu. The active bb
		 * execution case should be covered by the call to
		 * i915_stop_engines we make before the reset.
		 */
	}

	if (INTEL_GEN(i915) >= 11)
		ret = gen11_reset_engines(i915, engine_mask, retry);
	else
		ret = gen6_reset_engines(i915, engine_mask, retry);

skip_reset:
	for_each_engine_masked(engine, i915, engine_mask, tmp)
		gen8_engine_reset_cancel(engine);

	return ret;
}

typedef int (*reset_func)(struct drm_i915_private *,
			  unsigned int engine_mask,
			  unsigned int retry);

static reset_func intel_get_gpu_reset(struct drm_i915_private *i915)
{
	if (!i915_modparams.reset)
		return NULL;

	if (INTEL_GEN(i915) >= 8)
		return gen8_reset_engines;
	else if (INTEL_GEN(i915) >= 6)
		return gen6_reset_engines;
	else if (INTEL_GEN(i915) >= 5)
		return ironlake_do_reset;
	else if (IS_G4X(i915))
		return g4x_do_reset;
	else if (IS_G33(i915) || IS_PINEVIEW(i915))
		return g33_do_reset;
	else if (INTEL_GEN(i915) >= 3)
		return i915_do_reset;
	else
		return NULL;
}

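/*
 * Perform the platform-specific reset of the engines in @engine_mask,
 * holding forcewake throughout; a full-device reset is retried up to
 * RESET_MAX_RETRIES times if it keeps timing out.
 */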
int intel_gpu_reset(struct drm_i915_private *i915, unsigned int engine_mask)
{
	const int retries = engine_mask == ALL_ENGINES ? RESET_MAX_RETRIES : 1;
	reset_func reset;
	int ret = -ETIMEDOUT;
	int retry;

	reset = intel_get_gpu_reset(i915);
	if (!reset)
		return -ENODEV;

	/*
	 * If the power well sleeps during the reset, the reset
	 * request may be dropped and never completes (causing -EIO).
	 */
	intel_uncore_forcewake_get(i915, FORCEWAKE_ALL);
	for (retry = 0; ret == -ETIMEDOUT && retry < retries; retry++) {
		/*
		 * We stop engines, otherwise we might get failed reset and a
		 * dead gpu (on elk). Also a GPU as modern as kbl can suffer
		 * from a system hang if a batchbuffer is progressing when
		 * the reset is issued, regardless of the READY_TO_RESET ack.
		 * Thus we assume it is best to stop the engines on all gens
		 * where we have a gpu reset.
		 *
		 * WaKBLVECSSemaphoreWaitPoll:kbl (on ALL_ENGINES)
		 *
		 * WaMediaResetMainRingCleanup:ctg,elk (presumably)
		 *
		 * FIXME: Wa for more modern gens needs to be validated
		 */
		i915_stop_engines(i915, engine_mask);

		GEM_TRACE("engine_mask=%x\n", engine_mask);
		preempt_disable();
		ret = reset(i915, engine_mask, retry);
		preempt_enable();
	}
	intel_uncore_forcewake_put(i915, FORCEWAKE_ALL);

	return ret;
}

bool intel_has_gpu_reset(struct drm_i915_private *i915)
{
	if (USES_GUC(i915))
		return false;

	return intel_get_gpu_reset(i915);
}

bool intel_has_reset_engine(struct drm_i915_private *i915)
{
	return INTEL_INFO(i915)->has_reset_engine && i915_modparams.reset >= 2;
}

int intel_reset_guc(struct drm_i915_private *i915)
{
	u32 guc_domain =
		INTEL_GEN(i915) >= 11 ? GEN11_GRDOM_GUC : GEN9_GRDOM_GUC;
	int ret;

	GEM_BUG_ON(!HAS_GUC(i915));

	intel_uncore_forcewake_get(i915, FORCEWAKE_ALL);
	ret = gen6_hw_domain_reset(i915, guc_domain);
	intel_uncore_forcewake_put(i915, FORCEWAKE_ALL);

	return ret;
}

/*
 * Ensure the irq handler finishes, and is not run again.
 * Also return the active request so that we only search for it once.
 */
static struct i915_request *
reset_prepare_engine(struct intel_engine_cs *engine)
{
	struct i915_request *rq;

	/*
	 * During the reset sequence, we must prevent the engine from
	 * entering RC6. As the context state is undefined until we restart
	 * the engine, if it does enter RC6 during the reset, the state
	 * written to the powercontext is undefined and so we may lose
	 * GPU state upon resume, i.e. fail to restart after a reset.
	 */
	intel_uncore_forcewake_get(engine->i915, FORCEWAKE_ALL);

	rq = engine->reset.prepare(engine);
	if (rq && rq->fence.error == -EIO)
		rq = ERR_PTR(-EIO); /* Previous reset failed! */

	return rq;
}

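/*
 * Quiesce all engines and record the active request on each, then
 * revoke the GGTT fences and sanitize the microcontrollers before reset.
 */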
static int reset_prepare(struct drm_i915_private *i915)
{
	struct intel_engine_cs *engine;
	struct i915_request *rq;
	enum intel_engine_id id;
	int err = 0;

	for_each_engine(engine, i915, id) {
		rq = reset_prepare_engine(engine);
		if (IS_ERR(rq)) {
			err = PTR_ERR(rq);
			continue;
		}

		engine->hangcheck.active_request = rq;
	}

	i915_gem_revoke_fences(i915);
	intel_uc_sanitize(i915);

	return err;
}

/* Returns the request if it was guilty of the hang */
static struct i915_request *
reset_request(struct intel_engine_cs *engine,
	      struct i915_request *rq,
	      bool stalled)
{
	/*
	 * The guilty request will get skipped on a hung engine.
	 *
	 * Users of client default contexts do not rely on logical
	 * state preserved between batches so it is safe to execute
	 * queued requests following the hang. Non default contexts
	 * rely on preserved state, so skipping a batch loses the
	 * evolution of the state and it needs to be considered corrupted.
	 * Executing more queued batches on top of corrupted state is
	 * risky. But we take the risk by trying to advance through
	 * the queued requests in order to make the client behaviour
	 * more predictable around resets, by not throwing away a random
	 * amount of batches it has prepared for execution. Sophisticated
	 * clients can use gem_reset_stats_ioctl and dma fence status
	 * (exported via sync_file info ioctl on explicit fences) to observe
	 * when they lose the context state and should rebuild accordingly.
	 *
	 * The context ban, and ultimately the client ban, mechanisms are
	 * safety valves if client submission ends up resulting in nothing
	 * more than subsequent hangs.
	 */

	if (i915_request_completed(rq)) {
		GEM_TRACE("%s pardoned global=%d (fence %llx:%lld), current %d\n",
			  engine->name, rq->global_seqno,
			  rq->fence.context, rq->fence.seqno,
			  intel_engine_get_seqno(engine));
		stalled = false;
	}

	if (stalled) {
		context_mark_guilty(rq->gem_context);
		i915_request_skip(rq, -EIO);

		/* If this context is now banned, skip all pending requests. */
		if (i915_gem_context_is_banned(rq->gem_context))
			engine_skip_context(rq);
	} else {
		/*
		 * Since this is not the hung engine, it may have advanced
		 * since the hang declaration. Double check by refinding
		 * the active request at the time of the reset.
		 */
		rq = i915_gem_find_active_request(engine);
		if (rq) {
			unsigned long flags;

			context_mark_innocent(rq->gem_context);
			dma_fence_set_error(&rq->fence, -EAGAIN);

			/* Rewind the engine to replay the incomplete rq */
			spin_lock_irqsave(&engine->timeline.lock, flags);
			rq = list_prev_entry(rq, link);
			if (&rq->link == &engine->timeline.requests)
				rq = NULL;
			spin_unlock_irqrestore(&engine->timeline.lock, flags);
		}
	}

	return rq;
}

static void reset_engine(struct intel_engine_cs *engine,
			 struct i915_request *rq,
			 bool stalled)
{
	if (rq)
		rq = reset_request(engine, rq, stalled);

	/* Setup the CS to resume from the breadcrumb of the hung request */
	engine->reset.reset(engine, rq);
}

static void gt_reset(struct drm_i915_private *i915, unsigned int stalled_mask)
{
	struct intel_engine_cs *engine;
	enum intel_engine_id id;

	lockdep_assert_held(&i915->drm.struct_mutex);

	i915_retire_requests(i915);

	for_each_engine(engine, i915, id) {
		struct intel_context *ce;

		reset_engine(engine,
			     engine->hangcheck.active_request,
			     stalled_mask & ENGINE_MASK(id));
		ce = fetch_and_zero(&engine->last_retired_context);
		if (ce)
			intel_context_unpin(ce);

		/*
		 * Ostensibly, we always want a context loaded for powersaving,
		 * so if the engine is idle after the reset, send a request
		 * to load our scratch kernel_context.
		 *
		 * More mysteriously, if we leave the engine idle after a reset,
		 * the next userspace batch may hang, with what appears to be
		 * an incoherent read by the CS (presumably stale TLB). An
		 * empty request appears sufficient to paper over the glitch.
		 */
		if (intel_engine_is_idle(engine)) {
			struct i915_request *rq;

			rq = i915_request_alloc(engine, i915->kernel_context);
			if (!IS_ERR(rq))
				i915_request_add(rq);
		}
	}

	i915_gem_restore_fences(i915);
}

static void reset_finish_engine(struct intel_engine_cs *engine)
{
	engine->reset.finish(engine);

	intel_uncore_forcewake_put(engine->i915, FORCEWAKE_ALL);
}

static void reset_finish(struct drm_i915_private *i915)
{
	struct intel_engine_cs *engine;
	enum intel_engine_id id;

	lockdep_assert_held(&i915->drm.struct_mutex);

	for_each_engine(engine, i915, id) {
		engine->hangcheck.active_request = NULL;
		reset_finish_engine(engine);
	}
}

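/*
 * Replacement submit_request for a wedged GPU: instead of touching the
 * hardware, immediately complete each request with -EIO so that waiters
 * are released.
 */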
static void nop_submit_request(struct i915_request *request)
{
	unsigned long flags;

	GEM_TRACE("%s fence %llx:%lld -> -EIO\n",
		  request->engine->name,
		  request->fence.context, request->fence.seqno);
	dma_fence_set_error(&request->fence, -EIO);

	spin_lock_irqsave(&request->engine->timeline.lock, flags);
	__i915_request_submit(request);
	intel_engine_write_global_seqno(request->engine, request->global_seqno);
	spin_unlock_irqrestore(&request->engine->timeline.lock, flags);
}

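/*
 * Declare the GPU wedged: stop submitting to the hardware and complete
 * all in-flight and future requests with -EIO until the error is cleared
 * by i915_gem_unset_wedged().
 */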
void i915_gem_set_wedged(struct drm_i915_private *i915)
{
	struct i915_gpu_error *error = &i915->gpu_error;
	struct intel_engine_cs *engine;
	enum intel_engine_id id;

	mutex_lock(&error->wedge_mutex);
	if (test_bit(I915_WEDGED, &error->flags)) {
		mutex_unlock(&error->wedge_mutex);
		return;
	}

	if (GEM_SHOW_DEBUG() && !intel_engines_are_idle(i915)) {
		struct drm_printer p = drm_debug_printer(__func__);

		for_each_engine(engine, i915, id)
			intel_engine_dump(engine, &p, "%s\n", engine->name);
	}

	GEM_TRACE("start\n");

	/*
	 * First, stop submission to hw, but do not yet complete requests by
	 * rolling the global seqno forward (since this would complete requests
	 * for which we haven't set the fence error to EIO yet).
	 */
	for_each_engine(engine, i915, id)
		reset_prepare_engine(engine);

	/* Even if the GPU reset fails, it should still stop the engines */
	if (INTEL_GEN(i915) >= 5)
		intel_gpu_reset(i915, ALL_ENGINES);

	for_each_engine(engine, i915, id) {
		engine->submit_request = nop_submit_request;
		engine->schedule = NULL;
	}
	i915->caps.scheduler = 0;

	/*
	 * Make sure no request can slip through without getting completed by
	 * either this call here to intel_engine_write_global_seqno, or the one
	 * in nop_submit_request.
	 */
	synchronize_rcu();

	/* Mark all executing requests as skipped */
	for_each_engine(engine, i915, id)
		engine->cancel_requests(engine);

	for_each_engine(engine, i915, id) {
		reset_finish_engine(engine);
		intel_engine_wakeup(engine);
	}

	smp_mb__before_atomic();
	set_bit(I915_WEDGED, &error->flags);

	GEM_TRACE("end\n");
	mutex_unlock(&error->wedge_mutex);

	wake_up_all(&error->reset_queue);
}

bool i915_gem_unset_wedged(struct drm_i915_private *i915)
{
	struct i915_gpu_error *error = &i915->gpu_error;
	struct i915_timeline *tl;
	bool ret = false;

	lockdep_assert_held(&i915->drm.struct_mutex);

	if (!test_bit(I915_WEDGED, &error->flags))
		return true;

	if (!i915->gt.scratch) /* Never fully initialised, recovery impossible */
		return false;

	mutex_lock(&error->wedge_mutex);

	GEM_TRACE("start\n");

	/*
	 * Before unwedging, make sure that all pending operations
	 * are flushed and errored out - we may have requests waiting upon
	 * third party fences. We marked all inflight requests as EIO, and
	 * every execbuf since then has returned EIO; for consistency we want
	 * all the currently pending requests to also be marked as EIO, which
	 * is done inside our nop_submit_request - and so we must wait.
	 *
	 * No more can be submitted until we reset the wedged bit.
	 */
	list_for_each_entry(tl, &i915->gt.timelines, link) {
		struct i915_request *rq;

		rq = i915_gem_active_peek(&tl->last_request,
					  &i915->drm.struct_mutex);
		if (!rq)
			continue;

		/*
		 * We can't use our normal waiter as we want to
		 * avoid recursively trying to handle the current
		 * reset. The basic dma_fence_default_wait() installs
		 * a callback for dma_fence_signal(), which is
		 * triggered by our nop handler (indirectly, the
		 * callback enables the signaler thread which is
		 * woken by the nop_submit_request() advancing the seqno
		 * and when the seqno passes the fence, the signaler
		 * then signals the fence waking us up).
		 */
		if (dma_fence_default_wait(&rq->fence, true,
					   MAX_SCHEDULE_TIMEOUT) < 0)
			goto unlock;
	}
	i915_retire_requests(i915);
	GEM_BUG_ON(i915->gt.active_requests);

	intel_engines_sanitize(i915, false);

	/*
	 * Undo nop_submit_request. We prevent all new i915 requests from
	 * being queued (by disallowing execbuf whilst wedged) so having
	 * waited for all active requests above, we know the system is idle
	 * and do not have to worry about a thread being inside
	 * engine->submit_request() as we swap over. So unlike installing
	 * the nop_submit_request on reset, we can do this from normal
	 * context and do not require stop_machine().
	 */
	intel_engines_reset_default_submission(i915);
	i915_gem_contexts_lost(i915);

	GEM_TRACE("end\n");

	smp_mb__before_atomic(); /* complete takeover before enabling execbuf */
	clear_bit(I915_WEDGED, &i915->gpu_error.flags);
	ret = true;
unlock:
	mutex_unlock(&i915->gpu_error.wedge_mutex);

	return ret;
}

/**
 * i915_reset - reset chip after a hang
 * @i915: #drm_i915_private to reset
 * @stalled_mask: mask of the stalled engines with the guilty requests
 * @reason: user error message for why we are resetting
 *
 * Reset the chip. Useful if a hang is detected. Marks the device as wedged
 * on failure.
 *
 * Caller must hold the struct_mutex.
 *
 * Procedure is fairly simple:
 *   - reset the chip using the reset reg
 *   - re-init context state
 *   - re-init hardware status page
 *   - re-init ring buffer
 *   - re-init interrupt state
 *   - re-init display
 */
void i915_reset(struct drm_i915_private *i915,
		unsigned int stalled_mask,
		const char *reason)
{
	struct i915_gpu_error *error = &i915->gpu_error;
	int ret;
	int i;

	GEM_TRACE("flags=%lx\n", error->flags);

	might_sleep();
	lockdep_assert_held(&i915->drm.struct_mutex);
	assert_rpm_wakelock_held(i915);
	GEM_BUG_ON(!test_bit(I915_RESET_BACKOFF, &error->flags));

	if (!test_bit(I915_RESET_HANDOFF, &error->flags))
		return;

	/* Clear any previous failed attempts at recovery. Time to try again. */
	if (!i915_gem_unset_wedged(i915))
		goto wakeup;

	if (reason)
		dev_notice(i915->drm.dev, "Resetting chip for %s\n", reason);
	error->reset_count++;

	ret = reset_prepare(i915);
	if (ret) {
		dev_err(i915->drm.dev, "GPU recovery failed\n");
		goto taint;
	}

	if (!intel_has_gpu_reset(i915)) {
		if (i915_modparams.reset)
			dev_err(i915->drm.dev, "GPU reset not supported\n");
		else
			DRM_DEBUG_DRIVER("GPU reset disabled\n");
		goto error;
	}

	for (i = 0; i < RESET_MAX_RETRIES; i++) {
		ret = intel_gpu_reset(i915, ALL_ENGINES);
		if (ret == 0)
			break;

		msleep(100);
	}
	if (ret) {
		dev_err(i915->drm.dev, "Failed to reset chip\n");
		goto taint;
	}

	/* Ok, now get things going again... */

	/*
	 * Everything depends on having the GTT running, so we need to start
	 * there.
	 */
	ret = i915_ggtt_enable_hw(i915);
	if (ret) {
		DRM_ERROR("Failed to re-enable GGTT following reset (%d)\n",
			  ret);
		goto error;
	}

	gt_reset(i915, stalled_mask);
	intel_overlay_reset(i915);

	/*
	 * Next we need to restore the context, but we don't use those
	 * yet either...
	 *
	 * Ring buffer needs to be re-initialized in the KMS case, or if X
	 * was running at the time of the reset (i.e. we weren't VT
	 * switched away).
	 */
	ret = i915_gem_init_hw(i915);
	if (ret) {
		DRM_ERROR("Failed to initialise HW following reset (%d)\n",
			  ret);
		goto error;
	}

	i915_queue_hangcheck(i915);

finish:
	reset_finish(i915);
wakeup:
	clear_bit(I915_RESET_HANDOFF, &error->flags);
	wake_up_bit(&error->flags, I915_RESET_HANDOFF);
	return;

taint:
	/*
	 * History tells us that if we cannot reset the GPU now, we
	 * never will. This then impacts everything that is run
	 * subsequently. On failing the reset, we mark the driver
	 * as wedged, preventing further execution on the GPU.
	 * We also want to go one step further and add a taint to the
	 * kernel so that any subsequent faults can be traced back to
	 * this failure. This is important for CI, where if the
	 * GPU/driver fails we would like to reboot and restart testing
	 * rather than continue on into oblivion. For everyone else,
	 * the system should still plod along, but they have been warned!
	 */
	add_taint(TAINT_WARN, LOCKDEP_STILL_OK);
error:
	i915_gem_set_wedged(i915);
	i915_retire_requests(i915);
	goto finish;
}

static inline int intel_gt_reset_engine(struct drm_i915_private *i915,
					struct intel_engine_cs *engine)
{
	return intel_gpu_reset(i915, intel_engine_flag(engine));
}

/**
 * i915_reset_engine - reset GPU engine to recover from a hang
 * @engine: engine to reset
 * @msg: reason for GPU reset; or NULL for no dev_notice()
 *
 * Reset a specific GPU engine. Useful if a hang is detected.
 * Returns zero on successful reset or otherwise an error code.
 *
 * Procedure is:
 *  - identify the request that caused the hang and drop it
 *  - reset engine (which will force the engine to idle)
 *  - re-init/configure engine
 */
int i915_reset_engine(struct intel_engine_cs *engine, const char *msg)
{
	struct i915_gpu_error *error = &engine->i915->gpu_error;
	struct i915_request *active_request;
	int ret;

	GEM_TRACE("%s flags=%lx\n", engine->name, error->flags);
	GEM_BUG_ON(!test_bit(I915_RESET_ENGINE + engine->id, &error->flags));

	active_request = reset_prepare_engine(engine);
	if (IS_ERR_OR_NULL(active_request)) {
		/* Either the previous reset failed, or we pardon the reset. */
		ret = PTR_ERR(active_request);
		goto out;
	}

	if (msg)
		dev_notice(engine->i915->drm.dev,
			   "Resetting %s for %s\n", engine->name, msg);
	error->reset_engine_count[engine->id]++;

	if (!engine->i915->guc.execbuf_client)
		ret = intel_gt_reset_engine(engine->i915, engine);
	else
		ret = intel_guc_reset_engine(&engine->i915->guc, engine);
	if (ret) {
		/* If we fail here, we expect to fallback to a global reset */
		DRM_DEBUG_DRIVER("%sFailed to reset %s, ret=%d\n",
				 engine->i915->guc.execbuf_client ? "GuC " : "",
				 engine->name, ret);
		goto out;
	}

	/*
	 * The request that caused the hang is stuck on elsp, we know the
	 * active request and can drop it, adjust head to skip the offending
	 * request to resume executing remaining requests in the queue.
	 */
	reset_engine(engine, active_request, true);

	/*
	 * The engine and its registers (and workarounds in case of render)
	 * have been reset to their default values. Follow the init_ring
	 * process to program RING_MODE, HWSP and re-enable submission.
	 */
	ret = engine->init_hw(engine);
	if (ret)
		goto out;

out:
	intel_engine_cancel_stop_cs(engine);
	reset_finish_engine(engine);
	return ret;
}

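/*
 * Full device reset: notify userspace via uevents, hand the reset off to
 * the struct_mutex owner via I915_RESET_HANDOFF, and declare the device
 * wedged if recovery does not complete within the watchdog timeout.
 */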
static void i915_reset_device(struct drm_i915_private *i915,
			      u32 engine_mask,
			      const char *reason)
{
	struct i915_gpu_error *error = &i915->gpu_error;
	struct kobject *kobj = &i915->drm.primary->kdev->kobj;
	char *error_event[] = { I915_ERROR_UEVENT "=1", NULL };
	char *reset_event[] = { I915_RESET_UEVENT "=1", NULL };
	char *reset_done_event[] = { I915_ERROR_UEVENT "=0", NULL };
	struct i915_wedge_me w;

	kobject_uevent_env(kobj, KOBJ_CHANGE, error_event);

	DRM_DEBUG_DRIVER("resetting chip\n");
	kobject_uevent_env(kobj, KOBJ_CHANGE, reset_event);

	/* Use a watchdog to ensure that our reset completes */
	i915_wedge_on_timeout(&w, i915, 5 * HZ) {
		intel_prepare_reset(i915);

		error->reason = reason;
		error->stalled_mask = engine_mask;

		/* Signal that locked waiters should reset the GPU */
		smp_mb__before_atomic();
		set_bit(I915_RESET_HANDOFF, &error->flags);
		wake_up_all(&error->wait_queue);

		/*
		 * Wait for anyone holding the lock to wakeup, without
		 * blocking indefinitely on struct_mutex.
		 */
		do {
			if (mutex_trylock(&i915->drm.struct_mutex)) {
				i915_reset(i915, engine_mask, reason);
				mutex_unlock(&i915->drm.struct_mutex);
			}
		} while (wait_on_bit_timeout(&error->flags,
					     I915_RESET_HANDOFF,
					     TASK_UNINTERRUPTIBLE,
					     1));

		error->stalled_mask = 0;
		error->reason = NULL;

		intel_finish_reset(i915);
	}

	if (!test_bit(I915_WEDGED, &error->flags))
		kobject_uevent_env(kobj, KOBJ_CHANGE, reset_done_event);
}

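/* Clear the sticky error state (EIR, page-table and ring fault registers). */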
void i915_clear_error_registers(struct drm_i915_private *dev_priv)
{
	u32 eir;

	if (!IS_GEN(dev_priv, 2))
		I915_WRITE(PGTBL_ER, I915_READ(PGTBL_ER));

	if (INTEL_GEN(dev_priv) < 4)
		I915_WRITE(IPEIR, I915_READ(IPEIR));
	else
		I915_WRITE(IPEIR_I965, I915_READ(IPEIR_I965));

	I915_WRITE(EIR, I915_READ(EIR));
	eir = I915_READ(EIR);
	if (eir) {
		/*
		 * some errors might have become stuck,
		 * mask them.
		 */
		DRM_DEBUG_DRIVER("EIR stuck: 0x%08x, masking\n", eir);
		I915_WRITE(EMR, I915_READ(EMR) | eir);
		I915_WRITE(IIR, I915_MASTER_ERROR_INTERRUPT);
	}

	if (INTEL_GEN(dev_priv) >= 8) {
		I915_WRITE(GEN8_RING_FAULT_REG,
			   I915_READ(GEN8_RING_FAULT_REG) & ~RING_FAULT_VALID);
		POSTING_READ(GEN8_RING_FAULT_REG);
	} else if (INTEL_GEN(dev_priv) >= 6) {
		struct intel_engine_cs *engine;
		enum intel_engine_id id;

		for_each_engine(engine, dev_priv, id) {
			I915_WRITE(RING_FAULT_REG(engine),
				   I915_READ(RING_FAULT_REG(engine)) &
				   ~RING_FAULT_VALID);
		}
		POSTING_READ(RING_FAULT_REG(dev_priv->engine[RCS]));
	}
}

/**
 * i915_handle_error - handle a gpu error
 * @i915: i915 device private
 * @engine_mask: mask representing engines that are hung
 * @flags: control flags
 * @fmt: Error message format string
 *
 * Do some basic checking of register state at error time and
 * dump it to the syslog. Also call i915_capture_error_state() to make
 * sure we get a record and make it available in debugfs. Fire a uevent
 * so userspace knows something bad happened (should trigger collection
 * of a ring dump etc.).
 */
void i915_handle_error(struct drm_i915_private *i915,
		       u32 engine_mask,
		       unsigned long flags,
		       const char *fmt, ...)
{
	struct intel_engine_cs *engine;
	intel_wakeref_t wakeref;
	unsigned int tmp;
	char error_msg[80];
	char *msg = NULL;

	if (fmt) {
		va_list args;

		va_start(args, fmt);
		vscnprintf(error_msg, sizeof(error_msg), fmt, args);
		va_end(args);

		msg = error_msg;
	}

	/*
	 * In most cases it's guaranteed that we get here with an RPM
	 * reference held, for example because there is a pending GPU
	 * request that won't finish until the reset is done. This
	 * isn't the case at least when we get here by doing a
	 * simulated reset via debugfs, so get an RPM reference.
	 */
	wakeref = intel_runtime_pm_get(i915);

	engine_mask &= INTEL_INFO(i915)->ring_mask;

	if (flags & I915_ERROR_CAPTURE) {
		i915_capture_error_state(i915, engine_mask, msg);
		i915_clear_error_registers(i915);
	}

	/*
	 * Try engine reset when available. We fall back to full reset if
	 * single reset fails.
	 */
	if (intel_has_reset_engine(i915) &&
	    !i915_terminally_wedged(&i915->gpu_error)) {
		for_each_engine_masked(engine, i915, engine_mask, tmp) {
			BUILD_BUG_ON(I915_RESET_MODESET >= I915_RESET_ENGINE);
			if (test_and_set_bit(I915_RESET_ENGINE + engine->id,
					     &i915->gpu_error.flags))
				continue;

			if (i915_reset_engine(engine, msg) == 0)
				engine_mask &= ~intel_engine_flag(engine);

			clear_bit(I915_RESET_ENGINE + engine->id,
				  &i915->gpu_error.flags);
			wake_up_bit(&i915->gpu_error.flags,
				    I915_RESET_ENGINE + engine->id);
		}
	}

	if (!engine_mask)
		goto out;

	/* Full reset needs the mutex, stop any other user trying to do so. */
	if (test_and_set_bit(I915_RESET_BACKOFF, &i915->gpu_error.flags)) {
		wait_event(i915->gpu_error.reset_queue,
			   !test_bit(I915_RESET_BACKOFF,
				     &i915->gpu_error.flags));
		goto out;
	}

	/* Prevent any other reset-engine attempt. */
	for_each_engine(engine, i915, tmp) {
		while (test_and_set_bit(I915_RESET_ENGINE + engine->id,
					&i915->gpu_error.flags))
			wait_on_bit(&i915->gpu_error.flags,
				    I915_RESET_ENGINE + engine->id,
				    TASK_UNINTERRUPTIBLE);
	}

	i915_reset_device(i915, engine_mask, msg);

	for_each_engine(engine, i915, tmp) {
		clear_bit(I915_RESET_ENGINE + engine->id,
			  &i915->gpu_error.flags);
	}

	clear_bit(I915_RESET_BACKOFF, &i915->gpu_error.flags);
	wake_up_all(&i915->gpu_error.reset_queue);

out:
	intel_runtime_pm_put(i915, wakeref);
}

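/* Watchdog timer callback: the guarded reset section overran its timeout. */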
static void i915_wedge_me(struct work_struct *work)
{
	struct i915_wedge_me *w = container_of(work, typeof(*w), work.work);

	dev_err(w->i915->drm.dev,
		"%s timed out, cancelling all in-flight rendering.\n",
		w->name);
	i915_gem_set_wedged(w->i915);
}

void __i915_init_wedge(struct i915_wedge_me *w,
		       struct drm_i915_private *i915,
		       long timeout,
		       const char *name)
{
	w->i915 = i915;
	w->name = name;

	INIT_DELAYED_WORK_ONSTACK(&w->work, i915_wedge_me);
	schedule_delayed_work(&w->work, timeout);
}

void __i915_fini_wedge(struct i915_wedge_me *w)
{
	cancel_delayed_work_sync(&w->work);
	destroy_delayed_work_on_stack(&w->work);
	w->i915 = NULL;
}