/*
 * SPDX-License-Identifier: MIT
 *
 * Copyright © 2008-2018 Intel Corporation
 */

#include <linux/sched/mm.h>
#include <linux/stop_machine.h>

#include "i915_drv.h"
#include "i915_gpu_error.h"
#include "i915_reset.h"

#include "intel_guc.h"

#define RESET_MAX_RETRIES 3

/* XXX How to handle concurrent GGTT updates using tiling registers? */
#define RESET_UNDER_STOP_MACHINE 0

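/*
 * When a request from a hung context is found guilty, cancel the rest of
 * that context's work: any of its requests still queued on the engine
 * behind the guilty one, and everything outstanding on the context's own
 * timeline, are skipped with -EIO.
 */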
static void engine_skip_context(struct i915_request *rq)
{
	struct intel_engine_cs *engine = rq->engine;
	struct i915_gem_context *hung_ctx = rq->gem_context;
	struct i915_timeline *timeline = rq->timeline;

	lockdep_assert_held(&engine->timeline.lock);
	GEM_BUG_ON(timeline == &engine->timeline);

	spin_lock(&timeline->lock);

	if (i915_request_is_active(rq)) {
		list_for_each_entry_continue(rq,
					     &engine->timeline.requests, link)
			if (rq->gem_context == hung_ctx)
				i915_request_skip(rq, -EIO);
	}

	list_for_each_entry(rq, &timeline->requests, link)
		i915_request_skip(rq, -EIO);

	spin_unlock(&timeline->lock);
}

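/*
 * Propagate a context's guilt to the client (the open drm file) that owns
 * it: a banned context, or hangs in quick succession, both add to the
 * client's ban score.
 */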
static void client_mark_guilty(struct drm_i915_file_private *file_priv,
			       const struct i915_gem_context *ctx)
{
	unsigned int score;
	unsigned long prev_hang;

	if (i915_gem_context_is_banned(ctx))
		score = I915_CLIENT_SCORE_CONTEXT_BAN;
	else
		score = 0;

	prev_hang = xchg(&file_priv->hang_timestamp, jiffies);
	if (time_before(jiffies, prev_hang + I915_CLIENT_FAST_HANG_JIFFIES))
		score += I915_CLIENT_SCORE_HANG_FAST;

	if (score) {
		atomic_add(score, &file_priv->ban_score);

		DRM_DEBUG_DRIVER("client %s: gained %u ban score, now %u\n",
				 ctx->name, score,
				 atomic_read(&file_priv->ban_score));
	}
}

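/*
 * Charge a hang to @ctx and decide whether it has now misbehaved enough
 * to be banned from submitting further work. Returns true if the context
 * is banned, in which case the caller also cancels its remaining requests.
 */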
static bool context_mark_guilty(struct i915_gem_context *ctx)
{
	unsigned int score;
	bool banned, bannable;

	atomic_inc(&ctx->guilty_count);

	bannable = i915_gem_context_is_bannable(ctx);
	score = atomic_add_return(CONTEXT_SCORE_GUILTY, &ctx->ban_score);
	banned = score >= CONTEXT_SCORE_BAN_THRESHOLD;

	/* Cool contexts don't accumulate client ban score */
	if (!bannable)
		return false;

	if (banned) {
		DRM_DEBUG_DRIVER("context %s: guilty %d, score %u, banned\n",
				 ctx->name, atomic_read(&ctx->guilty_count),
				 score);
		i915_gem_context_set_banned(ctx);
	}

	if (!IS_ERR_OR_NULL(ctx->file_priv))
		client_mark_guilty(ctx->file_priv, ctx);

	return banned;
}

static void context_mark_innocent(struct i915_gem_context *ctx)
{
	atomic_inc(&ctx->active_count);
}

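/*
 * Decide the fate of an incomplete request after a reset: a guilty request
 * is skipped with -EIO (possibly taking the rest of its context with it),
 * while an innocent one is flagged with -EAGAIN so that it may be replayed.
 */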
void i915_reset_request(struct i915_request *rq, bool guilty)
{
	lockdep_assert_held(&rq->engine->timeline.lock);
	GEM_BUG_ON(i915_request_completed(rq));

	if (guilty) {
		i915_request_skip(rq, -EIO);
		if (context_mark_guilty(rq->gem_context))
			engine_skip_context(rq);
	} else {
		dma_fence_set_error(&rq->fence, -EAGAIN);
		context_mark_innocent(rq->gem_context);
	}
}

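/*
 * Park the ring before reset: stop the command streamer and force the ring
 * registers into an empty state (head == tail == 0, CTL disabled) so that
 * the reset cannot sample a batch in mid-flight.
 */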
static void gen3_stop_engine(struct intel_engine_cs *engine)
{
	struct drm_i915_private *dev_priv = engine->i915;
	const u32 base = engine->mmio_base;

	if (intel_engine_stop_cs(engine))
		DRM_DEBUG_DRIVER("%s: timed out on STOP_RING\n", engine->name);

	I915_WRITE_FW(RING_HEAD(base), I915_READ_FW(RING_TAIL(base)));
	POSTING_READ_FW(RING_HEAD(base)); /* paranoia */

	I915_WRITE_FW(RING_HEAD(base), 0);
	I915_WRITE_FW(RING_TAIL(base), 0);
	POSTING_READ_FW(RING_TAIL(base));

	/* The ring must be empty before it is disabled */
	I915_WRITE_FW(RING_CTL(base), 0);

	/* Check acts as a post */
	if (I915_READ_FW(RING_HEAD(base)) != 0)
		DRM_DEBUG_DRIVER("%s: ring head not parked\n",
				 engine->name);
}

static void i915_stop_engines(struct drm_i915_private *i915,
			      unsigned int engine_mask)
{
	struct intel_engine_cs *engine;
	enum intel_engine_id id;

	if (INTEL_GEN(i915) < 3)
		return;

	for_each_engine_masked(engine, i915, engine_mask, id)
		gen3_stop_engine(engine);
}

static bool i915_in_reset(struct pci_dev *pdev)
{
	u8 gdrst;

	pci_read_config_byte(pdev, I915_GDRST, &gdrst);
	return gdrst & GRDOM_RESET_STATUS;
}

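/*
 * The original (gen3) full-chip reset: toggle the GDRST byte in PCI config
 * space, waiting for the status bit to acknowledge both the assertion and
 * the subsequent clearing of the reset request.
 */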
static int i915_do_reset(struct drm_i915_private *i915,
			 unsigned int engine_mask,
			 unsigned int retry)
{
	struct pci_dev *pdev = i915->drm.pdev;
	int err;

	/* Assert reset for at least 20 usec, and wait for acknowledgement. */
	pci_write_config_byte(pdev, I915_GDRST, GRDOM_RESET_ENABLE);
	udelay(50);
	err = wait_for_atomic(i915_in_reset(pdev), 50);

	/* Clear the reset request. */
	pci_write_config_byte(pdev, I915_GDRST, 0);
	udelay(50);
	if (!err)
		err = wait_for_atomic(!i915_in_reset(pdev), 50);

	return err;
}

static bool g4x_reset_complete(struct pci_dev *pdev)
{
	u8 gdrst;

	pci_read_config_byte(pdev, I915_GDRST, &gdrst);
	return (gdrst & GRDOM_RESET_ENABLE) == 0;
}

static int g33_do_reset(struct drm_i915_private *i915,
			unsigned int engine_mask,
			unsigned int retry)
{
	struct pci_dev *pdev = i915->drm.pdev;

	pci_write_config_byte(pdev, I915_GDRST, GRDOM_RESET_ENABLE);
	return wait_for_atomic(g4x_reset_complete(pdev), 50);
}

static int g4x_do_reset(struct drm_i915_private *dev_priv,
			unsigned int engine_mask,
			unsigned int retry)
{
	struct pci_dev *pdev = dev_priv->drm.pdev;
	int ret;

	/* WaVcpClkGateDisableForMediaReset:ctg,elk */
	I915_WRITE_FW(VDECCLK_GATE_D,
		      I915_READ(VDECCLK_GATE_D) | VCP_UNIT_CLOCK_GATE_DISABLE);
	POSTING_READ_FW(VDECCLK_GATE_D);

	pci_write_config_byte(pdev, I915_GDRST,
			      GRDOM_MEDIA | GRDOM_RESET_ENABLE);
	ret = wait_for_atomic(g4x_reset_complete(pdev), 50);
	if (ret) {
		DRM_DEBUG_DRIVER("Wait for media reset failed\n");
		goto out;
	}

	pci_write_config_byte(pdev, I915_GDRST,
			      GRDOM_RENDER | GRDOM_RESET_ENABLE);
	ret = wait_for_atomic(g4x_reset_complete(pdev), 50);
	if (ret) {
		DRM_DEBUG_DRIVER("Wait for render reset failed\n");
		goto out;
	}

out:
	pci_write_config_byte(pdev, I915_GDRST, 0);

	I915_WRITE_FW(VDECCLK_GATE_D,
		      I915_READ(VDECCLK_GATE_D) & ~VCP_UNIT_CLOCK_GATE_DISABLE);
	POSTING_READ_FW(VDECCLK_GATE_D);

	return ret;
}

static int ironlake_do_reset(struct drm_i915_private *dev_priv,
			     unsigned int engine_mask,
			     unsigned int retry)
{
	int ret;

	I915_WRITE_FW(ILK_GDSR, ILK_GRDOM_RENDER | ILK_GRDOM_RESET_ENABLE);
	ret = __intel_wait_for_register_fw(dev_priv, ILK_GDSR,
					   ILK_GRDOM_RESET_ENABLE, 0,
					   5000, 0,
					   NULL);
	if (ret) {
		DRM_DEBUG_DRIVER("Wait for render reset failed\n");
		goto out;
	}

	I915_WRITE_FW(ILK_GDSR, ILK_GRDOM_MEDIA | ILK_GRDOM_RESET_ENABLE);
	ret = __intel_wait_for_register_fw(dev_priv, ILK_GDSR,
					   ILK_GRDOM_RESET_ENABLE, 0,
					   5000, 0,
					   NULL);
	if (ret) {
		DRM_DEBUG_DRIVER("Wait for media reset failed\n");
		goto out;
	}

out:
	I915_WRITE_FW(ILK_GDSR, 0);
	POSTING_READ_FW(ILK_GDSR);
	return ret;
}

/* Reset the hardware domains (GENX_GRDOM_*) specified by mask */
static int gen6_hw_domain_reset(struct drm_i915_private *dev_priv,
				u32 hw_domain_mask)
{
	int err;

	/*
	 * GEN6_GDRST is not in the gt power well, no need to check
	 * for fifo space for the write or forcewake the chip for
	 * the read
	 */
	I915_WRITE_FW(GEN6_GDRST, hw_domain_mask);

	/* Wait for the device to ack the reset requests */
	err = __intel_wait_for_register_fw(dev_priv,
					   GEN6_GDRST, hw_domain_mask, 0,
					   500, 0,
					   NULL);
	if (err)
		DRM_DEBUG_DRIVER("Wait for 0x%08x engines reset failed\n",
				 hw_domain_mask);

	return err;
}

static int gen6_reset_engines(struct drm_i915_private *i915,
			      unsigned int engine_mask,
			      unsigned int retry)
{
	struct intel_engine_cs *engine;
	const u32 hw_engine_mask[I915_NUM_ENGINES] = {
		[RCS] = GEN6_GRDOM_RENDER,
		[BCS] = GEN6_GRDOM_BLT,
		[VCS] = GEN6_GRDOM_MEDIA,
		[VCS2] = GEN8_GRDOM_MEDIA2,
		[VECS] = GEN6_GRDOM_VECS,
	};
	u32 hw_mask;

	if (engine_mask == ALL_ENGINES) {
		hw_mask = GEN6_GRDOM_FULL;
	} else {
		unsigned int tmp;

		hw_mask = 0;
		for_each_engine_masked(engine, i915, engine_mask, tmp)
			hw_mask |= hw_engine_mask[engine->id];
	}

	return gen6_hw_domain_reset(i915, hw_mask);
}

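/*
 * On gen11 the SFC (scaler/format converter) units are shared between the
 * video decode and video enhancement engines. Before resetting an engine
 * that may be using an SFC, force-lock the unit; if it turns out to be in
 * use by the engine, return the extra GDRST bit needed to reset the SFC
 * along with the engine.
 */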
static u32 gen11_lock_sfc(struct drm_i915_private *dev_priv,
			  struct intel_engine_cs *engine)
{
	u8 vdbox_sfc_access = RUNTIME_INFO(dev_priv)->vdbox_sfc_access;
	i915_reg_t sfc_forced_lock, sfc_forced_lock_ack;
	u32 sfc_forced_lock_bit, sfc_forced_lock_ack_bit;
	i915_reg_t sfc_usage;
	u32 sfc_usage_bit;
	u32 sfc_reset_bit;

	switch (engine->class) {
	case VIDEO_DECODE_CLASS:
		if ((BIT(engine->instance) & vdbox_sfc_access) == 0)
			return 0;

		sfc_forced_lock = GEN11_VCS_SFC_FORCED_LOCK(engine);
		sfc_forced_lock_bit = GEN11_VCS_SFC_FORCED_LOCK_BIT;

		sfc_forced_lock_ack = GEN11_VCS_SFC_LOCK_STATUS(engine);
		sfc_forced_lock_ack_bit = GEN11_VCS_SFC_LOCK_ACK_BIT;

		sfc_usage = GEN11_VCS_SFC_LOCK_STATUS(engine);
		sfc_usage_bit = GEN11_VCS_SFC_USAGE_BIT;
		sfc_reset_bit = GEN11_VCS_SFC_RESET_BIT(engine->instance);
		break;

	case VIDEO_ENHANCEMENT_CLASS:
		sfc_forced_lock = GEN11_VECS_SFC_FORCED_LOCK(engine);
		sfc_forced_lock_bit = GEN11_VECS_SFC_FORCED_LOCK_BIT;

		sfc_forced_lock_ack = GEN11_VECS_SFC_LOCK_ACK(engine);
		sfc_forced_lock_ack_bit = GEN11_VECS_SFC_LOCK_ACK_BIT;

		sfc_usage = GEN11_VECS_SFC_USAGE(engine);
		sfc_usage_bit = GEN11_VECS_SFC_USAGE_BIT;
		sfc_reset_bit = GEN11_VECS_SFC_RESET_BIT(engine->instance);
		break;

	default:
		return 0;
	}

	/*
	 * Tell the engine that a software reset is going to happen. The engine
	 * will then try to force lock the SFC (if currently locked, it will
	 * remain so until we tell the engine it is safe to unlock; if currently
	 * unlocked, it will ignore this and all new lock requests). If SFC
	 * ends up being locked to the engine we want to reset, we have to reset
	 * it as well (we will unlock it once the reset sequence is completed).
	 */
	I915_WRITE_FW(sfc_forced_lock,
		      I915_READ_FW(sfc_forced_lock) | sfc_forced_lock_bit);

	if (__intel_wait_for_register_fw(dev_priv,
					 sfc_forced_lock_ack,
					 sfc_forced_lock_ack_bit,
					 sfc_forced_lock_ack_bit,
					 1000, 0, NULL)) {
		DRM_DEBUG_DRIVER("Wait for SFC forced lock ack failed\n");
		return 0;
	}

	if (I915_READ_FW(sfc_usage) & sfc_usage_bit)
		return sfc_reset_bit;

	return 0;
}

static void gen11_unlock_sfc(struct drm_i915_private *dev_priv,
			     struct intel_engine_cs *engine)
{
	u8 vdbox_sfc_access = RUNTIME_INFO(dev_priv)->vdbox_sfc_access;
	i915_reg_t sfc_forced_lock;
	u32 sfc_forced_lock_bit;

	switch (engine->class) {
	case VIDEO_DECODE_CLASS:
		if ((BIT(engine->instance) & vdbox_sfc_access) == 0)
			return;

		sfc_forced_lock = GEN11_VCS_SFC_FORCED_LOCK(engine);
		sfc_forced_lock_bit = GEN11_VCS_SFC_FORCED_LOCK_BIT;
		break;

	case VIDEO_ENHANCEMENT_CLASS:
		sfc_forced_lock = GEN11_VECS_SFC_FORCED_LOCK(engine);
		sfc_forced_lock_bit = GEN11_VECS_SFC_FORCED_LOCK_BIT;
		break;

	default:
		return;
	}

	I915_WRITE_FW(sfc_forced_lock,
		      I915_READ_FW(sfc_forced_lock) & ~sfc_forced_lock_bit);
}

static int gen11_reset_engines(struct drm_i915_private *i915,
			       unsigned int engine_mask,
			       unsigned int retry)
{
	const u32 hw_engine_mask[I915_NUM_ENGINES] = {
		[RCS] = GEN11_GRDOM_RENDER,
		[BCS] = GEN11_GRDOM_BLT,
		[VCS] = GEN11_GRDOM_MEDIA,
		[VCS2] = GEN11_GRDOM_MEDIA2,
		[VCS3] = GEN11_GRDOM_MEDIA3,
		[VCS4] = GEN11_GRDOM_MEDIA4,
		[VECS] = GEN11_GRDOM_VECS,
		[VECS2] = GEN11_GRDOM_VECS2,
	};
	struct intel_engine_cs *engine;
	unsigned int tmp;
	u32 hw_mask;
	int ret;

	BUILD_BUG_ON(VECS2 + 1 != I915_NUM_ENGINES);

	if (engine_mask == ALL_ENGINES) {
		hw_mask = GEN11_GRDOM_FULL;
	} else {
		hw_mask = 0;
		for_each_engine_masked(engine, i915, engine_mask, tmp) {
			hw_mask |= hw_engine_mask[engine->id];
			hw_mask |= gen11_lock_sfc(i915, engine);
		}
	}

	ret = gen6_hw_domain_reset(i915, hw_mask);

	if (engine_mask != ALL_ENGINES)
		for_each_engine_masked(engine, i915, engine_mask, tmp)
			gen11_unlock_sfc(i915, engine);

	return ret;
}

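/*
 * Gen8+ engines must be asked for permission before being reset: set
 * RESET_CTL_REQUEST_RESET and wait for the engine to report that it has
 * reached a safe point (READY_TO_RESET).
 */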
static int gen8_engine_reset_prepare(struct intel_engine_cs *engine)
{
	struct drm_i915_private *dev_priv = engine->i915;
	int ret;

	I915_WRITE_FW(RING_RESET_CTL(engine->mmio_base),
		      _MASKED_BIT_ENABLE(RESET_CTL_REQUEST_RESET));

	ret = __intel_wait_for_register_fw(dev_priv,
					   RING_RESET_CTL(engine->mmio_base),
					   RESET_CTL_READY_TO_RESET,
					   RESET_CTL_READY_TO_RESET,
					   700, 0,
					   NULL);
	if (ret)
		DRM_ERROR("%s: reset request timeout\n", engine->name);

	return ret;
}

static void gen8_engine_reset_cancel(struct intel_engine_cs *engine)
{
	struct drm_i915_private *dev_priv = engine->i915;

	I915_WRITE_FW(RING_RESET_CTL(engine->mmio_base),
		      _MASKED_BIT_DISABLE(RESET_CTL_REQUEST_RESET));
}

static int gen8_reset_engines(struct drm_i915_private *i915,
			      unsigned int engine_mask,
			      unsigned int retry)
{
	struct intel_engine_cs *engine;
	const bool reset_non_ready = retry >= 1;
	unsigned int tmp;
	int ret;

	for_each_engine_masked(engine, i915, engine_mask, tmp) {
		ret = gen8_engine_reset_prepare(engine);
		if (ret && !reset_non_ready)
			goto skip_reset;

		/*
		 * If this is not the first failed attempt to prepare,
		 * we decide to proceed anyway.
		 *
		 * By doing so we risk context corruption and, on
		 * some gens (kbl), a possible system hang if the reset
		 * happens during active bb execution.
		 *
		 * We would rather take context corruption than a failed
		 * reset with a wedged driver/gpu. The active bb execution
		 * case should be covered by the i915_stop_engines() call
		 * we make before the reset.
		 */
	}

	if (INTEL_GEN(i915) >= 11)
		ret = gen11_reset_engines(i915, engine_mask, retry);
	else
		ret = gen6_reset_engines(i915, engine_mask, retry);

skip_reset:
	for_each_engine_masked(engine, i915, engine_mask, tmp)
		gen8_engine_reset_cancel(engine);

	return ret;
}

typedef int (*reset_func)(struct drm_i915_private *,
			  unsigned int engine_mask,
			  unsigned int retry);

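/*
 * Select the reset routine for this platform, newest generation first.
 * Returns NULL if reset has been disabled via the i915.reset modparam or
 * if the hardware has no reset support at all.
 */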
static reset_func intel_get_gpu_reset(struct drm_i915_private *i915)
{
	if (!i915_modparams.reset)
		return NULL;

	if (INTEL_GEN(i915) >= 8)
		return gen8_reset_engines;
	else if (INTEL_GEN(i915) >= 6)
		return gen6_reset_engines;
	else if (INTEL_GEN(i915) >= 5)
		return ironlake_do_reset;
	else if (IS_G4X(i915))
		return g4x_do_reset;
	else if (IS_G33(i915) || IS_PINEVIEW(i915))
		return g33_do_reset;
	else if (INTEL_GEN(i915) >= 3)
		return i915_do_reset;
	else
		return NULL;
}

int intel_gpu_reset(struct drm_i915_private *i915, unsigned int engine_mask)
{
	const int retries = engine_mask == ALL_ENGINES ? RESET_MAX_RETRIES : 1;
	reset_func reset;
	int ret = -ETIMEDOUT;
	int retry;

	reset = intel_get_gpu_reset(i915);
	if (!reset)
		return -ENODEV;

	/*
	 * If the power well sleeps during the reset, the reset
	 * request may be dropped and never completes (causing -EIO).
	 */
	intel_uncore_forcewake_get(i915, FORCEWAKE_ALL);
	for (retry = 0; ret == -ETIMEDOUT && retry < retries; retry++) {
		/*
		 * We stop engines, otherwise we might get a failed reset
		 * and a dead gpu (on elk). Even a gpu as modern as kbl can
		 * suffer a system hang if a batchbuffer is still progressing
		 * when the reset is issued, regardless of the READY_TO_RESET
		 * ack. Thus we assume it is best to stop the engines on all
		 * gens where we have a gpu reset.
		 *
		 * WaKBLVECSSemaphoreWaitPoll:kbl (on ALL_ENGINES)
		 *
		 * WaMediaResetMainRingCleanup:ctg,elk (presumably)
		 *
		 * FIXME: Wa for more modern gens needs to be validated
		 */
		i915_stop_engines(i915, engine_mask);

		GEM_TRACE("engine_mask=%x\n", engine_mask);
		preempt_disable();
		ret = reset(i915, engine_mask, retry);
		preempt_enable();
	}
	intel_uncore_forcewake_put(i915, FORCEWAKE_ALL);

	return ret;
}

bool intel_has_gpu_reset(struct drm_i915_private *i915)
{
	if (USES_GUC(i915))
		return false;

	return intel_get_gpu_reset(i915);
}

bool intel_has_reset_engine(struct drm_i915_private *i915)
{
	return INTEL_INFO(i915)->has_reset_engine && i915_modparams.reset >= 2;
}

int intel_reset_guc(struct drm_i915_private *i915)
{
	u32 guc_domain =
		INTEL_GEN(i915) >= 11 ? GEN11_GRDOM_GUC : GEN9_GRDOM_GUC;
	int ret;

	GEM_BUG_ON(!HAS_GUC(i915));

	intel_uncore_forcewake_get(i915, FORCEWAKE_ALL);
	ret = gen6_hw_domain_reset(i915, guc_domain);
	intel_uncore_forcewake_put(i915, FORCEWAKE_ALL);

	return ret;
}

/* Ensure the irq handler finishes, and is not run again. */
static void reset_prepare_engine(struct intel_engine_cs *engine)
{
	/*
	 * During the reset sequence, we must prevent the engine from
	 * entering RC6. As the context state is undefined until we restart
	 * the engine, if it does enter RC6 during the reset, the state
	 * written to the powercontext is undefined and so we may lose
	 * GPU state upon resume, i.e. fail to restart after a reset.
	 */
	intel_uncore_forcewake_get(engine->i915, FORCEWAKE_ALL);
	engine->reset.prepare(engine);
}

static void reset_prepare(struct drm_i915_private *i915)
{
	struct intel_engine_cs *engine;
	enum intel_engine_id id;

	for_each_engine(engine, i915, id)
		reset_prepare_engine(engine);

	intel_uc_sanitize(i915);
}

static int gt_reset(struct drm_i915_private *i915, unsigned int stalled_mask)
{
	struct intel_engine_cs *engine;
	enum intel_engine_id id;
	int err;

	/*
	 * Everything depends on having the GTT running, so we need to start
	 * there.
	 */
	err = i915_ggtt_enable_hw(i915);
	if (err)
		return err;

	for_each_engine(engine, i915, id)
		intel_engine_reset(engine, stalled_mask & ENGINE_MASK(id));

	i915_gem_restore_fences(i915);

	return err;
}

static void reset_finish_engine(struct intel_engine_cs *engine)
{
	engine->reset.finish(engine);
	intel_uncore_forcewake_put(engine->i915, FORCEWAKE_ALL);
}

struct i915_gpu_restart {
	struct work_struct work;
	struct drm_i915_private *i915;
};

static void restart_work(struct work_struct *work)
{
	struct i915_gpu_restart *arg = container_of(work, typeof(*arg), work);
	struct drm_i915_private *i915 = arg->i915;
	struct intel_engine_cs *engine;
	enum intel_engine_id id;
	intel_wakeref_t wakeref;

	wakeref = intel_runtime_pm_get(i915);
	mutex_lock(&i915->drm.struct_mutex);
	WRITE_ONCE(i915->gpu_error.restart, NULL);

	for_each_engine(engine, i915, id) {
		struct i915_request *rq;

		/*
		 * Ostensibly, we always want a context loaded for powersaving,
		 * so if the engine is idle after the reset, send a request
		 * to load our scratch kernel_context.
		 */
		if (!intel_engine_is_idle(engine))
			continue;

		rq = i915_request_alloc(engine, i915->kernel_context);
		if (!IS_ERR(rq))
			i915_request_add(rq);
	}

	mutex_unlock(&i915->drm.struct_mutex);
	intel_runtime_pm_put(i915, wakeref);

	kfree(arg);
}

static void reset_finish(struct drm_i915_private *i915)
{
	struct intel_engine_cs *engine;
	enum intel_engine_id id;

	for_each_engine(engine, i915, id)
		reset_finish_engine(engine);
}

static void reset_restart(struct drm_i915_private *i915)
{
	struct i915_gpu_restart *arg;

	/*
	 * Following the reset, ensure that we always reload context for
	 * powersaving, and to correct engine->last_retired_context. Since
	 * this requires us to submit a request, queue a worker to do that
	 * task for us to evade any locking here.
	 */
	if (READ_ONCE(i915->gpu_error.restart))
		return;

	arg = kmalloc(sizeof(*arg), GFP_KERNEL);
	if (arg) {
		arg->i915 = i915;
		INIT_WORK(&arg->work, restart_work);

		WRITE_ONCE(i915->gpu_error.restart, arg);
		queue_work(i915->wq, &arg->work);
	}
}

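/*
 * Replacement submit_request while the device is wedged: the request is
 * completed immediately with -EIO instead of being executed, and the
 * breadcrumbs are kicked so that any waiters see the failure.
 */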
static void nop_submit_request(struct i915_request *request)
{
	struct intel_engine_cs *engine = request->engine;
	unsigned long flags;

	GEM_TRACE("%s fence %llx:%lld -> -EIO\n",
		  engine->name, request->fence.context, request->fence.seqno);
	dma_fence_set_error(&request->fence, -EIO);

	spin_lock_irqsave(&engine->timeline.lock, flags);
	__i915_request_submit(request);
	i915_request_mark_complete(request);
	intel_engine_write_global_seqno(engine, request->global_seqno);
	spin_unlock_irqrestore(&engine->timeline.lock, flags);

	intel_engine_queue_breadcrumbs(engine);
}

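/*
 * Declare the GPU terminally wedged: stop all engines (attempting a reset
 * so they are at least quiesced), redirect submission to
 * nop_submit_request() so that every request completes with -EIO, and
 * wake up anyone waiting for the reset to finish.
 */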
void i915_gem_set_wedged(struct drm_i915_private *i915)
{
	struct i915_gpu_error *error = &i915->gpu_error;
	struct intel_engine_cs *engine;
	enum intel_engine_id id;

	mutex_lock(&error->wedge_mutex);
	if (test_bit(I915_WEDGED, &error->flags)) {
		mutex_unlock(&error->wedge_mutex);
		return;
	}

	if (GEM_SHOW_DEBUG() && !intel_engines_are_idle(i915)) {
		struct drm_printer p = drm_debug_printer(__func__);

		for_each_engine(engine, i915, id)
			intel_engine_dump(engine, &p, "%s\n", engine->name);
	}

	GEM_TRACE("start\n");

	/*
	 * First, stop submission to hw, but do not yet complete requests by
	 * rolling the global seqno forward (since this would complete requests
	 * for which we haven't set the fence error to EIO yet).
	 */
	for_each_engine(engine, i915, id)
		reset_prepare_engine(engine);

	/* Even if the GPU reset fails, it should still stop the engines */
	if (INTEL_GEN(i915) >= 5)
		intel_gpu_reset(i915, ALL_ENGINES);

	for_each_engine(engine, i915, id) {
		engine->submit_request = nop_submit_request;
		engine->schedule = NULL;
	}
	i915->caps.scheduler = 0;

	/*
	 * Make sure no request can slip through without getting completed by
	 * either this call here to intel_engine_write_global_seqno, or the one
	 * in nop_submit_request.
	 */
	synchronize_rcu();

	/* Mark all executing requests as skipped */
	for_each_engine(engine, i915, id)
		engine->cancel_requests(engine);

	for_each_engine(engine, i915, id) {
		reset_finish_engine(engine);
		intel_engine_signal_breadcrumbs(engine);
	}

	smp_mb__before_atomic();
	set_bit(I915_WEDGED, &error->flags);

	GEM_TRACE("end\n");
	mutex_unlock(&error->wedge_mutex);

	wake_up_all(&error->reset_queue);
}

bool i915_gem_unset_wedged(struct drm_i915_private *i915)
{
	struct i915_gpu_error *error = &i915->gpu_error;
	struct i915_timeline *tl;
	bool ret = false;

	if (!test_bit(I915_WEDGED, &error->flags))
		return true;

	if (!i915->gt.scratch) /* Never fully initialised, recovery impossible */
		return false;

	mutex_lock(&error->wedge_mutex);

	GEM_TRACE("start\n");

	/*
	 * Before unwedging, make sure that all pending operations
	 * are flushed and errored out - we may have requests waiting upon
	 * third party fences. We marked all inflight requests as EIO, and
	 * every execbuf since returned EIO, for consistency we want all
	 * the currently pending requests to also be marked as EIO, which
	 * is done inside our nop_submit_request - and so we must wait.
	 *
	 * No more can be submitted until we reset the wedged bit.
	 */
	mutex_lock(&i915->gt.timelines.mutex);
	list_for_each_entry(tl, &i915->gt.timelines.active_list, link) {
		struct i915_request *rq;
		long timeout;

		rq = i915_gem_active_get_unlocked(&tl->last_request);
		if (!rq)
			continue;

		/*
		 * We can't use our normal waiter as we want to
		 * avoid recursively trying to handle the current
		 * reset. The basic dma_fence_default_wait() installs
		 * a callback for dma_fence_signal(), which is
		 * triggered by our nop handler (indirectly, the
		 * callback enables the signaler thread which is
		 * woken by the nop_submit_request() advancing the seqno
		 * and when the seqno passes the fence, the signaler
		 * then signals the fence waking us up).
		 */
		timeout = dma_fence_default_wait(&rq->fence, true,
						 MAX_SCHEDULE_TIMEOUT);
		i915_request_put(rq);
		if (timeout < 0) {
			mutex_unlock(&i915->gt.timelines.mutex);
			goto unlock;
		}
	}
	mutex_unlock(&i915->gt.timelines.mutex);

	intel_engines_sanitize(i915, false);

	/*
	 * Undo nop_submit_request. We prevent all new i915 requests from
	 * being queued (by disallowing execbuf whilst wedged) so having
	 * waited for all active requests above, we know the system is idle
	 * and do not have to worry about a thread being inside
	 * engine->submit_request() as we swap over. So unlike installing
	 * the nop_submit_request on reset, we can do this from normal
	 * context and do not require stop_machine().
	 */
	intel_engines_reset_default_submission(i915);

	GEM_TRACE("end\n");

	smp_mb__before_atomic(); /* complete takeover before enabling execbuf */
	clear_bit(I915_WEDGED, &i915->gpu_error.flags);
	ret = true;
unlock:
	mutex_unlock(&i915->gpu_error.wedge_mutex);

	return ret;
}

struct __i915_reset {
	struct drm_i915_private *i915;
	unsigned int stalled_mask;
};

static int __i915_reset__BKL(void *data)
{
	struct __i915_reset *arg = data;
	int err;

	err = intel_gpu_reset(arg->i915, ALL_ENGINES);
	if (err)
		return err;

	return gt_reset(arg->i915, arg->stalled_mask);
}

#if RESET_UNDER_STOP_MACHINE
/*
 * XXX An alternative to using stop_machine would be to park only the
 * processes that have a GGTT mmap. By remotely parking the threads (SIGSTOP)
 * we should be able to prevent their memory accesses via the lost fence
 * registers over the course of the reset without the potential recursion
 * on mutexes between the pagefault handler and reset.
 *
 * See igt/gem_mmap_gtt/hang
 */
#define __do_reset(fn, arg) stop_machine(fn, arg, NULL)
#else
#define __do_reset(fn, arg) fn(arg)
#endif

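/*
 * Run the low-level reset (under stop_machine() when
 * RESET_UNDER_STOP_MACHINE is enabled), retrying up to RESET_MAX_RETRIES
 * times with a 100ms back-off between attempts.
 */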
static int do_reset(struct drm_i915_private *i915, unsigned int stalled_mask)
{
	struct __i915_reset arg = { i915, stalled_mask };
	int err, i;

	err = __do_reset(__i915_reset__BKL, &arg);
	for (i = 0; err && i < RESET_MAX_RETRIES; i++) {
		msleep(100);
		err = __do_reset(__i915_reset__BKL, &arg);
	}

	return err;
}

/**
 * i915_reset - reset chip after a hang
 * @i915: #drm_i915_private to reset
 * @stalled_mask: mask of the stalled engines with the guilty requests
 * @reason: user error message for why we are resetting
 *
 * Reset the chip. Useful if a hang is detected. Marks the device as wedged
 * on failure.
 *
 * Caller must hold the struct_mutex.
 *
 * Procedure is fairly simple:
 *  - reset the chip using the reset reg
 *  - re-init context state
 *  - re-init hardware status page
 *  - re-init ring buffer
 *  - re-init interrupt state
 *  - re-init display
 */
void i915_reset(struct drm_i915_private *i915,
		unsigned int stalled_mask,
		const char *reason)
{
	struct i915_gpu_error *error = &i915->gpu_error;
	int ret;

	GEM_TRACE("flags=%lx\n", error->flags);

	might_sleep();
	assert_rpm_wakelock_held(i915);
	GEM_BUG_ON(!test_bit(I915_RESET_BACKOFF, &error->flags));

	/* Clear any previous failed attempts at recovery. Time to try again. */
	if (!i915_gem_unset_wedged(i915))
		return;

	if (reason)
		dev_notice(i915->drm.dev, "Resetting chip for %s\n", reason);
	error->reset_count++;

	reset_prepare(i915);

	if (!intel_has_gpu_reset(i915)) {
		if (i915_modparams.reset)
			dev_err(i915->drm.dev, "GPU reset not supported\n");
		else
			DRM_DEBUG_DRIVER("GPU reset disabled\n");
		goto error;
	}

	if (do_reset(i915, stalled_mask)) {
		dev_err(i915->drm.dev, "Failed to reset chip\n");
		goto taint;
	}

	intel_overlay_reset(i915);

	/*
	 * Next we need to restore the context, but we don't use those
	 * yet either...
	 *
	 * Ring buffer needs to be re-initialized in the KMS case, or if X
	 * was running at the time of the reset (i.e. we weren't VT
	 * switched away).
	 */
	ret = i915_gem_init_hw(i915);
	if (ret) {
		DRM_ERROR("Failed to initialise HW following reset (%d)\n",
			  ret);
		goto error;
	}

	i915_queue_hangcheck(i915);

finish:
	reset_finish(i915);
	if (!i915_terminally_wedged(error))
		reset_restart(i915);
	return;

taint:
	/*
	 * History tells us that if we cannot reset the GPU now, we
	 * never will. This then impacts everything that is run
	 * subsequently. On failing the reset, we mark the driver
	 * as wedged, preventing further execution on the GPU.
	 * We also want to go one step further and add a taint to the
	 * kernel so that any subsequent faults can be traced back to
	 * this failure. This is important for CI, where if the
	 * GPU/driver fails we would like to reboot and restart testing
	 * rather than continue on into oblivion. For everyone else,
	 * the system should still plod along, but they have been warned!
	 */
	add_taint(TAINT_WARN, LOCKDEP_STILL_OK);
error:
	i915_gem_set_wedged(i915);
	goto finish;
}

static inline int intel_gt_reset_engine(struct drm_i915_private *i915,
					struct intel_engine_cs *engine)
{
	return intel_gpu_reset(i915, intel_engine_flag(engine));
}

/**
 * i915_reset_engine - reset GPU engine to recover from a hang
 * @engine: engine to reset
 * @msg: reason for GPU reset; or NULL for no dev_notice()
 *
 * Reset a specific GPU engine. Useful if a hang is detected.
 * Returns zero on successful reset or otherwise an error code.
 *
 * Procedure is:
 *  - identify the request that caused the hang and drop it
 *  - reset engine (which will force the engine to idle)
 *  - re-init/configure engine
 */
int i915_reset_engine(struct intel_engine_cs *engine, const char *msg)
{
	struct i915_gpu_error *error = &engine->i915->gpu_error;
	int ret;

	GEM_TRACE("%s flags=%lx\n", engine->name, error->flags);
	GEM_BUG_ON(!test_bit(I915_RESET_ENGINE + engine->id, &error->flags));

	reset_prepare_engine(engine);

	if (msg)
		dev_notice(engine->i915->drm.dev,
			   "Resetting %s for %s\n", engine->name, msg);
	error->reset_engine_count[engine->id]++;

	if (!engine->i915->guc.execbuf_client)
		ret = intel_gt_reset_engine(engine->i915, engine);
	else
		ret = intel_guc_reset_engine(&engine->i915->guc, engine);
	if (ret) {
		/* If we fail here, we expect to fallback to a global reset */
		DRM_DEBUG_DRIVER("%sFailed to reset %s, ret=%d\n",
				 engine->i915->guc.execbuf_client ? "GuC " : "",
				 engine->name, ret);
		goto out;
	}

	/*
	 * The request that caused the hang is stuck on elsp, we know the
	 * active request and can drop it, adjust head to skip the offending
	 * request to resume executing remaining requests in the queue.
	 */
	intel_engine_reset(engine, true);

	/*
	 * The engine and its registers (and workarounds in case of render)
	 * have been reset to their default values. Follow the init_ring
	 * process to program RING_MODE, HWSP and re-enable submission.
	 */
	ret = engine->init_hw(engine);
	if (ret)
		goto out;

out:
	intel_engine_cancel_stop_cs(engine);
	reset_finish_engine(engine);
	return ret;
}

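/*
 * Full-device reset: notify userspace via uevents before and after,
 * bracket the reset with intel_prepare_reset()/intel_finish_reset() for
 * the display, and guard the whole sequence with a watchdog that wedges
 * the device if recovery has not completed within 5 seconds.
 */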
static void i915_reset_device(struct drm_i915_private *i915,
			      u32 engine_mask,
			      const char *reason)
{
	struct i915_gpu_error *error = &i915->gpu_error;
	struct kobject *kobj = &i915->drm.primary->kdev->kobj;
	char *error_event[] = { I915_ERROR_UEVENT "=1", NULL };
	char *reset_event[] = { I915_RESET_UEVENT "=1", NULL };
	char *reset_done_event[] = { I915_ERROR_UEVENT "=0", NULL };
	struct i915_wedge_me w;

	kobject_uevent_env(kobj, KOBJ_CHANGE, error_event);

	DRM_DEBUG_DRIVER("resetting chip\n");
	kobject_uevent_env(kobj, KOBJ_CHANGE, reset_event);

	/* Use a watchdog to ensure that our reset completes */
	i915_wedge_on_timeout(&w, i915, 5 * HZ) {
		intel_prepare_reset(i915);

		i915_reset(i915, engine_mask, reason);

		intel_finish_reset(i915);
	}

	if (!test_bit(I915_WEDGED, &error->flags))
		kobject_uevent_env(kobj, KOBJ_CHANGE, reset_done_event);
}

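/*
 * Clear stale error state (page-table errors, instruction faults, ring
 * fault registers) so that the next error interrupt reports fresh
 * information rather than leftovers from the hang we just handled.
 */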
void i915_clear_error_registers(struct drm_i915_private *dev_priv)
{
	u32 eir;

	if (!IS_GEN(dev_priv, 2))
		I915_WRITE(PGTBL_ER, I915_READ(PGTBL_ER));

	if (INTEL_GEN(dev_priv) < 4)
		I915_WRITE(IPEIR, I915_READ(IPEIR));
	else
		I915_WRITE(IPEIR_I965, I915_READ(IPEIR_I965));

	I915_WRITE(EIR, I915_READ(EIR));
	eir = I915_READ(EIR);
	if (eir) {
		/*
		 * some errors might have become stuck,
		 * mask them.
		 */
		DRM_DEBUG_DRIVER("EIR stuck: 0x%08x, masking\n", eir);
		I915_WRITE(EMR, I915_READ(EMR) | eir);
		I915_WRITE(IIR, I915_MASTER_ERROR_INTERRUPT);
	}

	if (INTEL_GEN(dev_priv) >= 8) {
		I915_WRITE(GEN8_RING_FAULT_REG,
			   I915_READ(GEN8_RING_FAULT_REG) & ~RING_FAULT_VALID);
		POSTING_READ(GEN8_RING_FAULT_REG);
	} else if (INTEL_GEN(dev_priv) >= 6) {
		struct intel_engine_cs *engine;
		enum intel_engine_id id;

		for_each_engine(engine, dev_priv, id) {
			I915_WRITE(RING_FAULT_REG(engine),
				   I915_READ(RING_FAULT_REG(engine)) &
				   ~RING_FAULT_VALID);
		}
		POSTING_READ(RING_FAULT_REG(dev_priv->engine[RCS]));
	}
}

/**
 * i915_handle_error - handle a gpu error
 * @i915: i915 device private
 * @engine_mask: mask representing engines that are hung
 * @flags: control flags
 * @fmt: Error message format string
 *
 * Do some basic checking of register state at error time and
 * dump it to the syslog. Also call i915_capture_error_state() to make
 * sure we get a record and make it available in debugfs. Fire a uevent
 * so userspace knows something bad happened (should trigger collection
 * of a ring dump etc.).
 */
void i915_handle_error(struct drm_i915_private *i915,
		       u32 engine_mask,
		       unsigned long flags,
		       const char *fmt, ...)
{
	struct intel_engine_cs *engine;
	intel_wakeref_t wakeref;
	unsigned int tmp;
	char error_msg[80];
	char *msg = NULL;

	if (fmt) {
		va_list args;

		va_start(args, fmt);
		vscnprintf(error_msg, sizeof(error_msg), fmt, args);
		va_end(args);

		msg = error_msg;
	}

	/*
	 * In most cases it's guaranteed that we get here with an RPM
	 * reference held, for example because there is a pending GPU
	 * request that won't finish until the reset is done. This
	 * isn't the case at least when we get here by doing a
	 * simulated reset via debugfs, so get an RPM reference.
	 */
	wakeref = intel_runtime_pm_get(i915);

	engine_mask &= INTEL_INFO(i915)->ring_mask;

	if (flags & I915_ERROR_CAPTURE) {
		i915_capture_error_state(i915, engine_mask, msg);
		i915_clear_error_registers(i915);
	}

	/*
	 * Try engine reset when available. We fall back to full reset if
	 * single reset fails.
	 */
	if (intel_has_reset_engine(i915) &&
	    !i915_terminally_wedged(&i915->gpu_error)) {
		for_each_engine_masked(engine, i915, engine_mask, tmp) {
			BUILD_BUG_ON(I915_RESET_MODESET >= I915_RESET_ENGINE);
			if (test_and_set_bit(I915_RESET_ENGINE + engine->id,
					     &i915->gpu_error.flags))
				continue;

			if (i915_reset_engine(engine, msg) == 0)
				engine_mask &= ~intel_engine_flag(engine);

			clear_bit(I915_RESET_ENGINE + engine->id,
				  &i915->gpu_error.flags);
			wake_up_bit(&i915->gpu_error.flags,
				    I915_RESET_ENGINE + engine->id);
		}
	}

	if (!engine_mask)
		goto out;

	/* Full reset needs the mutex, stop any other user trying to do so. */
	if (test_and_set_bit(I915_RESET_BACKOFF, &i915->gpu_error.flags)) {
		wait_event(i915->gpu_error.reset_queue,
			   !test_bit(I915_RESET_BACKOFF,
				     &i915->gpu_error.flags));
		goto out;
	}

	/* Prevent any other reset-engine attempt. */
	for_each_engine(engine, i915, tmp) {
		while (test_and_set_bit(I915_RESET_ENGINE + engine->id,
					&i915->gpu_error.flags))
			wait_on_bit(&i915->gpu_error.flags,
				    I915_RESET_ENGINE + engine->id,
				    TASK_UNINTERRUPTIBLE);
	}

	i915_reset_device(i915, engine_mask, msg);

	for_each_engine(engine, i915, tmp) {
		clear_bit(I915_RESET_ENGINE + engine->id,
			  &i915->gpu_error.flags);
	}

	clear_bit(I915_RESET_BACKOFF, &i915->gpu_error.flags);
	wake_up_all(&i915->gpu_error.reset_queue);

out:
	intel_runtime_pm_put(i915, wakeref);
}

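/*
 * Flush the hangcheck worker and any queued reset work, then wait (under
 * struct_mutex) for the GPU to become idle again; returns true if the
 * device settled without error.
 */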
bool i915_reset_flush(struct drm_i915_private *i915)
{
	int err;

	cancel_delayed_work_sync(&i915->gpu_error.hangcheck_work);

	flush_workqueue(i915->wq);
	GEM_BUG_ON(READ_ONCE(i915->gpu_error.restart));

	mutex_lock(&i915->drm.struct_mutex);
	err = i915_gem_wait_for_idle(i915,
				     I915_WAIT_LOCKED |
				     I915_WAIT_FOR_IDLE_BOOST,
				     MAX_SCHEDULE_TIMEOUT);
	mutex_unlock(&i915->drm.struct_mutex);

	return !err;
}

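/*
 * i915_wedge_me() is the delayed-work handler behind i915_wedge_on_timeout():
 * if the guarded section has not completed (and cancelled the work via
 * __i915_fini_wedge()) before the timeout expires, give up and declare the
 * device wedged. See i915_reset_device() above for the typical usage:
 *
 *	i915_wedge_on_timeout(&w, i915, 5 * HZ) {
 *		... reset work that must complete in time ...
 *	}
 */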
static void i915_wedge_me(struct work_struct *work)
{
	struct i915_wedge_me *w = container_of(work, typeof(*w), work.work);

	dev_err(w->i915->drm.dev,
		"%s timed out, cancelling all in-flight rendering.\n",
		w->name);
	i915_gem_set_wedged(w->i915);
}

void __i915_init_wedge(struct i915_wedge_me *w,
		       struct drm_i915_private *i915,
		       long timeout,
		       const char *name)
{
	w->i915 = i915;
	w->name = name;

	INIT_DELAYED_WORK_ONSTACK(&w->work, i915_wedge_me);
	schedule_delayed_work(&w->work, timeout);
}

void __i915_fini_wedge(struct i915_wedge_me *w)
{
	cancel_delayed_work_sync(&w->work);
	destroy_delayed_work_on_stack(&w->work);
	w->i915 = NULL;
}