Commit | Line | Data |
---|---|---|
3ac168a7 MK |
1 | /* |
2 | * Copyright © 2016 Intel Corporation | |
3 | * | |
4 | * Permission is hereby granted, free of charge, to any person obtaining a | |
5 | * copy of this software and associated documentation files (the "Software"), | |
6 | * to deal in the Software without restriction, including without limitation | |
7 | * the rights to use, copy, modify, merge, publish, distribute, sublicense, | |
8 | * and/or sell copies of the Software, and to permit persons to whom the | |
9 | * Software is furnished to do so, subject to the following conditions: | |
10 | * | |
11 | * The above copyright notice and this permission notice (including the next | |
12 | * paragraph) shall be included in all copies or substantial portions of the | |
13 | * Software. | |
14 | * | |
15 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | |
16 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | |
17 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL | |
18 | * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | |
19 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING | |
20 | * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS | |
21 | * IN THE SOFTWARE. | |
22 | * | |
23 | */ | |
24 | ||
25 | #include "i915_drv.h" | |
26 | ||
27 | static bool | |
28 | ipehr_is_semaphore_wait(struct intel_engine_cs *engine, u32 ipehr) | |
29 | { | |
79e6770c CW |
30 | ipehr &= ~MI_SEMAPHORE_SYNC_MASK; |
31 | return ipehr == (MI_SEMAPHORE_MBOX | MI_SEMAPHORE_COMPARE | | |
32 | MI_SEMAPHORE_REGISTER); | |
3ac168a7 MK |
33 | } |
34 | ||
35 | static struct intel_engine_cs * | |
36 | semaphore_wait_to_signaller_ring(struct intel_engine_cs *engine, u32 ipehr, | |
37 | u64 offset) | |
38 | { | |
39 | struct drm_i915_private *dev_priv = engine->i915; | |
79e6770c | 40 | u32 sync_bits = ipehr & MI_SEMAPHORE_SYNC_MASK; |
3ac168a7 MK |
41 | struct intel_engine_cs *signaller; |
42 | enum intel_engine_id id; | |
43 | ||
79e6770c CW |
44 | for_each_engine(signaller, dev_priv, id) { |
45 | if (engine == signaller) | |
46 | continue; | |
3ac168a7 | 47 | |
79e6770c CW |
48 | if (sync_bits == signaller->semaphore.mbox.wait[engine->hw_id]) |
49 | return signaller; | |
3ac168a7 MK |
50 | } |
51 | ||
79e6770c CW |
52 | DRM_DEBUG_DRIVER("No signaller ring found for %s, ipehr 0x%08x\n", |
53 | engine->name, ipehr); | |
3ac168a7 MK |
54 | |
55 | return ERR_PTR(-ENODEV); | |
56 | } | |
57 | ||
/*
 * semaphore_waits_for - identify the engine a semaphore wait is blocked on
 * @engine: engine suspected to be stalled on a MI_SEMAPHORE_MBOX wait
 * @seqno: out-parameter, set to the awaited seqno plus one
 *
 * Scans backwards from HEAD in @engine's ringbuffer for the semaphore wait
 * command reported in IPEHR and resolves which engine should signal it.
 * Returns NULL if no semaphore wait is in progress (or the command cannot
 * be located), or ERR_PTR(-ENODEV) when no matching signaller exists.
 */
static struct intel_engine_cs *
semaphore_waits_for(struct intel_engine_cs *engine, u32 *seqno)
{
	struct drm_i915_private *dev_priv = engine->i915;
	void __iomem *vaddr;
	u32 cmd, ipehr, head;
	u64 offset = 0;
	int i, backwards;

	/*
	 * This function does not support execlist mode - any attempt to
	 * proceed further into this function will result in a kernel panic
	 * when dereferencing ring->buffer, which is not set up in execlist
	 * mode.
	 *
	 * The correct way of doing it would be to derive the currently
	 * executing ring buffer from the current context, which is derived
	 * from the currently running request. Unfortunately, to get the
	 * current request we would have to grab the struct_mutex before doing
	 * anything else, which would be ill-advised since some other thread
	 * might have grabbed it already and managed to hang itself, causing
	 * the hang checker to deadlock.
	 *
	 * Therefore, this function does not support execlist mode in its
	 * current form. Just return NULL and move on.
	 */
	if (engine->buffer == NULL)
		return NULL;

	ipehr = I915_READ(RING_IPEHR(engine->mmio_base));
	if (!ipehr_is_semaphore_wait(engine, ipehr))
		return NULL;

	/*
	 * HEAD is likely pointing to the dword after the actual command,
	 * so scan backwards until we find the MBOX. But limit it to just 3
	 * or 4 dwords depending on the semaphore wait command size.
	 * Note that we don't care about ACTHD here since that might
	 * point at a batch, and semaphores are always emitted into the
	 * ringbuffer itself.
	 */
	head = I915_READ_HEAD(engine) & HEAD_ADDR;
	backwards = (INTEL_GEN(dev_priv) >= 8) ? 5 : 4;
	vaddr = (void __iomem *)engine->buffer->vaddr;

	for (i = backwards; i; --i) {
		/*
		 * Be paranoid and presume the hw has gone off into the wild -
		 * our ring is smaller than what the hardware (and hence
		 * HEAD_ADDR) allows. Also handles wrap-around.
		 */
		head &= engine->buffer->size - 1;

		/* This here seems to blow up */
		cmd = ioread32(vaddr + head);
		if (cmd == ipehr)
			break;

		head -= 4;
	}

	/* Loop exhausted without finding the MBOX command */
	if (!i)
		return NULL;

	/* The awaited seqno is the dword immediately after the command */
	*seqno = ioread32(vaddr + head + 4) + 1;
	return semaphore_wait_to_signaller_ring(engine, ipehr, offset);
}
125 | ||
/*
 * Check whether the semaphore @engine is waiting on has already been
 * signalled. Returns 1 when the wait has passed (the engine may be
 * kicked), 0 when the wait is still legitimate, and -1 when no signaller
 * can be found or a deadlock is suspected.
 */
static int semaphore_passed(struct intel_engine_cs *engine)
{
	struct drm_i915_private *dev_priv = engine->i915;
	struct intel_engine_cs *signaller;
	u32 seqno;

	/* Recursion guard; reset each pass by semaphore_clear_deadlocks() */
	engine->hangcheck.deadlock++;

	signaller = semaphore_waits_for(engine, &seqno);
	if (signaller == NULL)
		return -1;

	/* ERR_PTR: a wait was decoded but no engine matched the selector */
	if (IS_ERR(signaller))
		return 0;

	/* Prevent pathological recursion due to driver bugs */
	if (signaller->hangcheck.deadlock >= I915_NUM_ENGINES)
		return -1;

	if (i915_seqno_passed(intel_engine_get_seqno(signaller), seqno))
		return 1;

	/* cursory check for an unkickable deadlock */
	if (I915_READ_CTL(signaller) & RING_WAIT_SEMAPHORE &&
	    semaphore_passed(signaller) < 0)
		return -1;

	return 0;
}
155 | ||
156 | static void semaphore_clear_deadlocks(struct drm_i915_private *dev_priv) | |
157 | { | |
158 | struct intel_engine_cs *engine; | |
159 | enum intel_engine_id id; | |
160 | ||
161 | for_each_engine(engine, dev_priv, id) | |
162 | engine->hangcheck.deadlock = 0; | |
163 | } | |
164 | ||
165 | static bool instdone_unchanged(u32 current_instdone, u32 *old_instdone) | |
166 | { | |
167 | u32 tmp = current_instdone | *old_instdone; | |
168 | bool unchanged; | |
169 | ||
170 | unchanged = tmp == *old_instdone; | |
171 | *old_instdone |= tmp; | |
172 | ||
173 | return unchanged; | |
174 | } | |
175 | ||
/*
 * Check whether the render engine's subunits have made any progress since
 * the previous sample. Non-render engines expose no instdone breakdown and
 * are reported as "stuck" so the caller falls back to other signals.
 */
static bool subunits_stuck(struct intel_engine_cs *engine)
{
	struct drm_i915_private *dev_priv = engine->i915;
	struct intel_instdone instdone;
	struct intel_instdone *accu_instdone = &engine->hangcheck.instdone;
	bool stuck;
	int slice;
	int subslice;

	if (engine->id != RCS)
		return true;

	intel_engine_get_instdone(engine, &instdone);

	/* There might be unstable subunit states even when
	 * actual head is not moving. Filter out the unstable ones by
	 * accumulating the undone -> done transitions and only
	 * consider those as progress.
	 */
	stuck = instdone_unchanged(instdone.instdone,
				   &accu_instdone->instdone);
	stuck &= instdone_unchanged(instdone.slice_common,
				    &accu_instdone->slice_common);

	/* Every sampler/row unit must show no new bits for a stuck verdict */
	for_each_instdone_slice_subslice(dev_priv, slice, subslice) {
		stuck &= instdone_unchanged(instdone.sampler[slice][subslice],
					    &accu_instdone->sampler[slice][subslice]);
		stuck &= instdone_unchanged(instdone.row[slice][subslice],
					    &accu_instdone->row[slice][subslice]);
	}

	return stuck;
}
209 | ||
210 | static enum intel_engine_hangcheck_action | |
211 | head_stuck(struct intel_engine_cs *engine, u64 acthd) | |
212 | { | |
213 | if (acthd != engine->hangcheck.acthd) { | |
214 | ||
215 | /* Clear subunit states on head movement */ | |
216 | memset(&engine->hangcheck.instdone, 0, | |
217 | sizeof(engine->hangcheck.instdone)); | |
218 | ||
3fe3b030 | 219 | return ENGINE_ACTIVE_HEAD; |
3ac168a7 MK |
220 | } |
221 | ||
222 | if (!subunits_stuck(engine)) | |
3fe3b030 | 223 | return ENGINE_ACTIVE_SUBUNITS; |
3ac168a7 | 224 | |
3fe3b030 | 225 | return ENGINE_DEAD; |
3ac168a7 MK |
226 | } |
227 | ||
/*
 * Diagnose why @engine appears hung, given an unchanged ACTHD. Attempts to
 * distinguish kickable waits (WAIT_FOR_EVENT, already-signalled semaphores)
 * from a genuinely dead engine, kicking the ring where possible.
 */
static enum intel_engine_hangcheck_action
engine_stuck(struct intel_engine_cs *engine, u64 acthd)
{
	struct drm_i915_private *dev_priv = engine->i915;
	enum intel_engine_hangcheck_action ha;
	u32 tmp;

	ha = head_stuck(engine, acthd);
	if (ha != ENGINE_DEAD)
		return ha;

	if (IS_GEN2(dev_priv))
		return ENGINE_DEAD;

	/* Is the chip hanging on a WAIT_FOR_EVENT?
	 * If so we can simply poke the RB_WAIT bit
	 * and break the hang. This should work on
	 * all but the second generation chipsets.
	 */
	tmp = I915_READ_CTL(engine);
	if (tmp & RING_WAIT) {
		i915_handle_error(dev_priv, 0,
				  "Kicking stuck wait on %s",
				  engine->name);
		I915_WRITE_CTL(engine, tmp);
		return ENGINE_WAIT_KICK;
	}

	if (IS_GEN(dev_priv, 6, 7) && tmp & RING_WAIT_SEMAPHORE) {
		switch (semaphore_passed(engine)) {
		default:
			/* -1: deadlock or unknown signaller - unrecoverable */
			return ENGINE_DEAD;
		case 1:
			/* Semaphore already signalled: kick the engine along */
			i915_handle_error(dev_priv, 0,
					  "Kicking stuck semaphore on %s",
					  engine->name);
			I915_WRITE_CTL(engine, tmp);
			return ENGINE_WAIT_KICK;
		case 0:
			/* Legitimate wait on another engine's signal */
			return ENGINE_WAIT;
		}
	}

	return ENGINE_DEAD;
}
273 | ||
6e16d028 MK |
/* Take a fresh snapshot of @engine's active head and seqno into @hc. */
static void hangcheck_load_sample(struct intel_engine_cs *engine,
				  struct intel_engine_hangcheck *hc)
{
	/* We don't strictly need an irq-barrier here, as we are not
	 * serving an interrupt request, be paranoid in case the
	 * barrier has side-effects (such as preventing a broken
	 * cacheline snoop) and so be sure that we can see the seqno
	 * advance. If the seqno should stick, due to a stale
	 * cacheline, we would erroneously declare the GPU hung.
	 */
	if (engine->irq_seqno_barrier)
		engine->irq_seqno_barrier(engine);

	hc->acthd = intel_engine_get_active_head(engine);
	hc->seqno = intel_engine_get_seqno(engine);
}
290 | ||
291 | static void hangcheck_store_sample(struct intel_engine_cs *engine, | |
292 | const struct intel_engine_hangcheck *hc) | |
293 | { | |
294 | engine->hangcheck.acthd = hc->acthd; | |
295 | engine->hangcheck.seqno = hc->seqno; | |
6e16d028 | 296 | engine->hangcheck.action = hc->action; |
3fe3b030 | 297 | engine->hangcheck.stalled = hc->stalled; |
6e16d028 MK |
298 | } |
299 | ||
300 | static enum intel_engine_hangcheck_action | |
301 | hangcheck_get_action(struct intel_engine_cs *engine, | |
302 | const struct intel_engine_hangcheck *hc) | |
303 | { | |
304 | if (engine->hangcheck.seqno != hc->seqno) | |
3fe3b030 | 305 | return ENGINE_ACTIVE_SEQNO; |
6e16d028 | 306 | |
5cce5e31 | 307 | if (intel_engine_is_idle(engine)) |
3fe3b030 | 308 | return ENGINE_IDLE; |
6e16d028 MK |
309 | |
310 | return engine_stuck(engine, hc->acthd); | |
311 | } | |
312 | ||
/*
 * Classify this sample and decide whether @engine has stalled: an engine
 * is declared stalled once its action timestamp has not been refreshed
 * for longer than the timeout appropriate to the detected action.
 */
static void hangcheck_accumulate_sample(struct intel_engine_cs *engine,
					struct intel_engine_hangcheck *hc)
{
	unsigned long timeout = I915_ENGINE_DEAD_TIMEOUT;

	hc->action = hangcheck_get_action(engine, hc);

	/* The action timestamp is refreshed on seqno movement, idling and
	 * waits (including successful kicks), so that no single request can
	 * run indefinitely (such as a chain of batches). An engine whose
	 * seqno is stuck but whose head or subunits are still active is not
	 * refreshed; instead it gets a longer leeway before being declared
	 * dead, and a fully dead engine gets no leeway at all.
	 */

	switch (hc->action) {
	case ENGINE_IDLE:
	case ENGINE_ACTIVE_SEQNO:
		/* Clear head and subunit states on seqno movement */
		hc->acthd = 0;

		memset(&engine->hangcheck.instdone, 0,
		       sizeof(engine->hangcheck.instdone));

		/* Intentional fall through */
	case ENGINE_WAIT_KICK:
	case ENGINE_WAIT:
		engine->hangcheck.action_timestamp = jiffies;
		break;

	case ENGINE_ACTIVE_HEAD:
	case ENGINE_ACTIVE_SUBUNITS:
		/*
		 * Seqno stuck with still active engine gets leeway,
		 * in hopes that it is just a long shader.
		 */
		timeout = I915_SEQNO_DEAD_TIMEOUT;
		break;

	case ENGINE_DEAD:
		/* Dump the engine state for debugging before declaring death */
		if (drm_debug & DRM_UT_DRIVER) {
			struct drm_printer p = drm_debug_printer("hangcheck");
			intel_engine_dump(engine, &p, "%s", engine->name);
		}
		break;

	default:
		MISSING_CASE(hc->action);
	}

	hc->stalled = time_after(jiffies,
				 engine->hangcheck.action_timestamp + timeout);
}
373 | ||
374 | static void hangcheck_declare_hang(struct drm_i915_private *i915, | |
375 | unsigned int hung, | |
376 | unsigned int stuck) | |
377 | { | |
378 | struct intel_engine_cs *engine; | |
379 | char msg[80]; | |
380 | unsigned int tmp; | |
381 | int len; | |
382 | ||
383 | /* If some rings hung but others were still busy, only | |
384 | * blame the hanging rings in the synopsis. | |
385 | */ | |
386 | if (stuck != hung) | |
387 | hung &= ~stuck; | |
388 | len = scnprintf(msg, sizeof(msg), | |
389 | "%s on ", stuck == hung ? "No progress" : "Hang"); | |
390 | for_each_engine_masked(engine, i915, hung, tmp) | |
391 | len += scnprintf(msg + len, sizeof(msg) - len, | |
392 | "%s, ", engine->name); | |
393 | msg[len-2] = '\0'; | |
394 | ||
916a491d | 395 | return i915_handle_error(i915, hung, "%s", msg); |
6e16d028 MK |
396 | } |
397 | ||
3ac168a7 MK |
/*
 * This is called when the chip hasn't reported back with completed
 * batchbuffers in a long time. We keep track per engine of seqno and
 * ACTHD progress; an engine that makes no progress for longer than its
 * timeout is declared hung and reported via hangcheck_declare_hang(),
 * which triggers error handling/reset. Kickable waits are poked along
 * the way by engine_stuck().
 */
static void i915_hangcheck_elapsed(struct work_struct *work)
{
	struct drm_i915_private *dev_priv =
		container_of(work, typeof(*dev_priv),
			     gpu_error.hangcheck_work.work);
	struct intel_engine_cs *engine;
	enum intel_engine_id id;
	unsigned int hung = 0, stuck = 0;
	int busy_count = 0;

	if (!i915_modparams.enable_hangcheck)
		return;

	if (!READ_ONCE(dev_priv->gt.awake))
		return;

	/* Nothing more to do once the GPU is terminally wedged */
	if (i915_terminally_wedged(&dev_priv->gpu_error))
		return;

	/* As enabling the GPU requires fairly extensive mmio access,
	 * periodically arm the mmio checker to see if we are triggering
	 * any invalid access.
	 */
	intel_uncore_arm_unclaimed_mmio_detection(dev_priv);

	for_each_engine(engine, dev_priv, id) {
		struct intel_engine_hangcheck cur_state, *hc = &cur_state;
		const bool busy = intel_engine_has_waiter(engine);

		semaphore_clear_deadlocks(dev_priv);

		hangcheck_load_sample(engine, hc);
		hangcheck_accumulate_sample(engine, hc);
		hangcheck_store_sample(engine, hc);

		if (engine->hangcheck.stalled) {
			hung |= intel_engine_flag(engine);
			/* "stuck": stalled but head/subunits still active */
			if (hc->action != ENGINE_DEAD)
				stuck |= intel_engine_flag(engine);
		}

		busy_count += busy;
	}

	if (hung)
		hangcheck_declare_hang(dev_priv, hung, stuck);

	/* Reset timer in case GPU hangs without another request being added */
	if (busy_count)
		i915_queue_hangcheck(dev_priv);
}
457 | ||
458 | void intel_engine_init_hangcheck(struct intel_engine_cs *engine) | |
459 | { | |
460 | memset(&engine->hangcheck, 0, sizeof(engine->hangcheck)); | |
461 | } | |
462 | ||
/* Set up the delayed work that periodically runs i915_hangcheck_elapsed(). */
void intel_hangcheck_init(struct drm_i915_private *i915)
{
	INIT_DELAYED_WORK(&i915->gpu_error.hangcheck_work,
			  i915_hangcheck_elapsed);
}
496b575e CW |
468 | |
469 | #if IS_ENABLED(CONFIG_DRM_I915_SELFTEST) | |
470 | #include "selftests/intel_hangcheck.c" | |
471 | #endif |