/*
 * Copyright © 2016 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 *
 */

#include <linux/kthread.h>

#include "../i915_selftest.h"
#include "i915_random.h"

#include "mock_context.h"
#include "mock_drm.h"

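/*
 * The hang fixture: a private kernel context, a batch object that spins
 * forever by branching back to its own start, and a status page (hws)
 * into which each spinning request writes its seqno so we can observe
 * that the batch has actually started executing on the GPU.
 */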
struct hang {
	struct drm_i915_private *i915;
	struct drm_i915_gem_object *hws;
	struct drm_i915_gem_object *obj;
	struct i915_gem_context *ctx;
	u32 *seqno;
	u32 *batch;
};

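/*
 * hang_init() creates the private context and both backing objects,
 * mapping the status page cacheable (written by the GPU, read by the
 * CPU) and the batch write-combined on !llc platforms so that our CPU
 * writes to the batch are visible to the GPU.
 */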
static int hang_init(struct hang *h, struct drm_i915_private *i915)
{
	void *vaddr;
	int err;

	memset(h, 0, sizeof(*h));
	h->i915 = i915;

	h->ctx = kernel_context(i915);
	if (IS_ERR(h->ctx))
		return PTR_ERR(h->ctx);

	h->hws = i915_gem_object_create_internal(i915, PAGE_SIZE);
	if (IS_ERR(h->hws)) {
		err = PTR_ERR(h->hws);
		goto err_ctx;
	}

	h->obj = i915_gem_object_create_internal(i915, PAGE_SIZE);
	if (IS_ERR(h->obj)) {
		err = PTR_ERR(h->obj);
		goto err_hws;
	}

	i915_gem_object_set_cache_level(h->hws, I915_CACHE_LLC);
	vaddr = i915_gem_object_pin_map(h->hws, I915_MAP_WB);
	if (IS_ERR(vaddr)) {
		err = PTR_ERR(vaddr);
		goto err_obj;
	}
	h->seqno = memset(vaddr, 0xff, PAGE_SIZE);

	vaddr = i915_gem_object_pin_map(h->obj,
					HAS_LLC(i915) ? I915_MAP_WB : I915_MAP_WC);
	if (IS_ERR(vaddr)) {
		err = PTR_ERR(vaddr);
		goto err_unpin_hws;
	}
	h->batch = vaddr;

	return 0;

err_unpin_hws:
	i915_gem_object_unpin_map(h->hws);
err_obj:
	i915_gem_object_put(h->obj);
err_hws:
	i915_gem_object_put(h->hws);
err_ctx:
	kernel_context_close(h->ctx);
	return err;
}

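/*
 * Each request writes its seqno to a per-context slot of the status
 * page, indexed by its fence context, so that requests on different
 * timelines do not overwrite each other's breadcrumbs.
 */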
static u64 hws_address(const struct i915_vma *hws,
		       const struct i915_request *rq)
{
	return hws->node.start + offset_in_page(sizeof(u32) * rq->fence.context);
}

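/*
 * Build the hanging payload for each generation: store the request's
 * seqno to the status page (so wait_until_running() can see the batch
 * begin), insert arbitration points around a zeroed spacer, then branch
 * back to the start of the batch. The GPU spins in this loop until the
 * batch is overwritten with MI_BATCH_BUFFER_END or the engine is reset.
 * On gen5 and earlier, MI_STORE_DWORD_IMM is a privileged command,
 * hence the secure dispatch selected at the bottom.
 */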
static int emit_recurse_batch(struct hang *h,
			      struct i915_request *rq)
{
	struct drm_i915_private *i915 = h->i915;
	struct i915_address_space *vm =
		rq->ctx->ppgtt ? &rq->ctx->ppgtt->base : &i915->ggtt.base;
	struct i915_vma *hws, *vma;
	unsigned int flags;
	u32 *batch;
	int err;

	vma = i915_vma_instance(h->obj, vm, NULL);
	if (IS_ERR(vma))
		return PTR_ERR(vma);

	hws = i915_vma_instance(h->hws, vm, NULL);
	if (IS_ERR(hws))
		return PTR_ERR(hws);

	err = i915_vma_pin(vma, 0, 0, PIN_USER);
	if (err)
		return err;

	err = i915_vma_pin(hws, 0, 0, PIN_USER);
	if (err)
		goto unpin_vma;

	i915_vma_move_to_active(vma, rq, 0);
	if (!i915_gem_object_has_active_reference(vma->obj)) {
		i915_gem_object_get(vma->obj);
		i915_gem_object_set_active_reference(vma->obj);
	}

	i915_vma_move_to_active(hws, rq, 0);
	if (!i915_gem_object_has_active_reference(hws->obj)) {
		i915_gem_object_get(hws->obj);
		i915_gem_object_set_active_reference(hws->obj);
	}

	batch = h->batch;
	if (INTEL_GEN(i915) >= 8) {
		*batch++ = MI_STORE_DWORD_IMM_GEN4;
		*batch++ = lower_32_bits(hws_address(hws, rq));
		*batch++ = upper_32_bits(hws_address(hws, rq));
		*batch++ = rq->fence.seqno;
		*batch++ = MI_ARB_CHECK;

		memset(batch, 0, 1024);
		batch += 1024 / sizeof(*batch);

		*batch++ = MI_ARB_CHECK;
		*batch++ = MI_BATCH_BUFFER_START | 1 << 8 | 1;
		*batch++ = lower_32_bits(vma->node.start);
		*batch++ = upper_32_bits(vma->node.start);
	} else if (INTEL_GEN(i915) >= 6) {
		*batch++ = MI_STORE_DWORD_IMM_GEN4;
		*batch++ = 0;
		*batch++ = lower_32_bits(hws_address(hws, rq));
		*batch++ = rq->fence.seqno;
		*batch++ = MI_ARB_CHECK;

		memset(batch, 0, 1024);
		batch += 1024 / sizeof(*batch);

		*batch++ = MI_ARB_CHECK;
		*batch++ = MI_BATCH_BUFFER_START | 1 << 8;
		*batch++ = lower_32_bits(vma->node.start);
	} else if (INTEL_GEN(i915) >= 4) {
		*batch++ = MI_STORE_DWORD_IMM_GEN4 | 1 << 22;
		*batch++ = 0;
		*batch++ = lower_32_bits(hws_address(hws, rq));
		*batch++ = rq->fence.seqno;
		*batch++ = MI_ARB_CHECK;

		memset(batch, 0, 1024);
		batch += 1024 / sizeof(*batch);

		*batch++ = MI_ARB_CHECK;
		*batch++ = MI_BATCH_BUFFER_START | 2 << 6;
		*batch++ = lower_32_bits(vma->node.start);
	} else {
		*batch++ = MI_STORE_DWORD_IMM;
		*batch++ = lower_32_bits(hws_address(hws, rq));
		*batch++ = rq->fence.seqno;
		*batch++ = MI_ARB_CHECK;

		memset(batch, 0, 1024);
		batch += 1024 / sizeof(*batch);

		*batch++ = MI_ARB_CHECK;
		*batch++ = MI_BATCH_BUFFER_START | 2 << 6 | 1;
		*batch++ = lower_32_bits(vma->node.start);
	}
	*batch++ = MI_BATCH_BUFFER_END; /* not reached */
	i915_gem_chipset_flush(h->i915);

	flags = 0;
	if (INTEL_GEN(vm->i915) <= 5)
		flags |= I915_DISPATCH_SECURE;

	err = rq->engine->emit_bb_start(rq, vma->node.start, PAGE_SIZE, flags);

	i915_vma_unpin(hws);
unpin_vma:
	i915_vma_unpin(vma);
	return err;
}

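/*
 * Allocate a request wrapping the recursing batch. If the previous
 * batch object is still busy on the GPU (i.e. a hang is in progress),
 * swap in a fresh object rather than rewrite instructions the GPU may
 * still be spinning inside.
 */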
static struct i915_request *
hang_create_request(struct hang *h, struct intel_engine_cs *engine)
{
	struct i915_request *rq;
	int err;

	if (i915_gem_object_is_active(h->obj)) {
		struct drm_i915_gem_object *obj;
		void *vaddr;

		obj = i915_gem_object_create_internal(h->i915, PAGE_SIZE);
		if (IS_ERR(obj))
			return ERR_CAST(obj);

		vaddr = i915_gem_object_pin_map(obj,
						HAS_LLC(h->i915) ? I915_MAP_WB : I915_MAP_WC);
		if (IS_ERR(vaddr)) {
			i915_gem_object_put(obj);
			return ERR_CAST(vaddr);
		}

		i915_gem_object_unpin_map(h->obj);
		i915_gem_object_put(h->obj);

		h->obj = obj;
		h->batch = vaddr;
	}

	rq = i915_request_alloc(engine, h->ctx);
	if (IS_ERR(rq))
		return rq;

	err = emit_recurse_batch(h, rq);
	if (err) {
		__i915_request_add(rq, false);
		return ERR_PTR(err);
	}

	return rq;
}

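/* Read back the last seqno the spinning batch wrote for this context. */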
static u32 hws_seqno(const struct hang *h, const struct i915_request *rq)
{
	return READ_ONCE(h->seqno[rq->fence.context % (PAGE_SIZE / sizeof(u32))]);
}

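/*
 * A watchdog for the selftest itself: wedge_me arms a delayed work that
 * declares the device wedged if the wrapped wait does not complete
 * within the timeout, converting a test that would otherwise hang the
 * machine into an -EIO failure with a GEM trace dump.
 */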
struct wedge_me {
	struct delayed_work work;
	struct drm_i915_private *i915;
	const void *symbol;
};

static void wedge_me(struct work_struct *work)
{
	struct wedge_me *w = container_of(work, typeof(*w), work.work);

	pr_err("%pS timed out, cancelling all further testing.\n", w->symbol);

	GEM_TRACE("%pS timed out.\n", w->symbol);
	GEM_TRACE_DUMP();

	i915_gem_set_wedged(w->i915);
}

static void __init_wedge(struct wedge_me *w,
			 struct drm_i915_private *i915,
			 long timeout,
			 const void *symbol)
{
	w->i915 = i915;
	w->symbol = symbol;

	INIT_DELAYED_WORK_ONSTACK(&w->work, wedge_me);
	schedule_delayed_work(&w->work, timeout);
}

static void __fini_wedge(struct wedge_me *w)
{
	cancel_delayed_work_sync(&w->work);
	destroy_delayed_work_on_stack(&w->work);
	w->i915 = NULL;
}

#define wedge_on_timeout(W, DEV, TIMEOUT) \
	for (__init_wedge((W), (DEV), (TIMEOUT), __builtin_return_address(0)); \
	     (W)->i915; \
	     __fini_wedge((W)))

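/*
 * Between subtests, flush outstanding work and wait for the GPU to
 * idle, wedging the device if it fails to settle within a second.
 */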
static noinline int
flush_test(struct drm_i915_private *i915, unsigned int flags)
{
	struct wedge_me w;

	cond_resched();

	wedge_on_timeout(&w, i915, HZ)
		i915_gem_wait_for_idle(i915, flags);

	return i915_terminally_wedged(&i915->gpu_error) ? -EIO : 0;
}

static void hang_fini(struct hang *h)
{
	*h->batch = MI_BATCH_BUFFER_END;
	i915_gem_chipset_flush(h->i915);

	i915_gem_object_unpin_map(h->obj);
	i915_gem_object_put(h->obj);

	i915_gem_object_unpin_map(h->hws);
	i915_gem_object_put(h->hws);

	kernel_context_close(h->ctx);

	flush_test(h->i915, I915_WAIT_LOCKED);
}

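/*
 * Poll the status page until the spinner reports its seqno: first a
 * short 10us busy-wait for the common case, then a sleeping wait of up
 * to a second before we declare the request failed to start.
 */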
static bool wait_until_running(struct hang *h, struct i915_request *rq)
{
	return !(wait_for_us(i915_seqno_passed(hws_seqno(h, rq),
					       rq->fence.seqno),
			     10) &&
		 wait_for(i915_seqno_passed(hws_seqno(h, rq),
					    rq->fence.seqno),
			  1000));
}

static int igt_hang_sanitycheck(void *arg)
{
	struct drm_i915_private *i915 = arg;
	struct i915_request *rq;
	struct intel_engine_cs *engine;
	enum intel_engine_id id;
	struct hang h;
	int err;

	/* Basic check that we can execute our hanging batch */

	mutex_lock(&i915->drm.struct_mutex);
	err = hang_init(&h, i915);
	if (err)
		goto unlock;

	for_each_engine(engine, i915, id) {
		long timeout;

		if (!intel_engine_can_store_dword(engine))
			continue;

		rq = hang_create_request(&h, engine);
		if (IS_ERR(rq)) {
			err = PTR_ERR(rq);
			pr_err("Failed to create request for %s, err=%d\n",
			       engine->name, err);
			goto fini;
		}

		i915_request_get(rq);

		*h.batch = MI_BATCH_BUFFER_END;
		i915_gem_chipset_flush(i915);

		__i915_request_add(rq, true);

		timeout = i915_request_wait(rq,
					    I915_WAIT_LOCKED,
					    MAX_SCHEDULE_TIMEOUT);
		i915_request_put(rq);

		if (timeout < 0) {
			err = timeout;
			pr_err("Wait for request failed on %s, err=%d\n",
			       engine->name, err);
			goto fini;
		}
	}

fini:
	hang_fini(&h);
unlock:
	mutex_unlock(&i915->drm.struct_mutex);
	return err;
}

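/*
 * Take the same locks as a real reset: claim I915_RESET_BACKOFF and
 * every per-engine reset bit so that hangcheck and any concurrent reset
 * paths stay out of our way while the test fakes its own resets.
 */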
static void global_reset_lock(struct drm_i915_private *i915)
{
	struct intel_engine_cs *engine;
	enum intel_engine_id id;

	pr_debug("%s: current gpu_error=%08lx\n",
		 __func__, i915->gpu_error.flags);

	while (test_and_set_bit(I915_RESET_BACKOFF, &i915->gpu_error.flags))
		wait_event(i915->gpu_error.reset_queue,
			   !test_bit(I915_RESET_BACKOFF,
				     &i915->gpu_error.flags));

	for_each_engine(engine, i915, id) {
		while (test_and_set_bit(I915_RESET_ENGINE + id,
					&i915->gpu_error.flags))
			wait_on_bit(&i915->gpu_error.flags,
				    I915_RESET_ENGINE + id,
				    TASK_UNINTERRUPTIBLE);
	}
}

static void global_reset_unlock(struct drm_i915_private *i915)
{
	struct intel_engine_cs *engine;
	enum intel_engine_id id;

	for_each_engine(engine, i915, id)
		clear_bit(I915_RESET_ENGINE + id, &i915->gpu_error.flags);

	clear_bit(I915_RESET_BACKOFF, &i915->gpu_error.flags);
	wake_up_all(&i915->gpu_error.reset_queue);
}

static int igt_global_reset(void *arg)
{
	struct drm_i915_private *i915 = arg;
	unsigned int reset_count;
	int err = 0;

	/* Check that we can issue a global GPU reset */

	global_reset_lock(i915);
	set_bit(I915_RESET_HANDOFF, &i915->gpu_error.flags);

	mutex_lock(&i915->drm.struct_mutex);
	reset_count = i915_reset_count(&i915->gpu_error);

	i915_reset(i915, ALL_ENGINES, NULL);

	if (i915_reset_count(&i915->gpu_error) == reset_count) {
		pr_err("No GPU reset recorded!\n");
		err = -EINVAL;
	}
	mutex_unlock(&i915->drm.struct_mutex);

	GEM_BUG_ON(test_bit(I915_RESET_HANDOFF, &i915->gpu_error.flags));
	global_reset_unlock(i915);

	if (i915_terminally_wedged(&i915->gpu_error))
		err = -EIO;

	return err;
}

static int __igt_reset_engine(struct drm_i915_private *i915, bool active)
{
	struct intel_engine_cs *engine;
	enum intel_engine_id id;
	struct hang h;
	int err = 0;

	/*
	 * Check that we can issue an engine reset on an idle engine (no-op),
	 * and, for active == true, while the engine is busy with a hanging
	 * batch.
	 */

	if (!intel_has_reset_engine(i915))
		return 0;

	if (active) {
		mutex_lock(&i915->drm.struct_mutex);
		err = hang_init(&h, i915);
		mutex_unlock(&i915->drm.struct_mutex);
		if (err)
			return err;
	}

	for_each_engine(engine, i915, id) {
		unsigned int reset_count, reset_engine_count;
		IGT_TIMEOUT(end_time);

		if (active && !intel_engine_can_store_dword(engine))
			continue;

		reset_count = i915_reset_count(&i915->gpu_error);
		reset_engine_count = i915_reset_engine_count(&i915->gpu_error,
							     engine);

		set_bit(I915_RESET_ENGINE + id, &i915->gpu_error.flags);
		do {
			u32 seqno = intel_engine_get_seqno(engine);

			if (active) {
				struct i915_request *rq;

				mutex_lock(&i915->drm.struct_mutex);
				rq = hang_create_request(&h, engine);
				if (IS_ERR(rq)) {
					err = PTR_ERR(rq);
					mutex_unlock(&i915->drm.struct_mutex);
					break;
				}

				i915_request_get(rq);
				__i915_request_add(rq, true);
				mutex_unlock(&i915->drm.struct_mutex);

				if (!wait_until_running(&h, rq)) {
					struct drm_printer p = drm_info_printer(i915->drm.dev);

					pr_err("%s: Failed to start request %x, at %x\n",
					       __func__, rq->fence.seqno, hws_seqno(&h, rq));
					intel_engine_dump(engine, &p,
							  "%s\n", engine->name);

					i915_request_put(rq);
					err = -EIO;
					break;
				}

				GEM_BUG_ON(!rq->global_seqno);
				seqno = rq->global_seqno - 1;
				i915_request_put(rq);
			}

			err = i915_reset_engine(engine, NULL);
			if (err) {
				pr_err("i915_reset_engine failed\n");
				break;
			}

			if (i915_reset_count(&i915->gpu_error) != reset_count) {
				pr_err("Full GPU reset recorded! (engine reset expected)\n");
				err = -EINVAL;
				break;
			}

			reset_engine_count += active;
			if (i915_reset_engine_count(&i915->gpu_error, engine) !=
			    reset_engine_count) {
				pr_err("%s engine reset %srecorded!\n",
				       engine->name, active ? "not " : "");
				err = -EINVAL;
				break;
			}
		} while (time_before(jiffies, end_time));
		clear_bit(I915_RESET_ENGINE + id, &i915->gpu_error.flags);

		if (err)
			break;

		err = flush_test(i915, 0);
		if (err)
			break;
	}

	if (i915_terminally_wedged(&i915->gpu_error))
		err = -EIO;

	if (active) {
		mutex_lock(&i915->drm.struct_mutex);
		hang_fini(&h);
		mutex_unlock(&i915->drm.struct_mutex);
	}

	return err;
}

static int igt_reset_idle_engine(void *arg)
{
	return __igt_reset_engine(arg, false);
}

static int igt_reset_active_engine(void *arg)
{
	return __igt_reset_engine(arg, true);
}

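/*
 * Background load for the engine-reset tests: each active_engine kthread
 * round-robins over a handful of contexts, keeping up to eight requests
 * in flight on its engine (optionally with randomised priorities) until
 * told to stop, and flags an error if any request fails or times out.
 */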
struct active_engine {
	struct task_struct *task;
	struct intel_engine_cs *engine;
	unsigned long resets;
	unsigned int flags;
};

#define TEST_ACTIVE BIT(0)
#define TEST_OTHERS BIT(1)
#define TEST_SELF BIT(2)
#define TEST_PRIORITY BIT(3)

static int active_engine(void *data)
{
	I915_RND_STATE(prng);
	struct active_engine *arg = data;
	struct intel_engine_cs *engine = arg->engine;
	struct i915_request *rq[8] = {};
	struct i915_gem_context *ctx[ARRAY_SIZE(rq)];
	struct drm_file *file;
	unsigned long count = 0;
	int err = 0;

	file = mock_file(engine->i915);
	if (IS_ERR(file))
		return PTR_ERR(file);

	for (count = 0; count < ARRAY_SIZE(ctx); count++) {
		mutex_lock(&engine->i915->drm.struct_mutex);
		ctx[count] = live_context(engine->i915, file);
		mutex_unlock(&engine->i915->drm.struct_mutex);
		if (IS_ERR(ctx[count])) {
			err = PTR_ERR(ctx[count]);
			while (--count)
				i915_gem_context_put(ctx[count]);
			goto err_file;
		}
	}

	while (!kthread_should_stop()) {
		unsigned int idx = count++ & (ARRAY_SIZE(rq) - 1);
		struct i915_request *old = rq[idx];
		struct i915_request *new;

		mutex_lock(&engine->i915->drm.struct_mutex);
		new = i915_request_alloc(engine, ctx[idx]);
		if (IS_ERR(new)) {
			mutex_unlock(&engine->i915->drm.struct_mutex);
			err = PTR_ERR(new);
			break;
		}

		if (arg->flags & TEST_PRIORITY)
			ctx[idx]->priority =
				i915_prandom_u32_max_state(512, &prng);

		rq[idx] = i915_request_get(new);
		i915_request_add(new);
		mutex_unlock(&engine->i915->drm.struct_mutex);

		if (old) {
			if (i915_request_wait(old, 0, HZ) < 0) {
				GEM_TRACE("%s timed out.\n", engine->name);
				GEM_TRACE_DUMP();

				i915_gem_set_wedged(engine->i915);
				i915_request_put(old);
				err = -EIO;
				break;
			}
			i915_request_put(old);
		}

		cond_resched();
	}

	for (count = 0; count < ARRAY_SIZE(rq); count++)
		i915_request_put(rq[count]);

err_file:
	mock_file_free(engine->i915, file);
	return err;
}

static int __igt_reset_engines(struct drm_i915_private *i915,
			       const char *test_name,
			       unsigned int flags)
{
	struct intel_engine_cs *engine, *other;
	enum intel_engine_id id, tmp;
	struct hang h;
	int err = 0;

	/* Check that issuing a reset on one engine does not interfere
	 * with any other engine.
	 */

	if (!intel_has_reset_engine(i915))
		return 0;

	if (flags & TEST_ACTIVE) {
		mutex_lock(&i915->drm.struct_mutex);
		err = hang_init(&h, i915);
		mutex_unlock(&i915->drm.struct_mutex);
		if (err)
			return err;

		if (flags & TEST_PRIORITY)
			h.ctx->priority = 1024;
	}

	for_each_engine(engine, i915, id) {
		struct active_engine threads[I915_NUM_ENGINES] = {};
		unsigned long global = i915_reset_count(&i915->gpu_error);
		unsigned long count = 0, reported;
		IGT_TIMEOUT(end_time);

		if (flags & TEST_ACTIVE &&
		    !intel_engine_can_store_dword(engine))
			continue;

		for_each_engine(other, i915, tmp) {
			struct task_struct *tsk;

			threads[tmp].resets =
				i915_reset_engine_count(&i915->gpu_error,
							other);

			if (!(flags & TEST_OTHERS))
				continue;

			if (other == engine && !(flags & TEST_SELF))
				continue;

			threads[tmp].engine = other;
			threads[tmp].flags = flags;

			tsk = kthread_run(active_engine, &threads[tmp],
					  "igt/%s", other->name);
			if (IS_ERR(tsk)) {
				err = PTR_ERR(tsk);
				goto unwind;
			}

			threads[tmp].task = tsk;
			get_task_struct(tsk);
		}

		set_bit(I915_RESET_ENGINE + id, &i915->gpu_error.flags);
		do {
			u32 seqno = intel_engine_get_seqno(engine);
			struct i915_request *rq = NULL;

			if (flags & TEST_ACTIVE) {
				mutex_lock(&i915->drm.struct_mutex);
				rq = hang_create_request(&h, engine);
				if (IS_ERR(rq)) {
					err = PTR_ERR(rq);
					mutex_unlock(&i915->drm.struct_mutex);
					break;
				}

				i915_request_get(rq);
				__i915_request_add(rq, true);
				mutex_unlock(&i915->drm.struct_mutex);

				if (!wait_until_running(&h, rq)) {
					struct drm_printer p = drm_info_printer(i915->drm.dev);

					pr_err("%s: Failed to start request %x, at %x\n",
					       __func__, rq->fence.seqno, hws_seqno(&h, rq));
					intel_engine_dump(engine, &p,
							  "%s\n", engine->name);

					i915_request_put(rq);
					err = -EIO;
					break;
				}

				GEM_BUG_ON(!rq->global_seqno);
				seqno = rq->global_seqno - 1;
			}

			err = i915_reset_engine(engine, NULL);
			if (err) {
				pr_err("i915_reset_engine(%s:%s): failed, err=%d\n",
				       engine->name, test_name, err);
				break;
			}

			count++;

			if (rq) {
				i915_request_wait(rq, 0, MAX_SCHEDULE_TIMEOUT);
				i915_request_put(rq);
			}
		} while (time_before(jiffies, end_time));
		clear_bit(I915_RESET_ENGINE + id, &i915->gpu_error.flags);
		pr_info("i915_reset_engine(%s:%s): %lu resets\n",
			engine->name, test_name, count);

		reported = i915_reset_engine_count(&i915->gpu_error, engine);
		reported -= threads[engine->id].resets;
		if (reported != (flags & TEST_ACTIVE ? count : 0)) {
			pr_err("i915_reset_engine(%s:%s): reset %lu times, but reported %lu, expected %lu reported\n",
			       engine->name, test_name, count, reported,
			       (flags & TEST_ACTIVE ? count : 0));
			if (!err)
				err = -EINVAL;
		}

unwind:
		for_each_engine(other, i915, tmp) {
			int ret;

			if (!threads[tmp].task)
				continue;

			ret = kthread_stop(threads[tmp].task);
			if (ret) {
				pr_err("kthread for other engine %s failed, err=%d\n",
				       other->name, ret);
				if (!err)
					err = ret;
			}
			put_task_struct(threads[tmp].task);

			if (other != engine &&
			    threads[tmp].resets !=
			    i915_reset_engine_count(&i915->gpu_error, other)) {
				pr_err("Innocent engine %s was reset (count=%lu)\n",
				       other->name,
				       i915_reset_engine_count(&i915->gpu_error,
							       other) -
				       threads[tmp].resets);
				if (!err)
					err = -EINVAL;
			}
		}

		if (global != i915_reset_count(&i915->gpu_error)) {
			pr_err("Global reset (count=%lu)!\n",
			       i915_reset_count(&i915->gpu_error) - global);
			if (!err)
				err = -EINVAL;
		}

		if (err)
			break;

		err = flush_test(i915, 0);
		if (err)
			break;
	}

	if (i915_terminally_wedged(&i915->gpu_error))
		err = -EIO;

	if (flags & TEST_ACTIVE) {
		mutex_lock(&i915->drm.struct_mutex);
		hang_fini(&h);
		mutex_unlock(&i915->drm.struct_mutex);
	}

	return err;
}

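/*
 * Driver for __igt_reset_engines(): walk the phases below, covering
 * per-engine resets while the target engine is idle or active, with or
 * without background load (and randomised priorities) on the other
 * engines, and finally with load on the engine being reset itself.
 */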
static int igt_reset_engines(void *arg)
{
	static const struct {
		const char *name;
		unsigned int flags;
	} phases[] = {
		{ "idle", 0 },
		{ "active", TEST_ACTIVE },
		{ "others-idle", TEST_OTHERS },
		{ "others-active", TEST_OTHERS | TEST_ACTIVE },
		{
			"others-priority",
			TEST_OTHERS | TEST_ACTIVE | TEST_PRIORITY
		},
		{
			"self-priority",
			TEST_OTHERS | TEST_ACTIVE | TEST_PRIORITY | TEST_SELF,
		},
		{ }
	};
	struct drm_i915_private *i915 = arg;
	typeof(*phases) *p;
	int err;

	for (p = phases; p->name; p++) {
		if (p->flags & TEST_PRIORITY) {
			if (!(i915->caps.scheduler & I915_SCHEDULER_CAP_PRIORITY))
				continue;
		}

		err = __igt_reset_engines(arg, p->name, p->flags);
		if (err)
			return err;
	}

	return 0;
}

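/*
 * Simulate hangcheck declaring the GPU hung: record which engines are
 * "stalled", set I915_RESET_HANDOFF to hand the reset over to a waiter,
 * and return the current reset count so callers can check that a reset
 * was subsequently performed.
 */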
static u32 fake_hangcheck(struct i915_request *rq, u32 mask)
{
	struct i915_gpu_error *error = &rq->i915->gpu_error;
	u32 reset_count = i915_reset_count(error);

	error->stalled_mask = mask;

	/* set_bit() must be after we have setup the backchannel (mask) */
	smp_mb__before_atomic();
	set_bit(I915_RESET_HANDOFF, &error->flags);

	wake_up_all(&error->wait_queue);

	return reset_count;
}

static int igt_wait_reset(void *arg)
{
	struct drm_i915_private *i915 = arg;
	struct i915_request *rq;
	unsigned int reset_count;
	struct hang h;
	long timeout;
	int err;

	if (!intel_engine_can_store_dword(i915->engine[RCS]))
		return 0;

	/* Check that we detect a stuck waiter and issue a reset */

	global_reset_lock(i915);

	mutex_lock(&i915->drm.struct_mutex);
	err = hang_init(&h, i915);
	if (err)
		goto unlock;

	rq = hang_create_request(&h, i915->engine[RCS]);
	if (IS_ERR(rq)) {
		err = PTR_ERR(rq);
		goto fini;
	}

	i915_request_get(rq);
	__i915_request_add(rq, true);

	if (!wait_until_running(&h, rq)) {
		struct drm_printer p = drm_info_printer(i915->drm.dev);

		pr_err("%s: Failed to start request %x, at %x\n",
		       __func__, rq->fence.seqno, hws_seqno(&h, rq));
		intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name);

		i915_gem_set_wedged(i915);

		err = -EIO;
		goto out_rq;
	}

	reset_count = fake_hangcheck(rq, ALL_ENGINES);

	timeout = i915_request_wait(rq, I915_WAIT_LOCKED, 10);
	if (timeout < 0) {
		pr_err("i915_request_wait failed on a stuck request: err=%ld\n",
		       timeout);
		err = timeout;
		goto out_rq;
	}

	GEM_BUG_ON(test_bit(I915_RESET_HANDOFF, &i915->gpu_error.flags));
	if (i915_reset_count(&i915->gpu_error) == reset_count) {
		pr_err("No GPU reset recorded!\n");
		err = -EINVAL;
		goto out_rq;
	}

out_rq:
	i915_request_put(rq);
fini:
	hang_fini(&h);
unlock:
	mutex_unlock(&i915->drm.struct_mutex);
	global_reset_unlock(i915);

	if (i915_terminally_wedged(&i915->gpu_error))
		return -EIO;

	return err;
}

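/*
 * Wait for all engines other than @exclude to go idle. Back-to-back
 * device resets can skip the breadcrumb of a still-executing kernel
 * context (see the XXX comment in igt_reset_queue()), so we let the
 * other engines settle before triggering the next reset.
 */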
static int wait_for_others(struct drm_i915_private *i915,
			   struct intel_engine_cs *exclude)
{
	struct intel_engine_cs *engine;
	enum intel_engine_id id;

	for_each_engine(engine, i915, id) {
		if (engine == exclude)
			continue;

		if (wait_for(intel_engine_is_idle(engine), 10))
			return -EIO;
	}

	return 0;
}

static int igt_reset_queue(void *arg)
{
	struct drm_i915_private *i915 = arg;
	struct intel_engine_cs *engine;
	enum intel_engine_id id;
	struct hang h;
	int err;

	/* Check that we replay pending requests following a hang */

	global_reset_lock(i915);

	mutex_lock(&i915->drm.struct_mutex);
	err = hang_init(&h, i915);
	if (err)
		goto unlock;

	for_each_engine(engine, i915, id) {
		struct i915_request *prev;
		IGT_TIMEOUT(end_time);
		unsigned int count;

		if (!intel_engine_can_store_dword(engine))
			continue;

		prev = hang_create_request(&h, engine);
		if (IS_ERR(prev)) {
			err = PTR_ERR(prev);
			goto fini;
		}

		i915_request_get(prev);
		__i915_request_add(prev, true);

		count = 0;
		do {
			struct i915_request *rq;
			unsigned int reset_count;

			rq = hang_create_request(&h, engine);
			if (IS_ERR(rq)) {
				err = PTR_ERR(rq);
				goto fini;
			}

			i915_request_get(rq);
			__i915_request_add(rq, true);

			/*
			 * XXX We don't handle resetting the kernel context
			 * very well. If we trigger a device reset twice in
			 * quick succession while the kernel context is
			 * executing, we may end up skipping the breadcrumb.
			 * This is really only a problem for the selftest as
			 * normally there is a large interlude between resets
			 * (hangcheck), or we focus on resetting just one
			 * engine and so avoid repeatedly resetting innocents.
			 */
			err = wait_for_others(i915, engine);
			if (err) {
				pr_err("%s(%s): Failed to idle other inactive engines after device reset\n",
				       __func__, engine->name);
				i915_request_put(rq);
				i915_request_put(prev);

				GEM_TRACE_DUMP();
				i915_gem_set_wedged(i915);
				goto fini;
			}

			if (!wait_until_running(&h, prev)) {
				struct drm_printer p = drm_info_printer(i915->drm.dev);

				pr_err("%s(%s): Failed to start request %x, at %x\n",
				       __func__, engine->name,
				       prev->fence.seqno, hws_seqno(&h, prev));
				intel_engine_dump(engine, &p,
						  "%s\n", engine->name);

				i915_request_put(rq);
				i915_request_put(prev);

				i915_gem_set_wedged(i915);

				err = -EIO;
				goto fini;
			}

			reset_count = fake_hangcheck(prev, ENGINE_MASK(id));

			i915_reset(i915, ENGINE_MASK(id), NULL);

			GEM_BUG_ON(test_bit(I915_RESET_HANDOFF,
					    &i915->gpu_error.flags));

			if (prev->fence.error != -EIO) {
				pr_err("GPU reset not recorded on hanging request [fence.error=%d]!\n",
				       prev->fence.error);
				i915_request_put(rq);
				i915_request_put(prev);
				err = -EINVAL;
				goto fini;
			}

			if (rq->fence.error) {
				pr_err("Fence error status not zero [%d] after unrelated reset\n",
				       rq->fence.error);
				i915_request_put(rq);
				i915_request_put(prev);
				err = -EINVAL;
				goto fini;
			}

			if (i915_reset_count(&i915->gpu_error) == reset_count) {
				pr_err("No GPU reset recorded!\n");
				i915_request_put(rq);
				i915_request_put(prev);
				err = -EINVAL;
				goto fini;
			}

			i915_request_put(prev);
			prev = rq;
			count++;
		} while (time_before(jiffies, end_time));
		pr_info("%s: Completed %u resets\n", engine->name, count);

		*h.batch = MI_BATCH_BUFFER_END;
		i915_gem_chipset_flush(i915);

		i915_request_put(prev);

		err = flush_test(i915, I915_WAIT_LOCKED);
		if (err)
			break;
	}

fini:
	hang_fini(&h);
unlock:
	mutex_unlock(&i915->drm.struct_mutex);
	global_reset_unlock(i915);

	if (i915_terminally_wedged(&i915->gpu_error))
		return -EIO;

	return err;
}

static int igt_handle_error(void *arg)
{
	struct drm_i915_private *i915 = arg;
	struct intel_engine_cs *engine = i915->engine[RCS];
	struct hang h;
	struct i915_request *rq;
	struct i915_gpu_state *error;
	int err;

	/*
	 * Check that we can reset an engine via i915_handle_error() while
	 * error capture is disabled, and that the guilty request is marked
	 * with -EIO.
	 */

	if (!intel_has_reset_engine(i915))
		return 0;

	if (!engine || !intel_engine_can_store_dword(engine))
		return 0;

	mutex_lock(&i915->drm.struct_mutex);

	err = hang_init(&h, i915);
	if (err)
		goto err_unlock;

	rq = hang_create_request(&h, engine);
	if (IS_ERR(rq)) {
		err = PTR_ERR(rq);
		goto err_fini;
	}

	i915_request_get(rq);
	__i915_request_add(rq, true);

	if (!wait_until_running(&h, rq)) {
		struct drm_printer p = drm_info_printer(i915->drm.dev);

		pr_err("%s: Failed to start request %x, at %x\n",
		       __func__, rq->fence.seqno, hws_seqno(&h, rq));
		intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name);

		i915_gem_set_wedged(i915);

		err = -EIO;
		goto err_request;
	}

	mutex_unlock(&i915->drm.struct_mutex);

	/* Temporarily disable error capture */
	error = xchg(&i915->gpu_error.first_error, (void *)-1);

	i915_handle_error(i915, ENGINE_MASK(engine->id), 0, NULL);

	xchg(&i915->gpu_error.first_error, error);

	mutex_lock(&i915->drm.struct_mutex);

	if (rq->fence.error != -EIO) {
		pr_err("Guilty request not identified!\n");
		err = -EINVAL;
		goto err_request;
	}

err_request:
	i915_request_put(rq);
err_fini:
	hang_fini(&h);
err_unlock:
	mutex_unlock(&i915->drm.struct_mutex);
	return err;
}

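/*
 * Entry point for the live hangcheck/reset selftests. The hangcheck
 * modparam is temporarily cleared so only our simulated hang detection
 * can trigger resets, and a runtime-pm reference keeps the device awake
 * for the duration.
 */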
int intel_hangcheck_live_selftests(struct drm_i915_private *i915)
{
	static const struct i915_subtest tests[] = {
		SUBTEST(igt_global_reset), /* attempt to recover GPU first */
		SUBTEST(igt_hang_sanitycheck),
		SUBTEST(igt_reset_idle_engine),
		SUBTEST(igt_reset_active_engine),
		SUBTEST(igt_reset_engines),
		SUBTEST(igt_wait_reset),
		SUBTEST(igt_reset_queue),
		SUBTEST(igt_handle_error),
	};
	bool saved_hangcheck;
	int err;

	if (!intel_has_gpu_reset(i915))
		return 0;

	intel_runtime_pm_get(i915);
	saved_hangcheck = fetch_and_zero(&i915_modparams.enable_hangcheck);

	err = i915_subtests(tests, i915);

	mutex_lock(&i915->drm.struct_mutex);
	flush_test(i915, I915_WAIT_LOCKED);
	mutex_unlock(&i915->drm.struct_mutex);

	i915_modparams.enable_hangcheck = saved_hangcheck;
	intel_runtime_pm_put(i915);

	return err;
}