drm/i915: Rename priotree to sched
drivers/gpu/drm/i915/selftests/intel_hangcheck.c
/*
 * Copyright © 2016 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 *
 */

#include <linux/kthread.h>

#include "../i915_selftest.h"
#include "i915_random.h"

#include "mock_context.h"
#include "mock_drm.h"

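/*
 * Fixture shared by the hangcheck selftests: a kernel context plus two
 * internal objects, a hardware status page (hws) used to observe seqno
 * writes from the batch, and the batch object itself containing the
 * self-referencing ("hanging") batch buffer.
 */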
struct hang {
	struct drm_i915_private *i915;
	struct drm_i915_gem_object *hws;
	struct drm_i915_gem_object *obj;
	struct i915_gem_context *ctx;
	u32 *seqno;
	u32 *batch;
};

static int hang_init(struct hang *h, struct drm_i915_private *i915)
{
	void *vaddr;
	int err;

	memset(h, 0, sizeof(*h));
	h->i915 = i915;

	h->ctx = kernel_context(i915);
	if (IS_ERR(h->ctx))
		return PTR_ERR(h->ctx);

	h->hws = i915_gem_object_create_internal(i915, PAGE_SIZE);
	if (IS_ERR(h->hws)) {
		err = PTR_ERR(h->hws);
		goto err_ctx;
	}

	h->obj = i915_gem_object_create_internal(i915, PAGE_SIZE);
	if (IS_ERR(h->obj)) {
		err = PTR_ERR(h->obj);
		goto err_hws;
	}

	i915_gem_object_set_cache_level(h->hws, I915_CACHE_LLC);
	vaddr = i915_gem_object_pin_map(h->hws, I915_MAP_WB);
	if (IS_ERR(vaddr)) {
		err = PTR_ERR(vaddr);
		goto err_obj;
	}
	h->seqno = memset(vaddr, 0xff, PAGE_SIZE);

	vaddr = i915_gem_object_pin_map(h->obj,
					HAS_LLC(i915) ? I915_MAP_WB : I915_MAP_WC);
	if (IS_ERR(vaddr)) {
		err = PTR_ERR(vaddr);
		goto err_unpin_hws;
	}
	h->batch = vaddr;

	return 0;

err_unpin_hws:
	i915_gem_object_unpin_map(h->hws);
err_obj:
	i915_gem_object_put(h->obj);
err_hws:
	i915_gem_object_put(h->hws);
err_ctx:
	kernel_context_close(h->ctx);
	return err;
}

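/*
 * Each context writes its breadcrumb to a private dword in the HWS page,
 * indexed by the fence context, so concurrent hangs do not overwrite each
 * other's seqno.
 */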
static u64 hws_address(const struct i915_vma *hws,
		       const struct i915_request *rq)
{
	return hws->node.start + offset_in_page(sizeof(u32)*rq->fence.context);
}

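/*
 * Build a batch that stores the request's seqno into the HWS page and then
 * branches back to its own start, spinning until *h->batch is overwritten
 * with MI_BATCH_BUFFER_END (or the request is reset). The MI_ARB_CHECK
 * instructions leave arbitration points so the hang can be preempted where
 * the hardware allows it.
 */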
static int emit_recurse_batch(struct hang *h,
			      struct i915_request *rq)
{
	struct drm_i915_private *i915 = h->i915;
	struct i915_address_space *vm = rq->ctx->ppgtt ? &rq->ctx->ppgtt->base : &i915->ggtt.base;
	struct i915_vma *hws, *vma;
	unsigned int flags;
	u32 *batch;
	int err;

	vma = i915_vma_instance(h->obj, vm, NULL);
	if (IS_ERR(vma))
		return PTR_ERR(vma);

	hws = i915_vma_instance(h->hws, vm, NULL);
	if (IS_ERR(hws))
		return PTR_ERR(hws);

	err = i915_vma_pin(vma, 0, 0, PIN_USER);
	if (err)
		return err;

	err = i915_vma_pin(hws, 0, 0, PIN_USER);
	if (err)
		goto unpin_vma;

	i915_vma_move_to_active(vma, rq, 0);
	if (!i915_gem_object_has_active_reference(vma->obj)) {
		i915_gem_object_get(vma->obj);
		i915_gem_object_set_active_reference(vma->obj);
	}

	i915_vma_move_to_active(hws, rq, 0);
	if (!i915_gem_object_has_active_reference(hws->obj)) {
		i915_gem_object_get(hws->obj);
		i915_gem_object_set_active_reference(hws->obj);
	}

	batch = h->batch;
	if (INTEL_GEN(i915) >= 8) {
		*batch++ = MI_STORE_DWORD_IMM_GEN4;
		*batch++ = lower_32_bits(hws_address(hws, rq));
		*batch++ = upper_32_bits(hws_address(hws, rq));
		*batch++ = rq->fence.seqno;
		*batch++ = MI_ARB_CHECK;

		memset(batch, 0, 1024);
		batch += 1024 / sizeof(*batch);

		*batch++ = MI_ARB_CHECK;
		*batch++ = MI_BATCH_BUFFER_START | 1 << 8 | 1;
		*batch++ = lower_32_bits(vma->node.start);
		*batch++ = upper_32_bits(vma->node.start);
	} else if (INTEL_GEN(i915) >= 6) {
		*batch++ = MI_STORE_DWORD_IMM_GEN4;
		*batch++ = 0;
		*batch++ = lower_32_bits(hws_address(hws, rq));
		*batch++ = rq->fence.seqno;
		*batch++ = MI_ARB_CHECK;

		memset(batch, 0, 1024);
		batch += 1024 / sizeof(*batch);

		*batch++ = MI_ARB_CHECK;
		*batch++ = MI_BATCH_BUFFER_START | 1 << 8;
		*batch++ = lower_32_bits(vma->node.start);
	} else if (INTEL_GEN(i915) >= 4) {
		*batch++ = MI_STORE_DWORD_IMM_GEN4 | 1 << 22;
		*batch++ = 0;
		*batch++ = lower_32_bits(hws_address(hws, rq));
		*batch++ = rq->fence.seqno;
		*batch++ = MI_ARB_CHECK;

		memset(batch, 0, 1024);
		batch += 1024 / sizeof(*batch);

		*batch++ = MI_ARB_CHECK;
		*batch++ = MI_BATCH_BUFFER_START | 2 << 6;
		*batch++ = lower_32_bits(vma->node.start);
	} else {
		*batch++ = MI_STORE_DWORD_IMM;
		*batch++ = lower_32_bits(hws_address(hws, rq));
		*batch++ = rq->fence.seqno;
		*batch++ = MI_ARB_CHECK;

		memset(batch, 0, 1024);
		batch += 1024 / sizeof(*batch);

		*batch++ = MI_ARB_CHECK;
		*batch++ = MI_BATCH_BUFFER_START | 2 << 6 | 1;
		*batch++ = lower_32_bits(vma->node.start);
	}
	*batch++ = MI_BATCH_BUFFER_END; /* not reached */
	i915_gem_chipset_flush(h->i915);

	flags = 0;
	if (INTEL_GEN(vm->i915) <= 5)
		flags |= I915_DISPATCH_SECURE;

	err = rq->engine->emit_bb_start(rq, vma->node.start, PAGE_SIZE, flags);

	i915_vma_unpin(hws);
unpin_vma:
	i915_vma_unpin(vma);
	return err;
}

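/*
 * Create a request carrying the hanging batch on the given engine. If the
 * previous batch object is still active, a fresh one is allocated first so
 * that each outstanding hang spins on its own pages.
 */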
static struct i915_request *
hang_create_request(struct hang *h, struct intel_engine_cs *engine)
{
	struct i915_request *rq;
	int err;

	if (i915_gem_object_is_active(h->obj)) {
		struct drm_i915_gem_object *obj;
		void *vaddr;

		obj = i915_gem_object_create_internal(h->i915, PAGE_SIZE);
		if (IS_ERR(obj))
			return ERR_CAST(obj);

		vaddr = i915_gem_object_pin_map(obj,
						HAS_LLC(h->i915) ? I915_MAP_WB : I915_MAP_WC);
		if (IS_ERR(vaddr)) {
			i915_gem_object_put(obj);
			return ERR_CAST(vaddr);
		}

		i915_gem_object_unpin_map(h->obj);
		i915_gem_object_put(h->obj);

		h->obj = obj;
		h->batch = vaddr;
	}

	rq = i915_request_alloc(engine, h->ctx);
	if (IS_ERR(rq))
		return rq;

	err = emit_recurse_batch(h, rq);
	if (err) {
		__i915_request_add(rq, false);
		return ERR_PTR(err);
	}

	return rq;
}

static u32 hws_seqno(const struct hang *h, const struct i915_request *rq)
{
	return READ_ONCE(h->seqno[rq->fence.context % (PAGE_SIZE/sizeof(u32))]);
}

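/*
 * Watchdog for the selftest itself: if a wait exceeds the given timeout,
 * declare the GPU wedged (and dump the trace) so the test aborts instead of
 * hanging the machine.
 */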
struct wedge_me {
	struct delayed_work work;
	struct drm_i915_private *i915;
	const void *symbol;
};

static void wedge_me(struct work_struct *work)
{
	struct wedge_me *w = container_of(work, typeof(*w), work.work);

	pr_err("%pS timed out, cancelling all further testing.\n", w->symbol);

	GEM_TRACE("%pS timed out.\n", w->symbol);
	GEM_TRACE_DUMP();

	i915_gem_set_wedged(w->i915);
}

static void __init_wedge(struct wedge_me *w,
			 struct drm_i915_private *i915,
			 long timeout,
			 const void *symbol)
{
	w->i915 = i915;
	w->symbol = symbol;

	INIT_DELAYED_WORK_ONSTACK(&w->work, wedge_me);
	schedule_delayed_work(&w->work, timeout);
}

static void __fini_wedge(struct wedge_me *w)
{
	cancel_delayed_work_sync(&w->work);
	destroy_delayed_work_on_stack(&w->work);
	w->i915 = NULL;
}

#define wedge_on_timeout(W, DEV, TIMEOUT) \
	for (__init_wedge((W), (DEV), (TIMEOUT), __builtin_return_address(0)); \
	     (W)->i915; \
	     __fini_wedge((W)))

static noinline int
flush_test(struct drm_i915_private *i915, unsigned int flags)
{
	struct wedge_me w;

	cond_resched();

	wedge_on_timeout(&w, i915, HZ)
		i915_gem_wait_for_idle(i915, flags);

	return i915_terminally_wedged(&i915->gpu_error) ? -EIO : 0;
}

static void hang_fini(struct hang *h)
{
	*h->batch = MI_BATCH_BUFFER_END;
	i915_gem_chipset_flush(h->i915);

	i915_gem_object_unpin_map(h->obj);
	i915_gem_object_put(h->obj);

	i915_gem_object_unpin_map(h->hws);
	i915_gem_object_put(h->hws);

	kernel_context_close(h->ctx);

	flush_test(h->i915, I915_WAIT_LOCKED);
}

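/*
 * Wait for the hanging batch to start executing, first with a short
 * busy-wait and then with a sleeping wait of up to a second, by watching
 * for its seqno to appear in the context's HWS slot.
 */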
static bool wait_until_running(struct hang *h, struct i915_request *rq)
{
	return !(wait_for_us(i915_seqno_passed(hws_seqno(h, rq),
					       rq->fence.seqno),
			     10) &&
		 wait_for(i915_seqno_passed(hws_seqno(h, rq),
					    rq->fence.seqno),
			  1000));
}

static int igt_hang_sanitycheck(void *arg)
{
	struct drm_i915_private *i915 = arg;
	struct i915_request *rq;
	struct intel_engine_cs *engine;
	enum intel_engine_id id;
	struct hang h;
	int err;

	/* Basic check that we can execute our hanging batch */

	mutex_lock(&i915->drm.struct_mutex);
	err = hang_init(&h, i915);
	if (err)
		goto unlock;

	for_each_engine(engine, i915, id) {
		long timeout;

		if (!intel_engine_can_store_dword(engine))
			continue;

		rq = hang_create_request(&h, engine);
		if (IS_ERR(rq)) {
			err = PTR_ERR(rq);
			pr_err("Failed to create request for %s, err=%d\n",
			       engine->name, err);
			goto fini;
		}

		i915_request_get(rq);

		*h.batch = MI_BATCH_BUFFER_END;
		i915_gem_chipset_flush(i915);

		__i915_request_add(rq, true);

		timeout = i915_request_wait(rq,
					    I915_WAIT_LOCKED,
					    MAX_SCHEDULE_TIMEOUT);
		i915_request_put(rq);

		if (timeout < 0) {
			err = timeout;
			pr_err("Wait for request failed on %s, err=%d\n",
			       engine->name, err);
			goto fini;
		}
	}

fini:
	hang_fini(&h);
unlock:
	mutex_unlock(&i915->drm.struct_mutex);
	return err;
}

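/*
 * Take ownership of all reset paths: set I915_RESET_BACKOFF and every
 * per-engine reset bit, waiting out any reset already in flight, so that
 * the selftest has exclusive use of the reset machinery until
 * global_reset_unlock() is called.
 */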
static void global_reset_lock(struct drm_i915_private *i915)
{
	struct intel_engine_cs *engine;
	enum intel_engine_id id;

	pr_debug("%s: current gpu_error=%08lx\n",
		 __func__, i915->gpu_error.flags);

	while (test_and_set_bit(I915_RESET_BACKOFF, &i915->gpu_error.flags))
		wait_event(i915->gpu_error.reset_queue,
			   !test_bit(I915_RESET_BACKOFF,
				     &i915->gpu_error.flags));

	for_each_engine(engine, i915, id) {
		while (test_and_set_bit(I915_RESET_ENGINE + id,
					&i915->gpu_error.flags))
			wait_on_bit(&i915->gpu_error.flags,
				    I915_RESET_ENGINE + id,
				    TASK_UNINTERRUPTIBLE);
	}
}

static void global_reset_unlock(struct drm_i915_private *i915)
{
	struct intel_engine_cs *engine;
	enum intel_engine_id id;

	for_each_engine(engine, i915, id)
		clear_bit(I915_RESET_ENGINE + id, &i915->gpu_error.flags);

	clear_bit(I915_RESET_BACKOFF, &i915->gpu_error.flags);
	wake_up_all(&i915->gpu_error.reset_queue);
}

static int igt_global_reset(void *arg)
{
	struct drm_i915_private *i915 = arg;
	unsigned int reset_count;
	int err = 0;

	/* Check that we can issue a global GPU reset */

	global_reset_lock(i915);
	set_bit(I915_RESET_HANDOFF, &i915->gpu_error.flags);

	mutex_lock(&i915->drm.struct_mutex);
	reset_count = i915_reset_count(&i915->gpu_error);

	i915_reset(i915, ALL_ENGINES, NULL);

	if (i915_reset_count(&i915->gpu_error) == reset_count) {
		pr_err("No GPU reset recorded!\n");
		err = -EINVAL;
	}
	mutex_unlock(&i915->drm.struct_mutex);

	GEM_BUG_ON(test_bit(I915_RESET_HANDOFF, &i915->gpu_error.flags));
	global_reset_unlock(i915);

	if (i915_terminally_wedged(&i915->gpu_error))
		err = -EIO;

	return err;
}

static int __igt_reset_engine(struct drm_i915_private *i915, bool active)
{
	struct intel_engine_cs *engine;
	enum intel_engine_id id;
	struct hang h;
	int err = 0;

	/* Check that we can issue an engine reset on an idle engine (no-op) */

	if (!intel_has_reset_engine(i915))
		return 0;

	if (active) {
		mutex_lock(&i915->drm.struct_mutex);
		err = hang_init(&h, i915);
		mutex_unlock(&i915->drm.struct_mutex);
		if (err)
			return err;
	}

	for_each_engine(engine, i915, id) {
		unsigned int reset_count, reset_engine_count;
		IGT_TIMEOUT(end_time);

		if (active && !intel_engine_can_store_dword(engine))
			continue;

		reset_count = i915_reset_count(&i915->gpu_error);
		reset_engine_count = i915_reset_engine_count(&i915->gpu_error,
							     engine);

		set_bit(I915_RESET_ENGINE + id, &i915->gpu_error.flags);
		do {
			u32 seqno = intel_engine_get_seqno(engine);

			if (active) {
				struct i915_request *rq;

				mutex_lock(&i915->drm.struct_mutex);
				rq = hang_create_request(&h, engine);
				if (IS_ERR(rq)) {
					err = PTR_ERR(rq);
					mutex_unlock(&i915->drm.struct_mutex);
					break;
				}

				i915_request_get(rq);
				__i915_request_add(rq, true);
				mutex_unlock(&i915->drm.struct_mutex);

				if (!wait_until_running(&h, rq)) {
					struct drm_printer p = drm_info_printer(i915->drm.dev);

					pr_err("%s: Failed to start request %x, at %x\n",
					       __func__, rq->fence.seqno, hws_seqno(&h, rq));
					intel_engine_dump(engine, &p,
							  "%s\n", engine->name);

					i915_request_put(rq);
					err = -EIO;
					break;
				}

				GEM_BUG_ON(!rq->global_seqno);
				seqno = rq->global_seqno - 1;
				i915_request_put(rq);
			}

			err = i915_reset_engine(engine, NULL);
			if (err) {
				pr_err("i915_reset_engine failed\n");
				break;
			}

			if (i915_reset_count(&i915->gpu_error) != reset_count) {
				pr_err("Full GPU reset recorded! (engine reset expected)\n");
				err = -EINVAL;
				break;
			}

			reset_engine_count += active;
			if (i915_reset_engine_count(&i915->gpu_error, engine) !=
			    reset_engine_count) {
				pr_err("%s engine reset %srecorded!\n",
				       engine->name, active ? "not " : "");
				err = -EINVAL;
				break;
			}
		} while (time_before(jiffies, end_time));
		clear_bit(I915_RESET_ENGINE + id, &i915->gpu_error.flags);

		if (err)
			break;

		err = flush_test(i915, 0);
		if (err)
			break;
	}

	if (i915_terminally_wedged(&i915->gpu_error))
		err = -EIO;

	if (active) {
		mutex_lock(&i915->drm.struct_mutex);
		hang_fini(&h);
		mutex_unlock(&i915->drm.struct_mutex);
	}

	return err;
}

static int igt_reset_idle_engine(void *arg)
{
	return __igt_reset_engine(arg, false);
}

static int igt_reset_active_engine(void *arg)
{
	return __igt_reset_engine(arg, true);
}

struct active_engine {
	struct task_struct *task;
	struct intel_engine_cs *engine;
	unsigned long resets;
	unsigned int flags;
};

#define TEST_ACTIVE	BIT(0)
#define TEST_OTHERS	BIT(1)
#define TEST_SELF	BIT(2)
#define TEST_PRIORITY	BIT(3)

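/*
 * Background load for the reset tests: a kthread that keeps a small ring of
 * requests (optionally with randomised priorities) in flight on one engine,
 * so we can check that resetting another engine does not disturb it.
 */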
static int active_engine(void *data)
{
	I915_RND_STATE(prng);
	struct active_engine *arg = data;
	struct intel_engine_cs *engine = arg->engine;
	struct i915_request *rq[8] = {};
	struct i915_gem_context *ctx[ARRAY_SIZE(rq)];
	struct drm_file *file;
	unsigned long count = 0;
	int err = 0;

	file = mock_file(engine->i915);
	if (IS_ERR(file))
		return PTR_ERR(file);

	for (count = 0; count < ARRAY_SIZE(ctx); count++) {
		mutex_lock(&engine->i915->drm.struct_mutex);
		ctx[count] = live_context(engine->i915, file);
		mutex_unlock(&engine->i915->drm.struct_mutex);
		if (IS_ERR(ctx[count])) {
			err = PTR_ERR(ctx[count]);
			while (--count)
				i915_gem_context_put(ctx[count]);
			goto err_file;
		}
	}

	while (!kthread_should_stop()) {
		unsigned int idx = count++ & (ARRAY_SIZE(rq) - 1);
		struct i915_request *old = rq[idx];
		struct i915_request *new;

		mutex_lock(&engine->i915->drm.struct_mutex);
		new = i915_request_alloc(engine, ctx[idx]);
		if (IS_ERR(new)) {
			mutex_unlock(&engine->i915->drm.struct_mutex);
			err = PTR_ERR(new);
			break;
		}

		if (arg->flags & TEST_PRIORITY)
			ctx[idx]->priority =
				i915_prandom_u32_max_state(512, &prng);

		rq[idx] = i915_request_get(new);
		i915_request_add(new);
		mutex_unlock(&engine->i915->drm.struct_mutex);

		if (old) {
			if (i915_request_wait(old, 0, HZ) < 0) {
				GEM_TRACE("%s timed out.\n", engine->name);
				GEM_TRACE_DUMP();

				i915_gem_set_wedged(engine->i915);
				i915_request_put(old);
				err = -EIO;
				break;
			}
			i915_request_put(old);
		}

		cond_resched();
	}

	for (count = 0; count < ARRAY_SIZE(rq); count++)
		i915_request_put(rq[count]);

err_file:
	mock_file_free(engine->i915, file);
	return err;
}

static int __igt_reset_engines(struct drm_i915_private *i915,
			       const char *test_name,
			       unsigned int flags)
{
	struct intel_engine_cs *engine, *other;
	enum intel_engine_id id, tmp;
	struct hang h;
	int err = 0;

	/* Check that issuing a reset on one engine does not interfere
	 * with any other engine.
	 */

	if (!intel_has_reset_engine(i915))
		return 0;

	if (flags & TEST_ACTIVE) {
		mutex_lock(&i915->drm.struct_mutex);
		err = hang_init(&h, i915);
		mutex_unlock(&i915->drm.struct_mutex);
		if (err)
			return err;

		if (flags & TEST_PRIORITY)
			h.ctx->priority = 1024;
	}

	for_each_engine(engine, i915, id) {
		struct active_engine threads[I915_NUM_ENGINES] = {};
		unsigned long global = i915_reset_count(&i915->gpu_error);
		unsigned long count = 0, reported;
		IGT_TIMEOUT(end_time);

		if (flags & TEST_ACTIVE &&
		    !intel_engine_can_store_dword(engine))
			continue;

		memset(threads, 0, sizeof(threads));
		for_each_engine(other, i915, tmp) {
			struct task_struct *tsk;

			threads[tmp].resets =
				i915_reset_engine_count(&i915->gpu_error,
							other);

			if (!(flags & TEST_OTHERS))
				continue;

			if (other == engine && !(flags & TEST_SELF))
				continue;

			threads[tmp].engine = other;
			threads[tmp].flags = flags;

			tsk = kthread_run(active_engine, &threads[tmp],
					  "igt/%s", other->name);
			if (IS_ERR(tsk)) {
				err = PTR_ERR(tsk);
				goto unwind;
			}

			threads[tmp].task = tsk;
			get_task_struct(tsk);
		}

		set_bit(I915_RESET_ENGINE + id, &i915->gpu_error.flags);
		do {
			u32 seqno = intel_engine_get_seqno(engine);
			struct i915_request *rq = NULL;

			if (flags & TEST_ACTIVE) {
				mutex_lock(&i915->drm.struct_mutex);
				rq = hang_create_request(&h, engine);
				if (IS_ERR(rq)) {
					err = PTR_ERR(rq);
					mutex_unlock(&i915->drm.struct_mutex);
					break;
				}

				i915_request_get(rq);
				__i915_request_add(rq, true);
				mutex_unlock(&i915->drm.struct_mutex);

				if (!wait_until_running(&h, rq)) {
					struct drm_printer p = drm_info_printer(i915->drm.dev);

					pr_err("%s: Failed to start request %x, at %x\n",
					       __func__, rq->fence.seqno, hws_seqno(&h, rq));
					intel_engine_dump(engine, &p,
							  "%s\n", engine->name);

					i915_request_put(rq);
					err = -EIO;
					break;
				}

				GEM_BUG_ON(!rq->global_seqno);
				seqno = rq->global_seqno - 1;
			}

			err = i915_reset_engine(engine, NULL);
			if (err) {
				pr_err("i915_reset_engine(%s:%s): failed, err=%d\n",
				       engine->name, test_name, err);
				break;
			}

			count++;

			if (rq) {
				i915_request_wait(rq, 0, MAX_SCHEDULE_TIMEOUT);
				i915_request_put(rq);
			}
		} while (time_before(jiffies, end_time));
		clear_bit(I915_RESET_ENGINE + id, &i915->gpu_error.flags);
		pr_info("i915_reset_engine(%s:%s): %lu resets\n",
			engine->name, test_name, count);

		reported = i915_reset_engine_count(&i915->gpu_error, engine);
		reported -= threads[engine->id].resets;
		if (reported != (flags & TEST_ACTIVE ? count : 0)) {
			pr_err("i915_reset_engine(%s:%s): reset %lu times, but reported %lu, expected %lu reported\n",
			       engine->name, test_name, count, reported,
			       (flags & TEST_ACTIVE ? count : 0));
			if (!err)
				err = -EINVAL;
		}

unwind:
		for_each_engine(other, i915, tmp) {
			int ret;

			if (!threads[tmp].task)
				continue;

			ret = kthread_stop(threads[tmp].task);
			if (ret) {
				pr_err("kthread for other engine %s failed, err=%d\n",
				       other->name, ret);
				if (!err)
					err = ret;
			}
			put_task_struct(threads[tmp].task);

			if (other != engine &&
			    threads[tmp].resets !=
			    i915_reset_engine_count(&i915->gpu_error, other)) {
				pr_err("Innocent engine %s was reset (count=%ld)\n",
				       other->name,
				       i915_reset_engine_count(&i915->gpu_error,
							       other) -
				       threads[tmp].resets);
				if (!err)
					err = -EINVAL;
			}
		}

		if (global != i915_reset_count(&i915->gpu_error)) {
			pr_err("Global reset (count=%ld)!\n",
			       i915_reset_count(&i915->gpu_error) - global);
			if (!err)
				err = -EINVAL;
		}

		if (err)
			break;

		err = flush_test(i915, 0);
		if (err)
			break;
	}

	if (i915_terminally_wedged(&i915->gpu_error))
		err = -EIO;

	if (flags & TEST_ACTIVE) {
		mutex_lock(&i915->drm.struct_mutex);
		hang_fini(&h);
		mutex_unlock(&i915->drm.struct_mutex);
	}

	return err;
}

static int igt_reset_engines(void *arg)
{
	static const struct {
		const char *name;
		unsigned int flags;
	} phases[] = {
		{ "idle", 0 },
		{ "active", TEST_ACTIVE },
		{ "others-idle", TEST_OTHERS },
		{ "others-active", TEST_OTHERS | TEST_ACTIVE },
		{
			"others-priority",
			TEST_OTHERS | TEST_ACTIVE | TEST_PRIORITY
		},
		{
			"self-priority",
			TEST_OTHERS | TEST_ACTIVE | TEST_PRIORITY | TEST_SELF,
		},
		{ }
	};
	struct drm_i915_private *i915 = arg;
	typeof(*phases) *p;
	int err;

	for (p = phases; p->name; p++) {
		if (p->flags & TEST_PRIORITY) {
			if (!(i915->caps.scheduler & I915_SCHEDULER_CAP_PRIORITY))
				continue;
		}

		err = __igt_reset_engines(arg, p->name, p->flags);
		if (err)
			return err;
	}

	return 0;
}

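/*
 * Stand in for the hangcheck worker: mark the given engines as stalled and
 * raise I915_RESET_HANDOFF so that the blocked waiter performs the reset,
 * returning the reset count sampled beforehand for later comparison.
 */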
static u32 fake_hangcheck(struct i915_request *rq, u32 mask)
{
	struct i915_gpu_error *error = &rq->i915->gpu_error;
	u32 reset_count = i915_reset_count(error);

	error->stalled_mask = mask;

	/* set_bit() must be after we have setup the backchannel (mask) */
	smp_mb__before_atomic();
	set_bit(I915_RESET_HANDOFF, &error->flags);

	wake_up_all(&error->wait_queue);

	return reset_count;
}

static int igt_wait_reset(void *arg)
{
	struct drm_i915_private *i915 = arg;
	struct i915_request *rq;
	unsigned int reset_count;
	struct hang h;
	long timeout;
	int err;

	if (!intel_engine_can_store_dword(i915->engine[RCS]))
		return 0;

	/* Check that we detect a stuck waiter and issue a reset */

	global_reset_lock(i915);

	mutex_lock(&i915->drm.struct_mutex);
	err = hang_init(&h, i915);
	if (err)
		goto unlock;

	rq = hang_create_request(&h, i915->engine[RCS]);
	if (IS_ERR(rq)) {
		err = PTR_ERR(rq);
		goto fini;
	}

	i915_request_get(rq);
	__i915_request_add(rq, true);

	if (!wait_until_running(&h, rq)) {
		struct drm_printer p = drm_info_printer(i915->drm.dev);

		pr_err("%s: Failed to start request %x, at %x\n",
		       __func__, rq->fence.seqno, hws_seqno(&h, rq));
		intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name);

		i915_gem_set_wedged(i915);

		err = -EIO;
		goto out_rq;
	}

	reset_count = fake_hangcheck(rq, ALL_ENGINES);

	timeout = i915_request_wait(rq, I915_WAIT_LOCKED, 10);
	if (timeout < 0) {
		pr_err("i915_request_wait failed on a stuck request: err=%ld\n",
		       timeout);
		err = timeout;
		goto out_rq;
	}

	GEM_BUG_ON(test_bit(I915_RESET_HANDOFF, &i915->gpu_error.flags));
	if (i915_reset_count(&i915->gpu_error) == reset_count) {
		pr_err("No GPU reset recorded!\n");
		err = -EINVAL;
		goto out_rq;
	}

out_rq:
	i915_request_put(rq);
fini:
	hang_fini(&h);
unlock:
	mutex_unlock(&i915->drm.struct_mutex);
	global_reset_unlock(i915);

	if (i915_terminally_wedged(&i915->gpu_error))
		return -EIO;

	return err;
}

static int wait_for_others(struct drm_i915_private *i915,
			   struct intel_engine_cs *exclude)
{
	struct intel_engine_cs *engine;
	enum intel_engine_id id;

	for_each_engine(engine, i915, id) {
		if (engine == exclude)
			continue;

		if (wait_for(intel_engine_is_idle(engine), 10))
			return -EIO;
	}

	return 0;
}

static int igt_reset_queue(void *arg)
{
	struct drm_i915_private *i915 = arg;
	struct intel_engine_cs *engine;
	enum intel_engine_id id;
	struct hang h;
	int err;

	/* Check that we replay pending requests following a hang */

	global_reset_lock(i915);

	mutex_lock(&i915->drm.struct_mutex);
	err = hang_init(&h, i915);
	if (err)
		goto unlock;

	for_each_engine(engine, i915, id) {
		struct i915_request *prev;
		IGT_TIMEOUT(end_time);
		unsigned int count;

		if (!intel_engine_can_store_dword(engine))
			continue;

		prev = hang_create_request(&h, engine);
		if (IS_ERR(prev)) {
			err = PTR_ERR(prev);
			goto fini;
		}

		i915_request_get(prev);
		__i915_request_add(prev, true);

		count = 0;
		do {
			struct i915_request *rq;
			unsigned int reset_count;

			rq = hang_create_request(&h, engine);
			if (IS_ERR(rq)) {
				err = PTR_ERR(rq);
				goto fini;
			}

			i915_request_get(rq);
			__i915_request_add(rq, true);

			/*
			 * XXX We don't handle resetting the kernel context
			 * very well. If we trigger a device reset twice in
			 * quick succession while the kernel context is
			 * executing, we may end up skipping the breadcrumb.
			 * This is really only a problem for the selftest as
			 * normally there is a large interlude between resets
			 * (hangcheck), or we focus on resetting just one
			 * engine and so avoid repeatedly resetting innocents.
			 */
			err = wait_for_others(i915, engine);
			if (err) {
				pr_err("%s(%s): Failed to idle other inactive engines after device reset\n",
				       __func__, engine->name);
				i915_request_put(rq);
				i915_request_put(prev);

				GEM_TRACE_DUMP();
				i915_gem_set_wedged(i915);
				goto fini;
			}

			if (!wait_until_running(&h, prev)) {
				struct drm_printer p = drm_info_printer(i915->drm.dev);

				pr_err("%s(%s): Failed to start request %x, at %x\n",
				       __func__, engine->name,
				       prev->fence.seqno, hws_seqno(&h, prev));
				intel_engine_dump(engine, &p,
						  "%s\n", engine->name);

				i915_request_put(rq);
				i915_request_put(prev);

				i915_gem_set_wedged(i915);

				err = -EIO;
				goto fini;
			}

			reset_count = fake_hangcheck(prev, ENGINE_MASK(id));

			i915_reset(i915, ENGINE_MASK(id), NULL);

			GEM_BUG_ON(test_bit(I915_RESET_HANDOFF,
					    &i915->gpu_error.flags));

			if (prev->fence.error != -EIO) {
				pr_err("GPU reset not recorded on hanging request [fence.error=%d]!\n",
				       prev->fence.error);
				i915_request_put(rq);
				i915_request_put(prev);
				err = -EINVAL;
				goto fini;
			}

			if (rq->fence.error) {
				pr_err("Fence error status not zero [%d] after unrelated reset\n",
				       rq->fence.error);
				i915_request_put(rq);
				i915_request_put(prev);
				err = -EINVAL;
				goto fini;
			}

			if (i915_reset_count(&i915->gpu_error) == reset_count) {
				pr_err("No GPU reset recorded!\n");
				i915_request_put(rq);
				i915_request_put(prev);
				err = -EINVAL;
				goto fini;
			}

			i915_request_put(prev);
			prev = rq;
			count++;
		} while (time_before(jiffies, end_time));
		pr_info("%s: Completed %d resets\n", engine->name, count);

		*h.batch = MI_BATCH_BUFFER_END;
		i915_gem_chipset_flush(i915);

		i915_request_put(prev);

		err = flush_test(i915, I915_WAIT_LOCKED);
		if (err)
			break;
	}

fini:
	hang_fini(&h);
unlock:
	mutex_unlock(&i915->drm.struct_mutex);
	global_reset_unlock(i915);

	if (i915_terminally_wedged(&i915->gpu_error))
		return -EIO;

	return err;
}

static int igt_handle_error(void *arg)
{
	struct drm_i915_private *i915 = arg;
	struct intel_engine_cs *engine = i915->engine[RCS];
	struct hang h;
	struct i915_request *rq;
	struct i915_gpu_state *error;
	int err;

	/* Check that we can issue a global GPU and engine reset */

	if (!intel_has_reset_engine(i915))
		return 0;

	if (!engine || !intel_engine_can_store_dword(engine))
		return 0;

	mutex_lock(&i915->drm.struct_mutex);

	err = hang_init(&h, i915);
	if (err)
		goto err_unlock;

	rq = hang_create_request(&h, engine);
	if (IS_ERR(rq)) {
		err = PTR_ERR(rq);
		goto err_fini;
	}

	i915_request_get(rq);
	__i915_request_add(rq, true);

	if (!wait_until_running(&h, rq)) {
		struct drm_printer p = drm_info_printer(i915->drm.dev);

		pr_err("%s: Failed to start request %x, at %x\n",
		       __func__, rq->fence.seqno, hws_seqno(&h, rq));
		intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name);

		i915_gem_set_wedged(i915);

		err = -EIO;
		goto err_request;
	}

	mutex_unlock(&i915->drm.struct_mutex);

	/* Temporarily disable error capture */
	error = xchg(&i915->gpu_error.first_error, (void *)-1);

	i915_handle_error(i915, ENGINE_MASK(engine->id), 0, NULL);

	xchg(&i915->gpu_error.first_error, error);

	mutex_lock(&i915->drm.struct_mutex);

	if (rq->fence.error != -EIO) {
		pr_err("Guilty request not identified!\n");
		err = -EINVAL;
		goto err_request;
	}

err_request:
	i915_request_put(rq);
err_fini:
	hang_fini(&h);
err_unlock:
	mutex_unlock(&i915->drm.struct_mutex);
	return err;
}

int intel_hangcheck_live_selftests(struct drm_i915_private *i915)
{
	static const struct i915_subtest tests[] = {
		SUBTEST(igt_global_reset), /* attempt to recover GPU first */
		SUBTEST(igt_hang_sanitycheck),
		SUBTEST(igt_reset_idle_engine),
		SUBTEST(igt_reset_active_engine),
		SUBTEST(igt_reset_engines),
		SUBTEST(igt_wait_reset),
		SUBTEST(igt_reset_queue),
		SUBTEST(igt_handle_error),
	};
	bool saved_hangcheck;
	int err;

	if (!intel_has_gpu_reset(i915))
		return 0;

	intel_runtime_pm_get(i915);
	saved_hangcheck = fetch_and_zero(&i915_modparams.enable_hangcheck);

	err = i915_subtests(tests, i915);

	mutex_lock(&i915->drm.struct_mutex);
	flush_test(i915, I915_WAIT_LOCKED);
	mutex_unlock(&i915->drm.struct_mutex);

	i915_modparams.enable_hangcheck = saved_hangcheck;
	intel_runtime_pm_put(i915);

	return err;
}