/*
 * Copyright © 2016 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 *
 */

#include <linux/kthread.h>

#include "../i915_selftest.h"

#include "mock_context.h"
#include "mock_drm.h"
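
/*
 * Scratch state shared by the hang selftests: a GPU-visible status page
 * (hws) to which the spinning batch reports its progress, and the batch
 * object itself, which loops back to its own start until rewritten.
 */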
struct hang {
	struct drm_i915_private *i915;
	struct drm_i915_gem_object *hws;
	struct drm_i915_gem_object *obj;
	u32 *seqno;
	u32 *batch;
};
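
/*
 * Create and map the two internal objects used by every hang test: the
 * CPU-readable seqno page and the self-recursing batch buffer.
 */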
static int hang_init(struct hang *h, struct drm_i915_private *i915)
{
	void *vaddr;
	int err;

	memset(h, 0, sizeof(*h));
	h->i915 = i915;

	h->hws = i915_gem_object_create_internal(i915, PAGE_SIZE);
	if (IS_ERR(h->hws))
		return PTR_ERR(h->hws);

	h->obj = i915_gem_object_create_internal(i915, PAGE_SIZE);
	if (IS_ERR(h->obj)) {
		err = PTR_ERR(h->obj);
		goto err_hws;
	}

	i915_gem_object_set_cache_level(h->hws, I915_CACHE_LLC);
	vaddr = i915_gem_object_pin_map(h->hws, I915_MAP_WB);
	if (IS_ERR(vaddr)) {
		err = PTR_ERR(vaddr);
		goto err_obj;
	}
	h->seqno = memset(vaddr, 0xff, PAGE_SIZE);

	vaddr = i915_gem_object_pin_map(h->obj,
					HAS_LLC(i915) ? I915_MAP_WB : I915_MAP_WC);
	if (IS_ERR(vaddr)) {
		err = PTR_ERR(vaddr);
		goto err_unpin_hws;
	}
	h->batch = vaddr;

	return 0;

err_unpin_hws:
	i915_gem_object_unpin_map(h->hws);
err_obj:
	i915_gem_object_put(h->obj);
err_hws:
	i915_gem_object_put(h->hws);
	return err;
}
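
/*
 * Each fence context reports into its own dword of the status page, so
 * hangs emitted from different contexts do not overwrite each other.
 */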
static u64 hws_address(const struct i915_vma *hws,
		       const struct drm_i915_gem_request *rq)
{
	return hws->node.start + offset_in_page(sizeof(u32)*rq->fence.context);
}
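
/*
 * Bind the scratch objects into the request's address space and emit a
 * batch that writes its seqno to the status page and then branches back
 * to its own start, spinning until it is rewritten with
 * MI_BATCH_BUFFER_END or the GPU is reset.
 */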
static int emit_recurse_batch(struct hang *h,
			      struct drm_i915_gem_request *rq)
{
	struct drm_i915_private *i915 = h->i915;
	struct i915_address_space *vm = rq->ctx->ppgtt ? &rq->ctx->ppgtt->base : &i915->ggtt.base;
	struct i915_vma *hws, *vma;
	unsigned int flags;
	u32 *batch;
	int err;

	vma = i915_vma_instance(h->obj, vm, NULL);
	if (IS_ERR(vma))
		return PTR_ERR(vma);

	hws = i915_vma_instance(h->hws, vm, NULL);
	if (IS_ERR(hws))
		return PTR_ERR(hws);

	err = i915_vma_pin(vma, 0, 0, PIN_USER);
	if (err)
		return err;

	err = i915_vma_pin(hws, 0, 0, PIN_USER);
	if (err)
		goto unpin_vma;

	err = i915_switch_context(rq);
	if (err)
		goto unpin_hws;

	i915_vma_move_to_active(vma, rq, 0);
	if (!i915_gem_object_has_active_reference(vma->obj)) {
		i915_gem_object_get(vma->obj);
		i915_gem_object_set_active_reference(vma->obj);
	}

	i915_vma_move_to_active(hws, rq, 0);
	if (!i915_gem_object_has_active_reference(hws->obj)) {
		i915_gem_object_get(hws->obj);
		i915_gem_object_set_active_reference(hws->obj);
	}

	batch = h->batch;
	if (INTEL_GEN(i915) >= 8) {
		*batch++ = MI_STORE_DWORD_IMM_GEN4;
		*batch++ = lower_32_bits(hws_address(hws, rq));
		*batch++ = upper_32_bits(hws_address(hws, rq));
		*batch++ = rq->fence.seqno;
		*batch++ = MI_BATCH_BUFFER_START | 1 << 8 | 1;
		*batch++ = lower_32_bits(vma->node.start);
		*batch++ = upper_32_bits(vma->node.start);
	} else if (INTEL_GEN(i915) >= 6) {
		*batch++ = MI_STORE_DWORD_IMM_GEN4;
		*batch++ = 0;
		*batch++ = lower_32_bits(hws_address(hws, rq));
		*batch++ = rq->fence.seqno;
		*batch++ = MI_BATCH_BUFFER_START | 1 << 8;
		*batch++ = lower_32_bits(vma->node.start);
	} else if (INTEL_GEN(i915) >= 4) {
		*batch++ = MI_STORE_DWORD_IMM_GEN4 | 1 << 22;
		*batch++ = 0;
		*batch++ = lower_32_bits(hws_address(hws, rq));
		*batch++ = rq->fence.seqno;
		*batch++ = MI_BATCH_BUFFER_START | 2 << 6;
		*batch++ = lower_32_bits(vma->node.start);
	} else {
		*batch++ = MI_STORE_DWORD_IMM;
		*batch++ = lower_32_bits(hws_address(hws, rq));
		*batch++ = rq->fence.seqno;
		*batch++ = MI_BATCH_BUFFER_START | 2 << 6 | 1;
		*batch++ = lower_32_bits(vma->node.start);
	}
	*batch++ = MI_BATCH_BUFFER_END; /* not reached */
	i915_gem_chipset_flush(h->i915);

	flags = 0;
	if (INTEL_GEN(vm->i915) <= 5)
		flags |= I915_DISPATCH_SECURE;

	err = rq->engine->emit_bb_start(rq, vma->node.start, PAGE_SIZE, flags);

unpin_hws:
	i915_vma_unpin(hws);
unpin_vma:
	i915_vma_unpin(vma);
	return err;
}
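
/*
 * Build a request carrying the spinning batch. If the previous batch
 * object is still busy on the GPU, allocate a fresh one so that we never
 * rewrite a batch that may still be executing.
 */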
static struct drm_i915_gem_request *
hang_create_request(struct hang *h,
		    struct intel_engine_cs *engine,
		    struct i915_gem_context *ctx)
{
	struct drm_i915_gem_request *rq;
	int err;

	if (i915_gem_object_is_active(h->obj)) {
		struct drm_i915_gem_object *obj;
		void *vaddr;

		obj = i915_gem_object_create_internal(h->i915, PAGE_SIZE);
		if (IS_ERR(obj))
			return ERR_CAST(obj);

		vaddr = i915_gem_object_pin_map(obj,
						HAS_LLC(h->i915) ? I915_MAP_WB : I915_MAP_WC);
		if (IS_ERR(vaddr)) {
			i915_gem_object_put(obj);
			return ERR_CAST(vaddr);
		}

		i915_gem_object_unpin_map(h->obj);
		i915_gem_object_put(h->obj);

		h->obj = obj;
		h->batch = vaddr;
	}

	rq = i915_gem_request_alloc(engine, ctx);
	if (IS_ERR(rq))
		return rq;

	err = emit_recurse_batch(h, rq);
	if (err) {
		__i915_add_request(rq, false);
		return ERR_PTR(err);
	}

	return rq;
}
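
/* Read back the seqno the spinning batch last reported for this context. */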
static u32 hws_seqno(const struct hang *h,
		     const struct drm_i915_gem_request *rq)
{
	return READ_ONCE(h->seqno[rq->fence.context % (PAGE_SIZE/sizeof(u32))]);
}
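
/*
 * Terminate any spinning batch, release the scratch objects and wait for
 * the GPU to idle so one subtest cannot affect the next.
 */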
static void hang_fini(struct hang *h)
{
	*h->batch = MI_BATCH_BUFFER_END;
	i915_gem_chipset_flush(h->i915);

	i915_gem_object_unpin_map(h->obj);
	i915_gem_object_put(h->obj);

	i915_gem_object_unpin_map(h->hws);
	i915_gem_object_put(h->hws);

	i915_gem_wait_for_idle(h->i915, I915_WAIT_LOCKED);
}

static int igt_hang_sanitycheck(void *arg)
{
	struct drm_i915_private *i915 = arg;
	struct drm_i915_gem_request *rq;
	struct intel_engine_cs *engine;
	enum intel_engine_id id;
	struct hang h;
	int err;

	/* Basic check that we can execute our hanging batch */

	mutex_lock(&i915->drm.struct_mutex);
	err = hang_init(&h, i915);
	if (err)
		goto unlock;

	for_each_engine(engine, i915, id) {
		long timeout;

		if (!intel_engine_can_store_dword(engine))
			continue;

		rq = hang_create_request(&h, engine, i915->kernel_context);
		if (IS_ERR(rq)) {
			err = PTR_ERR(rq);
			pr_err("Failed to create request for %s, err=%d\n",
			       engine->name, err);
			goto fini;
		}

		i915_gem_request_get(rq);

		*h.batch = MI_BATCH_BUFFER_END;
		i915_gem_chipset_flush(i915);

		__i915_add_request(rq, true);

		timeout = i915_wait_request(rq,
					    I915_WAIT_LOCKED,
					    MAX_SCHEDULE_TIMEOUT);
		i915_gem_request_put(rq);

		if (timeout < 0) {
			err = timeout;
			pr_err("Wait for request failed on %s, err=%d\n",
			       engine->name, err);
			goto fini;
		}
	}

fini:
	hang_fini(&h);
unlock:
	mutex_unlock(&i915->drm.struct_mutex);
	return err;
}
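
/*
 * Grab all the reset bits, serialising these selftests against the
 * driver's own reset handling and against concurrent resets.
 */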
static void global_reset_lock(struct drm_i915_private *i915)
{
	struct intel_engine_cs *engine;
	enum intel_engine_id id;

	while (test_and_set_bit(I915_RESET_BACKOFF, &i915->gpu_error.flags))
		wait_event(i915->gpu_error.reset_queue,
			   !test_bit(I915_RESET_BACKOFF,
				     &i915->gpu_error.flags));

	for_each_engine(engine, i915, id) {
		while (test_and_set_bit(I915_RESET_ENGINE + id,
					&i915->gpu_error.flags))
			wait_on_bit(&i915->gpu_error.flags,
				    I915_RESET_ENGINE + id,
				    TASK_UNINTERRUPTIBLE);
	}
}
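
/* Drop the bits taken by global_reset_lock() and wake any waiters. */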
static void global_reset_unlock(struct drm_i915_private *i915)
{
	struct intel_engine_cs *engine;
	enum intel_engine_id id;

	for_each_engine(engine, i915, id)
		clear_bit(I915_RESET_ENGINE + id, &i915->gpu_error.flags);

	clear_bit(I915_RESET_BACKOFF, &i915->gpu_error.flags);
	wake_up_all(&i915->gpu_error.reset_queue);
}

static int igt_global_reset(void *arg)
{
	struct drm_i915_private *i915 = arg;
	unsigned int reset_count;
	int err = 0;

	/* Check that we can issue a global GPU reset */

	global_reset_lock(i915);
	set_bit(I915_RESET_HANDOFF, &i915->gpu_error.flags);

	mutex_lock(&i915->drm.struct_mutex);
	reset_count = i915_reset_count(&i915->gpu_error);

	i915_reset(i915, I915_RESET_QUIET);

	if (i915_reset_count(&i915->gpu_error) == reset_count) {
		pr_err("No GPU reset recorded!\n");
		err = -EINVAL;
	}
	mutex_unlock(&i915->drm.struct_mutex);

	GEM_BUG_ON(test_bit(I915_RESET_HANDOFF, &i915->gpu_error.flags));
	global_reset_unlock(i915);

	if (i915_terminally_wedged(&i915->gpu_error))
		err = -EIO;

	return err;
}

static int igt_reset_engine(void *arg)
{
	struct drm_i915_private *i915 = arg;
	struct intel_engine_cs *engine;
	enum intel_engine_id id;
	unsigned int reset_count, reset_engine_count;
	int err = 0;

	/* Check that we can issue a global GPU and engine reset */

	if (!intel_has_reset_engine(i915))
		return 0;

	for_each_engine(engine, i915, id) {
		set_bit(I915_RESET_ENGINE + engine->id, &i915->gpu_error.flags);
		reset_count = i915_reset_count(&i915->gpu_error);
		reset_engine_count = i915_reset_engine_count(&i915->gpu_error,
							     engine);

		err = i915_reset_engine(engine, I915_RESET_QUIET);
		if (err) {
			pr_err("i915_reset_engine failed\n");
			break;
		}

		if (i915_reset_count(&i915->gpu_error) != reset_count) {
			pr_err("Full GPU reset recorded! (engine reset expected)\n");
			err = -EINVAL;
			break;
		}

		if (i915_reset_engine_count(&i915->gpu_error, engine) ==
		    reset_engine_count) {
			pr_err("No %s engine reset recorded!\n", engine->name);
			err = -EINVAL;
			break;
		}

		clear_bit(I915_RESET_ENGINE + engine->id,
			  &i915->gpu_error.flags);
	}

	if (i915_terminally_wedged(&i915->gpu_error))
		err = -EIO;

	return err;
}
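
/*
 * kthread body: keep the given engine busy by constantly submitting
 * requests from two alternating contexts while another engine is reset.
 */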
static int active_engine(void *data)
{
	struct intel_engine_cs *engine = data;
	struct drm_i915_gem_request *rq[2] = {};
	struct i915_gem_context *ctx[2];
	struct drm_file *file;
	unsigned long count = 0;
	int err = 0;

	file = mock_file(engine->i915);
	if (IS_ERR(file))
		return PTR_ERR(file);

	mutex_lock(&engine->i915->drm.struct_mutex);
	ctx[0] = live_context(engine->i915, file);
	mutex_unlock(&engine->i915->drm.struct_mutex);
	if (IS_ERR(ctx[0])) {
		err = PTR_ERR(ctx[0]);
		goto err_file;
	}

	mutex_lock(&engine->i915->drm.struct_mutex);
	ctx[1] = live_context(engine->i915, file);
	mutex_unlock(&engine->i915->drm.struct_mutex);
	if (IS_ERR(ctx[1])) {
		err = PTR_ERR(ctx[1]);
		i915_gem_context_put(ctx[0]);
		goto err_file;
	}

	while (!kthread_should_stop()) {
		unsigned int idx = count++ & 1;
		struct drm_i915_gem_request *old = rq[idx];
		struct drm_i915_gem_request *new;

		mutex_lock(&engine->i915->drm.struct_mutex);
		new = i915_gem_request_alloc(engine, ctx[idx]);
		if (IS_ERR(new)) {
			mutex_unlock(&engine->i915->drm.struct_mutex);
			err = PTR_ERR(new);
			break;
		}

		rq[idx] = i915_gem_request_get(new);
		i915_add_request(new);
		mutex_unlock(&engine->i915->drm.struct_mutex);

		if (old) {
			i915_wait_request(old, 0, MAX_SCHEDULE_TIMEOUT);
			i915_gem_request_put(old);
		}
	}

	for (count = 0; count < ARRAY_SIZE(rq); count++)
		i915_gem_request_put(rq[count]);

err_file:
	mock_file_free(engine->i915, file);
	return err;
}

static int igt_reset_active_engines(void *arg)
{
	struct drm_i915_private *i915 = arg;
	struct intel_engine_cs *engine, *active;
	enum intel_engine_id id, tmp;
	int err = 0;

	/* Check that issuing a reset on one engine does not interfere
	 * with any other engine.
	 */

	if (!intel_has_reset_engine(i915))
		return 0;

	for_each_engine(engine, i915, id) {
		struct task_struct *threads[I915_NUM_ENGINES];
		unsigned long resets[I915_NUM_ENGINES];
		unsigned long global = i915_reset_count(&i915->gpu_error);
		IGT_TIMEOUT(end_time);

		memset(threads, 0, sizeof(threads));
		for_each_engine(active, i915, tmp) {
			struct task_struct *tsk;

			if (active == engine)
				continue;

			resets[tmp] = i915_reset_engine_count(&i915->gpu_error,
							      active);

			tsk = kthread_run(active_engine, active,
					  "igt/%s", active->name);
			if (IS_ERR(tsk)) {
				err = PTR_ERR(tsk);
				goto unwind;
			}

			threads[tmp] = tsk;
			get_task_struct(tsk);
		}

		set_bit(I915_RESET_ENGINE + engine->id, &i915->gpu_error.flags);
		do {
			err = i915_reset_engine(engine, I915_RESET_QUIET);
			if (err) {
				pr_err("i915_reset_engine(%s) failed, err=%d\n",
				       engine->name, err);
				break;
			}
		} while (time_before(jiffies, end_time));
		clear_bit(I915_RESET_ENGINE + engine->id,
			  &i915->gpu_error.flags);

unwind:
		for_each_engine(active, i915, tmp) {
			int ret;

			if (!threads[tmp])
				continue;

			ret = kthread_stop(threads[tmp]);
			if (ret) {
				pr_err("kthread for active engine %s failed, err=%d\n",
				       active->name, ret);
				if (!err)
					err = ret;
			}
			put_task_struct(threads[tmp]);

			if (resets[tmp] != i915_reset_engine_count(&i915->gpu_error,
								   active)) {
				pr_err("Innocent engine %s was reset (count=%ld)\n",
				       active->name,
				       i915_reset_engine_count(&i915->gpu_error,
							       active) - resets[tmp]);
				err = -EIO;
			}
		}

		if (global != i915_reset_count(&i915->gpu_error)) {
			pr_err("Global reset (count=%ld)!\n",
			       i915_reset_count(&i915->gpu_error) - global);
			err = -EIO;
		}

		if (err)
			break;

		cond_resched();
	}

	if (i915_terminally_wedged(&i915->gpu_error))
		err = -EIO;

	return err;
}
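
/*
 * Pretend the hangcheck worker declared this request's engine hung: mark
 * the engine as stalled and hand the reset off to the waiter.
 */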
static u32 fake_hangcheck(struct drm_i915_gem_request *rq)
{
	u32 reset_count;

	rq->engine->hangcheck.stalled = true;
	rq->engine->hangcheck.seqno = intel_engine_get_seqno(rq->engine);

	reset_count = i915_reset_count(&rq->i915->gpu_error);

	set_bit(I915_RESET_HANDOFF, &rq->i915->gpu_error.flags);
	wake_up_all(&rq->i915->gpu_error.wait_queue);

	return reset_count;
}
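
/*
 * Wait for the spinning batch to start executing, i.e. for it to report
 * its seqno to the status page; first with a short busy-wait, then by
 * sleeping.
 */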
static bool wait_for_hang(struct hang *h, struct drm_i915_gem_request *rq)
{
	return !(wait_for_us(i915_seqno_passed(hws_seqno(h, rq),
					       rq->fence.seqno),
			     10) &&
		 wait_for(i915_seqno_passed(hws_seqno(h, rq),
					    rq->fence.seqno),
			  1000));
}

static int igt_wait_reset(void *arg)
{
	struct drm_i915_private *i915 = arg;
	struct drm_i915_gem_request *rq;
	unsigned int reset_count;
	struct hang h;
	long timeout;
	int err;

	if (!intel_engine_can_store_dword(i915->engine[RCS]))
		return 0;

	/* Check that we detect a stuck waiter and issue a reset */

	global_reset_lock(i915);

	mutex_lock(&i915->drm.struct_mutex);
	err = hang_init(&h, i915);
	if (err)
		goto unlock;

	rq = hang_create_request(&h, i915->engine[RCS], i915->kernel_context);
	if (IS_ERR(rq)) {
		err = PTR_ERR(rq);
		goto fini;
	}

	i915_gem_request_get(rq);
	__i915_add_request(rq, true);

	if (!wait_for_hang(&h, rq)) {
		struct drm_printer p = drm_info_printer(i915->drm.dev);

		pr_err("Failed to start request %x, at %x\n",
		       rq->fence.seqno, hws_seqno(&h, rq));
		intel_engine_dump(rq->engine, &p);

		i915_gem_set_wedged(i915);

		err = -EIO;
		goto out_rq;
	}

	reset_count = fake_hangcheck(rq);

	timeout = i915_wait_request(rq, I915_WAIT_LOCKED, 10);
	if (timeout < 0) {
		pr_err("i915_wait_request failed on a stuck request: err=%ld\n",
		       timeout);
		err = timeout;
		goto out_rq;
	}

	GEM_BUG_ON(test_bit(I915_RESET_HANDOFF, &i915->gpu_error.flags));
	if (i915_reset_count(&i915->gpu_error) == reset_count) {
		pr_err("No GPU reset recorded!\n");
		err = -EINVAL;
		goto out_rq;
	}

out_rq:
	i915_gem_request_put(rq);
fini:
	hang_fini(&h);
unlock:
	mutex_unlock(&i915->drm.struct_mutex);
	global_reset_unlock(i915);

	if (i915_terminally_wedged(&i915->gpu_error))
		return -EIO;

	return err;
}

static int igt_reset_queue(void *arg)
{
	struct drm_i915_private *i915 = arg;
	struct intel_engine_cs *engine;
	enum intel_engine_id id;
	struct hang h;
	int err;

	/* Check that we replay pending requests following a hang */

	global_reset_lock(i915);

	mutex_lock(&i915->drm.struct_mutex);
	err = hang_init(&h, i915);
	if (err)
		goto unlock;

	for_each_engine(engine, i915, id) {
		struct drm_i915_gem_request *prev;
		IGT_TIMEOUT(end_time);
		unsigned int count;

		if (!intel_engine_can_store_dword(engine))
			continue;

		prev = hang_create_request(&h, engine, i915->kernel_context);
		if (IS_ERR(prev)) {
			err = PTR_ERR(prev);
			goto fini;
		}

		i915_gem_request_get(prev);
		__i915_add_request(prev, true);

		count = 0;
		do {
			struct drm_i915_gem_request *rq;
			unsigned int reset_count;

			rq = hang_create_request(&h,
						 engine,
						 i915->kernel_context);
			if (IS_ERR(rq)) {
				err = PTR_ERR(rq);
				goto fini;
			}

			i915_gem_request_get(rq);
			__i915_add_request(rq, true);

			if (!wait_for_hang(&h, prev)) {
				struct drm_printer p = drm_info_printer(i915->drm.dev);

				pr_err("Failed to start request %x, at %x\n",
				       prev->fence.seqno, hws_seqno(&h, prev));
				intel_engine_dump(rq->engine, &p);

				i915_gem_request_put(rq);
				i915_gem_request_put(prev);

				i915_gem_set_wedged(i915);

				err = -EIO;
				goto fini;
			}

			reset_count = fake_hangcheck(prev);

			i915_reset(i915, I915_RESET_QUIET);

			GEM_BUG_ON(test_bit(I915_RESET_HANDOFF,
					    &i915->gpu_error.flags));

			if (prev->fence.error != -EIO) {
				pr_err("GPU reset not recorded on hanging request [fence.error=%d]!\n",
				       prev->fence.error);
				i915_gem_request_put(rq);
				i915_gem_request_put(prev);
				err = -EINVAL;
				goto fini;
			}

			if (rq->fence.error) {
				pr_err("Fence error status not zero [%d] after unrelated reset\n",
				       rq->fence.error);
				i915_gem_request_put(rq);
				i915_gem_request_put(prev);
				err = -EINVAL;
				goto fini;
			}

			if (i915_reset_count(&i915->gpu_error) == reset_count) {
				pr_err("No GPU reset recorded!\n");
				i915_gem_request_put(rq);
				i915_gem_request_put(prev);
				err = -EINVAL;
				goto fini;
			}

			i915_gem_request_put(prev);
			prev = rq;
			count++;
		} while (time_before(jiffies, end_time));
		pr_info("%s: Completed %d resets\n", engine->name, count);

		*h.batch = MI_BATCH_BUFFER_END;
		i915_gem_chipset_flush(i915);

		i915_gem_request_put(prev);
	}

fini:
	hang_fini(&h);
unlock:
	mutex_unlock(&i915->drm.struct_mutex);
	global_reset_unlock(i915);

	if (i915_terminally_wedged(&i915->gpu_error))
		return -EIO;

	return err;
}

static int igt_handle_error(void *arg)
{
	struct drm_i915_private *i915 = arg;
	struct intel_engine_cs *engine = i915->engine[RCS];
	struct hang h;
	struct drm_i915_gem_request *rq;
	struct i915_gpu_state *error;
	int err;

	/* Check that we can issue a global GPU and engine reset */

	if (!intel_has_reset_engine(i915))
		return 0;

	if (!intel_engine_can_store_dword(i915->engine[RCS]))
		return 0;

	mutex_lock(&i915->drm.struct_mutex);

	err = hang_init(&h, i915);
	if (err)
		goto err_unlock;

	rq = hang_create_request(&h, engine, i915->kernel_context);
	if (IS_ERR(rq)) {
		err = PTR_ERR(rq);
		goto err_fini;
	}

	i915_gem_request_get(rq);
	__i915_add_request(rq, true);

	if (!wait_for_hang(&h, rq)) {
		struct drm_printer p = drm_info_printer(i915->drm.dev);

		pr_err("Failed to start request %x, at %x\n",
		       rq->fence.seqno, hws_seqno(&h, rq));
		intel_engine_dump(rq->engine, &p);

		i915_gem_set_wedged(i915);

		err = -EIO;
		goto err_request;
	}

	mutex_unlock(&i915->drm.struct_mutex);

	/* Temporarily disable error capture */
	error = xchg(&i915->gpu_error.first_error, (void *)-1);

	engine->hangcheck.stalled = true;
	engine->hangcheck.seqno = intel_engine_get_seqno(engine);

	i915_handle_error(i915, intel_engine_flag(engine), "%s", __func__);

	xchg(&i915->gpu_error.first_error, error);

	mutex_lock(&i915->drm.struct_mutex);

	if (rq->fence.error != -EIO) {
		pr_err("Guilty request not identified!\n");
		err = -EINVAL;
		goto err_request;
	}

err_request:
	i915_gem_request_put(rq);
err_fini:
	hang_fini(&h);
err_unlock:
	mutex_unlock(&i915->drm.struct_mutex);
	return err;
}
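
/*
 * Entry point for the live hangcheck selftests. Skipped entirely when the
 * device cannot be reset; run under a runtime-pm reference so the device
 * stays awake throughout.
 */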
int intel_hangcheck_live_selftests(struct drm_i915_private *i915)
{
	static const struct i915_subtest tests[] = {
		SUBTEST(igt_global_reset), /* attempt to recover GPU first */
		SUBTEST(igt_hang_sanitycheck),
		SUBTEST(igt_reset_engine),
		SUBTEST(igt_reset_active_engines),
		SUBTEST(igt_wait_reset),
		SUBTEST(igt_reset_queue),
		SUBTEST(igt_handle_error),
	};
	int err;

	if (!intel_has_gpu_reset(i915))
		return 0;

	intel_runtime_pm_get(i915);

	err = i915_subtests(tests, i915);

	intel_runtime_pm_put(i915);

	return err;
}