drivers/gpu/drm/i915/gt/selftest_hangcheck.c
1 /*
2  * Copyright © 2016 Intel Corporation
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8  * and/or sell copies of the Software, and to permit persons to whom the
9  * Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice (including the next
12  * paragraph) shall be included in all copies or substantial portions of the
13  * Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21  * IN THE SOFTWARE.
22  *
23  */
24
25 #include <linux/kthread.h>
26
27 #include "gem/i915_gem_context.h"
28 #include "gt/intel_gt.h"
29 #include "intel_engine_pm.h"
30
31 #include "i915_selftest.h"
32 #include "selftests/i915_random.h"
33 #include "selftests/igt_flush_test.h"
34 #include "selftests/igt_reset.h"
35 #include "selftests/igt_atomic.h"
36
37 #include "selftests/mock_drm.h"
38
39 #include "gem/selftests/mock_context.h"
40 #include "gem/selftests/igt_gem_utils.h"
41
42 #define IGT_IDLE_TIMEOUT 50 /* ms; time to wait after flushing between tests */
43
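/*
 * Common fixture for the hangcheck/reset selftests: a non-bannable kernel
 * context plus two internal objects, one used as a fake hardware status
 * page (hws/seqno) and one holding the spinning batch (obj/batch).
 */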
44 struct hang {
45         struct intel_gt *gt;
46         struct drm_i915_gem_object *hws;
47         struct drm_i915_gem_object *obj;
48         struct i915_gem_context *ctx;
49         u32 *seqno;
50         u32 *batch;
51 };
52
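/*
 * hang_init() builds the fixture: it creates the kernel context, allocates
 * the HWS and batch pages, maps the HWS write-back (seeded with 0xff) and
 * maps the batch with a coherent mapping so the CPU can rewrite it later.
 */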
53 static int hang_init(struct hang *h, struct intel_gt *gt)
54 {
55         void *vaddr;
56         int err;
57
58         memset(h, 0, sizeof(*h));
59         h->gt = gt;
60
61         h->ctx = kernel_context(gt->i915);
62         if (IS_ERR(h->ctx))
63                 return PTR_ERR(h->ctx);
64
65         GEM_BUG_ON(i915_gem_context_is_bannable(h->ctx));
66
67         h->hws = i915_gem_object_create_internal(gt->i915, PAGE_SIZE);
68         if (IS_ERR(h->hws)) {
69                 err = PTR_ERR(h->hws);
70                 goto err_ctx;
71         }
72
73         h->obj = i915_gem_object_create_internal(gt->i915, PAGE_SIZE);
74         if (IS_ERR(h->obj)) {
75                 err = PTR_ERR(h->obj);
76                 goto err_hws;
77         }
78
79         i915_gem_object_set_cache_coherency(h->hws, I915_CACHE_LLC);
80         vaddr = i915_gem_object_pin_map(h->hws, I915_MAP_WB);
81         if (IS_ERR(vaddr)) {
82                 err = PTR_ERR(vaddr);
83                 goto err_obj;
84         }
85         h->seqno = memset(vaddr, 0xff, PAGE_SIZE);
86
87         vaddr = i915_gem_object_pin_map(h->obj,
88                                         i915_coherent_map_type(gt->i915));
89         if (IS_ERR(vaddr)) {
90                 err = PTR_ERR(vaddr);
91                 goto err_unpin_hws;
92         }
93         h->batch = vaddr;
94
95         return 0;
96
97 err_unpin_hws:
98         i915_gem_object_unpin_map(h->hws);
99 err_obj:
100         i915_gem_object_put(h->obj);
101 err_hws:
102         i915_gem_object_put(h->hws);
103 err_ctx:
104         kernel_context_close(h->ctx);
105         return err;
106 }
107
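/*
 * Each request writes its seqno into a per-context slot of the HWS page;
 * hws_address() returns the GPU address of that slot within the bound vma,
 * and hws_seqno() below reads it back through the CPU mapping.
 */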
108 static u64 hws_address(const struct i915_vma *hws,
109                        const struct i915_request *rq)
110 {
111         return hws->node.start + offset_in_page(sizeof(u32)*rq->fence.context);
112 }
113
114 static int move_to_active(struct i915_vma *vma,
115                           struct i915_request *rq,
116                           unsigned int flags)
117 {
118         int err;
119
120         i915_vma_lock(vma);
121         err = i915_request_await_object(rq, vma->obj,
122                                         flags & EXEC_OBJECT_WRITE);
123         if (err == 0)
124                 err = i915_vma_move_to_active(vma, rq, flags);
125         i915_vma_unlock(vma);
126
127         return err;
128 }
129
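/*
 * hang_create_request() replaces the fixture's batch with a fresh page and
 * emits a per-gen batch that stores the request's seqno into its HWS slot
 * (so wait_until_running() can see it execute) and then branches back to
 * the start of the batch, spinning forever. The trailing
 * MI_BATCH_BUFFER_END is only reached once the batch is rewritten, e.g. by
 * hang_fini().
 */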
130 static struct i915_request *
131 hang_create_request(struct hang *h, struct intel_engine_cs *engine)
132 {
133         struct intel_gt *gt = h->gt;
134         struct i915_address_space *vm = i915_gem_context_get_vm_rcu(h->ctx);
135         struct drm_i915_gem_object *obj;
136         struct i915_request *rq = NULL;
137         struct i915_vma *hws, *vma;
138         unsigned int flags;
139         void *vaddr;
140         u32 *batch;
141         int err;
142
143         obj = i915_gem_object_create_internal(gt->i915, PAGE_SIZE);
144         if (IS_ERR(obj)) {
145                 i915_vm_put(vm);
146                 return ERR_CAST(obj);
147         }
148
149         vaddr = i915_gem_object_pin_map(obj, i915_coherent_map_type(gt->i915));
150         if (IS_ERR(vaddr)) {
151                 i915_gem_object_put(obj);
152                 i915_vm_put(vm);
153                 return ERR_CAST(vaddr);
154         }
155
156         i915_gem_object_unpin_map(h->obj);
157         i915_gem_object_put(h->obj);
158
159         h->obj = obj;
160         h->batch = vaddr;
161
162         vma = i915_vma_instance(h->obj, vm, NULL);
163         if (IS_ERR(vma)) {
164                 i915_vm_put(vm);
165                 return ERR_CAST(vma);
166         }
167
168         hws = i915_vma_instance(h->hws, vm, NULL);
169         if (IS_ERR(hws)) {
170                 i915_vm_put(vm);
171                 return ERR_CAST(hws);
172         }
173
174         err = i915_vma_pin(vma, 0, 0, PIN_USER);
175         if (err) {
176                 i915_vm_put(vm);
177                 return ERR_PTR(err);
178         }
179
180         err = i915_vma_pin(hws, 0, 0, PIN_USER);
181         if (err)
182                 goto unpin_vma;
183
184         rq = igt_request_alloc(h->ctx, engine);
185         if (IS_ERR(rq)) {
186                 err = PTR_ERR(rq);
187                 goto unpin_hws;
188         }
189
190         err = move_to_active(vma, rq, 0);
191         if (err)
192                 goto cancel_rq;
193
194         err = move_to_active(hws, rq, 0);
195         if (err)
196                 goto cancel_rq;
197
198         batch = h->batch;
199         if (INTEL_GEN(gt->i915) >= 8) {
200                 *batch++ = MI_STORE_DWORD_IMM_GEN4;
201                 *batch++ = lower_32_bits(hws_address(hws, rq));
202                 *batch++ = upper_32_bits(hws_address(hws, rq));
203                 *batch++ = rq->fence.seqno;
204                 *batch++ = MI_ARB_CHECK;
205
206                 memset(batch, 0, 1024);
207                 batch += 1024 / sizeof(*batch);
208
209                 *batch++ = MI_ARB_CHECK;
210                 *batch++ = MI_BATCH_BUFFER_START | 1 << 8 | 1;
211                 *batch++ = lower_32_bits(vma->node.start);
212                 *batch++ = upper_32_bits(vma->node.start);
213         } else if (INTEL_GEN(gt->i915) >= 6) {
214                 *batch++ = MI_STORE_DWORD_IMM_GEN4;
215                 *batch++ = 0;
216                 *batch++ = lower_32_bits(hws_address(hws, rq));
217                 *batch++ = rq->fence.seqno;
218                 *batch++ = MI_ARB_CHECK;
219
220                 memset(batch, 0, 1024);
221                 batch += 1024 / sizeof(*batch);
222
223                 *batch++ = MI_ARB_CHECK;
224                 *batch++ = MI_BATCH_BUFFER_START | 1 << 8;
225                 *batch++ = lower_32_bits(vma->node.start);
226         } else if (INTEL_GEN(gt->i915) >= 4) {
227                 *batch++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT;
228                 *batch++ = 0;
229                 *batch++ = lower_32_bits(hws_address(hws, rq));
230                 *batch++ = rq->fence.seqno;
231                 *batch++ = MI_ARB_CHECK;
232
233                 memset(batch, 0, 1024);
234                 batch += 1024 / sizeof(*batch);
235
236                 *batch++ = MI_ARB_CHECK;
237                 *batch++ = MI_BATCH_BUFFER_START | 2 << 6;
238                 *batch++ = lower_32_bits(vma->node.start);
239         } else {
240                 *batch++ = MI_STORE_DWORD_IMM | MI_MEM_VIRTUAL;
241                 *batch++ = lower_32_bits(hws_address(hws, rq));
242                 *batch++ = rq->fence.seqno;
243                 *batch++ = MI_ARB_CHECK;
244
245                 memset(batch, 0, 1024);
246                 batch += 1024 / sizeof(*batch);
247
248                 *batch++ = MI_ARB_CHECK;
249                 *batch++ = MI_BATCH_BUFFER_START | 2 << 6;
250                 *batch++ = lower_32_bits(vma->node.start);
251         }
252         *batch++ = MI_BATCH_BUFFER_END; /* not reached */
253         intel_gt_chipset_flush(engine->gt);
254
255         if (rq->engine->emit_init_breadcrumb) {
256                 err = rq->engine->emit_init_breadcrumb(rq);
257                 if (err)
258                         goto cancel_rq;
259         }
260
261         flags = 0;
262         if (INTEL_GEN(gt->i915) <= 5)
263                 flags |= I915_DISPATCH_SECURE;
264
265         err = rq->engine->emit_bb_start(rq, vma->node.start, PAGE_SIZE, flags);
266
267 cancel_rq:
268         if (err) {
269                 i915_request_skip(rq, err);
270                 i915_request_add(rq);
271         }
272 unpin_hws:
273         i915_vma_unpin(hws);
274 unpin_vma:
275         i915_vma_unpin(vma);
276         i915_vm_put(vm);
277         return err ? ERR_PTR(err) : rq;
278 }
279
280 static u32 hws_seqno(const struct hang *h, const struct i915_request *rq)
281 {
282         return READ_ONCE(h->seqno[rq->fence.context % (PAGE_SIZE/sizeof(u32))]);
283 }
284
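/*
 * hang_fini() releases any spinner still running by rewriting the first
 * dword of the batch to MI_BATCH_BUFFER_END, then drops the fixture's
 * objects and context and flushes outstanding work.
 */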
285 static void hang_fini(struct hang *h)
286 {
287         *h->batch = MI_BATCH_BUFFER_END;
288         intel_gt_chipset_flush(h->gt);
289
290         i915_gem_object_unpin_map(h->obj);
291         i915_gem_object_put(h->obj);
292
293         i915_gem_object_unpin_map(h->hws);
294         i915_gem_object_put(h->hws);
295
296         kernel_context_close(h->ctx);
297
298         igt_flush_test(h->gt->i915);
299 }
300
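/*
 * wait_until_running() polls the HWS slot (briefly busy-waiting, then
 * sleeping for up to a second) until the hanging batch has written its
 * seqno, i.e. the request has actually started on the GPU.
 */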
301 static bool wait_until_running(struct hang *h, struct i915_request *rq)
302 {
303         return !(wait_for_us(i915_seqno_passed(hws_seqno(h, rq),
304                                                rq->fence.seqno),
305                              10) &&
306                  wait_for(i915_seqno_passed(hws_seqno(h, rq),
307                                             rq->fence.seqno),
308                           1000));
309 }
310
311 static int igt_hang_sanitycheck(void *arg)
312 {
313         struct intel_gt *gt = arg;
314         struct i915_request *rq;
315         struct intel_engine_cs *engine;
316         enum intel_engine_id id;
317         struct hang h;
318         int err;
319
320         /* Basic check that we can execute our hanging batch */
321
322         err = hang_init(&h, gt);
323         if (err)
324                 return err;
325
326         for_each_engine(engine, gt, id) {
327                 struct intel_wedge_me w;
328                 long timeout;
329
330                 if (!intel_engine_can_store_dword(engine))
331                         continue;
332
333                 rq = hang_create_request(&h, engine);
334                 if (IS_ERR(rq)) {
335                         err = PTR_ERR(rq);
336                         pr_err("Failed to create request for %s, err=%d\n",
337                                engine->name, err);
338                         goto fini;
339                 }
340
341                 i915_request_get(rq);
342
343                 *h.batch = MI_BATCH_BUFFER_END;
344                 intel_gt_chipset_flush(engine->gt);
345
346                 i915_request_add(rq);
347
348                 timeout = 0;
349                 intel_wedge_on_timeout(&w, gt, HZ / 10 /* 100ms */)
350                         timeout = i915_request_wait(rq, 0,
351                                                     MAX_SCHEDULE_TIMEOUT);
352                 if (intel_gt_is_wedged(gt))
353                         timeout = -EIO;
354
355                 i915_request_put(rq);
356
357                 if (timeout < 0) {
358                         err = timeout;
359                         pr_err("Wait for request failed on %s, err=%d\n",
360                                engine->name, err);
361                         goto fini;
362                 }
363         }
364
365 fini:
366         hang_fini(&h);
367         return err;
368 }
369
370 static bool wait_for_idle(struct intel_engine_cs *engine)
371 {
372         return wait_for(intel_engine_is_idle(engine), IGT_IDLE_TIMEOUT) == 0;
373 }
374
375 static int igt_reset_nop(void *arg)
376 {
377         struct intel_gt *gt = arg;
378         struct i915_gpu_error *global = &gt->i915->gpu_error;
379         struct intel_engine_cs *engine;
380         struct i915_gem_context *ctx;
381         unsigned int reset_count, count;
382         enum intel_engine_id id;
383         struct drm_file *file;
384         IGT_TIMEOUT(end_time);
385         int err = 0;
386
387         /* Check that we can reset during non-user portions of requests */
388
389         file = mock_file(gt->i915);
390         if (IS_ERR(file))
391                 return PTR_ERR(file);
392
393         ctx = live_context(gt->i915, file);
394         if (IS_ERR(ctx)) {
395                 err = PTR_ERR(ctx);
396                 goto out;
397         }
398
399         i915_gem_context_clear_bannable(ctx);
400         reset_count = i915_reset_count(global);
401         count = 0;
402         do {
403                 for_each_engine(engine, gt, id) {
404                         int i;
405
406                         for (i = 0; i < 16; i++) {
407                                 struct i915_request *rq;
408
409                                 rq = igt_request_alloc(ctx, engine);
410                                 if (IS_ERR(rq)) {
411                                         err = PTR_ERR(rq);
412                                         break;
413                                 }
414
415                                 i915_request_add(rq);
416                         }
417                 }
418
419                 igt_global_reset_lock(gt);
420                 intel_gt_reset(gt, ALL_ENGINES, NULL);
421                 igt_global_reset_unlock(gt);
422
423                 if (intel_gt_is_wedged(gt)) {
424                         err = -EIO;
425                         break;
426                 }
427
428                 if (i915_reset_count(global) != reset_count + ++count) {
429                         pr_err("Full GPU reset not recorded!\n");
430                         err = -EINVAL;
431                         break;
432                 }
433
434                 err = igt_flush_test(gt->i915);
435                 if (err)
436                         break;
437         } while (time_before(jiffies, end_time));
438         pr_info("%s: %d resets\n", __func__, count);
439
440         err = igt_flush_test(gt->i915);
441 out:
442         mock_file_free(gt->i915, file);
443         if (intel_gt_is_wedged(gt))
444                 err = -EIO;
445         return err;
446 }
447
448 static int igt_reset_nop_engine(void *arg)
449 {
450         struct intel_gt *gt = arg;
451         struct i915_gpu_error *global = &gt->i915->gpu_error;
452         struct intel_engine_cs *engine;
453         struct i915_gem_context *ctx;
454         enum intel_engine_id id;
455         struct drm_file *file;
456         int err = 0;
457
458         /* Check that we can engine-reset during non-user portions */
459
460         if (!intel_has_reset_engine(gt))
461                 return 0;
462
463         file = mock_file(gt->i915);
464         if (IS_ERR(file))
465                 return PTR_ERR(file);
466
467         ctx = live_context(gt->i915, file);
468         if (IS_ERR(ctx)) {
469                 err = PTR_ERR(ctx);
470                 goto out;
471         }
472
473         i915_gem_context_clear_bannable(ctx);
474         for_each_engine(engine, gt, id) {
475                 unsigned int reset_count, reset_engine_count;
476                 unsigned int count;
477                 IGT_TIMEOUT(end_time);
478
479                 reset_count = i915_reset_count(global);
480                 reset_engine_count = i915_reset_engine_count(global, engine);
481                 count = 0;
482
483                 set_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
484                 do {
485                         int i;
486
487                         if (!wait_for_idle(engine)) {
488                                 pr_err("%s failed to idle before reset\n",
489                                        engine->name);
490                                 err = -EIO;
491                                 break;
492                         }
493
494                         for (i = 0; i < 16; i++) {
495                                 struct i915_request *rq;
496
497                                 rq = igt_request_alloc(ctx, engine);
498                                 if (IS_ERR(rq)) {
499                                         err = PTR_ERR(rq);
500                                         break;
501                                 }
502
503                                 i915_request_add(rq);
504                         }
505                         err = intel_engine_reset(engine, NULL);
506                         if (err) {
507                                 pr_err("i915_reset_engine failed\n");
508                                 break;
509                         }
510
511                         if (i915_reset_count(global) != reset_count) {
512                                 pr_err("Full GPU reset recorded! (engine reset expected)\n");
513                                 err = -EINVAL;
514                                 break;
515                         }
516
517                         if (i915_reset_engine_count(global, engine) !=
518                             reset_engine_count + ++count) {
519                                 pr_err("%s engine reset not recorded!\n",
520                                        engine->name);
521                                 err = -EINVAL;
522                                 break;
523                         }
524                 } while (time_before(jiffies, end_time));
525                 clear_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
526                 pr_info("%s(%s): %d resets\n", __func__, engine->name, count);
527
528                 if (err)
529                         break;
530
531                 err = igt_flush_test(gt->i915);
532                 if (err)
533                         break;
534         }
535
536         err = igt_flush_test(gt->i915);
537 out:
538         mock_file_free(gt->i915, file);
539         if (intel_gt_is_wedged(gt))
540                 err = -EIO;
541         return err;
542 }
543
544 static int __igt_reset_engine(struct intel_gt *gt, bool active)
545 {
546         struct i915_gpu_error *global = &gt->i915->gpu_error;
547         struct intel_engine_cs *engine;
548         enum intel_engine_id id;
549         struct hang h;
550         int err = 0;
551
552         /* Check that we can issue an engine reset on an idle engine (no-op) */
553
554         if (!intel_has_reset_engine(gt))
555                 return 0;
556
557         if (active) {
558                 err = hang_init(&h, gt);
559                 if (err)
560                         return err;
561         }
562
563         for_each_engine(engine, gt, id) {
564                 unsigned int reset_count, reset_engine_count;
565                 IGT_TIMEOUT(end_time);
566
567                 if (active && !intel_engine_can_store_dword(engine))
568                         continue;
569
570                 if (!wait_for_idle(engine)) {
571                         pr_err("%s failed to idle before reset\n",
572                                engine->name);
573                         err = -EIO;
574                         break;
575                 }
576
577                 reset_count = i915_reset_count(global);
578                 reset_engine_count = i915_reset_engine_count(global, engine);
579
580                 intel_engine_pm_get(engine);
581                 set_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
582                 do {
583                         if (active) {
584                                 struct i915_request *rq;
585
586                                 rq = hang_create_request(&h, engine);
587                                 if (IS_ERR(rq)) {
588                                         err = PTR_ERR(rq);
589                                         break;
590                                 }
591
592                                 i915_request_get(rq);
593                                 i915_request_add(rq);
594
595                                 if (!wait_until_running(&h, rq)) {
596                                         struct drm_printer p = drm_info_printer(gt->i915->drm.dev);
597
598                                         pr_err("%s: Failed to start request %llx, at %x\n",
599                                                __func__, rq->fence.seqno, hws_seqno(&h, rq));
600                                         intel_engine_dump(engine, &p,
601                                                           "%s\n", engine->name);
602
603                                         i915_request_put(rq);
604                                         err = -EIO;
605                                         break;
606                                 }
607
608                                 i915_request_put(rq);
609                         }
610
611                         err = intel_engine_reset(engine, NULL);
612                         if (err) {
613                                 pr_err("i915_reset_engine failed\n");
614                                 break;
615                         }
616
617                         if (i915_reset_count(global) != reset_count) {
618                                 pr_err("Full GPU reset recorded! (engine reset expected)\n");
619                                 err = -EINVAL;
620                                 break;
621                         }
622
623                         if (i915_reset_engine_count(global, engine) !=
624                             ++reset_engine_count) {
625                                 pr_err("%s engine reset not recorded!\n",
626                                        engine->name);
627                                 err = -EINVAL;
628                                 break;
629                         }
630                 } while (time_before(jiffies, end_time));
631                 clear_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
632                 intel_engine_pm_put(engine);
633
634                 if (err)
635                         break;
636
637                 err = igt_flush_test(gt->i915);
638                 if (err)
639                         break;
640         }
641
642         if (intel_gt_is_wedged(gt))
643                 err = -EIO;
644
645         if (active)
646                 hang_fini(&h);
647
648         return err;
649 }
650
651 static int igt_reset_idle_engine(void *arg)
652 {
653         return __igt_reset_engine(arg, false);
654 }
655
656 static int igt_reset_active_engine(void *arg)
657 {
658         return __igt_reset_engine(arg, true);
659 }
660
661 struct active_engine {
662         struct task_struct *task;
663         struct intel_engine_cs *engine;
664         unsigned long resets;
665         unsigned int flags;
666 };
667
668 #define TEST_ACTIVE     BIT(0)
669 #define TEST_OTHERS     BIT(1)
670 #define TEST_SELF       BIT(2)
671 #define TEST_PRIORITY   BIT(3)
672
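/*
 * active_request_put() waits up to 5s for a background request to
 * complete; if it does not, the GT is declared wedged and -EIO returned.
 */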
673 static int active_request_put(struct i915_request *rq)
674 {
675         int err = 0;
676
677         if (!rq)
678                 return 0;
679
680         if (i915_request_wait(rq, 0, 5 * HZ) < 0) {
681                 GEM_TRACE("%s timed out waiting for completion of fence %llx:%lld\n",
682                           rq->engine->name,
683                           rq->fence.context,
684                           rq->fence.seqno);
685                 GEM_TRACE_DUMP();
686
687                 intel_gt_set_wedged(rq->engine->gt);
688                 err = -EIO;
689         }
690
691         i915_request_put(rq);
692
693         return err;
694 }
695
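/*
 * active_engine() is the kthread body used to keep other engines busy
 * while one engine is being reset: it cycles a small ring of requests
 * across several contexts, optionally randomising context priority when
 * TEST_PRIORITY is set, until asked to stop.
 */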
696 static int active_engine(void *data)
697 {
698         I915_RND_STATE(prng);
699         struct active_engine *arg = data;
700         struct intel_engine_cs *engine = arg->engine;
701         struct i915_request *rq[8] = {};
702         struct i915_gem_context *ctx[ARRAY_SIZE(rq)];
703         struct drm_file *file;
704         unsigned long count = 0;
705         int err = 0;
706
707         file = mock_file(engine->i915);
708         if (IS_ERR(file))
709                 return PTR_ERR(file);
710
711         for (count = 0; count < ARRAY_SIZE(ctx); count++) {
712                 ctx[count] = live_context(engine->i915, file);
713                 if (IS_ERR(ctx[count])) {
714                         err = PTR_ERR(ctx[count]);
715                         while (--count)
716                                 i915_gem_context_put(ctx[count]);
717                         goto err_file;
718                 }
719         }
720
721         while (!kthread_should_stop()) {
722                 unsigned int idx = count++ & (ARRAY_SIZE(rq) - 1);
723                 struct i915_request *old = rq[idx];
724                 struct i915_request *new;
725
726                 new = igt_request_alloc(ctx[idx], engine);
727                 if (IS_ERR(new)) {
728                         err = PTR_ERR(new);
729                         break;
730                 }
731
732                 if (arg->flags & TEST_PRIORITY)
733                         ctx[idx]->sched.priority =
734                                 i915_prandom_u32_max_state(512, &prng);
735
736                 rq[idx] = i915_request_get(new);
737                 i915_request_add(new);
738
739                 err = active_request_put(old);
740                 if (err)
741                         break;
742
743                 cond_resched();
744         }
745
746         for (count = 0; count < ARRAY_SIZE(rq); count++) {
747                 int err__ = active_request_put(rq[count]);
748
749                 /* Keep the first error */
750                 if (!err)
751                         err = err__;
752         }
753
754 err_file:
755         mock_file_free(engine->i915, file);
756         return err;
757 }
758
759 static int __igt_reset_engines(struct intel_gt *gt,
760                                const char *test_name,
761                                unsigned int flags)
762 {
763         struct i915_gpu_error *global = &gt->i915->gpu_error;
764         struct intel_engine_cs *engine, *other;
765         enum intel_engine_id id, tmp;
766         struct hang h;
767         int err = 0;
768
769         /* Check that issuing a reset on one engine does not interfere
770          * with any other engine.
771          */
772
773         if (!intel_has_reset_engine(gt))
774                 return 0;
775
776         if (flags & TEST_ACTIVE) {
777                 err = hang_init(&h, gt);
778                 if (err)
779                         return err;
780
781                 if (flags & TEST_PRIORITY)
782                         h.ctx->sched.priority = 1024;
783         }
784
785         for_each_engine(engine, gt, id) {
786                 struct active_engine threads[I915_NUM_ENGINES] = {};
787                 unsigned long device = i915_reset_count(global);
788                 unsigned long count = 0, reported;
789                 IGT_TIMEOUT(end_time);
790
791                 if (flags & TEST_ACTIVE &&
792                     !intel_engine_can_store_dword(engine))
793                         continue;
794
795                 if (!wait_for_idle(engine)) {
796                         pr_err("i915_reset_engine(%s:%s): failed to idle before reset\n",
797                                engine->name, test_name);
798                         err = -EIO;
799                         break;
800                 }
801
802                 memset(threads, 0, sizeof(threads));
803                 for_each_engine(other, gt, tmp) {
804                         struct task_struct *tsk;
805
806                         threads[tmp].resets =
807                                 i915_reset_engine_count(global, other);
808
809                         if (!(flags & TEST_OTHERS))
810                                 continue;
811
812                         if (other == engine && !(flags & TEST_SELF))
813                                 continue;
814
815                         threads[tmp].engine = other;
816                         threads[tmp].flags = flags;
817
818                         tsk = kthread_run(active_engine, &threads[tmp],
819                                           "igt/%s", other->name);
820                         if (IS_ERR(tsk)) {
821                                 err = PTR_ERR(tsk);
822                                 goto unwind;
823                         }
824
825                         threads[tmp].task = tsk;
826                         get_task_struct(tsk);
827                 }
828
829                 yield(); /* start all threads before we begin */
830
831                 intel_engine_pm_get(engine);
832                 set_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
833                 do {
834                         struct i915_request *rq = NULL;
835
836                         if (flags & TEST_ACTIVE) {
837                                 rq = hang_create_request(&h, engine);
838                                 if (IS_ERR(rq)) {
839                                         err = PTR_ERR(rq);
840                                         break;
841                                 }
842
843                                 i915_request_get(rq);
844                                 i915_request_add(rq);
845
846                                 if (!wait_until_running(&h, rq)) {
847                                         struct drm_printer p = drm_info_printer(gt->i915->drm.dev);
848
849                                         pr_err("%s: Failed to start request %llx, at %x\n",
850                                                __func__, rq->fence.seqno, hws_seqno(&h, rq));
851                                         intel_engine_dump(engine, &p,
852                                                           "%s\n", engine->name);
853
854                                         i915_request_put(rq);
855                                         err = -EIO;
856                                         break;
857                                 }
858                         }
859
860                         err = intel_engine_reset(engine, NULL);
861                         if (err) {
862                                 pr_err("i915_reset_engine(%s:%s): failed, err=%d\n",
863                                        engine->name, test_name, err);
864                                 break;
865                         }
866
867                         count++;
868
869                         if (rq) {
870                                 if (i915_request_wait(rq, 0, HZ / 5) < 0) {
871                                         struct drm_printer p =
872                                                 drm_info_printer(gt->i915->drm.dev);
873
874                                         pr_err("i915_reset_engine(%s:%s):"
875                                                " failed to complete request after reset\n",
876                                                engine->name, test_name);
877                                         intel_engine_dump(engine, &p,
878                                                           "%s\n", engine->name);
879                                         i915_request_put(rq);
880
881                                         GEM_TRACE_DUMP();
882                                         intel_gt_set_wedged(gt);
883                                         err = -EIO;
884                                         break;
885                                 }
886
887                                 i915_request_put(rq);
888                         }
889
890                         if (!(flags & TEST_SELF) && !wait_for_idle(engine)) {
891                                 struct drm_printer p =
892                                         drm_info_printer(gt->i915->drm.dev);
893
894                                 pr_err("i915_reset_engine(%s:%s):"
895                                        " failed to idle after reset\n",
896                                        engine->name, test_name);
897                                 intel_engine_dump(engine, &p,
898                                                   "%s\n", engine->name);
899
900                                 err = -EIO;
901                                 break;
902                         }
903                 } while (time_before(jiffies, end_time));
904                 clear_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
905                 intel_engine_pm_put(engine);
906                 pr_info("i915_reset_engine(%s:%s): %lu resets\n",
907                         engine->name, test_name, count);
908
909                 reported = i915_reset_engine_count(global, engine);
910                 reported -= threads[engine->id].resets;
911                 if (reported != count) {
912                         pr_err("i915_reset_engine(%s:%s): reset %lu times, but reported %lu\n",
913                                engine->name, test_name, count, reported);
914                         if (!err)
915                                 err = -EINVAL;
916                 }
917
918 unwind:
919                 for_each_engine(other, gt, tmp) {
920                         int ret;
921
922                         if (!threads[tmp].task)
923                                 continue;
924
925                         ret = kthread_stop(threads[tmp].task);
926                         if (ret) {
927                                 pr_err("kthread for other engine %s failed, err=%d\n",
928                                        other->name, ret);
929                                 if (!err)
930                                         err = ret;
931                         }
932                         put_task_struct(threads[tmp].task);
933
934                         if (other->uabi_class != engine->uabi_class &&
935                             threads[tmp].resets !=
936                             i915_reset_engine_count(global, other)) {
937                                 pr_err("Innocent engine %s was reset (count=%ld)\n",
938                                        other->name,
939                                        i915_reset_engine_count(global, other) -
940                                        threads[tmp].resets);
941                                 if (!err)
942                                         err = -EINVAL;
943                         }
944                 }
945
946                 if (device != i915_reset_count(global)) {
947                         pr_err("Global reset (count=%ld)!\n",
948                                i915_reset_count(global) - device);
949                         if (!err)
950                                 err = -EINVAL;
951                 }
952
953                 if (err)
954                         break;
955
956                 err = igt_flush_test(gt->i915);
957                 if (err)
958                         break;
959         }
960
961         if (intel_gt_is_wedged(gt))
962                 err = -EIO;
963
964         if (flags & TEST_ACTIVE)
965                 hang_fini(&h);
966
967         return err;
968 }
969
970 static int igt_reset_engines(void *arg)
971 {
972         static const struct {
973                 const char *name;
974                 unsigned int flags;
975         } phases[] = {
976                 { "idle", 0 },
977                 { "active", TEST_ACTIVE },
978                 { "others-idle", TEST_OTHERS },
979                 { "others-active", TEST_OTHERS | TEST_ACTIVE },
980                 {
981                         "others-priority",
982                         TEST_OTHERS | TEST_ACTIVE | TEST_PRIORITY
983                 },
984                 {
985                         "self-priority",
986                         TEST_OTHERS | TEST_ACTIVE | TEST_PRIORITY | TEST_SELF,
987                 },
988                 { }
989         };
990         struct intel_gt *gt = arg;
991         typeof(*phases) *p;
992         int err;
993
994         for (p = phases; p->name; p++) {
995                 if (p->flags & TEST_PRIORITY) {
996                         if (!(gt->i915->caps.scheduler & I915_SCHEDULER_CAP_PRIORITY))
997                                 continue;
998                 }
999
1000                 err = __igt_reset_engines(arg, p->name, p->flags);
1001                 if (err)
1002                         return err;
1003         }
1004
1005         return 0;
1006 }
1007
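/*
 * fake_hangcheck() performs the reset that hangcheck would normally issue
 * for the given engine mask and returns the global reset count sampled
 * beforehand, so callers can verify a new reset was recorded.
 */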
1008 static u32 fake_hangcheck(struct intel_gt *gt, intel_engine_mask_t mask)
1009 {
1010         u32 count = i915_reset_count(&gt->i915->gpu_error);
1011
1012         intel_gt_reset(gt, mask, NULL);
1013
1014         return count;
1015 }
1016
1017 static int igt_reset_wait(void *arg)
1018 {
1019         struct intel_gt *gt = arg;
1020         struct i915_gpu_error *global = &gt->i915->gpu_error;
1021         struct intel_engine_cs *engine = gt->engine[RCS0];
1022         struct i915_request *rq;
1023         unsigned int reset_count;
1024         struct hang h;
1025         long timeout;
1026         int err;
1027
1028         if (!engine || !intel_engine_can_store_dword(engine))
1029                 return 0;
1030
1031         /* Check that we detect a stuck waiter and issue a reset */
1032
1033         igt_global_reset_lock(gt);
1034
1035         err = hang_init(&h, gt);
1036         if (err)
1037                 goto unlock;
1038
1039         rq = hang_create_request(&h, engine);
1040         if (IS_ERR(rq)) {
1041                 err = PTR_ERR(rq);
1042                 goto fini;
1043         }
1044
1045         i915_request_get(rq);
1046         i915_request_add(rq);
1047
1048         if (!wait_until_running(&h, rq)) {
1049                 struct drm_printer p = drm_info_printer(gt->i915->drm.dev);
1050
1051                 pr_err("%s: Failed to start request %llx, at %x\n",
1052                        __func__, rq->fence.seqno, hws_seqno(&h, rq));
1053                 intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name);
1054
1055                 intel_gt_set_wedged(gt);
1056
1057                 err = -EIO;
1058                 goto out_rq;
1059         }
1060
1061         reset_count = fake_hangcheck(gt, ALL_ENGINES);
1062
1063         timeout = i915_request_wait(rq, 0, 10);
1064         if (timeout < 0) {
1065                 pr_err("i915_request_wait failed on a stuck request: err=%ld\n",
1066                        timeout);
1067                 err = timeout;
1068                 goto out_rq;
1069         }
1070
1071         if (i915_reset_count(global) == reset_count) {
1072                 pr_err("No GPU reset recorded!\n");
1073                 err = -EINVAL;
1074                 goto out_rq;
1075         }
1076
1077 out_rq:
1078         i915_request_put(rq);
1079 fini:
1080         hang_fini(&h);
1081 unlock:
1082         igt_global_reset_unlock(gt);
1083
1084         if (intel_gt_is_wedged(gt))
1085                 return -EIO;
1086
1087         return err;
1088 }
1089
1090 struct evict_vma {
1091         struct completion completion;
1092         struct i915_vma *vma;
1093 };
1094
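/*
 * evict_vma() runs in a kthread and tries to evict the target node from
 * its address space; with a hanging request keeping the vma active, the
 * eviction can only make progress once the reset completes.
 */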
1095 static int evict_vma(void *data)
1096 {
1097         struct evict_vma *arg = data;
1098         struct i915_address_space *vm = arg->vma->vm;
1099         struct drm_mm_node evict = arg->vma->node;
1100         int err;
1101
1102         complete(&arg->completion);
1103
1104         mutex_lock(&vm->mutex);
1105         err = i915_gem_evict_for_node(vm, &evict, 0);
1106         mutex_unlock(&vm->mutex);
1107
1108         return err;
1109 }
1110
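/*
 * evict_fence() is the fence-register variant: it dirties the tiling state
 * and attempts to pin a fence on the vma, which likewise has to wait for
 * the hung request to be reset.
 */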
1111 static int evict_fence(void *data)
1112 {
1113         struct evict_vma *arg = data;
1114         int err;
1115
1116         complete(&arg->completion);
1117
1118         /* Mark the fence register as dirty to force the mmio update. */
1119         err = i915_gem_object_set_tiling(arg->vma->obj, I915_TILING_Y, 512);
1120         if (err) {
1121                 pr_err("Invalid Y-tiling settings; err:%d\n", err);
1122                 return err;
1123         }
1124
1125         err = i915_vma_pin(arg->vma, 0, 0, PIN_GLOBAL | PIN_MAPPABLE);
1126         if (err) {
1127                 pr_err("Unable to pin vma for Y-tiled fence; err:%d\n", err);
1128                 return err;
1129         }
1130
1131         err = i915_vma_pin_fence(arg->vma);
1132         i915_vma_unpin(arg->vma);
1133         if (err) {
1134                 pr_err("Unable to pin Y-tiled fence; err:%d\n", err);
1135                 return err;
1136         }
1137
1138         i915_vma_unpin_fence(arg->vma);
1139
1140         return 0;
1141 }
1142
1143 static int __igt_reset_evict_vma(struct intel_gt *gt,
1144                                  struct i915_address_space *vm,
1145                                  int (*fn)(void *),
1146                                  unsigned int flags)
1147 {
1148         struct intel_engine_cs *engine = gt->engine[RCS0];
1149         struct drm_i915_gem_object *obj;
1150         struct task_struct *tsk = NULL;
1151         struct i915_request *rq;
1152         struct evict_vma arg;
1153         struct hang h;
1154         unsigned int pin_flags;
1155         int err;
1156
1157         if (!gt->ggtt->num_fences && flags & EXEC_OBJECT_NEEDS_FENCE)
1158                 return 0;
1159
1160         if (!engine || !intel_engine_can_store_dword(engine))
1161                 return 0;
1162
1163         /* Check that we can recover an unbind stuck on a hanging request */
1164
1165         err = hang_init(&h, gt);
1166         if (err)
1167                 return err;
1168
1169         obj = i915_gem_object_create_internal(gt->i915, SZ_1M);
1170         if (IS_ERR(obj)) {
1171                 err = PTR_ERR(obj);
1172                 goto fini;
1173         }
1174
1175         if (flags & EXEC_OBJECT_NEEDS_FENCE) {
1176                 err = i915_gem_object_set_tiling(obj, I915_TILING_X, 512);
1177                 if (err) {
1178                         pr_err("Invalid X-tiling settings; err:%d\n", err);
1179                         goto out_obj;
1180                 }
1181         }
1182
1183         arg.vma = i915_vma_instance(obj, vm, NULL);
1184         if (IS_ERR(arg.vma)) {
1185                 err = PTR_ERR(arg.vma);
1186                 goto out_obj;
1187         }
1188
1189         rq = hang_create_request(&h, engine);
1190         if (IS_ERR(rq)) {
1191                 err = PTR_ERR(rq);
1192                 goto out_obj;
1193         }
1194
1195         pin_flags = i915_vma_is_ggtt(arg.vma) ? PIN_GLOBAL : PIN_USER;
1196
1197         if (flags & EXEC_OBJECT_NEEDS_FENCE)
1198                 pin_flags |= PIN_MAPPABLE;
1199
1200         err = i915_vma_pin(arg.vma, 0, 0, pin_flags);
1201         if (err) {
1202                 i915_request_add(rq);
1203                 goto out_obj;
1204         }
1205
1206         if (flags & EXEC_OBJECT_NEEDS_FENCE) {
1207                 err = i915_vma_pin_fence(arg.vma);
1208                 if (err) {
1209                         pr_err("Unable to pin X-tiled fence; err:%d\n", err);
1210                         i915_vma_unpin(arg.vma);
1211                         i915_request_add(rq);
1212                         goto out_obj;
1213                 }
1214         }
1215
1216         i915_vma_lock(arg.vma);
1217         err = i915_request_await_object(rq, arg.vma->obj,
1218                                         flags & EXEC_OBJECT_WRITE);
1219         if (err == 0)
1220                 err = i915_vma_move_to_active(arg.vma, rq, flags);
1221         i915_vma_unlock(arg.vma);
1222
1223         if (flags & EXEC_OBJECT_NEEDS_FENCE)
1224                 i915_vma_unpin_fence(arg.vma);
1225         i915_vma_unpin(arg.vma);
1226
1227         i915_request_get(rq);
1228         i915_request_add(rq);
1229         if (err)
1230                 goto out_rq;
1231
1232         if (!wait_until_running(&h, rq)) {
1233                 struct drm_printer p = drm_info_printer(gt->i915->drm.dev);
1234
1235                 pr_err("%s: Failed to start request %llx, at %x\n",
1236                        __func__, rq->fence.seqno, hws_seqno(&h, rq));
1237                 intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name);
1238
1239                 intel_gt_set_wedged(gt);
1240                 goto out_reset;
1241         }
1242
1243         init_completion(&arg.completion);
1244
1245         tsk = kthread_run(fn, &arg, "igt/evict_vma");
1246         if (IS_ERR(tsk)) {
1247                 err = PTR_ERR(tsk);
1248                 tsk = NULL;
1249                 goto out_reset;
1250         }
1251         get_task_struct(tsk);
1252
1253         wait_for_completion(&arg.completion);
1254
1255         if (wait_for(!list_empty(&rq->fence.cb_list), 10)) {
1256                 struct drm_printer p = drm_info_printer(gt->i915->drm.dev);
1257
1258                 pr_err("igt/evict_vma kthread did not wait\n");
1259                 intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name);
1260
1261                 intel_gt_set_wedged(gt);
1262                 goto out_reset;
1263         }
1264
1265 out_reset:
1266         igt_global_reset_lock(gt);
1267         fake_hangcheck(gt, rq->engine->mask);
1268         igt_global_reset_unlock(gt);
1269
1270         if (tsk) {
1271                 struct intel_wedge_me w;
1272
1273                 /* The reset, even indirectly, should take less than 10ms. */
1274                 intel_wedge_on_timeout(&w, gt, HZ / 10 /* 100ms */)
1275                         err = kthread_stop(tsk);
1276
1277                 put_task_struct(tsk);
1278         }
1279
1280 out_rq:
1281         i915_request_put(rq);
1282 out_obj:
1283         i915_gem_object_put(obj);
1284 fini:
1285         hang_fini(&h);
1286         if (intel_gt_is_wedged(gt))
1287                 return -EIO;
1288
1289         return err;
1290 }
1291
1292 static int igt_reset_evict_ggtt(void *arg)
1293 {
1294         struct intel_gt *gt = arg;
1295
1296         return __igt_reset_evict_vma(gt, &gt->ggtt->vm,
1297                                      evict_vma, EXEC_OBJECT_WRITE);
1298 }
1299
1300 static int igt_reset_evict_ppgtt(void *arg)
1301 {
1302         struct intel_gt *gt = arg;
1303         struct i915_gem_context *ctx;
1304         struct i915_address_space *vm;
1305         struct drm_file *file;
1306         int err;
1307
1308         file = mock_file(gt->i915);
1309         if (IS_ERR(file))
1310                 return PTR_ERR(file);
1311
1312         ctx = live_context(gt->i915, file);
1313         if (IS_ERR(ctx)) {
1314                 err = PTR_ERR(ctx);
1315                 goto out;
1316         }
1317
1318         err = 0;
1319         vm = i915_gem_context_get_vm_rcu(ctx);
1320         if (!i915_is_ggtt(vm)) {
1321                 /* aliasing == global gtt locking, covered above */
1322                 err = __igt_reset_evict_vma(gt, vm,
1323                                             evict_vma, EXEC_OBJECT_WRITE);
1324         }
1325         i915_vm_put(vm);
1326
1327 out:
1328         mock_file_free(gt->i915, file);
1329         return err;
1330 }
1331
1332 static int igt_reset_evict_fence(void *arg)
1333 {
1334         struct intel_gt *gt = arg;
1335
1336         return __igt_reset_evict_vma(gt, &gt->ggtt->vm,
1337                                      evict_fence, EXEC_OBJECT_NEEDS_FENCE);
1338 }
1339
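/* Wait for every engine other than @exclude to idle. */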
1340 static int wait_for_others(struct intel_gt *gt,
1341                            struct intel_engine_cs *exclude)
1342 {
1343         struct intel_engine_cs *engine;
1344         enum intel_engine_id id;
1345
1346         for_each_engine(engine, gt, id) {
1347                 if (engine == exclude)
1348                         continue;
1349
1350                 if (!wait_for_idle(engine))
1351                         return -EIO;
1352         }
1353
1354         return 0;
1355 }
1356
1357 static int igt_reset_queue(void *arg)
1358 {
1359         struct intel_gt *gt = arg;
1360         struct i915_gpu_error *global = &gt->i915->gpu_error;
1361         struct intel_engine_cs *engine;
1362         enum intel_engine_id id;
1363         struct hang h;
1364         int err;
1365
1366         /* Check that we replay pending requests following a hang */
1367
1368         igt_global_reset_lock(gt);
1369
1370         err = hang_init(&h, gt);
1371         if (err)
1372                 goto unlock;
1373
1374         for_each_engine(engine, gt, id) {
1375                 struct i915_request *prev;
1376                 IGT_TIMEOUT(end_time);
1377                 unsigned int count;
1378
1379                 if (!intel_engine_can_store_dword(engine))
1380                         continue;
1381
1382                 prev = hang_create_request(&h, engine);
1383                 if (IS_ERR(prev)) {
1384                         err = PTR_ERR(prev);
1385                         goto fini;
1386                 }
1387
1388                 i915_request_get(prev);
1389                 i915_request_add(prev);
1390
1391                 count = 0;
1392                 do {
1393                         struct i915_request *rq;
1394                         unsigned int reset_count;
1395
1396                         rq = hang_create_request(&h, engine);
1397                         if (IS_ERR(rq)) {
1398                                 err = PTR_ERR(rq);
1399                                 goto fini;
1400                         }
1401
1402                         i915_request_get(rq);
1403                         i915_request_add(rq);
1404
1405                         /*
1406                          * XXX We don't handle resetting the kernel context
1407                          * very well. If we trigger a device reset twice in
1408                          * quick succession while the kernel context is
1409                          * executing, we may end up skipping the breadcrumb.
1410                          * This is really only a problem for the selftest as
1411                          * normally there is a large interlude between resets
1412                          * (hangcheck), or we focus on resetting just one
1413                          * engine and so avoid repeatedly resetting innocents.
1414                          */
1415                         err = wait_for_others(gt, engine);
1416                         if (err) {
1417                                 pr_err("%s(%s): Failed to idle other inactive engines after device reset\n",
1418                                        __func__, engine->name);
1419                                 i915_request_put(rq);
1420                                 i915_request_put(prev);
1421
1422                                 GEM_TRACE_DUMP();
1423                                 intel_gt_set_wedged(gt);
1424                                 goto fini;
1425                         }
1426
1427                         if (!wait_until_running(&h, prev)) {
1428                                 struct drm_printer p = drm_info_printer(gt->i915->drm.dev);
1429
1430                                 pr_err("%s(%s): Failed to start request %llx, at %x\n",
1431                                        __func__, engine->name,
1432                                        prev->fence.seqno, hws_seqno(&h, prev));
1433                                 intel_engine_dump(engine, &p,
1434                                                   "%s\n", engine->name);
1435
1436                                 i915_request_put(rq);
1437                                 i915_request_put(prev);
1438
1439                                 intel_gt_set_wedged(gt);
1440
1441                                 err = -EIO;
1442                                 goto fini;
1443                         }
1444
1445                         reset_count = fake_hangcheck(gt, BIT(id));
1446
1447                         if (prev->fence.error != -EIO) {
1448                                 pr_err("GPU reset not recorded on hanging request [fence.error=%d]!\n",
1449                                        prev->fence.error);
1450                                 i915_request_put(rq);
1451                                 i915_request_put(prev);
1452                                 err = -EINVAL;
1453                                 goto fini;
1454                         }
1455
1456                         if (rq->fence.error) {
1457                                 pr_err("Fence error status not zero [%d] after unrelated reset\n",
1458                                        rq->fence.error);
1459                                 i915_request_put(rq);
1460                                 i915_request_put(prev);
1461                                 err = -EINVAL;
1462                                 goto fini;
1463                         }
1464
1465                         if (i915_reset_count(global) == reset_count) {
1466                                 pr_err("No GPU reset recorded!\n");
1467                                 i915_request_put(rq);
1468                                 i915_request_put(prev);
1469                                 err = -EINVAL;
1470                                 goto fini;
1471                         }
1472
1473                         i915_request_put(prev);
1474                         prev = rq;
1475                         count++;
1476                 } while (time_before(jiffies, end_time));
1477                 pr_info("%s: Completed %d resets\n", engine->name, count);
1478
1479                 *h.batch = MI_BATCH_BUFFER_END;
1480                 intel_gt_chipset_flush(engine->gt);
1481
1482                 i915_request_put(prev);
1483
1484                 err = igt_flush_test(gt->i915);
1485                 if (err)
1486                         break;
1487         }
1488
1489 fini:
1490         hang_fini(&h);
1491 unlock:
1492         igt_global_reset_unlock(gt);
1493
1494         if (intel_gt_is_wedged(gt))
1495                 return -EIO;
1496
1497         return err;
1498 }
1499
1500 static int igt_handle_error(void *arg)
1501 {
1502         struct intel_gt *gt = arg;
1503         struct i915_gpu_error *global = &gt->i915->gpu_error;
1504         struct intel_engine_cs *engine = gt->engine[RCS0];
1505         struct hang h;
1506         struct i915_request *rq;
1507         struct i915_gpu_state *error;
1508         int err;
1509
1510         /* Check that we can issue a global GPU and engine reset */
1511
1512         if (!intel_has_reset_engine(gt))
1513                 return 0;
1514
1515         if (!engine || !intel_engine_can_store_dword(engine))
1516                 return 0;
1517
1518         err = hang_init(&h, gt);
1519         if (err)
1520                 return err;
1521
1522         rq = hang_create_request(&h, engine);
1523         if (IS_ERR(rq)) {
1524                 err = PTR_ERR(rq);
1525                 goto err_fini;
1526         }
1527
1528         i915_request_get(rq);
1529         i915_request_add(rq);
1530
1531         if (!wait_until_running(&h, rq)) {
1532                 struct drm_printer p = drm_info_printer(gt->i915->drm.dev);
1533
1534                 pr_err("%s: Failed to start request %llx, at %x\n",
1535                        __func__, rq->fence.seqno, hws_seqno(&h, rq));
1536                 intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name);
1537
1538                 intel_gt_set_wedged(gt);
1539
1540                 err = -EIO;
1541                 goto err_request;
1542         }
1543
1544         /* Temporarily disable error capture */
1545         error = xchg(&global->first_error, (void *)-1);
1546
1547         intel_gt_handle_error(gt, engine->mask, 0, NULL);
1548
1549         xchg(&global->first_error, error);
1550
1551         if (rq->fence.error != -EIO) {
1552                 pr_err("Guilty request not identified!\n");
1553                 err = -EINVAL;
1554                 goto err_request;
1555         }
1556
1557 err_request:
1558         i915_request_put(rq);
1559 err_fini:
1560         hang_fini(&h);
1561         return err;
1562 }
1563
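/*
 * __igt_atomic_reset_engine() performs an engine reset with the execlists
 * tasklet disabled and inside the atomic section described by @p
 * (e.g. with irqs, softirqs or preemption disabled), checking that engine
 * reset remains usable from atomic context.
 */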
1564 static int __igt_atomic_reset_engine(struct intel_engine_cs *engine,
1565                                      const struct igt_atomic_section *p,
1566                                      const char *mode)
1567 {
1568         struct tasklet_struct * const t = &engine->execlists.tasklet;
1569         int err;
1570
1571         GEM_TRACE("i915_reset_engine(%s:%s) under %s\n",
1572                   engine->name, mode, p->name);
1573
1574         tasklet_disable(t);
1575         p->critical_section_begin();
1576
1577         err = intel_engine_reset(engine, NULL);
1578
1579         p->critical_section_end();
1580         tasklet_enable(t);
1581
1582         if (err)
1583                 pr_err("i915_reset_engine(%s:%s) failed under %s\n",
1584                        engine->name, mode, p->name);
1585
1586         return err;
1587 }
1588
1589 static int igt_atomic_reset_engine(struct intel_engine_cs *engine,
1590                                    const struct igt_atomic_section *p)
1591 {
1592         struct i915_request *rq;
1593         struct hang h;
1594         int err;
1595
1596         err = __igt_atomic_reset_engine(engine, p, "idle");
1597         if (err)
1598                 return err;
1599
1600         err = hang_init(&h, engine->gt);
1601         if (err)
1602                 return err;
1603
1604         rq = hang_create_request(&h, engine);
1605         if (IS_ERR(rq)) {
1606                 err = PTR_ERR(rq);
1607                 goto out;
1608         }
1609
1610         i915_request_get(rq);
1611         i915_request_add(rq);
1612
1613         if (wait_until_running(&h, rq)) {
1614                 err = __igt_atomic_reset_engine(engine, p, "active");
1615         } else {
1616                 pr_err("%s(%s): Failed to start request %llx, at %x\n",
1617                        __func__, engine->name,
1618                        rq->fence.seqno, hws_seqno(&h, rq));
1619                 intel_gt_set_wedged(engine->gt);
1620                 err = -EIO;
1621         }
1622
1623         if (err == 0) {
1624                 struct intel_wedge_me w;
1625
1626                 intel_wedge_on_timeout(&w, engine->gt, HZ / 20 /* 50ms */)
1627                         i915_request_wait(rq, 0, MAX_SCHEDULE_TIMEOUT);
1628                 if (intel_gt_is_wedged(engine->gt))
1629                         err = -EIO;
1630         }
1631
1632         i915_request_put(rq);
1633 out:
1634         hang_fini(&h);
1635         return err;
1636 }
1637
1638 static int igt_reset_engines_atomic(void *arg)
1639 {
1640         struct intel_gt *gt = arg;
1641         const typeof(*igt_atomic_phases) *p;
1642         int err = 0;
1643
1644         /* Check that the engines resets are usable from atomic context */
1645
1646         if (!intel_has_reset_engine(gt))
1647                 return 0;
1648
1649         if (USES_GUC_SUBMISSION(gt->i915))
1650                 return 0;
1651
1652         igt_global_reset_lock(gt);
1653
1654         /* Flush any requests before we get started and check basics */
1655         if (!igt_force_reset(gt))
1656                 goto unlock;
1657
1658         for (p = igt_atomic_phases; p->name; p++) {
1659                 struct intel_engine_cs *engine;
1660                 enum intel_engine_id id;
1661
1662                 for_each_engine(engine, gt, id) {
1663                         err = igt_atomic_reset_engine(engine, p);
1664                         if (err)
1665                                 goto out;
1666                 }
1667         }
1668
1669 out:
1670         /* As we poke around the guts, do a full reset before continuing. */
1671         igt_force_reset(gt);
1672 unlock:
1673         igt_global_reset_unlock(gt);
1674
1675         return err;
1676 }
1677
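/*
 * intel_hangcheck_live_selftests() is the live selftest entry point: it is
 * skipped when the device cannot perform a GPU reset, bails out early if
 * the GT is already wedged, and holds a runtime PM wakeref around the
 * subtests.
 */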
1678 int intel_hangcheck_live_selftests(struct drm_i915_private *i915)
1679 {
1680         static const struct i915_subtest tests[] = {
1681                 SUBTEST(igt_hang_sanitycheck),
1682                 SUBTEST(igt_reset_nop),
1683                 SUBTEST(igt_reset_nop_engine),
1684                 SUBTEST(igt_reset_idle_engine),
1685                 SUBTEST(igt_reset_active_engine),
1686                 SUBTEST(igt_reset_engines),
1687                 SUBTEST(igt_reset_engines_atomic),
1688                 SUBTEST(igt_reset_queue),
1689                 SUBTEST(igt_reset_wait),
1690                 SUBTEST(igt_reset_evict_ggtt),
1691                 SUBTEST(igt_reset_evict_ppgtt),
1692                 SUBTEST(igt_reset_evict_fence),
1693                 SUBTEST(igt_handle_error),
1694         };
1695         struct intel_gt *gt = &i915->gt;
1696         intel_wakeref_t wakeref;
1697         int err;
1698
1699         if (!intel_has_gpu_reset(gt))
1700                 return 0;
1701
1702         if (intel_gt_is_wedged(gt))
1703                 return -EIO; /* we're long past hope of a successful reset */
1704
1705         wakeref = intel_runtime_pm_get(gt->uncore->rpm);
1706
1707         err = intel_gt_live_subtests(tests, gt);
1708
1709         intel_runtime_pm_put(gt->uncore->rpm, wakeref);
1710
1711         return err;
1712 }