/*
 * Copyright © 2016 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 *
 */

#include <linux/kthread.h>

#include "../i915_selftest.h"

#include "mock_context.h"
#include "mock_drm.h"
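
/*
 * Scratch state shared by the hang selftests: a GPU-visible status page
 * (hws) to which the spinning batch reports its progress, and the batch
 * object itself, which loops back to its own start until rewritten.
 */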
struct hang {
	struct drm_i915_private *i915;
	struct drm_i915_gem_object *hws;
	struct drm_i915_gem_object *obj;
	u32 *seqno;
	u32 *batch;
};
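
/*
 * Create and map the two internal objects used by every hang test: the
 * CPU-readable seqno page and the self-recursing batch buffer.
 */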
static int hang_init(struct hang *h, struct drm_i915_private *i915)
{
	void *vaddr;
	int err;

	memset(h, 0, sizeof(*h));
	h->i915 = i915;

	h->hws = i915_gem_object_create_internal(i915, PAGE_SIZE);
	if (IS_ERR(h->hws))
		return PTR_ERR(h->hws);

	h->obj = i915_gem_object_create_internal(i915, PAGE_SIZE);
	if (IS_ERR(h->obj)) {
		err = PTR_ERR(h->obj);
		goto err_hws;
	}

	i915_gem_object_set_cache_level(h->hws, I915_CACHE_LLC);
	vaddr = i915_gem_object_pin_map(h->hws, I915_MAP_WB);
	if (IS_ERR(vaddr)) {
		err = PTR_ERR(vaddr);
		goto err_obj;
	}
	h->seqno = memset(vaddr, 0xff, PAGE_SIZE);

	vaddr = i915_gem_object_pin_map(h->obj,
					HAS_LLC(i915) ? I915_MAP_WB : I915_MAP_WC);
	if (IS_ERR(vaddr)) {
		err = PTR_ERR(vaddr);
		goto err_unpin_hws;
	}
	h->batch = vaddr;

	return 0;

err_unpin_hws:
	i915_gem_object_unpin_map(h->hws);
err_obj:
	i915_gem_object_put(h->obj);
err_hws:
	i915_gem_object_put(h->hws);
	return err;
}
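
/*
 * Each fence context reports into its own dword of the status page, so
 * hangs emitted from different contexts do not overwrite each other.
 */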
static u64 hws_address(const struct i915_vma *hws,
		       const struct drm_i915_gem_request *rq)
{
	return hws->node.start + offset_in_page(sizeof(u32)*rq->fence.context);
}
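
/*
 * Bind the scratch objects into the request's address space and emit a
 * batch that writes its seqno to the status page and then branches back
 * to its own start, spinning until it is rewritten with
 * MI_BATCH_BUFFER_END or the GPU is reset.
 */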
static int emit_recurse_batch(struct hang *h,
			      struct drm_i915_gem_request *rq)
{
	struct drm_i915_private *i915 = h->i915;
	struct i915_address_space *vm = rq->ctx->ppgtt ? &rq->ctx->ppgtt->base : &i915->ggtt.base;
	struct i915_vma *hws, *vma;
	unsigned int flags;
	u32 *batch;
	int err;

	vma = i915_vma_instance(h->obj, vm, NULL);
	if (IS_ERR(vma))
		return PTR_ERR(vma);

	hws = i915_vma_instance(h->hws, vm, NULL);
	if (IS_ERR(hws))
		return PTR_ERR(hws);

	err = i915_vma_pin(vma, 0, 0, PIN_USER);
	if (err)
		return err;

	err = i915_vma_pin(hws, 0, 0, PIN_USER);
	if (err)
		goto unpin_vma;

	err = i915_switch_context(rq);
	if (err)
		goto unpin_hws;

	i915_vma_move_to_active(vma, rq, 0);
	if (!i915_gem_object_has_active_reference(vma->obj)) {
		i915_gem_object_get(vma->obj);
		i915_gem_object_set_active_reference(vma->obj);
	}

	i915_vma_move_to_active(hws, rq, 0);
	if (!i915_gem_object_has_active_reference(hws->obj)) {
		i915_gem_object_get(hws->obj);
		i915_gem_object_set_active_reference(hws->obj);
	}

	batch = h->batch;
	if (INTEL_GEN(i915) >= 8) {
		*batch++ = MI_STORE_DWORD_IMM_GEN4;
		*batch++ = lower_32_bits(hws_address(hws, rq));
		*batch++ = upper_32_bits(hws_address(hws, rq));
		*batch++ = rq->fence.seqno;
		*batch++ = MI_BATCH_BUFFER_START | 1 << 8 | 1;
		*batch++ = lower_32_bits(vma->node.start);
		*batch++ = upper_32_bits(vma->node.start);
	} else if (INTEL_GEN(i915) >= 6) {
		*batch++ = MI_STORE_DWORD_IMM_GEN4;
		*batch++ = 0;
		*batch++ = lower_32_bits(hws_address(hws, rq));
		*batch++ = rq->fence.seqno;
		*batch++ = MI_BATCH_BUFFER_START | 1 << 8;
		*batch++ = lower_32_bits(vma->node.start);
	} else if (INTEL_GEN(i915) >= 4) {
		*batch++ = MI_STORE_DWORD_IMM_GEN4 | 1 << 22;
		*batch++ = 0;
		*batch++ = lower_32_bits(hws_address(hws, rq));
		*batch++ = rq->fence.seqno;
		*batch++ = MI_BATCH_BUFFER_START | 2 << 6;
		*batch++ = lower_32_bits(vma->node.start);
	} else {
		*batch++ = MI_STORE_DWORD_IMM;
		*batch++ = lower_32_bits(hws_address(hws, rq));
		*batch++ = rq->fence.seqno;
		*batch++ = MI_BATCH_BUFFER_START | 2 << 6 | 1;
		*batch++ = lower_32_bits(vma->node.start);
	}
	*batch++ = MI_BATCH_BUFFER_END; /* not reached */
	i915_gem_chipset_flush(h->i915);

	flags = 0;
	if (INTEL_GEN(vm->i915) <= 5)
		flags |= I915_DISPATCH_SECURE;

	err = rq->engine->emit_bb_start(rq, vma->node.start, PAGE_SIZE, flags);

unpin_hws:
	i915_vma_unpin(hws);
unpin_vma:
	i915_vma_unpin(vma);
	return err;
}
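
/*
 * Build a request carrying the spinning batch. If the previous batch
 * object is still busy on the GPU, allocate a fresh one so that we never
 * rewrite a batch that may still be executing.
 */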
static struct drm_i915_gem_request *
hang_create_request(struct hang *h,
		    struct intel_engine_cs *engine,
		    struct i915_gem_context *ctx)
{
	struct drm_i915_gem_request *rq;
	int err;

	if (i915_gem_object_is_active(h->obj)) {
		struct drm_i915_gem_object *obj;
		void *vaddr;

		obj = i915_gem_object_create_internal(h->i915, PAGE_SIZE);
		if (IS_ERR(obj))
			return ERR_CAST(obj);

		vaddr = i915_gem_object_pin_map(obj,
						HAS_LLC(h->i915) ? I915_MAP_WB : I915_MAP_WC);
		if (IS_ERR(vaddr)) {
			i915_gem_object_put(obj);
			return ERR_CAST(vaddr);
		}

		i915_gem_object_unpin_map(h->obj);
		i915_gem_object_put(h->obj);

		h->obj = obj;
		h->batch = vaddr;
	}

	rq = i915_gem_request_alloc(engine, ctx);
	if (IS_ERR(rq))
		return rq;

	err = emit_recurse_batch(h, rq);
	if (err) {
		__i915_add_request(rq, false);
		return ERR_PTR(err);
	}

	return rq;
}
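
/* Read back the seqno the spinning batch last reported for this context. */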
static u32 hws_seqno(const struct hang *h,
		     const struct drm_i915_gem_request *rq)
{
	return READ_ONCE(h->seqno[rq->fence.context % (PAGE_SIZE/sizeof(u32))]);
}
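
/*
 * Terminate any spinning batch, release the scratch objects and wait for
 * the GPU to idle so one subtest cannot affect the next.
 */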
static void hang_fini(struct hang *h)
{
	*h->batch = MI_BATCH_BUFFER_END;
	i915_gem_chipset_flush(h->i915);

	i915_gem_object_unpin_map(h->obj);
	i915_gem_object_put(h->obj);

	i915_gem_object_unpin_map(h->hws);
	i915_gem_object_put(h->hws);

	i915_gem_wait_for_idle(h->i915, I915_WAIT_LOCKED);
}

static int igt_hang_sanitycheck(void *arg)
{
	struct drm_i915_private *i915 = arg;
	struct drm_i915_gem_request *rq;
	struct intel_engine_cs *engine;
	enum intel_engine_id id;
	struct hang h;
	int err;

	/* Basic check that we can execute our hanging batch */

	mutex_lock(&i915->drm.struct_mutex);
	err = hang_init(&h, i915);
	if (err)
		goto unlock;

	for_each_engine(engine, i915, id) {
		long timeout;

		if (!intel_engine_can_store_dword(engine))
			continue;

		rq = hang_create_request(&h, engine, i915->kernel_context);
		if (IS_ERR(rq)) {
			err = PTR_ERR(rq);
			pr_err("Failed to create request for %s, err=%d\n",
			       engine->name, err);
			goto fini;
		}

		i915_gem_request_get(rq);

		*h.batch = MI_BATCH_BUFFER_END;
		i915_gem_chipset_flush(i915);

		__i915_add_request(rq, true);

		timeout = i915_wait_request(rq,
					    I915_WAIT_LOCKED,
					    MAX_SCHEDULE_TIMEOUT);
		i915_gem_request_put(rq);

		if (timeout < 0) {
			err = timeout;
			pr_err("Wait for request failed on %s, err=%d\n",
			       engine->name, err);
			goto fini;
		}
	}

fini:
	hang_fini(&h);
unlock:
	mutex_unlock(&i915->drm.struct_mutex);
	return err;
}
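
/*
 * Grab all the reset bits, serialising these selftests against the
 * driver's own reset handling and against concurrent resets.
 */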
static void global_reset_lock(struct drm_i915_private *i915)
{
	struct intel_engine_cs *engine;
	enum intel_engine_id id;

	while (test_and_set_bit(I915_RESET_BACKOFF, &i915->gpu_error.flags))
		wait_event(i915->gpu_error.reset_queue,
			   !test_bit(I915_RESET_BACKOFF,
				     &i915->gpu_error.flags));

	for_each_engine(engine, i915, id) {
		while (test_and_set_bit(I915_RESET_ENGINE + id,
					&i915->gpu_error.flags))
			wait_on_bit(&i915->gpu_error.flags,
				    I915_RESET_ENGINE + id,
				    TASK_UNINTERRUPTIBLE);
	}
}
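
/* Drop the bits taken by global_reset_lock() and wake any waiters. */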
static void global_reset_unlock(struct drm_i915_private *i915)
{
	struct intel_engine_cs *engine;
	enum intel_engine_id id;

	for_each_engine(engine, i915, id)
		clear_bit(I915_RESET_ENGINE + id, &i915->gpu_error.flags);

	clear_bit(I915_RESET_BACKOFF, &i915->gpu_error.flags);
	wake_up_all(&i915->gpu_error.reset_queue);
}

static int igt_global_reset(void *arg)
{
	struct drm_i915_private *i915 = arg;
	unsigned int reset_count;
	int err = 0;

	/* Check that we can issue a global GPU reset */

	global_reset_lock(i915);
	set_bit(I915_RESET_HANDOFF, &i915->gpu_error.flags);

	mutex_lock(&i915->drm.struct_mutex);
	reset_count = i915_reset_count(&i915->gpu_error);

	i915_reset(i915, I915_RESET_QUIET);

	if (i915_reset_count(&i915->gpu_error) == reset_count) {
		pr_err("No GPU reset recorded!\n");
		err = -EINVAL;
	}
	mutex_unlock(&i915->drm.struct_mutex);

	GEM_BUG_ON(test_bit(I915_RESET_HANDOFF, &i915->gpu_error.flags));
	global_reset_unlock(i915);

	if (i915_terminally_wedged(&i915->gpu_error))
		err = -EIO;

	return err;
}

static int igt_reset_engine(void *arg)
{
	struct drm_i915_private *i915 = arg;
	struct intel_engine_cs *engine;
	enum intel_engine_id id;
	unsigned int reset_count, reset_engine_count;
	int err = 0;

	/* Check that we can issue a global GPU and engine reset */

	if (!intel_has_reset_engine(i915))
		return 0;

	for_each_engine(engine, i915, id) {
		set_bit(I915_RESET_ENGINE + engine->id, &i915->gpu_error.flags);
		reset_count = i915_reset_count(&i915->gpu_error);
		reset_engine_count = i915_reset_engine_count(&i915->gpu_error,
							     engine);

		err = i915_reset_engine(engine, I915_RESET_QUIET);
		if (err) {
			pr_err("i915_reset_engine failed\n");
			break;
		}

		if (i915_reset_count(&i915->gpu_error) != reset_count) {
			pr_err("Full GPU reset recorded! (engine reset expected)\n");
			err = -EINVAL;
			break;
		}

		if (i915_reset_engine_count(&i915->gpu_error, engine) ==
		    reset_engine_count) {
			pr_err("No %s engine reset recorded!\n", engine->name);
			err = -EINVAL;
			break;
		}

		clear_bit(I915_RESET_ENGINE + engine->id,
			  &i915->gpu_error.flags);
	}

	if (i915_terminally_wedged(&i915->gpu_error))
		err = -EIO;

	return err;
}
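
/*
 * kthread body: keep the given engine busy by constantly submitting
 * requests from two alternating contexts while another engine is reset.
 */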
static int active_engine(void *data)
{
	struct intel_engine_cs *engine = data;
	struct drm_i915_gem_request *rq[2] = {};
	struct i915_gem_context *ctx[2];
	struct drm_file *file;
	unsigned long count = 0;
	int err = 0;

	file = mock_file(engine->i915);
	if (IS_ERR(file))
		return PTR_ERR(file);

	mutex_lock(&engine->i915->drm.struct_mutex);
	ctx[0] = live_context(engine->i915, file);
	mutex_unlock(&engine->i915->drm.struct_mutex);
	if (IS_ERR(ctx[0])) {
		err = PTR_ERR(ctx[0]);
		goto err_file;
	}

	mutex_lock(&engine->i915->drm.struct_mutex);
	ctx[1] = live_context(engine->i915, file);
	mutex_unlock(&engine->i915->drm.struct_mutex);
	if (IS_ERR(ctx[1])) {
		err = PTR_ERR(ctx[1]);
		i915_gem_context_put(ctx[0]);
		goto err_file;
	}

	while (!kthread_should_stop()) {
		unsigned int idx = count++ & 1;
		struct drm_i915_gem_request *old = rq[idx];
		struct drm_i915_gem_request *new;

		mutex_lock(&engine->i915->drm.struct_mutex);
		new = i915_gem_request_alloc(engine, ctx[idx]);
		if (IS_ERR(new)) {
			mutex_unlock(&engine->i915->drm.struct_mutex);
			err = PTR_ERR(new);
			break;
		}

		rq[idx] = i915_gem_request_get(new);
		i915_add_request(new);
		mutex_unlock(&engine->i915->drm.struct_mutex);

		if (old) {
			i915_wait_request(old, 0, MAX_SCHEDULE_TIMEOUT);
			i915_gem_request_put(old);
		}
	}

	for (count = 0; count < ARRAY_SIZE(rq); count++)
		i915_gem_request_put(rq[count]);

err_file:
	mock_file_free(engine->i915, file);
	return err;
}

static int igt_reset_active_engines(void *arg)
{
	struct drm_i915_private *i915 = arg;
	struct intel_engine_cs *engine, *active;
	enum intel_engine_id id, tmp;
	int err = 0;

	/* Check that issuing a reset on one engine does not interfere
	 * with any other engine.
	 */

	if (!intel_has_reset_engine(i915))
		return 0;

	for_each_engine(engine, i915, id) {
		struct task_struct *threads[I915_NUM_ENGINES];
		unsigned long resets[I915_NUM_ENGINES];
		unsigned long global = i915_reset_count(&i915->gpu_error);
		IGT_TIMEOUT(end_time);

		memset(threads, 0, sizeof(threads));
		for_each_engine(active, i915, tmp) {
			struct task_struct *tsk;

			if (active == engine)
				continue;

			resets[tmp] = i915_reset_engine_count(&i915->gpu_error,
							      active);

			tsk = kthread_run(active_engine, active,
					  "igt/%s", active->name);
			if (IS_ERR(tsk)) {
				err = PTR_ERR(tsk);
				goto unwind;
			}

			threads[tmp] = tsk;
			get_task_struct(tsk);
		}

		set_bit(I915_RESET_ENGINE + engine->id, &i915->gpu_error.flags);
		do {
			err = i915_reset_engine(engine, I915_RESET_QUIET);
			if (err) {
				pr_err("i915_reset_engine(%s) failed, err=%d\n",
				       engine->name, err);
				break;
			}
		} while (time_before(jiffies, end_time));
		clear_bit(I915_RESET_ENGINE + engine->id,
			  &i915->gpu_error.flags);

unwind:
		for_each_engine(active, i915, tmp) {
			int ret;

			if (!threads[tmp])
				continue;

			ret = kthread_stop(threads[tmp]);
			if (ret) {
				pr_err("kthread for active engine %s failed, err=%d\n",
				       active->name, ret);
				if (!err)
					err = ret;
			}
			put_task_struct(threads[tmp]);

			if (resets[tmp] != i915_reset_engine_count(&i915->gpu_error,
								   active)) {
				pr_err("Innocent engine %s was reset (count=%ld)\n",
				       active->name,
				       i915_reset_engine_count(&i915->gpu_error,
							       active) - resets[tmp]);
				err = -EIO;
			}
		}

		if (global != i915_reset_count(&i915->gpu_error)) {
			pr_err("Global reset (count=%ld)!\n",
			       i915_reset_count(&i915->gpu_error) - global);
			err = -EIO;
		}

		if (err)
			break;

		cond_resched();
	}

	if (i915_terminally_wedged(&i915->gpu_error))
		err = -EIO;

	return err;
}
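
/*
 * Pretend the hangcheck worker declared this request's engine hung: mark
 * the engine as stalled and hand the reset off to the waiter.
 */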
static u32 fake_hangcheck(struct drm_i915_gem_request *rq)
{
	u32 reset_count;

	rq->engine->hangcheck.stalled = true;
	rq->engine->hangcheck.seqno = intel_engine_get_seqno(rq->engine);

	reset_count = i915_reset_count(&rq->i915->gpu_error);

	set_bit(I915_RESET_HANDOFF, &rq->i915->gpu_error.flags);
	wake_up_all(&rq->i915->gpu_error.wait_queue);

	return reset_count;
}
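
/*
 * Wait for the spinning batch to start executing, i.e. for it to report
 * its seqno to the status page; first with a short busy-wait, then by
 * sleeping.
 */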
static bool wait_for_hang(struct hang *h, struct drm_i915_gem_request *rq)
{
	return !(wait_for_us(i915_seqno_passed(hws_seqno(h, rq),
					       rq->fence.seqno),
			     10) &&
		 wait_for(i915_seqno_passed(hws_seqno(h, rq),
					    rq->fence.seqno),
			  1000));
}

static int igt_wait_reset(void *arg)
{
	struct drm_i915_private *i915 = arg;
	struct drm_i915_gem_request *rq;
	unsigned int reset_count;
	struct hang h;
	long timeout;
	int err;

	if (!intel_engine_can_store_dword(i915->engine[RCS]))
		return 0;

	/* Check that we detect a stuck waiter and issue a reset */

	global_reset_lock(i915);

	mutex_lock(&i915->drm.struct_mutex);
	err = hang_init(&h, i915);
	if (err)
		goto unlock;

	rq = hang_create_request(&h, i915->engine[RCS], i915->kernel_context);
	if (IS_ERR(rq)) {
		err = PTR_ERR(rq);
		goto fini;
	}

	i915_gem_request_get(rq);
	__i915_add_request(rq, true);

	if (!wait_for_hang(&h, rq)) {
		struct drm_printer p = drm_info_printer(i915->drm.dev);

		pr_err("Failed to start request %x, at %x\n",
		       rq->fence.seqno, hws_seqno(&h, rq));
		intel_engine_dump(rq->engine, &p);

		i915_gem_set_wedged(i915);

		err = -EIO;
		goto out_rq;
	}

	reset_count = fake_hangcheck(rq);

	timeout = i915_wait_request(rq, I915_WAIT_LOCKED, 10);
	if (timeout < 0) {
		pr_err("i915_wait_request failed on a stuck request: err=%ld\n",
		       timeout);
		err = timeout;
		goto out_rq;
	}

	GEM_BUG_ON(test_bit(I915_RESET_HANDOFF, &i915->gpu_error.flags));
	if (i915_reset_count(&i915->gpu_error) == reset_count) {
		pr_err("No GPU reset recorded!\n");
		err = -EINVAL;
		goto out_rq;
	}

out_rq:
	i915_gem_request_put(rq);
fini:
	hang_fini(&h);
unlock:
	mutex_unlock(&i915->drm.struct_mutex);
	global_reset_unlock(i915);

	if (i915_terminally_wedged(&i915->gpu_error))
		return -EIO;

	return err;
}

static int igt_reset_queue(void *arg)
{
	struct drm_i915_private *i915 = arg;
	struct intel_engine_cs *engine;
	enum intel_engine_id id;
	struct hang h;
	int err;

	/* Check that we replay pending requests following a hang */

	global_reset_lock(i915);

	mutex_lock(&i915->drm.struct_mutex);
	err = hang_init(&h, i915);
	if (err)
		goto unlock;

	for_each_engine(engine, i915, id) {
		struct drm_i915_gem_request *prev;
		IGT_TIMEOUT(end_time);
		unsigned int count;

		if (!intel_engine_can_store_dword(engine))
			continue;

		prev = hang_create_request(&h, engine, i915->kernel_context);
		if (IS_ERR(prev)) {
			err = PTR_ERR(prev);
			goto fini;
		}

		i915_gem_request_get(prev);
		__i915_add_request(prev, true);

		count = 0;
		do {
			struct drm_i915_gem_request *rq;
			unsigned int reset_count;

			rq = hang_create_request(&h,
						 engine,
						 i915->kernel_context);
			if (IS_ERR(rq)) {
				err = PTR_ERR(rq);
				goto fini;
			}

			i915_gem_request_get(rq);
			__i915_add_request(rq, true);

			if (!wait_for_hang(&h, prev)) {
				struct drm_printer p = drm_info_printer(i915->drm.dev);

				pr_err("Failed to start request %x, at %x\n",
				       prev->fence.seqno, hws_seqno(&h, prev));
				intel_engine_dump(rq->engine, &p);

				i915_gem_request_put(rq);
				i915_gem_request_put(prev);

				i915_gem_set_wedged(i915);

				err = -EIO;
				goto fini;
			}

			reset_count = fake_hangcheck(prev);

			i915_reset(i915, I915_RESET_QUIET);

			GEM_BUG_ON(test_bit(I915_RESET_HANDOFF,
					    &i915->gpu_error.flags));

			if (prev->fence.error != -EIO) {
				pr_err("GPU reset not recorded on hanging request [fence.error=%d]!\n",
				       prev->fence.error);
				i915_gem_request_put(rq);
				i915_gem_request_put(prev);
				err = -EINVAL;
				goto fini;
			}

			if (rq->fence.error) {
				pr_err("Fence error status not zero [%d] after unrelated reset\n",
				       rq->fence.error);
				i915_gem_request_put(rq);
				i915_gem_request_put(prev);
				err = -EINVAL;
				goto fini;
			}

			if (i915_reset_count(&i915->gpu_error) == reset_count) {
				pr_err("No GPU reset recorded!\n");
				i915_gem_request_put(rq);
				i915_gem_request_put(prev);
				err = -EINVAL;
				goto fini;
			}

			i915_gem_request_put(prev);
			prev = rq;
			count++;
		} while (time_before(jiffies, end_time));
		pr_info("%s: Completed %d resets\n", engine->name, count);

		*h.batch = MI_BATCH_BUFFER_END;
		i915_gem_chipset_flush(i915);

		i915_gem_request_put(prev);
	}

fini:
	hang_fini(&h);
unlock:
	mutex_unlock(&i915->drm.struct_mutex);
	global_reset_unlock(i915);

	if (i915_terminally_wedged(&i915->gpu_error))
		return -EIO;

	return err;
}

static int igt_handle_error(void *arg)
{
	struct drm_i915_private *i915 = arg;
	struct intel_engine_cs *engine = i915->engine[RCS];
	struct hang h;
	struct drm_i915_gem_request *rq;
	struct i915_gpu_state *error;
	int err;

	/* Check that we can issue a global GPU and engine reset */

	if (!intel_has_reset_engine(i915))
		return 0;

	if (!intel_engine_can_store_dword(i915->engine[RCS]))
		return 0;

	mutex_lock(&i915->drm.struct_mutex);

	err = hang_init(&h, i915);
	if (err)
		goto err_unlock;

	rq = hang_create_request(&h, engine, i915->kernel_context);
	if (IS_ERR(rq)) {
		err = PTR_ERR(rq);
		goto err_fini;
	}

	i915_gem_request_get(rq);
	__i915_add_request(rq, true);

	if (!wait_for_hang(&h, rq)) {
		struct drm_printer p = drm_info_printer(i915->drm.dev);

		pr_err("Failed to start request %x, at %x\n",
		       rq->fence.seqno, hws_seqno(&h, rq));
		intel_engine_dump(rq->engine, &p);

		i915_gem_set_wedged(i915);

		err = -EIO;
		goto err_request;
	}

	mutex_unlock(&i915->drm.struct_mutex);

	/* Temporarily disable error capture */
	error = xchg(&i915->gpu_error.first_error, (void *)-1);

	engine->hangcheck.stalled = true;
	engine->hangcheck.seqno = intel_engine_get_seqno(engine);

	i915_handle_error(i915, intel_engine_flag(engine), "%s", __func__);

	xchg(&i915->gpu_error.first_error, error);

	mutex_lock(&i915->drm.struct_mutex);

	if (rq->fence.error != -EIO) {
		pr_err("Guilty request not identified!\n");
		err = -EINVAL;
		goto err_request;
	}

err_request:
	i915_gem_request_put(rq);
err_fini:
	hang_fini(&h);
err_unlock:
	mutex_unlock(&i915->drm.struct_mutex);
	return err;
}
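
/*
 * Entry point for the live hangcheck selftests. Skipped entirely when the
 * device cannot be reset; run under a runtime-pm reference so the device
 * stays awake throughout.
 */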
int intel_hangcheck_live_selftests(struct drm_i915_private *i915)
{
	static const struct i915_subtest tests[] = {
		SUBTEST(igt_global_reset), /* attempt to recover GPU first */
		SUBTEST(igt_hang_sanitycheck),
		SUBTEST(igt_reset_engine),
		SUBTEST(igt_reset_active_engines),
		SUBTEST(igt_wait_reset),
		SUBTEST(igt_reset_queue),
		SUBTEST(igt_handle_error),
	};
	int err;

	if (!intel_has_gpu_reset(i915))
		return 0;

	intel_runtime_pm_get(i915);

	err = i915_subtests(tests, i915);

	intel_runtime_pm_put(i915);

	return err;
}