drivers/gpu/drm/i915/i915_gpu_error.c
/*
 * Copyright (c) 2008 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 *
 * Authors:
 *    Eric Anholt <eric@anholt.net>
 *    Keith Packard <keithp@keithp.com>
 *    Mika Kuoppala <mika.kuoppala@intel.com>
 *
 */

#include <linux/ascii85.h>
#include <linux/nmi.h>
#include <linux/pagevec.h>
#include <linux/scatterlist.h>
#include <linux/utsname.h>
#include <linux/zlib.h>

#include <drm/drm_print.h>

#include "display/intel_atomic.h"
#include "display/intel_overlay.h"

#include "gem/i915_gem_context.h"

#include "i915_drv.h"
#include "i915_gpu_error.h"
#include "i915_memcpy.h"
#include "i915_scatterlist.h"
#include "intel_csr.h"

#define ALLOW_FAIL (GFP_KERNEL | __GFP_RETRY_MAYFAIL | __GFP_NOWARN)
#define ATOMIC_MAYFAIL (GFP_ATOMIC | __GFP_NOWARN)

static void __sg_set_buf(struct scatterlist *sg,
			 void *addr, unsigned int len, loff_t it)
{
	sg->page_link = (unsigned long)virt_to_page(addr);
	sg->offset = offset_in_page(addr);
	sg->length = len;
	sg->dma_address = it;
}

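/*
 * Grow the error-state buffer: once the current kmalloc'ed chunk fills up,
 * it is recorded as a scatterlist entry and a fresh chunk is allocated,
 * chaining in additional scatterlist pages as required.
 */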
static bool __i915_error_grow(struct drm_i915_error_state_buf *e, size_t len)
{
	if (!len)
		return false;

	if (e->bytes + len + 1 <= e->size)
		return true;

	if (e->bytes) {
		__sg_set_buf(e->cur++, e->buf, e->bytes, e->iter);
		e->iter += e->bytes;
		e->buf = NULL;
		e->bytes = 0;
	}

	if (e->cur == e->end) {
		struct scatterlist *sgl;

		sgl = (typeof(sgl))__get_free_page(ALLOW_FAIL);
		if (!sgl) {
			e->err = -ENOMEM;
			return false;
		}

		if (e->cur) {
			e->cur->offset = 0;
			e->cur->length = 0;
			e->cur->page_link =
				(unsigned long)sgl | SG_CHAIN;
		} else {
			e->sgl = sgl;
		}

		e->cur = sgl;
		e->end = sgl + SG_MAX_SINGLE_ALLOC - 1;
	}

	e->size = ALIGN(len + 1, SZ_64K);
	e->buf = kmalloc(e->size, ALLOW_FAIL);
	if (!e->buf) {
		e->size = PAGE_ALIGN(len + 1);
		e->buf = kmalloc(e->size, GFP_KERNEL);
	}
	if (!e->buf) {
		e->err = -ENOMEM;
		return false;
	}

	return true;
}

__printf(2, 0)
static void i915_error_vprintf(struct drm_i915_error_state_buf *e,
			       const char *fmt, va_list args)
{
	va_list ap;
	int len;

	if (e->err)
		return;

	va_copy(ap, args);
	len = vsnprintf(NULL, 0, fmt, ap);
	va_end(ap);
	if (len <= 0) {
		e->err = len;
		return;
	}

	if (!__i915_error_grow(e, len))
		return;

	GEM_BUG_ON(e->bytes >= e->size);
	len = vscnprintf(e->buf + e->bytes, e->size - e->bytes, fmt, args);
	if (len < 0) {
		e->err = len;
		return;
	}
	e->bytes += len;
}

static void i915_error_puts(struct drm_i915_error_state_buf *e, const char *str)
{
	unsigned len;

	if (e->err || !str)
		return;

	len = strlen(str);
	if (!__i915_error_grow(e, len))
		return;

	GEM_BUG_ON(e->bytes + len > e->size);
	memcpy(e->buf + e->bytes, str, len);
	e->bytes += len;
}

#define err_printf(e, ...) i915_error_printf(e, __VA_ARGS__)
#define err_puts(e, s) i915_error_puts(e, s)

static void __i915_printfn_error(struct drm_printer *p, struct va_format *vaf)
{
	i915_error_vprintf(p->arg, vaf->fmt, *vaf->va);
}

static inline struct drm_printer
i915_error_printer(struct drm_i915_error_state_buf *e)
{
	struct drm_printer p = {
		.printfn = __i915_printfn_error,
		.arg = e,
	};
	return p;
}

/* single threaded page allocator with a reserved stash for emergencies */
static void pool_fini(struct pagevec *pv)
{
	pagevec_release(pv);
}

static int pool_refill(struct pagevec *pv, gfp_t gfp)
{
	while (pagevec_space(pv)) {
		struct page *p;

		p = alloc_page(gfp);
		if (!p)
			return -ENOMEM;

		pagevec_add(pv, p);
	}

	return 0;
}

static int pool_init(struct pagevec *pv, gfp_t gfp)
{
	int err;

	pagevec_init(pv);

	err = pool_refill(pv, gfp);
	if (err)
		pool_fini(pv);

	return err;
}

static void *pool_alloc(struct pagevec *pv, gfp_t gfp)
{
	struct page *p;

	p = alloc_page(gfp);
	if (!p && pagevec_count(pv))
		p = pv->pages[--pv->nr];

	return p ? page_address(p) : NULL;
}

static void pool_free(struct pagevec *pv, void *addr)
{
	struct page *p = virt_to_page(addr);

	if (pagevec_space(pv))
		pagevec_add(pv, p);
	else
		__free_page(p);
}

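/*
 * Two interchangeable backends for storing captured pages: with
 * CONFIG_DRM_I915_COMPRESS_ERROR the pages are deflated through zlib,
 * otherwise they are copied verbatim. The compression marker emitted
 * into the dump (':' vs '~') tells the decoder which format follows.
 */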
#ifdef CONFIG_DRM_I915_COMPRESS_ERROR

struct compress {
	struct pagevec pool;
	struct z_stream_s zstream;
	void *tmp;
};

static bool compress_init(struct compress *c)
{
	struct z_stream_s *zstream = &c->zstream;

	if (pool_init(&c->pool, ALLOW_FAIL))
		return false;

	zstream->workspace =
		kmalloc(zlib_deflate_workspacesize(MAX_WBITS, MAX_MEM_LEVEL),
			ALLOW_FAIL);
	if (!zstream->workspace) {
		pool_fini(&c->pool);
		return false;
	}

	c->tmp = NULL;
	if (i915_has_memcpy_from_wc())
		c->tmp = pool_alloc(&c->pool, ALLOW_FAIL);

	return true;
}

static bool compress_start(struct compress *c)
{
	struct z_stream_s *zstream = &c->zstream;
	void *workspace = zstream->workspace;

	memset(zstream, 0, sizeof(*zstream));
	zstream->workspace = workspace;

	return zlib_deflateInit(zstream, Z_DEFAULT_COMPRESSION) == Z_OK;
}

static void *compress_next_page(struct compress *c,
				struct drm_i915_error_object *dst)
{
	void *page;

	if (dst->page_count >= dst->num_pages)
		return ERR_PTR(-ENOSPC);

	page = pool_alloc(&c->pool, ALLOW_FAIL);
	if (!page)
		return ERR_PTR(-ENOMEM);

	return dst->pages[dst->page_count++] = page;
}

static int compress_page(struct compress *c,
			 void *src,
			 struct drm_i915_error_object *dst)
{
	struct z_stream_s *zstream = &c->zstream;

	zstream->next_in = src;
	if (c->tmp && i915_memcpy_from_wc(c->tmp, src, PAGE_SIZE))
		zstream->next_in = c->tmp;
	zstream->avail_in = PAGE_SIZE;

	do {
		if (zstream->avail_out == 0) {
			zstream->next_out = compress_next_page(c, dst);
			if (IS_ERR(zstream->next_out))
				return PTR_ERR(zstream->next_out);

			zstream->avail_out = PAGE_SIZE;
		}

		if (zlib_deflate(zstream, Z_NO_FLUSH) != Z_OK)
			return -EIO;
	} while (zstream->avail_in);

	/* Fallback to uncompressed if we increase size? */
	if (0 && zstream->total_out > zstream->total_in)
		return -E2BIG;

	return 0;
}

static int compress_flush(struct compress *c,
			  struct drm_i915_error_object *dst)
{
	struct z_stream_s *zstream = &c->zstream;

	do {
		switch (zlib_deflate(zstream, Z_FINISH)) {
		case Z_OK: /* more space requested */
			zstream->next_out = compress_next_page(c, dst);
			if (IS_ERR(zstream->next_out))
				return PTR_ERR(zstream->next_out);

			zstream->avail_out = PAGE_SIZE;
			break;

		case Z_STREAM_END:
			goto end;

		default: /* any error */
			return -EIO;
		}
	} while (1);

end:
	memset(zstream->next_out, 0, zstream->avail_out);
	dst->unused = zstream->avail_out;
	return 0;
}

static void compress_finish(struct compress *c)
{
	zlib_deflateEnd(&c->zstream);
}

static void compress_fini(struct compress *c)
{
	kfree(c->zstream.workspace);
	if (c->tmp)
		pool_free(&c->pool, c->tmp);
	pool_fini(&c->pool);
}

static void err_compression_marker(struct drm_i915_error_state_buf *m)
{
	err_puts(m, ":");
}

#else

struct compress {
	struct pagevec pool;
};

static bool compress_init(struct compress *c)
{
	return pool_init(&c->pool, ALLOW_FAIL) == 0;
}

static bool compress_start(struct compress *c)
{
	return true;
}

static int compress_page(struct compress *c,
			 void *src,
			 struct drm_i915_error_object *dst)
{
	void *ptr;

	ptr = pool_alloc(&c->pool, ALLOW_FAIL);
	if (!ptr)
		return -ENOMEM;

	if (!i915_memcpy_from_wc(ptr, src, PAGE_SIZE))
		memcpy(ptr, src, PAGE_SIZE);
	dst->pages[dst->page_count++] = ptr;

	return 0;
}

static int compress_flush(struct compress *c,
			  struct drm_i915_error_object *dst)
{
	return 0;
}

static void compress_finish(struct compress *c)
{
}

static void compress_fini(struct compress *c)
{
	pool_fini(&c->pool);
}

static void err_compression_marker(struct drm_i915_error_state_buf *m)
{
	err_puts(m, "~");
}

#endif

static void error_print_instdone(struct drm_i915_error_state_buf *m,
				 const struct drm_i915_error_engine *ee)
{
	const struct sseu_dev_info *sseu = &RUNTIME_INFO(m->i915)->sseu;
	int slice;
	int subslice;

	err_printf(m, "  INSTDONE: 0x%08x\n",
		   ee->instdone.instdone);

	if (ee->engine->class != RENDER_CLASS || INTEL_GEN(m->i915) <= 3)
		return;

	err_printf(m, "  SC_INSTDONE: 0x%08x\n",
		   ee->instdone.slice_common);

	if (INTEL_GEN(m->i915) <= 6)
		return;

	for_each_instdone_slice_subslice(m->i915, sseu, slice, subslice)
		err_printf(m, "  SAMPLER_INSTDONE[%d][%d]: 0x%08x\n",
			   slice, subslice,
			   ee->instdone.sampler[slice][subslice]);

	for_each_instdone_slice_subslice(m->i915, sseu, slice, subslice)
		err_printf(m, "  ROW_INSTDONE[%d][%d]: 0x%08x\n",
			   slice, subslice,
			   ee->instdone.row[slice][subslice]);
}

static void error_print_request(struct drm_i915_error_state_buf *m,
				const char *prefix,
				const struct drm_i915_error_request *erq,
				const unsigned long epoch)
{
	if (!erq->seqno)
		return;

	err_printf(m, "%s pid %d, seqno %8x:%08x%s%s, prio %d, emitted %dms, start %08x, head %08x, tail %08x\n",
		   prefix, erq->pid, erq->context, erq->seqno,
		   test_bit(DMA_FENCE_FLAG_SIGNALED_BIT,
			    &erq->flags) ? "!" : "",
		   test_bit(DMA_FENCE_FLAG_ENABLE_SIGNAL_BIT,
			    &erq->flags) ? "+" : "",
		   erq->sched_attr.priority,
		   jiffies_to_msecs(erq->jiffies - epoch),
		   erq->start, erq->head, erq->tail);
}

static void error_print_context(struct drm_i915_error_state_buf *m,
				const char *header,
				const struct drm_i915_error_context *ctx)
{
	err_printf(m, "%s%s[%d] hw_id %d, prio %d, guilty %d active %d\n",
		   header, ctx->comm, ctx->pid, ctx->hw_id,
		   ctx->sched_attr.priority, ctx->guilty, ctx->active);
}

static void error_print_engine(struct drm_i915_error_state_buf *m,
			       const struct drm_i915_error_engine *ee,
			       const unsigned long epoch)
{
	int n;

	err_printf(m, "%s command stream:\n", ee->engine->name);
	err_printf(m, "  IDLE?: %s\n", yesno(ee->idle));
	err_printf(m, "  START: 0x%08x\n", ee->start);
	err_printf(m, "  HEAD:  0x%08x [0x%08x]\n", ee->head, ee->rq_head);
	err_printf(m, "  TAIL:  0x%08x [0x%08x, 0x%08x]\n",
		   ee->tail, ee->rq_post, ee->rq_tail);
	err_printf(m, "  CTL:   0x%08x\n", ee->ctl);
	err_printf(m, "  MODE:  0x%08x\n", ee->mode);
	err_printf(m, "  HWS:   0x%08x\n", ee->hws);
	err_printf(m, "  ACTHD: 0x%08x %08x\n",
		   (u32)(ee->acthd>>32), (u32)ee->acthd);
	err_printf(m, "  IPEIR: 0x%08x\n", ee->ipeir);
	err_printf(m, "  IPEHR: 0x%08x\n", ee->ipehr);

	error_print_instdone(m, ee);

	if (ee->batchbuffer) {
		u64 start = ee->batchbuffer->gtt_offset;
		u64 end = start + ee->batchbuffer->gtt_size;

		err_printf(m, "  batch: [0x%08x_%08x, 0x%08x_%08x]\n",
			   upper_32_bits(start), lower_32_bits(start),
			   upper_32_bits(end), lower_32_bits(end));
	}
	if (INTEL_GEN(m->i915) >= 4) {
		err_printf(m, "  BBADDR: 0x%08x_%08x\n",
			   (u32)(ee->bbaddr>>32), (u32)ee->bbaddr);
		err_printf(m, "  BB_STATE: 0x%08x\n", ee->bbstate);
		err_printf(m, "  INSTPS: 0x%08x\n", ee->instps);
	}
	err_printf(m, "  INSTPM: 0x%08x\n", ee->instpm);
	err_printf(m, "  FADDR: 0x%08x %08x\n", upper_32_bits(ee->faddr),
		   lower_32_bits(ee->faddr));
	if (INTEL_GEN(m->i915) >= 6) {
		err_printf(m, "  RC PSMI: 0x%08x\n", ee->rc_psmi);
		err_printf(m, "  FAULT_REG: 0x%08x\n", ee->fault_reg);
	}
	if (HAS_PPGTT(m->i915)) {
		err_printf(m, "  GFX_MODE: 0x%08x\n", ee->vm_info.gfx_mode);

		if (INTEL_GEN(m->i915) >= 8) {
			int i;
			for (i = 0; i < 4; i++)
				err_printf(m, "  PDP%d: 0x%016llx\n",
					   i, ee->vm_info.pdp[i]);
		} else {
			err_printf(m, "  PP_DIR_BASE: 0x%08x\n",
				   ee->vm_info.pp_dir_base);
		}
	}
	err_printf(m, "  ring->head: 0x%08x\n", ee->cpu_ring_head);
	err_printf(m, "  ring->tail: 0x%08x\n", ee->cpu_ring_tail);
	err_printf(m, "  hangcheck timestamp: %dms (%lu%s)\n",
		   jiffies_to_msecs(ee->hangcheck_timestamp - epoch),
		   ee->hangcheck_timestamp,
		   ee->hangcheck_timestamp == epoch ? "; epoch" : "");
	err_printf(m, "  engine reset count: %u\n", ee->reset_count);

	for (n = 0; n < ee->num_ports; n++) {
		err_printf(m, "  ELSP[%d]:", n);
		error_print_request(m, " ", &ee->execlist[n], epoch);
	}

	error_print_context(m, "  Active context: ", &ee->context);
}

void i915_error_printf(struct drm_i915_error_state_buf *e, const char *f, ...)
{
	va_list args;

	va_start(args, f);
	i915_error_vprintf(e, f, args);
	va_end(args);
}

static void print_error_obj(struct drm_i915_error_state_buf *m,
			    const struct intel_engine_cs *engine,
			    const char *name,
			    const struct drm_i915_error_object *obj)
{
	char out[ASCII85_BUFSZ];
	int page;

	if (!obj)
		return;

	if (name) {
		err_printf(m, "%s --- %s = 0x%08x %08x\n",
			   engine ? engine->name : "global", name,
			   upper_32_bits(obj->gtt_offset),
			   lower_32_bits(obj->gtt_offset));
	}

	if (obj->gtt_page_sizes > I915_GTT_PAGE_SIZE_4K)
		err_printf(m, "gtt_page_sizes = 0x%08x\n", obj->gtt_page_sizes);

	err_compression_marker(m);
	for (page = 0; page < obj->page_count; page++) {
		int i, len;

		len = PAGE_SIZE;
		if (page == obj->page_count - 1)
			len -= obj->unused;
		len = ascii85_encode_len(len);

		for (i = 0; i < len; i++)
			err_puts(m, ascii85_encode(obj->pages[page][i], out));
	}
	err_puts(m, "\n");
}

static void err_print_capabilities(struct drm_i915_error_state_buf *m,
				   const struct intel_device_info *info,
				   const struct intel_runtime_info *runtime,
				   const struct intel_driver_caps *caps)
{
	struct drm_printer p = i915_error_printer(m);

	intel_device_info_dump_flags(info, &p);
	intel_driver_caps_print(caps, &p);
	intel_device_info_dump_topology(&runtime->sseu, &p);
}

static void err_print_params(struct drm_i915_error_state_buf *m,
			     const struct i915_params *params)
{
	struct drm_printer p = i915_error_printer(m);

	i915_params_dump(params, &p);
}

static void err_print_pciid(struct drm_i915_error_state_buf *m,
			    struct drm_i915_private *i915)
{
	struct pci_dev *pdev = i915->drm.pdev;

	err_printf(m, "PCI ID: 0x%04x\n", pdev->device);
	err_printf(m, "PCI Revision: 0x%02x\n", pdev->revision);
	err_printf(m, "PCI Subsystem: %04x:%04x\n",
		   pdev->subsystem_vendor,
		   pdev->subsystem_device);
}

static void err_print_uc(struct drm_i915_error_state_buf *m,
			 const struct i915_error_uc *error_uc)
{
	struct drm_printer p = i915_error_printer(m);
	const struct i915_gpu_state *error =
		container_of(error_uc, typeof(*error), uc);

	if (!error->device_info.has_gt_uc)
		return;

	intel_uc_fw_dump(&error_uc->guc_fw, &p);
	intel_uc_fw_dump(&error_uc->huc_fw, &p);
	print_error_obj(m, NULL, "GuC log buffer", error_uc->guc_log);
}

static void err_free_sgl(struct scatterlist *sgl)
{
	while (sgl) {
		struct scatterlist *sg;

		for (sg = sgl; !sg_is_chain(sg); sg++) {
			kfree(sg_virt(sg));
			if (sg_is_last(sg))
				break;
		}

		sg = sg_is_last(sg) ? NULL : sg_chain_ptr(sg);
		free_page((unsigned long)sgl);
		sgl = sg;
	}
}

static void __err_print_to_sgl(struct drm_i915_error_state_buf *m,
			       struct i915_gpu_state *error)
{
	const struct drm_i915_error_engine *ee;
	struct timespec64 ts;
	int i, j;

	if (*error->error_msg)
		err_printf(m, "%s\n", error->error_msg);
	err_printf(m, "Kernel: %s %s\n",
		   init_utsname()->release,
		   init_utsname()->machine);
	err_printf(m, "Driver: %s\n", DRIVER_DATE);
	ts = ktime_to_timespec64(error->time);
	err_printf(m, "Time: %lld s %ld us\n",
		   (s64)ts.tv_sec, ts.tv_nsec / NSEC_PER_USEC);
	ts = ktime_to_timespec64(error->boottime);
	err_printf(m, "Boottime: %lld s %ld us\n",
		   (s64)ts.tv_sec, ts.tv_nsec / NSEC_PER_USEC);
	ts = ktime_to_timespec64(error->uptime);
	err_printf(m, "Uptime: %lld s %ld us\n",
		   (s64)ts.tv_sec, ts.tv_nsec / NSEC_PER_USEC);
	err_printf(m, "Epoch: %lu jiffies (%u HZ)\n", error->epoch, HZ);
	err_printf(m, "Capture: %lu jiffies; %d ms ago, %d ms after epoch\n",
		   error->capture,
		   jiffies_to_msecs(jiffies - error->capture),
		   jiffies_to_msecs(error->capture - error->epoch));

	for (ee = error->engine; ee; ee = ee->next)
		err_printf(m, "Active process (on ring %s): %s [%d]\n",
			   ee->engine->name,
			   ee->context.comm,
			   ee->context.pid);

	err_printf(m, "Reset count: %u\n", error->reset_count);
	err_printf(m, "Suspend count: %u\n", error->suspend_count);
	err_printf(m, "Platform: %s\n", intel_platform_name(error->device_info.platform));
	err_printf(m, "Subplatform: 0x%x\n",
		   intel_subplatform(&error->runtime_info,
				     error->device_info.platform));
	err_print_pciid(m, m->i915);

	err_printf(m, "IOMMU enabled?: %d\n", error->iommu);

	if (HAS_CSR(m->i915)) {
		struct intel_csr *csr = &m->i915->csr;

		err_printf(m, "DMC loaded: %s\n",
			   yesno(csr->dmc_payload != NULL));
		err_printf(m, "DMC fw version: %d.%d\n",
			   CSR_VERSION_MAJOR(csr->version),
			   CSR_VERSION_MINOR(csr->version));
	}

	err_printf(m, "GT awake: %s\n", yesno(error->awake));
	err_printf(m, "RPM wakelock: %s\n", yesno(error->wakelock));
	err_printf(m, "PM suspended: %s\n", yesno(error->suspended));
	err_printf(m, "EIR: 0x%08x\n", error->eir);
	err_printf(m, "IER: 0x%08x\n", error->ier);
	for (i = 0; i < error->ngtier; i++)
		err_printf(m, "GTIER[%d]: 0x%08x\n", i, error->gtier[i]);
	err_printf(m, "PGTBL_ER: 0x%08x\n", error->pgtbl_er);
	err_printf(m, "FORCEWAKE: 0x%08x\n", error->forcewake);
	err_printf(m, "DERRMR: 0x%08x\n", error->derrmr);
	err_printf(m, "CCID: 0x%08x\n", error->ccid);

	for (i = 0; i < error->nfence; i++)
		err_printf(m, "  fence[%d] = %08llx\n", i, error->fence[i]);

	if (IS_GEN_RANGE(m->i915, 6, 11)) {
		err_printf(m, "ERROR: 0x%08x\n", error->error);
		err_printf(m, "DONE_REG: 0x%08x\n", error->done_reg);
	}

	if (INTEL_GEN(m->i915) >= 8)
		err_printf(m, "FAULT_TLB_DATA: 0x%08x 0x%08x\n",
			   error->fault_data1, error->fault_data0);

	if (IS_GEN(m->i915, 7))
		err_printf(m, "ERR_INT: 0x%08x\n", error->err_int);

	if (IS_GEN_RANGE(m->i915, 8, 11))
		err_printf(m, "GTT_CACHE_EN: 0x%08x\n", error->gtt_cache);

	for (ee = error->engine; ee; ee = ee->next)
		error_print_engine(m, ee, error->epoch);

	for (ee = error->engine; ee; ee = ee->next) {
		const struct drm_i915_error_object *obj;

		obj = ee->batchbuffer;
		if (obj) {
			err_puts(m, ee->engine->name);
			if (ee->context.pid)
				err_printf(m, " (submitted by %s [%d])",
					   ee->context.comm,
					   ee->context.pid);
			err_printf(m, " --- gtt_offset = 0x%08x %08x\n",
				   upper_32_bits(obj->gtt_offset),
				   lower_32_bits(obj->gtt_offset));
			print_error_obj(m, ee->engine, NULL, obj);
		}

		for (j = 0; j < ee->user_bo_count; j++)
			print_error_obj(m, ee->engine, "user", ee->user_bo[j]);

		if (ee->num_requests) {
			err_printf(m, "%s --- %d requests\n",
				   ee->engine->name,
				   ee->num_requests);
			for (j = 0; j < ee->num_requests; j++)
				error_print_request(m, " ",
						    &ee->requests[j],
						    error->epoch);
		}

		print_error_obj(m, ee->engine, "ringbuffer", ee->ringbuffer);
		print_error_obj(m, ee->engine, "HW Status", ee->hws_page);
		print_error_obj(m, ee->engine, "HW context", ee->ctx);
		print_error_obj(m, ee->engine, "WA context", ee->wa_ctx);
		print_error_obj(m, ee->engine,
				"WA batchbuffer", ee->wa_batchbuffer);
		print_error_obj(m, ee->engine,
				"NULL context", ee->default_state);
	}

	if (error->overlay)
		intel_overlay_print_error_state(m, error->overlay);

	if (error->display)
		intel_display_print_error_state(m, error->display);

	err_print_capabilities(m, &error->device_info, &error->runtime_info,
			       &error->driver_caps);
	err_print_params(m, &error->params);
	err_print_uc(m, &error->uc);
}

static int err_print_to_sgl(struct i915_gpu_state *error)
{
	struct drm_i915_error_state_buf m;

	if (IS_ERR(error))
		return PTR_ERR(error);

	if (READ_ONCE(error->sgl))
		return 0;

	memset(&m, 0, sizeof(m));
	m.i915 = error->i915;

	__err_print_to_sgl(&m, error);

	if (m.buf) {
		__sg_set_buf(m.cur++, m.buf, m.bytes, m.iter);
		m.bytes = 0;
		m.buf = NULL;
	}
	if (m.cur) {
		GEM_BUG_ON(m.end < m.cur);
		sg_mark_end(m.cur - 1);
	}
	GEM_BUG_ON(m.sgl && !m.cur);

	if (m.err) {
		err_free_sgl(m.sgl);
		return m.err;
	}

	if (cmpxchg(&error->sgl, NULL, m.sgl))
		err_free_sgl(m.sgl);

	return 0;
}

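/*
 * Copy a window of the rendered error state into a caller-supplied buffer,
 * lazily rendering the state into the scatterlist on first use and caching
 * the last scatterlist position (error->fit) to speed up sequential reads.
 */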
ssize_t i915_gpu_state_copy_to_buffer(struct i915_gpu_state *error,
				      char *buf, loff_t off, size_t rem)
{
	struct scatterlist *sg;
	size_t count;
	loff_t pos;
	int err;

	if (!error || !rem)
		return 0;

	err = err_print_to_sgl(error);
	if (err)
		return err;

	sg = READ_ONCE(error->fit);
	if (!sg || off < sg->dma_address)
		sg = error->sgl;
	if (!sg)
		return 0;

	pos = sg->dma_address;
	count = 0;
	do {
		size_t len, start;

		if (sg_is_chain(sg)) {
			sg = sg_chain_ptr(sg);
			GEM_BUG_ON(sg_is_chain(sg));
		}

		len = sg->length;
		if (pos + len <= off) {
			pos += len;
			continue;
		}

		start = sg->offset;
		if (pos < off) {
			GEM_BUG_ON(off - pos > len);
			len -= off - pos;
			start += off - pos;
			pos = off;
		}

		len = min(len, rem);
		GEM_BUG_ON(!len || len > sg->length);

		memcpy(buf, page_address(sg_page(sg)) + start, len);

		count += len;
		pos += len;

		buf += len;
		rem -= len;
		if (!rem) {
			WRITE_ONCE(error->fit, sg);
			break;
		}
	} while (!sg_is_last(sg++));

	return count;
}

static void i915_error_object_free(struct drm_i915_error_object *obj)
{
	int page;

	if (obj == NULL)
		return;

	for (page = 0; page < obj->page_count; page++)
		free_page((unsigned long)obj->pages[page]);

	kfree(obj);
}

static void cleanup_params(struct i915_gpu_state *error)
{
	i915_params_free(&error->params);
}

static void cleanup_uc_state(struct i915_gpu_state *error)
{
	struct i915_error_uc *error_uc = &error->uc;

	kfree(error_uc->guc_fw.path);
	kfree(error_uc->huc_fw.path);
	i915_error_object_free(error_uc->guc_log);
}

void __i915_gpu_state_free(struct kref *error_ref)
{
	struct i915_gpu_state *error =
		container_of(error_ref, typeof(*error), ref);
	long i;

	while (error->engine) {
		struct drm_i915_error_engine *ee = error->engine;

		error->engine = ee->next;

		for (i = 0; i < ee->user_bo_count; i++)
			i915_error_object_free(ee->user_bo[i]);
		kfree(ee->user_bo);

		i915_error_object_free(ee->batchbuffer);
		i915_error_object_free(ee->wa_batchbuffer);
		i915_error_object_free(ee->ringbuffer);
		i915_error_object_free(ee->hws_page);
		i915_error_object_free(ee->ctx);
		i915_error_object_free(ee->wa_ctx);

		kfree(ee->requests);
		kfree(ee);
	}

	kfree(error->overlay);
	kfree(error->display);

	cleanup_params(error);
	cleanup_uc_state(error);

	err_free_sgl(error->sgl);
	kfree(error);
}

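/*
 * Snapshot the contents of a vma: each backing page is mapped through the
 * reserved GGTT error-capture slot and run through the compressor, so the
 * copy works even for objects that are not directly CPU mappable.
 */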
static struct drm_i915_error_object *
i915_error_object_create(struct drm_i915_private *i915,
			 struct i915_vma *vma,
			 struct compress *compress)
{
	struct i915_ggtt *ggtt = &i915->ggtt;
	const u64 slot = ggtt->error_capture.start;
	struct drm_i915_error_object *dst;
	unsigned long num_pages;
	struct sgt_iter iter;
	dma_addr_t dma;
	int ret;

	might_sleep();

	if (!vma || !vma->pages)
		return NULL;

	num_pages = min_t(u64, vma->size, vma->obj->base.size) >> PAGE_SHIFT;
	num_pages = DIV_ROUND_UP(10 * num_pages, 8); /* worst-case zlib growth */
	dst = kmalloc(sizeof(*dst) + num_pages * sizeof(u32 *), ALLOW_FAIL);
	if (!dst)
		return NULL;

	if (!compress_start(compress)) {
		kfree(dst);
		return NULL;
	}

	dst->gtt_offset = vma->node.start;
	dst->gtt_size = vma->node.size;
	dst->gtt_page_sizes = vma->page_sizes.gtt;
	dst->num_pages = num_pages;
	dst->page_count = 0;
	dst->unused = 0;

	ret = -EINVAL;
	for_each_sgt_daddr(dma, iter, vma->pages) {
		void __iomem *s;

		ggtt->vm.insert_page(&ggtt->vm, dma, slot, I915_CACHE_NONE, 0);

		s = io_mapping_map_wc(&ggtt->iomap, slot, PAGE_SIZE);
		ret = compress_page(compress, (void __force *)s, dst);
		io_mapping_unmap(s);
		if (ret)
			break;
	}

	if (ret || compress_flush(compress, dst)) {
		while (dst->page_count--)
			pool_free(&compress->pool, dst->pages[dst->page_count]);
		kfree(dst);
		dst = NULL;
	}
	compress_finish(compress);

	return dst;
}

/*
 * Generate a semi-unique error code. The code is not meant to have meaning;
 * its only purpose is to try to prevent false duplicated bug reports by
 * grossly estimating a GPU error state.
 *
 * TODO Ideally, hashing the batchbuffer would be a very nice way to determine
 * the hang if we could strip the GTT offset information from it.
 *
 * It's only a small step better than a random number in its current form.
 */
static u32 i915_error_generate_code(struct i915_gpu_state *error)
{
	const struct drm_i915_error_engine *ee = error->engine;

	/*
	 * IPEHR would be an ideal way to detect errors, as it's the gross
	 * measure of "the command that hung". However, it contains some very
	 * common synchronization commands that almost always appear when the
	 * hang is strictly a client bug. Use instdone to help differentiate
	 * those cases.
	 */
	return ee ? ee->ipehr ^ ee->instdone.instdone : 0;
}

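/* Record the fence registers appropriate to this GPU generation. */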
static void gem_record_fences(struct i915_gpu_state *error)
{
	struct drm_i915_private *dev_priv = error->i915;
	struct intel_uncore *uncore = &dev_priv->uncore;
	int i;

	if (INTEL_GEN(dev_priv) >= 6) {
		for (i = 0; i < dev_priv->ggtt.num_fences; i++)
			error->fence[i] =
				intel_uncore_read64(uncore,
						    FENCE_REG_GEN6_LO(i));
	} else if (INTEL_GEN(dev_priv) >= 4) {
		for (i = 0; i < dev_priv->ggtt.num_fences; i++)
			error->fence[i] =
				intel_uncore_read64(uncore,
						    FENCE_REG_965_LO(i));
	} else {
		for (i = 0; i < dev_priv->ggtt.num_fences; i++)
			error->fence[i] =
				intel_uncore_read(uncore, FENCE_REG(i));
	}
	error->nfence = i;
}

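/*
 * Read back the per-engine MMIO state (ring registers, fault register,
 * HWSP address and PPGTT page-directory pointers) into the error engine.
 */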
static void error_record_engine_registers(struct i915_gpu_state *error,
					  struct intel_engine_cs *engine,
					  struct drm_i915_error_engine *ee)
{
	struct drm_i915_private *dev_priv = engine->i915;

	if (INTEL_GEN(dev_priv) >= 6) {
		ee->rc_psmi = ENGINE_READ(engine, RING_PSMI_CTL);

		if (INTEL_GEN(dev_priv) >= 12)
			ee->fault_reg = I915_READ(GEN12_RING_FAULT_REG);
		else if (INTEL_GEN(dev_priv) >= 8)
			ee->fault_reg = I915_READ(GEN8_RING_FAULT_REG);
		else
			ee->fault_reg = GEN6_RING_FAULT_REG_READ(engine);
	}

	if (INTEL_GEN(dev_priv) >= 4) {
		ee->faddr = ENGINE_READ(engine, RING_DMA_FADD);
		ee->ipeir = ENGINE_READ(engine, RING_IPEIR);
		ee->ipehr = ENGINE_READ(engine, RING_IPEHR);
		ee->instps = ENGINE_READ(engine, RING_INSTPS);
		ee->bbaddr = ENGINE_READ(engine, RING_BBADDR);
		if (INTEL_GEN(dev_priv) >= 8) {
			ee->faddr |= (u64)ENGINE_READ(engine, RING_DMA_FADD_UDW) << 32;
			ee->bbaddr |= (u64)ENGINE_READ(engine, RING_BBADDR_UDW) << 32;
		}
		ee->bbstate = ENGINE_READ(engine, RING_BBSTATE);
	} else {
		ee->faddr = ENGINE_READ(engine, DMA_FADD_I8XX);
		ee->ipeir = ENGINE_READ(engine, IPEIR);
		ee->ipehr = ENGINE_READ(engine, IPEHR);
	}

	intel_engine_get_instdone(engine, &ee->instdone);

	ee->instpm = ENGINE_READ(engine, RING_INSTPM);
	ee->acthd = intel_engine_get_active_head(engine);
	ee->start = ENGINE_READ(engine, RING_START);
	ee->head = ENGINE_READ(engine, RING_HEAD);
	ee->tail = ENGINE_READ(engine, RING_TAIL);
	ee->ctl = ENGINE_READ(engine, RING_CTL);
	if (INTEL_GEN(dev_priv) > 2)
		ee->mode = ENGINE_READ(engine, RING_MI_MODE);

	if (!HWS_NEEDS_PHYSICAL(dev_priv)) {
		i915_reg_t mmio;

		if (IS_GEN(dev_priv, 7)) {
			switch (engine->id) {
			default:
				MISSING_CASE(engine->id);
				/* fall through */
			case RCS0:
				mmio = RENDER_HWS_PGA_GEN7;
				break;
			case BCS0:
				mmio = BLT_HWS_PGA_GEN7;
				break;
			case VCS0:
				mmio = BSD_HWS_PGA_GEN7;
				break;
			case VECS0:
				mmio = VEBOX_HWS_PGA_GEN7;
				break;
			}
		} else if (IS_GEN(engine->i915, 6)) {
			mmio = RING_HWS_PGA_GEN6(engine->mmio_base);
		} else {
			/* XXX: gen8 returns to sanity */
			mmio = RING_HWS_PGA(engine->mmio_base);
		}

		ee->hws = I915_READ(mmio);
	}

	ee->idle = intel_engine_is_idle(engine);
	if (!ee->idle)
		ee->hangcheck_timestamp = engine->hangcheck.action_timestamp;
	ee->reset_count = i915_reset_engine_count(&dev_priv->gpu_error,
						  engine);

	if (HAS_PPGTT(dev_priv)) {
		int i;

		ee->vm_info.gfx_mode = ENGINE_READ(engine, RING_MODE_GEN7);

		if (IS_GEN(dev_priv, 6)) {
			ee->vm_info.pp_dir_base =
				ENGINE_READ(engine, RING_PP_DIR_BASE_READ);
		} else if (IS_GEN(dev_priv, 7)) {
			ee->vm_info.pp_dir_base =
				ENGINE_READ(engine, RING_PP_DIR_BASE);
		} else if (INTEL_GEN(dev_priv) >= 8) {
			u32 base = engine->mmio_base;

			for (i = 0; i < 4; i++) {
				ee->vm_info.pdp[i] =
					I915_READ(GEN8_RING_PDP_UDW(base, i));
				ee->vm_info.pdp[i] <<= 32;
				ee->vm_info.pdp[i] |=
					I915_READ(GEN8_RING_PDP_LDW(base, i));
			}
		}
	}
}

static void record_request(const struct i915_request *request,
			   struct drm_i915_error_request *erq)
{
	const struct i915_gem_context *ctx = request->gem_context;

	erq->flags = request->fence.flags;
	erq->context = request->fence.context;
	erq->seqno = request->fence.seqno;
	erq->sched_attr = request->sched.attr;
	erq->jiffies = request->emitted_jiffies;
	erq->start = i915_ggtt_offset(request->ring->vma);
	erq->head = request->head;
	erq->tail = request->tail;

	rcu_read_lock();
	erq->pid = ctx->pid ? pid_nr(ctx->pid) : 0;
	rcu_read_unlock();
}

static void engine_record_requests(struct intel_engine_cs *engine,
				   struct i915_request *first,
				   struct drm_i915_error_engine *ee)
{
	struct i915_request *request;
	int count;

	count = 0;
	request = first;
	list_for_each_entry_from(request, &engine->active.requests, sched.link)
		count++;
	if (!count)
		return;

	ee->requests = kcalloc(count, sizeof(*ee->requests), ATOMIC_MAYFAIL);
	if (!ee->requests)
		return;

	ee->num_requests = count;

	count = 0;
	request = first;
	list_for_each_entry_from(request,
				 &engine->active.requests, sched.link) {
		if (count >= ee->num_requests) {
			/*
			 * If the ring request list was changed in
			 * between the point where the error request
			 * list was created and dimensioned and this
			 * point then just exit early to avoid crashes.
			 *
			 * We don't need to communicate that the
			 * request list changed state during error
			 * state capture and that the error state is
			 * slightly incorrect as a consequence since we
			 * are typically only interested in the request
			 * list state at the point of error state
			 * capture, not in any changes happening during
			 * the capture.
			 */
			break;
		}

		record_request(request, &ee->requests[count++]);
	}
	ee->num_requests = count;
}

static void error_record_engine_execlists(const struct intel_engine_cs *engine,
					  struct drm_i915_error_engine *ee)
{
	const struct intel_engine_execlists * const execlists = &engine->execlists;
	struct i915_request * const *port = execlists->active;
	unsigned int n = 0;

	while (*port)
		record_request(*port++, &ee->execlist[n++]);

	ee->num_ports = n;
}

static bool record_context(struct drm_i915_error_context *e,
			   const struct i915_request *rq)
{
	const struct i915_gem_context *ctx = rq->gem_context;

	if (ctx->pid) {
		struct task_struct *task;

		rcu_read_lock();
		task = pid_task(ctx->pid, PIDTYPE_PID);
		if (task) {
			strcpy(e->comm, task->comm);
			e->pid = task->pid;
		}
		rcu_read_unlock();
	}

	e->hw_id = ctx->hw_id;
	e->sched_attr = ctx->sched;
	e->guilty = atomic_read(&ctx->guilty_count);
	e->active = atomic_read(&ctx->active_count);

	return i915_gem_context_no_error_capture(ctx);
}

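/*
 * The vmas to be captured are collected under the engine lock while only
 * taking references; the actual (sleeping) copy into error objects is
 * deferred until after the lock has been dropped.
 */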
struct capture_vma {
	struct capture_vma *next;
	void **slot;
};

static struct capture_vma *
capture_vma(struct capture_vma *next,
	    struct i915_vma *vma,
	    struct drm_i915_error_object **out)
{
	struct capture_vma *c;

	*out = NULL;
	if (!vma)
		return next;

	c = kmalloc(sizeof(*c), ATOMIC_MAYFAIL);
	if (!c)
		return next;

	if (!i915_active_acquire_if_busy(&vma->active)) {
		kfree(c);
		return next;
	}

	c->slot = (void **)out;
	*c->slot = i915_vma_get(vma);

	c->next = next;
	return c;
}

static struct capture_vma *
request_record_user_bo(struct i915_request *request,
		       struct drm_i915_error_engine *ee,
		       struct capture_vma *capture)
{
	struct i915_capture_list *c;
	struct drm_i915_error_object **bo;
	long count, max;

	max = 0;
	for (c = request->capture_list; c; c = c->next)
		max++;
	if (!max)
		return capture;

	bo = kmalloc_array(max, sizeof(*bo), ATOMIC_MAYFAIL);
	if (!bo) {
		/* If we can't capture everything, try to capture something. */
		max = min_t(long, max, PAGE_SIZE / sizeof(*bo));
		bo = kmalloc_array(max, sizeof(*bo), ATOMIC_MAYFAIL);
	}
	if (!bo)
		return capture;

	count = 0;
	for (c = request->capture_list; c; c = c->next) {
		capture = capture_vma(capture, c->vma, &bo[count]);
		if (++count == max)
			break;
	}

	ee->user_bo = bo;
	ee->user_bo_count = count;

	return capture;
}

static struct drm_i915_error_object *
capture_object(struct drm_i915_private *dev_priv,
	       struct drm_i915_gem_object *obj,
	       struct compress *compress)
{
	if (obj && i915_gem_object_has_pages(obj)) {
		struct i915_vma fake = {
			.node = { .start = U64_MAX, .size = obj->base.size },
			.size = obj->base.size,
			.pages = obj->mm.pages,
			.obj = obj,
		};

		return i915_error_object_create(dev_priv, &fake, compress);
	} else {
		return NULL;
	}
}

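/*
 * For each engine with an active request, record its context, registers
 * and request lists, then capture the referenced buffers once the engine
 * lock has been released.
 */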
static void
gem_record_rings(struct i915_gpu_state *error, struct compress *compress)
{
	struct drm_i915_private *i915 = error->i915;
	struct intel_engine_cs *engine;
	struct drm_i915_error_engine *ee;

	ee = kzalloc(sizeof(*ee), GFP_KERNEL);
	if (!ee)
		return;

	for_each_uabi_engine(engine, i915) {
		struct capture_vma *capture = NULL;
		struct i915_request *request;
		unsigned long flags;

		/* Refill our page pool before entering atomic section */
		pool_refill(&compress->pool, ALLOW_FAIL);

		spin_lock_irqsave(&engine->active.lock, flags);
		request = intel_engine_find_active_request(engine);
		if (!request) {
			spin_unlock_irqrestore(&engine->active.lock, flags);
			continue;
		}

		error->simulated |= record_context(&ee->context, request);

		/*
		 * We need to copy these to an anonymous buffer
		 * as the simplest method to avoid being overwritten
		 * by userspace.
		 */
		capture = capture_vma(capture,
				      request->batch,
				      &ee->batchbuffer);

		if (HAS_BROKEN_CS_TLB(i915))
			capture = capture_vma(capture,
					      engine->gt->scratch,
					      &ee->wa_batchbuffer);

		capture = request_record_user_bo(request, ee, capture);

		capture = capture_vma(capture,
				      request->hw_context->state,
				      &ee->ctx);

		capture = capture_vma(capture,
				      request->ring->vma,
				      &ee->ringbuffer);

		ee->cpu_ring_head = request->ring->head;
		ee->cpu_ring_tail = request->ring->tail;

		ee->rq_head = request->head;
		ee->rq_post = request->postfix;
		ee->rq_tail = request->tail;

		engine_record_requests(engine, request, ee);
		spin_unlock_irqrestore(&engine->active.lock, flags);

		error_record_engine_registers(error, engine, ee);
		error_record_engine_execlists(engine, ee);

		while (capture) {
			struct capture_vma *this = capture;
			struct i915_vma *vma = *this->slot;

			*this->slot =
				i915_error_object_create(i915, vma, compress);

			i915_active_release(&vma->active);
			i915_vma_put(vma);

			capture = this->next;
			kfree(this);
		}

		ee->hws_page =
			i915_error_object_create(i915,
						 engine->status_page.vma,
						 compress);

		ee->wa_ctx =
			i915_error_object_create(i915,
						 engine->wa_ctx.vma,
						 compress);

		ee->default_state =
			capture_object(i915, engine->default_state, compress);

		ee->engine = engine;

		ee->next = error->engine;
		error->engine = ee;

		ee = kzalloc(sizeof(*ee), GFP_KERNEL);
		if (!ee)
			return;
	}

	kfree(ee);
}

static void
capture_uc_state(struct i915_gpu_state *error, struct compress *compress)
{
	struct drm_i915_private *i915 = error->i915;
	struct i915_error_uc *error_uc = &error->uc;
	struct intel_uc *uc = &i915->gt.uc;

	/* Capturing uC state won't be useful if there is no GuC */
	if (!error->device_info.has_gt_uc)
		return;

	memcpy(&error_uc->guc_fw, &uc->guc.fw, sizeof(uc->guc.fw));
	memcpy(&error_uc->huc_fw, &uc->huc.fw, sizeof(uc->huc.fw));

	/*
	 * Non-default firmware paths will be specified by the modparam.
	 * As modparams are generally accessible from userspace, make
	 * explicit copies of the firmware paths.
	 */
	error_uc->guc_fw.path = kstrdup(uc->guc.fw.path, ALLOW_FAIL);
	error_uc->huc_fw.path = kstrdup(uc->huc.fw.path, ALLOW_FAIL);
	error_uc->guc_log = i915_error_object_create(i915,
						     uc->guc.log.vma,
						     compress);
}

/* Capture all registers which don't fit into another category. */
static void capture_reg_state(struct i915_gpu_state *error)
{
	struct drm_i915_private *i915 = error->i915;
	struct intel_uncore *uncore = &i915->uncore;
	int i;

	/* General organization
	 * 1. Registers specific to a single generation
	 * 2. Registers which belong to multiple generations
	 * 3. Feature specific registers.
	 * 4. Everything else
	 * Please try to follow the order.
	 */

	/* 1: Registers specific to a single generation */
	if (IS_VALLEYVIEW(i915)) {
		error->gtier[0] = intel_uncore_read(uncore, GTIER);
		error->ier = intel_uncore_read(uncore, VLV_IER);
		error->forcewake = intel_uncore_read_fw(uncore, FORCEWAKE_VLV);
	}

	if (IS_GEN(i915, 7))
		error->err_int = intel_uncore_read(uncore, GEN7_ERR_INT);

	if (INTEL_GEN(i915) >= 12) {
		error->fault_data0 = intel_uncore_read(uncore,
						       GEN12_FAULT_TLB_DATA0);
		error->fault_data1 = intel_uncore_read(uncore,
						       GEN12_FAULT_TLB_DATA1);
	} else if (INTEL_GEN(i915) >= 8) {
		error->fault_data0 = intel_uncore_read(uncore,
						       GEN8_FAULT_TLB_DATA0);
		error->fault_data1 = intel_uncore_read(uncore,
						       GEN8_FAULT_TLB_DATA1);
	}

	if (IS_GEN(i915, 6)) {
		error->forcewake = intel_uncore_read_fw(uncore, FORCEWAKE);
		error->gab_ctl = intel_uncore_read(uncore, GAB_CTL);
		error->gfx_mode = intel_uncore_read(uncore, GFX_MODE);
	}

	/* 2: Registers which belong to multiple generations */
	if (INTEL_GEN(i915) >= 7)
		error->forcewake = intel_uncore_read_fw(uncore, FORCEWAKE_MT);

	if (INTEL_GEN(i915) >= 6) {
		error->derrmr = intel_uncore_read(uncore, DERRMR);
		if (INTEL_GEN(i915) < 12) {
			error->error = intel_uncore_read(uncore, ERROR_GEN6);
			error->done_reg = intel_uncore_read(uncore, DONE_REG);
		}
	}

	if (INTEL_GEN(i915) >= 5)
		error->ccid = intel_uncore_read(uncore, CCID(RENDER_RING_BASE));

	/* 3: Feature specific registers */
	if (IS_GEN_RANGE(i915, 6, 7)) {
		error->gam_ecochk = intel_uncore_read(uncore, GAM_ECOCHK);
		error->gac_eco = intel_uncore_read(uncore, GAC_ECO_BITS);
	}

	if (IS_GEN_RANGE(i915, 8, 11))
		error->gtt_cache = intel_uncore_read(uncore, HSW_GTT_CACHE_EN);

	/* 4: Everything else */
	if (INTEL_GEN(i915) >= 11) {
		error->ier = intel_uncore_read(uncore, GEN8_DE_MISC_IER);
		error->gtier[0] =
			intel_uncore_read(uncore,
					  GEN11_RENDER_COPY_INTR_ENABLE);
		error->gtier[1] =
			intel_uncore_read(uncore, GEN11_VCS_VECS_INTR_ENABLE);
		error->gtier[2] =
			intel_uncore_read(uncore, GEN11_GUC_SG_INTR_ENABLE);
		error->gtier[3] =
			intel_uncore_read(uncore,
					  GEN11_GPM_WGBOXPERF_INTR_ENABLE);
		error->gtier[4] =
			intel_uncore_read(uncore,
					  GEN11_CRYPTO_RSVD_INTR_ENABLE);
		error->gtier[5] =
			intel_uncore_read(uncore,
					  GEN11_GUNIT_CSME_INTR_ENABLE);
		error->ngtier = 6;
	} else if (INTEL_GEN(i915) >= 8) {
		error->ier = intel_uncore_read(uncore, GEN8_DE_MISC_IER);
		for (i = 0; i < 4; i++)
			error->gtier[i] = intel_uncore_read(uncore,
							    GEN8_GT_IER(i));
		error->ngtier = 4;
	} else if (HAS_PCH_SPLIT(i915)) {
		error->ier = intel_uncore_read(uncore, DEIER);
		error->gtier[0] = intel_uncore_read(uncore, GTIER);
		error->ngtier = 1;
	} else if (IS_GEN(i915, 2)) {
		error->ier = intel_uncore_read16(uncore, GEN2_IER);
	} else if (!IS_VALLEYVIEW(i915)) {
		error->ier = intel_uncore_read(uncore, GEN2_IER);
	}
	error->eir = intel_uncore_read(uncore, EIR);
	error->pgtbl_er = intel_uncore_read(uncore, PGTBL_ER);
}

static const char *
error_msg(struct i915_gpu_state *error,
	  intel_engine_mask_t engines, const char *msg)
{
	int len;

	len = scnprintf(error->error_msg, sizeof(error->error_msg),
			"GPU HANG: ecode %d:%x:0x%08x",
			INTEL_GEN(error->i915), engines,
			i915_error_generate_code(error));
	if (error->engine) {
		/* Just show the first executing process, more is confusing */
		len += scnprintf(error->error_msg + len,
				 sizeof(error->error_msg) - len,
				 ", in %s [%d]",
				 error->engine->context.comm,
				 error->engine->context.pid);
	}
	if (msg)
		len += scnprintf(error->error_msg + len,
				 sizeof(error->error_msg) - len,
				 ", %s", msg);

	return error->error_msg;
}

static void capture_gen_state(struct i915_gpu_state *error)
{
	struct drm_i915_private *i915 = error->i915;

	error->awake = i915->gt.awake;
	error->wakelock = atomic_read(&i915->runtime_pm.wakeref_count);
	error->suspended = i915->runtime_pm.suspended;

	error->iommu = -1;
#ifdef CONFIG_INTEL_IOMMU
	error->iommu = intel_iommu_gfx_mapped;
#endif
	error->reset_count = i915_reset_count(&i915->gpu_error);
	error->suspend_count = i915->suspend_count;

	memcpy(&error->device_info,
	       INTEL_INFO(i915),
	       sizeof(error->device_info));
	memcpy(&error->runtime_info,
	       RUNTIME_INFO(i915),
	       sizeof(error->runtime_info));
	error->driver_caps = i915->caps;
}

static void capture_params(struct i915_gpu_state *error)
{
	i915_params_copy(&error->params, &i915_modparams);
}

static unsigned long capture_find_epoch(const struct i915_gpu_state *error)
{
	const struct drm_i915_error_engine *ee;
	unsigned long epoch = error->capture;

	for (ee = error->engine; ee; ee = ee->next) {
		if (ee->hangcheck_timestamp &&
		    time_before(ee->hangcheck_timestamp, epoch))
			epoch = ee->hangcheck_timestamp;
	}

	return epoch;
}

static void capture_finish(struct i915_gpu_state *error)
{
	struct i915_ggtt *ggtt = &error->i915->ggtt;
	const u64 slot = ggtt->error_capture.start;

	ggtt->vm.clear_range(&ggtt->vm, slot, PAGE_SIZE);
}

#define DAY_AS_SECONDS(x) (24 * 60 * 60 * (x))

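/*
 * Build a complete snapshot of the GPU state at the time of the hang:
 * parameters, device capabilities, uC firmware state, registers, fences
 * and per-engine captures, plus overlay and display state.
 */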
struct i915_gpu_state *
i915_capture_gpu_state(struct drm_i915_private *i915)
{
	struct i915_gpu_state *error;
	struct compress compress;

	/* Check if GPU capture has been disabled */
	error = READ_ONCE(i915->gpu_error.first_error);
	if (IS_ERR(error))
		return error;

	error = kzalloc(sizeof(*error), ALLOW_FAIL);
	if (!error) {
		i915_disable_error_state(i915, -ENOMEM);
		return ERR_PTR(-ENOMEM);
	}

	if (!compress_init(&compress)) {
		kfree(error);
		i915_disable_error_state(i915, -ENOMEM);
		return ERR_PTR(-ENOMEM);
	}

	kref_init(&error->ref);
	error->i915 = i915;

	error->time = ktime_get_real();
	error->boottime = ktime_get_boottime();
	error->uptime = ktime_sub(ktime_get(), i915->gt.last_init_time);
	error->capture = jiffies;

	capture_params(error);
	capture_gen_state(error);
	capture_uc_state(error, &compress);
	capture_reg_state(error);
	gem_record_fences(error);
	gem_record_rings(error, &compress);

	error->overlay = intel_overlay_capture_error_state(i915);
	error->display = intel_display_capture_error_state(i915);

	error->epoch = capture_find_epoch(error);

	capture_finish(error);
	compress_fini(&compress);

	return error;
}

/**
 * i915_capture_error_state - capture an error record for later analysis
 * @i915: i915 device
 * @engine_mask: the mask of engines triggering the hang
 * @msg: a message to insert into the error capture header
 *
 * Should be called when an error is detected (either a hang or an error
 * interrupt) to capture error state from the time of the error. Fills
 * out a structure which becomes available in debugfs for user level tools
 * to pick up.
 */
void i915_capture_error_state(struct drm_i915_private *i915,
			      intel_engine_mask_t engine_mask,
			      const char *msg)
{
	static bool warned;
	struct i915_gpu_state *error;
	unsigned long flags;

	if (!i915_modparams.error_capture)
		return;

	if (READ_ONCE(i915->gpu_error.first_error))
		return;

	error = i915_capture_gpu_state(i915);
	if (IS_ERR(error))
		return;

	dev_info(i915->drm.dev, "%s\n", error_msg(error, engine_mask, msg));

	if (!error->simulated) {
		spin_lock_irqsave(&i915->gpu_error.lock, flags);
		if (!i915->gpu_error.first_error) {
			i915->gpu_error.first_error = error;
			error = NULL;
		}
		spin_unlock_irqrestore(&i915->gpu_error.lock, flags);
	}

	if (error) {
		__i915_gpu_state_free(&error->ref);
		return;
	}

	if (!xchg(&warned, true) &&
	    ktime_get_real_seconds() - DRIVER_TIMESTAMP < DAY_AS_SECONDS(180)) {
		pr_info("GPU hangs can indicate a bug anywhere in the entire gfx stack, including userspace.\n");
		pr_info("Please file a _new_ bug report on bugs.freedesktop.org against DRI -> DRM/Intel\n");
		pr_info("drm/i915 developers can then reassign to the right component if it's not a kernel issue.\n");
		pr_info("The GPU crash dump is required to analyze GPU hangs, so please always attach it.\n");
		pr_info("GPU crash dump saved to /sys/class/drm/card%d/error\n",
			i915->drm.primary->index);
	}
}

struct i915_gpu_state *
i915_first_error_state(struct drm_i915_private *i915)
{
	struct i915_gpu_state *error;

	spin_lock_irq(&i915->gpu_error.lock);
	error = i915->gpu_error.first_error;
	if (!IS_ERR_OR_NULL(error))
		i915_gpu_state_get(error);
	spin_unlock_irq(&i915->gpu_error.lock);

	return error;
}

void i915_reset_error_state(struct drm_i915_private *i915)
{
	struct i915_gpu_state *error;

	spin_lock_irq(&i915->gpu_error.lock);
	error = i915->gpu_error.first_error;
	if (error != ERR_PTR(-ENODEV)) /* if disabled, always disabled */
		i915->gpu_error.first_error = NULL;
	spin_unlock_irq(&i915->gpu_error.lock);

	if (!IS_ERR_OR_NULL(error))
		i915_gpu_state_put(error);
}

void i915_disable_error_state(struct drm_i915_private *i915, int err)
{
	spin_lock_irq(&i915->gpu_error.lock);
	if (!i915->gpu_error.first_error)
		i915->gpu_error.first_error = ERR_PTR(err);
	spin_unlock_irq(&i915->gpu_error.lock);
}