/* drivers/gpu/drm/i915/i915_gpu_error.c */
/*
 * Copyright (c) 2008 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 *
 * Authors:
 *    Eric Anholt <eric@anholt.net>
 *    Keith Packard <keithp@keithp.com>
 *    Mika Kuoppala <mika.kuoppala@intel.com>
 *
 */

#include <linux/ascii85.h>
#include <linux/nmi.h>
#include <linux/scatterlist.h>
#include <linux/stop_machine.h>
#include <linux/utsname.h>
#include <linux/zlib.h>

#include <drm/drm_print.h>

#include "i915_gpu_error.h"
#include "i915_drv.h"

static inline const struct intel_engine_cs *
engine_lookup(const struct drm_i915_private *i915, unsigned int id)
{
	if (id >= I915_NUM_ENGINES)
		return NULL;

	return i915->engine[id];
}

static inline const char *
__engine_name(const struct intel_engine_cs *engine)
{
	return engine ? engine->name : "";
}

static const char *
engine_name(const struct drm_i915_private *i915, unsigned int id)
{
	return __engine_name(engine_lookup(i915, id));
}

static const char *tiling_flag(int tiling)
{
	switch (tiling) {
	default:
	case I915_TILING_NONE: return "";
	case I915_TILING_X: return " X";
	case I915_TILING_Y: return " Y";
	}
}

static const char *dirty_flag(int dirty)
{
	return dirty ? " dirty" : "";
}

static const char *purgeable_flag(int purgeable)
{
	return purgeable ? " purgeable" : "";
}

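/*
 * The error state text is accumulated in kmalloc'd chunks, each recorded
 * in a scatterlist entry so that no single huge allocation is required.
 * sg->dma_address is (ab)used to remember the logical offset of each
 * chunk within the final report.
 */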
static void __sg_set_buf(struct scatterlist *sg,
			 void *addr, unsigned int len, loff_t it)
{
	sg->page_link = (unsigned long)virt_to_page(addr);
	sg->offset = offset_in_page(addr);
	sg->length = len;
	sg->dma_address = it;
}

static bool __i915_error_grow(struct drm_i915_error_state_buf *e, size_t len)
{
	if (!len)
		return false;

	if (e->bytes + len + 1 <= e->size)
		return true;

	if (e->bytes) {
		__sg_set_buf(e->cur++, e->buf, e->bytes, e->iter);
		e->iter += e->bytes;
		e->buf = NULL;
		e->bytes = 0;
	}

	if (e->cur == e->end) {
		struct scatterlist *sgl;

		sgl = (typeof(sgl))__get_free_page(GFP_KERNEL);
		if (!sgl) {
			e->err = -ENOMEM;
			return false;
		}

		if (e->cur) {
			e->cur->offset = 0;
			e->cur->length = 0;
			e->cur->page_link =
				(unsigned long)sgl | SG_CHAIN;
		} else {
			e->sgl = sgl;
		}

		e->cur = sgl;
		e->end = sgl + SG_MAX_SINGLE_ALLOC - 1;
	}

	e->size = ALIGN(len + 1, SZ_64K);
	e->buf = kmalloc(e->size, GFP_KERNEL | __GFP_NOWARN | __GFP_NORETRY);
	if (!e->buf) {
		e->size = PAGE_ALIGN(len + 1);
		e->buf = kmalloc(e->size, GFP_KERNEL);
	}
	if (!e->buf) {
		e->err = -ENOMEM;
		return false;
	}

	return true;
}

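/*
 * Format into the growable buffer in two passes: vsnprintf() with a NULL
 * destination first to measure the output, grow to fit, then vscnprintf()
 * into the space just reserved.
 */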
__printf(2, 0)
static void i915_error_vprintf(struct drm_i915_error_state_buf *e,
			       const char *fmt, va_list args)
{
	va_list ap;
	int len;

	if (e->err)
		return;

	va_copy(ap, args);
	len = vsnprintf(NULL, 0, fmt, ap);
	va_end(ap);
	if (len <= 0) {
		e->err = len;
		return;
	}

	if (!__i915_error_grow(e, len))
		return;

	GEM_BUG_ON(e->bytes >= e->size);
	len = vscnprintf(e->buf + e->bytes, e->size - e->bytes, fmt, args);
	if (len < 0) {
		e->err = len;
		return;
	}
	e->bytes += len;
}

static void i915_error_puts(struct drm_i915_error_state_buf *e, const char *str)
{
	unsigned len;

	if (e->err || !str)
		return;

	len = strlen(str);
	if (!__i915_error_grow(e, len))
		return;

	GEM_BUG_ON(e->bytes + len > e->size);
	memcpy(e->buf + e->bytes, str, len);
	e->bytes += len;
}

#define err_printf(e, ...) i915_error_printf(e, __VA_ARGS__)
#define err_puts(e, s) i915_error_puts(e, s)

static void __i915_printfn_error(struct drm_printer *p, struct va_format *vaf)
{
	i915_error_vprintf(p->arg, vaf->fmt, *vaf->va);
}

static inline struct drm_printer
i915_error_printer(struct drm_i915_error_state_buf *e)
{
	struct drm_printer p = {
		.printfn = __i915_printfn_error,
		.arg = e,
	};
	return p;
}

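/*
 * Both sides of this #ifdef provide the same compress_*() interface:
 * with CONFIG_DRM_I915_COMPRESS_ERROR captured pages are deflated with
 * zlib, otherwise they are copied verbatim. The marker printed by
 * err_compression_marker() (':' vs '~') tells the decoder which encoding
 * was used.
 */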
#ifdef CONFIG_DRM_I915_COMPRESS_ERROR

struct compress {
	struct z_stream_s zstream;
	void *tmp;
};

static bool compress_init(struct compress *c)
{
	struct z_stream_s *zstream = memset(&c->zstream, 0, sizeof(c->zstream));

	zstream->workspace =
		kmalloc(zlib_deflate_workspacesize(MAX_WBITS, MAX_MEM_LEVEL),
			GFP_ATOMIC | __GFP_NOWARN);
	if (!zstream->workspace)
		return false;

	if (zlib_deflateInit(zstream, Z_DEFAULT_COMPRESSION) != Z_OK) {
		kfree(zstream->workspace);
		return false;
	}

	c->tmp = NULL;
	if (i915_has_memcpy_from_wc())
		c->tmp = (void *)__get_free_page(GFP_ATOMIC | __GFP_NOWARN);

	return true;
}

static void *compress_next_page(struct drm_i915_error_object *dst)
{
	unsigned long page;

	if (dst->page_count >= dst->num_pages)
		return ERR_PTR(-ENOSPC);

	page = __get_free_page(GFP_ATOMIC | __GFP_NOWARN);
	if (!page)
		return ERR_PTR(-ENOMEM);

	return dst->pages[dst->page_count++] = (void *)page;
}

static int compress_page(struct compress *c,
			 void *src,
			 struct drm_i915_error_object *dst)
{
	struct z_stream_s *zstream = &c->zstream;

	zstream->next_in = src;
	if (c->tmp && i915_memcpy_from_wc(c->tmp, src, PAGE_SIZE))
		zstream->next_in = c->tmp;
	zstream->avail_in = PAGE_SIZE;

	do {
		if (zstream->avail_out == 0) {
			zstream->next_out = compress_next_page(dst);
			if (IS_ERR(zstream->next_out))
				return PTR_ERR(zstream->next_out);

			zstream->avail_out = PAGE_SIZE;
		}

		if (zlib_deflate(zstream, Z_NO_FLUSH) != Z_OK)
			return -EIO;

		touch_nmi_watchdog();
	} while (zstream->avail_in);

	/* Fallback to uncompressed if we increase size? */
	if (0 && zstream->total_out > zstream->total_in)
		return -E2BIG;

	return 0;
}

static int compress_flush(struct compress *c,
			  struct drm_i915_error_object *dst)
{
	struct z_stream_s *zstream = &c->zstream;

	do {
		switch (zlib_deflate(zstream, Z_FINISH)) {
		case Z_OK: /* more space requested */
			zstream->next_out = compress_next_page(dst);
			if (IS_ERR(zstream->next_out))
				return PTR_ERR(zstream->next_out);

			zstream->avail_out = PAGE_SIZE;
			break;

		case Z_STREAM_END:
			goto end;

		default: /* any error */
			return -EIO;
		}
	} while (1);

end:
	memset(zstream->next_out, 0, zstream->avail_out);
	dst->unused = zstream->avail_out;
	return 0;
}

static void compress_fini(struct compress *c,
			  struct drm_i915_error_object *dst)
{
	struct z_stream_s *zstream = &c->zstream;

	zlib_deflateEnd(zstream);
	kfree(zstream->workspace);
	if (c->tmp)
		free_page((unsigned long)c->tmp);
}

static void err_compression_marker(struct drm_i915_error_state_buf *m)
{
	err_puts(m, ":");
}

#else

struct compress {
};

static bool compress_init(struct compress *c)
{
	return true;
}

static int compress_page(struct compress *c,
			 void *src,
			 struct drm_i915_error_object *dst)
{
	unsigned long page;
	void *ptr;

	page = __get_free_page(GFP_ATOMIC | __GFP_NOWARN);
	if (!page)
		return -ENOMEM;

	ptr = (void *)page;
	if (!i915_memcpy_from_wc(ptr, src, PAGE_SIZE))
		memcpy(ptr, src, PAGE_SIZE);
	dst->pages[dst->page_count++] = ptr;

	return 0;
}

static int compress_flush(struct compress *c,
			  struct drm_i915_error_object *dst)
{
	return 0;
}

static void compress_fini(struct compress *c,
			  struct drm_i915_error_object *dst)
{
}

static void err_compression_marker(struct drm_i915_error_state_buf *m)
{
	err_puts(m, "~");
}

#endif

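/*
 * Emit one line per buffer: offset, size, read domains, write domain and
 * last write seqno, followed by flag suffixes (tiling, dirty, purgeable,
 * userptr, engine, cache level) plus optional name/fence annotations.
 */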
static void print_error_buffers(struct drm_i915_error_state_buf *m,
				const char *name,
				struct drm_i915_error_buffer *err,
				int count)
{
	err_printf(m, "%s [%d]:\n", name, count);

	while (count--) {
		err_printf(m, " %08x_%08x %8u %02x %02x %02x",
			   upper_32_bits(err->gtt_offset),
			   lower_32_bits(err->gtt_offset),
			   err->size,
			   err->read_domains,
			   err->write_domain,
			   err->wseqno);
		err_puts(m, tiling_flag(err->tiling));
		err_puts(m, dirty_flag(err->dirty));
		err_puts(m, purgeable_flag(err->purgeable));
		err_puts(m, err->userptr ? " userptr" : "");
		err_puts(m, err->engine != -1 ? " " : "");
		err_puts(m, engine_name(m->i915, err->engine));
		err_puts(m, i915_cache_level_str(m->i915, err->cache_level));

		if (err->name)
			err_printf(m, " (name: %d)", err->name);
		if (err->fence_reg != I915_FENCE_REG_NONE)
			err_printf(m, " (fence: %d)", err->fence_reg);

		err_puts(m, "\n");
		err++;
	}
}

static void error_print_instdone(struct drm_i915_error_state_buf *m,
				 const struct drm_i915_error_engine *ee)
{
	int slice;
	int subslice;

	err_printf(m, " INSTDONE: 0x%08x\n",
		   ee->instdone.instdone);

	if (ee->engine_id != RCS || INTEL_GEN(m->i915) <= 3)
		return;

	err_printf(m, " SC_INSTDONE: 0x%08x\n",
		   ee->instdone.slice_common);

	if (INTEL_GEN(m->i915) <= 6)
		return;

	for_each_instdone_slice_subslice(m->i915, slice, subslice)
		err_printf(m, " SAMPLER_INSTDONE[%d][%d]: 0x%08x\n",
			   slice, subslice,
			   ee->instdone.sampler[slice][subslice]);

	for_each_instdone_slice_subslice(m->i915, slice, subslice)
		err_printf(m, " ROW_INSTDONE[%d][%d]: 0x%08x\n",
			   slice, subslice,
			   ee->instdone.row[slice][subslice]);
}

static const char *bannable(const struct drm_i915_error_context *ctx)
{
	return ctx->bannable ? "" : " (unbannable)";
}

static void error_print_request(struct drm_i915_error_state_buf *m,
				const char *prefix,
				const struct drm_i915_error_request *erq,
				const unsigned long epoch)
{
	if (!erq->seqno)
		return;

	err_printf(m, "%s pid %d, ban score %d, seqno %8x:%08x, prio %d, emitted %dms, start %08x, head %08x, tail %08x\n",
		   prefix, erq->pid, erq->ban_score,
		   erq->context, erq->seqno, erq->sched_attr.priority,
		   jiffies_to_msecs(erq->jiffies - epoch),
		   erq->start, erq->head, erq->tail);
}

static void error_print_context(struct drm_i915_error_state_buf *m,
				const char *header,
				const struct drm_i915_error_context *ctx)
{
	err_printf(m, "%s%s[%d] user_handle %d hw_id %d, prio %d, ban score %d%s guilty %d active %d\n",
		   header, ctx->comm, ctx->pid, ctx->handle, ctx->hw_id,
		   ctx->sched_attr.priority, ctx->ban_score, bannable(ctx),
		   ctx->guilty, ctx->active);
}

static void error_print_engine(struct drm_i915_error_state_buf *m,
			       const struct drm_i915_error_engine *ee,
			       const unsigned long epoch)
{
	int n;

	err_printf(m, "%s command stream:\n",
		   engine_name(m->i915, ee->engine_id));
	err_printf(m, " IDLE?: %s\n", yesno(ee->idle));
	err_printf(m, " START: 0x%08x\n", ee->start);
	err_printf(m, " HEAD: 0x%08x [0x%08x]\n", ee->head, ee->rq_head);
	err_printf(m, " TAIL: 0x%08x [0x%08x, 0x%08x]\n",
		   ee->tail, ee->rq_post, ee->rq_tail);
	err_printf(m, " CTL: 0x%08x\n", ee->ctl);
	err_printf(m, " MODE: 0x%08x\n", ee->mode);
	err_printf(m, " HWS: 0x%08x\n", ee->hws);
	err_printf(m, " ACTHD: 0x%08x %08x\n",
		   (u32)(ee->acthd>>32), (u32)ee->acthd);
	err_printf(m, " IPEIR: 0x%08x\n", ee->ipeir);
	err_printf(m, " IPEHR: 0x%08x\n", ee->ipehr);

	error_print_instdone(m, ee);

	if (ee->batchbuffer) {
		u64 start = ee->batchbuffer->gtt_offset;
		u64 end = start + ee->batchbuffer->gtt_size;

		err_printf(m, " batch: [0x%08x_%08x, 0x%08x_%08x]\n",
			   upper_32_bits(start), lower_32_bits(start),
			   upper_32_bits(end), lower_32_bits(end));
	}
	if (INTEL_GEN(m->i915) >= 4) {
		err_printf(m, " BBADDR: 0x%08x_%08x\n",
			   (u32)(ee->bbaddr>>32), (u32)ee->bbaddr);
		err_printf(m, " BB_STATE: 0x%08x\n", ee->bbstate);
		err_printf(m, " INSTPS: 0x%08x\n", ee->instps);
	}
	err_printf(m, " INSTPM: 0x%08x\n", ee->instpm);
	err_printf(m, " FADDR: 0x%08x %08x\n", upper_32_bits(ee->faddr),
		   lower_32_bits(ee->faddr));
	if (INTEL_GEN(m->i915) >= 6) {
		err_printf(m, " RC PSMI: 0x%08x\n", ee->rc_psmi);
		err_printf(m, " FAULT_REG: 0x%08x\n", ee->fault_reg);
		err_printf(m, " SYNC_0: 0x%08x\n",
			   ee->semaphore_mboxes[0]);
		err_printf(m, " SYNC_1: 0x%08x\n",
			   ee->semaphore_mboxes[1]);
		if (HAS_VEBOX(m->i915))
			err_printf(m, " SYNC_2: 0x%08x\n",
				   ee->semaphore_mboxes[2]);
	}
	if (HAS_PPGTT(m->i915)) {
		err_printf(m, " GFX_MODE: 0x%08x\n", ee->vm_info.gfx_mode);

		if (INTEL_GEN(m->i915) >= 8) {
			int i;
			for (i = 0; i < 4; i++)
				err_printf(m, " PDP%d: 0x%016llx\n",
					   i, ee->vm_info.pdp[i]);
		} else {
			err_printf(m, " PP_DIR_BASE: 0x%08x\n",
				   ee->vm_info.pp_dir_base);
		}
	}
	err_printf(m, " seqno: 0x%08x\n", ee->seqno);
	err_printf(m, " last_seqno: 0x%08x\n", ee->last_seqno);
	err_printf(m, " waiting: %s\n", yesno(ee->waiting));
	err_printf(m, " ring->head: 0x%08x\n", ee->cpu_ring_head);
	err_printf(m, " ring->tail: 0x%08x\n", ee->cpu_ring_tail);
	err_printf(m, " hangcheck stall: %s\n", yesno(ee->hangcheck_stalled));
	err_printf(m, " hangcheck action: %s\n",
		   hangcheck_action_to_str(ee->hangcheck_action));
	err_printf(m, " hangcheck action timestamp: %dms (%lu%s)\n",
		   jiffies_to_msecs(ee->hangcheck_timestamp - epoch),
		   ee->hangcheck_timestamp,
		   ee->hangcheck_timestamp == epoch ? "; epoch" : "");
	err_printf(m, " engine reset count: %u\n", ee->reset_count);

	for (n = 0; n < ee->num_ports; n++) {
		err_printf(m, " ELSP[%d]:", n);
		error_print_request(m, " ", &ee->execlist[n], epoch);
	}

	error_print_context(m, " Active context: ", &ee->context);
}

void i915_error_printf(struct drm_i915_error_state_buf *e, const char *f, ...)
{
	va_list args;

	va_start(args, f);
	i915_error_vprintf(e, f, args);
	va_end(args);
}

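/*
 * Object contents are dumped as ascii85 following the compression marker;
 * the final page is trimmed by obj->unused so that padding added during
 * capture is not encoded.
 */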
static void print_error_obj(struct drm_i915_error_state_buf *m,
			    struct intel_engine_cs *engine,
			    const char *name,
			    struct drm_i915_error_object *obj)
{
	char out[ASCII85_BUFSZ];
	int page;

	if (!obj)
		return;

	if (name) {
		err_printf(m, "%s --- %s = 0x%08x %08x\n",
			   engine ? engine->name : "global", name,
			   upper_32_bits(obj->gtt_offset),
			   lower_32_bits(obj->gtt_offset));
	}

	err_compression_marker(m);
	for (page = 0; page < obj->page_count; page++) {
		int i, len;

		len = PAGE_SIZE;
		if (page == obj->page_count - 1)
			len -= obj->unused;
		len = ascii85_encode_len(len);

		for (i = 0; i < len; i++)
			err_puts(m, ascii85_encode(obj->pages[page][i], out));
	}
	err_puts(m, "\n");
}

static void err_print_capabilities(struct drm_i915_error_state_buf *m,
				   const struct intel_device_info *info,
				   const struct intel_driver_caps *caps)
{
	struct drm_printer p = i915_error_printer(m);

	intel_device_info_dump_flags(info, &p);
	intel_driver_caps_print(caps, &p);
	intel_device_info_dump_topology(&info->sseu, &p);
}

static void err_print_params(struct drm_i915_error_state_buf *m,
			     const struct i915_params *params)
{
	struct drm_printer p = i915_error_printer(m);

	i915_params_dump(params, &p);
}

static void err_print_pciid(struct drm_i915_error_state_buf *m,
			    struct drm_i915_private *i915)
{
	struct pci_dev *pdev = i915->drm.pdev;

	err_printf(m, "PCI ID: 0x%04x\n", pdev->device);
	err_printf(m, "PCI Revision: 0x%02x\n", pdev->revision);
	err_printf(m, "PCI Subsystem: %04x:%04x\n",
		   pdev->subsystem_vendor,
		   pdev->subsystem_device);
}

static void err_print_uc(struct drm_i915_error_state_buf *m,
			 const struct i915_error_uc *error_uc)
{
	struct drm_printer p = i915_error_printer(m);
	const struct i915_gpu_state *error =
		container_of(error_uc, typeof(*error), uc);

	if (!error->device_info.has_guc)
		return;

	intel_uc_fw_dump(&error_uc->guc_fw, &p);
	intel_uc_fw_dump(&error_uc->huc_fw, &p);
	print_error_obj(m, NULL, "GuC log buffer", error_uc->guc_log);
}

static void err_free_sgl(struct scatterlist *sgl)
{
	while (sgl) {
		struct scatterlist *sg;

		for (sg = sgl; !sg_is_chain(sg); sg++) {
			kfree(sg_virt(sg));
			if (sg_is_last(sg))
				break;
		}

		sg = sg_is_last(sg) ? NULL : sg_chain_ptr(sg);
		free_page((unsigned long)sgl);
		sgl = sg;
	}
}

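/*
 * __err_print_to_sgl() renders the complete error report into the
 * scatterlist-backed text buffer; everything userspace reads out of the
 * error state file is generated here.
 */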
static void __err_print_to_sgl(struct drm_i915_error_state_buf *m,
			       struct i915_gpu_state *error)
{
	struct drm_i915_error_object *obj;
	struct timespec64 ts;
	int i, j;

	if (*error->error_msg)
		err_printf(m, "%s\n", error->error_msg);
	err_printf(m, "Kernel: %s\n", init_utsname()->release);
	ts = ktime_to_timespec64(error->time);
	err_printf(m, "Time: %lld s %ld us\n",
		   (s64)ts.tv_sec, ts.tv_nsec / NSEC_PER_USEC);
	ts = ktime_to_timespec64(error->boottime);
	err_printf(m, "Boottime: %lld s %ld us\n",
		   (s64)ts.tv_sec, ts.tv_nsec / NSEC_PER_USEC);
	ts = ktime_to_timespec64(error->uptime);
	err_printf(m, "Uptime: %lld s %ld us\n",
		   (s64)ts.tv_sec, ts.tv_nsec / NSEC_PER_USEC);
	err_printf(m, "Epoch: %lu jiffies (%u HZ)\n", error->epoch, HZ);
	err_printf(m, "Capture: %lu jiffies; %d ms ago, %d ms after epoch\n",
		   error->capture,
		   jiffies_to_msecs(jiffies - error->capture),
		   jiffies_to_msecs(error->capture - error->epoch));

	for (i = 0; i < ARRAY_SIZE(error->engine); i++) {
		if (error->engine[i].hangcheck_stalled &&
		    error->engine[i].context.pid) {
			err_printf(m, "Active process (on ring %s): %s [%d], score %d%s\n",
				   engine_name(m->i915, i),
				   error->engine[i].context.comm,
				   error->engine[i].context.pid,
				   error->engine[i].context.ban_score,
				   bannable(&error->engine[i].context));
		}
	}
	err_printf(m, "Reset count: %u\n", error->reset_count);
	err_printf(m, "Suspend count: %u\n", error->suspend_count);
	err_printf(m, "Platform: %s\n", intel_platform_name(error->device_info.platform));
	err_print_pciid(m, m->i915);

	err_printf(m, "IOMMU enabled?: %d\n", error->iommu);

	if (HAS_CSR(m->i915)) {
		struct intel_csr *csr = &m->i915->csr;

		err_printf(m, "DMC loaded: %s\n",
			   yesno(csr->dmc_payload != NULL));
		err_printf(m, "DMC fw version: %d.%d\n",
			   CSR_VERSION_MAJOR(csr->version),
			   CSR_VERSION_MINOR(csr->version));
	}

	err_printf(m, "GT awake: %s\n", yesno(error->awake));
	err_printf(m, "RPM wakelock: %s\n", yesno(error->wakelock));
	err_printf(m, "PM suspended: %s\n", yesno(error->suspended));
	err_printf(m, "EIR: 0x%08x\n", error->eir);
	err_printf(m, "IER: 0x%08x\n", error->ier);
	for (i = 0; i < error->ngtier; i++)
		err_printf(m, "GTIER[%d]: 0x%08x\n", i, error->gtier[i]);
	err_printf(m, "PGTBL_ER: 0x%08x\n", error->pgtbl_er);
	err_printf(m, "FORCEWAKE: 0x%08x\n", error->forcewake);
	err_printf(m, "DERRMR: 0x%08x\n", error->derrmr);
	err_printf(m, "CCID: 0x%08x\n", error->ccid);
	err_printf(m, "Missed interrupts: 0x%08lx\n",
		   m->i915->gpu_error.missed_irq_rings);

	for (i = 0; i < error->nfence; i++)
		err_printf(m, " fence[%d] = %08llx\n", i, error->fence[i]);

	if (INTEL_GEN(m->i915) >= 6) {
		err_printf(m, "ERROR: 0x%08x\n", error->error);

		if (INTEL_GEN(m->i915) >= 8)
			err_printf(m, "FAULT_TLB_DATA: 0x%08x 0x%08x\n",
				   error->fault_data1, error->fault_data0);

		err_printf(m, "DONE_REG: 0x%08x\n", error->done_reg);
	}

	if (IS_GEN(m->i915, 7))
		err_printf(m, "ERR_INT: 0x%08x\n", error->err_int);

	for (i = 0; i < ARRAY_SIZE(error->engine); i++) {
		if (error->engine[i].engine_id != -1)
			error_print_engine(m, &error->engine[i], error->epoch);
	}

	for (i = 0; i < ARRAY_SIZE(error->active_vm); i++) {
		char buf[128];
		int len, first = 1;

		if (!error->active_vm[i])
			break;

		len = scnprintf(buf, sizeof(buf), "Active (");
		for (j = 0; j < ARRAY_SIZE(error->engine); j++) {
			if (error->engine[j].vm != error->active_vm[i])
				continue;

			len += scnprintf(buf + len, sizeof(buf), "%s%s",
					 first ? "" : ", ",
					 m->i915->engine[j]->name);
			first = 0;
		}
		scnprintf(buf + len, sizeof(buf), ")");
		print_error_buffers(m, buf,
				    error->active_bo[i],
				    error->active_bo_count[i]);
	}

	print_error_buffers(m, "Pinned (global)",
			    error->pinned_bo,
			    error->pinned_bo_count);

	for (i = 0; i < ARRAY_SIZE(error->engine); i++) {
		const struct drm_i915_error_engine *ee = &error->engine[i];

		obj = ee->batchbuffer;
		if (obj) {
			err_puts(m, m->i915->engine[i]->name);
			if (ee->context.pid)
				err_printf(m, " (submitted by %s [%d], ctx %d [%d], score %d%s)",
					   ee->context.comm,
					   ee->context.pid,
					   ee->context.handle,
					   ee->context.hw_id,
					   ee->context.ban_score,
					   bannable(&ee->context));
			err_printf(m, " --- gtt_offset = 0x%08x %08x\n",
				   upper_32_bits(obj->gtt_offset),
				   lower_32_bits(obj->gtt_offset));
			print_error_obj(m, m->i915->engine[i], NULL, obj);
		}

		for (j = 0; j < ee->user_bo_count; j++)
			print_error_obj(m, m->i915->engine[i],
					"user", ee->user_bo[j]);

		if (ee->num_requests) {
			err_printf(m, "%s --- %d requests\n",
				   m->i915->engine[i]->name,
				   ee->num_requests);
			for (j = 0; j < ee->num_requests; j++)
				error_print_request(m, " ",
						    &ee->requests[j],
						    error->epoch);
		}

		if (IS_ERR(ee->waiters)) {
			err_printf(m, "%s --- ? waiters [unable to acquire spinlock]\n",
				   m->i915->engine[i]->name);
		} else if (ee->num_waiters) {
			err_printf(m, "%s --- %d waiters\n",
				   m->i915->engine[i]->name,
				   ee->num_waiters);
			for (j = 0; j < ee->num_waiters; j++) {
				err_printf(m, " seqno 0x%08x for %s [%d]\n",
					   ee->waiters[j].seqno,
					   ee->waiters[j].comm,
					   ee->waiters[j].pid);
			}
		}

		print_error_obj(m, m->i915->engine[i],
				"ringbuffer", ee->ringbuffer);

		print_error_obj(m, m->i915->engine[i],
				"HW Status", ee->hws_page);

		print_error_obj(m, m->i915->engine[i],
				"HW context", ee->ctx);

		print_error_obj(m, m->i915->engine[i],
				"WA context", ee->wa_ctx);

		print_error_obj(m, m->i915->engine[i],
				"WA batchbuffer", ee->wa_batchbuffer);

		print_error_obj(m, m->i915->engine[i],
				"NULL context", ee->default_state);
	}

	if (error->overlay)
		intel_overlay_print_error_state(m, error->overlay);

	if (error->display)
		intel_display_print_error_state(m, error->display);

	err_print_capabilities(m, &error->device_info, &error->driver_caps);
	err_print_params(m, &error->params);
	err_print_uc(m, &error->uc);
}

static int err_print_to_sgl(struct i915_gpu_state *error)
{
	struct drm_i915_error_state_buf m;

	if (IS_ERR(error))
		return PTR_ERR(error);

	if (READ_ONCE(error->sgl))
		return 0;

	memset(&m, 0, sizeof(m));
	m.i915 = error->i915;

	__err_print_to_sgl(&m, error);

	if (m.buf) {
		__sg_set_buf(m.cur++, m.buf, m.bytes, m.iter);
		m.bytes = 0;
		m.buf = NULL;
	}
	if (m.cur) {
		GEM_BUG_ON(m.end < m.cur);
		sg_mark_end(m.cur - 1);
	}
	GEM_BUG_ON(m.sgl && !m.cur);

	if (m.err) {
		err_free_sgl(m.sgl);
		return m.err;
	}

	if (cmpxchg(&error->sgl, NULL, m.sgl))
		err_free_sgl(m.sgl);

	return 0;
}

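/*
 * Copy out up to @rem bytes of the report starting at offset @off. The
 * scatterlist entry which satisfied the previous read is cached in
 * error->fit so a sequential reader does not rescan the chain each time.
 */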
ssize_t i915_gpu_state_copy_to_buffer(struct i915_gpu_state *error,
				      char *buf, loff_t off, size_t rem)
{
	struct scatterlist *sg;
	size_t count;
	loff_t pos;
	int err;

	if (!error || !rem)
		return 0;

	err = err_print_to_sgl(error);
	if (err)
		return err;

	sg = READ_ONCE(error->fit);
	if (!sg || off < sg->dma_address)
		sg = error->sgl;
	if (!sg)
		return 0;

	pos = sg->dma_address;
	count = 0;
	do {
		size_t len, start;

		if (sg_is_chain(sg)) {
			sg = sg_chain_ptr(sg);
			GEM_BUG_ON(sg_is_chain(sg));
		}

		len = sg->length;
		if (pos + len <= off) {
			pos += len;
			continue;
		}

		start = sg->offset;
		if (pos < off) {
			GEM_BUG_ON(off - pos > len);
			len -= off - pos;
			start += off - pos;
			pos = off;
		}

		len = min(len, rem);
		GEM_BUG_ON(!len || len > sg->length);

		memcpy(buf, page_address(sg_page(sg)) + start, len);

		count += len;
		pos += len;

		buf += len;
		rem -= len;
		if (!rem) {
			WRITE_ONCE(error->fit, sg);
			break;
		}
	} while (!sg_is_last(sg++));

	return count;
}

static void i915_error_object_free(struct drm_i915_error_object *obj)
{
	int page;

	if (obj == NULL)
		return;

	for (page = 0; page < obj->page_count; page++)
		free_page((unsigned long)obj->pages[page]);

	kfree(obj);
}

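/*
 * The capture holds its own deep copy of i915_params (made with
 * i915_params_copy() at capture time), so any kstrdup'd "char *"
 * parameters must be freed with it. free_param() keys off the
 * stringified type name passed in by the I915_PARAMS_FOR_EACH()
 * x-macro; the __builtin_strcmp() folds to a compile-time constant.
 */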
static __always_inline void free_param(const char *type, void *x)
{
	if (!__builtin_strcmp(type, "char *"))
		kfree(*(void **)x);
}

static void cleanup_params(struct i915_gpu_state *error)
{
#define FREE(T, x, ...) free_param(#T, &error->params.x);
	I915_PARAMS_FOR_EACH(FREE);
#undef FREE
}

static void cleanup_uc_state(struct i915_gpu_state *error)
{
	struct i915_error_uc *error_uc = &error->uc;

	kfree(error_uc->guc_fw.path);
	kfree(error_uc->huc_fw.path);
	i915_error_object_free(error_uc->guc_log);
}

void __i915_gpu_state_free(struct kref *error_ref)
{
	struct i915_gpu_state *error =
		container_of(error_ref, typeof(*error), ref);
	long i, j;

	for (i = 0; i < ARRAY_SIZE(error->engine); i++) {
		struct drm_i915_error_engine *ee = &error->engine[i];

		for (j = 0; j < ee->user_bo_count; j++)
			i915_error_object_free(ee->user_bo[j]);
		kfree(ee->user_bo);

		i915_error_object_free(ee->batchbuffer);
		i915_error_object_free(ee->wa_batchbuffer);
		i915_error_object_free(ee->ringbuffer);
		i915_error_object_free(ee->hws_page);
		i915_error_object_free(ee->ctx);
		i915_error_object_free(ee->wa_ctx);

		kfree(ee->requests);
		if (!IS_ERR_OR_NULL(ee->waiters))
			kfree(ee->waiters);
	}

	for (i = 0; i < ARRAY_SIZE(error->active_bo); i++)
		kfree(error->active_bo[i]);
	kfree(error->pinned_bo);

	kfree(error->overlay);
	kfree(error->display);

	cleanup_params(error);
	cleanup_uc_state(error);

	err_free_sgl(error->sgl);
	kfree(error);
}

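/*
 * Object contents are read back through the reserved error-capture slot
 * in the GGTT: each backing page is bound into that slot in turn and
 * copied (or compressed) via a transient atomic WC mapping, avoiding any
 * requirement to map the whole object into the kernel address space.
 */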
static struct drm_i915_error_object *
i915_error_object_create(struct drm_i915_private *i915,
			 struct i915_vma *vma)
{
	struct i915_ggtt *ggtt = &i915->ggtt;
	const u64 slot = ggtt->error_capture.start;
	struct drm_i915_error_object *dst;
	struct compress compress;
	unsigned long num_pages;
	struct sgt_iter iter;
	dma_addr_t dma;
	int ret;

	if (!vma)
		return NULL;

	num_pages = min_t(u64, vma->size, vma->obj->base.size) >> PAGE_SHIFT;
	num_pages = DIV_ROUND_UP(10 * num_pages, 8); /* worstcase zlib growth */
	dst = kmalloc(sizeof(*dst) + num_pages * sizeof(u32 *),
		      GFP_ATOMIC | __GFP_NOWARN);
	if (!dst)
		return NULL;

	dst->gtt_offset = vma->node.start;
	dst->gtt_size = vma->node.size;
	dst->num_pages = num_pages;
	dst->page_count = 0;
	dst->unused = 0;

	if (!compress_init(&compress)) {
		kfree(dst);
		return NULL;
	}

	ret = -EINVAL;
	for_each_sgt_dma(dma, iter, vma->pages) {
		void __iomem *s;

		ggtt->vm.insert_page(&ggtt->vm, dma, slot, I915_CACHE_NONE, 0);

		s = io_mapping_map_atomic_wc(&ggtt->iomap, slot);
		ret = compress_page(&compress, (void __force *)s, dst);
		io_mapping_unmap_atomic(s);
		if (ret)
			break;
	}

	if (ret || compress_flush(&compress, dst)) {
		while (dst->page_count--)
			free_page((unsigned long)dst->pages[dst->page_count]);
		kfree(dst);
		dst = NULL;
	}

	compress_fini(&compress, dst);
	return dst;
}

/* The error capture is special as it tries to run underneath the normal
 * locking rules - so we use the raw version of the i915_gem_active lookup.
 */
static inline uint32_t
__active_get_seqno(struct i915_gem_active *active)
{
	struct i915_request *request;

	request = __i915_gem_active_peek(active);
	return request ? request->global_seqno : 0;
}

static inline int
__active_get_engine_id(struct i915_gem_active *active)
{
	struct i915_request *request;

	request = __i915_gem_active_peek(active);
	return request ? request->engine->id : -1;
}

static void capture_bo(struct drm_i915_error_buffer *err,
		       struct i915_vma *vma)
{
	struct drm_i915_gem_object *obj = vma->obj;

	err->size = obj->base.size;
	err->name = obj->base.name;

	err->wseqno = __active_get_seqno(&obj->frontbuffer_write);
	err->engine = __active_get_engine_id(&obj->frontbuffer_write);

	err->gtt_offset = vma->node.start;
	err->read_domains = obj->read_domains;
	err->write_domain = obj->write_domain;
	err->fence_reg = vma->fence ? vma->fence->id : -1;
	err->tiling = i915_gem_object_get_tiling(obj);
	err->dirty = obj->mm.dirty;
	err->purgeable = obj->mm.madv != I915_MADV_WILLNEED;
	err->userptr = obj->userptr.mm != NULL;
	err->cache_level = obj->cache_level;
}

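/*
 * Snapshot up to @count vmas from @head, optionally restricted to pinned
 * vmas, returning how many entries were actually filled in.
 */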
static u32 capture_error_bo(struct drm_i915_error_buffer *err,
			    int count, struct list_head *head,
			    bool pinned_only)
{
	struct i915_vma *vma;
	int i = 0;

	list_for_each_entry(vma, head, vm_link) {
		if (!vma->obj)
			continue;

		if (pinned_only && !i915_vma_is_pinned(vma))
			continue;

		capture_bo(err++, vma);
		if (++i == count)
			break;
	}

	return i;
}

/* Generate a semi-unique error code. The code is not meant to have meaning;
 * its only purpose is to try to prevent false duplicated bug reports by
 * grossly estimating a GPU error state.
 *
 * TODO Ideally, hashing the batchbuffer would be a very nice way to determine
 * the hang if we could strip the GTT offset information from it.
 *
 * It's only a small step better than a random number in its current form.
 */
static uint32_t i915_error_generate_code(struct drm_i915_private *dev_priv,
					 struct i915_gpu_state *error,
					 int *engine_id)
{
	uint32_t error_code = 0;
	int i;

	/* IPEHR would be an ideal way to detect errors, as it's the gross
	 * measure of "the command that hung." However, it has some very
	 * common synchronization commands which almost always appear in
	 * cases that are strictly a client bug. Use instdone to
	 * differentiate those somewhat.
	 */
	for (i = 0; i < I915_NUM_ENGINES; i++) {
		if (error->engine[i].hangcheck_stalled) {
			if (engine_id)
				*engine_id = i;

			return error->engine[i].ipehr ^
			       error->engine[i].instdone.instdone;
		}
	}

	return error_code;
}

53b725c7 1184static void gem_record_fences(struct i915_gpu_state *error)
84734a04 1185{
53b725c7 1186 struct drm_i915_private *dev_priv = error->i915;
84734a04
MK
1187 int i;
1188
5a4c6f1b 1189 if (INTEL_GEN(dev_priv) >= 6) {
ce38ab05 1190 for (i = 0; i < dev_priv->num_fence_regs; i++)
5a4c6f1b
CW
1191 error->fence[i] = I915_READ64(FENCE_REG_GEN6_LO(i));
1192 } else if (INTEL_GEN(dev_priv) >= 4) {
eecf613a
VS
1193 for (i = 0; i < dev_priv->num_fence_regs; i++)
1194 error->fence[i] = I915_READ64(FENCE_REG_965_LO(i));
5a4c6f1b 1195 } else {
eecf613a 1196 for (i = 0; i < dev_priv->num_fence_regs; i++)
5a4c6f1b 1197 error->fence[i] = I915_READ(FENCE_REG(i));
eecf613a 1198 }
5a4c6f1b 1199 error->nfence = i;
84734a04
MK
1200}
1201
6361f4ba
CW
1202static void gen6_record_semaphore_state(struct intel_engine_cs *engine,
1203 struct drm_i915_error_engine *ee)
87f85ebc 1204{
6361f4ba
CW
1205 struct drm_i915_private *dev_priv = engine->i915;
1206
1207 ee->semaphore_mboxes[0] = I915_READ(RING_SYNC_0(engine->mmio_base));
1208 ee->semaphore_mboxes[1] = I915_READ(RING_SYNC_1(engine->mmio_base));
85e17f59 1209 if (HAS_VEBOX(dev_priv))
6361f4ba 1210 ee->semaphore_mboxes[2] =
0bc40be8 1211 I915_READ(RING_SYNC_2(engine->mmio_base));
87f85ebc
BW
1212}
1213
6361f4ba
CW
1214static void error_record_engine_waiters(struct intel_engine_cs *engine,
1215 struct drm_i915_error_engine *ee)
688e6c72
CW
1216{
1217 struct intel_breadcrumbs *b = &engine->breadcrumbs;
1218 struct drm_i915_error_waiter *waiter;
1219 struct rb_node *rb;
1220 int count;
1221
6361f4ba
CW
1222 ee->num_waiters = 0;
1223 ee->waiters = NULL;
688e6c72 1224
19eb9189
CW
1225 if (RB_EMPTY_ROOT(&b->waiters))
1226 return;
1227
61d3dc70 1228 if (!spin_trylock_irq(&b->rb_lock)) {
19eb9189
CW
1229 ee->waiters = ERR_PTR(-EDEADLK);
1230 return;
1231 }
1232
688e6c72
CW
1233 count = 0;
1234 for (rb = rb_first(&b->waiters); rb != NULL; rb = rb_next(rb))
1235 count++;
61d3dc70 1236 spin_unlock_irq(&b->rb_lock);
688e6c72
CW
1237
1238 waiter = NULL;
1239 if (count)
1240 waiter = kmalloc_array(count,
1241 sizeof(struct drm_i915_error_waiter),
1242 GFP_ATOMIC);
1243 if (!waiter)
1244 return;
1245
61d3dc70 1246 if (!spin_trylock_irq(&b->rb_lock)) {
19eb9189
CW
1247 kfree(waiter);
1248 ee->waiters = ERR_PTR(-EDEADLK);
1249 return;
1250 }
688e6c72 1251
19eb9189 1252 ee->waiters = waiter;
688e6c72 1253 for (rb = rb_first(&b->waiters); rb; rb = rb_next(rb)) {
e27414a0 1254 struct intel_wait *w = rb_entry(rb, typeof(*w), node);
688e6c72
CW
1255
1256 strcpy(waiter->comm, w->tsk->comm);
1257 waiter->pid = w->tsk->pid;
1258 waiter->seqno = w->seqno;
1259 waiter++;
1260
6361f4ba 1261 if (++ee->num_waiters == count)
688e6c72
CW
1262 break;
1263 }
61d3dc70 1264 spin_unlock_irq(&b->rb_lock);
688e6c72
CW
1265}
1266
5a4c6f1b 1267static void error_record_engine_registers(struct i915_gpu_state *error,
6361f4ba
CW
1268 struct intel_engine_cs *engine,
1269 struct drm_i915_error_engine *ee)
84734a04 1270{
6361f4ba
CW
1271 struct drm_i915_private *dev_priv = engine->i915;
1272
c033666a 1273 if (INTEL_GEN(dev_priv) >= 6) {
6361f4ba 1274 ee->rc_psmi = I915_READ(RING_PSMI_CTL(engine->mmio_base));
b03ec3d6 1275 if (INTEL_GEN(dev_priv) >= 8) {
b03ec3d6
MT
1276 ee->fault_reg = I915_READ(GEN8_RING_FAULT_REG);
1277 } else {
6361f4ba 1278 gen6_record_semaphore_state(engine, ee);
b03ec3d6
MT
1279 ee->fault_reg = I915_READ(RING_FAULT_REG(engine));
1280 }
4e5aabfd
BW
1281 }
1282
c033666a 1283 if (INTEL_GEN(dev_priv) >= 4) {
6361f4ba
CW
1284 ee->faddr = I915_READ(RING_DMA_FADD(engine->mmio_base));
1285 ee->ipeir = I915_READ(RING_IPEIR(engine->mmio_base));
1286 ee->ipehr = I915_READ(RING_IPEHR(engine->mmio_base));
6361f4ba
CW
1287 ee->instps = I915_READ(RING_INSTPS(engine->mmio_base));
1288 ee->bbaddr = I915_READ(RING_BBADDR(engine->mmio_base));
c033666a 1289 if (INTEL_GEN(dev_priv) >= 8) {
6361f4ba
CW
1290 ee->faddr |= (u64) I915_READ(RING_DMA_FADD_UDW(engine->mmio_base)) << 32;
1291 ee->bbaddr |= (u64) I915_READ(RING_BBADDR_UDW(engine->mmio_base)) << 32;
13ffadd1 1292 }
6361f4ba 1293 ee->bbstate = I915_READ(RING_BBSTATE(engine->mmio_base));
84734a04 1294 } else {
6361f4ba
CW
1295 ee->faddr = I915_READ(DMA_FADD_I8XX);
1296 ee->ipeir = I915_READ(IPEIR);
1297 ee->ipehr = I915_READ(IPEHR);
84734a04
MK
1298 }
1299
0e704476 1300 intel_engine_get_instdone(engine, &ee->instdone);
d636951e 1301
6361f4ba
CW
1302 ee->waiting = intel_engine_has_waiter(engine);
1303 ee->instpm = I915_READ(RING_INSTPM(engine->mmio_base));
7e37f889 1304 ee->acthd = intel_engine_get_active_head(engine);
6361f4ba 1305 ee->seqno = intel_engine_get_seqno(engine);
cb399eab 1306 ee->last_seqno = intel_engine_last_submit(engine);
6361f4ba
CW
1307 ee->start = I915_READ_START(engine);
1308 ee->head = I915_READ_HEAD(engine);
1309 ee->tail = I915_READ_TAIL(engine);
1310 ee->ctl = I915_READ_CTL(engine);
21a2c58a
CW
1311 if (INTEL_GEN(dev_priv) > 2)
1312 ee->mode = I915_READ_MODE(engine);
84734a04 1313
3177659a 1314 if (!HWS_NEEDS_PHYSICAL(dev_priv)) {
f0f59a00 1315 i915_reg_t mmio;
f3ce3821 1316
cf819eff 1317 if (IS_GEN(dev_priv, 7)) {
0bc40be8 1318 switch (engine->id) {
f3ce3821
CW
1319 default:
1320 case RCS:
1321 mmio = RENDER_HWS_PGA_GEN7;
1322 break;
1323 case BCS:
1324 mmio = BLT_HWS_PGA_GEN7;
1325 break;
1326 case VCS:
1327 mmio = BSD_HWS_PGA_GEN7;
1328 break;
1329 case VECS:
1330 mmio = VEBOX_HWS_PGA_GEN7;
1331 break;
1332 }
cf819eff 1333 } else if (IS_GEN(engine->i915, 6)) {
0bc40be8 1334 mmio = RING_HWS_PGA_GEN6(engine->mmio_base);
f3ce3821
CW
1335 } else {
1336 /* XXX: gen8 returns to sanity */
0bc40be8 1337 mmio = RING_HWS_PGA(engine->mmio_base);
f3ce3821
CW
1338 }
1339
6361f4ba 1340 ee->hws = I915_READ(mmio);
f3ce3821
CW
1341 }
1342
398c8a30 1343 ee->idle = intel_engine_is_idle(engine);
3fe3b030 1344 ee->hangcheck_timestamp = engine->hangcheck.action_timestamp;
6361f4ba 1345 ee->hangcheck_action = engine->hangcheck.action;
3fe3b030 1346 ee->hangcheck_stalled = engine->hangcheck.stalled;
702c8f8e
MT
1347 ee->reset_count = i915_reset_engine_count(&dev_priv->gpu_error,
1348 engine);
6c7a01ec 1349
4bdafb9d 1350 if (HAS_PPGTT(dev_priv)) {
6c7a01ec
BW
1351 int i;
1352
6361f4ba 1353 ee->vm_info.gfx_mode = I915_READ(RING_MODE_GEN7(engine));
6c7a01ec 1354
cf819eff 1355 if (IS_GEN(dev_priv, 6))
6361f4ba 1356 ee->vm_info.pp_dir_base =
0bc40be8 1357 I915_READ(RING_PP_DIR_BASE_READ(engine));
cf819eff 1358 else if (IS_GEN(dev_priv, 7))
6361f4ba 1359 ee->vm_info.pp_dir_base =
0bc40be8 1360 I915_READ(RING_PP_DIR_BASE(engine));
c033666a 1361 else if (INTEL_GEN(dev_priv) >= 8)
6c7a01ec 1362 for (i = 0; i < 4; i++) {
6361f4ba 1363 ee->vm_info.pdp[i] =
0bc40be8 1364 I915_READ(GEN8_RING_PDP_UDW(engine, i));
6361f4ba
CW
1365 ee->vm_info.pdp[i] <<= 32;
1366 ee->vm_info.pdp[i] |=
0bc40be8 1367 I915_READ(GEN8_RING_PDP_LDW(engine, i));
6c7a01ec 1368 }
6c7a01ec 1369 }
84734a04
MK
1370}
1371
e61e0f51 1372static void record_request(struct i915_request *request,
35ca039e
CW
1373 struct drm_i915_error_request *erq)
1374{
4e0d64db
CW
1375 struct i915_gem_context *ctx = request->gem_context;
1376
1377 erq->context = ctx->hw_id;
b7268c5e 1378 erq->sched_attr = request->sched.attr;
4e0d64db 1379 erq->ban_score = atomic_read(&ctx->ban_score);
65e4760e 1380 erq->seqno = request->global_seqno;
35ca039e 1381 erq->jiffies = request->emitted_jiffies;
3a068721 1382 erq->start = i915_ggtt_offset(request->ring->vma);
35ca039e
CW
1383 erq->head = request->head;
1384 erq->tail = request->tail;
1385
1386 rcu_read_lock();
4e0d64db 1387 erq->pid = ctx->pid ? pid_nr(ctx->pid) : 0;
35ca039e
CW
1388 rcu_read_unlock();
1389}
1390
57bc699d 1391static void engine_record_requests(struct intel_engine_cs *engine,
e61e0f51 1392 struct i915_request *first,
57bc699d
CW
1393 struct drm_i915_error_engine *ee)
1394{
e61e0f51 1395 struct i915_request *request;
57bc699d
CW
1396 int count;
1397
1398 count = 0;
1399 request = first;
a89d1f92 1400 list_for_each_entry_from(request, &engine->timeline.requests, link)
57bc699d
CW
1401 count++;
1402 if (!count)
1403 return;
1404
1405 ee->requests = kcalloc(count, sizeof(*ee->requests), GFP_ATOMIC);
1406 if (!ee->requests)
1407 return;
1408
1409 ee->num_requests = count;
1410
1411 count = 0;
1412 request = first;
a89d1f92 1413 list_for_each_entry_from(request, &engine->timeline.requests, link) {
57bc699d
CW
1414 if (count >= ee->num_requests) {
1415 /*
1416 * If the ring request list was changed in
1417 * between the point where the error request
1418 * list was created and dimensioned and this
1419 * point then just exit early to avoid crashes.
1420 *
1421 * We don't need to communicate that the
1422 * request list changed state during error
1423 * state capture and that the error state is
1424 * slightly incorrect as a consequence since we
1425 * are typically only interested in the request
1426 * list state at the point of error state
1427 * capture, not in any changes happening during
1428 * the capture.
1429 */
1430 break;
1431 }
1432
35ca039e 1433 record_request(request, &ee->requests[count++]);
57bc699d
CW
1434 }
1435 ee->num_requests = count;
1436}
1437
35ca039e
CW
1438static void error_record_engine_execlists(struct intel_engine_cs *engine,
1439 struct drm_i915_error_engine *ee)
1440{
76e70087 1441 const struct intel_engine_execlists * const execlists = &engine->execlists;
35ca039e
CW
1442 unsigned int n;
1443
76e70087 1444 for (n = 0; n < execlists_num_ports(execlists); n++) {
e61e0f51 1445 struct i915_request *rq = port_request(&execlists->port[n]);
77f0d0e9
CW
1446
1447 if (!rq)
1448 break;
1449
1450 record_request(rq, &ee->execlist[n]);
1451 }
76e70087
MK
1452
1453 ee->num_ports = n;
35ca039e
CW
1454}
1455
4fa6053e
CW
1456static void record_context(struct drm_i915_error_context *e,
1457 struct i915_gem_context *ctx)
1458{
1459 if (ctx->pid) {
1460 struct task_struct *task;
1461
1462 rcu_read_lock();
1463 task = pid_task(ctx->pid, PIDTYPE_PID);
1464 if (task) {
1465 strcpy(e->comm, task->comm);
1466 e->pid = task->pid;
1467 }
1468 rcu_read_unlock();
1469 }
1470
1471 e->handle = ctx->user_handle;
1472 e->hw_id = ctx->hw_id;
b7268c5e 1473 e->sched_attr = ctx->sched;
77b25a97 1474 e->ban_score = atomic_read(&ctx->ban_score);
302e55d7 1475 e->bannable = i915_gem_context_is_bannable(ctx);
77b25a97
CW
1476 e->guilty = atomic_read(&ctx->guilty_count);
1477 e->active = atomic_read(&ctx->active_count);
4fa6053e
CW
1478}
1479
e61e0f51 1480static void request_record_user_bo(struct i915_request *request,
b0fd47ad
CW
1481 struct drm_i915_error_engine *ee)
1482{
e61e0f51 1483 struct i915_capture_list *c;
b0fd47ad 1484 struct drm_i915_error_object **bo;
8e3ffa8d 1485 long count, max;
b0fd47ad 1486
8e3ffa8d 1487 max = 0;
b0fd47ad 1488 for (c = request->capture_list; c; c = c->next)
8e3ffa8d
CW
1489 max++;
1490 if (!max)
1491 return;
b0fd47ad 1492
8e3ffa8d
CW
1493 bo = kmalloc_array(max, sizeof(*bo), GFP_ATOMIC);
1494 if (!bo) {
1495 /* If we can't capture everything, try to capture something. */
1496 max = min_t(long, max, PAGE_SIZE / sizeof(*bo));
1497 bo = kmalloc_array(max, sizeof(*bo), GFP_ATOMIC);
1498 }
b0fd47ad
CW
1499 if (!bo)
1500 return;
1501
1502 count = 0;
1503 for (c = request->capture_list; c; c = c->next) {
1504 bo[count] = i915_error_object_create(request->i915, c->vma);
1505 if (!bo[count])
1506 break;
8e3ffa8d
CW
1507 if (++count == max)
1508 break;
b0fd47ad
CW
1509 }
1510
1511 ee->user_bo = bo;
1512 ee->user_bo_count = count;
1513}
1514
4e90a6e2
CW
1515static struct drm_i915_error_object *
1516capture_object(struct drm_i915_private *dev_priv,
1517 struct drm_i915_gem_object *obj)
1518{
1519 if (obj && i915_gem_object_has_pages(obj)) {
1520 struct i915_vma fake = {
1521 .node = { .start = U64_MAX, .size = obj->base.size },
b5e0a941 1522 .size = obj->base.size,
4e90a6e2
CW
1523 .pages = obj->mm.pages,
1524 .obj = obj,
1525 };
1526
1527 return i915_error_object_create(dev_priv, &fake);
1528 } else {
1529 return NULL;
1530 }
1531}
1532
53b725c7 1533static void gem_record_rings(struct i915_gpu_state *error)
84734a04 1534{
53b725c7
DCS
1535 struct drm_i915_private *i915 = error->i915;
1536 struct i915_ggtt *ggtt = &i915->ggtt;
57bc699d 1537 int i;
84734a04 1538
666796da 1539 for (i = 0; i < I915_NUM_ENGINES; i++) {
53b725c7 1540 struct intel_engine_cs *engine = i915->engine[i];
6361f4ba 1541 struct drm_i915_error_engine *ee = &error->engine[i];
e61e0f51 1542 struct i915_request *request;
372fbb8e 1543
6361f4ba 1544 ee->engine_id = -1;
eee73b46 1545
3b3f1650 1546 if (!engine)
372fbb8e
CW
1547 continue;
1548
6361f4ba 1549 ee->engine_id = i;
372fbb8e 1550
6361f4ba
CW
1551 error_record_engine_registers(error, engine, ee);
1552 error_record_engine_waiters(engine, ee);
35ca039e 1553 error_record_engine_execlists(engine, ee);
84734a04 1554
e2f80391 1555 request = i915_gem_find_active_request(engine);
ab0e7ff9 1556 if (request) {
4e0d64db 1557 struct i915_gem_context *ctx = request->gem_context;
7e37f889 1558 struct intel_ring *ring;
ae6c4806 1559
82ad6443 1560 ee->vm = ctx->ppgtt ? &ctx->ppgtt->vm : &ggtt->vm;
ae6c4806 1561
4e0d64db 1562 record_context(&ee->context, ctx);
4fa6053e 1563
ab0e7ff9
CW
1564 /* We need to copy these to an anonymous buffer
1565 * as the simplest method to avoid being overwritten
1566 * by userspace.
1567 */
6361f4ba 1568 ee->batchbuffer =
53b725c7 1569 i915_error_object_create(i915, request->batch);
ab0e7ff9 1570
53b725c7 1571 if (HAS_BROKEN_CS_TLB(i915))
6361f4ba 1572 ee->wa_batchbuffer =
53b725c7 1573 i915_error_object_create(i915,
51797499 1574 i915->gt.scratch);
b0fd47ad 1575 request_record_user_bo(request, ee);
ab0e7ff9 1576
058d88c4 1577 ee->ctx =
53b725c7 1578 i915_error_object_create(i915,
1fc44d9b 1579 request->hw_context->state);
546b1b6a 1580
bc3d6744 1581 error->simulated |=
4e0d64db 1582 i915_gem_context_no_error_capture(ctx);
bc3d6744 1583
cdb324bd
CW
1584 ee->rq_head = request->head;
1585 ee->rq_post = request->postfix;
1586 ee->rq_tail = request->tail;
1587
1dae2dfb
CW
1588 ring = request->ring;
1589 ee->cpu_ring_head = ring->head;
1590 ee->cpu_ring_tail = ring->tail;
6361f4ba 1591 ee->ringbuffer =
53b725c7 1592 i915_error_object_create(i915, ring->vma);
57bc699d
CW
1593
1594 engine_record_requests(engine, request, ee);
ba6e0418 1595 }
84734a04 1596
6361f4ba 1597 ee->hws_page =
53b725c7 1598 i915_error_object_create(i915,
058d88c4 1599 engine->status_page.vma);
84734a04 1600
53b725c7 1601 ee->wa_ctx = i915_error_object_create(i915, engine->wa_ctx.vma);
4e90a6e2 1602
53b725c7 1603 ee->default_state = capture_object(i915, engine->default_state);
84734a04
MK
1604 }
1605}
1606
53b725c7
DCS
1607static void gem_capture_vm(struct i915_gpu_state *error,
1608 struct i915_address_space *vm,
1609 int idx)
84734a04 1610{
c0ce4663 1611 struct drm_i915_error_buffer *active_bo;
95f5301d 1612 struct i915_vma *vma;
c0ce4663 1613 int count;
84734a04 1614
c0ce4663 1615 count = 0;
1c7f4bca 1616 list_for_each_entry(vma, &vm->active_list, vm_link)
c0ce4663 1617 count++;
84734a04 1618
c0ce4663
CW
1619 active_bo = NULL;
1620 if (count)
1621 active_bo = kcalloc(count, sizeof(*active_bo), GFP_ATOMIC);
95f5301d 1622 if (active_bo)
c0ce4663
CW
1623 count = capture_error_bo(active_bo, count, &vm->active_list, false);
1624 else
1625 count = 0;
1626
1627 error->active_vm[idx] = vm;
1628 error->active_bo[idx] = active_bo;
1629 error->active_bo_count[idx] = count;
95f5301d
BW
1630}
1631
53b725c7 1632static void capture_active_buffers(struct i915_gpu_state *error)
95f5301d 1633{
c0ce4663
CW
1634 int cnt = 0, i, j;
1635
1636 BUILD_BUG_ON(ARRAY_SIZE(error->engine) > ARRAY_SIZE(error->active_bo));
1637 BUILD_BUG_ON(ARRAY_SIZE(error->active_bo) != ARRAY_SIZE(error->active_vm));
1638 BUILD_BUG_ON(ARRAY_SIZE(error->active_bo) != ARRAY_SIZE(error->active_bo_count));
1639
1640 /* Scan each engine looking for unique active contexts/vm */
1641 for (i = 0; i < ARRAY_SIZE(error->engine); i++) {
1642 struct drm_i915_error_engine *ee = &error->engine[i];
1643 bool found;
1644
1645 if (!ee->vm)
1646 continue;
3a448734 1647
c0ce4663
CW
1648 found = false;
1649 for (j = 0; j < i && !found; j++)
1650 found = error->engine[j].vm == ee->vm;
1651 if (!found)
53b725c7 1652 gem_capture_vm(error, ee->vm, cnt++);
3a448734 1653 }
84734a04
MK
1654}
1655
53b725c7 1656static void capture_pinned_buffers(struct i915_gpu_state *error)
c0ce4663 1657{
82ad6443 1658 struct i915_address_space *vm = &error->i915->ggtt.vm;
c0ce4663
CW
1659 struct drm_i915_error_buffer *bo;
1660 struct i915_vma *vma;
1661 int count_inactive, count_active;
1662
1663 count_inactive = 0;
cd68e04c 1664 list_for_each_entry(vma, &vm->inactive_list, vm_link)
c0ce4663
CW
1665 count_inactive++;
1666
1667 count_active = 0;
cd68e04c 1668 list_for_each_entry(vma, &vm->active_list, vm_link)
c0ce4663
CW
1669 count_active++;
1670
1671 bo = NULL;
1672 if (count_inactive + count_active)
1673 bo = kcalloc(count_inactive + count_active,
1674 sizeof(*bo), GFP_ATOMIC);
1675 if (!bo)
1676 return;
1677
1678 count_inactive = capture_error_bo(bo, count_inactive,
1679 &vm->active_list, true);
1680 count_active = capture_error_bo(bo + count_inactive, count_active,
1681 &vm->inactive_list, true);
1682 error->pinned_bo_count = count_inactive + count_active;
1683 error->pinned_bo = bo;
1684}
1685
7d41ef34
MW
1686static void capture_uc_state(struct i915_gpu_state *error)
1687{
1688 struct drm_i915_private *i915 = error->i915;
1689 struct i915_error_uc *error_uc = &error->uc;
1690
1691 /* Capturing uC state won't be useful if there is no GuC */
1692 if (!error->device_info.has_guc)
1693 return;
1694
1695 error_uc->guc_fw = i915->guc.fw;
1696 error_uc->huc_fw = i915->huc.fw;
1697
1698 /* Non-default firmware paths will be specified by the modparam.
1699 * As modparams are generally accesible from the userspace make
1700 * explicit copies of the firmware paths.
1701 */
1702 error_uc->guc_fw.path = kstrdup(i915->guc.fw.path, GFP_ATOMIC);
1703 error_uc->huc_fw.path = kstrdup(i915->huc.fw.path, GFP_ATOMIC);
0397ac13 1704 error_uc->guc_log = i915_error_object_create(i915, i915->guc.log.vma);
27b85bea
AG
1705}
1706
/* Capture all registers which don't fit into another category. */
static void capture_reg_state(struct i915_gpu_state *error)
{
	struct drm_i915_private *dev_priv = error->i915;
	int i;

	/*
	 * General organization:
	 * 1. Registers specific to a single generation
	 * 2. Registers which belong to multiple generations
	 * 3. Feature specific registers
	 * 4. Everything else
	 * Please try to follow the order.
	 */

	/* 1: Registers specific to a single generation */
	if (IS_VALLEYVIEW(dev_priv)) {
		error->gtier[0] = I915_READ(GTIER);
		error->ier = I915_READ(VLV_IER);
		error->forcewake = I915_READ_FW(FORCEWAKE_VLV);
	}

	if (IS_GEN(dev_priv, 7))
		error->err_int = I915_READ(GEN7_ERR_INT);

	if (INTEL_GEN(dev_priv) >= 8) {
		error->fault_data0 = I915_READ(GEN8_FAULT_TLB_DATA0);
		error->fault_data1 = I915_READ(GEN8_FAULT_TLB_DATA1);
	}

	if (IS_GEN(dev_priv, 6)) {
		error->forcewake = I915_READ_FW(FORCEWAKE);
		error->gab_ctl = I915_READ(GAB_CTL);
		error->gfx_mode = I915_READ(GFX_MODE);
	}

	/* 2: Registers which belong to multiple generations */
	if (INTEL_GEN(dev_priv) >= 7)
		error->forcewake = I915_READ_FW(FORCEWAKE_MT);

	if (INTEL_GEN(dev_priv) >= 6) {
		error->derrmr = I915_READ(DERRMR);
		error->error = I915_READ(ERROR_GEN6);
		error->done_reg = I915_READ(DONE_REG);
	}

	if (INTEL_GEN(dev_priv) >= 5)
		error->ccid = I915_READ(CCID);

	/* 3: Feature specific registers */
	if (IS_GEN_RANGE(dev_priv, 6, 7)) {
		error->gam_ecochk = I915_READ(GAM_ECOCHK);
		error->gac_eco = I915_READ(GAC_ECO_BITS);
	}

	/* 4: Everything else */
	if (INTEL_GEN(dev_priv) >= 11) {
		error->ier = I915_READ(GEN8_DE_MISC_IER);
		error->gtier[0] = I915_READ(GEN11_RENDER_COPY_INTR_ENABLE);
		error->gtier[1] = I915_READ(GEN11_VCS_VECS_INTR_ENABLE);
		error->gtier[2] = I915_READ(GEN11_GUC_SG_INTR_ENABLE);
		error->gtier[3] = I915_READ(GEN11_GPM_WGBOXPERF_INTR_ENABLE);
		error->gtier[4] = I915_READ(GEN11_CRYPTO_RSVD_INTR_ENABLE);
		error->gtier[5] = I915_READ(GEN11_GUNIT_CSME_INTR_ENABLE);
		error->ngtier = 6;
	} else if (INTEL_GEN(dev_priv) >= 8) {
		error->ier = I915_READ(GEN8_DE_MISC_IER);
		for (i = 0; i < 4; i++)
			error->gtier[i] = I915_READ(GEN8_GT_IER(i));
		error->ngtier = 4;
	} else if (HAS_PCH_SPLIT(dev_priv)) {
		error->ier = I915_READ(DEIER);
		error->gtier[0] = I915_READ(GTIER);
		error->ngtier = 1;
	} else if (IS_GEN(dev_priv, 2)) {
		error->ier = I915_READ16(IER);
	} else if (!IS_VALLEYVIEW(dev_priv)) {
		error->ier = I915_READ(IER);
	}
	error->eir = I915_READ(EIR);
	error->pgtbl_er = I915_READ(PGTBL_ER);
}

static void i915_error_capture_msg(struct drm_i915_private *dev_priv,
				   struct i915_gpu_state *error,
				   u32 engine_mask,
				   const char *error_msg)
{
	u32 ecode;
	int engine_id = -1, len;

	ecode = i915_error_generate_code(dev_priv, error, &engine_id);

	len = scnprintf(error->error_msg, sizeof(error->error_msg),
			"GPU HANG: ecode %d:%d:0x%08x",
			INTEL_GEN(dev_priv), engine_id, ecode);

	if (engine_id != -1 && error->engine[engine_id].context.pid)
		len += scnprintf(error->error_msg + len,
				 sizeof(error->error_msg) - len,
				 ", in %s [%d]",
				 error->engine[engine_id].context.comm,
				 error->engine[engine_id].context.pid);

	scnprintf(error->error_msg + len, sizeof(error->error_msg) - len,
		  ", reason: %s, action: %s",
		  error_msg,
		  engine_mask ? "reset" : "continue");
}

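/*
 * With illustrative values, the header built above might read as follows
 * (the names and numbers are hypothetical; only the format is taken from
 * the scnprintf() calls above):
 *
 *	GPU HANG: ecode 9:0:0xfedcba98, in gnome-shell [1234],
 *	reason: hang on rcs0, action: reset
 */
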
static void capture_gen_state(struct i915_gpu_state *error)
{
	struct drm_i915_private *i915 = error->i915;

	error->awake = i915->gt.awake;
	error->wakelock = atomic_read(&i915->runtime_pm.wakeref_count);
	error->suspended = i915->runtime_pm.suspended;

	error->iommu = -1;
#ifdef CONFIG_INTEL_IOMMU
	error->iommu = intel_iommu_gfx_mapped;
#endif
	error->reset_count = i915_reset_count(&i915->gpu_error);
	error->suspend_count = i915->suspend_count;

	memcpy(&error->device_info,
	       INTEL_INFO(i915),
	       sizeof(error->device_info));
	error->driver_caps = i915->caps;
}

static void capture_params(struct i915_gpu_state *error)
{
	i915_params_copy(&error->params, &i915_modparams);
}

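/*
 * A minimal sketch of what the i915_params_copy() helper used above is
 * assumed to do (it lives in i915_params.c, not this file): copy the
 * struct wholesale, then kstrdup() every string (charp) member so the
 * error state owns its copies even if userspace later rewrites the
 * modparams. The member names below are illustrative only.
 *
 *	void i915_params_copy(struct i915_params *dest,
 *			      const struct i915_params *src)
 *	{
 *		*dest = *src;
 *		dest->guc_firmware_path =
 *			kstrdup(src->guc_firmware_path, GFP_ATOMIC);
 *		dest->huc_firmware_path =
 *			kstrdup(src->huc_firmware_path, GFP_ATOMIC);
 *	}
 */
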
static unsigned long capture_find_epoch(const struct i915_gpu_state *error)
{
	unsigned long epoch = error->capture;
	int i;

	for (i = 0; i < ARRAY_SIZE(error->engine); i++) {
		const struct drm_i915_error_engine *ee = &error->engine[i];

		if (ee->hangcheck_stalled &&
		    time_before(ee->hangcheck_timestamp, epoch))
			epoch = ee->hangcheck_timestamp;
	}

	return epoch;
}

static void capture_finish(struct i915_gpu_state *error)
{
	struct i915_ggtt *ggtt = &error->i915->ggtt;
	const u64 slot = ggtt->error_capture.start;

	ggtt->vm.clear_range(&ggtt->vm, slot, PAGE_SIZE);
}

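/*
 * capture() runs via stop_machine() (see i915_capture_gpu_state() below),
 * i.e. with every other CPU halted and interrupts disabled, so nothing in
 * the capture path may sleep; this is why the allocations here and in the
 * helpers above use GFP_ATOMIC.
 */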
static int capture(void *data)
{
	struct i915_gpu_state *error = data;

	error->time = ktime_get_real();
	error->boottime = ktime_get_boottime();
	error->uptime = ktime_sub(ktime_get(),
				  error->i915->gt.last_init_time);
	error->capture = jiffies;

	capture_params(error);
	capture_gen_state(error);
	capture_uc_state(error);
	capture_reg_state(error);
	gem_record_fences(error);
	gem_record_rings(error);
	capture_active_buffers(error);
	capture_pinned_buffers(error);

	error->overlay = intel_overlay_capture_error_state(error->i915);
	error->display = intel_display_capture_error_state(error->i915);

	error->epoch = capture_find_epoch(error);

	capture_finish(error);
	return 0;
}

#define DAY_AS_SECONDS(x) (24 * 60 * 60 * (x))

struct i915_gpu_state *
i915_capture_gpu_state(struct drm_i915_private *i915)
{
	struct i915_gpu_state *error;

	/* Check if GPU capture has been disabled */
	error = READ_ONCE(i915->gpu_error.first_error);
	if (IS_ERR(error))
		return error;

	error = kzalloc(sizeof(*error), GFP_ATOMIC);
	if (!error) {
		i915_disable_error_state(i915, -ENOMEM);
		return ERR_PTR(-ENOMEM);
	}

	kref_init(&error->ref);
	error->i915 = i915;

	stop_machine(capture, error, NULL);

	return error;
}

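/*
 * A usage sketch for callers (e.g. a debugfs handler; the caller code here
 * is assumed, not taken from this file): the returned state is refcounted,
 * so it must be released with i915_gpu_state_put() once the caller is done
 * with it.
 *
 *	struct i915_gpu_state *gpu;
 *
 *	gpu = i915_capture_gpu_state(i915);
 *	if (IS_ERR(gpu))
 *		return PTR_ERR(gpu);
 *
 *	... dump or inspect *gpu ...
 *
 *	i915_gpu_state_put(gpu);
 */
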
/**
 * i915_capture_error_state - capture an error record for later analysis
 * @i915: i915 device
 * @engine_mask: the mask of engines triggering the hang
 * @error_msg: a message to insert into the error capture header
 *
 * Should be called when an error is detected (either a hang or an error
 * interrupt) to capture error state from the time of the error. Fills
 * out a structure which becomes available in debugfs for user level tools
 * to pick up.
 */
void i915_capture_error_state(struct drm_i915_private *i915,
			      u32 engine_mask,
			      const char *error_msg)
{
	static bool warned;
	struct i915_gpu_state *error;
	unsigned long flags;

	if (!i915_modparams.error_capture)
		return;

	if (READ_ONCE(i915->gpu_error.first_error))
		return;

	error = i915_capture_gpu_state(i915);
	if (IS_ERR(error))
		return;

	i915_error_capture_msg(i915, error, engine_mask, error_msg);
	DRM_INFO("%s\n", error->error_msg);

	if (!error->simulated) {
		spin_lock_irqsave(&i915->gpu_error.lock, flags);
		if (!i915->gpu_error.first_error) {
			i915->gpu_error.first_error = error;
			error = NULL;
		}
		spin_unlock_irqrestore(&i915->gpu_error.lock, flags);
	}

	if (error) {
		__i915_gpu_state_free(&error->ref);
		return;
	}

	if (!warned &&
	    ktime_get_real_seconds() - DRIVER_TIMESTAMP < DAY_AS_SECONDS(180)) {
		DRM_INFO("GPU hangs can indicate a bug anywhere in the entire gfx stack, including userspace.\n");
		DRM_INFO("Please file a _new_ bug report on bugs.freedesktop.org against DRI -> DRM/Intel\n");
		DRM_INFO("drm/i915 developers can then reassign to the right component if it's not a kernel issue.\n");
		DRM_INFO("The gpu crash dump is required to analyze gpu hangs, so please always attach it.\n");
		DRM_INFO("GPU crash dump saved to /sys/class/drm/card%d/error\n",
			 i915->drm.primary->index);
		warned = true;
	}
}

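/*
 * Illustrative only (not part of this file): a userspace tool would fetch
 * the crash dump advertised above by reading the sysfs node, e.g.:
 *
 *	FILE *f = fopen("/sys/class/drm/card0/error", "r");
 *	char line[512];
 *
 *	if (f) {
 *		while (fgets(line, sizeof(line), f))
 *			fputs(line, stdout);
 *		fclose(f);
 *	}
 */
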
struct i915_gpu_state *
i915_first_error_state(struct drm_i915_private *i915)
{
	struct i915_gpu_state *error;

	spin_lock_irq(&i915->gpu_error.lock);
	error = i915->gpu_error.first_error;
	if (!IS_ERR_OR_NULL(error))
		i915_gpu_state_get(error);
	spin_unlock_irq(&i915->gpu_error.lock);

	return error;
}

void i915_reset_error_state(struct drm_i915_private *i915)
{
	struct i915_gpu_state *error;

	spin_lock_irq(&i915->gpu_error.lock);
	error = i915->gpu_error.first_error;
	if (error != ERR_PTR(-ENODEV)) /* if disabled, always disabled */
		i915->gpu_error.first_error = NULL;
	spin_unlock_irq(&i915->gpu_error.lock);

	if (!IS_ERR_OR_NULL(error))
		i915_gpu_state_put(error);
}

void i915_disable_error_state(struct drm_i915_private *i915, int err)
{
	spin_lock_irq(&i915->gpu_error.lock);
	if (!i915->gpu_error.first_error)
		i915->gpu_error.first_error = ERR_PTR(err);
	spin_unlock_irq(&i915->gpu_error.lock);
}