Merge tag 'nf-23-09-13' of git://git.kernel.org/pub/scm/linux/kernel/git/netfilter/nf
[linux-block.git] / drivers / gpu / drm / i915 / i915_gpu_error.c
CommitLineData
84734a04
MK
1/*
2 * Copyright (c) 2008 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 * Authors:
24 * Eric Anholt <eric@anholt.net>
25 * Keith Packard <keithp@keithp.com>
26 * Mika Kuoppala <mika.kuoppala@intel.com>
27 *
28 */
29
0e39037b 30#include <linux/ascii85.h>
e9b67ec2 31#include <linux/highmem.h>
0e39037b 32#include <linux/nmi.h>
3bdd4f84 33#include <linux/pagevec.h>
0e39037b 34#include <linux/scatterlist.h>
01fabda8 35#include <linux/string_helpers.h>
0e39037b 36#include <linux/utsname.h>
0a97015d 37#include <linux/zlib.h>
0e39037b 38
5f2ec909 39#include <drm/drm_cache.h>
7d41ef34
MW
40#include <drm/drm_print.h>
41
32f9402d 42#include "display/intel_dmc.h"
df0566a6
JN
43#include "display/intel_overlay.h"
44
10be98a7 45#include "gem/i915_gem_context.h"
895d8ebe 46#include "gem/i915_gem_lmem.h"
202b1f4c 47#include "gt/intel_engine_regs.h"
792592e7 48#include "gt/intel_gt.h"
9a92732f 49#include "gt/intel_gt_mcr.h"
742379c0 50#include "gt/intel_gt_pm.h"
0d6419e9 51#include "gt/intel_gt_regs.h"
a6f0f9cf 52#include "gt/uc/intel_guc_capture.h"
10be98a7 53
24524e3f 54#include "i915_driver.h"
84734a04 55#include "i915_drv.h"
05ca9306 56#include "i915_gpu_error.h"
9c9082b9 57#include "i915_memcpy.h"
801543b2 58#include "i915_reg.h"
37d63f8f 59#include "i915_scatterlist.h"
a7f46d5b 60#include "i915_utils.h"
84734a04 61
8b91cdd4 62#define ALLOW_FAIL (__GFP_KSWAPD_RECLAIM | __GFP_RETRY_MAYFAIL | __GFP_NOWARN)
3bdd4f84
CW
63#define ATOMIC_MAYFAIL (GFP_ATOMIC | __GFP_NOWARN)
64
0e39037b
CW
65static void __sg_set_buf(struct scatterlist *sg,
66 void *addr, unsigned int len, loff_t it)
84734a04 67{
0e39037b
CW
68 sg->page_link = (unsigned long)virt_to_page(addr);
69 sg->offset = offset_in_page(addr);
70 sg->length = len;
71 sg->dma_address = it;
84734a04
MK
72}
73
0e39037b 74static bool __i915_error_grow(struct drm_i915_error_state_buf *e, size_t len)
84734a04 75{
0e39037b 76 if (!len)
84734a04 77 return false;
84734a04 78
0e39037b
CW
79 if (e->bytes + len + 1 <= e->size)
80 return true;
81
82 if (e->bytes) {
83 __sg_set_buf(e->cur++, e->buf, e->bytes, e->iter);
84 e->iter += e->bytes;
85 e->buf = NULL;
86 e->bytes = 0;
84734a04
MK
87 }
88
0e39037b
CW
89 if (e->cur == e->end) {
90 struct scatterlist *sgl;
84734a04 91
3bdd4f84 92 sgl = (typeof(sgl))__get_free_page(ALLOW_FAIL);
0e39037b
CW
93 if (!sgl) {
94 e->err = -ENOMEM;
95 return false;
96 }
84734a04 97
0e39037b
CW
98 if (e->cur) {
99 e->cur->offset = 0;
100 e->cur->length = 0;
101 e->cur->page_link =
102 (unsigned long)sgl | SG_CHAIN;
103 } else {
104 e->sgl = sgl;
84734a04
MK
105 }
106
0e39037b
CW
107 e->cur = sgl;
108 e->end = sgl + SG_MAX_SINGLE_ALLOC - 1;
84734a04
MK
109 }
110
0e39037b 111 e->size = ALIGN(len + 1, SZ_64K);
3bdd4f84 112 e->buf = kmalloc(e->size, ALLOW_FAIL);
0e39037b
CW
113 if (!e->buf) {
114 e->size = PAGE_ALIGN(len + 1);
115 e->buf = kmalloc(e->size, GFP_KERNEL);
116 }
117 if (!e->buf) {
118 e->err = -ENOMEM;
119 return false;
120 }
121
122 return true;
84734a04
MK
123}
124
dda35931 125__printf(2, 0)
84734a04 126static void i915_error_vprintf(struct drm_i915_error_state_buf *e,
0e39037b 127 const char *fmt, va_list args)
84734a04 128{
0e39037b
CW
129 va_list ap;
130 int len;
84734a04 131
0e39037b 132 if (e->err)
84734a04
MK
133 return;
134
0e39037b
CW
135 va_copy(ap, args);
136 len = vsnprintf(NULL, 0, fmt, ap);
137 va_end(ap);
138 if (len <= 0) {
139 e->err = len;
140 return;
84734a04
MK
141 }
142
0e39037b
CW
143 if (!__i915_error_grow(e, len))
144 return;
84734a04 145
0e39037b
CW
146 GEM_BUG_ON(e->bytes >= e->size);
147 len = vscnprintf(e->buf + e->bytes, e->size - e->bytes, fmt, args);
148 if (len < 0) {
149 e->err = len;
150 return;
151 }
152 e->bytes += len;
84734a04
MK
153}
154
0e39037b 155static void i915_error_puts(struct drm_i915_error_state_buf *e, const char *str)
84734a04
MK
156{
157 unsigned len;
158
0e39037b 159 if (e->err || !str)
84734a04
MK
160 return;
161
162 len = strlen(str);
0e39037b
CW
163 if (!__i915_error_grow(e, len))
164 return;
84734a04 165
0e39037b 166 GEM_BUG_ON(e->bytes + len > e->size);
84734a04 167 memcpy(e->buf + e->bytes, str, len);
0e39037b 168 e->bytes += len;
84734a04
MK
169}
170
171#define err_printf(e, ...) i915_error_printf(e, __VA_ARGS__)
172#define err_puts(e, s) i915_error_puts(e, s)
173
7d41ef34
MW
174static void __i915_printfn_error(struct drm_printer *p, struct va_format *vaf)
175{
176 i915_error_vprintf(p->arg, vaf->fmt, *vaf->va);
177}
178
179static inline struct drm_printer
180i915_error_printer(struct drm_i915_error_state_buf *e)
181{
182 struct drm_printer p = {
183 .printfn = __i915_printfn_error,
184 .arg = e,
185 };
186 return p;
187}
188
3bdd4f84 189/* single threaded page allocator with a reserved stash for emergencies */
/* Tear down the reserve by handing the whole batch to folio_batch_release(). */
static void pool_fini(struct folio_batch *fbatch)
{
	folio_batch_release(fbatch);
}
194
f8a101ff 195static int pool_refill(struct folio_batch *fbatch, gfp_t gfp)
3bdd4f84 196{
f8a101ff
MWO
197 while (folio_batch_space(fbatch)) {
198 struct folio *folio;
3bdd4f84 199
f8a101ff
MWO
200 folio = folio_alloc(gfp, 0);
201 if (!folio)
3bdd4f84
CW
202 return -ENOMEM;
203
f8a101ff 204 folio_batch_add(fbatch, folio);
3bdd4f84
CW
205 }
206
207 return 0;
208}
209
f8a101ff 210static int pool_init(struct folio_batch *fbatch, gfp_t gfp)
3bdd4f84
CW
211{
212 int err;
213
f8a101ff 214 folio_batch_init(fbatch);
3bdd4f84 215
f8a101ff 216 err = pool_refill(fbatch, gfp);
3bdd4f84 217 if (err)
f8a101ff 218 pool_fini(fbatch);
3bdd4f84
CW
219
220 return err;
221}
222
f8a101ff 223static void *pool_alloc(struct folio_batch *fbatch, gfp_t gfp)
3bdd4f84 224{
f8a101ff 225 struct folio *folio;
3bdd4f84 226
f8a101ff
MWO
227 folio = folio_alloc(gfp, 0);
228 if (!folio && folio_batch_count(fbatch))
229 folio = fbatch->folios[--fbatch->nr];
3bdd4f84 230
f8a101ff 231 return folio ? folio_address(folio) : NULL;
3bdd4f84
CW
232}
233
/*
 * Give back memory obtained from pool_alloc().
 *
 * The folio goes back into the reserve while the batch has a free slot;
 * otherwise its reference is dropped with folio_put().
 */
static void pool_free(struct folio_batch *fbatch, void *addr)
{
	struct folio *folio = virt_to_folio(addr);

	if (!folio_batch_space(fbatch)) {
		folio_put(folio);
		return;
	}

	folio_batch_add(fbatch, folio);
}
243
0a97015d
CW
244#ifdef CONFIG_DRM_I915_COMPRESS_ERROR
245
742379c0 246struct i915_vma_compress {
f8a101ff 247 struct folio_batch pool;
d637c178
CW
248 struct z_stream_s zstream;
249 void *tmp;
250};
251
742379c0 252static bool compress_init(struct i915_vma_compress *c)
0a97015d 253{
3bdd4f84 254 struct z_stream_s *zstream = &c->zstream;
0a97015d 255
3bdd4f84 256 if (pool_init(&c->pool, ALLOW_FAIL))
0a97015d
CW
257 return false;
258
3bdd4f84
CW
259 zstream->workspace =
260 kmalloc(zlib_deflate_workspacesize(MAX_WBITS, MAX_MEM_LEVEL),
261 ALLOW_FAIL);
262 if (!zstream->workspace) {
263 pool_fini(&c->pool);
0a97015d
CW
264 return false;
265 }
266
d637c178 267 c->tmp = NULL;
c4d3ae68 268 if (i915_has_memcpy_from_wc())
3bdd4f84 269 c->tmp = pool_alloc(&c->pool, ALLOW_FAIL);
d637c178 270
0a97015d
CW
271 return true;
272}
273
742379c0 274static bool compress_start(struct i915_vma_compress *c)
83bc0f5b 275{
3bdd4f84
CW
276 struct z_stream_s *zstream = &c->zstream;
277 void *workspace = zstream->workspace;
278
279 memset(zstream, 0, sizeof(*zstream));
280 zstream->workspace = workspace;
281
282 return zlib_deflateInit(zstream, Z_DEFAULT_COMPRESSION) == Z_OK;
283}
284
742379c0
CW
285static void *compress_next_page(struct i915_vma_compress *c,
286 struct i915_vma_coredump *dst)
3bdd4f84 287{
e45b98ba
TH
288 void *page_addr;
289 struct page *page;
83bc0f5b 290
e45b98ba
TH
291 page_addr = pool_alloc(&c->pool, ALLOW_FAIL);
292 if (!page_addr)
83bc0f5b
CW
293 return ERR_PTR(-ENOMEM);
294
e45b98ba
TH
295 page = virt_to_page(page_addr);
296 list_add_tail(&page->lru, &dst->page_list);
297 return page_addr;
83bc0f5b
CW
298}
299
742379c0 300static int compress_page(struct i915_vma_compress *c,
0a97015d 301 void *src,
742379c0
CW
302 struct i915_vma_coredump *dst,
303 bool wc)
0a97015d 304{
d637c178
CW
305 struct z_stream_s *zstream = &c->zstream;
306
0a97015d 307 zstream->next_in = src;
742379c0 308 if (wc && c->tmp && i915_memcpy_from_wc(c->tmp, src, PAGE_SIZE))
d637c178 309 zstream->next_in = c->tmp;
0a97015d
CW
310 zstream->avail_in = PAGE_SIZE;
311
312 do {
313 if (zstream->avail_out == 0) {
3bdd4f84 314 zstream->next_out = compress_next_page(c, dst);
83bc0f5b
CW
315 if (IS_ERR(zstream->next_out))
316 return PTR_ERR(zstream->next_out);
0a97015d 317
0a97015d
CW
318 zstream->avail_out = PAGE_SIZE;
319 }
320
83bc0f5b 321 if (zlib_deflate(zstream, Z_NO_FLUSH) != Z_OK)
0a97015d 322 return -EIO;
7d555314
CW
323
324 cond_resched();
0a97015d
CW
325 } while (zstream->avail_in);
326
327 /* Fallback to uncompressed if we increase size? */
328 if (0 && zstream->total_out > zstream->total_in)
329 return -E2BIG;
330
331 return 0;
332}
333
742379c0
CW
334static int compress_flush(struct i915_vma_compress *c,
335 struct i915_vma_coredump *dst)
0a97015d 336{
d637c178
CW
337 struct z_stream_s *zstream = &c->zstream;
338
83bc0f5b
CW
339 do {
340 switch (zlib_deflate(zstream, Z_FINISH)) {
341 case Z_OK: /* more space requested */
3bdd4f84 342 zstream->next_out = compress_next_page(c, dst);
83bc0f5b
CW
343 if (IS_ERR(zstream->next_out))
344 return PTR_ERR(zstream->next_out);
345
346 zstream->avail_out = PAGE_SIZE;
347 break;
348
349 case Z_STREAM_END:
350 goto end;
351
352 default: /* any error */
353 return -EIO;
354 }
355 } while (1);
356
357end:
358 memset(zstream->next_out, 0, zstream->avail_out);
359 dst->unused = zstream->avail_out;
360 return 0;
361}
362
742379c0 363static void compress_finish(struct i915_vma_compress *c)
83bc0f5b 364{
3bdd4f84
CW
365 zlib_deflateEnd(&c->zstream);
366}
0a97015d 367
742379c0 368static void compress_fini(struct i915_vma_compress *c)
3bdd4f84
CW
369{
370 kfree(c->zstream.workspace);
d637c178 371 if (c->tmp)
3bdd4f84
CW
372 pool_free(&c->pool, c->tmp);
373 pool_fini(&c->pool);
0a97015d
CW
374}
375
/* Emit ":" ahead of the encoded page data in builds with compression. */
static void err_compression_marker(struct drm_i915_error_state_buf *m)
{
	err_puts(m, ":");
}
380
381#else
382
742379c0 383struct i915_vma_compress {
f8a101ff 384 struct folio_batch pool;
d637c178
CW
385};
386
742379c0 387static bool compress_init(struct i915_vma_compress *c)
3bdd4f84
CW
388{
389 return pool_init(&c->pool, ALLOW_FAIL) == 0;
390}
391
/* Nothing to prepare without compression; always reports success. */
static bool compress_start(struct i915_vma_compress *c)
{
	return true;
}
396
742379c0 397static int compress_page(struct i915_vma_compress *c,
0a97015d 398 void *src,
742379c0
CW
399 struct i915_vma_coredump *dst,
400 bool wc)
0a97015d 401{
d637c178 402 void *ptr;
0a97015d 403
79c7a28e 404 ptr = pool_alloc(&c->pool, ALLOW_FAIL);
3bdd4f84 405 if (!ptr)
0a97015d
CW
406 return -ENOMEM;
407
742379c0 408 if (!(wc && i915_memcpy_from_wc(ptr, src, PAGE_SIZE)))
d637c178 409 memcpy(ptr, src, PAGE_SIZE);
e45b98ba 410 list_add_tail(&virt_to_page(ptr)->lru, &dst->page_list);
7d555314 411 cond_resched();
0a97015d
CW
412
413 return 0;
414}
415
742379c0
CW
/* Nothing is buffered without compression, so there is nothing to flush. */
static int compress_flush(struct i915_vma_compress *c,
			  struct i915_vma_coredump *dst)
{
	return 0;
}
421
/* No per-object state to end in builds without compression. */
static void compress_finish(struct i915_vma_compress *c)
{
}
425
742379c0 426static void compress_fini(struct i915_vma_compress *c)
3bdd4f84
CW
427{
428 pool_fini(&c->pool);
429}
430
0a97015d
CW
/* Emit "~" ahead of the encoded page data in builds without compression. */
static void err_compression_marker(struct drm_i915_error_state_buf *m)
{
	err_puts(m, "~");
}
435
436#endif
437
d636951e 438static void error_print_instdone(struct drm_i915_error_state_buf *m,
742379c0 439 const struct intel_engine_coredump *ee)
d636951e 440{
f9e61372
BW
441 int slice;
442 int subslice;
89f2e7ab 443 int iter;
f9e61372 444
d636951e
BW
445 err_printf(m, " INSTDONE: 0x%08x\n",
446 ee->instdone.instdone);
447
651e7d48 448 if (ee->engine->class != RENDER_CLASS || GRAPHICS_VER(m->i915) <= 3)
d636951e
BW
449 return;
450
451 err_printf(m, " SC_INSTDONE: 0x%08x\n",
452 ee->instdone.slice_common);
453
651e7d48 454 if (GRAPHICS_VER(m->i915) <= 6)
d636951e
BW
455 return;
456
9a92732f
MR
457 for_each_ss_steering(iter, ee->engine->gt, slice, subslice)
458 err_printf(m, " SAMPLER_INSTDONE[%d][%d]: 0x%08x\n",
459 slice, subslice,
460 ee->instdone.sampler[slice][subslice]);
fa9899da 461
9a92732f
MR
462 for_each_ss_steering(iter, ee->engine->gt, slice, subslice)
463 err_printf(m, " ROW_INSTDONE[%d][%d]: 0x%08x\n",
464 slice, subslice,
465 ee->instdone.row[slice][subslice]);
f7043102 466
651e7d48 467 if (GRAPHICS_VER(m->i915) < 12)
f7043102
LL
468 return;
469
89f2e7ab 470 if (GRAPHICS_VER_FULL(m->i915) >= IP_VER(12, 55)) {
9a92732f 471 for_each_ss_steering(iter, ee->engine->gt, slice, subslice)
89f2e7ab
MR
472 err_printf(m, " GEOM_SVGUNIT_INSTDONE[%d][%d]: 0x%08x\n",
473 slice, subslice,
474 ee->instdone.geom_svg[slice][subslice]);
475 }
476
f7043102
LL
477 err_printf(m, " SC_INSTDONE_EXTRA: 0x%08x\n",
478 ee->instdone.slice_common_extra[0]);
479 err_printf(m, " SC_INSTDONE_EXTRA2: 0x%08x\n",
480 ee->instdone.slice_common_extra[1]);
d636951e
BW
481}
482
35ca039e
CW
483static void error_print_request(struct drm_i915_error_state_buf *m,
484 const char *prefix,
742379c0 485 const struct i915_request_coredump *erq)
35ca039e
CW
486{
487 if (!erq->seqno)
488 return;
489
9669a507 490 err_printf(m, "%s pid %d, seqno %8x:%08x%s%s, prio %d, head %08x, tail %08x\n",
7f4127c4 491 prefix, erq->pid, erq->context, erq->seqno,
52c0fdb2
CW
492 test_bit(DMA_FENCE_FLAG_SIGNALED_BIT,
493 &erq->flags) ? "!" : "",
494 test_bit(DMA_FENCE_FLAG_ENABLE_SIGNAL_BIT,
495 &erq->flags) ? "+" : "",
496 erq->sched_attr.priority,
9669a507 497 erq->head, erq->tail);
35ca039e
CW
498}
499
4fa6053e
CW
500static void error_print_context(struct drm_i915_error_state_buf *m,
501 const char *header,
742379c0 502 const struct i915_gem_context_coredump *ctx)
4fa6053e 503{
1883a0a4 504 err_printf(m, "%s%s[%d] prio %d, guilty %d active %d, runtime total %lluns, avg %lluns\n",
2935ed53 505 header, ctx->comm, ctx->pid, ctx->sched_attr.priority,
1883a0a4 506 ctx->guilty, ctx->active,
bb6287cb 507 ctx->total_runtime, ctx->avg_runtime);
c8a76df6 508 err_printf(m, " context timeline seqno %u\n", ctx->hwsp_seqno);
4fa6053e
CW
509}
510
742379c0
CW
511static struct i915_vma_coredump *
512__find_vma(struct i915_vma_coredump *vma, const char *name)
513{
514 while (vma) {
515 if (strcmp(vma->name, name) == 0)
516 return vma;
517 vma = vma->next;
518 }
519
520 return NULL;
521}
522
a0f1f7b4
AP
523struct i915_vma_coredump *
524intel_gpu_error_find_batch(const struct intel_engine_coredump *ee)
742379c0
CW
525{
526 return __find_vma(ee->vma, "batch");
527}
528
6361f4ba 529static void error_print_engine(struct drm_i915_error_state_buf *m,
742379c0 530 const struct intel_engine_coredump *ee)
84734a04 531{
742379c0 532 struct i915_vma_coredump *batch;
76e70087
MK
533 int n;
534
c990b4c3 535 err_printf(m, "%s command stream:\n", ee->engine->name);
742379c0 536 err_printf(m, " CCID: 0x%08x\n", ee->ccid);
6361f4ba 537 err_printf(m, " START: 0x%08x\n", ee->start);
06392e3b 538 err_printf(m, " HEAD: 0x%08x [0x%08x]\n", ee->head, ee->rq_head);
cdb324bd
CW
539 err_printf(m, " TAIL: 0x%08x [0x%08x, 0x%08x]\n",
540 ee->tail, ee->rq_post, ee->rq_tail);
6361f4ba 541 err_printf(m, " CTL: 0x%08x\n", ee->ctl);
21a2c58a 542 err_printf(m, " MODE: 0x%08x\n", ee->mode);
6361f4ba
CW
543 err_printf(m, " HWS: 0x%08x\n", ee->hws);
544 err_printf(m, " ACTHD: 0x%08x %08x\n",
545 (u32)(ee->acthd>>32), (u32)ee->acthd);
546 err_printf(m, " IPEIR: 0x%08x\n", ee->ipeir);
547 err_printf(m, " IPEHR: 0x%08x\n", ee->ipehr);
70a76a9b 548 err_printf(m, " ESR: 0x%08x\n", ee->esr);
d636951e
BW
549
550 error_print_instdone(m, ee);
551
a0f1f7b4 552 batch = intel_gpu_error_find_batch(ee);
742379c0
CW
553 if (batch) {
554 u64 start = batch->gtt_offset;
555 u64 end = start + batch->gtt_size;
03382dfb
CW
556
557 err_printf(m, " batch: [0x%08x_%08x, 0x%08x_%08x]\n",
558 upper_32_bits(start), lower_32_bits(start),
559 upper_32_bits(end), lower_32_bits(end));
560 }
651e7d48 561 if (GRAPHICS_VER(m->i915) >= 4) {
03382dfb 562 err_printf(m, " BBADDR: 0x%08x_%08x\n",
6361f4ba
CW
563 (u32)(ee->bbaddr>>32), (u32)ee->bbaddr);
564 err_printf(m, " BB_STATE: 0x%08x\n", ee->bbstate);
565 err_printf(m, " INSTPS: 0x%08x\n", ee->instps);
3dda20a9 566 }
6361f4ba
CW
567 err_printf(m, " INSTPM: 0x%08x\n", ee->instpm);
568 err_printf(m, " FADDR: 0x%08x %08x\n", upper_32_bits(ee->faddr),
569 lower_32_bits(ee->faddr));
651e7d48 570 if (GRAPHICS_VER(m->i915) >= 6) {
6361f4ba
CW
571 err_printf(m, " RC PSMI: 0x%08x\n", ee->rc_psmi);
572 err_printf(m, " FAULT_REG: 0x%08x\n", ee->fault_reg);
84734a04 573 }
b729cfee
SS
574 if (GRAPHICS_VER(m->i915) >= 11) {
575 err_printf(m, " NOPID: 0x%08x\n", ee->nopid);
576 err_printf(m, " EXCC: 0x%08x\n", ee->excc);
577 err_printf(m, " CMD_CCTL: 0x%08x\n", ee->cmd_cctl);
578 err_printf(m, " CSCMDOP: 0x%08x\n", ee->cscmdop);
579 err_printf(m, " CTX_SR_CTL: 0x%08x\n", ee->ctx_sr_ctl);
580 err_printf(m, " DMA_FADDR_HI: 0x%08x\n", ee->dma_faddr_hi);
581 err_printf(m, " DMA_FADDR_LO: 0x%08x\n", ee->dma_faddr_lo);
582 }
4bdafb9d 583 if (HAS_PPGTT(m->i915)) {
6361f4ba 584 err_printf(m, " GFX_MODE: 0x%08x\n", ee->vm_info.gfx_mode);
6c7a01ec 585
651e7d48 586 if (GRAPHICS_VER(m->i915) >= 8) {
6c7a01ec
BW
587 int i;
588 for (i = 0; i < 4; i++)
589 err_printf(m, " PDP%d: 0x%016llx\n",
6361f4ba 590 i, ee->vm_info.pdp[i]);
6c7a01ec
BW
591 } else {
592 err_printf(m, " PP_DIR_BASE: 0x%08x\n",
6361f4ba 593 ee->vm_info.pp_dir_base);
6c7a01ec
BW
594 }
595 }
3fe3b030 596
76e70087
MK
597 for (n = 0; n < ee->num_ports; n++) {
598 err_printf(m, " ELSP[%d]:", n);
742379c0 599 error_print_request(m, " ", &ee->execlist[n]);
76e70087 600 }
84734a04
MK
601}
602
/*
 * printf-style front end for the error state buffer: collects the variadic
 * arguments and forwards them to i915_error_vprintf().
 */
void i915_error_printf(struct drm_i915_error_state_buf *e, const char *f, ...)
{
	va_list ap;

	va_start(ap, f);
	i915_error_vprintf(e, f, ap);
	va_end(ap);
}
611
a0f1f7b4
AP
612void intel_gpu_error_print_vma(struct drm_i915_error_state_buf *m,
613 const struct intel_engine_cs *engine,
614 const struct i915_vma_coredump *vma)
ab0e7ff9 615{
489cae63 616 char out[ASCII85_BUFSZ];
e45b98ba 617 struct page *page;
ab0e7ff9 618
742379c0 619 if (!vma)
fc4c79c3
CW
620 return;
621
742379c0
CW
622 err_printf(m, "%s --- %s = 0x%08x %08x\n",
623 engine ? engine->name : "global", vma->name,
624 upper_32_bits(vma->gtt_offset),
625 lower_32_bits(vma->gtt_offset));
fc4c79c3 626
742379c0
CW
627 if (vma->gtt_page_sizes > I915_GTT_PAGE_SIZE_4K)
628 err_printf(m, "gtt_page_sizes = 0x%08x\n", vma->gtt_page_sizes);
fd521d3b 629
0a97015d 630 err_compression_marker(m);
e45b98ba 631 list_for_each_entry(page, &vma->page_list, lru) {
0a97015d 632 int i, len;
e45b98ba 633 const u32 *addr = page_address(page);
0a97015d
CW
634
635 len = PAGE_SIZE;
e45b98ba 636 if (page == list_last_entry(&vma->page_list, typeof(*page), lru))
742379c0 637 len -= vma->unused;
0a97015d
CW
638 len = ascii85_encode_len(len);
639
489cae63 640 for (i = 0; i < len; i++)
e45b98ba 641 err_puts(m, ascii85_encode(addr[i], out));
ab0e7ff9 642 }
0a97015d 643 err_puts(m, "\n");
ab0e7ff9
CW
644}
645
2bd160a1 646static void err_print_capabilities(struct drm_i915_error_state_buf *m,
792592e7 647 struct i915_gpu_coredump *error)
2bd160a1 648{
a8c9b849
MW
649 struct drm_printer p = i915_error_printer(m);
650
c7d3c844 651 intel_device_info_print(&error->device_info, &error->runtime_info, &p);
4ae7eb92
JN
652 intel_display_device_info_print(&error->display_device_info,
653 &error->display_runtime_info, &p);
792592e7 654 intel_driver_caps_print(&error->driver_caps, &p);
2bd160a1
CW
655}
656
642c8a72 657static void err_print_params(struct drm_i915_error_state_buf *m,
acfb9973 658 const struct i915_params *params)
642c8a72 659{
acfb9973
MW
660 struct drm_printer p = i915_error_printer(m);
661
662 i915_params_dump(params, &p);
642c8a72
CW
663}
664
5a4c6f1b
CW
665static void err_print_pciid(struct drm_i915_error_state_buf *m,
666 struct drm_i915_private *i915)
667{
8ff5446a 668 struct pci_dev *pdev = to_pci_dev(i915->drm.dev);
5a4c6f1b
CW
669
670 err_printf(m, "PCI ID: 0x%04x\n", pdev->device);
671 err_printf(m, "PCI Revision: 0x%02x\n", pdev->revision);
672 err_printf(m, "PCI Subsystem: %04x:%04x\n",
673 pdev->subsystem_vendor,
674 pdev->subsystem_device);
675}
676
c5de70f6
JH
677static void err_print_guc_ctb(struct drm_i915_error_state_buf *m,
678 const char *name,
679 const struct intel_ctb_coredump *ctb)
680{
681 if (!ctb->size)
682 return;
683
684 err_printf(m, "GuC %s CTB: raw: 0x%08X, 0x%08X/%08X, cached: 0x%08X/%08X, desc = 0x%08X, buf = 0x%08X x 0x%08X\n",
685 name, ctb->raw_status, ctb->raw_head, ctb->raw_tail,
686 ctb->head, ctb->tail, ctb->desc_offset, ctb->cmds_offset, ctb->size);
687}
688
7d41ef34 689static void err_print_uc(struct drm_i915_error_state_buf *m,
742379c0 690 const struct intel_uc_coredump *error_uc)
7d41ef34
MW
691{
692 struct drm_printer p = i915_error_printer(m);
7d41ef34
MW
693
694 intel_uc_fw_dump(&error_uc->guc_fw, &p);
695 intel_uc_fw_dump(&error_uc->huc_fw, &p);
c5de70f6
JH
696 err_printf(m, "GuC timestamp: 0x%08x\n", error_uc->guc.timestamp);
697 intel_gpu_error_print_vma(m, NULL, error_uc->guc.vma_log);
698 err_printf(m, "GuC CTB fence: %d\n", error_uc->guc.last_fence);
699 err_print_guc_ctb(m, "Send", error_uc->guc.ctb + 0);
700 err_print_guc_ctb(m, "Recv", error_uc->guc.ctb + 1);
701 intel_gpu_error_print_vma(m, NULL, error_uc->guc.vma_ctb);
7d41ef34
MW
702}
703
0e39037b 704static void err_free_sgl(struct scatterlist *sgl)
84734a04 705{
0e39037b
CW
706 while (sgl) {
707 struct scatterlist *sg;
84734a04 708
0e39037b
CW
709 for (sg = sgl; !sg_is_chain(sg); sg++) {
710 kfree(sg_virt(sg));
711 if (sg_is_last(sg))
712 break;
713 }
714
715 sg = sg_is_last(sg) ? NULL : sg_chain_ptr(sg);
716 free_page((unsigned long)sgl);
717 sgl = sg;
84734a04 718 }
0e39037b 719}
84734a04 720
68172f2c
CW
721static void err_print_gt_info(struct drm_i915_error_state_buf *m,
722 struct intel_gt_coredump *gt)
723{
724 struct drm_printer p = i915_error_printer(m);
725
726 intel_gt_info_print(&gt->info, &p);
cc1338f2 727 intel_sseu_print_topology(gt->_gt->i915, &gt->info.sseu, &p);
68172f2c
CW
728}
729
a6f0f9cf
AP
730static void err_print_gt_display(struct drm_i915_error_state_buf *m,
731 struct intel_gt_coredump *gt)
732{
733 err_printf(m, "IER: 0x%08x\n", gt->ier);
734 err_printf(m, "DERRMR: 0x%08x\n", gt->derrmr);
735}
736
737static void err_print_gt_global_nonguc(struct drm_i915_error_state_buf *m,
738 struct intel_gt_coredump *gt)
742379c0 739{
1a8585bd 740 int i;
742379c0 741
01fabda8 742 err_printf(m, "GT awake: %s\n", str_yes_no(gt->awake));
368d179a
JH
743 err_printf(m, "CS timestamp frequency: %u Hz, %d ns\n",
744 gt->clock_frequency, gt->clock_period_ns);
742379c0 745 err_printf(m, "EIR: 0x%08x\n", gt->eir);
a6f0f9cf
AP
746 err_printf(m, "PGTBL_ER: 0x%08x\n", gt->pgtbl_er);
747
742379c0
CW
748 for (i = 0; i < gt->ngtier; i++)
749 err_printf(m, "GTIER[%d]: 0x%08x\n", i, gt->gtier[i]);
a6f0f9cf 750}
742379c0 751
a6f0f9cf
AP
752static void err_print_gt_global(struct drm_i915_error_state_buf *m,
753 struct intel_gt_coredump *gt)
754{
755 err_printf(m, "FORCEWAKE: 0x%08x\n", gt->forcewake);
742379c0 756
651e7d48 757 if (IS_GRAPHICS_VER(m->i915, 6, 11)) {
742379c0
CW
758 err_printf(m, "ERROR: 0x%08x\n", gt->error);
759 err_printf(m, "DONE_REG: 0x%08x\n", gt->done_reg);
760 }
761
651e7d48 762 if (GRAPHICS_VER(m->i915) >= 8)
742379c0
CW
763 err_printf(m, "FAULT_TLB_DATA: 0x%08x 0x%08x\n",
764 gt->fault_data1, gt->fault_data0);
765
651e7d48 766 if (GRAPHICS_VER(m->i915) == 7)
742379c0
CW
767 err_printf(m, "ERR_INT: 0x%08x\n", gt->err_int);
768
651e7d48 769 if (IS_GRAPHICS_VER(m->i915, 8, 11))
742379c0
CW
770 err_printf(m, "GTT_CACHE_EN: 0x%08x\n", gt->gtt_cache);
771
651e7d48 772 if (GRAPHICS_VER(m->i915) == 12)
742379c0
CW
773 err_printf(m, "AUX_ERR_DBG: 0x%08x\n", gt->aux_err);
774
651e7d48 775 if (GRAPHICS_VER(m->i915) >= 12) {
742379c0
CW
776 int i;
777
239bbb2f 778 for (i = 0; i < I915_MAX_SFC; i++) {
24d032e2
MR
779 /*
780 * SFC_DONE resides in the VD forcewake domain, so it
781 * only exists if the corresponding VCS engine is
782 * present.
783 */
45f63790
MR
784 if ((gt->_gt->info.sfc_mask & BIT(i)) == 0 ||
785 !HAS_ENGINE(gt->_gt, _VCS(i * 2)))
24d032e2
MR
786 continue;
787
742379c0
CW
788 err_printf(m, " SFC_DONE[%d]: 0x%08x\n", i,
789 gt->sfc_done[i]);
24d032e2 790 }
742379c0
CW
791
792 err_printf(m, " GAM_DONE: 0x%08x\n", gt->gam_done);
793 }
a6f0f9cf
AP
794}
795
796static void err_print_gt_fences(struct drm_i915_error_state_buf *m,
797 struct intel_gt_coredump *gt)
798{
799 int i;
800
801 for (i = 0; i < gt->nfence; i++)
802 err_printf(m, " fence[%d] = %08llx\n", i, gt->fence[i]);
803}
804
805static void err_print_gt_engines(struct drm_i915_error_state_buf *m,
806 struct intel_gt_coredump *gt)
807{
808 const struct intel_engine_coredump *ee;
742379c0
CW
809
810 for (ee = gt->engine; ee; ee = ee->next) {
811 const struct i915_vma_coredump *vma;
812
e4730ae4
JH
813 if (gt->uc && gt->uc->guc.is_guc_capture) {
814 if (ee->guc_capture_node)
815 intel_guc_capture_print_engine_node(m, ee);
816 else
817 err_printf(m, " Missing GuC capture node for %s\n",
818 ee->engine->name);
819 } else {
a6f0f9cf 820 error_print_engine(m, ee);
e4730ae4 821 }
a6f0f9cf
AP
822
823 err_printf(m, " hung: %u\n", ee->hung);
824 err_printf(m, " engine reset count: %u\n", ee->reset_count);
825 error_print_context(m, " Active context: ", &ee->context);
826
742379c0 827 for (vma = ee->vma; vma; vma = vma->next)
a0f1f7b4 828 intel_gpu_error_print_vma(m, ee->engine, vma);
742379c0
CW
829 }
830
742379c0
CW
831}
832
0e39037b 833static void __err_print_to_sgl(struct drm_i915_error_state_buf *m,
742379c0 834 struct i915_gpu_coredump *error)
0e39037b 835{
742379c0 836 const struct intel_engine_coredump *ee;
0e39037b 837 struct timespec64 ts;
fb6f0b64 838
5a4c6f1b
CW
839 if (*error->error_msg)
840 err_printf(m, "%s\n", error->error_msg);
57428bcc
CW
841 err_printf(m, "Kernel: %s %s\n",
842 init_utsname()->release,
843 init_utsname()->machine);
d71c4b03 844 err_printf(m, "Driver: %s\n", DRIVER_DATE);
c6270dbc
AB
845 ts = ktime_to_timespec64(error->time);
846 err_printf(m, "Time: %lld s %ld us\n",
847 (s64)ts.tv_sec, ts.tv_nsec / NSEC_PER_USEC);
848 ts = ktime_to_timespec64(error->boottime);
849 err_printf(m, "Boottime: %lld s %ld us\n",
850 (s64)ts.tv_sec, ts.tv_nsec / NSEC_PER_USEC);
851 ts = ktime_to_timespec64(error->uptime);
852 err_printf(m, "Uptime: %lld s %ld us\n",
853 (s64)ts.tv_sec, ts.tv_nsec / NSEC_PER_USEC);
058179e7
CW
854 err_printf(m, "Capture: %lu jiffies; %d ms ago\n",
855 error->capture, jiffies_to_msecs(jiffies - error->capture));
3fe3b030 856
742379c0 857 for (ee = error->gt ? error->gt->engine : NULL; ee; ee = ee->next)
7f4127c4 858 err_printf(m, "Active process (on ring %s): %s [%d]\n",
c990b4c3
CW
859 ee->engine->name,
860 ee->context.comm,
861 ee->context.pid);
862
48b031e3 863 err_printf(m, "Reset count: %u\n", error->reset_count);
62d5d69b 864 err_printf(m, "Suspend count: %u\n", error->suspend_count);
2e0d26f8 865 err_printf(m, "Platform: %s\n", intel_platform_name(error->device_info.platform));
805446c8
TU
866 err_printf(m, "Subplatform: 0x%x\n",
867 intel_subplatform(&error->runtime_info,
868 error->device_info.platform));
0e39037b 869 err_print_pciid(m, m->i915);
642c8a72 870
eb5be9d0 871 err_printf(m, "IOMMU enabled?: %d\n", error->iommu);
0ac7655c 872
5efde05f 873 intel_dmc_print_error_state(m, m->i915);
0ac7655c 874
01fabda8
LDM
875 err_printf(m, "RPM wakelock: %s\n", str_yes_no(error->wakelock));
876 err_printf(m, "PM suspended: %s\n", str_yes_no(error->suspended));
84734a04 877
a6f0f9cf
AP
878 if (error->gt) {
879 bool print_guc_capture = false;
880
c5de70f6 881 if (error->gt->uc && error->gt->uc->guc.is_guc_capture)
a6f0f9cf
AP
882 print_guc_capture = true;
883
884 err_print_gt_display(m, error->gt);
885 err_print_gt_global_nonguc(m, error->gt);
886 err_print_gt_fences(m, error->gt);
887
888 /*
889 * GuC dumped global, eng-class and eng-instance registers together
890 * as part of engine state dump so we print in err_print_gt_engines
891 */
892 if (!print_guc_capture)
893 err_print_gt_global(m, error->gt);
894
895 err_print_gt_engines(m, error->gt);
896
897 if (error->gt->uc)
898 err_print_uc(m, error->gt->uc);
899
900 err_print_gt_info(m, error->gt);
901 }
84734a04
MK
902
903 if (error->overlay)
904 intel_overlay_print_error_state(m, error->overlay);
905
792592e7 906 err_print_capabilities(m, error);
642c8a72 907 err_print_params(m, &error->params);
0e39037b
CW
908}
909
742379c0 910static int err_print_to_sgl(struct i915_gpu_coredump *error)
0e39037b
CW
911{
912 struct drm_i915_error_state_buf m;
913
914 if (IS_ERR(error))
915 return PTR_ERR(error);
916
917 if (READ_ONCE(error->sgl))
918 return 0;
919
920 memset(&m, 0, sizeof(m));
921 m.i915 = error->i915;
922
923 __err_print_to_sgl(&m, error);
924
925 if (m.buf) {
926 __sg_set_buf(m.cur++, m.buf, m.bytes, m.iter);
927 m.bytes = 0;
928 m.buf = NULL;
929 }
930 if (m.cur) {
931 GEM_BUG_ON(m.end < m.cur);
932 sg_mark_end(m.cur - 1);
933 }
934 GEM_BUG_ON(m.sgl && !m.cur);
935
936 if (m.err) {
937 err_free_sgl(m.sgl);
938 return m.err;
939 }
642c8a72 940
0e39037b
CW
941 if (cmpxchg(&error->sgl, NULL, m.sgl))
942 err_free_sgl(m.sgl);
84734a04
MK
943
944 return 0;
945}
946
742379c0
CW
947ssize_t i915_gpu_coredump_copy_to_buffer(struct i915_gpu_coredump *error,
948 char *buf, loff_t off, size_t rem)
84734a04 949{
0e39037b
CW
950 struct scatterlist *sg;
951 size_t count;
952 loff_t pos;
953 int err;
84734a04 954
0e39037b
CW
955 if (!error || !rem)
956 return 0;
84734a04 957
0e39037b
CW
958 err = err_print_to_sgl(error);
959 if (err)
960 return err;
84734a04 961
0e39037b
CW
962 sg = READ_ONCE(error->fit);
963 if (!sg || off < sg->dma_address)
964 sg = error->sgl;
965 if (!sg)
966 return 0;
84734a04 967
0e39037b
CW
968 pos = sg->dma_address;
969 count = 0;
970 do {
971 size_t len, start;
972
973 if (sg_is_chain(sg)) {
974 sg = sg_chain_ptr(sg);
975 GEM_BUG_ON(sg_is_chain(sg));
976 }
84734a04 977
0e39037b
CW
978 len = sg->length;
979 if (pos + len <= off) {
980 pos += len;
981 continue;
982 }
84734a04 983
0e39037b
CW
984 start = sg->offset;
985 if (pos < off) {
986 GEM_BUG_ON(off - pos > len);
987 len -= off - pos;
988 start += off - pos;
989 pos = off;
990 }
991
992 len = min(len, rem);
993 GEM_BUG_ON(!len || len > sg->length);
994
995 memcpy(buf, page_address(sg_page(sg)) + start, len);
996
997 count += len;
998 pos += len;
999
1000 buf += len;
1001 rem -= len;
1002 if (!rem) {
1003 WRITE_ONCE(error->fit, sg);
1004 break;
1005 }
1006 } while (!sg_is_last(sg++));
1007
1008 return count;
84734a04
MK
1009}
1010
742379c0 1011static void i915_vma_coredump_free(struct i915_vma_coredump *vma)
84734a04 1012{
742379c0
CW
1013 while (vma) {
1014 struct i915_vma_coredump *next = vma->next;
e45b98ba 1015 struct page *page, *n;
84734a04 1016
e45b98ba
TH
1017 list_for_each_entry_safe(page, n, &vma->page_list, lru) {
1018 list_del_init(&page->lru);
1019 __free_page(page);
1020 }
84734a04 1021
742379c0
CW
1022 kfree(vma);
1023 vma = next;
1024 }
84734a04
MK
1025}
1026
742379c0 1027static void cleanup_params(struct i915_gpu_coredump *error)
84a20a8a 1028{
16cabb12 1029 i915_params_free(&error->params);
84a20a8a
MW
1030}
1031
742379c0 1032static void cleanup_uc(struct intel_uc_coredump *uc)
7d41ef34 1033{
665ae9c9
JH
1034 kfree(uc->guc_fw.file_selected.path);
1035 kfree(uc->huc_fw.file_selected.path);
1036 kfree(uc->guc_fw.file_wanted.path);
1037 kfree(uc->huc_fw.file_wanted.path);
c5de70f6
JH
1038 i915_vma_coredump_free(uc->guc.vma_log);
1039 i915_vma_coredump_free(uc->guc.vma_ctb);
7d41ef34 1040
742379c0 1041 kfree(uc);
7d41ef34
MW
1042}
1043
742379c0 1044static void cleanup_gt(struct intel_gt_coredump *gt)
84734a04 1045{
742379c0
CW
1046 while (gt->engine) {
1047 struct intel_engine_coredump *ee = gt->engine;
1048
1049 gt->engine = ee->next;
84734a04 1050
742379c0 1051 i915_vma_coredump_free(ee->vma);
a6f0f9cf 1052 intel_guc_capture_free_node(ee);
742379c0
CW
1053 kfree(ee);
1054 }
6361f4ba 1055
742379c0
CW
1056 if (gt->uc)
1057 cleanup_uc(gt->uc);
c990b4c3 1058
742379c0
CW
1059 kfree(gt);
1060}
b0fd47ad 1061
742379c0
CW
1062void __i915_gpu_coredump_free(struct kref *error_ref)
1063{
1064 struct i915_gpu_coredump *error =
1065 container_of(error_ref, typeof(*error), ref);
6361f4ba 1066
742379c0
CW
1067 while (error->gt) {
1068 struct intel_gt_coredump *gt = error->gt;
1069
1070 error->gt = gt->next;
1071 cleanup_gt(gt);
84734a04
MK
1072 }
1073
84734a04 1074 kfree(error->overlay);
1d6aa7a3 1075
84a20a8a 1076 cleanup_params(error);
7d41ef34 1077
0e39037b 1078 err_free_sgl(error->sgl);
84734a04
MK
1079 kfree(error);
1080}
1081
742379c0
CW
1082static struct i915_vma_coredump *
1083i915_vma_coredump_create(const struct intel_gt *gt,
60dc43d1
TH
1084 const struct i915_vma_resource *vma_res,
1085 struct i915_vma_compress *compress,
1086 const char *name)
1087
84734a04 1088{
742379c0 1089 struct i915_ggtt *ggtt = gt->ggtt;
95374d75 1090 const u64 slot = ggtt->error_capture.start;
742379c0 1091 struct i915_vma_coredump *dst;
95374d75 1092 struct sgt_iter iter;
83bc0f5b 1093 int ret;
84734a04 1094
79c7a28e
CW
1095 might_sleep();
1096
60dc43d1 1097 if (!vma_res || !vma_res->bi.pages || !compress)
058d88c4
CW
1098 return NULL;
1099
e45b98ba 1100 dst = kmalloc(sizeof(*dst), ALLOW_FAIL);
058d88c4 1101 if (!dst)
84734a04
MK
1102 return NULL;
1103
3bdd4f84
CW
1104 if (!compress_start(compress)) {
1105 kfree(dst);
1106 return NULL;
1107 }
1108
e45b98ba 1109 INIT_LIST_HEAD(&dst->page_list);
60dc43d1 1110 strcpy(dst->name, name);
742379c0
CW
1111 dst->next = NULL;
1112
60dc43d1
TH
1113 dst->gtt_offset = vma_res->start;
1114 dst->gtt_size = vma_res->node_size;
1115 dst->gtt_page_sizes = vma_res->page_sizes_gtt;
0a97015d
CW
1116 dst->unused = 0;
1117
83bc0f5b 1118 ret = -EINVAL;
895d8ebe 1119 if (drm_mm_node_allocated(&ggtt->error_capture)) {
95374d75 1120 void __iomem *s;
895d8ebe 1121 dma_addr_t dma;
b3c3f5e6 1122
60dc43d1 1123 for_each_sgt_daddr(dma, iter, vma_res->bi.pages) {
f2acf740 1124 mutex_lock(&ggtt->error_mutex);
a0696856
ND
1125 if (ggtt->vm.raw_insert_page)
1126 ggtt->vm.raw_insert_page(&ggtt->vm, dma, slot,
9275277d
FY
1127 i915_gem_get_pat_index(gt->i915,
1128 I915_CACHE_NONE),
1129 0);
a0696856
ND
1130 else
1131 ggtt->vm.insert_page(&ggtt->vm, dma, slot,
9275277d
FY
1132 i915_gem_get_pat_index(gt->i915,
1133 I915_CACHE_NONE),
1134 0);
742379c0 1135 mb();
b3c3f5e6 1136
895d8ebe 1137 s = io_mapping_map_wc(&ggtt->iomap, slot, PAGE_SIZE);
742379c0
CW
1138 ret = compress_page(compress,
1139 (void __force *)s, dst,
1140 true);
895d8ebe 1141 io_mapping_unmap(s);
f2acf740
CW
1142
1143 mb();
1144 ggtt->vm.clear_range(&ggtt->vm, slot, PAGE_SIZE);
1145 mutex_unlock(&ggtt->error_mutex);
895d8ebe
DCS
1146 if (ret)
1147 break;
1148 }
60dc43d1
TH
1149 } else if (vma_res->bi.lmem) {
1150 struct intel_memory_region *mem = vma_res->mr;
895d8ebe
DCS
1151 dma_addr_t dma;
1152
60dc43d1 1153 for_each_sgt_daddr(dma, iter, vma_res->bi.pages) {
d42a738e 1154 dma_addr_t offset = dma - mem->region.start;
895d8ebe
DCS
1155 void __iomem *s;
1156
d42a738e
MA
1157 if (offset + PAGE_SIZE > mem->io_size) {
1158 ret = -EINVAL;
1159 break;
1160 }
1161
1162 s = io_mapping_map_wc(&mem->iomap, offset, PAGE_SIZE);
742379c0
CW
1163 ret = compress_page(compress,
1164 (void __force *)s, dst,
1165 true);
48715f70 1166 io_mapping_unmap(s);
895d8ebe
DCS
1167 if (ret)
1168 break;
1169 }
1170 } else {
1171 struct page *page;
1172
60dc43d1 1173 for_each_sgt_page(page, iter, vma_res->bi.pages) {
895d8ebe
DCS
1174 void *s;
1175
1176 drm_clflush_pages(&page, 1);
1177
36dd2a6e 1178 s = kmap_local_page(page);
742379c0 1179 ret = compress_page(compress, s, dst, false);
36dd2a6e 1180 kunmap_local(s);
895d8ebe
DCS
1181
1182 drm_clflush_pages(&page, 1);
1183
1184 if (ret)
1185 break;
1186 }
84734a04 1187 }
84734a04 1188
3bdd4f84 1189 if (ret || compress_flush(compress, dst)) {
e45b98ba
TH
1190 struct page *page, *n;
1191
1192 list_for_each_entry_safe_reverse(page, n, &dst->page_list, lru) {
1193 list_del_init(&page->lru);
1194 pool_free(&compress->pool, page_address(page));
1195 }
1196
83bc0f5b
CW
1197 kfree(dst);
1198 dst = NULL;
1199 }
3bdd4f84 1200 compress_finish(compress);
95374d75 1201
95374d75 1202 return dst;
84734a04 1203}
84734a04 1204
742379c0 1205static void gt_record_fences(struct intel_gt_coredump *gt)
011cf577 1206{
742379c0
CW
1207 struct i915_ggtt *ggtt = gt->_gt->ggtt;
1208 struct intel_uncore *uncore = gt->_gt->uncore;
84734a04
MK
1209 int i;
1210
651e7d48 1211 if (GRAPHICS_VER(uncore->i915) >= 6) {
742379c0
CW
1212 for (i = 0; i < ggtt->num_fences; i++)
1213 gt->fence[i] =
7f1502d9
TU
1214 intel_uncore_read64(uncore,
1215 FENCE_REG_GEN6_LO(i));
651e7d48 1216 } else if (GRAPHICS_VER(uncore->i915) >= 4) {
742379c0
CW
1217 for (i = 0; i < ggtt->num_fences; i++)
1218 gt->fence[i] =
7f1502d9
TU
1219 intel_uncore_read64(uncore,
1220 FENCE_REG_965_LO(i));
5a4c6f1b 1221 } else {
742379c0
CW
1222 for (i = 0; i < ggtt->num_fences; i++)
1223 gt->fence[i] =
7f1502d9 1224 intel_uncore_read(uncore, FENCE_REG(i));
eecf613a 1225 }
742379c0 1226 gt->nfence = i;
84734a04
MK
1227}
1228
742379c0 1229static void engine_record_registers(struct intel_engine_coredump *ee)
84734a04 1230{
742379c0
CW
1231 const struct intel_engine_cs *engine = ee->engine;
1232 struct drm_i915_private *i915 = engine->i915;
6361f4ba 1233
651e7d48 1234 if (GRAPHICS_VER(i915) >= 6) {
baba6e57 1235 ee->rc_psmi = ENGINE_READ(engine, RING_PSMI_CTL);
91b59cd9 1236
ab1b2d40
MR
1237 if (GRAPHICS_VER_FULL(i915) >= IP_VER(12, 50))
1238 ee->fault_reg = intel_gt_mcr_read_any(engine->gt,
1239 XEHP_RING_FAULT_REG);
1240 else if (GRAPHICS_VER(i915) >= 12)
742379c0
CW
1241 ee->fault_reg = intel_uncore_read(engine->uncore,
1242 GEN12_RING_FAULT_REG);
651e7d48 1243 else if (GRAPHICS_VER(i915) >= 8)
742379c0
CW
1244 ee->fault_reg = intel_uncore_read(engine->uncore,
1245 GEN8_RING_FAULT_REG);
62acc7e8 1246 else
77a302e0 1247 ee->fault_reg = GEN6_RING_FAULT_REG_READ(engine);
4e5aabfd
BW
1248 }
1249
651e7d48 1250 if (GRAPHICS_VER(i915) >= 4) {
70a76a9b 1251 ee->esr = ENGINE_READ(engine, RING_ESR);
baba6e57
DCS
1252 ee->faddr = ENGINE_READ(engine, RING_DMA_FADD);
1253 ee->ipeir = ENGINE_READ(engine, RING_IPEIR);
1254 ee->ipehr = ENGINE_READ(engine, RING_IPEHR);
1255 ee->instps = ENGINE_READ(engine, RING_INSTPS);
1256 ee->bbaddr = ENGINE_READ(engine, RING_BBADDR);
742379c0 1257 ee->ccid = ENGINE_READ(engine, CCID);
651e7d48 1258 if (GRAPHICS_VER(i915) >= 8) {
baba6e57
DCS
1259 ee->faddr |= (u64)ENGINE_READ(engine, RING_DMA_FADD_UDW) << 32;
1260 ee->bbaddr |= (u64)ENGINE_READ(engine, RING_BBADDR_UDW) << 32;
13ffadd1 1261 }
baba6e57 1262 ee->bbstate = ENGINE_READ(engine, RING_BBSTATE);
84734a04 1263 } else {
baba6e57
DCS
1264 ee->faddr = ENGINE_READ(engine, DMA_FADD_I8XX);
1265 ee->ipeir = ENGINE_READ(engine, IPEIR);
1266 ee->ipehr = ENGINE_READ(engine, IPEHR);
84734a04
MK
1267 }
1268
b729cfee
SS
1269 if (GRAPHICS_VER(i915) >= 11) {
1270 ee->cmd_cctl = ENGINE_READ(engine, RING_CMD_CCTL);
1271 ee->cscmdop = ENGINE_READ(engine, RING_CSCMDOP);
1272 ee->ctx_sr_ctl = ENGINE_READ(engine, RING_CTX_SR_CTL);
1273 ee->dma_faddr_hi = ENGINE_READ(engine, RING_DMA_FADD_UDW);
1274 ee->dma_faddr_lo = ENGINE_READ(engine, RING_DMA_FADD);
1275 ee->nopid = ENGINE_READ(engine, RING_NOPID);
1276 ee->excc = ENGINE_READ(engine, RING_EXCC);
1277 }
1278
0e704476 1279 intel_engine_get_instdone(engine, &ee->instdone);
d636951e 1280
baba6e57 1281 ee->instpm = ENGINE_READ(engine, RING_INSTPM);
7e37f889 1282 ee->acthd = intel_engine_get_active_head(engine);
baba6e57
DCS
1283 ee->start = ENGINE_READ(engine, RING_START);
1284 ee->head = ENGINE_READ(engine, RING_HEAD);
1285 ee->tail = ENGINE_READ(engine, RING_TAIL);
1286 ee->ctl = ENGINE_READ(engine, RING_CTL);
651e7d48 1287 if (GRAPHICS_VER(i915) > 2)
baba6e57 1288 ee->mode = ENGINE_READ(engine, RING_MI_MODE);
84734a04 1289
742379c0 1290 if (!HWS_NEEDS_PHYSICAL(i915)) {
f0f59a00 1291 i915_reg_t mmio;
f3ce3821 1292
651e7d48 1293 if (GRAPHICS_VER(i915) == 7) {
0bc40be8 1294 switch (engine->id) {
f3ce3821 1295 default:
8a68d464 1296 MISSING_CASE(engine->id);
df561f66 1297 fallthrough;
8a68d464 1298 case RCS0:
f3ce3821
CW
1299 mmio = RENDER_HWS_PGA_GEN7;
1300 break;
8a68d464 1301 case BCS0:
f3ce3821
CW
1302 mmio = BLT_HWS_PGA_GEN7;
1303 break;
8a68d464 1304 case VCS0:
f3ce3821
CW
1305 mmio = BSD_HWS_PGA_GEN7;
1306 break;
8a68d464 1307 case VECS0:
f3ce3821
CW
1308 mmio = VEBOX_HWS_PGA_GEN7;
1309 break;
1310 }
651e7d48 1311 } else if (GRAPHICS_VER(engine->i915) == 6) {
0bc40be8 1312 mmio = RING_HWS_PGA_GEN6(engine->mmio_base);
f3ce3821
CW
1313 } else {
1314 /* XXX: gen8 returns to sanity */
0bc40be8 1315 mmio = RING_HWS_PGA(engine->mmio_base);
f3ce3821
CW
1316 }
1317
742379c0 1318 ee->hws = intel_uncore_read(engine->uncore, mmio);
f3ce3821
CW
1319 }
1320
742379c0 1321 ee->reset_count = i915_reset_engine_count(&i915->gpu_error, engine);
6c7a01ec 1322
742379c0 1323 if (HAS_PPGTT(i915)) {
6c7a01ec
BW
1324 int i;
1325
dbc65183 1326 ee->vm_info.gfx_mode = ENGINE_READ(engine, RING_MODE_GEN7);
6c7a01ec 1327
651e7d48 1328 if (GRAPHICS_VER(i915) == 6) {
6361f4ba 1329 ee->vm_info.pp_dir_base =
baba6e57 1330 ENGINE_READ(engine, RING_PP_DIR_BASE_READ);
651e7d48 1331 } else if (GRAPHICS_VER(i915) == 7) {
6361f4ba 1332 ee->vm_info.pp_dir_base =
6d425728 1333 ENGINE_READ(engine, RING_PP_DIR_BASE);
651e7d48 1334 } else if (GRAPHICS_VER(i915) >= 8) {
6d425728
CW
1335 u32 base = engine->mmio_base;
1336
6c7a01ec 1337 for (i = 0; i < 4; i++) {
6361f4ba 1338 ee->vm_info.pdp[i] =
742379c0
CW
1339 intel_uncore_read(engine->uncore,
1340 GEN8_RING_PDP_UDW(base, i));
6361f4ba
CW
1341 ee->vm_info.pdp[i] <<= 32;
1342 ee->vm_info.pdp[i] |=
742379c0
CW
1343 intel_uncore_read(engine->uncore,
1344 GEN8_RING_PDP_LDW(base, i));
6c7a01ec 1345 }
6d425728 1346 }
6c7a01ec 1347 }
84734a04
MK
1348}
1349
22b7a426 1350static void record_request(const struct i915_request *request,
742379c0 1351 struct i915_request_coredump *erq)
35ca039e 1352{
52c0fdb2 1353 erq->flags = request->fence.flags;
b300fde8
CW
1354 erq->context = request->fence.context;
1355 erq->seqno = request->fence.seqno;
b7268c5e 1356 erq->sched_attr = request->sched.attr;
35ca039e
CW
1357 erq->head = request->head;
1358 erq->tail = request->tail;
6a8679c0
CW
1359
1360 erq->pid = 0;
1361 rcu_read_lock();
24aac336
CW
1362 if (!intel_context_is_closed(request->context)) {
1363 const struct i915_gem_context *ctx;
1364
1365 ctx = rcu_dereference(request->context->gem_context);
1366 if (ctx)
1367 erq->pid = pid_nr(ctx->pid);
1368 }
6a8679c0 1369 rcu_read_unlock();
35ca039e
CW
1370}
1371
742379c0 1372static void engine_record_execlists(struct intel_engine_coredump *ee)
35ca039e 1373{
742379c0
CW
1374 const struct intel_engine_execlists * const el = &ee->engine->execlists;
1375 struct i915_request * const *port = el->active;
22b7a426 1376 unsigned int n = 0;
35ca039e 1377
22b7a426
CW
1378 while (*port)
1379 record_request(*port++, &ee->execlist[n++]);
76e70087
MK
1380
1381 ee->num_ports = n;
35ca039e
CW
1382}
1383
742379c0 1384static bool record_context(struct i915_gem_context_coredump *e,
e8a3319c 1385 struct intel_context *ce)
4fa6053e 1386{
6a8679c0
CW
1387 struct i915_gem_context *ctx;
1388 struct task_struct *task;
03d0ed8a 1389 bool simulated;
6a8679c0
CW
1390
1391 rcu_read_lock();
e8a3319c 1392 ctx = rcu_dereference(ce->gem_context);
6a8679c0
CW
1393 if (ctx && !kref_get_unless_zero(&ctx->ref))
1394 ctx = NULL;
1395 rcu_read_unlock();
9f3ccd40 1396 if (!ctx)
03d0ed8a 1397 return true;
c990b4c3 1398
6a8679c0
CW
1399 rcu_read_lock();
1400 task = pid_task(ctx->pid, PIDTYPE_PID);
1401 if (task) {
1402 strcpy(e->comm, task->comm);
1403 e->pid = task->pid;
4fa6053e 1404 }
6a8679c0 1405 rcu_read_unlock();
4fa6053e 1406
b7268c5e 1407 e->sched_attr = ctx->sched;
77b25a97
CW
1408 e->guilty = atomic_read(&ctx->guilty_count);
1409 e->active = atomic_read(&ctx->active_count);
c8a76df6
JH
1410 e->hwsp_seqno = (ce->timeline && ce->timeline->hwsp_seqno) ?
1411 *ce->timeline->hwsp_seqno : ~0U;
c990b4c3 1412
e8a3319c
JH
1413 e->total_runtime = intel_context_get_total_runtime_ns(ce);
1414 e->avg_runtime = intel_context_get_avg_runtime_ns(ce);
1883a0a4 1415
03d0ed8a 1416 simulated = i915_gem_context_no_error_capture(ctx);
6a8679c0
CW
1417
1418 i915_gem_context_put(ctx);
03d0ed8a 1419 return simulated;
4fa6053e
CW
1420}
1421
742379c0
CW
/*
 * One entry in the singly linked list of vma resources queued for capture.
 * Entries are built by capture_vma_snapshot() and consumed (and freed) by
 * intel_engine_coredump_add_vma().
 */
struct intel_engine_capture_vma {
	/* Next queued entry, or NULL at the end of the list. */
	struct intel_engine_capture_vma *next;
	/* Resource to dump; a reference and a hold are kept on it. */
	struct i915_vma_resource *vma_res;
	/* Label copied into the resulting vma coredump. */
	char name[16];
	/* Cookie from i915_vma_resource_hold(), passed back on unhold. */
	bool lockdep_cookie;
};
1428
742379c0 1429static struct intel_engine_capture_vma *
ff20afc4 1430capture_vma_snapshot(struct intel_engine_capture_vma *next,
60dc43d1
TH
1431 struct i915_vma_resource *vma_res,
1432 gfp_t gfp, const char *name)
79c7a28e 1433{
742379c0 1434 struct intel_engine_capture_vma *c;
79c7a28e 1435
60dc43d1 1436 if (!vma_res)
79c7a28e
CW
1437 return next;
1438
742379c0 1439 c = kmalloc(sizeof(*c), gfp);
79c7a28e
CW
1440 if (!c)
1441 return next;
1442
60dc43d1 1443 if (!i915_vma_resource_hold(vma_res, &c->lockdep_cookie)) {
79c7a28e
CW
1444 kfree(c);
1445 return next;
1446 }
1447
60dc43d1
TH
1448 strcpy(c->name, name);
1449 c->vma_res = i915_vma_resource_get(vma_res);
79c7a28e
CW
1450
1451 c->next = next;
1452 return c;
1453}
1454
ff20afc4
TH
1455static struct intel_engine_capture_vma *
1456capture_vma(struct intel_engine_capture_vma *next,
1457 struct i915_vma *vma,
1458 const char *name,
1459 gfp_t gfp)
1460{
ff20afc4
TH
1461 if (!vma)
1462 return next;
1463
1464 /*
1465 * If the vma isn't pinned, then the vma should be snapshotted
1466 * to a struct i915_vma_snapshot at command submission time.
1467 * Not here.
1468 */
60dc43d1 1469 if (GEM_WARN_ON(!i915_vma_is_pinned(vma)))
ff20afc4
TH
1470 return next;
1471
60dc43d1 1472 next = capture_vma_snapshot(next, vma->resource, gfp, name);
ff20afc4
TH
1473
1474 return next;
1475}
1476
742379c0
CW
1477static struct intel_engine_capture_vma *
1478capture_user(struct intel_engine_capture_vma *capture,
1479 const struct i915_request *rq,
1480 gfp_t gfp)
b0fd47ad 1481{
e61e0f51 1482 struct i915_capture_list *c;
b0fd47ad 1483
742379c0 1484 for (c = rq->capture_list; c; c = c->next)
60dc43d1
TH
1485 capture = capture_vma_snapshot(capture, c->vma_res, gfp,
1486 "user");
79c7a28e
CW
1487
1488 return capture;
b0fd47ad
CW
1489}
1490
742379c0
CW
1491static void add_vma(struct intel_engine_coredump *ee,
1492 struct i915_vma_coredump *vma)
84734a04 1493{
742379c0
CW
1494 if (vma) {
1495 vma->next = ee->vma;
1496 ee->vma = vma;
1497 }
1498}
1499
ff20afc4
TH
1500static struct i915_vma_coredump *
1501create_vma_coredump(const struct intel_gt *gt, struct i915_vma *vma,
1502 const char *name, struct i915_vma_compress *compress)
1503{
60dc43d1
TH
1504 struct i915_vma_coredump *ret = NULL;
1505 struct i915_vma_resource *vma_res;
1506 bool lockdep_cookie;
ff20afc4
TH
1507
1508 if (!vma)
1509 return NULL;
1510
60dc43d1
TH
1511 vma_res = vma->resource;
1512
1513 if (i915_vma_resource_hold(vma_res, &lockdep_cookie)) {
1514 ret = i915_vma_coredump_create(gt, vma_res, compress, name);
1515 i915_vma_resource_unhold(vma_res, lockdep_cookie);
1516 }
ff20afc4
TH
1517
1518 return ret;
1519}
1520
/*
 * Dump @vma under label @name and, if that succeeded, attach the result to
 * @ee's vma coredump list.
 */
static void add_vma_coredump(struct intel_engine_coredump *ee,
			     const struct intel_gt *gt,
			     struct i915_vma *vma,
			     const char *name,
			     struct i915_vma_compress *compress)
{
	struct i915_vma_coredump *dump;

	dump = create_vma_coredump(gt, vma, name, compress);
	add_vma(ee, dump);
}
1529
742379c0 1530struct intel_engine_coredump *
a6f0f9cf 1531intel_engine_coredump_alloc(struct intel_engine_cs *engine, gfp_t gfp, u32 dump_flags)
742379c0
CW
1532{
1533 struct intel_engine_coredump *ee;
c990b4c3 1534
742379c0 1535 ee = kzalloc(sizeof(*ee), gfp);
c990b4c3 1536 if (!ee)
742379c0 1537 return NULL;
84734a04 1538
742379c0 1539 ee->engine = engine;
372fbb8e 1540
a6f0f9cf
AP
1541 if (!(dump_flags & CORE_DUMP_FLAG_IS_GUC_CAPTURE)) {
1542 engine_record_registers(ee);
1543 engine_record_execlists(ee);
1544 }
3bdd4f84 1545
742379c0
CW
1546 return ee;
1547}
ab0e7ff9 1548
e8a3319c
JH
1549static struct intel_engine_capture_vma *
1550engine_coredump_add_context(struct intel_engine_coredump *ee,
1551 struct intel_context *ce,
1552 gfp_t gfp)
1553{
1554 struct intel_engine_capture_vma *vma = NULL;
1555
1556 ee->simulated |= record_context(&ee->context, ce);
1557 if (ee->simulated)
1558 return NULL;
1559
1560 /*
1561 * We need to copy these to an anonymous buffer
1562 * as the simplest method to avoid being overwritten
1563 * by userspace.
1564 */
1565 vma = capture_vma(vma, ce->ring->vma, "ring", gfp);
1566 vma = capture_vma(vma, ce->state, "HW context", gfp);
1567
1568 return vma;
1569}
1570
742379c0
CW
1571struct intel_engine_capture_vma *
1572intel_engine_coredump_add_request(struct intel_engine_coredump *ee,
1573 struct i915_request *rq,
1574 gfp_t gfp)
1575{
e8a3319c 1576 struct intel_engine_capture_vma *vma;
79c7a28e 1577
e8a3319c
JH
1578 vma = engine_coredump_add_context(ee, rq->context, gfp);
1579 if (!vma)
742379c0 1580 return NULL;
ab0e7ff9 1581
742379c0
CW
1582 /*
1583 * We need to copy these to an anonymous buffer
1584 * as the simplest method to avoid being overwritten
1585 * by userspace.
1586 */
60dc43d1 1587 vma = capture_vma_snapshot(vma, rq->batch_res, gfp, "batch");
742379c0 1588 vma = capture_user(vma, rq, gfp);
79c7a28e 1589
742379c0
CW
1590 ee->rq_head = rq->head;
1591 ee->rq_post = rq->postfix;
1592 ee->rq_tail = rq->tail;
bc3d6744 1593
742379c0
CW
1594 return vma;
1595}
cdb324bd 1596
742379c0
CW
1597void
1598intel_engine_coredump_add_vma(struct intel_engine_coredump *ee,
1599 struct intel_engine_capture_vma *capture,
1600 struct i915_vma_compress *compress)
1601{
1602 const struct intel_engine_cs *engine = ee->engine;
57bc699d 1603
742379c0
CW
1604 while (capture) {
1605 struct intel_engine_capture_vma *this = capture;
60dc43d1 1606 struct i915_vma_resource *vma_res = this->vma_res;
c990b4c3 1607
742379c0 1608 add_vma(ee,
60dc43d1
TH
1609 i915_vma_coredump_create(engine->gt, vma_res,
1610 compress, this->name));
84734a04 1611
60dc43d1
TH
1612 i915_vma_resource_unhold(vma_res, this->lockdep_cookie);
1613 i915_vma_resource_put(vma_res);
c990b4c3 1614
742379c0
CW
1615 capture = this->next;
1616 kfree(this);
1617 }
79c7a28e 1618
ff20afc4
TH
1619 add_vma_coredump(ee, engine->gt, engine->status_page.vma,
1620 "HW Status", compress);
79c7a28e 1621
ff20afc4
TH
1622 add_vma_coredump(ee, engine->gt, engine->wa_ctx.vma,
1623 "WA context", compress);
742379c0
CW
1624}
1625
1626static struct intel_engine_coredump *
1627capture_engine(struct intel_engine_cs *engine,
a6f0f9cf
AP
1628 struct i915_vma_compress *compress,
1629 u32 dump_flags)
742379c0 1630{
1a8585bd 1631 struct intel_engine_capture_vma *capture = NULL;
742379c0 1632 struct intel_engine_coredump *ee;
a4be3dca 1633 struct intel_context *ce = NULL;
573ba126 1634 struct i915_request *rq = NULL;
79c7a28e 1635
a6f0f9cf 1636 ee = intel_engine_coredump_alloc(engine, ALLOW_FAIL, dump_flags);
742379c0
CW
1637 if (!ee)
1638 return NULL;
c0ce4663 1639
a4be3dca 1640 intel_engine_get_hung_entity(engine, &ce, &rq);
e7696d65 1641 if (rq && !i915_request_started(rq))
e8a3319c
JH
1642 drm_info(&engine->gt->i915->drm, "Got hung context on %s with active request %lld:%lld [0x%04X] not yet started\n",
1643 engine->name, rq->fence.context, rq->fence.seqno, ce->guc_id.id);
ff20afc4 1644
e8a3319c
JH
1645 if (rq) {
1646 capture = intel_engine_coredump_add_request(ee, rq, ATOMIC_MAYFAIL);
1647 i915_request_put(rq);
1648 } else if (ce) {
1649 capture = engine_coredump_add_context(ee, ce, ATOMIC_MAYFAIL);
1650 }
c990b4c3 1651
e8a3319c
JH
1652 if (capture) {
1653 intel_engine_coredump_add_vma(ee, capture, compress);
c990b4c3 1654
e8a3319c
JH
1655 if (dump_flags & CORE_DUMP_FLAG_IS_GUC_CAPTURE)
1656 intel_guc_capture_get_matching_node(engine->gt, ee, ce);
1657 } else {
1658 kfree(ee);
1659 ee = NULL;
1660 }
ff20afc4 1661
e8a3319c 1662 return ee;
84734a04
MK
1663}
1664
3bdd4f84 1665static void
742379c0 1666gt_record_engines(struct intel_gt_coredump *gt,
bda30024 1667 intel_engine_mask_t engine_mask,
a6f0f9cf
AP
1668 struct i915_vma_compress *compress,
1669 u32 dump_flags)
7d41ef34 1670{
742379c0
CW
1671 struct intel_engine_cs *engine;
1672 enum intel_engine_id id;
7d41ef34 1673
742379c0
CW
1674 for_each_engine(engine, gt->_gt, id) {
1675 struct intel_engine_coredump *ee;
1676
1677 /* Refill our page pool before entering atomic section */
1678 pool_refill(&compress->pool, ALLOW_FAIL);
1679
a6f0f9cf 1680 ee = capture_engine(engine, compress, dump_flags);
742379c0
CW
1681 if (!ee)
1682 continue;
1683
bda30024
TU
1684 ee->hung = engine->mask & engine_mask;
1685
742379c0
CW
1686 gt->simulated |= ee->simulated;
1687 if (ee->simulated) {
a6f0f9cf
AP
1688 if (dump_flags & CORE_DUMP_FLAG_IS_GUC_CAPTURE)
1689 intel_guc_capture_free_node(ee);
742379c0
CW
1690 kfree(ee);
1691 continue;
1692 }
1693
1694 ee->next = gt->engine;
1695 gt->engine = ee;
1696 }
1697}
1698
c5de70f6
JH
1699static void gt_record_guc_ctb(struct intel_ctb_coredump *saved,
1700 const struct intel_guc_ct_buffer *ctb,
1701 const void *blob_ptr, struct intel_guc *guc)
1702{
1703 if (!ctb || !ctb->desc)
1704 return;
1705
1706 saved->raw_status = ctb->desc->status;
1707 saved->raw_head = ctb->desc->head;
1708 saved->raw_tail = ctb->desc->tail;
1709 saved->head = ctb->head;
1710 saved->tail = ctb->tail;
1711 saved->size = ctb->size;
1712 saved->desc_offset = ((void *)ctb->desc) - blob_ptr;
1713 saved->cmds_offset = ((void *)ctb->cmds) - blob_ptr;
1714}
1715
742379c0
CW
1716static struct intel_uc_coredump *
1717gt_record_uc(struct intel_gt_coredump *gt,
1718 struct i915_vma_compress *compress)
1719{
1720 const struct intel_uc *uc = &gt->_gt->uc;
1721 struct intel_uc_coredump *error_uc;
1722
1723 error_uc = kzalloc(sizeof(*error_uc), ALLOW_FAIL);
1724 if (!error_uc)
1725 return NULL;
7d41ef34 1726
abb042f3
MW
1727 memcpy(&error_uc->guc_fw, &uc->guc.fw, sizeof(uc->guc.fw));
1728 memcpy(&error_uc->huc_fw, &uc->huc.fw, sizeof(uc->huc.fw));
7d41ef34 1729
665ae9c9
JH
1730 error_uc->guc_fw.file_selected.path = kstrdup(uc->guc.fw.file_selected.path, ALLOW_FAIL);
1731 error_uc->huc_fw.file_selected.path = kstrdup(uc->huc.fw.file_selected.path, ALLOW_FAIL);
1732 error_uc->guc_fw.file_wanted.path = kstrdup(uc->guc.fw.file_wanted.path, ALLOW_FAIL);
1733 error_uc->huc_fw.file_wanted.path = kstrdup(uc->huc.fw.file_wanted.path, ALLOW_FAIL);
368d179a
JH
1734
1735 /*
1736 * Save the GuC log and include a timestamp reference for converting the
1737 * log times to system times (in conjunction with the error->boottime and
1738 * gt->clock_frequency fields saved elsewhere).
1739 */
c5de70f6
JH
1740 error_uc->guc.timestamp = intel_uncore_read(gt->_gt->uncore, GUCPMTIMESTAMP);
1741 error_uc->guc.vma_log = create_vma_coredump(gt->_gt, uc->guc.log.vma,
1742 "GuC log buffer", compress);
1743 error_uc->guc.vma_ctb = create_vma_coredump(gt->_gt, uc->guc.ct.vma,
1744 "GuC CT buffer", compress);
1745 error_uc->guc.last_fence = uc->guc.ct.requests.last_fence;
1746 gt_record_guc_ctb(error_uc->guc.ctb + 0, &uc->guc.ct.ctbs.send,
1747 uc->guc.ct.ctbs.send.desc, (struct intel_guc *)&uc->guc);
1748 gt_record_guc_ctb(error_uc->guc.ctb + 1, &uc->guc.ct.ctbs.recv,
1749 uc->guc.ct.ctbs.send.desc, (struct intel_guc *)&uc->guc);
742379c0
CW
1750
1751 return error_uc;
1752}
1753
a6f0f9cf
AP
1754/* Capture display registers. */
1755static void gt_record_display_regs(struct intel_gt_coredump *gt)
1756{
1757 struct intel_uncore *uncore = gt->_gt->uncore;
1758 struct drm_i915_private *i915 = uncore->i915;
1759
1760 if (GRAPHICS_VER(i915) >= 6)
1761 gt->derrmr = intel_uncore_read(uncore, DERRMR);
1762
1763 if (GRAPHICS_VER(i915) >= 8)
1764 gt->ier = intel_uncore_read(uncore, GEN8_DE_MISC_IER);
1765 else if (IS_VALLEYVIEW(i915))
1766 gt->ier = intel_uncore_read(uncore, VLV_IER);
1767 else if (HAS_PCH_SPLIT(i915))
1768 gt->ier = intel_uncore_read(uncore, DEIER);
1769 else if (GRAPHICS_VER(i915) == 2)
1770 gt->ier = intel_uncore_read16(uncore, GEN2_IER);
1771 else
1772 gt->ier = intel_uncore_read(uncore, GEN2_IER);
1773}
1774
1775/* Capture all other registers that GuC doesn't capture. */
1776static void gt_record_global_nonguc_regs(struct intel_gt_coredump *gt)
1777{
1778 struct intel_uncore *uncore = gt->_gt->uncore;
1779 struct drm_i915_private *i915 = uncore->i915;
1780 int i;
1781
1782 if (IS_VALLEYVIEW(i915)) {
1783 gt->gtier[0] = intel_uncore_read(uncore, GTIER);
1784 gt->ngtier = 1;
1785 } else if (GRAPHICS_VER(i915) >= 11) {
1786 gt->gtier[0] =
1787 intel_uncore_read(uncore,
1788 GEN11_RENDER_COPY_INTR_ENABLE);
1789 gt->gtier[1] =
1790 intel_uncore_read(uncore, GEN11_VCS_VECS_INTR_ENABLE);
1791 gt->gtier[2] =
1792 intel_uncore_read(uncore, GEN11_GUC_SG_INTR_ENABLE);
1793 gt->gtier[3] =
1794 intel_uncore_read(uncore,
1795 GEN11_GPM_WGBOXPERF_INTR_ENABLE);
1796 gt->gtier[4] =
1797 intel_uncore_read(uncore,
1798 GEN11_CRYPTO_RSVD_INTR_ENABLE);
1799 gt->gtier[5] =
1800 intel_uncore_read(uncore,
1801 GEN11_GUNIT_CSME_INTR_ENABLE);
1802 gt->ngtier = 6;
1803 } else if (GRAPHICS_VER(i915) >= 8) {
1804 for (i = 0; i < 4; i++)
1805 gt->gtier[i] =
1806 intel_uncore_read(uncore, GEN8_GT_IER(i));
1807 gt->ngtier = 4;
1808 } else if (HAS_PCH_SPLIT(i915)) {
1809 gt->gtier[0] = intel_uncore_read(uncore, GTIER);
1810 gt->ngtier = 1;
1811 }
1812
1813 gt->eir = intel_uncore_read(uncore, EIR);
1814 gt->pgtbl_er = intel_uncore_read(uncore, PGTBL_ER);
1815}
1816
1817/*
1818 * Capture all registers that relate to workload submission.
1819 * NOTE: In GuC submission, when GuC resets an engine, it can dump these for us
1820 */
1821static void gt_record_global_regs(struct intel_gt_coredump *gt)
84734a04 1822{
742379c0
CW
1823 struct intel_uncore *uncore = gt->_gt->uncore;
1824 struct drm_i915_private *i915 = uncore->i915;
885ea5a8 1825 int i;
84734a04 1826
742379c0
CW
1827 /*
1828 * General organization
654c90c6
BW
1829 * 1. Registers specific to a single generation
1830 * 2. Registers which belong to multiple generations
1831 * 3. Feature specific registers.
1832 * 4. Everything else
1833 * Please try to follow the order.
1834 */
84734a04 1835
654c90c6 1836 /* 1: Registers specific to a single generation */
a6f0f9cf 1837 if (IS_VALLEYVIEW(i915))
742379c0 1838 gt->forcewake = intel_uncore_read_fw(uncore, FORCEWAKE_VLV);
84734a04 1839
651e7d48 1840 if (GRAPHICS_VER(i915) == 7)
742379c0 1841 gt->err_int = intel_uncore_read(uncore, GEN7_ERR_INT);
84734a04 1842
ab1b2d40
MR
1843 if (GRAPHICS_VER_FULL(i915) >= IP_VER(12, 50)) {
1844 gt->fault_data0 = intel_gt_mcr_read_any((struct intel_gt *)gt->_gt,
1845 XEHP_FAULT_TLB_DATA0);
1846 gt->fault_data1 = intel_gt_mcr_read_any((struct intel_gt *)gt->_gt,
1847 XEHP_FAULT_TLB_DATA1);
1848 } else if (GRAPHICS_VER(i915) >= 12) {
742379c0
CW
1849 gt->fault_data0 = intel_uncore_read(uncore,
1850 GEN12_FAULT_TLB_DATA0);
1851 gt->fault_data1 = intel_uncore_read(uncore,
1852 GEN12_FAULT_TLB_DATA1);
651e7d48 1853 } else if (GRAPHICS_VER(i915) >= 8) {
742379c0
CW
1854 gt->fault_data0 = intel_uncore_read(uncore,
1855 GEN8_FAULT_TLB_DATA0);
1856 gt->fault_data1 = intel_uncore_read(uncore,
1857 GEN8_FAULT_TLB_DATA1);
6c826f34
MK
1858 }
1859
651e7d48 1860 if (GRAPHICS_VER(i915) == 6) {
742379c0
CW
1861 gt->forcewake = intel_uncore_read_fw(uncore, FORCEWAKE);
1862 gt->gab_ctl = intel_uncore_read(uncore, GAB_CTL);
1863 gt->gfx_mode = intel_uncore_read(uncore, GFX_MODE);
91ec5d11 1864 }
84734a04 1865
654c90c6 1866 /* 2: Registers which belong to multiple generations */
651e7d48 1867 if (GRAPHICS_VER(i915) >= 7)
742379c0 1868 gt->forcewake = intel_uncore_read_fw(uncore, FORCEWAKE_MT);
84734a04 1869
651e7d48 1870 if (GRAPHICS_VER(i915) >= 6) {
651e7d48 1871 if (GRAPHICS_VER(i915) < 12) {
742379c0
CW
1872 gt->error = intel_uncore_read(uncore, ERROR_GEN6);
1873 gt->done_reg = intel_uncore_read(uncore, DONE_REG);
23dea051 1874 }
84734a04
MK
1875 }
1876
654c90c6 1877 /* 3: Feature specific registers */
651e7d48 1878 if (IS_GRAPHICS_VER(i915, 6, 7)) {
742379c0
CW
1879 gt->gam_ecochk = intel_uncore_read(uncore, GAM_ECOCHK);
1880 gt->gac_eco = intel_uncore_read(uncore, GAC_ECO_BITS);
91ec5d11
BW
1881 }
1882
651e7d48 1883 if (IS_GRAPHICS_VER(i915, 8, 11))
742379c0 1884 gt->gtt_cache = intel_uncore_read(uncore, HSW_GTT_CACHE_EN);
fd521d3b 1885
651e7d48 1886 if (GRAPHICS_VER(i915) == 12)
742379c0 1887 gt->aux_err = intel_uncore_read(uncore, GEN12_AUX_ERR_DBG);
ba1d18e3 1888
651e7d48 1889 if (GRAPHICS_VER(i915) >= 12) {
239bbb2f 1890 for (i = 0; i < I915_MAX_SFC; i++) {
24d032e2
MR
1891 /*
1892 * SFC_DONE resides in the VD forcewake domain, so it
1893 * only exists if the corresponding VCS engine is
1894 * present.
1895 */
45f63790
MR
1896 if ((gt->_gt->info.sfc_mask & BIT(i)) == 0 ||
1897 !HAS_ENGINE(gt->_gt, _VCS(i * 2)))
24d032e2
MR
1898 continue;
1899
742379c0 1900 gt->sfc_done[i] =
e50dbdbf
MK
1901 intel_uncore_read(uncore, GEN12_SFC_DONE(i));
1902 }
811bb3db 1903
742379c0 1904 gt->gam_done = intel_uncore_read(uncore, GEN12_GAM_DONE);
e50dbdbf 1905 }
742379c0
CW
1906}
1907
792592e7
DCS
1908static void gt_record_info(struct intel_gt_coredump *gt)
1909{
1910 memcpy(&gt->info, &gt->_gt->info, sizeof(struct intel_gt_info));
368d179a
JH
1911 gt->clock_frequency = gt->_gt->clock_frequency;
1912 gt->clock_period_ns = gt->_gt->clock_period_ns;
792592e7
DCS
1913}
1914
742379c0
CW
1915/*
1916 * Generate a semi-unique error code. The code is not meant to have meaning, The
1917 * code's only purpose is to try to prevent false duplicated bug reports by
1918 * grossly estimating a GPU error state.
1919 *
1920 * TODO Ideally, hashing the batchbuffer would be a very nice way to determine
1921 * the hang if we could strip the GTT offset information from it.
1922 *
1923 * It's only a small step better than a random number in its current form.
1924 */
1925static u32 generate_ecode(const struct intel_engine_coredump *ee)
1926{
1927 /*
1928 * IPEHR would be an ideal way to detect errors, as it's the gross
1929 * measure of "the command that hung." However, has some very common
1930 * synchronization commands which almost always appear in the case
1931 * strictly a client bug. Use instdone to differentiate those some.
1932 */
1933 return ee ? ee->ipehr ^ ee->instdone.instdone : 0;
1d762aad
BW
1934}
1935
742379c0 1936static const char *error_msg(struct i915_gpu_coredump *error)
cb383002 1937{
742379c0 1938 struct intel_engine_coredump *first = NULL;
2dae0c85 1939 unsigned int hung_classes = 0;
742379c0 1940 struct intel_gt_coredump *gt;
eb8d0f5a 1941 int len;
cb383002 1942
742379c0
CW
1943 for (gt = error->gt; gt; gt = gt->next) {
1944 struct intel_engine_coredump *cs;
1945
bda30024
TU
1946 for (cs = gt->engine; cs; cs = cs->next) {
1947 if (cs->hung) {
2dae0c85 1948 hung_classes |= BIT(cs->engine->uabi_class);
bda30024
TU
1949 if (!first)
1950 first = cs;
1951 }
1952 }
742379c0
CW
1953 }
1954
58174462 1955 len = scnprintf(error->error_msg, sizeof(error->error_msg),
742379c0 1956 "GPU HANG: ecode %d:%x:%08x",
651e7d48 1957 GRAPHICS_VER(error->i915), hung_classes,
742379c0 1958 generate_ecode(first));
29baf3ae 1959 if (first && first->context.pid) {
eb8d0f5a 1960 /* Just show the first executing process, more is confusing */
58174462
MK
1961 len += scnprintf(error->error_msg + len,
1962 sizeof(error->error_msg) - len,
1963 ", in %s [%d]",
742379c0 1964 first->context.comm, first->context.pid);
eb8d0f5a 1965 }
58174462 1966
eb8d0f5a 1967 return error->error_msg;
cb383002
MK
1968}
1969
742379c0 1970static void capture_gen(struct i915_gpu_coredump *error)
48b031e3 1971{
53b725c7
DCS
1972 struct drm_i915_private *i915 = error->i915;
1973
53b725c7
DCS
1974 error->wakelock = atomic_read(&i915->runtime_pm.wakeref_count);
1975 error->suspended = i915->runtime_pm.suspended;
f73b5674 1976
a7f46d5b 1977 error->iommu = i915_vtd_active(i915);
53b725c7
DCS
1978 error->reset_count = i915_reset_count(&i915->gpu_error);
1979 error->suspend_count = i915->suspend_count;
2bd160a1 1980
8a25c4be 1981 i915_params_copy(&error->params, &i915->params);
2bd160a1 1982 memcpy(&error->device_info,
53b725c7 1983 INTEL_INFO(i915),
2bd160a1 1984 sizeof(error->device_info));
0258404f
JN
1985 memcpy(&error->runtime_info,
1986 RUNTIME_INFO(i915),
1987 sizeof(error->runtime_info));
4ae7eb92
JN
1988 memcpy(&error->display_device_info, DISPLAY_INFO(i915),
1989 sizeof(error->display_device_info));
1990 memcpy(&error->display_runtime_info, DISPLAY_RUNTIME_INFO(i915),
1991 sizeof(error->display_runtime_info));
53b725c7 1992 error->driver_caps = i915->caps;
48b031e3
MK
1993}
1994
742379c0
CW
1995struct i915_gpu_coredump *
1996i915_gpu_coredump_alloc(struct drm_i915_private *i915, gfp_t gfp)
84a20a8a 1997{
742379c0
CW
1998 struct i915_gpu_coredump *error;
1999
8a25c4be 2000 if (!i915->params.error_capture)
742379c0
CW
2001 return NULL;
2002
2003 error = kzalloc(sizeof(*error), gfp);
2004 if (!error)
2005 return NULL;
2006
2007 kref_init(&error->ref);
2008 error->i915 = i915;
2009
2010 error->time = ktime_get_real();
2011 error->boottime = ktime_get_boottime();
2cbc876d 2012 error->uptime = ktime_sub(ktime_get(), to_gt(i915)->last_init_time);
742379c0
CW
2013 error->capture = jiffies;
2014
2015 capture_gen(error);
2016
2017 return error;
84a20a8a
MW
2018}
2019
/* Convert a number of days into seconds. */
#define DAY_AS_SECONDS(x) ((x) * 24 * 60 * 60)
2021
2022struct intel_gt_coredump *
a6f0f9cf 2023intel_gt_coredump_alloc(struct intel_gt *gt, gfp_t gfp, u32 dump_flags)
8f5c6fe4 2024{
742379c0 2025 struct intel_gt_coredump *gc;
8f5c6fe4 2026
742379c0
CW
2027 gc = kzalloc(sizeof(*gc), gfp);
2028 if (!gc)
2029 return NULL;
2030
2031 gc->_gt = gt;
2032 gc->awake = intel_gt_pm_is_awake(gt);
2033
a6f0f9cf
AP
2034 gt_record_display_regs(gc);
2035 gt_record_global_nonguc_regs(gc);
2036
2037 /*
2038 * GuC dumps global, eng-class and eng-instance registers
2039 * (that can change as part of engine state during execution)
2040 * before an engine is reset due to a hung context.
2041 * GuC captures and reports all three groups of registers
2042 * together as a single set before the engine is reset.
2043 * Thus, if GuC triggered the context reset we retrieve
2044 * the register values as part of gt_record_engines.
2045 */
2046 if (!(dump_flags & CORE_DUMP_FLAG_IS_GUC_CAPTURE))
2047 gt_record_global_regs(gc);
2048
742379c0
CW
2049 gt_record_fences(gc);
2050
2051 return gc;
2052}
895d8ebe 2053
742379c0
CW
2054struct i915_vma_compress *
2055i915_vma_capture_prepare(struct intel_gt_coredump *gt)
2056{
2057 struct i915_vma_compress *compress;
2058
2059 compress = kmalloc(sizeof(*compress), ALLOW_FAIL);
2060 if (!compress)
2061 return NULL;
2062
2063 if (!compress_init(compress)) {
2064 kfree(compress);
2065 return NULL;
895d8ebe 2066 }
742379c0 2067
742379c0 2068 return compress;
8f5c6fe4
CW
2069}
2070
/*
 * Tear down and free compression state obtained from
 * i915_vma_capture_prepare(). A NULL @compress is ignored; @gt is not used
 * by this function.
 */
void i915_vma_capture_finish(struct intel_gt_coredump *gt,
			     struct i915_vma_compress *compress)
{
	if (compress) {
		compress_fini(compress);
		kfree(compress);
	}
}
2080
ff20afc4 2081static struct i915_gpu_coredump *
a6f0f9cf 2082__i915_gpu_coredump(struct intel_gt *gt, intel_engine_mask_t engine_mask, u32 dump_flags)
5a4c6f1b 2083{
bda30024 2084 struct drm_i915_private *i915 = gt->i915;
742379c0 2085 struct i915_gpu_coredump *error;
5a4c6f1b 2086
e6154e4c
CW
2087 /* Check if GPU capture has been disabled */
2088 error = READ_ONCE(i915->gpu_error.first_error);
2089 if (IS_ERR(error))
2090 return error;
2091
742379c0
CW
2092 error = i915_gpu_coredump_alloc(i915, ALLOW_FAIL);
2093 if (!error)
e6154e4c 2094 return ERR_PTR(-ENOMEM);
5a4c6f1b 2095
a6f0f9cf 2096 error->gt = intel_gt_coredump_alloc(gt, ALLOW_FAIL, dump_flags);
742379c0
CW
2097 if (error->gt) {
2098 struct i915_vma_compress *compress;
3bdd4f84 2099
742379c0
CW
2100 compress = i915_vma_capture_prepare(error->gt);
2101 if (!compress) {
2102 kfree(error->gt);
2103 kfree(error);
2104 return ERR_PTR(-ENOMEM);
2105 }
5a4c6f1b 2106
39921e5f 2107 if (INTEL_INFO(i915)->has_gt_uc) {
a6f0f9cf
AP
2108 error->gt->uc = gt_record_uc(error->gt, compress);
2109 if (error->gt->uc) {
2110 if (dump_flags & CORE_DUMP_FLAG_IS_GUC_CAPTURE)
c5de70f6 2111 error->gt->uc->guc.is_guc_capture = true;
a6f0f9cf 2112 else
c5de70f6 2113 GEM_BUG_ON(error->gt->uc->guc.is_guc_capture);
a6f0f9cf
AP
2114 }
2115 }
2116
792592e7 2117 gt_record_info(error->gt);
a6f0f9cf 2118 gt_record_engines(error->gt, engine_mask, compress, dump_flags);
742379c0 2119
3bdd4f84 2120
742379c0
CW
2121 i915_vma_capture_finish(error->gt, compress);
2122
2123 error->simulated |= error->gt->simulated;
2124 }
3bdd4f84
CW
2125
2126 error->overlay = intel_overlay_capture_error_state(i915);
3bdd4f84 2127
5a4c6f1b
CW
2128 return error;
2129}
2130
ff20afc4 2131struct i915_gpu_coredump *
a6f0f9cf 2132i915_gpu_coredump(struct intel_gt *gt, intel_engine_mask_t engine_mask, u32 dump_flags)
ff20afc4
TH
2133{
2134 static DEFINE_MUTEX(capture_mutex);
2135 int ret = mutex_lock_interruptible(&capture_mutex);
2136 struct i915_gpu_coredump *dump;
2137
2138 if (ret)
2139 return ERR_PTR(ret);
2140
a6f0f9cf 2141 dump = __i915_gpu_coredump(gt, engine_mask, dump_flags);
ff20afc4
TH
2142 mutex_unlock(&capture_mutex);
2143
2144 return dump;
2145}
2146
742379c0 2147void i915_error_state_store(struct i915_gpu_coredump *error)
1d762aad 2148{
742379c0 2149 struct drm_i915_private *i915;
53a4c6b2 2150 static bool warned;
1d762aad 2151
742379c0 2152 if (IS_ERR_OR_NULL(error))
98a2f411
CW
2153 return;
2154
742379c0 2155 i915 = error->i915;
58f44aad 2156 drm_info(&i915->drm, "%s\n", error_msg(error));
9777cca0 2157
742379c0
CW
2158 if (error->simulated ||
2159 cmpxchg(&i915->gpu_error.first_error, NULL, error))
1d762aad 2160 return;
1d762aad 2161
742379c0 2162 i915_gpu_coredump_get(error);
cb383002 2163
a1e37b02 2164 if (!xchg(&warned, true) &&
eafc4894 2165 ktime_get_real_seconds() - DRIVER_TIMESTAMP < DAY_AS_SECONDS(180)) {
88f8065c 2166 pr_info("GPU hangs can indicate a bug anywhere in the entire gfx stack, including userspace.\n");
ddae4d7a
JN
2167 pr_info("Please file a _new_ bug report at https://gitlab.freedesktop.org/drm/intel/issues/new.\n");
2168 pr_info("Please see https://gitlab.freedesktop.org/drm/intel/-/wikis/How-to-file-i915-bugs for details.\n");
88f8065c
CW
2169 pr_info("drm/i915 developers can then reassign to the right component if it's not a kernel issue.\n");
2170 pr_info("The GPU crash dump is required to analyze GPU hangs, so please always attach it.\n");
2171 pr_info("GPU crash dump saved to /sys/class/drm/card%d/error\n",
2172 i915->drm.primary->index);
cb383002 2173 }
84734a04
MK
2174}
2175
742379c0
CW
2176/**
2177 * i915_capture_error_state - capture an error record for later analysis
bda30024
TU
2178 * @gt: intel_gt which originated the hang
2179 * @engine_mask: hung engines
88629fee 2180 * @dump_flags: dump flags
742379c0
CW
2181 *
2182 * Should be called when an error is detected (either a hang or an error
2183 * interrupt) to capture error state from the time of the error. Fills
2184 * out a structure which becomes available in debugfs for user level tools
2185 * to pick up.
2186 */
bda30024 2187void i915_capture_error_state(struct intel_gt *gt,
a6f0f9cf 2188 intel_engine_mask_t engine_mask, u32 dump_flags)
742379c0
CW
2189{
2190 struct i915_gpu_coredump *error;
2191
a6f0f9cf 2192 error = i915_gpu_coredump(gt, engine_mask, dump_flags);
742379c0 2193 if (IS_ERR(error)) {
bda30024 2194 cmpxchg(&gt->i915->gpu_error.first_error, NULL, error);
742379c0
CW
2195 return;
2196 }
2197
2198 i915_error_state_store(error);
2199 i915_gpu_coredump_put(error);
2200}
2201
2202struct i915_gpu_coredump *
5a4c6f1b 2203i915_first_error_state(struct drm_i915_private *i915)
84734a04 2204{
742379c0 2205 struct i915_gpu_coredump *error;
84734a04 2206
5a4c6f1b
CW
2207 spin_lock_irq(&i915->gpu_error.lock);
2208 error = i915->gpu_error.first_error;
e6154e4c 2209 if (!IS_ERR_OR_NULL(error))
742379c0 2210 i915_gpu_coredump_get(error);
5a4c6f1b 2211 spin_unlock_irq(&i915->gpu_error.lock);
84734a04 2212
5a4c6f1b 2213 return error;
84734a04
MK
2214}
2215
5a4c6f1b 2216void i915_reset_error_state(struct drm_i915_private *i915)
84734a04 2217{
742379c0 2218 struct i915_gpu_coredump *error;
84734a04 2219
5a4c6f1b
CW
2220 spin_lock_irq(&i915->gpu_error.lock);
2221 error = i915->gpu_error.first_error;
e6154e4c
CW
2222 if (error != ERR_PTR(-ENODEV)) /* if disabled, always disabled */
2223 i915->gpu_error.first_error = NULL;
5a4c6f1b 2224 spin_unlock_irq(&i915->gpu_error.lock);
84734a04 2225
e6154e4c 2226 if (!IS_ERR_OR_NULL(error))
742379c0 2227 i915_gpu_coredump_put(error);
fb6f0b64
CW
2228}
2229
2230void i915_disable_error_state(struct drm_i915_private *i915, int err)
2231{
2232 spin_lock_irq(&i915->gpu_error.lock);
2233 if (!i915->gpu_error.first_error)
2234 i915->gpu_error.first_error = ERR_PTR(err);
2235 spin_unlock_irq(&i915->gpu_error.lock);
84734a04 2236}
6197cff3
JH
2237
2238#if IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM)
2239void intel_klog_error_capture(struct intel_gt *gt,
2240 intel_engine_mask_t engine_mask)
2241{
2242 static int g_count;
2243 struct drm_i915_private *i915 = gt->i915;
2244 struct i915_gpu_coredump *error;
2245 intel_wakeref_t wakeref;
2246 size_t buf_size = PAGE_SIZE * 128;
2247 size_t pos_err;
2248 char *buf, *ptr, *next;
2249 int l_count = g_count++;
2250 int line = 0;
2251
2252 /* Can't allocate memory during a reset */
2253 if (test_bit(I915_RESET_BACKOFF, &gt->reset.flags)) {
2254 drm_err(&gt->i915->drm, "[Capture/%d.%d] Inside GT reset, skipping error capture :(\n",
2255 l_count, line++);
2256 return;
2257 }
2258
2259 error = READ_ONCE(i915->gpu_error.first_error);
2260 if (error) {
2261 drm_err(&i915->drm, "[Capture/%d.%d] Clearing existing error capture first...\n",
2262 l_count, line++);
2263 i915_reset_error_state(i915);
2264 }
2265
2266 with_intel_runtime_pm(&i915->runtime_pm, wakeref)
2267 error = i915_gpu_coredump(gt, engine_mask, CORE_DUMP_FLAG_NONE);
2268
2269 if (IS_ERR(error)) {
2270 drm_err(&i915->drm, "[Capture/%d.%d] Failed to capture error capture: %ld!\n",
2271 l_count, line++, PTR_ERR(error));
2272 return;
2273 }
2274
2275 buf = kvmalloc(buf_size, GFP_KERNEL);
2276 if (!buf) {
2277 drm_err(&i915->drm, "[Capture/%d.%d] Failed to allocate buffer for error capture!\n",
2278 l_count, line++);
2279 i915_gpu_coredump_put(error);
2280 return;
2281 }
2282
2283 drm_info(&i915->drm, "[Capture/%d.%d] Dumping i915 error capture for %ps...\n",
2284 l_count, line++, __builtin_return_address(0));
2285
2286 /* Largest string length safe to print via dmesg */
2287# define MAX_CHUNK 800
2288
2289 pos_err = 0;
2290 while (1) {
2291 ssize_t got = i915_gpu_coredump_copy_to_buffer(error, buf, pos_err, buf_size - 1);
2292
2293 if (got <= 0)
2294 break;
2295
2296 buf[got] = 0;
2297 pos_err += got;
2298
2299 ptr = buf;
2300 while (got > 0) {
2301 size_t count;
2302 char tag[2];
2303
2304 next = strnchr(ptr, got, '\n');
2305 if (next) {
2306 count = next - ptr;
2307 *next = 0;
2308 tag[0] = '>';
2309 tag[1] = '<';
2310 } else {
2311 count = got;
2312 tag[0] = '}';
2313 tag[1] = '{';
2314 }
2315
2316 if (count > MAX_CHUNK) {
2317 size_t pos;
2318 char *ptr2 = ptr;
2319
2320 for (pos = MAX_CHUNK; pos < count; pos += MAX_CHUNK) {
2321 char chr = ptr[pos];
2322
2323 ptr[pos] = 0;
2324 drm_info(&i915->drm, "[Capture/%d.%d] }%s{\n",
2325 l_count, line++, ptr2);
2326 ptr[pos] = chr;
2327 ptr2 = ptr + pos;
2328
2329 /*
2330 * If spewing large amounts of data via a serial console,
2331 * this can be a very slow process. So be friendly and try
2332 * not to cause 'softlockup on CPU' problems.
2333 */
2334 cond_resched();
2335 }
2336
2337 if (ptr2 < (ptr + count))
2338 drm_info(&i915->drm, "[Capture/%d.%d] %c%s%c\n",
2339 l_count, line++, tag[0], ptr2, tag[1]);
2340 else if (tag[0] == '>')
2341 drm_info(&i915->drm, "[Capture/%d.%d] ><\n",
2342 l_count, line++);
2343 } else {
2344 drm_info(&i915->drm, "[Capture/%d.%d] %c%s%c\n",
2345 l_count, line++, tag[0], ptr, tag[1]);
2346 }
2347
2348 ptr = next;
2349 got -= count;
2350 if (next) {
2351 ptr++;
2352 got--;
2353 }
2354
2355 /* As above. */
2356 cond_resched();
2357 }
2358
2359 if (got)
2360 drm_info(&i915->drm, "[Capture/%d.%d] Got %zd bytes remaining!\n",
2361 l_count, line++, got);
2362 }
2363
2364 kvfree(buf);
2365
2366 drm_info(&i915->drm, "[Capture/%d.%d] Dumped %zd bytes\n", l_count, line++, pos_err);
2367}
2368#endif