/*
 * Copyright 2018 Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 *
 *
 */
#include <linux/debugfs.h>
#include <linux/list.h>
#include <linux/module.h>
#include <linux/uaccess.h>
#include <linux/reboot.h>
#include <linux/syscalls.h>

#include "amdgpu.h"
#include "amdgpu_ras.h"
#include "amdgpu_atomfirmware.h"
#include "amdgpu_xgmi.h"
#include "ivsrcid/nbio/irqsrcs_nbif_7_4.h"
const char *ras_error_string[] = {
	"none",
	"parity",
	"single_correctable",
	"multi_uncorrectable",
	"poison",
};

const char *ras_block_string[] = {
	"umc",
	"sdma",
	"gfx",
	"mmhub",
	"athub",
	"pcie_bif",
	"hdp",
	"xgmi_wafl",
	"df",
	"smn",
	"sem",
	"mp0",
	"mp1",
	"fuse",
};

#define ras_err_str(i) (ras_error_string[ffs(i)])
#define ras_block_str(i) (ras_block_string[i])

#define AMDGPU_RAS_FLAG_INIT_BY_VBIOS		1
#define AMDGPU_RAS_FLAG_INIT_NEED_RESET		2
#define RAS_DEFAULT_FLAGS (AMDGPU_RAS_FLAG_INIT_BY_VBIOS)

/* inject address is 52 bits */
#define RAS_UMC_INJECT_ADDR_LIMIT	(0x1ULL << 52)

enum amdgpu_ras_retire_page_reservation {
	AMDGPU_RAS_RETIRE_PAGE_RESERVED,
	AMDGPU_RAS_RETIRE_PAGE_PENDING,
	AMDGPU_RAS_RETIRE_PAGE_FAULT,
};

atomic_t amdgpu_ras_in_intr = ATOMIC_INIT(0);

static bool amdgpu_ras_check_bad_page(struct amdgpu_device *adev,
				uint64_t addr);

void amdgpu_ras_set_error_query_ready(struct amdgpu_device *adev, bool ready)
{
	if (adev)
		amdgpu_ras_get_context(adev)->error_query_ready = ready;
}

bool amdgpu_ras_get_error_query_ready(struct amdgpu_device *adev)
{
	if (adev)
		return amdgpu_ras_get_context(adev)->error_query_ready;

	return false;
}

static ssize_t amdgpu_ras_debugfs_read(struct file *f, char __user *buf,
					size_t size, loff_t *pos)
{
	struct ras_manager *obj = (struct ras_manager *)file_inode(f)->i_private;
	struct ras_query_if info = {
		.head = obj->head,
	};
	ssize_t s;
	char val[128];

	if (amdgpu_ras_error_query(obj->adev, &info))
		return -EINVAL;

	s = snprintf(val, sizeof(val), "%s: %lu\n%s: %lu\n",
			"ue", info.ue_count,
			"ce", info.ce_count);
	if (*pos >= s)
		return 0;

	s -= *pos;
	s = min_t(u64, s, size);

	if (copy_to_user(buf, &val[*pos], s))
		return -EINVAL;

	*pos += s;

	return s;
}

static const struct file_operations amdgpu_ras_debugfs_ops = {
	.owner = THIS_MODULE,
	.read = amdgpu_ras_debugfs_read,
	.write = NULL,
	.llseek = default_llseek
};

static int amdgpu_ras_find_block_id_by_name(const char *name, int *block_id)
{
	int i;

	for (i = 0; i < ARRAY_SIZE(ras_block_string); i++) {
		*block_id = i;
		if (strcmp(name, ras_block_str(i)) == 0)
			return 0;
	}
	return -EINVAL;
}

static int amdgpu_ras_debugfs_ctrl_parse_data(struct file *f,
		const char __user *buf, size_t size,
		loff_t *pos, struct ras_debug_if *data)
{
	ssize_t s = min_t(u64, 64, size);
	char str[65];
	char block_name[33];
	char err[9] = "ue";
	int op = -1;
	int block_id;
	uint32_t sub_block;
	u64 address, value;

	if (*pos)
		return -EINVAL;
	*pos = size;

	memset(str, 0, sizeof(str));
	memset(data, 0, sizeof(*data));

	if (copy_from_user(str, buf, s))
		return -EINVAL;

	if (sscanf(str, "disable %32s", block_name) == 1)
		op = 0;
	else if (sscanf(str, "enable %32s %8s", block_name, err) == 2)
		op = 1;
	else if (sscanf(str, "inject %32s %8s", block_name, err) == 2)
		op = 2;
	else if (str[0] && str[1] && str[2] && str[3])
		/* ascii string, but commands are not matched. */
		return -EINVAL;

	if (op != -1) {
		if (amdgpu_ras_find_block_id_by_name(block_name, &block_id))
			return -EINVAL;

		data->head.block = block_id;
		/* only ue and ce errors are supported */
		if (!memcmp("ue", err, 2))
			data->head.type = AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE;
		else if (!memcmp("ce", err, 2))
			data->head.type = AMDGPU_RAS_ERROR__SINGLE_CORRECTABLE;
		else
			return -EINVAL;

		data->op = op;

		if (op == 2) {
			if (sscanf(str, "%*s %*s %*s %u %llu %llu",
						&sub_block, &address, &value) != 3)
				if (sscanf(str, "%*s %*s %*s 0x%x 0x%llx 0x%llx",
							&sub_block, &address, &value) != 3)
					return -EINVAL;
			data->head.sub_block_index = sub_block;
			data->inject.address = address;
			data->inject.value = value;
		}
	} else {
		if (size < sizeof(*data))
			return -EINVAL;

		if (copy_from_user(data, buf, sizeof(*data)))
			return -EINVAL;
	}

	return 0;
}

/**
 * DOC: AMDGPU RAS debugfs control interface
 *
 * It accepts a struct ras_debug_if which has two members.
 *
 * First member: ras_debug_if::head or ras_debug_if::inject.
 *
 * head is used to indicate which IP block will be under control.
 *
 * head has four members, they are block, type, sub_block_index, name.
 * block: which IP will be under control.
 * type: what kind of error will be enabled/disabled/injected.
 * sub_block_index: some IPs have subcomponents, say, GFX, sDMA.
 * name: the name of the IP.
 *
 * inject has two more members than head, they are address, value.
 * As their names indicate, the inject operation will write the
 * value to the address.
 *
 * The second member: struct ras_debug_if::op.
 * It has three kinds of operations.
 *
 * - 0: disable RAS on the block. Take ::head as its data.
 * - 1: enable RAS on the block. Take ::head as its data.
 * - 2: inject errors on the block. Take ::inject as its data.
 *
 * How to use the interface?
 *
 * Programs
 *
 * Copy the struct ras_debug_if into your code and initialize it.
 * Write the struct to the control node (a user-space sketch follows
 * this comment).
 *
 * Shells
 *
 * .. code-block:: bash
 *
 *	echo op block [error [sub_block address value]] > .../ras/ras_ctrl
 *
 * Parameters:
 *
 * op: disable, enable, inject
 *	disable: only block is needed
 *	enable: block and error are needed
 *	inject: error, address, value are needed
 * block: umc, sdma, gfx, .........
 *	see ras_block_string[] for details
 * error: ue, ce
 *	ue: multi_uncorrectable
 *	ce: single_correctable
 * sub_block:
 *	sub block index, pass 0 if there is no sub block
 *
 * here are some examples for bash commands:
 *
 * .. code-block:: bash
 *
 *	echo inject umc ue 0x0 0x0 0x0 > /sys/kernel/debug/dri/0/ras/ras_ctrl
 *	echo inject umc ce 0 0 0 > /sys/kernel/debug/dri/0/ras/ras_ctrl
 *	echo disable umc > /sys/kernel/debug/dri/0/ras/ras_ctrl
 *
 * How to check the result?
 *
 * For disable/enable, please check the ras features at
 * /sys/class/drm/card[0/1/2...]/device/ras/features
 *
 * For inject, please check the corresponding err count at
 * /sys/class/drm/card[0/1/2...]/device/ras/[gfx/sdma/...]_err_count
 *
 * .. note::
 *	Operations are only allowed on blocks which are supported.
 *	Please check the ras mask at /sys/module/amdgpu/parameters/ras_mask
 *	to see which blocks support RAS on a particular asic.
 *
 */
static ssize_t amdgpu_ras_debugfs_ctrl_write(struct file *f, const char __user *buf,
		size_t size, loff_t *pos)
{
	struct amdgpu_device *adev = (struct amdgpu_device *)file_inode(f)->i_private;
	struct ras_debug_if data;
	int ret = 0;

	if (!amdgpu_ras_get_error_query_ready(adev)) {
		DRM_WARN("RAS WARN: error injection currently inaccessible\n");
		return size;
	}

	ret = amdgpu_ras_debugfs_ctrl_parse_data(f, buf, size, pos, &data);
	if (ret)
		return -EINVAL;

	if (!amdgpu_ras_is_supported(adev, data.head.block))
		return -EINVAL;

	switch (data.op) {
	case 0:
		ret = amdgpu_ras_feature_enable(adev, &data.head, 0);
		break;
	case 1:
		ret = amdgpu_ras_feature_enable(adev, &data.head, 1);
		break;
	case 2:
		if ((data.inject.address >= adev->gmc.mc_vram_size) ||
		    (data.inject.address >= RAS_UMC_INJECT_ADDR_LIMIT)) {
			ret = -EINVAL;
			break;
		}

		/* umc ce/ue error injection for a bad page is not allowed */
		if ((data.head.block == AMDGPU_RAS_BLOCK__UMC) &&
		    amdgpu_ras_check_bad_page(adev, data.inject.address)) {
			DRM_WARN("RAS WARN: 0x%llx has been marked as bad before error injection!\n",
					data.inject.address);
			break;
		}

		/* data.inject.address is offset instead of absolute gpu address */
		ret = amdgpu_ras_error_inject(adev, &data.inject);
		break;
	default:
		ret = -EINVAL;
		break;
	}

	if (ret)
		return -EINVAL;

	return size;
}

/**
 * DOC: AMDGPU RAS debugfs EEPROM table reset interface
 *
 * Some boards contain an EEPROM which is used to persistently store a list of
 * bad pages which experienced ECC errors in vram. This interface provides
 * a way to reset the EEPROM, e.g., after testing error injection.
 *
 * Usage:
 *
 * .. code-block:: bash
 *
 *	echo 1 > ../ras/ras_eeprom_reset
 *
 * will reset the EEPROM table to 0 entries.
 *
 */
static ssize_t amdgpu_ras_debugfs_eeprom_write(struct file *f, const char __user *buf,
		size_t size, loff_t *pos)
{
	struct amdgpu_device *adev = (struct amdgpu_device *)file_inode(f)->i_private;
	int ret;

	ret = amdgpu_ras_eeprom_reset_table(&adev->psp.ras.ras->eeprom_control);

	return ret == 1 ? size : -EIO;
}

static const struct file_operations amdgpu_ras_debugfs_ctrl_ops = {
	.owner = THIS_MODULE,
	.read = NULL,
	.write = amdgpu_ras_debugfs_ctrl_write,
	.llseek = default_llseek
};

static const struct file_operations amdgpu_ras_debugfs_eeprom_ops = {
	.owner = THIS_MODULE,
	.read = NULL,
	.write = amdgpu_ras_debugfs_eeprom_write,
	.llseek = default_llseek
};

/**
 * DOC: AMDGPU RAS sysfs Error Count Interface
 *
 * It allows the user to read the error count for each IP block on the gpu through
 * /sys/class/drm/card[0/1/2...]/device/ras/[gfx/sdma/...]_err_count
 *
 * It outputs multiple lines which report the uncorrected (ue) and corrected
 * (ce) error counts.
 *
 * The format of one line is below,
 *
 * [ce|ue]: count
 *
 * Example output (a sample read command follows this comment):
 *
 * .. code-block:: bash
 *
 *	ue: 0
 *	ce: 1
 *
 */
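/*
 * A minimal read sketch for the error count interface above; card0 and the
 * umc node name are assumptions that depend on the system and on which
 * blocks have RAS enabled.
 *
 * .. code-block:: bash
 *
 *	cat /sys/class/drm/card0/device/ras/umc_err_count
 */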
static ssize_t amdgpu_ras_sysfs_read(struct device *dev,
		struct device_attribute *attr, char *buf)
{
	struct ras_manager *obj = container_of(attr, struct ras_manager, sysfs_attr);
	struct ras_query_if info = {
		.head = obj->head,
	};

	if (!amdgpu_ras_get_error_query_ready(obj->adev))
		return snprintf(buf, PAGE_SIZE,
				"Query currently inaccessible\n");

	if (amdgpu_ras_error_query(obj->adev, &info))
		return -EINVAL;

	return snprintf(buf, PAGE_SIZE, "%s: %lu\n%s: %lu\n",
			"ue", info.ue_count,
			"ce", info.ce_count);
}

/* obj begin */

#define get_obj(obj) do { (obj)->use++; } while (0)
#define alive_obj(obj) ((obj)->use)

static inline void put_obj(struct ras_manager *obj)
{
	if (obj && --obj->use == 0)
		list_del(&obj->node);
	if (obj && obj->use < 0) {
		DRM_ERROR("RAS ERROR: Unbalanced obj(%s) use\n", obj->head.name);
	}
}

/* make one obj and return it. */
static struct ras_manager *amdgpu_ras_create_obj(struct amdgpu_device *adev,
		struct ras_common_if *head)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	struct ras_manager *obj;

	if (!con)
		return NULL;

	if (head->block >= AMDGPU_RAS_BLOCK_COUNT)
		return NULL;

	obj = &con->objs[head->block];
	/* already exists. return obj? */
	if (alive_obj(obj))
		return NULL;

	obj->head = *head;
	obj->adev = adev;
	list_add(&obj->node, &con->head);
	get_obj(obj);

	return obj;
}

/* return an obj equal to head, or the first when head is NULL */
struct ras_manager *amdgpu_ras_find_obj(struct amdgpu_device *adev,
		struct ras_common_if *head)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	struct ras_manager *obj;
	int i;

	if (!con)
		return NULL;

	if (head) {
		if (head->block >= AMDGPU_RAS_BLOCK_COUNT)
			return NULL;

		obj = &con->objs[head->block];

		if (alive_obj(obj)) {
			WARN_ON(head->block != obj->head.block);
			return obj;
		}
	} else {
		for (i = 0; i < AMDGPU_RAS_BLOCK_COUNT; i++) {
			obj = &con->objs[i];
			if (alive_obj(obj)) {
				WARN_ON(i != obj->head.block);
				return obj;
			}
		}
	}

	return NULL;
}
/* obj end */

/* feature ctl begin */
static int amdgpu_ras_is_feature_allowed(struct amdgpu_device *adev,
		struct ras_common_if *head)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);

	return con->hw_supported & BIT(head->block);
}

static int amdgpu_ras_is_feature_enabled(struct amdgpu_device *adev,
		struct ras_common_if *head)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);

	return con->features & BIT(head->block);
}

/*
 * if obj is not created, then create one.
 * set feature enable flag.
 */
static int __amdgpu_ras_feature_enable(struct amdgpu_device *adev,
		struct ras_common_if *head, int enable)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	struct ras_manager *obj = amdgpu_ras_find_obj(adev, head);

	/* If hardware does not support ras, then do not create obj.
	 * But if hardware supports ras, we can create the obj.
	 * The ras framework checks con->hw_supported to see if it needs to do
	 * the corresponding initialization.
	 * IP checks con->support to see if it needs to disable ras.
	 */
	if (!amdgpu_ras_is_feature_allowed(adev, head))
		return 0;
	if (!(!!enable ^ !!amdgpu_ras_is_feature_enabled(adev, head)))
		return 0;

	if (enable) {
		if (!obj) {
			obj = amdgpu_ras_create_obj(adev, head);
			if (!obj)
				return -EINVAL;
		} else {
			/* In case we create obj somewhere else */
			get_obj(obj);
		}
		con->features |= BIT(head->block);
	} else {
		if (obj && amdgpu_ras_is_feature_enabled(adev, head)) {
			con->features &= ~BIT(head->block);
			put_obj(obj);
		}
	}

	return 0;
}

/* wrapper of psp_ras_enable_features */
int amdgpu_ras_feature_enable(struct amdgpu_device *adev,
		struct ras_common_if *head, bool enable)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	union ta_ras_cmd_input info;
	int ret;

	if (!con)
		return -EINVAL;

	if (!enable) {
		info.disable_features = (struct ta_ras_disable_features_input) {
			.block_id =  amdgpu_ras_block_to_ta(head->block),
			.error_type = amdgpu_ras_error_to_ta(head->type),
		};
	} else {
		info.enable_features = (struct ta_ras_enable_features_input) {
			.block_id =  amdgpu_ras_block_to_ta(head->block),
			.error_type = amdgpu_ras_error_to_ta(head->type),
		};
	}

	/* Do not enable if it is not allowed. */
	WARN_ON(enable && !amdgpu_ras_is_feature_allowed(adev, head));
	/* Are we already in the state we are going to set? */
	if (!(!!enable ^ !!amdgpu_ras_is_feature_enabled(adev, head)))
		return 0;

	if (!amdgpu_ras_intr_triggered()) {
		ret = psp_ras_enable_features(&adev->psp, &info, enable);
		if (ret) {
			DRM_ERROR("RAS ERROR: %s %s feature failed ret %d\n",
					enable ? "enable":"disable",
					ras_block_str(head->block),
					ret);
			if (ret == TA_RAS_STATUS__RESET_NEEDED)
				return -EAGAIN;
			return -EINVAL;
		}
	}

	/* setup the obj */
	__amdgpu_ras_feature_enable(adev, head, enable);

	return 0;
}

/* Only used in device probe stage and called only once. */
int amdgpu_ras_feature_enable_on_boot(struct amdgpu_device *adev,
		struct ras_common_if *head, bool enable)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	int ret;

	if (!con)
		return -EINVAL;

	if (con->flags & AMDGPU_RAS_FLAG_INIT_BY_VBIOS) {
		if (enable) {
			/* There is no harm in issuing a ras TA cmd regardless of
			 * the current ras state.
			 * If current state == target state, it will do nothing.
			 * But sometimes it requests the driver to reset and repost
			 * with error code -EAGAIN.
			 */
			ret = amdgpu_ras_feature_enable(adev, head, 1);
			/* With an old ras TA, we might fail to enable ras.
			 * Log it and just setup the object.
			 * TODO: need to remove this WA in the future.
			 */
			if (ret == -EINVAL) {
				ret = __amdgpu_ras_feature_enable(adev, head, 1);
				if (!ret)
					DRM_INFO("RAS INFO: %s setup object\n",
						ras_block_str(head->block));
			}
		} else {
			/* setup the object then issue a ras TA disable cmd. */
			ret = __amdgpu_ras_feature_enable(adev, head, 1);
			if (ret)
				return ret;

			ret = amdgpu_ras_feature_enable(adev, head, 0);
		}
	} else
		ret = amdgpu_ras_feature_enable(adev, head, enable);

	return ret;
}

static int amdgpu_ras_disable_all_features(struct amdgpu_device *adev,
		bool bypass)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	struct ras_manager *obj, *tmp;

	list_for_each_entry_safe(obj, tmp, &con->head, node) {
		/* bypass psp.
		 * aka just release the obj and corresponding flags
		 */
		if (bypass) {
			if (__amdgpu_ras_feature_enable(adev, &obj->head, 0))
				break;
		} else {
			if (amdgpu_ras_feature_enable(adev, &obj->head, 0))
				break;
		}
	}

	return con->features;
}

static int amdgpu_ras_enable_all_features(struct amdgpu_device *adev,
		bool bypass)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	int ras_block_count = AMDGPU_RAS_BLOCK_COUNT;
	int i;
	const enum amdgpu_ras_error_type default_ras_type =
		AMDGPU_RAS_ERROR__NONE;

	for (i = 0; i < ras_block_count; i++) {
		struct ras_common_if head = {
			.block = i,
			.type = default_ras_type,
			.sub_block_index = 0,
		};
		strcpy(head.name, ras_block_str(i));
		if (bypass) {
			/*
			 * bypass psp. vbios enables ras for us.
			 * so just create the obj
			 */
			if (__amdgpu_ras_feature_enable(adev, &head, 1))
				break;
		} else {
			if (amdgpu_ras_feature_enable(adev, &head, 1))
				break;
		}
	}

	return con->features;
}
/* feature ctl end */

/* query/inject/cure begin */
int amdgpu_ras_error_query(struct amdgpu_device *adev,
		struct ras_query_if *info)
{
	struct ras_manager *obj = amdgpu_ras_find_obj(adev, &info->head);
	struct ras_err_data err_data = {0, 0, 0, NULL};
	int i;

	if (!obj)
		return -EINVAL;

	switch (info->head.block) {
	case AMDGPU_RAS_BLOCK__UMC:
		if (adev->umc.funcs->query_ras_error_count)
			adev->umc.funcs->query_ras_error_count(adev, &err_data);
		/* umc query_ras_error_address is also responsible for clearing
		 * error status
		 */
		if (adev->umc.funcs->query_ras_error_address)
			adev->umc.funcs->query_ras_error_address(adev, &err_data);
		break;
	case AMDGPU_RAS_BLOCK__SDMA:
		if (adev->sdma.funcs->query_ras_error_count) {
			for (i = 0; i < adev->sdma.num_instances; i++)
				adev->sdma.funcs->query_ras_error_count(adev, i,
									&err_data);
		}
		break;
	case AMDGPU_RAS_BLOCK__GFX:
		if (adev->gfx.funcs->query_ras_error_count)
			adev->gfx.funcs->query_ras_error_count(adev, &err_data);
		break;
	case AMDGPU_RAS_BLOCK__MMHUB:
		if (adev->mmhub.funcs->query_ras_error_count)
			adev->mmhub.funcs->query_ras_error_count(adev, &err_data);
		break;
	case AMDGPU_RAS_BLOCK__PCIE_BIF:
		if (adev->nbio.funcs->query_ras_error_count)
			adev->nbio.funcs->query_ras_error_count(adev, &err_data);
		break;
	case AMDGPU_RAS_BLOCK__XGMI_WAFL:
		amdgpu_xgmi_query_ras_error_count(adev, &err_data);
		break;
	default:
		break;
	}

	obj->err_data.ue_count += err_data.ue_count;
	obj->err_data.ce_count += err_data.ce_count;

	info->ue_count = obj->err_data.ue_count;
	info->ce_count = obj->err_data.ce_count;

	if (err_data.ce_count) {
		dev_info(adev->dev, "%ld correctable errors detected in %s block\n",
			 obj->err_data.ce_count, ras_block_str(info->head.block));
	}
	if (err_data.ue_count) {
		dev_info(adev->dev, "%ld uncorrectable errors detected in %s block\n",
			 obj->err_data.ue_count, ras_block_str(info->head.block));
	}

	return 0;
}

/* wrapper of psp_ras_trigger_error */
int amdgpu_ras_error_inject(struct amdgpu_device *adev,
		struct ras_inject_if *info)
{
	struct ras_manager *obj = amdgpu_ras_find_obj(adev, &info->head);
	struct ta_ras_trigger_error_input block_info = {
		.block_id =  amdgpu_ras_block_to_ta(info->head.block),
		.inject_error_type = amdgpu_ras_error_to_ta(info->head.type),
		.sub_block_index = info->head.sub_block_index,
		.address = info->address,
		.value = info->value,
	};
	int ret = 0;

	if (!obj)
		return -EINVAL;

	/* Calculate XGMI relative offset */
	if (adev->gmc.xgmi.num_physical_nodes > 1) {
		block_info.address =
			amdgpu_xgmi_get_relative_phy_addr(adev,
							  block_info.address);
	}

	switch (info->head.block) {
	case AMDGPU_RAS_BLOCK__GFX:
		if (adev->gfx.funcs->ras_error_inject)
			ret = adev->gfx.funcs->ras_error_inject(adev, info);
		else
			ret = -EINVAL;
		break;
	case AMDGPU_RAS_BLOCK__UMC:
	case AMDGPU_RAS_BLOCK__MMHUB:
	case AMDGPU_RAS_BLOCK__XGMI_WAFL:
	case AMDGPU_RAS_BLOCK__PCIE_BIF:
		ret = psp_ras_trigger_error(&adev->psp, &block_info);
		break;
	default:
		DRM_INFO("%s error injection is not supported yet\n",
			 ras_block_str(info->head.block));
		ret = -EINVAL;
	}

	if (ret)
		DRM_ERROR("RAS ERROR: inject %s error failed ret %d\n",
				ras_block_str(info->head.block),
				ret);

	return ret;
}

int amdgpu_ras_error_cure(struct amdgpu_device *adev,
		struct ras_cure_if *info)
{
	/* psp fw has no cure interface for now. */
	return 0;
}

/* get the total error counts on all IPs */
unsigned long amdgpu_ras_query_error_count(struct amdgpu_device *adev,
		bool is_ce)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	struct ras_manager *obj;
	struct ras_err_data data = {0, 0};

	if (!con)
		return 0;

	list_for_each_entry(obj, &con->head, node) {
		struct ras_query_if info = {
			.head = obj->head,
		};

		if (amdgpu_ras_error_query(adev, &info))
			return 0;

		data.ce_count += info.ce_count;
		data.ue_count += info.ue_count;
	}

	return is_ce ? data.ce_count : data.ue_count;
}
/* query/inject/cure end */

/* sysfs begin */

static int amdgpu_ras_badpages_read(struct amdgpu_device *adev,
		struct ras_badpage **bps, unsigned int *count);

static char *amdgpu_ras_badpage_flags_str(unsigned int flags)
{
	switch (flags) {
	case AMDGPU_RAS_RETIRE_PAGE_RESERVED:
		return "R";
	case AMDGPU_RAS_RETIRE_PAGE_PENDING:
		return "P";
	case AMDGPU_RAS_RETIRE_PAGE_FAULT:
	default:
		return "F";
	}
}

/**
 * DOC: AMDGPU RAS sysfs gpu_vram_bad_pages Interface
 *
 * It allows the user to read the bad pages of vram on the gpu through
 * /sys/class/drm/card[0/1/2...]/device/ras/gpu_vram_bad_pages
 * (a sample read command follows this comment).
 *
 * It outputs multiple lines, and each line stands for one gpu page.
 *
 * The format of one line is below,
 * gpu pfn : gpu page size : flags
 *
 * gpu pfn and gpu page size are printed in hex format.
 * flags can be one of the characters below,
 *
 * R: reserved, this gpu page is reserved and not able to be used.
 *
 * P: pending for reserve, this gpu page is marked as bad, and will be
 * reserved in the next window of page_reserve.
 *
 * F: unable to reserve. this gpu page can't be reserved due to some reasons.
 *
 * Examples:
 *
 * .. code-block:: bash
 *
 *	0x00000001 : 0x00001000 : R
 *	0x00000002 : 0x00001000 : P
 *
 */

static ssize_t amdgpu_ras_sysfs_badpages_read(struct file *f,
		struct kobject *kobj, struct bin_attribute *attr,
		char *buf, loff_t ppos, size_t count)
{
	struct amdgpu_ras *con =
		container_of(attr, struct amdgpu_ras, badpages_attr);
	struct amdgpu_device *adev = con->adev;
	const unsigned int element_size =
		sizeof("0xabcdabcd : 0x12345678 : R\n") - 1;
	unsigned int start = div64_ul(ppos + element_size - 1, element_size);
	unsigned int end = div64_ul(ppos + count - 1, element_size);
	ssize_t s = 0;
	struct ras_badpage *bps = NULL;
	unsigned int bps_count = 0;

	memset(buf, 0, count);

	if (amdgpu_ras_badpages_read(adev, &bps, &bps_count))
		return 0;

	for (; start < end && start < bps_count; start++)
		s += scnprintf(&buf[s], element_size + 1,
				"0x%08x : 0x%08x : %1s\n",
				bps[start].bp,
				bps[start].size,
				amdgpu_ras_badpage_flags_str(bps[start].flags));

	kfree(bps);

	return s;
}

static ssize_t amdgpu_ras_sysfs_features_read(struct device *dev,
		struct device_attribute *attr, char *buf)
{
	struct amdgpu_ras *con =
		container_of(attr, struct amdgpu_ras, features_attr);

	return scnprintf(buf, PAGE_SIZE, "feature mask: 0x%x\n", con->features);
}

static int amdgpu_ras_sysfs_create_feature_node(struct amdgpu_device *adev)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	struct attribute *attrs[] = {
		&con->features_attr.attr,
		NULL
	};
	struct bin_attribute *bin_attrs[] = {
		&con->badpages_attr,
		NULL
	};
	struct attribute_group group = {
		.name = "ras",
		.attrs = attrs,
		.bin_attrs = bin_attrs,
	};

	con->features_attr = (struct device_attribute) {
		.attr = {
			.name = "features",
			.mode = S_IRUGO,
		},
		.show = amdgpu_ras_sysfs_features_read,
	};

	con->badpages_attr = (struct bin_attribute) {
		.attr = {
			.name = "gpu_vram_bad_pages",
			.mode = S_IRUGO,
		},
		.size = 0,
		.private = NULL,
		.read = amdgpu_ras_sysfs_badpages_read,
	};

	sysfs_attr_init(attrs[0]);
	sysfs_bin_attr_init(bin_attrs[0]);

	return sysfs_create_group(&adev->dev->kobj, &group);
}

static int amdgpu_ras_sysfs_remove_feature_node(struct amdgpu_device *adev)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	struct attribute *attrs[] = {
		&con->features_attr.attr,
		NULL
	};
	struct bin_attribute *bin_attrs[] = {
		&con->badpages_attr,
		NULL
	};
	struct attribute_group group = {
		.name = "ras",
		.attrs = attrs,
		.bin_attrs = bin_attrs,
	};

	sysfs_remove_group(&adev->dev->kobj, &group);

	return 0;
}

int amdgpu_ras_sysfs_create(struct amdgpu_device *adev,
		struct ras_fs_if *head)
{
	struct ras_manager *obj = amdgpu_ras_find_obj(adev, &head->head);

	if (!obj || obj->attr_inuse)
		return -EINVAL;

	get_obj(obj);

	memcpy(obj->fs_data.sysfs_name,
			head->sysfs_name,
			sizeof(obj->fs_data.sysfs_name));

	obj->sysfs_attr = (struct device_attribute){
		.attr = {
			.name = obj->fs_data.sysfs_name,
			.mode = S_IRUGO,
		},
		.show = amdgpu_ras_sysfs_read,
	};
	sysfs_attr_init(&obj->sysfs_attr.attr);

	if (sysfs_add_file_to_group(&adev->dev->kobj,
				&obj->sysfs_attr.attr,
				"ras")) {
		put_obj(obj);
		return -EINVAL;
	}

	obj->attr_inuse = 1;

	return 0;
}

int amdgpu_ras_sysfs_remove(struct amdgpu_device *adev,
		struct ras_common_if *head)
{
	struct ras_manager *obj = amdgpu_ras_find_obj(adev, head);

	if (!obj || !obj->attr_inuse)
		return -EINVAL;

	sysfs_remove_file_from_group(&adev->dev->kobj,
				&obj->sysfs_attr.attr,
				"ras");
	obj->attr_inuse = 0;
	put_obj(obj);

	return 0;
}

static int amdgpu_ras_sysfs_remove_all(struct amdgpu_device *adev)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	struct ras_manager *obj, *tmp;

	list_for_each_entry_safe(obj, tmp, &con->head, node) {
		amdgpu_ras_sysfs_remove(adev, &obj->head);
	}

	amdgpu_ras_sysfs_remove_feature_node(adev);

	return 0;
}
/* sysfs end */

/**
 * DOC: AMDGPU RAS Reboot Behavior for Unrecoverable Errors
 *
 * Normally when there is an uncorrectable error, the driver will reset
 * the GPU to recover. However, in the event of an unrecoverable error,
 * the driver provides an interface to reboot the system automatically.
 *
 * The following file in debugfs provides that interface:
 * /sys/kernel/debug/dri/[0/1/2...]/ras/auto_reboot
 *
 * Usage:
 *
 * .. code-block:: bash
 *
 *	echo true > .../ras/auto_reboot
 *
 */
/* debugfs begin */
static void amdgpu_ras_debugfs_create_ctrl_node(struct amdgpu_device *adev)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	struct drm_minor *minor = adev->ddev->primary;

	con->dir = debugfs_create_dir("ras", minor->debugfs_root);
	debugfs_create_file("ras_ctrl", S_IWUGO | S_IRUGO, con->dir,
				adev, &amdgpu_ras_debugfs_ctrl_ops);
	debugfs_create_file("ras_eeprom_reset", S_IWUGO | S_IRUGO, con->dir,
				adev, &amdgpu_ras_debugfs_eeprom_ops);

	/*
	 * After an uncorrectable error happens, GPU recovery will usually be
	 * scheduled. But due to the known problem of GPU recovery failing to
	 * bring the GPU back, the interface below provides a direct way for
	 * the user to reboot the system automatically in such a case when
	 * ERREVENT_ATHUB_INTERRUPT is generated. The normal GPU recovery
	 * routine will never be called.
	 */
	debugfs_create_bool("auto_reboot", S_IWUGO | S_IRUGO, con->dir,
				&con->reboot);
}

void amdgpu_ras_debugfs_create(struct amdgpu_device *adev,
		struct ras_fs_if *head)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	struct ras_manager *obj = amdgpu_ras_find_obj(adev, &head->head);

	if (!obj || obj->ent)
		return;

	get_obj(obj);

	memcpy(obj->fs_data.debugfs_name,
			head->debugfs_name,
			sizeof(obj->fs_data.debugfs_name));

	obj->ent = debugfs_create_file(obj->fs_data.debugfs_name,
				       S_IWUGO | S_IRUGO, con->dir, obj,
				       &amdgpu_ras_debugfs_ops);
}

void amdgpu_ras_debugfs_create_all(struct amdgpu_device *adev)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	struct ras_manager *obj;
	struct ras_fs_if fs_info;

	/*
	 * it won't be called in the resume path, no need to check
	 * suspend and gpu reset status
	 */
	if (!con)
		return;

	amdgpu_ras_debugfs_create_ctrl_node(adev);

	list_for_each_entry(obj, &con->head, node) {
		if (amdgpu_ras_is_supported(adev, obj->head.block) &&
		    (obj->attr_inuse == 1)) {
			sprintf(fs_info.debugfs_name, "%s_err_inject",
				ras_block_str(obj->head.block));
			fs_info.head = obj->head;
			amdgpu_ras_debugfs_create(adev, &fs_info);
		}
	}
}

void amdgpu_ras_debugfs_remove(struct amdgpu_device *adev,
		struct ras_common_if *head)
{
	struct ras_manager *obj = amdgpu_ras_find_obj(adev, head);

	if (!obj || !obj->ent)
		return;

	debugfs_remove(obj->ent);
	obj->ent = NULL;
	put_obj(obj);
}

static void amdgpu_ras_debugfs_remove_all(struct amdgpu_device *adev)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	struct ras_manager *obj, *tmp;

	list_for_each_entry_safe(obj, tmp, &con->head, node) {
		amdgpu_ras_debugfs_remove(adev, &obj->head);
	}

	debugfs_remove_recursive(con->dir);
	con->dir = NULL;
}
/* debugfs end */

/* ras fs */

static int amdgpu_ras_fs_init(struct amdgpu_device *adev)
{
	amdgpu_ras_sysfs_create_feature_node(adev);

	return 0;
}

static int amdgpu_ras_fs_fini(struct amdgpu_device *adev)
{
	amdgpu_ras_debugfs_remove_all(adev);
	amdgpu_ras_sysfs_remove_all(adev);
	return 0;
}
/* ras fs end */

/* ih begin */
static void amdgpu_ras_interrupt_handler(struct ras_manager *obj)
{
	struct ras_ih_data *data = &obj->ih_data;
	struct amdgpu_iv_entry entry;
	int ret;
	struct ras_err_data err_data = {0, 0, 0, NULL};

	while (data->rptr != data->wptr) {
		rmb();
		memcpy(&entry, &data->ring[data->rptr],
				data->element_size);

		wmb();
		data->rptr = (data->aligned_element_size +
				data->rptr) % data->ring_size;

		/* Let IP handle its data, maybe we need to get the output
		 * from the callback to update the error type/count, etc
		 */
		if (data->cb) {
			ret = data->cb(obj->adev, &err_data, &entry);
			/* ue will trigger an interrupt, and in that case
			 * we need to do a reset to recover the whole system.
			 * But leave it to the IP to do that recovery, here we
			 * just dispatch the error.
			 */
			if (ret == AMDGPU_RAS_SUCCESS) {
				/* these counts could be left as 0 if
				 * some blocks do not count error number
				 */
				obj->err_data.ue_count += err_data.ue_count;
				obj->err_data.ce_count += err_data.ce_count;
			}
		}
	}
}

static void amdgpu_ras_interrupt_process_handler(struct work_struct *work)
{
	struct ras_ih_data *data =
		container_of(work, struct ras_ih_data, ih_work);
	struct ras_manager *obj =
		container_of(data, struct ras_manager, ih_data);

	amdgpu_ras_interrupt_handler(obj);
}

int amdgpu_ras_interrupt_dispatch(struct amdgpu_device *adev,
		struct ras_dispatch_if *info)
{
	struct ras_manager *obj = amdgpu_ras_find_obj(adev, &info->head);
	struct ras_ih_data *data;

	if (!obj)
		return -EINVAL;

	data = &obj->ih_data;
	if (data->inuse == 0)
		return 0;

	/* Might be overflow... */
	memcpy(&data->ring[data->wptr], info->entry,
			data->element_size);

	wmb();
	data->wptr = (data->aligned_element_size +
			data->wptr) % data->ring_size;

	schedule_work(&data->ih_work);

	return 0;
}

int amdgpu_ras_interrupt_remove_handler(struct amdgpu_device *adev,
		struct ras_ih_if *info)
{
	struct ras_manager *obj = amdgpu_ras_find_obj(adev, &info->head);
	struct ras_ih_data *data;

	if (!obj)
		return -EINVAL;

	data = &obj->ih_data;
	if (data->inuse == 0)
		return 0;

	cancel_work_sync(&data->ih_work);

	kfree(data->ring);
	memset(data, 0, sizeof(*data));
	put_obj(obj);

	return 0;
}

int amdgpu_ras_interrupt_add_handler(struct amdgpu_device *adev,
		struct ras_ih_if *info)
{
	struct ras_manager *obj = amdgpu_ras_find_obj(adev, &info->head);
	struct ras_ih_data *data;

	if (!obj) {
		/* in case we register the IH before enabling the ras feature */
		obj = amdgpu_ras_create_obj(adev, &info->head);
		if (!obj)
			return -EINVAL;
	} else
		get_obj(obj);

	data = &obj->ih_data;
	/* add the callback, etc */
	*data = (struct ras_ih_data) {
		.inuse = 0,
		.cb = info->cb,
		.element_size = sizeof(struct amdgpu_iv_entry),
		.rptr = 0,
		.wptr = 0,
	};

	INIT_WORK(&data->ih_work, amdgpu_ras_interrupt_process_handler);

	data->aligned_element_size = ALIGN(data->element_size, 8);
	/* the ring can store 64 iv entries. */
	data->ring_size = 64 * data->aligned_element_size;
	data->ring = kmalloc(data->ring_size, GFP_KERNEL);
	if (!data->ring) {
		put_obj(obj);
		return -ENOMEM;
	}

	/* IH is ready */
	data->inuse = 1;

	return 0;
}

static int amdgpu_ras_interrupt_remove_all(struct amdgpu_device *adev)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	struct ras_manager *obj, *tmp;

	list_for_each_entry_safe(obj, tmp, &con->head, node) {
		struct ras_ih_if info = {
			.head = obj->head,
		};
		amdgpu_ras_interrupt_remove_handler(adev, &info);
	}

	return 0;
}
/* ih end */

/* traverse all IPs except NBIO to query error counters */
static void amdgpu_ras_log_on_err_counter(struct amdgpu_device *adev)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	struct ras_manager *obj;

	if (!con)
		return;

	list_for_each_entry(obj, &con->head, node) {
		struct ras_query_if info = {
			.head = obj->head,
		};

		/*
		 * The PCIE_BIF IP has a separate isr for the ras controller
		 * interrupt, and the specific ras counter query will be
		 * done in that isr. So skip such a block from the common
		 * sync flood interrupt isr calling.
		 */
		if (info.head.block == AMDGPU_RAS_BLOCK__PCIE_BIF)
			continue;

		amdgpu_ras_error_query(adev, &info);
	}
}

/* recovery begin */

/* return 0 on success.
 * caller needs to free bps.
 */
static int amdgpu_ras_badpages_read(struct amdgpu_device *adev,
		struct ras_badpage **bps, unsigned int *count)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	struct ras_err_handler_data *data;
	int i = 0;
	int ret = 0;

	if (!con || !con->eh_data || !bps || !count)
		return -EINVAL;

	mutex_lock(&con->recovery_lock);
	data = con->eh_data;
	if (!data || data->count == 0) {
		*bps = NULL;
		ret = -EINVAL;
		goto out;
	}

	*bps = kmalloc(sizeof(struct ras_badpage) * data->count, GFP_KERNEL);
	if (!*bps) {
		ret = -ENOMEM;
		goto out;
	}

	for (; i < data->count; i++) {
		(*bps)[i] = (struct ras_badpage){
			.bp = data->bps[i].retired_page,
			.size = AMDGPU_GPU_PAGE_SIZE,
			.flags = AMDGPU_RAS_RETIRE_PAGE_RESERVED,
		};

		if (data->last_reserved <= i)
			(*bps)[i].flags = AMDGPU_RAS_RETIRE_PAGE_PENDING;
		else if (data->bps_bo[i] == NULL)
			(*bps)[i].flags = AMDGPU_RAS_RETIRE_PAGE_FAULT;
	}

	*count = data->count;
out:
	mutex_unlock(&con->recovery_lock);
	return ret;
}

static void amdgpu_ras_do_recovery(struct work_struct *work)
{
	struct amdgpu_ras *ras =
		container_of(work, struct amdgpu_ras, recovery_work);

	/*
	 * Query and print non zero error counters per IP block for
	 * awareness before recovering the GPU.
	 */
	amdgpu_ras_log_on_err_counter(ras->adev);

	if (amdgpu_device_should_recover_gpu(ras->adev))
		amdgpu_device_gpu_recover(ras->adev, 0);
	atomic_set(&ras->in_recovery, 0);
}

/* alloc/realloc bps array */
static int amdgpu_ras_realloc_eh_data_space(struct amdgpu_device *adev,
		struct ras_err_handler_data *data, int pages)
{
	unsigned int old_space = data->count + data->space_left;
	unsigned int new_space = old_space + pages;
	unsigned int align_space = ALIGN(new_space, 512);
	void *bps = kmalloc(align_space * sizeof(*data->bps), GFP_KERNEL);
	struct amdgpu_bo **bps_bo =
			kmalloc(align_space * sizeof(*data->bps_bo), GFP_KERNEL);

	if (!bps || !bps_bo) {
		kfree(bps);
		kfree(bps_bo);
		return -ENOMEM;
	}

	if (data->bps) {
		memcpy(bps, data->bps,
				data->count * sizeof(*data->bps));
		kfree(data->bps);
	}
	if (data->bps_bo) {
		memcpy(bps_bo, data->bps_bo,
				data->count * sizeof(*data->bps_bo));
		kfree(data->bps_bo);
	}

	data->bps = bps;
	data->bps_bo = bps_bo;
	data->space_left += align_space - old_space;
	return 0;
}

/* it deals with vram only. */
int amdgpu_ras_add_bad_pages(struct amdgpu_device *adev,
		struct eeprom_table_record *bps, int pages)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	struct ras_err_handler_data *data;
	int ret = 0;

	if (!con || !con->eh_data || !bps || pages <= 0)
		return 0;

	mutex_lock(&con->recovery_lock);
	data = con->eh_data;
	if (!data)
		goto out;

	if (data->space_left <= pages)
		if (amdgpu_ras_realloc_eh_data_space(adev, data, pages)) {
			ret = -ENOMEM;
			goto out;
		}

	memcpy(&data->bps[data->count], bps, pages * sizeof(*data->bps));
	data->count += pages;
	data->space_left -= pages;

out:
	mutex_unlock(&con->recovery_lock);

	return ret;
}

/*
 * write error record array to eeprom, the function should be
 * protected by recovery_lock
 */
static int amdgpu_ras_save_bad_pages(struct amdgpu_device *adev)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	struct ras_err_handler_data *data;
	struct amdgpu_ras_eeprom_control *control;
	int save_count;

	if (!con || !con->eh_data)
		return 0;

	control = &con->eeprom_control;
	data = con->eh_data;
	save_count = data->count - control->num_recs;
	/* only new entries are saved */
	if (save_count > 0)
		if (amdgpu_ras_eeprom_process_recods(control,
							&data->bps[control->num_recs],
							true,
							save_count)) {
			DRM_ERROR("Failed to save EEPROM table data!");
			return -EIO;
		}

	return 0;
}

/*
 * read error record array in eeprom and reserve enough space for
 * storing new bad pages
 */
static int amdgpu_ras_load_bad_pages(struct amdgpu_device *adev)
{
	struct amdgpu_ras_eeprom_control *control =
		&adev->psp.ras.ras->eeprom_control;
	struct eeprom_table_record *bps = NULL;
	int ret = 0;

	/* no bad page record, skip eeprom access */
	if (!control->num_recs)
		return ret;

	bps = kcalloc(control->num_recs, sizeof(*bps), GFP_KERNEL);
	if (!bps)
		return -ENOMEM;

	if (amdgpu_ras_eeprom_process_recods(control, bps, false,
		control->num_recs)) {
		DRM_ERROR("Failed to load EEPROM table records!");
		ret = -EIO;
		goto out;
	}

	ret = amdgpu_ras_add_bad_pages(adev, bps, control->num_recs);

out:
	kfree(bps);
	return ret;
}

/*
 * check if an address belongs to a bad page
 *
 * Note: this check is only for the umc block
 */
static bool amdgpu_ras_check_bad_page(struct amdgpu_device *adev,
				uint64_t addr)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	struct ras_err_handler_data *data;
	int i;
	bool ret = false;

	if (!con || !con->eh_data)
		return ret;

	mutex_lock(&con->recovery_lock);
	data = con->eh_data;
	if (!data)
		goto out;

	addr >>= AMDGPU_GPU_PAGE_SHIFT;
	for (i = 0; i < data->count; i++)
		if (addr == data->bps[i].retired_page) {
			ret = true;
			goto out;
		}

out:
	mutex_unlock(&con->recovery_lock);
	return ret;
}

/* called in gpu recovery/init */
int amdgpu_ras_reserve_bad_pages(struct amdgpu_device *adev)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	struct ras_err_handler_data *data;
	uint64_t bp;
	struct amdgpu_bo *bo = NULL;
	int i, ret = 0;

	if (!con || !con->eh_data)
		return 0;

	mutex_lock(&con->recovery_lock);
	data = con->eh_data;
	if (!data)
		goto out;
	/* reserve vram at driver post stage. */
	for (i = data->last_reserved; i < data->count; i++) {
		bp = data->bps[i].retired_page;

		/* There are two cases of reserve error that should be ignored:
		 * 1) a ras bad page has been allocated (used by someone);
		 * 2) a ras bad page has been reserved (duplicate error injection
		 *    for one page);
		 */
		if (amdgpu_bo_create_kernel_at(adev, bp << AMDGPU_GPU_PAGE_SHIFT,
					       AMDGPU_GPU_PAGE_SIZE,
					       AMDGPU_GEM_DOMAIN_VRAM,
					       &bo, NULL))
			DRM_WARN("RAS WARN: reserve vram for retired page %llx fail\n", bp);

		data->bps_bo[i] = bo;
		data->last_reserved = i + 1;
		bo = NULL;
	}

	/* continue to save bad pages to eeprom even if reserve_vram fails */
	ret = amdgpu_ras_save_bad_pages(adev);
out:
	mutex_unlock(&con->recovery_lock);
	return ret;
}

/* called when driver unloads */
static int amdgpu_ras_release_bad_pages(struct amdgpu_device *adev)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	struct ras_err_handler_data *data;
	struct amdgpu_bo *bo;
	int i;

	if (!con || !con->eh_data)
		return 0;

	mutex_lock(&con->recovery_lock);
	data = con->eh_data;
	if (!data)
		goto out;

	for (i = data->last_reserved - 1; i >= 0; i--) {
		bo = data->bps_bo[i];

		amdgpu_bo_free_kernel(&bo, NULL, NULL);

		data->bps_bo[i] = bo;
		data->last_reserved = i;
	}
out:
	mutex_unlock(&con->recovery_lock);
	return 0;
}

int amdgpu_ras_recovery_init(struct amdgpu_device *adev)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	struct ras_err_handler_data **data;
	int ret;

	if (con)
		data = &con->eh_data;
	else
		return 0;

	*data = kmalloc(sizeof(**data), GFP_KERNEL | __GFP_ZERO);
	if (!*data) {
		ret = -ENOMEM;
		goto out;
	}

	mutex_init(&con->recovery_lock);
	INIT_WORK(&con->recovery_work, amdgpu_ras_do_recovery);
	atomic_set(&con->in_recovery, 0);
	con->adev = adev;

	ret = amdgpu_ras_eeprom_init(&con->eeprom_control);
	if (ret)
		goto free;

	if (con->eeprom_control.num_recs) {
		ret = amdgpu_ras_load_bad_pages(adev);
		if (ret)
			goto free;
		ret = amdgpu_ras_reserve_bad_pages(adev);
		if (ret)
			goto release;
	}

	return 0;

release:
	amdgpu_ras_release_bad_pages(adev);
free:
	kfree((*data)->bps);
	kfree((*data)->bps_bo);
	kfree(*data);
	con->eh_data = NULL;
out:
	DRM_WARN("Failed to initialize ras recovery!\n");

	return ret;
}

static int amdgpu_ras_recovery_fini(struct amdgpu_device *adev)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	struct ras_err_handler_data *data = con->eh_data;

	/* recovery_init failed to init it, fini is useless */
	if (!data)
		return 0;

	cancel_work_sync(&con->recovery_work);
	amdgpu_ras_release_bad_pages(adev);

	mutex_lock(&con->recovery_lock);
	con->eh_data = NULL;
	kfree(data->bps);
	kfree(data->bps_bo);
	kfree(data);
	mutex_unlock(&con->recovery_lock);

	return 0;
}
/* recovery end */

/* return 0 if ras will reset gpu and repost. */
int amdgpu_ras_request_reset_on_boot(struct amdgpu_device *adev,
		unsigned int block)
{
	struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);

	if (!ras)
		return -EINVAL;

	ras->flags |= AMDGPU_RAS_FLAG_INIT_NEED_RESET;
	return 0;
}

/*
 * check hardware's ras ability which will be saved in hw_supported.
 * if hardware does not support ras, we can skip some ras initialization and
 * forbid some ras operations from IP.
 * if software itself, say a boot parameter, limits the ras ability, we still
 * need to allow IP to do some limited operations, like disable. In such a
 * case, we have to initialize ras as normal, but need to check if the
 * operation is allowed or not in each function.
 */
static void amdgpu_ras_check_supported(struct amdgpu_device *adev,
		uint32_t *hw_supported, uint32_t *supported)
{
	*hw_supported = 0;
	*supported = 0;

	if (amdgpu_sriov_vf(adev) || !adev->is_atom_fw ||
	    (adev->asic_type != CHIP_VEGA20 &&
	     adev->asic_type != CHIP_ARCTURUS))
		return;

	if (amdgpu_atomfirmware_mem_ecc_supported(adev)) {
		DRM_INFO("HBM ECC is active.\n");
		*hw_supported |= (1 << AMDGPU_RAS_BLOCK__UMC |
				1 << AMDGPU_RAS_BLOCK__DF);
	} else
		DRM_INFO("HBM ECC is not present.\n");

	if (amdgpu_atomfirmware_sram_ecc_supported(adev)) {
		DRM_INFO("SRAM ECC is active.\n");
		*hw_supported |= ~(1 << AMDGPU_RAS_BLOCK__UMC |
				1 << AMDGPU_RAS_BLOCK__DF);
	} else
		DRM_INFO("SRAM ECC is not present.\n");

	/* hw_supported needs to be aligned with RAS block mask. */
	*hw_supported &= AMDGPU_RAS_BLOCK_MASK;

	*supported = amdgpu_ras_enable == 0 ?
			0 : *hw_supported & amdgpu_ras_mask;
}

int amdgpu_ras_init(struct amdgpu_device *adev)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	int r;

	if (con)
		return 0;

	con = kmalloc(sizeof(struct amdgpu_ras) +
			sizeof(struct ras_manager) * AMDGPU_RAS_BLOCK_COUNT,
			GFP_KERNEL|__GFP_ZERO);
	if (!con)
		return -ENOMEM;

	con->objs = (struct ras_manager *)(con + 1);

	amdgpu_ras_set_context(adev, con);

	amdgpu_ras_check_supported(adev, &con->hw_supported,
			&con->supported);
	if (!con->hw_supported) {
		amdgpu_ras_set_context(adev, NULL);
		kfree(con);
		return 0;
	}

	con->features = 0;
	INIT_LIST_HEAD(&con->head);
	/* Might need to get this flag from vbios. */
	con->flags = RAS_DEFAULT_FLAGS;

	if (adev->nbio.funcs->init_ras_controller_interrupt) {
		r = adev->nbio.funcs->init_ras_controller_interrupt(adev);
		if (r)
			return r;
	}

	if (adev->nbio.funcs->init_ras_err_event_athub_interrupt) {
		r = adev->nbio.funcs->init_ras_err_event_athub_interrupt(adev);
		if (r)
			return r;
	}

	amdgpu_ras_mask &= AMDGPU_RAS_BLOCK_MASK;

	if (amdgpu_ras_fs_init(adev))
		goto fs_out;

	DRM_INFO("RAS INFO: ras initialized successfully, "
			"hardware ability[%x] ras_mask[%x]\n",
			con->hw_supported, con->supported);
	return 0;
fs_out:
	amdgpu_ras_set_context(adev, NULL);
	kfree(con);

	return -EINVAL;
}

/* helper function to handle common stuff in ip late init phase
 * (a usage sketch follows this function)
 */
int amdgpu_ras_late_init(struct amdgpu_device *adev,
			 struct ras_common_if *ras_block,
			 struct ras_fs_if *fs_info,
			 struct ras_ih_if *ih_info)
{
	int r;

	/* disable RAS feature per IP block if it is not supported */
	if (!amdgpu_ras_is_supported(adev, ras_block->block)) {
		amdgpu_ras_feature_enable_on_boot(adev, ras_block, 0);
		return 0;
	}

	r = amdgpu_ras_feature_enable_on_boot(adev, ras_block, 1);
	if (r) {
		if (r == -EAGAIN) {
			/* request gpu reset. will run again */
			amdgpu_ras_request_reset_on_boot(adev,
					ras_block->block);
			return 0;
		} else if (adev->in_suspend || adev->in_gpu_reset) {
			/* in resume phase, if we fail to enable ras,
			 * clean up all ras fs nodes, and disable ras */
			goto cleanup;
		} else
			return r;
	}

	/* in resume phase, no need to create ras fs node */
	if (adev->in_suspend || adev->in_gpu_reset) {
		amdgpu_ras_set_error_query_ready(adev, true);
		return 0;
	}

	if (ih_info->cb) {
		r = amdgpu_ras_interrupt_add_handler(adev, ih_info);
		if (r)
			goto interrupt;
	}

	r = amdgpu_ras_sysfs_create(adev, fs_info);
	if (r)
		goto sysfs;

	amdgpu_ras_set_error_query_ready(adev, true);

	return 0;
cleanup:
	amdgpu_ras_sysfs_remove(adev, ras_block);
sysfs:
	if (ih_info->cb)
		amdgpu_ras_interrupt_remove_handler(adev, ih_info);
interrupt:
	amdgpu_ras_feature_enable(adev, ras_block, 0);
	return r;
}
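/*
 * A sketch of how an IP block might call the helper above from its own late
 * init; my_ip, its callback and the sysfs/debugfs names are hypothetical
 * placeholders rather than the exact code of any particular IP.
 *
 * .. code-block:: c
 *
 *	struct ras_ih_if ih_info = {
 *		.cb = my_ip_process_ras_data_cb,
 *	};
 *	struct ras_fs_if fs_info = {
 *		.sysfs_name = "my_ip_err_count",
 *		.debugfs_name = "my_ip_err_inject",
 *	};
 *	int r;
 *
 *	ih_info.head = fs_info.head = *adev->my_ip.ras_if;
 *	r = amdgpu_ras_late_init(adev, adev->my_ip.ras_if,
 *				 &fs_info, &ih_info);
 */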

/* helper function to remove ras fs node and interrupt handler */
void amdgpu_ras_late_fini(struct amdgpu_device *adev,
			  struct ras_common_if *ras_block,
			  struct ras_ih_if *ih_info)
{
	if (!ras_block || !ih_info)
		return;

	amdgpu_ras_sysfs_remove(adev, ras_block);
	if (ih_info->cb)
		amdgpu_ras_interrupt_remove_handler(adev, ih_info);
	amdgpu_ras_feature_enable(adev, ras_block, 0);
}

/* do some init work after IP late init as dependence.
 * and it runs in resume/gpu reset/booting up cases.
 */
void amdgpu_ras_resume(struct amdgpu_device *adev)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	struct ras_manager *obj, *tmp;

	if (!con)
		return;

	if (con->flags & AMDGPU_RAS_FLAG_INIT_BY_VBIOS) {
		/* Set up all other IPs which are not implemented. There is a
		 * tricky thing that the IP's actual ras error type should be
		 * MULTI_UNCORRECTABLE, but as the driver does not handle it,
		 * ERROR_NONE makes sense anyway.
		 */
		amdgpu_ras_enable_all_features(adev, 1);

		/* We enable ras on all hw_supported blocks, but the boot
		 * parameter might disable some of them and one or more IPs
		 * have not been implemented yet. So we disable them on their
		 * behalf.
		 */
		list_for_each_entry_safe(obj, tmp, &con->head, node) {
			if (!amdgpu_ras_is_supported(adev, obj->head.block)) {
				amdgpu_ras_feature_enable(adev, &obj->head, 0);
				/* there should be no reference left. */
				WARN_ON(alive_obj(obj));
			}
		}
	}

	if (con->flags & AMDGPU_RAS_FLAG_INIT_NEED_RESET) {
		con->flags &= ~AMDGPU_RAS_FLAG_INIT_NEED_RESET;
		/* setup ras obj state as disabled.
		 * for the init_by_vbios case.
		 * if we want to enable ras, just enable it in a normal way.
		 * If we want to disable it, we need to setup the ras obj as
		 * enabled, then issue another TA disable cmd.
		 * See feature_enable_on_boot
		 */
		amdgpu_ras_disable_all_features(adev, 1);
		amdgpu_ras_reset_gpu(adev);
	}
}

void amdgpu_ras_suspend(struct amdgpu_device *adev)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);

	if (!con)
		return;

	amdgpu_ras_disable_all_features(adev, 0);
	/* Make sure all ras objects are disabled. */
	if (con->features)
		amdgpu_ras_disable_all_features(adev, 1);
}

/* do some fini work before IP fini as dependence */
int amdgpu_ras_pre_fini(struct amdgpu_device *adev)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);

	if (!con)
		return 0;

	/* Need to disable ras on all IPs here before ip [hw/sw]fini */
	amdgpu_ras_disable_all_features(adev, 0);
	amdgpu_ras_recovery_fini(adev);
	return 0;
}

int amdgpu_ras_fini(struct amdgpu_device *adev)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);

	if (!con)
		return 0;

	amdgpu_ras_fs_fini(adev);
	amdgpu_ras_interrupt_remove_all(adev);

	WARN(con->features, "Feature mask is not cleared");

	if (con->features)
		amdgpu_ras_disable_all_features(adev, 1);

	amdgpu_ras_set_context(adev, NULL);
	kfree(con);

	return 0;
}

void amdgpu_ras_global_ras_isr(struct amdgpu_device *adev)
{
	uint32_t hw_supported, supported;

	amdgpu_ras_check_supported(adev, &hw_supported, &supported);
	if (!hw_supported)
		return;

	if (atomic_cmpxchg(&amdgpu_ras_in_intr, 0, 1) == 0) {
		DRM_WARN("RAS event of type ERREVENT_ATHUB_INTERRUPT detected!\n");

		amdgpu_ras_reset_gpu(adev);
	}
}