/*
 * Copyright 2018 Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 *
 */
#include <linux/debugfs.h>
#include <linux/list.h>
#include <linux/module.h>
#include <linux/uaccess.h>
#include <linux/reboot.h>
#include <linux/syscalls.h>

#include "amdgpu.h"
#include "amdgpu_ras.h"
#include "amdgpu_atomfirmware.h"
#include "ivsrcid/nbio/irqsrcs_nbif_7_4.h"

const char *ras_error_string[] = {
	"none",
	"parity",
	"single_correctable",
	"multi_uncorrectable",
	"poison",
};

const char *ras_block_string[] = {
	"umc",
	"sdma",
	"gfx",
	"mmhub",
	"athub",
	"pcie_bif",
	"hdp",
	"xgmi_wafl",
	"df",
	"smn",
	"sem",
	"mp0",
	"mp1",
	"fuse",
};

#define ras_err_str(i)		(ras_error_string[ffs(i)])
#define ras_block_str(i)	(ras_block_string[i])
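/*
 * Note: the error types in enum amdgpu_ras_error_type are single-bit
 * flags (hence the ffs() lookup above); e.g. a multi_uncorrectable type
 * with bit value 0x4 maps to ras_error_string[ffs(0x4)], i.e. index 3.
 */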

#define AMDGPU_RAS_FLAG_INIT_BY_VBIOS		1
#define AMDGPU_RAS_FLAG_INIT_NEED_RESET		2
#define RAS_DEFAULT_FLAGS (AMDGPU_RAS_FLAG_INIT_BY_VBIOS)

/* inject address is 52 bits */
#define RAS_UMC_INJECT_ADDR_LIMIT	(0x1ULL << 52)

enum amdgpu_ras_retire_page_reservation {
	AMDGPU_RAS_RETIRE_PAGE_RESERVED,
	AMDGPU_RAS_RETIRE_PAGE_PENDING,
	AMDGPU_RAS_RETIRE_PAGE_FAULT,
};

atomic_t amdgpu_ras_in_intr = ATOMIC_INIT(0);

static bool amdgpu_ras_check_bad_page(struct amdgpu_device *adev,
				uint64_t addr);

static ssize_t amdgpu_ras_debugfs_read(struct file *f, char __user *buf,
					size_t size, loff_t *pos)
{
	struct ras_manager *obj = (struct ras_manager *)file_inode(f)->i_private;
	struct ras_query_if info = {
		.head = obj->head,
	};
	ssize_t s;
	char val[128];

	if (amdgpu_ras_error_query(obj->adev, &info))
		return -EINVAL;

	s = snprintf(val, sizeof(val), "%s: %lu\n%s: %lu\n",
			"ue", info.ue_count,
			"ce", info.ce_count);
	if (*pos >= s)
		return 0;

	s -= *pos;
	s = min_t(u64, s, size);

	if (copy_to_user(buf, &val[*pos], s))
		return -EINVAL;

	*pos += s;

	return s;
}

static const struct file_operations amdgpu_ras_debugfs_ops = {
	.owner = THIS_MODULE,
	.read = amdgpu_ras_debugfs_read,
	.write = NULL,
	.llseek = default_llseek
};

static int amdgpu_ras_find_block_id_by_name(const char *name, int *block_id)
{
	int i;

	for (i = 0; i < ARRAY_SIZE(ras_block_string); i++) {
		*block_id = i;
		if (strcmp(name, ras_block_str(i)) == 0)
			return 0;
	}
	return -EINVAL;
}

static int amdgpu_ras_debugfs_ctrl_parse_data(struct file *f,
		const char __user *buf, size_t size,
		loff_t *pos, struct ras_debug_if *data)
{
	ssize_t s = min_t(u64, 64, size);
	char str[65];
	char block_name[33];
	char err[9] = "ue";
	int op = -1;
	int block_id;
	uint32_t sub_block;
	u64 address, value;

	if (*pos)
		return -EINVAL;
	*pos = size;

	memset(str, 0, sizeof(str));
	memset(data, 0, sizeof(*data));

	if (copy_from_user(str, buf, s))
		return -EINVAL;

	if (sscanf(str, "disable %32s", block_name) == 1)
		op = 0;
	else if (sscanf(str, "enable %32s %8s", block_name, err) == 2)
		op = 1;
	else if (sscanf(str, "inject %32s %8s", block_name, err) == 2)
		op = 2;
	else if (str[0] && str[1] && str[2] && str[3])
		/* ascii string, but commands are not matched. */
		return -EINVAL;

	if (op != -1) {
		if (amdgpu_ras_find_block_id_by_name(block_name, &block_id))
			return -EINVAL;

		data->head.block = block_id;
		/* only ue and ce errors are supported */
		if (!memcmp("ue", err, 2))
			data->head.type = AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE;
		else if (!memcmp("ce", err, 2))
			data->head.type = AMDGPU_RAS_ERROR__SINGLE_CORRECTABLE;
		else
			return -EINVAL;

		data->op = op;

		if (op == 2) {
			if (sscanf(str, "%*s %*s %*s %u %llu %llu",
						&sub_block, &address, &value) != 3)
				if (sscanf(str, "%*s %*s %*s 0x%x 0x%llx 0x%llx",
							&sub_block, &address, &value) != 3)
					return -EINVAL;
			data->head.sub_block_index = sub_block;
			data->inject.address = address;
			data->inject.value = value;
		}
	} else {
		if (size < sizeof(*data))
			return -EINVAL;

		if (copy_from_user(data, buf, sizeof(*data)))
			return -EINVAL;
	}

	return 0;
}

/**
 * DOC: AMDGPU RAS debugfs control interface
 *
 * It accepts struct ras_debug_if which has two members.
 *
 * First member: ras_debug_if::head or ras_debug_if::inject.
 *
 * head is used to indicate which IP block will be under control.
 *
 * head has four members, they are block, type, sub_block_index, name.
 * block: which IP will be under control.
 * type: what kind of error will be enabled/disabled/injected.
 * sub_block_index: some IPs have subcomponents, e.g., GFX, SDMA.
 * name: the name of the IP.
 *
 * inject has two more members than head, they are address and value.
 * As their names indicate, the inject operation will write the
 * value to the address.
 *
 * The second member: struct ras_debug_if::op.
 * It has three kinds of operations.
 *
 * - 0: disable RAS on the block. Take ::head as its data.
 * - 1: enable RAS on the block. Take ::head as its data.
 * - 2: inject errors on the block. Take ::inject as its data.
 *
 * How to use the interface?
 *
 * Programs
 *
 * Copy the struct ras_debug_if into your code and initialize it.
 * Write the struct to the control node, as sketched below.
 *
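 * A minimal user-space sketch (not a complete program; it assumes that
 * struct ras_debug_if has been copied from amdgpu_ras.h and that card 0
 * is the target):
 *
 * .. code-block:: c
 *
 *	int fd = open("/sys/kernel/debug/dri/0/ras/ras_ctrl", O_WRONLY);
 *	struct ras_debug_if data = { 0 };
 *
 *	data.op = 1;	/* enable */
 *	data.head.block = AMDGPU_RAS_BLOCK__UMC;
 *	data.head.type = AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE;
 *	if (write(fd, &data, sizeof(data)) != sizeof(data))
 *		perror("ras_ctrl");
 *	close(fd);
 *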
 * Shells
 *
 * .. code-block:: bash
 *
 *	echo op block [error [sub_block address value]] > .../ras/ras_ctrl
 *
 * Parameters:
 *
 * op: disable, enable, inject
 *	disable: only block is needed
 *	enable: block and error are needed
 *	inject: error, address, value are needed
 * block: umc, sdma, gfx, .........
 *	see ras_block_string[] for details
 * error: ue, ce
 *	ue: multi_uncorrectable
 *	ce: single_correctable
 * sub_block:
 *	sub block index, pass 0 if there is no sub block
 *
 * Here are some example bash commands:
 *
 * .. code-block:: bash
 *
 *	echo inject umc ue 0x0 0x0 0x0 > /sys/kernel/debug/dri/0/ras/ras_ctrl
 *	echo inject umc ce 0 0 0 > /sys/kernel/debug/dri/0/ras/ras_ctrl
 *	echo disable umc > /sys/kernel/debug/dri/0/ras/ras_ctrl
 *
 * How to check the result?
 *
 * For disable/enable, please check ras features at
 * /sys/class/drm/card[0/1/2...]/device/ras/features
 *
 * For inject, please check the corresponding error count at
 * /sys/class/drm/card[0/1/2...]/device/ras/[gfx/sdma/...]_err_count
 *
 * .. note::
 *	Operations are only allowed on blocks which are supported.
 *	Please check the ras mask at /sys/module/amdgpu/parameters/ras_mask
 *	to see which blocks support RAS on a particular asic.
 *
 */
static ssize_t amdgpu_ras_debugfs_ctrl_write(struct file *f, const char __user *buf,
		size_t size, loff_t *pos)
{
	struct amdgpu_device *adev = (struct amdgpu_device *)file_inode(f)->i_private;
	struct ras_debug_if data;
	int ret = 0;

	ret = amdgpu_ras_debugfs_ctrl_parse_data(f, buf, size, pos, &data);
	if (ret)
		return -EINVAL;

	if (!amdgpu_ras_is_supported(adev, data.head.block))
		return -EINVAL;

	switch (data.op) {
	case 0:
		ret = amdgpu_ras_feature_enable(adev, &data.head, 0);
		break;
	case 1:
		ret = amdgpu_ras_feature_enable(adev, &data.head, 1);
		break;
	case 2:
		if ((data.inject.address >= adev->gmc.mc_vram_size) ||
		    (data.inject.address >= RAS_UMC_INJECT_ADDR_LIMIT)) {
			ret = -EINVAL;
			break;
		}

		/* umc ce/ue error injection for a bad page is not allowed */
		if ((data.head.block == AMDGPU_RAS_BLOCK__UMC) &&
		    amdgpu_ras_check_bad_page(adev, data.inject.address)) {
			DRM_WARN("RAS WARN: 0x%llx has been marked as bad before error injection!\n",
					data.inject.address);
			break;
		}

		/* data.inject.address is an offset rather than an absolute gpu address */
		ret = amdgpu_ras_error_inject(adev, &data.inject);
		break;
	default:
		ret = -EINVAL;
		break;
	}

	if (ret)
		return -EINVAL;

	return size;
}

/**
 * DOC: AMDGPU RAS debugfs EEPROM table reset interface
 *
 * Some boards contain an EEPROM which is used to persistently store a list of
 * bad pages which experienced ECC errors in vram. This interface provides
 * a way to reset the EEPROM, e.g., after testing error injection.
 *
 * Usage:
 *
 * .. code-block:: bash
 *
 *	echo 1 > ../ras/ras_eeprom_reset
 *
 * will reset the EEPROM table to 0 entries.
 *
 */
static ssize_t amdgpu_ras_debugfs_eeprom_write(struct file *f, const char __user *buf,
		size_t size, loff_t *pos)
{
	struct amdgpu_device *adev = (struct amdgpu_device *)file_inode(f)->i_private;
	int ret;

	ret = amdgpu_ras_eeprom_reset_table(&adev->psp.ras.ras->eeprom_control);

	return ret == 1 ? size : -EIO;
}

static const struct file_operations amdgpu_ras_debugfs_ctrl_ops = {
	.owner = THIS_MODULE,
	.read = NULL,
	.write = amdgpu_ras_debugfs_ctrl_write,
	.llseek = default_llseek
};

static const struct file_operations amdgpu_ras_debugfs_eeprom_ops = {
	.owner = THIS_MODULE,
	.read = NULL,
	.write = amdgpu_ras_debugfs_eeprom_write,
	.llseek = default_llseek
};

/**
 * DOC: AMDGPU RAS sysfs Error Count Interface
 *
 * It allows the user to read the error count for each IP block on the gpu through
 * /sys/class/drm/card[0/1/2...]/device/ras/[gfx/sdma/...]_err_count
 *
 * It outputs multiple lines which report the uncorrected (ue) and corrected
 * (ce) error counts.
 *
 * The format of one line is below,
 *
 * [ce|ue]: count
 *
 * Example:
 *
 * .. code-block:: bash
 *
 *	ue: 0
 *	ce: 1
 *
 */
static ssize_t amdgpu_ras_sysfs_read(struct device *dev,
		struct device_attribute *attr, char *buf)
{
	struct ras_manager *obj = container_of(attr, struct ras_manager, sysfs_attr);
	struct ras_query_if info = {
		.head = obj->head,
	};

	if (amdgpu_ras_error_query(obj->adev, &info))
		return -EINVAL;

	return snprintf(buf, PAGE_SIZE, "%s: %lu\n%s: %lu\n",
			"ue", info.ue_count,
			"ce", info.ce_count);
}

/* obj begin */

#define get_obj(obj) do { (obj)->use++; } while (0)
#define alive_obj(obj) ((obj)->use)

static inline void put_obj(struct ras_manager *obj)
{
	if (obj && --obj->use == 0)
		list_del(&obj->node);
	if (obj && obj->use < 0) {
		DRM_ERROR("RAS ERROR: Unbalanced obj(%s) use\n", obj->head.name);
	}
}

/* make one obj and return it. */
static struct ras_manager *amdgpu_ras_create_obj(struct amdgpu_device *adev,
		struct ras_common_if *head)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	struct ras_manager *obj;

	if (!con)
		return NULL;

	if (head->block >= AMDGPU_RAS_BLOCK_COUNT)
		return NULL;

	obj = &con->objs[head->block];
	/* already exists. return obj? */
	if (alive_obj(obj))
		return NULL;

	obj->head = *head;
	obj->adev = adev;
	list_add(&obj->node, &con->head);
	get_obj(obj);

	return obj;
}

/* return an obj equal to head, or the first when head is NULL */
struct ras_manager *amdgpu_ras_find_obj(struct amdgpu_device *adev,
		struct ras_common_if *head)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	struct ras_manager *obj;
	int i;

	if (!con)
		return NULL;

	if (head) {
		if (head->block >= AMDGPU_RAS_BLOCK_COUNT)
			return NULL;

		obj = &con->objs[head->block];

		if (alive_obj(obj)) {
			WARN_ON(head->block != obj->head.block);
			return obj;
		}
	} else {
		for (i = 0; i < AMDGPU_RAS_BLOCK_COUNT; i++) {
			obj = &con->objs[i];
			if (alive_obj(obj)) {
				WARN_ON(i != obj->head.block);
				return obj;
			}
		}
	}

	return NULL;
}
/* obj end */

/* feature ctl begin */
static int amdgpu_ras_is_feature_allowed(struct amdgpu_device *adev,
		struct ras_common_if *head)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);

	return con->hw_supported & BIT(head->block);
}

static int amdgpu_ras_is_feature_enabled(struct amdgpu_device *adev,
		struct ras_common_if *head)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);

	return con->features & BIT(head->block);
}

/*
 * if obj is not created, then create one.
 * set feature enable flag.
 */
static int __amdgpu_ras_feature_enable(struct amdgpu_device *adev,
		struct ras_common_if *head, int enable)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	struct ras_manager *obj = amdgpu_ras_find_obj(adev, head);

	/* If the hardware does not support ras, then do not create the obj.
	 * But if the hardware does support ras, we can create the obj.
	 * The ras framework checks con->hw_supported to see if it needs to do
	 * the corresponding initialization.
	 * An IP checks con->supported to see if it needs to disable ras.
	 */
	if (!amdgpu_ras_is_feature_allowed(adev, head))
		return 0;
	if (!(!!enable ^ !!amdgpu_ras_is_feature_enabled(adev, head)))
		return 0;

	if (enable) {
		if (!obj) {
			obj = amdgpu_ras_create_obj(adev, head);
			if (!obj)
				return -EINVAL;
		} else {
			/* In case we create obj somewhere else */
			get_obj(obj);
		}
		con->features |= BIT(head->block);
	} else {
		if (obj && amdgpu_ras_is_feature_enabled(adev, head)) {
			con->features &= ~BIT(head->block);
			put_obj(obj);
		}
	}

	return 0;
}

/* wrapper of psp_ras_enable_features */
int amdgpu_ras_feature_enable(struct amdgpu_device *adev,
		struct ras_common_if *head, bool enable)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	union ta_ras_cmd_input info;
	int ret;

	if (!con)
		return -EINVAL;

	if (!enable) {
		info.disable_features = (struct ta_ras_disable_features_input) {
			.block_id = amdgpu_ras_block_to_ta(head->block),
			.error_type = amdgpu_ras_error_to_ta(head->type),
		};
	} else {
		info.enable_features = (struct ta_ras_enable_features_input) {
			.block_id = amdgpu_ras_block_to_ta(head->block),
			.error_type = amdgpu_ras_error_to_ta(head->type),
		};
	}

	/* Do not enable if it is not allowed. */
	WARN_ON(enable && !amdgpu_ras_is_feature_allowed(adev, head));
	/* Are we already in the state we are going to set? */
	if (!(!!enable ^ !!amdgpu_ras_is_feature_enabled(adev, head)))
		return 0;

	if (!amdgpu_ras_intr_triggered()) {
		ret = psp_ras_enable_features(&adev->psp, &info, enable);
		if (ret) {
			DRM_ERROR("RAS ERROR: %s %s feature failed ret %d\n",
					enable ? "enable":"disable",
					ras_block_str(head->block),
					ret);
			if (ret == TA_RAS_STATUS__RESET_NEEDED)
				return -EAGAIN;
			return -EINVAL;
		}
	}

	/* setup the obj */
	__amdgpu_ras_feature_enable(adev, head, enable);

	return 0;
}

/* Only used in device probe stage and called only once. */
int amdgpu_ras_feature_enable_on_boot(struct amdgpu_device *adev,
		struct ras_common_if *head, bool enable)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	int ret;

	if (!con)
		return -EINVAL;

	if (con->flags & AMDGPU_RAS_FLAG_INIT_BY_VBIOS) {
		if (enable) {
			/* There is no harm in issuing a ras TA cmd regardless
			 * of the current ras state.
			 * If current state == target state, it will do nothing.
			 * But sometimes it requests the driver to reset and
			 * repost with error code -EAGAIN.
			 */
			ret = amdgpu_ras_feature_enable(adev, head, 1);
			/* With an old ras TA, we might fail to enable ras.
			 * Log it and just set up the object.
			 * TODO: need to remove this WA in the future.
			 */
			if (ret == -EINVAL) {
				ret = __amdgpu_ras_feature_enable(adev, head, 1);
				if (!ret)
					DRM_INFO("RAS INFO: %s setup object\n",
						ras_block_str(head->block));
			}
		} else {
			/* setup the object then issue a ras TA disable cmd. */
			ret = __amdgpu_ras_feature_enable(adev, head, 1);
			if (ret)
				return ret;

			ret = amdgpu_ras_feature_enable(adev, head, 0);
		}
	} else
		ret = amdgpu_ras_feature_enable(adev, head, enable);

	return ret;
}

static int amdgpu_ras_disable_all_features(struct amdgpu_device *adev,
		bool bypass)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	struct ras_manager *obj, *tmp;

	list_for_each_entry_safe(obj, tmp, &con->head, node) {
		/* bypass psp.
		 * aka just release the obj and corresponding flags
		 */
		if (bypass) {
			if (__amdgpu_ras_feature_enable(adev, &obj->head, 0))
				break;
		} else {
			if (amdgpu_ras_feature_enable(adev, &obj->head, 0))
				break;
		}
	}

	return con->features;
}

static int amdgpu_ras_enable_all_features(struct amdgpu_device *adev,
		bool bypass)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	int ras_block_count = AMDGPU_RAS_BLOCK_COUNT;
	int i;
	const enum amdgpu_ras_error_type default_ras_type =
		AMDGPU_RAS_ERROR__NONE;

	for (i = 0; i < ras_block_count; i++) {
		struct ras_common_if head = {
			.block = i,
			.type = default_ras_type,
			.sub_block_index = 0,
		};
		strcpy(head.name, ras_block_str(i));
		if (bypass) {
			/*
			 * bypass psp. vbios has enabled ras for us,
			 * so just create the obj
			 */
			if (__amdgpu_ras_feature_enable(adev, &head, 1))
				break;
		} else {
			if (amdgpu_ras_feature_enable(adev, &head, 1))
				break;
		}
	}

	return con->features;
}
/* feature ctl end */

/* query/inject/cure begin */
int amdgpu_ras_error_query(struct amdgpu_device *adev,
		struct ras_query_if *info)
{
	struct ras_manager *obj = amdgpu_ras_find_obj(adev, &info->head);
	struct ras_err_data err_data = {0, 0, 0, NULL};
	int i;

	if (!obj)
		return -EINVAL;

	switch (info->head.block) {
	case AMDGPU_RAS_BLOCK__UMC:
		if (adev->umc.funcs->query_ras_error_count)
			adev->umc.funcs->query_ras_error_count(adev, &err_data);
		/* umc query_ras_error_address is also responsible for clearing
		 * error status
		 */
		if (adev->umc.funcs->query_ras_error_address)
			adev->umc.funcs->query_ras_error_address(adev, &err_data);
		break;
	case AMDGPU_RAS_BLOCK__SDMA:
		if (adev->sdma.funcs->query_ras_error_count) {
			for (i = 0; i < adev->sdma.num_instances; i++)
				adev->sdma.funcs->query_ras_error_count(adev, i,
						&err_data);
		}
		break;
	case AMDGPU_RAS_BLOCK__GFX:
		if (adev->gfx.funcs->query_ras_error_count)
			adev->gfx.funcs->query_ras_error_count(adev, &err_data);
		break;
	case AMDGPU_RAS_BLOCK__MMHUB:
		if (adev->mmhub.funcs->query_ras_error_count)
			adev->mmhub.funcs->query_ras_error_count(adev, &err_data);
		break;
	case AMDGPU_RAS_BLOCK__PCIE_BIF:
		if (adev->nbio.funcs->query_ras_error_count)
			adev->nbio.funcs->query_ras_error_count(adev, &err_data);
		break;
	default:
		break;
	}

	obj->err_data.ue_count += err_data.ue_count;
	obj->err_data.ce_count += err_data.ce_count;

	info->ue_count = obj->err_data.ue_count;
	info->ce_count = obj->err_data.ce_count;

	if (err_data.ce_count) {
		dev_info(adev->dev, "%ld correctable errors detected in %s block\n",
			 obj->err_data.ce_count, ras_block_str(info->head.block));
	}
	if (err_data.ue_count) {
		dev_info(adev->dev, "%ld uncorrectable errors detected in %s block\n",
			 obj->err_data.ue_count, ras_block_str(info->head.block));
	}

	return 0;
}

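/* Translate a vram address into an XGMI-relative physical address by adding
 * this node's DF DRAM base; the address is returned unchanged when the DF
 * helpers are not available.
 */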
uint64_t get_xgmi_relative_phy_addr(struct amdgpu_device *adev, uint64_t addr)
{
	uint32_t df_inst_id;

	if ((!adev->df.funcs) ||
	    (!adev->df.funcs->get_df_inst_id) ||
	    (!adev->df.funcs->get_dram_base_addr))
		return addr;

	df_inst_id = adev->df.funcs->get_df_inst_id(adev);

	return addr + adev->df.funcs->get_dram_base_addr(adev, df_inst_id);
}

/* wrapper of psp_ras_trigger_error */
int amdgpu_ras_error_inject(struct amdgpu_device *adev,
		struct ras_inject_if *info)
{
	struct ras_manager *obj = amdgpu_ras_find_obj(adev, &info->head);
	struct ta_ras_trigger_error_input block_info = {
		.block_id = amdgpu_ras_block_to_ta(info->head.block),
		.inject_error_type = amdgpu_ras_error_to_ta(info->head.type),
		.sub_block_index = info->head.sub_block_index,
		.address = info->address,
		.value = info->value,
	};
	int ret = 0;

	if (!obj)
		return -EINVAL;

	/* Calculate XGMI relative offset */
	if (adev->gmc.xgmi.num_physical_nodes > 1) {
		block_info.address = get_xgmi_relative_phy_addr(adev,
				block_info.address);
	}

	switch (info->head.block) {
	case AMDGPU_RAS_BLOCK__GFX:
		if (adev->gfx.funcs->ras_error_inject)
			ret = adev->gfx.funcs->ras_error_inject(adev, info);
		else
			ret = -EINVAL;
		break;
	case AMDGPU_RAS_BLOCK__UMC:
	case AMDGPU_RAS_BLOCK__MMHUB:
	case AMDGPU_RAS_BLOCK__XGMI_WAFL:
	case AMDGPU_RAS_BLOCK__PCIE_BIF:
		ret = psp_ras_trigger_error(&adev->psp, &block_info);
		break;
	default:
		DRM_INFO("%s error injection is not supported yet\n",
			 ras_block_str(info->head.block));
		ret = -EINVAL;
	}

	if (ret)
		DRM_ERROR("RAS ERROR: inject %s error failed ret %d\n",
				ras_block_str(info->head.block),
				ret);

	return ret;
}

int amdgpu_ras_error_cure(struct amdgpu_device *adev,
		struct ras_cure_if *info)
{
	/* psp fw has no cure interface for now. */
	return 0;
}

/* get the total error counts on all IPs */
unsigned long amdgpu_ras_query_error_count(struct amdgpu_device *adev,
		bool is_ce)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	struct ras_manager *obj;
	struct ras_err_data data = {0, 0};

	if (!con)
		return 0;

	list_for_each_entry(obj, &con->head, node) {
		struct ras_query_if info = {
			.head = obj->head,
		};

		if (amdgpu_ras_error_query(adev, &info))
			return 0;

		data.ce_count += info.ce_count;
		data.ue_count += info.ue_count;
	}

	return is_ce ? data.ce_count : data.ue_count;
}
/* query/inject/cure end */


/* sysfs begin */

static int amdgpu_ras_badpages_read(struct amdgpu_device *adev,
		struct ras_badpage **bps, unsigned int *count);

static char *amdgpu_ras_badpage_flags_str(unsigned int flags)
{
	switch (flags) {
	case AMDGPU_RAS_RETIRE_PAGE_RESERVED:
		return "R";
	case AMDGPU_RAS_RETIRE_PAGE_PENDING:
		return "P";
	case AMDGPU_RAS_RETIRE_PAGE_FAULT:
	default:
		return "F";
	}
}

/**
 * DOC: AMDGPU RAS sysfs gpu_vram_bad_pages Interface
 *
 * It allows the user to read the bad pages of vram on the gpu through
 * /sys/class/drm/card[0/1/2...]/device/ras/gpu_vram_bad_pages
 *
 * It outputs multiple lines, and each line stands for one gpu page.
 *
 * The format of one line is below,
 * gpu pfn : gpu page size : flags
 *
 * gpu pfn and gpu page size are printed in hex format.
 * flags can be one of the characters below,
 *
 * R: reserved, this gpu page is reserved and not able to be used.
 *
 * P: pending for reserve, this gpu page is marked as bad, and will be
 * reserved in the next window of page_reserve.
 *
 * F: unable to reserve. this gpu page can't be reserved due to some reasons.
 *
 * Examples:
 *
 * .. code-block:: bash
 *
 *	0x00000001 : 0x00001000 : R
 *	0x00000002 : 0x00001000 : P
 *
 */

static ssize_t amdgpu_ras_sysfs_badpages_read(struct file *f,
		struct kobject *kobj, struct bin_attribute *attr,
		char *buf, loff_t ppos, size_t count)
{
	struct amdgpu_ras *con =
		container_of(attr, struct amdgpu_ras, badpages_attr);
	struct amdgpu_device *adev = con->adev;
	const unsigned int element_size =
		sizeof("0xabcdabcd : 0x12345678 : R\n") - 1;
	unsigned int start = div64_ul(ppos + element_size - 1, element_size);
	unsigned int end = div64_ul(ppos + count - 1, element_size);
	ssize_t s = 0;
	struct ras_badpage *bps = NULL;
	unsigned int bps_count = 0;

	memset(buf, 0, count);

	if (amdgpu_ras_badpages_read(adev, &bps, &bps_count))
		return 0;

	for (; start < end && start < bps_count; start++)
		s += scnprintf(&buf[s], element_size + 1,
				"0x%08x : 0x%08x : %1s\n",
				bps[start].bp,
				bps[start].size,
				amdgpu_ras_badpage_flags_str(bps[start].flags));

	kfree(bps);

	return s;
}

static ssize_t amdgpu_ras_sysfs_features_read(struct device *dev,
		struct device_attribute *attr, char *buf)
{
	struct amdgpu_ras *con =
		container_of(attr, struct amdgpu_ras, features_attr);

	return scnprintf(buf, PAGE_SIZE, "feature mask: 0x%x\n", con->features);
}

static int amdgpu_ras_sysfs_create_feature_node(struct amdgpu_device *adev)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	struct attribute *attrs[] = {
		&con->features_attr.attr,
		NULL
	};
	struct bin_attribute *bin_attrs[] = {
		&con->badpages_attr,
		NULL
	};
	struct attribute_group group = {
		.name = "ras",
		.attrs = attrs,
		.bin_attrs = bin_attrs,
	};

	con->features_attr = (struct device_attribute) {
		.attr = {
			.name = "features",
			.mode = S_IRUGO,
		},
		.show = amdgpu_ras_sysfs_features_read,
	};

	con->badpages_attr = (struct bin_attribute) {
		.attr = {
			.name = "gpu_vram_bad_pages",
			.mode = S_IRUGO,
		},
		.size = 0,
		.private = NULL,
		.read = amdgpu_ras_sysfs_badpages_read,
	};

	sysfs_attr_init(attrs[0]);
	sysfs_bin_attr_init(bin_attrs[0]);

	return sysfs_create_group(&adev->dev->kobj, &group);
}

static int amdgpu_ras_sysfs_remove_feature_node(struct amdgpu_device *adev)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	struct attribute *attrs[] = {
		&con->features_attr.attr,
		NULL
	};
	struct bin_attribute *bin_attrs[] = {
		&con->badpages_attr,
		NULL
	};
	struct attribute_group group = {
		.name = "ras",
		.attrs = attrs,
		.bin_attrs = bin_attrs,
	};

	sysfs_remove_group(&adev->dev->kobj, &group);

	return 0;
}

int amdgpu_ras_sysfs_create(struct amdgpu_device *adev,
		struct ras_fs_if *head)
{
	struct ras_manager *obj = amdgpu_ras_find_obj(adev, &head->head);

	if (!obj || obj->attr_inuse)
		return -EINVAL;

	get_obj(obj);

	memcpy(obj->fs_data.sysfs_name,
			head->sysfs_name,
			sizeof(obj->fs_data.sysfs_name));

	obj->sysfs_attr = (struct device_attribute){
		.attr = {
			.name = obj->fs_data.sysfs_name,
			.mode = S_IRUGO,
		},
		.show = amdgpu_ras_sysfs_read,
	};
	sysfs_attr_init(&obj->sysfs_attr.attr);

	if (sysfs_add_file_to_group(&adev->dev->kobj,
				&obj->sysfs_attr.attr,
				"ras")) {
		put_obj(obj);
		return -EINVAL;
	}

	obj->attr_inuse = 1;

	return 0;
}

int amdgpu_ras_sysfs_remove(struct amdgpu_device *adev,
		struct ras_common_if *head)
{
	struct ras_manager *obj = amdgpu_ras_find_obj(adev, head);

	if (!obj || !obj->attr_inuse)
		return -EINVAL;

	sysfs_remove_file_from_group(&adev->dev->kobj,
				&obj->sysfs_attr.attr,
				"ras");
	obj->attr_inuse = 0;
	put_obj(obj);

	return 0;
}

static int amdgpu_ras_sysfs_remove_all(struct amdgpu_device *adev)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	struct ras_manager *obj, *tmp;

	list_for_each_entry_safe(obj, tmp, &con->head, node) {
		amdgpu_ras_sysfs_remove(adev, &obj->head);
	}

	amdgpu_ras_sysfs_remove_feature_node(adev);

	return 0;
}
/* sysfs end */

/**
 * DOC: AMDGPU RAS Reboot Behavior for Unrecoverable Errors
 *
 * Normally when there is an uncorrectable error, the driver will reset
 * the GPU to recover. However, in the event of an unrecoverable error,
 * the driver provides an interface to reboot the system automatically.
 *
 * The following file in debugfs provides that interface:
 * /sys/kernel/debug/dri/[0/1/2...]/ras/auto_reboot
 *
 * Usage:
 *
 * .. code-block:: bash
 *
 *	echo true > .../ras/auto_reboot
 *
 */
/* debugfs begin */
static void amdgpu_ras_debugfs_create_ctrl_node(struct amdgpu_device *adev)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	struct drm_minor *minor = adev->ddev->primary;

	con->dir = debugfs_create_dir("ras", minor->debugfs_root);
	debugfs_create_file("ras_ctrl", S_IWUGO | S_IRUGO, con->dir,
				adev, &amdgpu_ras_debugfs_ctrl_ops);
	debugfs_create_file("ras_eeprom_reset", S_IWUGO | S_IRUGO, con->dir,
				adev, &amdgpu_ras_debugfs_eeprom_ops);

	/*
	 * After an uncorrectable error happens, GPU recovery will usually be
	 * scheduled. But due to the known problem of GPU recovery failing to
	 * bring the GPU back, the interface below provides a direct way for
	 * the user to have the system rebooted automatically when an
	 * ERREVENT_ATHUB_INTERRUPT is generated; in that case the normal GPU
	 * recovery routine will never be called.
	 */
	debugfs_create_bool("auto_reboot", S_IWUGO | S_IRUGO, con->dir,
				&con->reboot);
}

void amdgpu_ras_debugfs_create(struct amdgpu_device *adev,
		struct ras_fs_if *head)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	struct ras_manager *obj = amdgpu_ras_find_obj(adev, &head->head);

	if (!obj || obj->ent)
		return;

	get_obj(obj);

	memcpy(obj->fs_data.debugfs_name,
			head->debugfs_name,
			sizeof(obj->fs_data.debugfs_name));

	obj->ent = debugfs_create_file(obj->fs_data.debugfs_name,
				       S_IWUGO | S_IRUGO, con->dir, obj,
				       &amdgpu_ras_debugfs_ops);
}

void amdgpu_ras_debugfs_remove(struct amdgpu_device *adev,
		struct ras_common_if *head)
{
	struct ras_manager *obj = amdgpu_ras_find_obj(adev, head);

	if (!obj || !obj->ent)
		return;

	debugfs_remove(obj->ent);
	obj->ent = NULL;
	put_obj(obj);
}

static void amdgpu_ras_debugfs_remove_all(struct amdgpu_device *adev)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	struct ras_manager *obj, *tmp;

	list_for_each_entry_safe(obj, tmp, &con->head, node) {
		amdgpu_ras_debugfs_remove(adev, &obj->head);
	}

	debugfs_remove_recursive(con->dir);
	con->dir = NULL;
}
/* debugfs end */

/* ras fs */

static int amdgpu_ras_fs_init(struct amdgpu_device *adev)
{
	amdgpu_ras_sysfs_create_feature_node(adev);
	amdgpu_ras_debugfs_create_ctrl_node(adev);

	return 0;
}

static int amdgpu_ras_fs_fini(struct amdgpu_device *adev)
{
	amdgpu_ras_debugfs_remove_all(adev);
	amdgpu_ras_sysfs_remove_all(adev);
	return 0;
}
/* ras fs end */

/* ih begin */
static void amdgpu_ras_interrupt_handler(struct ras_manager *obj)
{
	struct ras_ih_data *data = &obj->ih_data;
	struct amdgpu_iv_entry entry;
	int ret;
	struct ras_err_data err_data = {0, 0, 0, NULL};

	while (data->rptr != data->wptr) {
		rmb();
		memcpy(&entry, &data->ring[data->rptr],
				data->element_size);

		wmb();
		data->rptr = (data->aligned_element_size +
				data->rptr) % data->ring_size;

		/* Let the IP handle its data; maybe we need to get the output
		 * from the callback to update the error type/count, etc.
		 */
		if (data->cb) {
			ret = data->cb(obj->adev, &err_data, &entry);
			/* ue will trigger an interrupt, and in that case
			 * we need to do a reset to recover the whole system.
			 * But leave the IP to do that recovery; here we just
			 * dispatch the error.
			 */
			if (ret == AMDGPU_RAS_SUCCESS) {
				/* these counts could be left as 0 if
				 * some blocks do not count error number
				 */
				obj->err_data.ue_count += err_data.ue_count;
				obj->err_data.ce_count += err_data.ce_count;
			}
		}
	}
}

static void amdgpu_ras_interrupt_process_handler(struct work_struct *work)
{
	struct ras_ih_data *data =
		container_of(work, struct ras_ih_data, ih_work);
	struct ras_manager *obj =
		container_of(data, struct ras_manager, ih_data);

	amdgpu_ras_interrupt_handler(obj);
}

int amdgpu_ras_interrupt_dispatch(struct amdgpu_device *adev,
		struct ras_dispatch_if *info)
{
	struct ras_manager *obj = amdgpu_ras_find_obj(adev, &info->head);
	struct ras_ih_data *data = &obj->ih_data;

	if (!obj)
		return -EINVAL;

	if (data->inuse == 0)
		return 0;

	/* Might be overflow... */
	memcpy(&data->ring[data->wptr], info->entry,
			data->element_size);

	wmb();
	data->wptr = (data->aligned_element_size +
			data->wptr) % data->ring_size;

	schedule_work(&data->ih_work);

	return 0;
}

int amdgpu_ras_interrupt_remove_handler(struct amdgpu_device *adev,
		struct ras_ih_if *info)
{
	struct ras_manager *obj = amdgpu_ras_find_obj(adev, &info->head);
	struct ras_ih_data *data;

	if (!obj)
		return -EINVAL;

	data = &obj->ih_data;
	if (data->inuse == 0)
		return 0;

	cancel_work_sync(&data->ih_work);

	kfree(data->ring);
	memset(data, 0, sizeof(*data));
	put_obj(obj);

	return 0;
}

int amdgpu_ras_interrupt_add_handler(struct amdgpu_device *adev,
		struct ras_ih_if *info)
{
	struct ras_manager *obj = amdgpu_ras_find_obj(adev, &info->head);
	struct ras_ih_data *data;

	if (!obj) {
		/* in case we register the IH before enabling the ras feature */
		obj = amdgpu_ras_create_obj(adev, &info->head);
		if (!obj)
			return -EINVAL;
	} else
		get_obj(obj);

	data = &obj->ih_data;
	/* add the callback, etc. */
	*data = (struct ras_ih_data) {
		.inuse = 0,
		.cb = info->cb,
		.element_size = sizeof(struct amdgpu_iv_entry),
		.rptr = 0,
		.wptr = 0,
	};

	INIT_WORK(&data->ih_work, amdgpu_ras_interrupt_process_handler);

	data->aligned_element_size = ALIGN(data->element_size, 8);
	/* the ring can store 64 iv entries. */
	data->ring_size = 64 * data->aligned_element_size;
	data->ring = kmalloc(data->ring_size, GFP_KERNEL);
	if (!data->ring) {
		put_obj(obj);
		return -ENOMEM;
	}

	/* IH is ready */
	data->inuse = 1;

	return 0;
}

static int amdgpu_ras_interrupt_remove_all(struct amdgpu_device *adev)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	struct ras_manager *obj, *tmp;

	list_for_each_entry_safe(obj, tmp, &con->head, node) {
		struct ras_ih_if info = {
			.head = obj->head,
		};
		amdgpu_ras_interrupt_remove_handler(adev, &info);
	}

	return 0;
}
/* ih end */

/* traverse all IPs except NBIO to query error counters */
static void amdgpu_ras_log_on_err_counter(struct amdgpu_device *adev)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	struct ras_manager *obj;

	if (!con)
		return;

	list_for_each_entry(obj, &con->head, node) {
		struct ras_query_if info = {
			.head = obj->head,
		};

		/*
		 * The PCIE_BIF IP has a different isr for the ras controller
		 * interrupt, and the specific ras counter query will be done
		 * in that isr. So skip such blocks from the common sync flood
		 * interrupt isr calling.
		 */
		if (info.head.block == AMDGPU_RAS_BLOCK__PCIE_BIF)
			continue;

		amdgpu_ras_error_query(adev, &info);
	}
}

/* recovery begin */

/* return 0 on success.
 * caller needs to free bps.
 */
static int amdgpu_ras_badpages_read(struct amdgpu_device *adev,
		struct ras_badpage **bps, unsigned int *count)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	struct ras_err_handler_data *data;
	int i = 0;
	int ret = 0;

	if (!con || !con->eh_data || !bps || !count)
		return -EINVAL;

	mutex_lock(&con->recovery_lock);
	data = con->eh_data;
	if (!data || data->count == 0) {
		*bps = NULL;
		ret = -EINVAL;
		goto out;
	}

	*bps = kmalloc(sizeof(struct ras_badpage) * data->count, GFP_KERNEL);
	if (!*bps) {
		ret = -ENOMEM;
		goto out;
	}

	for (; i < data->count; i++) {
		(*bps)[i] = (struct ras_badpage){
			.bp = data->bps[i].retired_page,
			.size = AMDGPU_GPU_PAGE_SIZE,
			.flags = AMDGPU_RAS_RETIRE_PAGE_RESERVED,
		};

		if (data->last_reserved <= i)
			(*bps)[i].flags = AMDGPU_RAS_RETIRE_PAGE_PENDING;
		else if (data->bps_bo[i] == NULL)
			(*bps)[i].flags = AMDGPU_RAS_RETIRE_PAGE_FAULT;
	}

	*count = data->count;
out:
	mutex_unlock(&con->recovery_lock);
	return ret;
}

static void amdgpu_ras_do_recovery(struct work_struct *work)
{
	struct amdgpu_ras *ras =
		container_of(work, struct amdgpu_ras, recovery_work);

	/*
	 * Query and print non zero error counter per IP block for
	 * awareness before recovering GPU.
	 */
	amdgpu_ras_log_on_err_counter(ras->adev);

	if (amdgpu_device_should_recover_gpu(ras->adev))
		amdgpu_device_gpu_recover(ras->adev, 0);
	atomic_set(&ras->in_recovery, 0);
}

/* alloc/realloc bps array */
static int amdgpu_ras_realloc_eh_data_space(struct amdgpu_device *adev,
		struct ras_err_handler_data *data, int pages)
{
	unsigned int old_space = data->count + data->space_left;
	unsigned int new_space = old_space + pages;
	unsigned int align_space = ALIGN(new_space, 512);
	void *bps = kmalloc(align_space * sizeof(*data->bps), GFP_KERNEL);
	struct amdgpu_bo **bps_bo =
			kmalloc(align_space * sizeof(*data->bps_bo), GFP_KERNEL);

	if (!bps || !bps_bo) {
		kfree(bps);
		kfree(bps_bo);
		return -ENOMEM;
	}

	if (data->bps) {
		memcpy(bps, data->bps,
				data->count * sizeof(*data->bps));
		kfree(data->bps);
	}
	if (data->bps_bo) {
		memcpy(bps_bo, data->bps_bo,
				data->count * sizeof(*data->bps_bo));
		kfree(data->bps_bo);
	}

	data->bps = bps;
	data->bps_bo = bps_bo;
	data->space_left += align_space - old_space;
	return 0;
}

/* it deals with vram only. */
int amdgpu_ras_add_bad_pages(struct amdgpu_device *adev,
		struct eeprom_table_record *bps, int pages)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	struct ras_err_handler_data *data;
	int ret = 0;

	if (!con || !con->eh_data || !bps || pages <= 0)
		return 0;

	mutex_lock(&con->recovery_lock);
	data = con->eh_data;
	if (!data)
		goto out;

	if (data->space_left <= pages)
		if (amdgpu_ras_realloc_eh_data_space(adev, data, pages)) {
			ret = -ENOMEM;
			goto out;
		}

	memcpy(&data->bps[data->count], bps, pages * sizeof(*data->bps));
	data->count += pages;
	data->space_left -= pages;

out:
	mutex_unlock(&con->recovery_lock);

	return ret;
}

/*
 * write error record array to eeprom, the function should be
 * protected by recovery_lock
 */
static int amdgpu_ras_save_bad_pages(struct amdgpu_device *adev)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	struct ras_err_handler_data *data;
	struct amdgpu_ras_eeprom_control *control;
	int save_count;

	if (!con || !con->eh_data)
		return 0;

	control = &con->eeprom_control;
	data = con->eh_data;
	save_count = data->count - control->num_recs;
	/* only new entries are saved */
	if (save_count > 0)
		if (amdgpu_ras_eeprom_process_recods(control,
						&data->bps[control->num_recs],
						true,
						save_count)) {
			DRM_ERROR("Failed to save EEPROM table data!");
			return -EIO;
		}

	return 0;
}

/*
 * read error record array in eeprom and reserve enough space for
 * storing new bad pages
 */
static int amdgpu_ras_load_bad_pages(struct amdgpu_device *adev)
{
	struct amdgpu_ras_eeprom_control *control =
		&adev->psp.ras.ras->eeprom_control;
	struct eeprom_table_record *bps = NULL;
	int ret = 0;

	/* no bad page record, skip eeprom access */
	if (!control->num_recs)
		return ret;

	bps = kcalloc(control->num_recs, sizeof(*bps), GFP_KERNEL);
	if (!bps)
		return -ENOMEM;

	if (amdgpu_ras_eeprom_process_recods(control, bps, false,
		control->num_recs)) {
		DRM_ERROR("Failed to load EEPROM table records!");
		ret = -EIO;
		goto out;
	}

	ret = amdgpu_ras_add_bad_pages(adev, bps, control->num_recs);

out:
	kfree(bps);
	return ret;
}

/*
 * check if an address belongs to a bad page
 *
 * Note: this check is only for the umc block
 */
static bool amdgpu_ras_check_bad_page(struct amdgpu_device *adev,
				uint64_t addr)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	struct ras_err_handler_data *data;
	int i;
	bool ret = false;

	if (!con || !con->eh_data)
		return ret;

	mutex_lock(&con->recovery_lock);
	data = con->eh_data;
	if (!data)
		goto out;

	addr >>= AMDGPU_GPU_PAGE_SHIFT;
	for (i = 0; i < data->count; i++)
		if (addr == data->bps[i].retired_page) {
			ret = true;
			goto out;
		}

out:
	mutex_unlock(&con->recovery_lock);
	return ret;
}

/* called in gpu recovery/init */
int amdgpu_ras_reserve_bad_pages(struct amdgpu_device *adev)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	struct ras_err_handler_data *data;
	uint64_t bp;
	struct amdgpu_bo *bo = NULL;
	int i, ret = 0;

	if (!con || !con->eh_data)
		return 0;

	mutex_lock(&con->recovery_lock);
	data = con->eh_data;
	if (!data)
		goto out;
	/* reserve vram at driver post stage. */
	for (i = data->last_reserved; i < data->count; i++) {
		bp = data->bps[i].retired_page;

		/* There are two cases of reserve error that should be ignored:
		 * 1) a ras bad page has been allocated (used by someone);
		 * 2) a ras bad page has been reserved (duplicate error injection
		 *    for one page);
		 */
		if (amdgpu_bo_create_kernel_at(adev, bp << AMDGPU_GPU_PAGE_SHIFT,
					       AMDGPU_GPU_PAGE_SIZE,
					       AMDGPU_GEM_DOMAIN_VRAM,
					       &bo, NULL))
			DRM_WARN("RAS WARN: reserve vram for retired page %llx fail\n", bp);

		data->bps_bo[i] = bo;
		data->last_reserved = i + 1;
		bo = NULL;
	}

	/* continue to save bad pages to eeprom even if reserve_vram fails */
	ret = amdgpu_ras_save_bad_pages(adev);
out:
	mutex_unlock(&con->recovery_lock);
	return ret;
}

/* called when driver unload */
static int amdgpu_ras_release_bad_pages(struct amdgpu_device *adev)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	struct ras_err_handler_data *data;
	struct amdgpu_bo *bo;
	int i;

	if (!con || !con->eh_data)
		return 0;

	mutex_lock(&con->recovery_lock);
	data = con->eh_data;
	if (!data)
		goto out;

	for (i = data->last_reserved - 1; i >= 0; i--) {
		bo = data->bps_bo[i];

		amdgpu_bo_free_kernel(&bo, NULL, NULL);

		data->bps_bo[i] = bo;
		data->last_reserved = i;
	}
out:
	mutex_unlock(&con->recovery_lock);
	return 0;
}

int amdgpu_ras_recovery_init(struct amdgpu_device *adev)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	struct ras_err_handler_data **data;
	int ret;

	if (con)
		data = &con->eh_data;
	else
		return 0;

	*data = kmalloc(sizeof(**data), GFP_KERNEL | __GFP_ZERO);
	if (!*data) {
		ret = -ENOMEM;
		goto out;
	}

	mutex_init(&con->recovery_lock);
	INIT_WORK(&con->recovery_work, amdgpu_ras_do_recovery);
	atomic_set(&con->in_recovery, 0);
	con->adev = adev;

	ret = amdgpu_ras_eeprom_init(&con->eeprom_control);
	if (ret)
		goto free;

	if (con->eeprom_control.num_recs) {
		ret = amdgpu_ras_load_bad_pages(adev);
		if (ret)
			goto free;
		ret = amdgpu_ras_reserve_bad_pages(adev);
		if (ret)
			goto release;
	}

	return 0;

release:
	amdgpu_ras_release_bad_pages(adev);
free:
	kfree((*data)->bps);
	kfree((*data)->bps_bo);
	kfree(*data);
	con->eh_data = NULL;
out:
	DRM_WARN("Failed to initialize ras recovery!\n");

	return ret;
}

static int amdgpu_ras_recovery_fini(struct amdgpu_device *adev)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	struct ras_err_handler_data *data = con->eh_data;

	/* recovery_init failed to init it, fini is useless */
	if (!data)
		return 0;

	cancel_work_sync(&con->recovery_work);
	amdgpu_ras_release_bad_pages(adev);

	mutex_lock(&con->recovery_lock);
	con->eh_data = NULL;
	kfree(data->bps);
	kfree(data->bps_bo);
	kfree(data);
	mutex_unlock(&con->recovery_lock);

	return 0;
}
/* recovery end */

/* return 0 if ras will reset gpu and repost. */
int amdgpu_ras_request_reset_on_boot(struct amdgpu_device *adev,
		unsigned int block)
{
	struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);

	if (!ras)
		return -EINVAL;

	ras->flags |= AMDGPU_RAS_FLAG_INIT_NEED_RESET;
	return 0;
}

/*
 * check hardware's ras ability which will be saved in hw_supported.
 * if hardware does not support ras, we can skip some ras initialization and
 * forbid some ras operations from IP.
 * if software itself, say a boot parameter, limits the ras ability, we still
 * need to allow the IP to do some limited operations, like disable. In such a
 * case, we have to initialize ras as normal, but need to check if the
 * operation is allowed or not in each function.
 */
static void amdgpu_ras_check_supported(struct amdgpu_device *adev,
		uint32_t *hw_supported, uint32_t *supported)
{
	*hw_supported = 0;
	*supported = 0;

	if (amdgpu_sriov_vf(adev) ||
	    (adev->asic_type != CHIP_VEGA20 &&
	     adev->asic_type != CHIP_ARCTURUS))
		return;

	if (adev->is_atom_fw &&
	    (amdgpu_atomfirmware_mem_ecc_supported(adev) ||
	     amdgpu_atomfirmware_sram_ecc_supported(adev)))
		*hw_supported = AMDGPU_RAS_BLOCK_MASK;

	*supported = amdgpu_ras_enable == 0 ?
			0 : *hw_supported & amdgpu_ras_mask;
}

int amdgpu_ras_init(struct amdgpu_device *adev)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	int r;

	if (con)
		return 0;

	con = kmalloc(sizeof(struct amdgpu_ras) +
			sizeof(struct ras_manager) * AMDGPU_RAS_BLOCK_COUNT,
			GFP_KERNEL|__GFP_ZERO);
	if (!con)
		return -ENOMEM;

	con->objs = (struct ras_manager *)(con + 1);

	amdgpu_ras_set_context(adev, con);

	amdgpu_ras_check_supported(adev, &con->hw_supported,
			&con->supported);
	if (!con->hw_supported) {
		amdgpu_ras_set_context(adev, NULL);
		kfree(con);
		return 0;
	}

	con->features = 0;
	INIT_LIST_HEAD(&con->head);
	/* Might need to get this flag from vbios. */
	con->flags = RAS_DEFAULT_FLAGS;

	if (adev->nbio.funcs->init_ras_controller_interrupt) {
		r = adev->nbio.funcs->init_ras_controller_interrupt(adev);
		if (r)
			return r;
	}

	if (adev->nbio.funcs->init_ras_err_event_athub_interrupt) {
		r = adev->nbio.funcs->init_ras_err_event_athub_interrupt(adev);
		if (r)
			return r;
	}

	amdgpu_ras_mask &= AMDGPU_RAS_BLOCK_MASK;

	if (amdgpu_ras_fs_init(adev))
		goto fs_out;

	DRM_INFO("RAS INFO: ras initialized successfully, "
			"hardware ability[%x] ras_mask[%x]\n",
			con->hw_supported, con->supported);
	return 0;
fs_out:
	amdgpu_ras_set_context(adev, NULL);
	kfree(con);

	return -EINVAL;
}

/* helper function to handle common stuff in ip late init phase */
int amdgpu_ras_late_init(struct amdgpu_device *adev,
			 struct ras_common_if *ras_block,
			 struct ras_fs_if *fs_info,
			 struct ras_ih_if *ih_info)
{
	int r;

	/* disable RAS feature per IP block if it is not supported */
	if (!amdgpu_ras_is_supported(adev, ras_block->block)) {
		amdgpu_ras_feature_enable_on_boot(adev, ras_block, 0);
		return 0;
	}

	r = amdgpu_ras_feature_enable_on_boot(adev, ras_block, 1);
	if (r) {
		if (r == -EAGAIN) {
			/* request gpu reset. will run again */
			amdgpu_ras_request_reset_on_boot(adev,
					ras_block->block);
			return 0;
		} else if (adev->in_suspend || adev->in_gpu_reset) {
			/* in resume phase, if we fail to enable ras,
			 * clean up all ras fs nodes, and disable ras */
			goto cleanup;
		} else
			return r;
	}

	/* in resume phase, no need to create ras fs node */
	if (adev->in_suspend || adev->in_gpu_reset)
		return 0;

	if (ih_info->cb) {
		r = amdgpu_ras_interrupt_add_handler(adev, ih_info);
		if (r)
			goto interrupt;
	}

	amdgpu_ras_debugfs_create(adev, fs_info);

	r = amdgpu_ras_sysfs_create(adev, fs_info);
	if (r)
		goto sysfs;

	return 0;
cleanup:
	amdgpu_ras_sysfs_remove(adev, ras_block);
sysfs:
	amdgpu_ras_debugfs_remove(adev, ras_block);
	if (ih_info->cb)
		amdgpu_ras_interrupt_remove_handler(adev, ih_info);
interrupt:
	amdgpu_ras_feature_enable(adev, ras_block, 0);
	return r;
}

/* helper function to remove ras fs node and interrupt handler */
void amdgpu_ras_late_fini(struct amdgpu_device *adev,
			  struct ras_common_if *ras_block,
			  struct ras_ih_if *ih_info)
{
	if (!ras_block || !ih_info)
		return;

	amdgpu_ras_sysfs_remove(adev, ras_block);
	amdgpu_ras_debugfs_remove(adev, ras_block);
	if (ih_info->cb)
		amdgpu_ras_interrupt_remove_handler(adev, ih_info);
	amdgpu_ras_feature_enable(adev, ras_block, 0);
}

/* do some init work after IP late init as a dependency.
 * It runs in the resume/gpu reset/booting up cases.
 */
void amdgpu_ras_resume(struct amdgpu_device *adev)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	struct ras_manager *obj, *tmp;

	if (!con)
		return;

	if (con->flags & AMDGPU_RAS_FLAG_INIT_BY_VBIOS) {
		/* Set up all other IPs which are not implemented. There is a
		 * tricky thing that each IP's actual ras error type should be
		 * MULTI_UNCORRECTABLE, but as the driver does not handle it,
		 * ERROR_NONE makes sense anyway.
		 */
		amdgpu_ras_enable_all_features(adev, 1);

		/* We enable ras on all hw_supported blocks, but the boot
		 * parameter might disable some of them, and one or more IPs
		 * might not be implemented yet. So we disable those on behalf.
		 */
		list_for_each_entry_safe(obj, tmp, &con->head, node) {
			if (!amdgpu_ras_is_supported(adev, obj->head.block)) {
				amdgpu_ras_feature_enable(adev, &obj->head, 0);
				/* there should be no any reference. */
				WARN_ON(alive_obj(obj));
			}
		}
	}

	if (con->flags & AMDGPU_RAS_FLAG_INIT_NEED_RESET) {
		con->flags &= ~AMDGPU_RAS_FLAG_INIT_NEED_RESET;
		/* set up the ras obj state as disabled.
		 * for the init_by_vbios case.
		 * if we want to enable ras, just enable it in a normal way.
		 * If we want to disable it, we need to set up the ras obj as
		 * enabled, then issue another TA disable cmd.
		 * See feature_enable_on_boot
		 */
		amdgpu_ras_disable_all_features(adev, 1);
		amdgpu_ras_reset_gpu(adev);
	}
}

void amdgpu_ras_suspend(struct amdgpu_device *adev)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);

	if (!con)
		return;

	amdgpu_ras_disable_all_features(adev, 0);
	/* Make sure all ras objects are disabled. */
	if (con->features)
		amdgpu_ras_disable_all_features(adev, 1);
}

/* do some fini work before IP fini as a dependency */
int amdgpu_ras_pre_fini(struct amdgpu_device *adev)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);

	if (!con)
		return 0;

	/* Need to disable ras on all IPs here before ip [hw/sw]fini */
	amdgpu_ras_disable_all_features(adev, 0);
	amdgpu_ras_recovery_fini(adev);
	return 0;
}

int amdgpu_ras_fini(struct amdgpu_device *adev)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);

	if (!con)
		return 0;

	amdgpu_ras_fs_fini(adev);
	amdgpu_ras_interrupt_remove_all(adev);

	WARN(con->features, "Feature mask is not cleared");

	if (con->features)
		amdgpu_ras_disable_all_features(adev, 1);

	amdgpu_ras_set_context(adev, NULL);
	kfree(con);

	return 0;
}

void amdgpu_ras_global_ras_isr(struct amdgpu_device *adev)
{
	uint32_t hw_supported, supported;

	amdgpu_ras_check_supported(adev, &hw_supported, &supported);
	if (!hw_supported)
		return;

	if (atomic_cmpxchg(&amdgpu_ras_in_intr, 0, 1) == 0) {
		DRM_WARN("RAS event of type ERREVENT_ATHUB_INTERRUPT detected!\n");

		amdgpu_ras_reset_gpu(adev);
	}
}