drm/amdgpu: add human readable debugfs control support (v2)
[linux-2.6-block.git] / drivers / gpu / drm / amd / amdgpu / amdgpu_ras.c
CommitLineData
c030f2e4 1/*
2 * Copyright 2018 Advanced Micro Devices, Inc.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice shall be included in
12 * all copies or substantial portions of the Software.
13 *
14 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
17 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
18 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
19 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
20 * OTHER DEALINGS IN THE SOFTWARE.
21 *
22 *
23 */
24#include <linux/debugfs.h>
25#include <linux/list.h>
26#include <linux/module.h>
27#include "amdgpu.h"
28#include "amdgpu_ras.h"
29
struct ras_ih_data {
	/* interrupt bottom half */
	struct work_struct ih_work;
	/* nonzero once the ring below is allocated and the handler is live */
	int inuse;
	/* IP callback */
	ras_ih_cb cb;
	/* full of entries */
	unsigned char *ring;
	/* total ring size in bytes (a multiple of aligned_element_size) */
	unsigned int ring_size;
	/* raw size of one entry copied into the ring */
	unsigned int element_size;
	/* element_size rounded up; ring positions advance by this stride */
	unsigned int aligned_element_size;
	/* consumer/producer byte offsets into the ring */
	unsigned int rptr;
	unsigned int wptr;
};

/* names of the per-block sysfs/debugfs nodes owned by a ras_manager */
struct ras_fs_data {
	char sysfs_name[32];
	char debugfs_name[32];
};

/* accumulated error counters for one IP block */
struct ras_err_data {
	/* uncorrectable error count */
	unsigned long ue_count;
	/* correctable error count */
	unsigned long ce_count;
};

struct ras_err_handler_data {
	/* point to bad pages array */
	struct {
		/* bad page frame number */
		unsigned long bp;
		/* BO pinned over the bad page to keep it out of use */
		struct amdgpu_bo *bo;
	} *bps;
	/* the count of entries */
	int count;
	/* the space can place new entries */
	int space_left;
	/* last reserved entry's index + 1 */
	int last_reserved;
};

/* per-IP-block RAS state; one slot per RAS block lives in con->objs[] */
struct ras_manager {
	struct ras_common_if head;
	/* reference count */
	int use;
	/* ras block link */
	struct list_head node;
	/* the device */
	struct amdgpu_device *adev;
	/* debugfs */
	struct dentry *ent;
	/* sysfs */
	struct device_attribute sysfs_attr;
	int attr_inuse;

	/* fs node name */
	struct ras_fs_data fs_data;

	/* IH data */
	struct ras_ih_data ih_data;

	struct ras_err_data err_data;
};
91
/* human readable names for the AMDGPU_RAS_ERROR__* type bits; indexed
 * through ffs() of the mask, so index 0 ("none") covers a zero mask
 */
const char *ras_error_string[] = {
	"none",
	"parity",
	"single_correctable",
	"multi_uncorrectable",
	"poison",
};

/* human readable names for the RAS IP blocks, indexed by block id */
const char *ras_block_string[] = {
	"umc",
	"sdma",
	"gfx",
	"mmhub",
	"athub",
	"pcie_bif",
	"hdp",
	"xgmi_wafl",
	"df",
	"smn",
	"sem",
	"mp0",
	"mp1",
	"fuse",
};

/* NOTE: ras_err_str() expects i to have at most one type bit set */
#define ras_err_str(i) (ras_error_string[ffs(i)])
#define ras_block_str(i) (ras_block_string[i])
119
/* placeholder for a RAS self test at init time; currently a no-op */
static void amdgpu_ras_self_test(struct amdgpu_device *adev)
{
	/* TODO */
}
124
/*
 * debugfs read handler for a per-block error-count node.
 *
 * Queries the block's current ue/ce counters and exposes them as
 * "ue: <n>\nce: <n>\n", honouring the caller's file offset so the
 * text can be read in pieces.
 *
 * NOTE(review): copy_to_user failure returns -EINVAL here; kernel
 * convention would be -EFAULT — confirm before changing userspace ABI.
 */
static ssize_t amdgpu_ras_debugfs_read(struct file *f, char __user *buf,
		size_t size, loff_t *pos)
{
	/* the ras_manager was stashed in i_private at node creation */
	struct ras_manager *obj = (struct ras_manager *)file_inode(f)->i_private;
	struct ras_query_if info = {
		.head = obj->head,
	};
	ssize_t s;
	char val[128];

	if (amdgpu_ras_error_query(obj->adev, &info))
		return -EINVAL;

	s = snprintf(val, sizeof(val), "%s: %lu\n%s: %lu\n",
			"ue", info.ue_count,
			"ce", info.ce_count);
	/* offset past the formatted text: EOF */
	if (*pos >= s)
		return 0;

	s -= *pos;
	s = min_t(u64, s, size);


	if (copy_to_user(buf, &val[*pos], s))
		return -EINVAL;

	*pos += s;

	return s;
}
155
/*
 * debugfs write handler for a per-block error-injection node.
 *
 * Accepts "[ue|ce] <address> <value>" where address/value are either
 * both hex ("0x..") or both decimal, and triggers an error injection
 * on this object's block.  Only the first 64 bytes of the write are
 * considered; partial (offset != 0) writes are rejected.
 */
static ssize_t amdgpu_ras_debugfs_write(struct file *f, const char __user *buf,
		size_t size, loff_t *pos)
{
	struct ras_manager *obj = (struct ras_manager *)file_inode(f)->i_private;
	struct ras_inject_if info = {
		.head = obj->head,
	};
	ssize_t s = min_t(u64, 64, size);
	char val[64];
	char *str = val;
	memset(val, 0, sizeof(val));

	if (*pos)
		return -EINVAL;

	if (copy_from_user(str, buf, s))
		return -EINVAL;

	/* only care ue/ce for now. */
	if (memcmp(str, "ue", 2) == 0) {
		info.head.type = AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE;
		str += 2;
	} else if (memcmp(str, "ce", 2) == 0) {
		info.head.type = AMDGPU_RAS_ERROR__SINGLE_CORRECTABLE;
		str += 2;
	}

	/* try hex first, then decimal; both fields are mandatory */
	if (sscanf(str, "0x%llx 0x%llx", &info.address, &info.value) != 2) {
		if (sscanf(str, "%llu %llu", &info.address, &info.value) != 2)
			return -EINVAL;
	}

	*pos = s;

	if (amdgpu_ras_error_inject(obj->adev, &info))
		return -EINVAL;

	/* report the whole write as consumed even if truncated to 64 bytes */
	return size;
}
195
/* fops for the per-block debugfs nodes: read counts, write injections */
static const struct file_operations amdgpu_ras_debugfs_ops = {
	.owner = THIS_MODULE,
	.read = amdgpu_ras_debugfs_read,
	.write = amdgpu_ras_debugfs_write,
	.llseek = default_llseek
};
202
96ebb307 203static int amdgpu_ras_find_block_id_by_name(const char *name, int *block_id)
204{
205 int i;
206
207 for (i = 0; i < ARRAY_SIZE(ras_block_string); i++) {
208 *block_id = i;
209 if (strcmp(name, ras_block_str(i)) == 0)
210 return 0;
211 }
212 return -EINVAL;
213}
214
215static int amdgpu_ras_debugfs_ctrl_parse_data(struct file *f,
216 const char __user *buf, size_t size,
217 loff_t *pos, struct ras_debug_if *data)
218{
219 ssize_t s = min_t(u64, 64, size);
220 char str[65];
221 char block_name[33];
222 char err[9] = "ue";
223 int op = -1;
224 int block_id;
225 u64 address, value;
226
227 if (*pos)
228 return -EINVAL;
229 *pos = size;
230
231 memset(str, 0, sizeof(str));
232 memset(data, 0, sizeof(*data));
233
234 if (copy_from_user(str, buf, s))
235 return -EINVAL;
236
237 if (sscanf(str, "disable %32s", block_name) == 1)
238 op = 0;
239 else if (sscanf(str, "enable %32s %8s", block_name, err) == 2)
240 op = 1;
241 else if (sscanf(str, "inject %32s %8s", block_name, err) == 2)
242 op = 2;
243 else if (sscanf(str, "%32s", block_name) == 1)
244 /* ascii string, but commands are not matched. */
245 return -EINVAL;
246
247 if (op != -1) {
248 if (amdgpu_ras_find_block_id_by_name(block_name, &block_id))
249 return -EINVAL;
250
251 data->head.block = block_id;
252 data->head.type = memcmp("ue", err, 2) == 0 ?
253 AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE :
254 AMDGPU_RAS_ERROR__SINGLE_CORRECTABLE;
255 data->op = op;
256
257 if (op == 2) {
258 if (sscanf(str, "%*s %*s %*s %llu %llu",
259 &address, &value) != 2)
260 if (sscanf(str, "%*s %*s %*s 0x%llx 0x%llx",
261 &address, &value) != 2)
262 return -EINVAL;
263 data->inject.address = address;
264 data->inject.value = value;
265 }
266 } else {
267 if (size < sizeof(data))
268 return -EINVAL;
269
270 if (copy_from_user(data, buf, sizeof(*data)))
271 return -EINVAL;
272 }
273
274 return 0;
275}
36ea1bd2 276/*
277 * DOC: ras debugfs control interface
278 *
279 * It accepts struct ras_debug_if who has two members.
280 *
281 * First member: ras_debug_if::head or ras_debug_if::inject.
96ebb307 282 *
283 * head is used to indicate which IP block will be under control.
36ea1bd2 284 *
285 * head has four members, they are block, type, sub_block_index, name.
286 * block: which IP will be under control.
287 * type: what kind of error will be enabled/disabled/injected.
288 * sub_block_index: some IPs have subcomponents, say, GFX, sDMA.
289 * name: the name of IP.
290 *
291 * inject has two more members than head, they are address, value.
292 * As their names indicate, inject operation will write the
293 * value to the address.
294 *
295 * Second member: struct ras_debug_if::op.
296 * It has three kinds of operations.
297 * 0: disable RAS on the block. Take ::head as its data.
298 * 1: enable RAS on the block. Take ::head as its data.
299 * 2: inject errors on the block. Take ::inject as its data.
300 *
96ebb307 301 * How to use the interface?
302 * programs:
303 * copy the struct ras_debug_if in your codes and initialize it.
304 * write the struct to the control node.
305 *
306 * bash:
307 * echo op block [error [address value]] > .../ras/ras_ctrl
308 * op: disable, enable, inject
309 * disable: only block is needed
310 * enable: block and error are needed
311 * inject: block, error, address and value are needed
312 * block: umc, sdma, gfx, .........
313 * see ras_block_string[] for details
314 * error: ue, ce
315 * ue: multi_uncorrectable
316 * ce: single_correctable
317 *
318 * here are some examples for bash commands,
319 * echo inject umc ue 0x0 0x0 > /sys/kernel/debug/dri/0/ras/ras_ctrl
320 * echo inject umc ce 0 0 > /sys/kernel/debug/dri/0/ras/ras_ctrl
321 * echo disable umc > /sys/kernel/debug/dri/0/ras/ras_ctrl
322 *
36ea1bd2 323 * How to check the result?
324 *
325 * For disable/enable, please check ras features at
326 * /sys/class/drm/card[0/1/2...]/device/ras/features
327 *
328 * For inject, please check corresponding err count at
329 * /sys/class/drm/card[0/1/2...]/device/ras/[gfx/sdma/...]_err_count
330 *
331 * NOTE: operation is only allowed on blocks which are supported.
332 * Please check ras mask at /sys/module/amdgpu/parameters/ras_mask
333 */
334static ssize_t amdgpu_ras_debugfs_ctrl_write(struct file *f, const char __user *buf,
335 size_t size, loff_t *pos)
336{
337 struct amdgpu_device *adev = (struct amdgpu_device *)file_inode(f)->i_private;
338 struct ras_debug_if data;
339 int ret = 0;
340
96ebb307 341 ret = amdgpu_ras_debugfs_ctrl_parse_data(f, buf, size, pos, &data);
342 if (ret)
36ea1bd2 343 return -EINVAL;
344
36ea1bd2 345 if (!amdgpu_ras_is_supported(adev, data.head.block))
346 return -EINVAL;
347
348 switch (data.op) {
349 case 0:
350 ret = amdgpu_ras_feature_enable(adev, &data.head, 0);
351 break;
352 case 1:
353 ret = amdgpu_ras_feature_enable(adev, &data.head, 1);
354 break;
355 case 2:
356 ret = amdgpu_ras_error_inject(adev, &data.inject);
357 break;
96ebb307 358 default:
359 ret = -EINVAL;
360 break;
36ea1bd2 361 };
362
363 if (ret)
364 return -EINVAL;
365
366 return size;
367}
368
/* fops for the write-only ras_ctrl debugfs node */
static const struct file_operations amdgpu_ras_debugfs_ctrl_ops = {
	.owner = THIS_MODULE,
	.read = NULL,
	.write = amdgpu_ras_debugfs_ctrl_write,
	.llseek = default_llseek
};
375
/*
 * sysfs show handler for a per-block error-count attribute; prints the
 * block's current ue/ce counters as "ue: <n>\nce: <n>\n".
 */
static ssize_t amdgpu_ras_sysfs_read(struct device *dev,
		struct device_attribute *attr, char *buf)
{
	/* the attribute is embedded in its ras_manager */
	struct ras_manager *obj = container_of(attr, struct ras_manager, sysfs_attr);
	struct ras_query_if info = {
		.head = obj->head,
	};

	if (amdgpu_ras_error_query(obj->adev, &info))
		return -EINVAL;

	return snprintf(buf, PAGE_SIZE, "%s: %lu\n%s: %lu\n",
			"ue", info.ue_count,
			"ce", info.ce_count);
}
391
/* obj begin */

/* take/inspect a reference on a ras_manager slot */
#define get_obj(obj) do { (obj)->use++; } while (0)
#define alive_obj(obj) ((obj)->use)

/*
 * Drop a reference; the object is unlinked from con->head once the
 * count reaches zero.  A negative count indicates unbalanced get/put
 * and is only reported, not repaired.
 */
static inline void put_obj(struct ras_manager *obj)
{
	if (obj && --obj->use == 0)
		list_del(&obj->node);
	if (obj && obj->use < 0) {
		DRM_ERROR("RAS ERROR: Unbalance obj(%s) use\n", obj->head.name);
	}
}
405
/* make one obj and return it. */
/*
 * Activate the pre-allocated per-block slot in con->objs[] for
 * head->block: copy the config in, link it into con->head and take the
 * first reference.  Returns NULL when there is no RAS context, the
 * block id is out of range, or the slot is already alive.
 */
static struct ras_manager *amdgpu_ras_create_obj(struct amdgpu_device *adev,
		struct ras_common_if *head)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	struct ras_manager *obj;

	if (!con)
		return NULL;

	if (head->block >= AMDGPU_RAS_BLOCK_COUNT)
		return NULL;

	obj = &con->objs[head->block];
	/* already exist. return obj? */
	if (alive_obj(obj))
		return NULL;

	obj->head = *head;
	obj->adev = adev;
	list_add(&obj->node, &con->head);
	get_obj(obj);

	return obj;
}
431
/* return an obj equal to head, or the first when head is NULL */
/*
 * Look up an alive ras_manager.  With a head, the slot for head->block
 * is returned iff it is alive; with head == NULL, the first alive slot
 * in block-id order is returned.  No reference is taken.
 */
static struct ras_manager *amdgpu_ras_find_obj(struct amdgpu_device *adev,
		struct ras_common_if *head)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	struct ras_manager *obj;
	int i;

	if (!con)
		return NULL;

	if (head) {
		if (head->block >= AMDGPU_RAS_BLOCK_COUNT)
			return NULL;

		obj = &con->objs[head->block];

		if (alive_obj(obj)) {
			WARN_ON(head->block != obj->head.block);
			return obj;
		}
	} else {
		for (i = 0; i < AMDGPU_RAS_BLOCK_COUNT; i++) {
			obj = &con->objs[i];
			if (alive_obj(obj)) {
				WARN_ON(i != obj->head.block);
				return obj;
			}
		}
	}

	return NULL;
}
/* obj end */
466
/* feature ctl begin */

/* a block may be controlled only when RAS is enabled module-wide and
 * the block's bit is set in the amdgpu_ras_mask module parameter
 */
static int amdgpu_ras_is_feature_allowed(struct amdgpu_device *adev,
		struct ras_common_if *head)
{
	return amdgpu_ras_enable && (amdgpu_ras_mask & BIT(head->block));
}

/* nonzero iff the block's feature bit is currently set in con->features */
static int amdgpu_ras_is_feature_enabled(struct amdgpu_device *adev,
		struct ras_common_if *head)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);

	return con->features & BIT(head->block);
}
481
/*
 * if obj is not created, then create one.
 * set feature enable flag.
 *
 * Software half of feature enable/disable: maintains the obj reference
 * count and the con->features bitmask without touching the PSP.  A
 * no-op when the block is not allowed or is already in the requested
 * state.  Returns 0, or -EINVAL if the obj cannot be created.
 */
static int __amdgpu_ras_feature_enable(struct amdgpu_device *adev,
		struct ras_common_if *head, int enable)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	struct ras_manager *obj = amdgpu_ras_find_obj(adev, head);

	if (!amdgpu_ras_is_feature_allowed(adev, head))
		return 0;
	/* already in the requested state? (XOR of normalized booleans) */
	if (!(!!enable ^ !!amdgpu_ras_is_feature_enabled(adev, head)))
		return 0;

	if (enable) {
		if (!obj) {
			obj = amdgpu_ras_create_obj(adev, head);
			if (!obj)
				return -EINVAL;
		} else {
			/* In case we create obj somewhere else */
			get_obj(obj);
		}
		con->features |= BIT(head->block);
	} else {
		if (obj && amdgpu_ras_is_feature_enabled(adev, head)) {
			con->features &= ~BIT(head->block);
			put_obj(obj);
		}
	}

	return 0;
}
516
/* wrapper of psp_ras_enable_features */
/*
 * Enable or disable a RAS feature on one block, in firmware first
 * (PSP TA) and then in the software state via
 * __amdgpu_ras_feature_enable().  Returns 0 on success or when the
 * block is already in the requested state; -EINVAL otherwise.
 */
int amdgpu_ras_feature_enable(struct amdgpu_device *adev,
		struct ras_common_if *head, bool enable)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	union ta_ras_cmd_input info;
	int ret;

	if (!con)
		return -EINVAL;

	if (!enable) {
		info.disable_features = (struct ta_ras_disable_features_input) {
			.block_id =  head->block,
			.error_type = head->type,
		};
	} else {
		info.enable_features = (struct ta_ras_enable_features_input) {
			.block_id =  head->block,
			.error_type = head->type,
		};
	}

	/* Do not enable if it is not allowed. */
	WARN_ON(enable && !amdgpu_ras_is_feature_allowed(adev, head));
	/* Are we already in the state we are going to set? */
	if (!(!!enable ^ !!amdgpu_ras_is_feature_enabled(adev, head)))
		return 0;

	ret = psp_ras_enable_features(&adev->psp, &info, enable);
	if (ret) {
		DRM_ERROR("RAS ERROR: %s %s feature failed ret %d\n",
				enable ? "enable":"disable",
				ras_block_str(head->block),
				ret);
		return -EINVAL;
	}

	/* setup the obj */
	__amdgpu_ras_feature_enable(adev, head, enable);

	return 0;
}
560
/*
 * Disable RAS on every block that currently has an alive obj.  With
 * bypass, only the software state is torn down (no PSP call); the loop
 * stops at the first failure.  Returns the remaining feature bitmask.
 */
static int amdgpu_ras_disable_all_features(struct amdgpu_device *adev,
		bool bypass)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	struct ras_manager *obj, *tmp;

	list_for_each_entry_safe(obj, tmp, &con->head, node) {
		/* bypass psp.
		 * aka just release the obj and corresponding flags
		 */
		if (bypass) {
			if (__amdgpu_ras_feature_enable(adev, &obj->head, 0))
				break;
		} else {
			if (amdgpu_ras_feature_enable(adev, &obj->head, 0))
				break;
		}
	};

	return con->features;
}
582
/*
 * Try to enable RAS (type multi_uncorrectable) on every known block.
 * With bypass, only the software obj is created (the vbios is assumed
 * to have enabled RAS already); the loop stops at the first failure.
 * Returns the resulting feature bitmask.
 */
static int amdgpu_ras_enable_all_features(struct amdgpu_device *adev,
		bool bypass)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	int ras_block_count = AMDGPU_RAS_BLOCK_COUNT;
	int i;

	for (i = 0; i < ras_block_count; i++) {
		struct ras_common_if head = {
			.block = i,
			.type = AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE,
			.sub_block_index = 0,
		};
		strcpy(head.name, ras_block_str(i));
		if (bypass) {
			/*
			 * bypass psp. vbios enable ras for us.
			 * so just create the obj
			 */
			if (__amdgpu_ras_feature_enable(adev, &head, 1))
				break;
		} else {
			if (amdgpu_ras_feature_enable(adev, &head, 1))
				break;
		}
	};

	return con->features;
}
/* feature ctl end */
613
/* query/inject/cure begin */

/*
 * Fill info->ue_count/ce_count with the block's software-tracked error
 * counters.  Returns -EINVAL when the block has no alive obj.
 */
int amdgpu_ras_error_query(struct amdgpu_device *adev,
		struct ras_query_if *info)
{
	struct ras_manager *obj = amdgpu_ras_find_obj(adev, &info->head);

	if (!obj)
		return -EINVAL;
	/* TODO might read the register to read the count */

	info->ue_count = obj->err_data.ue_count;
	info->ce_count = obj->err_data.ce_count;

	return 0;
}
629
/* wrapper of psp_ras_trigger_error */
/*
 * Ask the PSP TA to inject an error described by info into its block.
 * The block must have an alive obj (i.e. RAS enabled on it).  Returns
 * the PSP call's status, or -EINVAL when the obj is missing.
 */
int amdgpu_ras_error_inject(struct amdgpu_device *adev,
		struct ras_inject_if *info)
{
	struct ras_manager *obj = amdgpu_ras_find_obj(adev, &info->head);
	struct ta_ras_trigger_error_input block_info = {
		.block_id = info->head.block,
		.inject_error_type = info->head.type,
		.sub_block_index = info->head.sub_block_index,
		.address = info->address,
		.value = info->value,
	};
	int ret = 0;

	if (!obj)
		return -EINVAL;

	ret = psp_ras_trigger_error(&adev->psp, &block_info);
	if (ret)
		DRM_ERROR("RAS ERROR: inject %s error failed ret %d\n",
				ras_block_str(info->head.block),
				ret);

	return ret;
}
655
/* stub: always succeeds; see comment below */
int amdgpu_ras_error_cure(struct amdgpu_device *adev,
		struct ras_cure_if *info)
{
	/* psp fw has no cure interface for now. */
	return 0;
}
662
/* get the total error counts on all IPs */
/*
 * Sum the error counters over every alive block and return either the
 * correctable (is_ce) or uncorrectable total.  Returns -EINVAL when
 * there is no RAS context or any per-block query fails.
 */
int amdgpu_ras_query_error_count(struct amdgpu_device *adev,
		bool is_ce)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	struct ras_manager *obj;
	struct ras_err_data data = {0, 0};

	if (!con)
		return -EINVAL;

	list_for_each_entry(obj, &con->head, node) {
		struct ras_query_if info = {
			.head = obj->head,
		};

		if (amdgpu_ras_error_query(adev, &info))
			return -EINVAL;

		data.ce_count += info.ce_count;
		data.ue_count += info.ue_count;
	}

	return is_ce ? data.ce_count : data.ue_count;
}
/* query/inject/cure end */
689
690
691/* sysfs begin */
692
/*
 * sysfs show handler for ras/features: prints the feature bitmask
 * followed by one "<block>: <type or disabled>" line per block.
 */
static ssize_t amdgpu_ras_sysfs_features_read(struct device *dev,
		struct device_attribute *attr, char *buf)
{
	struct amdgpu_ras *con =
		container_of(attr, struct amdgpu_ras, features_attr);
	struct drm_device *ddev = dev_get_drvdata(dev);
	struct amdgpu_device *adev = ddev->dev_private;
	/* only .block is ever set; is_feature_enabled reads nothing else */
	struct ras_common_if head;
	int ras_block_count = AMDGPU_RAS_BLOCK_COUNT;
	int i;
	ssize_t s;
	struct ras_manager *obj;

	s = scnprintf(buf, PAGE_SIZE, "feature mask: 0x%x\n", con->features);

	for (i = 0; i < ras_block_count; i++) {
		head.block = i;

		if (amdgpu_ras_is_feature_enabled(adev, &head)) {
			obj = amdgpu_ras_find_obj(adev, &head);
			s += scnprintf(&buf[s], PAGE_SIZE - s,
					"%s: %s\n",
					ras_block_str(i),
					ras_err_str(obj->head.type));
		} else
			s += scnprintf(&buf[s], PAGE_SIZE - s,
					"%s: disabled\n",
					ras_block_str(i));
	}

	return s;
}
725
/*
 * Create the "ras" sysfs group on the device with the read-only
 * "features" attribute.  The attribute storage lives in the RAS
 * context; the group struct itself is only needed for registration.
 */
static int amdgpu_ras_sysfs_create_feature_node(struct amdgpu_device *adev)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	struct attribute *attrs[] = {
		&con->features_attr.attr,
		NULL
	};
	struct attribute_group group = {
		.name = "ras",
		.attrs = attrs,
	};

	con->features_attr = (struct device_attribute) {
		.attr = {
			.name = "features",
			.mode = S_IRUGO,
		},
			.show = amdgpu_ras_sysfs_features_read,
	};

	return sysfs_create_group(&adev->dev->kobj, &group);
}
748
/* tear down the "ras" sysfs group created above; always returns 0 */
static int amdgpu_ras_sysfs_remove_feature_node(struct amdgpu_device *adev)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	/* must mirror the group registered in create_feature_node */
	struct attribute *attrs[] = {
		&con->features_attr.attr,
		NULL
	};
	struct attribute_group group = {
		.name = "ras",
		.attrs = attrs,
	};

	sysfs_remove_group(&adev->dev->kobj, &group);

	return 0;
}
765
/*
 * Add a per-block error-count attribute (named head->sysfs_name) to the
 * "ras" sysfs group.  Takes a reference on the obj, released again on
 * failure or by amdgpu_ras_sysfs_remove().  Returns -EINVAL when the
 * obj is missing, already has an attribute, or registration fails.
 */
int amdgpu_ras_sysfs_create(struct amdgpu_device *adev,
		struct ras_fs_if *head)
{
	struct ras_manager *obj = amdgpu_ras_find_obj(adev, &head->head);

	if (!obj || obj->attr_inuse)
		return -EINVAL;

	get_obj(obj);

	/* the name must live in the obj: sysfs keeps pointing at it */
	memcpy(obj->fs_data.sysfs_name,
			head->sysfs_name,
			sizeof(obj->fs_data.sysfs_name));

	obj->sysfs_attr = (struct device_attribute){
		.attr = {
			.name = obj->fs_data.sysfs_name,
			.mode = S_IRUGO,
		},
			.show = amdgpu_ras_sysfs_read,
	};

	if (sysfs_add_file_to_group(&adev->dev->kobj,
				&obj->sysfs_attr.attr,
				"ras")) {
		put_obj(obj);
		return -EINVAL;
	}

	obj->attr_inuse = 1;

	return 0;
}
799
/*
 * Remove the per-block attribute added by amdgpu_ras_sysfs_create()
 * and drop the reference it held.  -EINVAL when there is nothing to
 * remove.
 */
int amdgpu_ras_sysfs_remove(struct amdgpu_device *adev,
		struct ras_common_if *head)
{
	struct ras_manager *obj = amdgpu_ras_find_obj(adev, head);

	if (!obj || !obj->attr_inuse)
		return -EINVAL;

	sysfs_remove_file_from_group(&adev->dev->kobj,
				&obj->sysfs_attr.attr,
				"ras");
	obj->attr_inuse = 0;
	put_obj(obj);

	return 0;
}
816
/* remove every per-block attribute, then the feature node; returns 0 */
static int amdgpu_ras_sysfs_remove_all(struct amdgpu_device *adev)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	struct ras_manager *obj, *tmp;

	/* _safe: amdgpu_ras_sysfs_remove may drop the obj from the list */
	list_for_each_entry_safe(obj, tmp, &con->head, node) {
		amdgpu_ras_sysfs_remove(adev, &obj->head);
	}

	amdgpu_ras_sysfs_remove_feature_node(adev);

	return 0;
}
830/* sysfs end */
831
832/* debugfs begin */
/*
 * Create the debugfs "ras" directory and its "ras_ctrl" control node
 * under the DRM primary minor.  con->dir/con->ent keep the dentries
 * for later removal.  -EINVAL on any debugfs failure (the directory is
 * cleaned up if the file creation fails).
 */
static int amdgpu_ras_debugfs_create_ctrl_node(struct amdgpu_device *adev)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	struct drm_minor *minor = adev->ddev->primary;
	struct dentry *root = minor->debugfs_root, *dir;
	struct dentry *ent;

	dir = debugfs_create_dir("ras", root);
	if (IS_ERR(dir))
		return -EINVAL;

	con->dir = dir;

	/* adev is stashed in i_private for the ctrl write handler */
	ent = debugfs_create_file("ras_ctrl",
			S_IWUGO | S_IRUGO, con->dir,
			adev, &amdgpu_ras_debugfs_ctrl_ops);
	if (IS_ERR(ent)) {
		debugfs_remove(con->dir);
		return -EINVAL;
	}

	con->ent = ent;
	return 0;
}
857
/*
 * Create a per-block debugfs node (named head->debugfs_name) under the
 * "ras" directory.  Takes a reference on the obj; the obj is stashed
 * in i_private for the read/write handlers.  -EINVAL when the obj is
 * missing or already has a node.
 *
 * NOTE(review): on debugfs_create_file() failure the get_obj()
 * reference is not released here — verify against the remove path.
 */
int amdgpu_ras_debugfs_create(struct amdgpu_device *adev,
		struct ras_fs_if *head)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	struct ras_manager *obj = amdgpu_ras_find_obj(adev, &head->head);
	struct dentry *ent;

	if (!obj || obj->ent)
		return -EINVAL;

	get_obj(obj);

	/* the name must live in the obj: debugfs keeps pointing at it */
	memcpy(obj->fs_data.debugfs_name,
			head->debugfs_name,
			sizeof(obj->fs_data.debugfs_name));

	ent = debugfs_create_file(obj->fs_data.debugfs_name,
			S_IWUGO | S_IRUGO, con->dir,
			obj, &amdgpu_ras_debugfs_ops);

	if (IS_ERR(ent))
		return -EINVAL;

	obj->ent = ent;

	return 0;
}
885
/*
 * Remove a per-block debugfs node and drop the reference held by
 * amdgpu_ras_debugfs_create().  Succeeds silently when there is
 * nothing to remove.
 */
int amdgpu_ras_debugfs_remove(struct amdgpu_device *adev,
		struct ras_common_if *head)
{
	struct ras_manager *obj = amdgpu_ras_find_obj(adev, head);

	if (!obj || !obj->ent)
		return 0;

	debugfs_remove(obj->ent);
	obj->ent = NULL;
	put_obj(obj);

	return 0;
}
900
/* remove all per-block nodes, the ctrl node, then the directory */
static int amdgpu_ras_debugfs_remove_all(struct amdgpu_device *adev)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	struct ras_manager *obj, *tmp;

	/* _safe: amdgpu_ras_debugfs_remove may drop the obj from the list */
	list_for_each_entry_safe(obj, tmp, &con->head, node) {
		amdgpu_ras_debugfs_remove(adev, &obj->head);
	}

	debugfs_remove(con->ent);
	debugfs_remove(con->dir);
	con->dir = NULL;
	con->ent = NULL;

	return 0;
}
/* debugfs end */
917/* debugfs end */
918
919/* ras fs */
920
/* create the top-level sysfs and debugfs entry points; per-block nodes
 * are added later by the IPs themselves
 */
static int amdgpu_ras_fs_init(struct amdgpu_device *adev)
{
	amdgpu_ras_sysfs_create_feature_node(adev);
	amdgpu_ras_debugfs_create_ctrl_node(adev);

	return 0;
}

/* tear down everything fs_init and the IPs registered */
static int amdgpu_ras_fs_fini(struct amdgpu_device *adev)
{
	amdgpu_ras_debugfs_remove_all(adev);
	amdgpu_ras_sysfs_remove_all(adev);
	return 0;
}
/* ras fs end */
935/* ras fs end */
936
937/* ih begin */
/* ih begin */
/*
 * Bottom-half consumer for one block's IH ring: drain entries from
 * rptr to wptr, hand each to the IP's callback, and bump the ue count
 * when the callback reports an uncorrectable error.
 */
static void amdgpu_ras_interrupt_handler(struct ras_manager *obj)
{
	struct ras_ih_data *data = &obj->ih_data;
	struct amdgpu_iv_entry entry;
	int ret;

	while (data->rptr != data->wptr) {
		rmb();
		memcpy(&entry, &data->ring[data->rptr],
				data->element_size);

		wmb();
		data->rptr = (data->aligned_element_size +
				data->rptr) % data->ring_size;

		/* Let IP handle its data, maybe we need get the output
		 * from the callback to update the error type/count, etc
		 */
		if (data->cb) {
			ret = data->cb(obj->adev, &entry);
			/* ue will trigger an interrupt, and in that case
			 * we need do a reset to recovery the whole system.
			 * But leave IP do that recovery, here we just dispatch
			 * the error.
			 */
			if (ret == AMDGPU_RAS_UE) {
				obj->err_data.ue_count++;
			}
			/* Might need get ce count by register, but not all IP
			 * saves ce count, some IP just use one bit or two bits
			 * to indicate ce happened.
			 */
		}
	}
}
973
/* work_struct adapter: recover the owning ras_manager and drain its ring */
static void amdgpu_ras_interrupt_process_handler(struct work_struct *work)
{
	struct ras_ih_data *data =
		container_of(work, struct ras_ih_data, ih_work);
	struct ras_manager *obj =
		container_of(data, struct ras_manager, ih_data);

	amdgpu_ras_interrupt_handler(obj);
}
983
984int amdgpu_ras_interrupt_dispatch(struct amdgpu_device *adev,
985 struct ras_dispatch_if *info)
986{
987 struct ras_manager *obj = amdgpu_ras_find_obj(adev, &info->head);
988 struct ras_ih_data *data = &obj->ih_data;
989
990 if (!obj)
991 return -EINVAL;
992
993 if (data->inuse == 0)
994 return 0;
995
996 /* Might be overflow... */
997 memcpy(&data->ring[data->wptr], info->entry,
998 data->element_size);
999
1000 wmb();
1001 data->wptr = (data->aligned_element_size +
1002 data->wptr) % data->ring_size;
1003
1004 schedule_work(&data->ih_work);
1005
1006 return 0;
1007}
1008
/*
 * Unregister a block's IH: flush pending bottom-half work, free the
 * ring, clear the ih_data and drop the reference taken by
 * amdgpu_ras_interrupt_add_handler().  A no-op when nothing was
 * registered.
 */
int amdgpu_ras_interrupt_remove_handler(struct amdgpu_device *adev,
		struct ras_ih_if *info)
{
	struct ras_manager *obj = amdgpu_ras_find_obj(adev, &info->head);
	struct ras_ih_data *data;

	if (!obj)
		return -EINVAL;

	data = &obj->ih_data;
	if (data->inuse == 0)
		return 0;

	cancel_work_sync(&data->ih_work);

	kfree(data->ring);
	memset(data, 0, sizeof(*data));
	put_obj(obj);

	return 0;
}
1030
/*
 * Register a block's IH callback: set up the bottom-half work item and
 * a 64-entry ring of IV entries, then mark the ih_data in use.  Takes
 * (or creates) a reference on the obj; released on allocation failure.
 */
int amdgpu_ras_interrupt_add_handler(struct amdgpu_device *adev,
		struct ras_ih_if *info)
{
	struct ras_manager *obj = amdgpu_ras_find_obj(adev, &info->head);
	struct ras_ih_data *data;

	if (!obj) {
		/* in case we register the IH before enabling the ras feature */
		obj = amdgpu_ras_create_obj(adev, &info->head);
		if (!obj)
			return -EINVAL;
	} else
		get_obj(obj);

	data = &obj->ih_data;
	/* add the callback.etc */
	*data = (struct ras_ih_data) {
		.inuse = 0,
		.cb = info->cb,
		.element_size = sizeof(struct amdgpu_iv_entry),
		.rptr = 0,
		.wptr = 0,
	};

	INIT_WORK(&data->ih_work, amdgpu_ras_interrupt_process_handler);

	data->aligned_element_size = ALIGN(data->element_size, 8);
	/* the ring can store 64 iv entries. */
	data->ring_size = 64 * data->aligned_element_size;
	data->ring = kmalloc(data->ring_size, GFP_KERNEL);
	if (!data->ring) {
		put_obj(obj);
		return -ENOMEM;
	}

	/* IH is ready */
	data->inuse = 1;

	return 0;
}
1071
/* unregister the IH of every alive block; used at teardown */
static int amdgpu_ras_interrupt_remove_all(struct amdgpu_device *adev)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	struct ras_manager *obj, *tmp;

	/* _safe: remove_handler's put_obj may unlink the obj */
	list_for_each_entry_safe(obj, tmp, &con->head, node) {
		struct ras_ih_if info = {
			.head = obj->head,
		};
		amdgpu_ras_interrupt_remove_handler(adev, &info);
	}

	return 0;
}
1086/* ih end */
1087
1088/* recovery begin */
/* recovery begin */
/* deferred GPU reset; clears in_recovery once the reset completes */
static void amdgpu_ras_do_recovery(struct work_struct *work)
{
	struct amdgpu_ras *ras =
		container_of(work, struct amdgpu_ras, recovery_work);

	amdgpu_device_gpu_recover(ras->adev, 0);
	atomic_set(&ras->in_recovery, 0);
}
1097
/* drop the BO pinned over a bad page; *bo_ptr is set to NULL */
static int amdgpu_ras_release_vram(struct amdgpu_device *adev,
		struct amdgpu_bo **bo_ptr)
{
	/* no need to free it actually. */
	amdgpu_bo_free_kernel(bo_ptr, NULL, NULL);
	return 0;
}
1105
/* reserve vram with size@offset */
/*
 * Pin a kernel BO over the exact VRAM range [offset, offset+size) so
 * nothing else can be placed on a bad page.  The BO is created without
 * CPU access, its placement is constrained to the target page range,
 * then it is pinned there.  On success *bo_ptr (if provided) holds the
 * pinned BO; on failure the BO is unreferenced and *bo_ptr stays NULL.
 */
static int amdgpu_ras_reserve_vram(struct amdgpu_device *adev,
		uint64_t offset, uint64_t size,
		struct amdgpu_bo **bo_ptr)
{
	struct ttm_operation_ctx ctx = { false, false };
	struct amdgpu_bo_param bp;
	int r = 0;
	int i;
	struct amdgpu_bo *bo;

	if (bo_ptr)
		*bo_ptr = NULL;
	memset(&bp, 0, sizeof(bp));
	bp.size = size;
	bp.byte_align = PAGE_SIZE;
	bp.domain = AMDGPU_GEM_DOMAIN_VRAM;
	bp.flags = AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS |
		AMDGPU_GEM_CREATE_NO_CPU_ACCESS;
	bp.type = ttm_bo_type_kernel;
	bp.resv = NULL;

	r = amdgpu_bo_create(adev, &bp, &bo);
	if (r)
		return -EINVAL;

	r = amdgpu_bo_reserve(bo, false);
	if (r)
		goto error_reserve;

	/* constrain every placement to exactly the requested page range */
	offset = ALIGN(offset, PAGE_SIZE);
	for (i = 0; i < bo->placement.num_placement; ++i) {
		bo->placements[i].fpfn = offset >> PAGE_SHIFT;
		bo->placements[i].lpfn = (offset + size) >> PAGE_SHIFT;
	}

	/* drop the initial placement and re-place within the new limits */
	ttm_bo_mem_put(&bo->tbo, &bo->tbo.mem);
	r = ttm_bo_mem_space(&bo->tbo, &bo->placement, &bo->tbo.mem, &ctx);
	if (r)
		goto error_pin;

	r = amdgpu_bo_pin_restricted(bo,
			AMDGPU_GEM_DOMAIN_VRAM,
			offset,
			offset + size);
	if (r)
		goto error_pin;

	if (bo_ptr)
		*bo_ptr = bo;

	amdgpu_bo_unreserve(bo);
	return r;

error_pin:
	amdgpu_bo_unreserve(bo);
error_reserve:
	amdgpu_bo_unref(&bo);
	return r;
}
1166
1167/* alloc/realloc bps array */
1168static int amdgpu_ras_realloc_eh_data_space(struct amdgpu_device *adev,
1169 struct ras_err_handler_data *data, int pages)
1170{
1171 unsigned int old_space = data->count + data->space_left;
1172 unsigned int new_space = old_space + pages;
1173 unsigned int align_space = ALIGN(new_space, 1024);
1174 void *tmp = kmalloc(align_space * sizeof(*data->bps), GFP_KERNEL);
1175
1176 if (!tmp)
1177 return -ENOMEM;
1178
1179 if (data->bps) {
1180 memcpy(tmp, data->bps,
1181 data->count * sizeof(*data->bps));
1182 kfree(data->bps);
1183 }
1184
1185 data->bps = tmp;
1186 data->space_left += align_space - old_space;
1187 return 0;
1188}
1189
/* it deal with vram only. */
/*
 * Record `pages` bad page frame numbers into the error-handler array
 * (growing it if needed) under recovery_lock.  The pages are recorded
 * in reverse input order.  Returns 0 on success or for empty/absent
 * input, -ENOMEM if the array cannot grow.
 */
int amdgpu_ras_add_bad_pages(struct amdgpu_device *adev,
		unsigned long *bps, int pages)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	struct ras_err_handler_data *data = con->eh_data;
	int i = pages;
	int ret = 0;

	if (!con || !data || !bps || pages <= 0)
		return 0;

	mutex_lock(&con->recovery_lock);
	/* re-checked under the lock (also checked above before locking) */
	if (!data)
		goto out;

	if (data->space_left <= pages)
		if (amdgpu_ras_realloc_eh_data_space(adev, data, pages)) {
			ret = -ENOMEM;
			goto out;
		}

	while (i--)
		data->bps[data->count++].bp = bps[i];

	data->space_left -= pages;
out:
	mutex_unlock(&con->recovery_lock);

	return ret;
}
1221
/* called in gpu recovery/init */
/*
 * Walk the not-yet-reserved tail of the bad-page array and pin a page
 * sized BO over each bad page.  On a reservation failure the slot's bo
 * is NULL (amdgpu_ras_reserve_vram clears *bo_ptr first) and the page
 * is still marked reserved, so it will not be retried.
 */
int amdgpu_ras_reserve_bad_pages(struct amdgpu_device *adev)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	struct ras_err_handler_data *data = con->eh_data;
	uint64_t bp;
	struct amdgpu_bo *bo;
	int i;

	if (!con || !data)
		return 0;

	mutex_lock(&con->recovery_lock);
	/* reserve vram at driver post stage. */
	for (i = data->last_reserved; i < data->count; i++) {
		bp = data->bps[i].bp;

		if (amdgpu_ras_reserve_vram(adev, bp << PAGE_SHIFT,
					PAGE_SIZE, &bo))
			DRM_ERROR("RAS ERROR: reserve vram %llx fail\n", bp);

		data->bps[i].bo = bo;
		data->last_reserved = i + 1;
	}
	mutex_unlock(&con->recovery_lock);
	return 0;
}
1249
1250/* called when driver unload */
1251static int amdgpu_ras_release_bad_pages(struct amdgpu_device *adev)
1252{
1253 struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
1254 struct ras_err_handler_data *data = con->eh_data;
1255 struct amdgpu_bo *bo;
1256 int i;
1257
1258 if (!con || !data)
1259 return 0;
1260
1261 mutex_lock(&con->recovery_lock);
1262 for (i = data->last_reserved - 1; i >= 0; i--) {
1263 bo = data->bps[i].bo;
1264
1265 amdgpu_ras_release_vram(adev, &bo);
1266
1267 data->bps[i].bo = bo;
1268 data->last_reserved = i;
1269 }
1270 mutex_unlock(&con->recovery_lock);
1271 return 0;
1272}
1273
/* stub: persisting the bad page array is not implemented yet */
static int amdgpu_ras_save_bad_pages(struct amdgpu_device *adev)
{
	/* TODO
	 * write the array to eeprom when SMU disabled.
	 */
	return 0;
}
1281
/* stub: restoring the bad page array is not implemented yet */
static int amdgpu_ras_load_bad_pages(struct amdgpu_device *adev)
{
	/* TODO
	 * read the array from eeprom when SMU disabled.
	 */
	return 0;
}
1289
1290static int amdgpu_ras_recovery_init(struct amdgpu_device *adev)
1291{
1292 struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
1293 struct ras_err_handler_data **data = &con->eh_data;
1294
1295 *data = kmalloc(sizeof(**data),
1296 GFP_KERNEL|__GFP_ZERO);
1297 if (!*data)
1298 return -ENOMEM;
1299
1300 mutex_init(&con->recovery_lock);
1301 INIT_WORK(&con->recovery_work, amdgpu_ras_do_recovery);
1302 atomic_set(&con->in_recovery, 0);
1303 con->adev = adev;
1304
1305 amdgpu_ras_load_bad_pages(adev);
1306 amdgpu_ras_reserve_bad_pages(adev);
1307
1308 return 0;
1309}
1310
1311static int amdgpu_ras_recovery_fini(struct amdgpu_device *adev)
1312{
1313 struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
1314 struct ras_err_handler_data *data = con->eh_data;
1315
1316 cancel_work_sync(&con->recovery_work);
1317 amdgpu_ras_save_bad_pages(adev);
1318 amdgpu_ras_release_bad_pages(adev);
1319
1320 mutex_lock(&con->recovery_lock);
1321 con->eh_data = NULL;
1322 kfree(data->bps);
1323 kfree(data);
1324 mutex_unlock(&con->recovery_lock);
1325
1326 return 0;
1327}
1328/* recovery end */
1329
/* maps one PCI device-ID/revision-ID pair to the RAS blocks it supports */
struct ras_DID_capability {
	u16 did;	/* PCI device ID */
	u8 rid;		/* PCI revision ID */
	u32 capability;	/* bitmask of supported RAS blocks */
};

/* static allow-list of RAS-capable devices; every entry currently
 * exposes the full block mask (see the vbios TODO in
 * amdgpu_ras_check_supported)
 */
static const struct ras_DID_capability supported_DID_array[] = {
	{0x66a0, 0x00, AMDGPU_RAS_BLOCK_MASK},
	{0x66a0, 0x02, AMDGPU_RAS_BLOCK_MASK},
	{0x66a1, 0x00, AMDGPU_RAS_BLOCK_MASK},
	{0x66a1, 0x01, AMDGPU_RAS_BLOCK_MASK},
	{0x66a1, 0x04, AMDGPU_RAS_BLOCK_MASK},
	{0x66a3, 0x00, AMDGPU_RAS_BLOCK_MASK},
	{0x66a7, 0x00, AMDGPU_RAS_BLOCK_MASK},
};
1345
1346static uint32_t amdgpu_ras_check_supported(struct amdgpu_device *adev)
1347{
1348 /* TODO need check vbios table */
1349 int i;
1350 int did = adev->pdev->device;
1351 int rid = adev->pdev->revision;
1352
1353 for (i = 0; i < ARRAY_SIZE(supported_DID_array); i++) {
1354 if (did == supported_DID_array[i].did &&
1355 rid == supported_DID_array[i].rid) {
1356 return supported_DID_array[i].capability;
1357 }
1358 }
1359 return 0;
1360}
1361
1362int amdgpu_ras_init(struct amdgpu_device *adev)
1363{
1364 struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
1365 uint32_t supported = amdgpu_ras_check_supported(adev);
1366
1367 if (con || supported == 0)
1368 return 0;
1369
1370 con = kmalloc(sizeof(struct amdgpu_ras) +
1371 sizeof(struct ras_manager) * AMDGPU_RAS_BLOCK_COUNT,
1372 GFP_KERNEL|__GFP_ZERO);
1373 if (!con)
1374 return -ENOMEM;
1375
1376 con->objs = (struct ras_manager *)(con + 1);
1377
1378 amdgpu_ras_set_context(adev, con);
1379
1380 con->supported = supported;
1381 con->features = 0;
1382 INIT_LIST_HEAD(&con->head);
1383
1384 if (amdgpu_ras_recovery_init(adev))
1385 goto recovery_out;
1386
1387 amdgpu_ras_mask &= AMDGPU_RAS_BLOCK_MASK;
1388
1389 amdgpu_ras_enable_all_features(adev, 1);
1390
1391 if (amdgpu_ras_fs_init(adev))
1392 goto fs_out;
1393
1394 amdgpu_ras_self_test(adev);
1395 return 0;
1396fs_out:
1397 amdgpu_ras_recovery_fini(adev);
1398recovery_out:
1399 amdgpu_ras_set_context(adev, NULL);
1400 kfree(con);
1401
1402 return -EINVAL;
1403}
1404
/* do some fini work before IP fini as dependence */
int amdgpu_ras_pre_fini(struct amdgpu_device *adev)
{
	struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);

	if (ras) {
		/* Need disable ras on all IPs here before ip [hw/sw]fini */
		amdgpu_ras_disable_all_features(adev, 0);
		amdgpu_ras_recovery_fini(adev);
	}

	return 0;
}
1418
1419int amdgpu_ras_fini(struct amdgpu_device *adev)
1420{
1421 struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
1422
1423 if (!con)
1424 return 0;
1425
1426 amdgpu_ras_fs_fini(adev);
1427 amdgpu_ras_interrupt_remove_all(adev);
1428
1429 WARN(con->features, "Feature mask is not cleared");
1430
1431 if (con->features)
1432 amdgpu_ras_disable_all_features(adev, 1);
1433
1434 amdgpu_ras_set_context(adev, NULL);
1435 kfree(con);
1436
1437 return 0;
1438}