drm/amdgpu: Fix ras debugfs data parse
drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
c030f2e4 1/*
2 * Copyright 2018 Advanced Micro Devices, Inc.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice shall be included in
12 * all copies or substantial portions of the Software.
13 *
14 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
17 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
18 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
19 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
20 * OTHER DEALINGS IN THE SOFTWARE.
21 *
22 *
23 */
24#include <linux/debugfs.h>
25#include <linux/list.h>
26#include <linux/module.h>
27#include "amdgpu.h"
28#include "amdgpu_ras.h"
b404ae82 29#include "amdgpu_atomfirmware.h"
c030f2e4 30
31struct ras_ih_data {
32 /* interrupt bottom half */
33 struct work_struct ih_work;
34 int inuse;
35 /* IP callback */
36 ras_ih_cb cb;
 37 /* ring buffer that holds the entries */
38 unsigned char *ring;
39 unsigned int ring_size;
40 unsigned int element_size;
41 unsigned int aligned_element_size;
42 unsigned int rptr;
43 unsigned int wptr;
44};
45
46struct ras_fs_data {
47 char sysfs_name[32];
48 char debugfs_name[32];
49};
50
51struct ras_err_data {
52 unsigned long ue_count;
53 unsigned long ce_count;
54};
55
56struct ras_err_handler_data {
 57 /* points to the bad pages array */
58 struct {
59 unsigned long bp;
60 struct amdgpu_bo *bo;
61 } *bps;
62 /* the count of entries */
63 int count;
 64 /* the space left for new entries */
65 int space_left;
66 /* last reserved entry's index + 1 */
67 int last_reserved;
68};
69
70struct ras_manager {
71 struct ras_common_if head;
72 /* reference count */
73 int use;
74 /* ras block link */
75 struct list_head node;
76 /* the device */
77 struct amdgpu_device *adev;
78 /* debugfs */
79 struct dentry *ent;
80 /* sysfs */
81 struct device_attribute sysfs_attr;
82 int attr_inuse;
83
84 /* fs node name */
85 struct ras_fs_data fs_data;
86
87 /* IH data */
88 struct ras_ih_data ih_data;
89
90 struct ras_err_data err_data;
91};
92
93const char *ras_error_string[] = {
94 "none",
95 "parity",
96 "single_correctable",
97 "multi_uncorrectable",
98 "poison",
99};
100
101const char *ras_block_string[] = {
102 "umc",
103 "sdma",
104 "gfx",
105 "mmhub",
106 "athub",
107 "pcie_bif",
108 "hdp",
109 "xgmi_wafl",
110 "df",
111 "smn",
112 "sem",
113 "mp0",
114 "mp1",
115 "fuse",
116};
117
118#define ras_err_str(i) (ras_error_string[ffs(i)])
119#define ras_block_str(i) (ras_block_string[i])
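/*
 * Illustrative example, assuming the error types in amdgpu_ras.h are
 * single-bit flags (e.g. AMDGPU_RAS_ERROR__SINGLE_CORRECTABLE == BIT(1)):
 * ffs() returns the 1-based position of the lowest set bit, so
 * ras_err_str(BIT(1)) resolves to ras_error_string[2], "single_correctable",
 * and ras_err_str(0) resolves to ras_error_string[0], "none".
 */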
120
121static void amdgpu_ras_self_test(struct amdgpu_device *adev)
122{
123 /* TODO */
124}
125
126static ssize_t amdgpu_ras_debugfs_read(struct file *f, char __user *buf,
127 size_t size, loff_t *pos)
128{
129 struct ras_manager *obj = (struct ras_manager *)file_inode(f)->i_private;
130 struct ras_query_if info = {
131 .head = obj->head,
132 };
133 ssize_t s;
134 char val[128];
135
136 if (amdgpu_ras_error_query(obj->adev, &info))
137 return -EINVAL;
138
139 s = snprintf(val, sizeof(val), "%s: %lu\n%s: %lu\n",
140 "ue", info.ue_count,
141 "ce", info.ce_count);
142 if (*pos >= s)
143 return 0;
144
145 s -= *pos;
146 s = min_t(u64, s, size);
147
148
149 if (copy_to_user(buf, &val[*pos], s))
150 return -EINVAL;
151
152 *pos += s;
153
154 return s;
155}
156
157static ssize_t amdgpu_ras_debugfs_write(struct file *f, const char __user *buf,
158 size_t size, loff_t *pos)
159{
160 struct ras_manager *obj = (struct ras_manager *)file_inode(f)->i_private;
161 struct ras_inject_if info = {
162 .head = obj->head,
163 };
 164	char val[64];
 165	char *str = val;
 166	ssize_t s = min_t(u64, sizeof(val) - 1, size);
 167	memset(val, 0, sizeof(val));
168
169 if (*pos)
170 return -EINVAL;
171
172 if (copy_from_user(str, buf, s))
173 return -EINVAL;
174
 175	/* only care about ue/ce for now. */
176 if (memcmp(str, "ue", 2) == 0) {
177 info.head.type = AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE;
178 str += 2;
179 } else if (memcmp(str, "ce", 2) == 0) {
180 info.head.type = AMDGPU_RAS_ERROR__SINGLE_CORRECTABLE;
181 str += 2;
182 }
183
184 if (sscanf(str, "0x%llx 0x%llx", &info.address, &info.value) != 2) {
185 if (sscanf(str, "%llu %llu", &info.address, &info.value) != 2)
186 return -EINVAL;
187 }
188
189 *pos = s;
190
191 if (amdgpu_ras_error_inject(obj->adev, &info))
192 return -EINVAL;
193
194 return size;
195}
196
197static const struct file_operations amdgpu_ras_debugfs_ops = {
198 .owner = THIS_MODULE,
199 .read = amdgpu_ras_debugfs_read,
200 .write = amdgpu_ras_debugfs_write,
201 .llseek = default_llseek
202};
203
96ebb307 204static int amdgpu_ras_find_block_id_by_name(const char *name, int *block_id)
205{
206 int i;
207
208 for (i = 0; i < ARRAY_SIZE(ras_block_string); i++) {
209 *block_id = i;
210 if (strcmp(name, ras_block_str(i)) == 0)
211 return 0;
212 }
213 return -EINVAL;
214}
215
216static int amdgpu_ras_debugfs_ctrl_parse_data(struct file *f,
217 const char __user *buf, size_t size,
218 loff_t *pos, struct ras_debug_if *data)
219{
220 ssize_t s = min_t(u64, 64, size);
221 char str[65];
222 char block_name[33];
223 char err[9] = "ue";
224 int op = -1;
225 int block_id;
226 u64 address, value;
227
228 if (*pos)
229 return -EINVAL;
230 *pos = size;
231
232 memset(str, 0, sizeof(str));
233 memset(data, 0, sizeof(*data));
234
235 if (copy_from_user(str, buf, s))
236 return -EINVAL;
237
238 if (sscanf(str, "disable %32s", block_name) == 1)
239 op = 0;
240 else if (sscanf(str, "enable %32s %8s", block_name, err) == 2)
241 op = 1;
242 else if (sscanf(str, "inject %32s %8s", block_name, err) == 2)
243 op = 2;
b076296b 244 else if (str[0] && str[1] && str[2] && str[3])
96ebb307 245 /* ascii string, but commands are not matched. */
246 return -EINVAL;
247
248 if (op != -1) {
249 if (amdgpu_ras_find_block_id_by_name(block_name, &block_id))
250 return -EINVAL;
251
252 data->head.block = block_id;
253 data->head.type = memcmp("ue", err, 2) == 0 ?
254 AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE :
255 AMDGPU_RAS_ERROR__SINGLE_CORRECTABLE;
256 data->op = op;
257
258 if (op == 2) {
259 if (sscanf(str, "%*s %*s %*s %llu %llu",
260 &address, &value) != 2)
261 if (sscanf(str, "%*s %*s %*s 0x%llx 0x%llx",
262 &address, &value) != 2)
263 return -EINVAL;
264 data->inject.address = address;
265 data->inject.value = value;
266 }
267 } else {
 268		if (size < sizeof(*data))
269 return -EINVAL;
270
271 if (copy_from_user(data, buf, sizeof(*data)))
272 return -EINVAL;
273 }
274
275 return 0;
276}
36ea1bd2 277/*
278 * DOC: ras debugfs control interface
279 *
 280 * It accepts struct ras_debug_if, which has two members.
281 *
282 * First member: ras_debug_if::head or ras_debug_if::inject.
96ebb307 283 *
284 * head is used to indicate which IP block will be under control.
36ea1bd2 285 *
286 * head has four members, they are block, type, sub_block_index, name.
287 * block: which IP will be under control.
288 * type: what kind of error will be enabled/disabled/injected.
 289 * sub_block_index: some IPs have subcomponents, say, GFX, SDMA.
290 * name: the name of IP.
291 *
 292 * inject has two more members than head: address and value.
 293 * As their names indicate, the inject operation writes the
 294 * value to the address.
295 *
296 * Second member: struct ras_debug_if::op.
297 * It has three kinds of operations.
298 * 0: disable RAS on the block. Take ::head as its data.
299 * 1: enable RAS on the block. Take ::head as its data.
300 * 2: inject errors on the block. Take ::inject as its data.
301 *
96ebb307 302 * How to use the interface?
 303 * programs:
 304 * copy struct ras_debug_if into your code and initialize it,
 305 * then write the struct to the control node (a sketch follows this comment block).
306 *
307 * bash:
308 * echo op block [error [address value]] > .../ras/ras_ctrl
309 * op: disable, enable, inject
310 * disable: only block is needed
311 * enable: block and error are needed
312 * inject: error, address, value are needed
 313 * block: umc, sdma, gfx, ...
314 * see ras_block_string[] for details
315 * error: ue, ce
316 * ue: multi_uncorrectable
317 * ce: single_correctable
318 *
 319 * here are some examples of bash commands:
320 * echo inject umc ue 0x0 0x0 > /sys/kernel/debug/dri/0/ras/ras_ctrl
321 * echo inject umc ce 0 0 > /sys/kernel/debug/dri/0/ras/ras_ctrl
322 * echo disable umc > /sys/kernel/debug/dri/0/ras/ras_ctrl
323 *
36ea1bd2 324 * How to check the result?
325 *
326 * For disable/enable, please check ras features at
327 * /sys/class/drm/card[0/1/2...]/device/ras/features
328 *
329 * For inject, please check corresponding err count at
330 * /sys/class/drm/card[0/1/2...]/device/ras/[gfx/sdma/...]_err_count
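 *
 * For example, on card0 (assuming the umc block registered an error-count
 * node following the pattern above):
 * cat /sys/class/drm/card0/device/ras/features
 * cat /sys/class/drm/card0/device/ras/umc_err_count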
331 *
332 * NOTE: operation is only allowed on blocks which are supported.
333 * Please check ras mask at /sys/module/amdgpu/parameters/ras_mask
334 */
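/*
 * A minimal userspace sketch of the "programs" path documented above. It is
 * illustrative only, is not built with the driver, and assumes the program
 * copies the kernel's struct ras_debug_if definition verbatim; block 0 is
 * taken to be umc (see ras_block_string[]) and op 1 means enable.
 *
 *	struct ras_debug_if data;
 *	int fd;
 *
 *	memset(&data, 0, sizeof(data));
 *	data.head.block = 0;
 *	data.head.type = AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE;
 *	data.op = 1;
 *
 *	fd = open("/sys/kernel/debug/dri/0/ras/ras_ctrl", O_WRONLY);
 *	write(fd, &data, sizeof(data));
 *	close(fd);
 */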
335static ssize_t amdgpu_ras_debugfs_ctrl_write(struct file *f, const char __user *buf,
336 size_t size, loff_t *pos)
337{
338 struct amdgpu_device *adev = (struct amdgpu_device *)file_inode(f)->i_private;
339 struct ras_debug_if data;
340 int ret = 0;
341
96ebb307 342 ret = amdgpu_ras_debugfs_ctrl_parse_data(f, buf, size, pos, &data);
343 if (ret)
36ea1bd2 344 return -EINVAL;
345
36ea1bd2 346 if (!amdgpu_ras_is_supported(adev, data.head.block))
347 return -EINVAL;
348
349 switch (data.op) {
350 case 0:
351 ret = amdgpu_ras_feature_enable(adev, &data.head, 0);
352 break;
353 case 1:
354 ret = amdgpu_ras_feature_enable(adev, &data.head, 1);
355 break;
356 case 2:
357 ret = amdgpu_ras_error_inject(adev, &data.inject);
358 break;
96ebb307 359 default:
360 ret = -EINVAL;
361 break;
36ea1bd2 362 };
363
364 if (ret)
365 return -EINVAL;
366
367 return size;
368}
369
370static const struct file_operations amdgpu_ras_debugfs_ctrl_ops = {
371 .owner = THIS_MODULE,
372 .read = NULL,
373 .write = amdgpu_ras_debugfs_ctrl_write,
374 .llseek = default_llseek
375};
376
c030f2e4 377static ssize_t amdgpu_ras_sysfs_read(struct device *dev,
378 struct device_attribute *attr, char *buf)
379{
380 struct ras_manager *obj = container_of(attr, struct ras_manager, sysfs_attr);
381 struct ras_query_if info = {
382 .head = obj->head,
383 };
384
385 if (amdgpu_ras_error_query(obj->adev, &info))
386 return -EINVAL;
387
388 return snprintf(buf, PAGE_SIZE, "%s: %lu\n%s: %lu\n",
389 "ue", info.ue_count,
390 "ce", info.ce_count);
391}
392
393/* obj begin */
394
395#define get_obj(obj) do { (obj)->use++; } while (0)
396#define alive_obj(obj) ((obj)->use)
397
398static inline void put_obj(struct ras_manager *obj)
399{
400 if (obj && --obj->use == 0)
401 list_del(&obj->node);
402 if (obj && obj->use < 0) {
403 DRM_ERROR("RAS ERROR: Unbalance obj(%s) use\n", obj->head.name);
404 }
405}
406
407/* make one obj and return it. */
408static struct ras_manager *amdgpu_ras_create_obj(struct amdgpu_device *adev,
409 struct ras_common_if *head)
410{
411 struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
412 struct ras_manager *obj;
413
414 if (!con)
415 return NULL;
416
417 if (head->block >= AMDGPU_RAS_BLOCK_COUNT)
418 return NULL;
419
420 obj = &con->objs[head->block];
 421	/* already exists. return obj? */
422 if (alive_obj(obj))
423 return NULL;
424
425 obj->head = *head;
426 obj->adev = adev;
427 list_add(&obj->node, &con->head);
428 get_obj(obj);
429
430 return obj;
431}
432
433/* return an obj equal to head, or the first when head is NULL */
434static struct ras_manager *amdgpu_ras_find_obj(struct amdgpu_device *adev,
435 struct ras_common_if *head)
436{
437 struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
438 struct ras_manager *obj;
439 int i;
440
441 if (!con)
442 return NULL;
443
444 if (head) {
445 if (head->block >= AMDGPU_RAS_BLOCK_COUNT)
446 return NULL;
447
448 obj = &con->objs[head->block];
449
450 if (alive_obj(obj)) {
451 WARN_ON(head->block != obj->head.block);
452 return obj;
453 }
454 } else {
455 for (i = 0; i < AMDGPU_RAS_BLOCK_COUNT; i++) {
456 obj = &con->objs[i];
457 if (alive_obj(obj)) {
458 WARN_ON(i != obj->head.block);
459 return obj;
460 }
461 }
462 }
463
464 return NULL;
465}
466/* obj end */
467
468/* feature ctl begin */
469static int amdgpu_ras_is_feature_allowed(struct amdgpu_device *adev,
470 struct ras_common_if *head)
471{
5caf466a 472 struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
473
474 return con->hw_supported & BIT(head->block);
c030f2e4 475}
476
477static int amdgpu_ras_is_feature_enabled(struct amdgpu_device *adev,
478 struct ras_common_if *head)
479{
480 struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
481
482 return con->features & BIT(head->block);
483}
484
485/*
486 * if obj is not created, then create one.
487 * set feature enable flag.
488 */
489static int __amdgpu_ras_feature_enable(struct amdgpu_device *adev,
490 struct ras_common_if *head, int enable)
491{
492 struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
493 struct ras_manager *obj = amdgpu_ras_find_obj(adev, head);
494
5caf466a 495 /* If hardware does not support ras, then do not create obj.
 496	 * But if hardware supports ras, we can create the obj.
 497	 * The ras framework checks con->hw_supported to see if it needs to do
 498	 * the corresponding initialization.
 499	 * The IP checks con->supported to see if it needs to disable ras.
500 */
c030f2e4 501 if (!amdgpu_ras_is_feature_allowed(adev, head))
502 return 0;
503 if (!(!!enable ^ !!amdgpu_ras_is_feature_enabled(adev, head)))
504 return 0;
505
506 if (enable) {
507 if (!obj) {
508 obj = amdgpu_ras_create_obj(adev, head);
509 if (!obj)
510 return -EINVAL;
511 } else {
 512			/* In case the obj was created somewhere else */
513 get_obj(obj);
514 }
515 con->features |= BIT(head->block);
516 } else {
517 if (obj && amdgpu_ras_is_feature_enabled(adev, head)) {
518 con->features &= ~BIT(head->block);
519 put_obj(obj);
520 }
521 }
522
523 return 0;
524}
525
526/* wrapper of psp_ras_enable_features */
527int amdgpu_ras_feature_enable(struct amdgpu_device *adev,
528 struct ras_common_if *head, bool enable)
529{
530 struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
531 union ta_ras_cmd_input info;
532 int ret;
533
534 if (!con)
535 return -EINVAL;
536
537 if (!enable) {
538 info.disable_features = (struct ta_ras_disable_features_input) {
539 .block_id = head->block,
540 .error_type = head->type,
541 };
542 } else {
543 info.enable_features = (struct ta_ras_enable_features_input) {
544 .block_id = head->block,
545 .error_type = head->type,
546 };
547 }
548
549 /* Do not enable if it is not allowed. */
550 WARN_ON(enable && !amdgpu_ras_is_feature_allowed(adev, head));
 551	/* Are we already in the state we are going to set? */
552 if (!(!!enable ^ !!amdgpu_ras_is_feature_enabled(adev, head)))
553 return 0;
554
555 ret = psp_ras_enable_features(&adev->psp, &info, enable);
556 if (ret) {
557 DRM_ERROR("RAS ERROR: %s %s feature failed ret %d\n",
558 enable ? "enable":"disable",
559 ras_block_str(head->block),
560 ret);
561 return -EINVAL;
562 }
563
564 /* setup the obj */
565 __amdgpu_ras_feature_enable(adev, head, enable);
566
567 return 0;
568}
569
570static int amdgpu_ras_disable_all_features(struct amdgpu_device *adev,
571 bool bypass)
572{
573 struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
574 struct ras_manager *obj, *tmp;
575
576 list_for_each_entry_safe(obj, tmp, &con->head, node) {
577 /* bypass psp.
578 * aka just release the obj and corresponding flags
579 */
580 if (bypass) {
581 if (__amdgpu_ras_feature_enable(adev, &obj->head, 0))
582 break;
583 } else {
584 if (amdgpu_ras_feature_enable(adev, &obj->head, 0))
585 break;
586 }
 587	}
588
589 return con->features;
590}
591
592static int amdgpu_ras_enable_all_features(struct amdgpu_device *adev,
593 bool bypass)
594{
595 struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
596 int ras_block_count = AMDGPU_RAS_BLOCK_COUNT;
597 int i;
598
599 for (i = 0; i < ras_block_count; i++) {
600 struct ras_common_if head = {
601 .block = i,
602 .type = AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE,
603 .sub_block_index = 0,
604 };
605 strcpy(head.name, ras_block_str(i));
606 if (bypass) {
607 /*
 608			 * bypass psp. vbios enables ras for us,
 609			 * so just create the obj.
610 */
611 if (__amdgpu_ras_feature_enable(adev, &head, 1))
612 break;
613 } else {
614 if (amdgpu_ras_feature_enable(adev, &head, 1))
615 break;
616 }
 617	}
618
619 return con->features;
620}
621/* feature ctl end */
622
623/* query/inject/cure begin */
624int amdgpu_ras_error_query(struct amdgpu_device *adev,
625 struct ras_query_if *info)
626{
627 struct ras_manager *obj = amdgpu_ras_find_obj(adev, &info->head);
628
629 if (!obj)
630 return -EINVAL;
 631	/* TODO might read the register to get the count */
632
633 info->ue_count = obj->err_data.ue_count;
634 info->ce_count = obj->err_data.ce_count;
635
636 return 0;
637}
638
639/* wrapper of psp_ras_trigger_error */
640int amdgpu_ras_error_inject(struct amdgpu_device *adev,
641 struct ras_inject_if *info)
642{
643 struct ras_manager *obj = amdgpu_ras_find_obj(adev, &info->head);
644 struct ta_ras_trigger_error_input block_info = {
645 .block_id = info->head.block,
646 .inject_error_type = info->head.type,
647 .sub_block_index = info->head.sub_block_index,
648 .address = info->address,
649 .value = info->value,
650 };
651 int ret = 0;
652
653 if (!obj)
654 return -EINVAL;
655
656 ret = psp_ras_trigger_error(&adev->psp, &block_info);
657 if (ret)
658 DRM_ERROR("RAS ERROR: inject %s error failed ret %d\n",
659 ras_block_str(info->head.block),
660 ret);
661
662 return ret;
663}
664
665int amdgpu_ras_error_cure(struct amdgpu_device *adev,
666 struct ras_cure_if *info)
667{
668 /* psp fw has no cure interface for now. */
669 return 0;
670}
671
672/* get the total error counts on all IPs */
673int amdgpu_ras_query_error_count(struct amdgpu_device *adev,
674 bool is_ce)
675{
676 struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
677 struct ras_manager *obj;
678 struct ras_err_data data = {0, 0};
679
680 if (!con)
681 return -EINVAL;
682
683 list_for_each_entry(obj, &con->head, node) {
684 struct ras_query_if info = {
685 .head = obj->head,
686 };
687
688 if (amdgpu_ras_error_query(adev, &info))
689 return -EINVAL;
690
691 data.ce_count += info.ce_count;
692 data.ue_count += info.ue_count;
693 }
694
695 return is_ce ? data.ce_count : data.ue_count;
696}
697/* query/inject/cure end */
698
699
700/* sysfs begin */
701
702static ssize_t amdgpu_ras_sysfs_features_read(struct device *dev,
703 struct device_attribute *attr, char *buf)
704{
705 struct amdgpu_ras *con =
706 container_of(attr, struct amdgpu_ras, features_attr);
707 struct drm_device *ddev = dev_get_drvdata(dev);
708 struct amdgpu_device *adev = ddev->dev_private;
709 struct ras_common_if head;
710 int ras_block_count = AMDGPU_RAS_BLOCK_COUNT;
711 int i;
712 ssize_t s;
713 struct ras_manager *obj;
714
715 s = scnprintf(buf, PAGE_SIZE, "feature mask: 0x%x\n", con->features);
716
717 for (i = 0; i < ras_block_count; i++) {
718 head.block = i;
719
720 if (amdgpu_ras_is_feature_enabled(adev, &head)) {
721 obj = amdgpu_ras_find_obj(adev, &head);
722 s += scnprintf(&buf[s], PAGE_SIZE - s,
723 "%s: %s\n",
724 ras_block_str(i),
725 ras_err_str(obj->head.type));
726 } else
727 s += scnprintf(&buf[s], PAGE_SIZE - s,
728 "%s: disabled\n",
729 ras_block_str(i));
730 }
731
732 return s;
733}
734
735static int amdgpu_ras_sysfs_create_feature_node(struct amdgpu_device *adev)
736{
737 struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
738 struct attribute *attrs[] = {
739 &con->features_attr.attr,
740 NULL
741 };
742 struct attribute_group group = {
743 .name = "ras",
744 .attrs = attrs,
745 };
746
747 con->features_attr = (struct device_attribute) {
748 .attr = {
749 .name = "features",
750 .mode = S_IRUGO,
2b9505e3 751#ifdef CONFIG_DEBUG_LOCK_ALLOC
752 .ignore_lockdep = 1,
753#endif
c030f2e4 754 },
755 .show = amdgpu_ras_sysfs_features_read,
756 };
757
758 return sysfs_create_group(&adev->dev->kobj, &group);
759}
760
761static int amdgpu_ras_sysfs_remove_feature_node(struct amdgpu_device *adev)
762{
763 struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
764 struct attribute *attrs[] = {
765 &con->features_attr.attr,
766 NULL
767 };
768 struct attribute_group group = {
769 .name = "ras",
770 .attrs = attrs,
771 };
772
773 sysfs_remove_group(&adev->dev->kobj, &group);
774
775 return 0;
776}
777
778int amdgpu_ras_sysfs_create(struct amdgpu_device *adev,
779 struct ras_fs_if *head)
780{
781 struct ras_manager *obj = amdgpu_ras_find_obj(adev, &head->head);
782
783 if (!obj || obj->attr_inuse)
784 return -EINVAL;
785
786 get_obj(obj);
787
788 memcpy(obj->fs_data.sysfs_name,
789 head->sysfs_name,
790 sizeof(obj->fs_data.sysfs_name));
791
792 obj->sysfs_attr = (struct device_attribute){
793 .attr = {
794 .name = obj->fs_data.sysfs_name,
795 .mode = S_IRUGO,
2b9505e3 796#ifdef CONFIG_DEBUG_LOCK_ALLOC
797 .ignore_lockdep = 1,
798#endif
c030f2e4 799 },
800 .show = amdgpu_ras_sysfs_read,
801 };
802
803 if (sysfs_add_file_to_group(&adev->dev->kobj,
804 &obj->sysfs_attr.attr,
805 "ras")) {
806 put_obj(obj);
807 return -EINVAL;
808 }
809
810 obj->attr_inuse = 1;
811
812 return 0;
813}
814
815int amdgpu_ras_sysfs_remove(struct amdgpu_device *adev,
816 struct ras_common_if *head)
817{
818 struct ras_manager *obj = amdgpu_ras_find_obj(adev, head);
819
820 if (!obj || !obj->attr_inuse)
821 return -EINVAL;
822
823 sysfs_remove_file_from_group(&adev->dev->kobj,
824 &obj->sysfs_attr.attr,
825 "ras");
826 obj->attr_inuse = 0;
827 put_obj(obj);
828
829 return 0;
830}
831
832static int amdgpu_ras_sysfs_remove_all(struct amdgpu_device *adev)
833{
834 struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
835 struct ras_manager *obj, *tmp;
836
837 list_for_each_entry_safe(obj, tmp, &con->head, node) {
838 amdgpu_ras_sysfs_remove(adev, &obj->head);
839 }
840
841 amdgpu_ras_sysfs_remove_feature_node(adev);
842
843 return 0;
844}
845/* sysfs end */
846
847/* debugfs begin */
36ea1bd2 848static int amdgpu_ras_debugfs_create_ctrl_node(struct amdgpu_device *adev)
849{
850 struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
851 struct drm_minor *minor = adev->ddev->primary;
852 struct dentry *root = minor->debugfs_root, *dir;
853 struct dentry *ent;
854
855 dir = debugfs_create_dir("ras", root);
856 if (IS_ERR(dir))
857 return -EINVAL;
858
859 con->dir = dir;
860
861 ent = debugfs_create_file("ras_ctrl",
862 S_IWUGO | S_IRUGO, con->dir,
863 adev, &amdgpu_ras_debugfs_ctrl_ops);
864 if (IS_ERR(ent)) {
865 debugfs_remove(con->dir);
866 return -EINVAL;
867 }
868
869 con->ent = ent;
870 return 0;
871}
872
c030f2e4 873int amdgpu_ras_debugfs_create(struct amdgpu_device *adev,
874 struct ras_fs_if *head)
875{
876 struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
877 struct ras_manager *obj = amdgpu_ras_find_obj(adev, &head->head);
878 struct dentry *ent;
879
880 if (!obj || obj->ent)
881 return -EINVAL;
882
883 get_obj(obj);
884
885 memcpy(obj->fs_data.debugfs_name,
886 head->debugfs_name,
887 sizeof(obj->fs_data.debugfs_name));
888
889 ent = debugfs_create_file(obj->fs_data.debugfs_name,
890 S_IWUGO | S_IRUGO, con->dir,
891 obj, &amdgpu_ras_debugfs_ops);
892
893 if (IS_ERR(ent))
894 return -EINVAL;
895
896 obj->ent = ent;
897
898 return 0;
899}
900
901int amdgpu_ras_debugfs_remove(struct amdgpu_device *adev,
902 struct ras_common_if *head)
903{
904 struct ras_manager *obj = amdgpu_ras_find_obj(adev, head);
905
906 if (!obj || !obj->ent)
907 return 0;
908
909 debugfs_remove(obj->ent);
910 obj->ent = NULL;
911 put_obj(obj);
912
913 return 0;
914}
915
916static int amdgpu_ras_debugfs_remove_all(struct amdgpu_device *adev)
917{
918 struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
919 struct ras_manager *obj, *tmp;
920
921 list_for_each_entry_safe(obj, tmp, &con->head, node) {
922 amdgpu_ras_debugfs_remove(adev, &obj->head);
923 }
924
36ea1bd2 925 debugfs_remove(con->ent);
c030f2e4 926 debugfs_remove(con->dir);
927 con->dir = NULL;
36ea1bd2 928 con->ent = NULL;
c030f2e4 929
930 return 0;
931}
932/* debugfs end */
933
934/* ras fs */
935
936static int amdgpu_ras_fs_init(struct amdgpu_device *adev)
937{
c030f2e4 938 amdgpu_ras_sysfs_create_feature_node(adev);
36ea1bd2 939 amdgpu_ras_debugfs_create_ctrl_node(adev);
c030f2e4 940
941 return 0;
942}
943
944static int amdgpu_ras_fs_fini(struct amdgpu_device *adev)
945{
946 amdgpu_ras_debugfs_remove_all(adev);
947 amdgpu_ras_sysfs_remove_all(adev);
948 return 0;
949}
950/* ras fs end */
951
952/* ih begin */
953static void amdgpu_ras_interrupt_handler(struct ras_manager *obj)
954{
955 struct ras_ih_data *data = &obj->ih_data;
956 struct amdgpu_iv_entry entry;
957 int ret;
958
959 while (data->rptr != data->wptr) {
960 rmb();
961 memcpy(&entry, &data->ring[data->rptr],
962 data->element_size);
963
964 wmb();
965 data->rptr = (data->aligned_element_size +
966 data->rptr) % data->ring_size;
967
 968		/* Let the IP handle its data; maybe we need to get the output
 969		 * from the callback to update the error type/count, etc.
970 */
971 if (data->cb) {
972 ret = data->cb(obj->adev, &entry);
973 /* ue will trigger an interrupt, and in that case
 974			 * we need to do a reset to recover the whole system.
 975			 * But leave that recovery to the IP; here we just dispatch
976 * the error.
977 */
978 if (ret == AMDGPU_RAS_UE) {
979 obj->err_data.ue_count++;
980 }
 981			/* Might need to get the ce count from a register, but not every IP
 982			 * saves a ce count; some IPs just use one or two bits
 983			 * to indicate that a ce happened.
984 */
985 }
986 }
987}
988
989static void amdgpu_ras_interrupt_process_handler(struct work_struct *work)
990{
991 struct ras_ih_data *data =
992 container_of(work, struct ras_ih_data, ih_work);
993 struct ras_manager *obj =
994 container_of(data, struct ras_manager, ih_data);
995
996 amdgpu_ras_interrupt_handler(obj);
997}
998
999int amdgpu_ras_interrupt_dispatch(struct amdgpu_device *adev,
1000 struct ras_dispatch_if *info)
1001{
 1002	struct ras_manager *obj = amdgpu_ras_find_obj(adev, &info->head);
 1003	struct ras_ih_data *data;
 1004
 1005	if (!obj)
 1006		return -EINVAL;
 1007	data = &obj->ih_data;
 1008	if (data->inuse == 0)
1009 return 0;
1010
1011 /* Might be overflow... */
1012 memcpy(&data->ring[data->wptr], info->entry,
1013 data->element_size);
1014
1015 wmb();
1016 data->wptr = (data->aligned_element_size +
1017 data->wptr) % data->ring_size;
1018
1019 schedule_work(&data->ih_work);
1020
1021 return 0;
1022}
1023
1024int amdgpu_ras_interrupt_remove_handler(struct amdgpu_device *adev,
1025 struct ras_ih_if *info)
1026{
1027 struct ras_manager *obj = amdgpu_ras_find_obj(adev, &info->head);
1028 struct ras_ih_data *data;
1029
1030 if (!obj)
1031 return -EINVAL;
1032
1033 data = &obj->ih_data;
1034 if (data->inuse == 0)
1035 return 0;
1036
1037 cancel_work_sync(&data->ih_work);
1038
1039 kfree(data->ring);
1040 memset(data, 0, sizeof(*data));
1041 put_obj(obj);
1042
1043 return 0;
1044}
1045
1046int amdgpu_ras_interrupt_add_handler(struct amdgpu_device *adev,
1047 struct ras_ih_if *info)
1048{
1049 struct ras_manager *obj = amdgpu_ras_find_obj(adev, &info->head);
1050 struct ras_ih_data *data;
1051
1052 if (!obj) {
 1053		/* in case we register the IH before enabling the ras feature */
1054 obj = amdgpu_ras_create_obj(adev, &info->head);
1055 if (!obj)
1056 return -EINVAL;
1057 } else
1058 get_obj(obj);
1059
1060 data = &obj->ih_data;
 1061	/* add the callback etc. */
1062 *data = (struct ras_ih_data) {
1063 .inuse = 0,
1064 .cb = info->cb,
1065 .element_size = sizeof(struct amdgpu_iv_entry),
1066 .rptr = 0,
1067 .wptr = 0,
1068 };
1069
1070 INIT_WORK(&data->ih_work, amdgpu_ras_interrupt_process_handler);
1071
1072 data->aligned_element_size = ALIGN(data->element_size, 8);
1073 /* the ring can store 64 iv entries. */
1074 data->ring_size = 64 * data->aligned_element_size;
1075 data->ring = kmalloc(data->ring_size, GFP_KERNEL);
1076 if (!data->ring) {
1077 put_obj(obj);
1078 return -ENOMEM;
1079 }
1080
1081 /* IH is ready */
1082 data->inuse = 1;
1083
1084 return 0;
1085}
1086
1087static int amdgpu_ras_interrupt_remove_all(struct amdgpu_device *adev)
1088{
1089 struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
1090 struct ras_manager *obj, *tmp;
1091
1092 list_for_each_entry_safe(obj, tmp, &con->head, node) {
1093 struct ras_ih_if info = {
1094 .head = obj->head,
1095 };
1096 amdgpu_ras_interrupt_remove_handler(adev, &info);
1097 }
1098
1099 return 0;
1100}
1101/* ih end */
1102
1103/* recovery begin */
1104static void amdgpu_ras_do_recovery(struct work_struct *work)
1105{
1106 struct amdgpu_ras *ras =
1107 container_of(work, struct amdgpu_ras, recovery_work);
1108
1109 amdgpu_device_gpu_recover(ras->adev, 0);
1110 atomic_set(&ras->in_recovery, 0);
1111}
1112
1113static int amdgpu_ras_release_vram(struct amdgpu_device *adev,
1114 struct amdgpu_bo **bo_ptr)
1115{
1116 /* no need to free it actually. */
1117 amdgpu_bo_free_kernel(bo_ptr, NULL, NULL);
1118 return 0;
1119}
1120
1121/* reserve vram with size@offset */
1122static int amdgpu_ras_reserve_vram(struct amdgpu_device *adev,
1123 uint64_t offset, uint64_t size,
1124 struct amdgpu_bo **bo_ptr)
1125{
1126 struct ttm_operation_ctx ctx = { false, false };
1127 struct amdgpu_bo_param bp;
1128 int r = 0;
1129 int i;
1130 struct amdgpu_bo *bo;
1131
1132 if (bo_ptr)
1133 *bo_ptr = NULL;
1134 memset(&bp, 0, sizeof(bp));
1135 bp.size = size;
1136 bp.byte_align = PAGE_SIZE;
1137 bp.domain = AMDGPU_GEM_DOMAIN_VRAM;
1138 bp.flags = AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS |
1139 AMDGPU_GEM_CREATE_NO_CPU_ACCESS;
1140 bp.type = ttm_bo_type_kernel;
1141 bp.resv = NULL;
1142
1143 r = amdgpu_bo_create(adev, &bp, &bo);
1144 if (r)
1145 return -EINVAL;
1146
1147 r = amdgpu_bo_reserve(bo, false);
1148 if (r)
1149 goto error_reserve;
1150
1151 offset = ALIGN(offset, PAGE_SIZE);
1152 for (i = 0; i < bo->placement.num_placement; ++i) {
1153 bo->placements[i].fpfn = offset >> PAGE_SHIFT;
1154 bo->placements[i].lpfn = (offset + size) >> PAGE_SHIFT;
1155 }
1156
1157 ttm_bo_mem_put(&bo->tbo, &bo->tbo.mem);
1158 r = ttm_bo_mem_space(&bo->tbo, &bo->placement, &bo->tbo.mem, &ctx);
1159 if (r)
1160 goto error_pin;
1161
1162 r = amdgpu_bo_pin_restricted(bo,
1163 AMDGPU_GEM_DOMAIN_VRAM,
1164 offset,
1165 offset + size);
1166 if (r)
1167 goto error_pin;
1168
1169 if (bo_ptr)
1170 *bo_ptr = bo;
1171
1172 amdgpu_bo_unreserve(bo);
1173 return r;
1174
1175error_pin:
1176 amdgpu_bo_unreserve(bo);
1177error_reserve:
1178 amdgpu_bo_unref(&bo);
1179 return r;
1180}
1181
1182/* alloc/realloc bps array */
1183static int amdgpu_ras_realloc_eh_data_space(struct amdgpu_device *adev,
1184 struct ras_err_handler_data *data, int pages)
1185{
1186 unsigned int old_space = data->count + data->space_left;
1187 unsigned int new_space = old_space + pages;
1188 unsigned int align_space = ALIGN(new_space, 1024);
1189 void *tmp = kmalloc(align_space * sizeof(*data->bps), GFP_KERNEL);
1190
1191 if (!tmp)
1192 return -ENOMEM;
1193
1194 if (data->bps) {
1195 memcpy(tmp, data->bps,
1196 data->count * sizeof(*data->bps));
1197 kfree(data->bps);
1198 }
1199
1200 data->bps = tmp;
1201 data->space_left += align_space - old_space;
1202 return 0;
1203}
1204
 1205/* it deals with vram only. */
1206int amdgpu_ras_add_bad_pages(struct amdgpu_device *adev,
1207 unsigned long *bps, int pages)
1208{
1209 struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
1210 struct ras_err_handler_data *data = con->eh_data;
1211 int i = pages;
1212 int ret = 0;
1213
1214 if (!con || !data || !bps || pages <= 0)
1215 return 0;
1216
1217 mutex_lock(&con->recovery_lock);
1218 if (!data)
1219 goto out;
1220
1221 if (data->space_left <= pages)
1222 if (amdgpu_ras_realloc_eh_data_space(adev, data, pages)) {
1223 ret = -ENOMEM;
1224 goto out;
1225 }
1226
1227 while (i--)
1228 data->bps[data->count++].bp = bps[i];
1229
1230 data->space_left -= pages;
1231out:
1232 mutex_unlock(&con->recovery_lock);
1233
1234 return ret;
1235}
1236
1237/* called in gpu recovery/init */
1238int amdgpu_ras_reserve_bad_pages(struct amdgpu_device *adev)
1239{
1240 struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
1241 struct ras_err_handler_data *data = con->eh_data;
1242 uint64_t bp;
1243 struct amdgpu_bo *bo;
1244 int i;
1245
1246 if (!con || !data)
1247 return 0;
1248
1249 mutex_lock(&con->recovery_lock);
1250 /* reserve vram at driver post stage. */
1251 for (i = data->last_reserved; i < data->count; i++) {
1252 bp = data->bps[i].bp;
1253
1254 if (amdgpu_ras_reserve_vram(adev, bp << PAGE_SHIFT,
1255 PAGE_SIZE, &bo))
1256 DRM_ERROR("RAS ERROR: reserve vram %llx fail\n", bp);
1257
1258 data->bps[i].bo = bo;
1259 data->last_reserved = i + 1;
1260 }
1261 mutex_unlock(&con->recovery_lock);
1262 return 0;
1263}
1264
 1265/* called when the driver unloads */
1266static int amdgpu_ras_release_bad_pages(struct amdgpu_device *adev)
1267{
1268 struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
1269 struct ras_err_handler_data *data = con->eh_data;
1270 struct amdgpu_bo *bo;
1271 int i;
1272
1273 if (!con || !data)
1274 return 0;
1275
1276 mutex_lock(&con->recovery_lock);
1277 for (i = data->last_reserved - 1; i >= 0; i--) {
1278 bo = data->bps[i].bo;
1279
1280 amdgpu_ras_release_vram(adev, &bo);
1281
1282 data->bps[i].bo = bo;
1283 data->last_reserved = i;
1284 }
1285 mutex_unlock(&con->recovery_lock);
1286 return 0;
1287}
1288
1289static int amdgpu_ras_save_bad_pages(struct amdgpu_device *adev)
1290{
1291 /* TODO
 1292	 * write the array to eeprom when SMU is disabled.
1293 */
1294 return 0;
1295}
1296
1297static int amdgpu_ras_load_bad_pages(struct amdgpu_device *adev)
1298{
1299 /* TODO
 1300	 * read the array from eeprom when SMU is disabled.
1301 */
1302 return 0;
1303}
1304
1305static int amdgpu_ras_recovery_init(struct amdgpu_device *adev)
1306{
1307 struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
1308 struct ras_err_handler_data **data = &con->eh_data;
1309
1310 *data = kmalloc(sizeof(**data),
1311 GFP_KERNEL|__GFP_ZERO);
1312 if (!*data)
1313 return -ENOMEM;
1314
1315 mutex_init(&con->recovery_lock);
1316 INIT_WORK(&con->recovery_work, amdgpu_ras_do_recovery);
1317 atomic_set(&con->in_recovery, 0);
1318 con->adev = adev;
1319
1320 amdgpu_ras_load_bad_pages(adev);
1321 amdgpu_ras_reserve_bad_pages(adev);
1322
1323 return 0;
1324}
1325
1326static int amdgpu_ras_recovery_fini(struct amdgpu_device *adev)
1327{
1328 struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
1329 struct ras_err_handler_data *data = con->eh_data;
1330
1331 cancel_work_sync(&con->recovery_work);
1332 amdgpu_ras_save_bad_pages(adev);
1333 amdgpu_ras_release_bad_pages(adev);
1334
1335 mutex_lock(&con->recovery_lock);
1336 con->eh_data = NULL;
1337 kfree(data->bps);
1338 kfree(data);
1339 mutex_unlock(&con->recovery_lock);
1340
1341 return 0;
1342}
1343/* recovery end */
1344
5caf466a 1345/*
 1346 * check hardware's ras ability which will be saved in hw_supported.
 1347 * if hardware does not support ras, we can skip some ras initialization and
 1348 * forbid some ras operations from IPs.
 1349 * if the software itself, say a boot parameter, limits the ras ability, we
 1350 * still need to allow IPs to do some limited operations, like disable. In
 1351 * such a case, we have to initialize ras as normal but need to check whether
 1352 * an operation is allowed in each function.
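 *
 * A worked example (illustrative): if the vbios reports ECC support so that
 * hw_supported becomes AMDGPU_RAS_BLOCK_MASK, and ras_mask (see
 * /sys/module/amdgpu/parameters/ras_mask) is set to 0x1, then only bit 0
 * remains set in supported, i.e. only the umc block (assuming block 0 maps
 * to "umc" as in ras_block_string[]) may be operated on.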
1353 */
1354static void amdgpu_ras_check_supported(struct amdgpu_device *adev,
1355 uint32_t *hw_supported, uint32_t *supported)
c030f2e4 1356{
5caf466a 1357 *hw_supported = 0;
1358 *supported = 0;
c030f2e4 1359
5caf466a 1360 if (amdgpu_sriov_vf(adev) ||
b404ae82 1361 adev->asic_type != CHIP_VEGA20)
5caf466a 1362 return;
b404ae82 1363
1364 if (amdgpu_atomfirmware_mem_ecc_supported(adev) ||
5caf466a 1365 amdgpu_atomfirmware_sram_ecc_supported(adev))
1366 *hw_supported = AMDGPU_RAS_BLOCK_MASK;
b404ae82 1367
5caf466a 1368 *supported = amdgpu_ras_enable == 0 ?
1369 0 : *hw_supported & amdgpu_ras_mask;
c030f2e4 1370}
1371
1372int amdgpu_ras_init(struct amdgpu_device *adev)
1373{
1374 struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
c030f2e4 1375
b404ae82 1376 if (con)
c030f2e4 1377 return 0;
1378
1379 con = kmalloc(sizeof(struct amdgpu_ras) +
1380 sizeof(struct ras_manager) * AMDGPU_RAS_BLOCK_COUNT,
1381 GFP_KERNEL|__GFP_ZERO);
1382 if (!con)
1383 return -ENOMEM;
1384
1385 con->objs = (struct ras_manager *)(con + 1);
1386
1387 amdgpu_ras_set_context(adev, con);
1388
5caf466a 1389 amdgpu_ras_check_supported(adev, &con->hw_supported,
1390 &con->supported);
c030f2e4 1391 con->features = 0;
1392 INIT_LIST_HEAD(&con->head);
1393
1394 if (amdgpu_ras_recovery_init(adev))
1395 goto recovery_out;
1396
1397 amdgpu_ras_mask &= AMDGPU_RAS_BLOCK_MASK;
1398
1399 amdgpu_ras_enable_all_features(adev, 1);
1400
1401 if (amdgpu_ras_fs_init(adev))
1402 goto fs_out;
1403
1404 amdgpu_ras_self_test(adev);
1405 return 0;
1406fs_out:
1407 amdgpu_ras_recovery_fini(adev);
1408recovery_out:
1409 amdgpu_ras_set_context(adev, NULL);
1410 kfree(con);
1411
1412 return -EINVAL;
1413}
1414
1415/* do some fini work before IP fini as dependence */
1416int amdgpu_ras_pre_fini(struct amdgpu_device *adev)
1417{
1418 struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
1419
1420 if (!con)
1421 return 0;
1422
 1423	/* Need to disable ras on all IPs here before ip [hw/sw]fini */
1424 amdgpu_ras_disable_all_features(adev, 0);
1425 amdgpu_ras_recovery_fini(adev);
1426 return 0;
1427}
1428
1429int amdgpu_ras_fini(struct amdgpu_device *adev)
1430{
1431 struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
1432
1433 if (!con)
1434 return 0;
1435
1436 amdgpu_ras_fs_fini(adev);
1437 amdgpu_ras_interrupt_remove_all(adev);
1438
1439 WARN(con->features, "Feature mask is not cleared");
1440
1441 if (con->features)
1442 amdgpu_ras_disable_all_features(adev, 1);
1443
1444 amdgpu_ras_set_context(adev, NULL);
1445 kfree(con);
1446
1447 return 0;
1448}