c030f2e4 |
1 | /* |
2 | * Copyright 2018 Advanced Micro Devices, Inc. |
3 | * |
4 | * Permission is hereby granted, free of charge, to any person obtaining a |
5 | * copy of this software and associated documentation files (the "Software"), |
6 | * to deal in the Software without restriction, including without limitation |
7 | * the rights to use, copy, modify, merge, publish, distribute, sublicense, |
8 | * and/or sell copies of the Software, and to permit persons to whom the |
9 | * Software is furnished to do so, subject to the following conditions: |
10 | * |
11 | * The above copyright notice and this permission notice shall be included in |
12 | * all copies or substantial portions of the Software. |
13 | * |
14 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR |
15 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, |
16 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL |
17 | * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR |
18 | * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, |
19 | * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR |
20 | * OTHER DEALINGS IN THE SOFTWARE. |
21 | * |
22 | * |
23 | */ |
24 | #include <linux/debugfs.h> |
25 | #include <linux/list.h> |
26 | #include <linux/module.h> |
27 | #include "amdgpu.h" |
28 | #include "amdgpu_ras.h" |
b404ae82 |
29 | #include "amdgpu_atomfirmware.h" |
c030f2e4 |
30 | |
31 | struct ras_ih_data { |
32 | /* interrupt bottom half */ |
33 | struct work_struct ih_work; |
34 | int inuse; |
35 | /* IP callback */ |
36 | ras_ih_cb cb; |
37 | /* full of entries */ |
38 | unsigned char *ring; |
39 | unsigned int ring_size; |
40 | unsigned int element_size; |
41 | unsigned int aligned_element_size; |
42 | unsigned int rptr; |
43 | unsigned int wptr; |
44 | }; |
45 | |
/* File-system node names owned by one ras_manager. */
struct ras_fs_data {
	/* name of the per-block attribute added to the "ras" sysfs group */
	char sysfs_name[32];
	/* name of the per-block file created in the "ras" debugfs directory */
	char debugfs_name[32];
};
50 | |
/* Error counters kept per RAS object: uncorrectable first, then correctable. */
struct ras_err_data {
	unsigned long ue_count, ce_count;
};
55 | |
/* Bookkeeping for the bad-page array. */
struct ras_err_handler_data {
	/* points to the bad pages array; one entry per bad page */
	struct {
		unsigned long bp;
		struct amdgpu_bo *bo;
	} *bps;
	/* number of entries currently stored in bps */
	int count;
	/* number of further entries that still fit in bps */
	int space_left;
	/* index of the last reserved entry, plus one */
	int last_reserved;
};
69 | |
70 | struct ras_manager { |
71 | struct ras_common_if head; |
72 | /* reference count */ |
73 | int use; |
74 | /* ras block link */ |
75 | struct list_head node; |
76 | /* the device */ |
77 | struct amdgpu_device *adev; |
78 | /* debugfs */ |
79 | struct dentry *ent; |
80 | /* sysfs */ |
81 | struct device_attribute sysfs_attr; |
82 | int attr_inuse; |
83 | |
84 | /* fs node name */ |
85 | struct ras_fs_data fs_data; |
86 | |
87 | /* IH data */ |
88 | struct ras_ih_data ih_data; |
89 | |
90 | struct ras_err_data err_data; |
91 | }; |
92 | |
466b1793 |
/*
 * One bad-page record as shown by the gpu_vram_bad_pages sysfs file:
 * bp is printed as the gpu pfn, size as the gpu page size, and flags is
 * translated to a single status character.
 */
struct ras_badpage {
	unsigned int bp, size, flags;
};
98 | |
c030f2e4 |
/* Error type names; ras_err_str() indexes this table with ffs(type). */
const char *ras_error_string[] = {
	[0] = "none",
	[1] = "parity",
	[2] = "single_correctable",
	[3] = "multi_uncorrectable",
	[4] = "poison",
};
106 | |
/* IP block names, indexed by block id; also the names accepted by ras_ctrl. */
const char *ras_block_string[] = {
	[0]  = "umc",
	[1]  = "sdma",
	[2]  = "gfx",
	[3]  = "mmhub",
	[4]  = "athub",
	[5]  = "pcie_bif",
	[6]  = "hdp",
	[7]  = "xgmi_wafl",
	[8]  = "df",
	[9]  = "smn",
	[10] = "sem",
	[11] = "mp0",
	[12] = "mp1",
	[13] = "fuse",
};
123 | |
/* Name lookups: the error table is indexed by ffs(i), the block table by i. */
#define ras_err_str(i)		(ras_error_string[ffs((i))])
#define ras_block_str(i)	(ras_block_string[(i)])
126 | |
a564808e |
/* Bits stored in the RAS context's flags word. */
#define AMDGPU_RAS_FLAG_INIT_BY_VBIOS		1
#define AMDGPU_RAS_FLAG_INIT_NEED_RESET		2
/* Flags assumed when nothing else is specified. */
#define RAS_DEFAULT_FLAGS			(AMDGPU_RAS_FLAG_INIT_BY_VBIOS)
130 | |
c030f2e4 |
/* Placeholder for a RAS self test; currently does nothing. */
static void amdgpu_ras_self_test(struct amdgpu_device *adev)
{
	/* TODO */
}
135 | |
136 | static ssize_t amdgpu_ras_debugfs_read(struct file *f, char __user *buf, |
137 | size_t size, loff_t *pos) |
138 | { |
139 | struct ras_manager *obj = (struct ras_manager *)file_inode(f)->i_private; |
140 | struct ras_query_if info = { |
141 | .head = obj->head, |
142 | }; |
143 | ssize_t s; |
144 | char val[128]; |
145 | |
146 | if (amdgpu_ras_error_query(obj->adev, &info)) |
147 | return -EINVAL; |
148 | |
149 | s = snprintf(val, sizeof(val), "%s: %lu\n%s: %lu\n", |
150 | "ue", info.ue_count, |
151 | "ce", info.ce_count); |
152 | if (*pos >= s) |
153 | return 0; |
154 | |
155 | s -= *pos; |
156 | s = min_t(u64, s, size); |
157 | |
158 | |
159 | if (copy_to_user(buf, &val[*pos], s)) |
160 | return -EINVAL; |
161 | |
162 | *pos += s; |
163 | |
164 | return s; |
165 | } |
166 | |
c030f2e4 |
167 | static const struct file_operations amdgpu_ras_debugfs_ops = { |
168 | .owner = THIS_MODULE, |
169 | .read = amdgpu_ras_debugfs_read, |
190211ab |
170 | .write = NULL, |
c030f2e4 |
171 | .llseek = default_llseek |
172 | }; |
173 | |
96ebb307 |
174 | static int amdgpu_ras_find_block_id_by_name(const char *name, int *block_id) |
175 | { |
176 | int i; |
177 | |
178 | for (i = 0; i < ARRAY_SIZE(ras_block_string); i++) { |
179 | *block_id = i; |
180 | if (strcmp(name, ras_block_str(i)) == 0) |
181 | return 0; |
182 | } |
183 | return -EINVAL; |
184 | } |
185 | |
186 | static int amdgpu_ras_debugfs_ctrl_parse_data(struct file *f, |
187 | const char __user *buf, size_t size, |
188 | loff_t *pos, struct ras_debug_if *data) |
189 | { |
190 | ssize_t s = min_t(u64, 64, size); |
191 | char str[65]; |
192 | char block_name[33]; |
193 | char err[9] = "ue"; |
194 | int op = -1; |
195 | int block_id; |
196 | u64 address, value; |
197 | |
198 | if (*pos) |
199 | return -EINVAL; |
200 | *pos = size; |
201 | |
202 | memset(str, 0, sizeof(str)); |
203 | memset(data, 0, sizeof(*data)); |
204 | |
205 | if (copy_from_user(str, buf, s)) |
206 | return -EINVAL; |
207 | |
208 | if (sscanf(str, "disable %32s", block_name) == 1) |
209 | op = 0; |
210 | else if (sscanf(str, "enable %32s %8s", block_name, err) == 2) |
211 | op = 1; |
212 | else if (sscanf(str, "inject %32s %8s", block_name, err) == 2) |
213 | op = 2; |
b076296b |
214 | else if (str[0] && str[1] && str[2] && str[3]) |
96ebb307 |
215 | /* ascii string, but commands are not matched. */ |
216 | return -EINVAL; |
217 | |
218 | if (op != -1) { |
219 | if (amdgpu_ras_find_block_id_by_name(block_name, &block_id)) |
220 | return -EINVAL; |
221 | |
222 | data->head.block = block_id; |
223 | data->head.type = memcmp("ue", err, 2) == 0 ? |
224 | AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE : |
225 | AMDGPU_RAS_ERROR__SINGLE_CORRECTABLE; |
226 | data->op = op; |
227 | |
228 | if (op == 2) { |
229 | if (sscanf(str, "%*s %*s %*s %llu %llu", |
230 | &address, &value) != 2) |
231 | if (sscanf(str, "%*s %*s %*s 0x%llx 0x%llx", |
232 | &address, &value) != 2) |
233 | return -EINVAL; |
234 | data->inject.address = address; |
235 | data->inject.value = value; |
236 | } |
237 | } else { |
73aa8e1a |
238 | if (size < sizeof(*data)) |
96ebb307 |
239 | return -EINVAL; |
240 | |
241 | if (copy_from_user(data, buf, sizeof(*data))) |
242 | return -EINVAL; |
243 | } |
244 | |
245 | return 0; |
246 | } |
36ea1bd2 |
/*
 * DOC: ras debugfs control interface
 *
 * It accepts struct ras_debug_if, which has two members.
 *
 * First member: ras_debug_if::head or ras_debug_if::inject.
 *
 * head is used to indicate which IP block will be under control.
 *
 * head has four members, they are block, type, sub_block_index, name.
 * block: which IP will be under control.
 * type: what kind of error will be enabled/disabled/injected.
 * sub_block_index: some IPs have subcomponents, e.g. GFX, SDMA.
 * name: the name of IP.
 *
 * inject has two more members than head, they are address, value.
 * As their names indicate, the inject operation will write the
 * value to the address.
 *
 * Second member: struct ras_debug_if::op.
 * It has three kinds of operations.
 *  0: disable RAS on the block. Take ::head as its data.
 *  1: enable RAS on the block. Take ::head as its data.
 *  2: inject errors on the block. Take ::inject as its data.
 *
 * How to use the interface?
 * programs:
 * copy the struct ras_debug_if into your code and initialize it.
 * write the struct to the control node.
 *
 * bash:
 * echo op block [error [address value]] > .../ras/ras_ctrl
 *	op: disable, enable, inject
 *		disable: only block is needed
 *		enable: block and error are needed
 *		inject: block, error, address, value are needed
 *	block: umc, sdma, gfx, .........
 *		see ras_block_string[] for details
 *	error: ue, ce
 *		ue: multi_uncorrectable
 *		ce: single_correctable
 *
 * here are some examples for bash commands,
 *	echo inject umc ue 0x0 0x0 > /sys/kernel/debug/dri/0/ras/ras_ctrl
 *	echo inject umc ce 0 0 > /sys/kernel/debug/dri/0/ras/ras_ctrl
 *	echo disable umc > /sys/kernel/debug/dri/0/ras/ras_ctrl
 *
 * How to check the result?
 *
 * For disable/enable, please check ras features at
 * /sys/class/drm/card[0/1/2...]/device/ras/features
 *
 * For inject, please check the corresponding err count at
 * /sys/class/drm/card[0/1/2...]/device/ras/[gfx/sdma/...]_err_count
 *
 * NOTE: operation is only allowed on blocks which are supported.
 * Please check ras mask at /sys/module/amdgpu/parameters/ras_mask
 */
305 | static ssize_t amdgpu_ras_debugfs_ctrl_write(struct file *f, const char __user *buf, |
306 | size_t size, loff_t *pos) |
307 | { |
308 | struct amdgpu_device *adev = (struct amdgpu_device *)file_inode(f)->i_private; |
309 | struct ras_debug_if data; |
310 | int ret = 0; |
311 | |
96ebb307 |
312 | ret = amdgpu_ras_debugfs_ctrl_parse_data(f, buf, size, pos, &data); |
313 | if (ret) |
36ea1bd2 |
314 | return -EINVAL; |
315 | |
36ea1bd2 |
316 | if (!amdgpu_ras_is_supported(adev, data.head.block)) |
317 | return -EINVAL; |
318 | |
319 | switch (data.op) { |
320 | case 0: |
321 | ret = amdgpu_ras_feature_enable(adev, &data.head, 0); |
322 | break; |
323 | case 1: |
324 | ret = amdgpu_ras_feature_enable(adev, &data.head, 1); |
325 | break; |
326 | case 2: |
327 | ret = amdgpu_ras_error_inject(adev, &data.inject); |
328 | break; |
96ebb307 |
329 | default: |
330 | ret = -EINVAL; |
331 | break; |
36ea1bd2 |
332 | }; |
333 | |
334 | if (ret) |
335 | return -EINVAL; |
336 | |
337 | return size; |
338 | } |
339 | |
340 | static const struct file_operations amdgpu_ras_debugfs_ctrl_ops = { |
341 | .owner = THIS_MODULE, |
342 | .read = NULL, |
343 | .write = amdgpu_ras_debugfs_ctrl_write, |
344 | .llseek = default_llseek |
345 | }; |
346 | |
c030f2e4 |
347 | static ssize_t amdgpu_ras_sysfs_read(struct device *dev, |
348 | struct device_attribute *attr, char *buf) |
349 | { |
350 | struct ras_manager *obj = container_of(attr, struct ras_manager, sysfs_attr); |
351 | struct ras_query_if info = { |
352 | .head = obj->head, |
353 | }; |
354 | |
355 | if (amdgpu_ras_error_query(obj->adev, &info)) |
356 | return -EINVAL; |
357 | |
358 | return snprintf(buf, PAGE_SIZE, "%s: %lu\n%s: %lu\n", |
359 | "ue", info.ue_count, |
360 | "ce", info.ce_count); |
361 | } |
362 | |
363 | /* obj begin */ |
364 | |
/* Take one reference on obj. */
#define get_obj(obj)	do { (obj)->use++; } while (0)
/* Non-zero while obj still has references. */
#define alive_obj(obj)	((obj)->use)
367 | |
368 | static inline void put_obj(struct ras_manager *obj) |
369 | { |
370 | if (obj && --obj->use == 0) |
371 | list_del(&obj->node); |
372 | if (obj && obj->use < 0) { |
373 | DRM_ERROR("RAS ERROR: Unbalance obj(%s) use\n", obj->head.name); |
374 | } |
375 | } |
376 | |
377 | /* make one obj and return it. */ |
378 | static struct ras_manager *amdgpu_ras_create_obj(struct amdgpu_device *adev, |
379 | struct ras_common_if *head) |
380 | { |
381 | struct amdgpu_ras *con = amdgpu_ras_get_context(adev); |
382 | struct ras_manager *obj; |
383 | |
384 | if (!con) |
385 | return NULL; |
386 | |
387 | if (head->block >= AMDGPU_RAS_BLOCK_COUNT) |
388 | return NULL; |
389 | |
390 | obj = &con->objs[head->block]; |
391 | /* already exist. return obj? */ |
392 | if (alive_obj(obj)) |
393 | return NULL; |
394 | |
395 | obj->head = *head; |
396 | obj->adev = adev; |
397 | list_add(&obj->node, &con->head); |
398 | get_obj(obj); |
399 | |
400 | return obj; |
401 | } |
402 | |
403 | /* return an obj equal to head, or the first when head is NULL */ |
404 | static struct ras_manager *amdgpu_ras_find_obj(struct amdgpu_device *adev, |
405 | struct ras_common_if *head) |
406 | { |
407 | struct amdgpu_ras *con = amdgpu_ras_get_context(adev); |
408 | struct ras_manager *obj; |
409 | int i; |
410 | |
411 | if (!con) |
412 | return NULL; |
413 | |
414 | if (head) { |
415 | if (head->block >= AMDGPU_RAS_BLOCK_COUNT) |
416 | return NULL; |
417 | |
418 | obj = &con->objs[head->block]; |
419 | |
420 | if (alive_obj(obj)) { |
421 | WARN_ON(head->block != obj->head.block); |
422 | return obj; |
423 | } |
424 | } else { |
425 | for (i = 0; i < AMDGPU_RAS_BLOCK_COUNT; i++) { |
426 | obj = &con->objs[i]; |
427 | if (alive_obj(obj)) { |
428 | WARN_ON(i != obj->head.block); |
429 | return obj; |
430 | } |
431 | } |
432 | } |
433 | |
434 | return NULL; |
435 | } |
436 | /* obj end */ |
437 | |
438 | /* feature ctl begin */ |
439 | static int amdgpu_ras_is_feature_allowed(struct amdgpu_device *adev, |
440 | struct ras_common_if *head) |
441 | { |
5caf466a |
442 | struct amdgpu_ras *con = amdgpu_ras_get_context(adev); |
443 | |
444 | return con->hw_supported & BIT(head->block); |
c030f2e4 |
445 | } |
446 | |
447 | static int amdgpu_ras_is_feature_enabled(struct amdgpu_device *adev, |
448 | struct ras_common_if *head) |
449 | { |
450 | struct amdgpu_ras *con = amdgpu_ras_get_context(adev); |
451 | |
452 | return con->features & BIT(head->block); |
453 | } |
454 | |
455 | /* |
456 | * if obj is not created, then create one. |
457 | * set feature enable flag. |
458 | */ |
459 | static int __amdgpu_ras_feature_enable(struct amdgpu_device *adev, |
460 | struct ras_common_if *head, int enable) |
461 | { |
462 | struct amdgpu_ras *con = amdgpu_ras_get_context(adev); |
463 | struct ras_manager *obj = amdgpu_ras_find_obj(adev, head); |
464 | |
5caf466a |
465 | /* If hardware does not support ras, then do not create obj. |
466 | * But if hardware support ras, we can create the obj. |
467 | * Ras framework checks con->hw_supported to see if it need do |
468 | * corresponding initialization. |
469 | * IP checks con->support to see if it need disable ras. |
470 | */ |
c030f2e4 |
471 | if (!amdgpu_ras_is_feature_allowed(adev, head)) |
472 | return 0; |
473 | if (!(!!enable ^ !!amdgpu_ras_is_feature_enabled(adev, head))) |
474 | return 0; |
475 | |
476 | if (enable) { |
477 | if (!obj) { |
478 | obj = amdgpu_ras_create_obj(adev, head); |
479 | if (!obj) |
480 | return -EINVAL; |
481 | } else { |
482 | /* In case we create obj somewhere else */ |
483 | get_obj(obj); |
484 | } |
485 | con->features |= BIT(head->block); |
486 | } else { |
487 | if (obj && amdgpu_ras_is_feature_enabled(adev, head)) { |
488 | con->features &= ~BIT(head->block); |
489 | put_obj(obj); |
490 | } |
491 | } |
492 | |
493 | return 0; |
494 | } |
495 | |
496 | /* wrapper of psp_ras_enable_features */ |
497 | int amdgpu_ras_feature_enable(struct amdgpu_device *adev, |
498 | struct ras_common_if *head, bool enable) |
499 | { |
500 | struct amdgpu_ras *con = amdgpu_ras_get_context(adev); |
501 | union ta_ras_cmd_input info; |
502 | int ret; |
503 | |
504 | if (!con) |
505 | return -EINVAL; |
506 | |
507 | if (!enable) { |
508 | info.disable_features = (struct ta_ras_disable_features_input) { |
828cfa29 |
509 | .block_id = amdgpu_ras_block_to_ta(head->block), |
510 | .error_type = amdgpu_ras_error_to_ta(head->type), |
c030f2e4 |
511 | }; |
512 | } else { |
513 | info.enable_features = (struct ta_ras_enable_features_input) { |
828cfa29 |
514 | .block_id = amdgpu_ras_block_to_ta(head->block), |
515 | .error_type = amdgpu_ras_error_to_ta(head->type), |
c030f2e4 |
516 | }; |
517 | } |
518 | |
519 | /* Do not enable if it is not allowed. */ |
520 | WARN_ON(enable && !amdgpu_ras_is_feature_allowed(adev, head)); |
521 | /* Are we alerady in that state we are going to set? */ |
522 | if (!(!!enable ^ !!amdgpu_ras_is_feature_enabled(adev, head))) |
523 | return 0; |
524 | |
525 | ret = psp_ras_enable_features(&adev->psp, &info, enable); |
526 | if (ret) { |
527 | DRM_ERROR("RAS ERROR: %s %s feature failed ret %d\n", |
528 | enable ? "enable":"disable", |
529 | ras_block_str(head->block), |
530 | ret); |
7af23ebe |
531 | if (ret == TA_RAS_STATUS__RESET_NEEDED) |
532 | return -EAGAIN; |
c030f2e4 |
533 | return -EINVAL; |
534 | } |
535 | |
536 | /* setup the obj */ |
537 | __amdgpu_ras_feature_enable(adev, head, enable); |
538 | |
539 | return 0; |
540 | } |
541 | |
77de502b |
542 | /* Only used in device probe stage and called only once. */ |
543 | int amdgpu_ras_feature_enable_on_boot(struct amdgpu_device *adev, |
544 | struct ras_common_if *head, bool enable) |
545 | { |
546 | struct amdgpu_ras *con = amdgpu_ras_get_context(adev); |
547 | int ret; |
548 | |
549 | if (!con) |
550 | return -EINVAL; |
551 | |
552 | if (con->flags & AMDGPU_RAS_FLAG_INIT_BY_VBIOS) { |
7af23ebe |
553 | if (enable) { |
554 | /* There is no harm to issue a ras TA cmd regardless of |
555 | * the currecnt ras state. |
556 | * If current state == target state, it will do nothing |
557 | * But sometimes it requests driver to reset and repost |
558 | * with error code -EAGAIN. |
559 | */ |
560 | ret = amdgpu_ras_feature_enable(adev, head, 1); |
561 | /* With old ras TA, we might fail to enable ras. |
562 | * Log it and just setup the object. |
563 | * TODO need remove this WA in the future. |
564 | */ |
565 | if (ret == -EINVAL) { |
566 | ret = __amdgpu_ras_feature_enable(adev, head, 1); |
567 | if (!ret) |
568 | DRM_INFO("RAS INFO: %s setup object\n", |
569 | ras_block_str(head->block)); |
570 | } |
571 | } else { |
572 | /* setup the object then issue a ras TA disable cmd.*/ |
573 | ret = __amdgpu_ras_feature_enable(adev, head, 1); |
574 | if (ret) |
575 | return ret; |
77de502b |
576 | |
77de502b |
577 | ret = amdgpu_ras_feature_enable(adev, head, 0); |
7af23ebe |
578 | } |
77de502b |
579 | } else |
580 | ret = amdgpu_ras_feature_enable(adev, head, enable); |
581 | |
582 | return ret; |
583 | } |
584 | |
c030f2e4 |
585 | static int amdgpu_ras_disable_all_features(struct amdgpu_device *adev, |
586 | bool bypass) |
587 | { |
588 | struct amdgpu_ras *con = amdgpu_ras_get_context(adev); |
589 | struct ras_manager *obj, *tmp; |
590 | |
591 | list_for_each_entry_safe(obj, tmp, &con->head, node) { |
592 | /* bypass psp. |
593 | * aka just release the obj and corresponding flags |
594 | */ |
595 | if (bypass) { |
596 | if (__amdgpu_ras_feature_enable(adev, &obj->head, 0)) |
597 | break; |
598 | } else { |
599 | if (amdgpu_ras_feature_enable(adev, &obj->head, 0)) |
600 | break; |
601 | } |
289d513b |
602 | } |
c030f2e4 |
603 | |
604 | return con->features; |
605 | } |
606 | |
607 | static int amdgpu_ras_enable_all_features(struct amdgpu_device *adev, |
608 | bool bypass) |
609 | { |
610 | struct amdgpu_ras *con = amdgpu_ras_get_context(adev); |
611 | int ras_block_count = AMDGPU_RAS_BLOCK_COUNT; |
612 | int i; |
191051a1 |
613 | const enum amdgpu_ras_error_type default_ras_type = |
614 | AMDGPU_RAS_ERROR__NONE; |
c030f2e4 |
615 | |
616 | for (i = 0; i < ras_block_count; i++) { |
617 | struct ras_common_if head = { |
618 | .block = i, |
191051a1 |
619 | .type = default_ras_type, |
c030f2e4 |
620 | .sub_block_index = 0, |
621 | }; |
622 | strcpy(head.name, ras_block_str(i)); |
623 | if (bypass) { |
624 | /* |
625 | * bypass psp. vbios enable ras for us. |
626 | * so just create the obj |
627 | */ |
628 | if (__amdgpu_ras_feature_enable(adev, &head, 1)) |
629 | break; |
630 | } else { |
631 | if (amdgpu_ras_feature_enable(adev, &head, 1)) |
632 | break; |
633 | } |
289d513b |
634 | } |
c030f2e4 |
635 | |
636 | return con->features; |
637 | } |
638 | /* feature ctl end */ |
639 | |
640 | /* query/inject/cure begin */ |
641 | int amdgpu_ras_error_query(struct amdgpu_device *adev, |
642 | struct ras_query_if *info) |
643 | { |
644 | struct ras_manager *obj = amdgpu_ras_find_obj(adev, &info->head); |
645 | |
646 | if (!obj) |
647 | return -EINVAL; |
648 | /* TODO might read the register to read the count */ |
649 | |
650 | info->ue_count = obj->err_data.ue_count; |
651 | info->ce_count = obj->err_data.ce_count; |
652 | |
653 | return 0; |
654 | } |
655 | |
656 | /* wrapper of psp_ras_trigger_error */ |
657 | int amdgpu_ras_error_inject(struct amdgpu_device *adev, |
658 | struct ras_inject_if *info) |
659 | { |
660 | struct ras_manager *obj = amdgpu_ras_find_obj(adev, &info->head); |
661 | struct ta_ras_trigger_error_input block_info = { |
828cfa29 |
662 | .block_id = amdgpu_ras_block_to_ta(info->head.block), |
663 | .inject_error_type = amdgpu_ras_error_to_ta(info->head.type), |
c030f2e4 |
664 | .sub_block_index = info->head.sub_block_index, |
665 | .address = info->address, |
666 | .value = info->value, |
667 | }; |
668 | int ret = 0; |
669 | |
670 | if (!obj) |
671 | return -EINVAL; |
672 | |
673 | ret = psp_ras_trigger_error(&adev->psp, &block_info); |
674 | if (ret) |
675 | DRM_ERROR("RAS ERROR: inject %s error failed ret %d\n", |
676 | ras_block_str(info->head.block), |
677 | ret); |
678 | |
679 | return ret; |
680 | } |
681 | |
/* Cure an error on a block. Currently a no-op that always returns 0. */
int amdgpu_ras_error_cure(struct amdgpu_device *adev,
		struct ras_cure_if *info)
{
	/* psp fw has no cure interface for now. */
	return 0;
}
688 | |
689 | /* get the total error counts on all IPs */ |
690 | int amdgpu_ras_query_error_count(struct amdgpu_device *adev, |
691 | bool is_ce) |
692 | { |
693 | struct amdgpu_ras *con = amdgpu_ras_get_context(adev); |
694 | struct ras_manager *obj; |
695 | struct ras_err_data data = {0, 0}; |
696 | |
697 | if (!con) |
698 | return -EINVAL; |
699 | |
700 | list_for_each_entry(obj, &con->head, node) { |
701 | struct ras_query_if info = { |
702 | .head = obj->head, |
703 | }; |
704 | |
705 | if (amdgpu_ras_error_query(adev, &info)) |
706 | return -EINVAL; |
707 | |
708 | data.ce_count += info.ce_count; |
709 | data.ue_count += info.ue_count; |
710 | } |
711 | |
712 | return is_ce ? data.ce_count : data.ue_count; |
713 | } |
714 | /* query/inject/cure end */ |
715 | |
716 | |
717 | /* sysfs begin */ |
718 | |
466b1793 |
/* Defined later in this file; declared here for the sysfs bad-page reader. */
static int amdgpu_ras_badpages_read(struct amdgpu_device *adev,
				    struct ras_badpage **bps,
				    unsigned int *count);
721 | |
/*
 * Map a bad-page flags value to its one-character status string:
 * 0 -> "R", 1 -> "P", anything else (including 2) -> "F".
 */
static char *amdgpu_ras_badpage_flags_str(unsigned int flags)
{
	if (flags == 0)
		return "R";

	if (flags == 1)
		return "P";

	return "F";
}
734 | |
/*
 * DOC: ras sysfs gpu_vram_bad_pages interface
 *
 * It allows the user to read the bad pages of vram on the gpu through
 * /sys/class/drm/card[0/1/2...]/device/ras/gpu_vram_bad_pages
 *
 * It outputs multiple lines, and each line stands for one gpu page.
 *
 * The format of one line is below,
 * gpu pfn : gpu page size : flags
 *
 * gpu pfn and gpu page size are printed in hex format.
 * flags can be one of the characters below,
 *    R: reserved, this gpu page is reserved and cannot be used.
 *    P: pending for reserve, this gpu page is marked as bad and will be
 *       reserved in the next window of page_reserve.
 *    F: unable to reserve, this gpu page can't be reserved for some reason.
 *
 * examples:
 *    0x00000001 : 0x00001000 : R
 *    0x00000002 : 0x00001000 : P
 */
757 | |
758 | static ssize_t amdgpu_ras_sysfs_badpages_read(struct file *f, |
759 | struct kobject *kobj, struct bin_attribute *attr, |
760 | char *buf, loff_t ppos, size_t count) |
761 | { |
762 | struct amdgpu_ras *con = |
763 | container_of(attr, struct amdgpu_ras, badpages_attr); |
764 | struct amdgpu_device *adev = con->adev; |
765 | const unsigned int element_size = |
766 | sizeof("0xabcdabcd : 0x12345678 : R\n") - 1; |
767 | unsigned int start = (ppos + element_size - 1) / element_size; |
768 | unsigned int end = (ppos + count - 1) / element_size; |
769 | ssize_t s = 0; |
770 | struct ras_badpage *bps = NULL; |
771 | unsigned int bps_count = 0; |
772 | |
773 | memset(buf, 0, count); |
774 | |
775 | if (amdgpu_ras_badpages_read(adev, &bps, &bps_count)) |
776 | return 0; |
777 | |
778 | for (; start < end && start < bps_count; start++) |
779 | s += scnprintf(&buf[s], element_size + 1, |
780 | "0x%08x : 0x%08x : %1s\n", |
781 | bps[start].bp, |
782 | bps[start].size, |
783 | amdgpu_ras_badpage_flags_str(bps[start].flags)); |
784 | |
785 | kfree(bps); |
786 | |
787 | return s; |
788 | } |
789 | |
c030f2e4 |
790 | static ssize_t amdgpu_ras_sysfs_features_read(struct device *dev, |
791 | struct device_attribute *attr, char *buf) |
792 | { |
793 | struct amdgpu_ras *con = |
794 | container_of(attr, struct amdgpu_ras, features_attr); |
795 | struct drm_device *ddev = dev_get_drvdata(dev); |
796 | struct amdgpu_device *adev = ddev->dev_private; |
797 | struct ras_common_if head; |
798 | int ras_block_count = AMDGPU_RAS_BLOCK_COUNT; |
799 | int i; |
800 | ssize_t s; |
801 | struct ras_manager *obj; |
802 | |
803 | s = scnprintf(buf, PAGE_SIZE, "feature mask: 0x%x\n", con->features); |
804 | |
805 | for (i = 0; i < ras_block_count; i++) { |
806 | head.block = i; |
807 | |
808 | if (amdgpu_ras_is_feature_enabled(adev, &head)) { |
809 | obj = amdgpu_ras_find_obj(adev, &head); |
810 | s += scnprintf(&buf[s], PAGE_SIZE - s, |
811 | "%s: %s\n", |
812 | ras_block_str(i), |
813 | ras_err_str(obj->head.type)); |
814 | } else |
815 | s += scnprintf(&buf[s], PAGE_SIZE - s, |
816 | "%s: disabled\n", |
817 | ras_block_str(i)); |
818 | } |
819 | |
820 | return s; |
821 | } |
822 | |
823 | static int amdgpu_ras_sysfs_create_feature_node(struct amdgpu_device *adev) |
824 | { |
825 | struct amdgpu_ras *con = amdgpu_ras_get_context(adev); |
826 | struct attribute *attrs[] = { |
827 | &con->features_attr.attr, |
828 | NULL |
829 | }; |
466b1793 |
830 | struct bin_attribute *bin_attrs[] = { |
831 | &con->badpages_attr, |
832 | NULL |
833 | }; |
c030f2e4 |
834 | struct attribute_group group = { |
835 | .name = "ras", |
836 | .attrs = attrs, |
466b1793 |
837 | .bin_attrs = bin_attrs, |
c030f2e4 |
838 | }; |
839 | |
840 | con->features_attr = (struct device_attribute) { |
841 | .attr = { |
842 | .name = "features", |
843 | .mode = S_IRUGO, |
844 | }, |
845 | .show = amdgpu_ras_sysfs_features_read, |
846 | }; |
466b1793 |
847 | |
848 | con->badpages_attr = (struct bin_attribute) { |
849 | .attr = { |
850 | .name = "gpu_vram_bad_pages", |
851 | .mode = S_IRUGO, |
852 | }, |
853 | .size = 0, |
854 | .private = NULL, |
855 | .read = amdgpu_ras_sysfs_badpages_read, |
856 | }; |
857 | |
163def43 |
858 | sysfs_attr_init(attrs[0]); |
466b1793 |
859 | sysfs_bin_attr_init(bin_attrs[0]); |
c030f2e4 |
860 | |
861 | return sysfs_create_group(&adev->dev->kobj, &group); |
862 | } |
863 | |
864 | static int amdgpu_ras_sysfs_remove_feature_node(struct amdgpu_device *adev) |
865 | { |
866 | struct amdgpu_ras *con = amdgpu_ras_get_context(adev); |
867 | struct attribute *attrs[] = { |
868 | &con->features_attr.attr, |
869 | NULL |
870 | }; |
466b1793 |
871 | struct bin_attribute *bin_attrs[] = { |
872 | &con->badpages_attr, |
873 | NULL |
874 | }; |
c030f2e4 |
875 | struct attribute_group group = { |
876 | .name = "ras", |
877 | .attrs = attrs, |
466b1793 |
878 | .bin_attrs = bin_attrs, |
c030f2e4 |
879 | }; |
880 | |
881 | sysfs_remove_group(&adev->dev->kobj, &group); |
882 | |
883 | return 0; |
884 | } |
885 | |
886 | int amdgpu_ras_sysfs_create(struct amdgpu_device *adev, |
887 | struct ras_fs_if *head) |
888 | { |
889 | struct ras_manager *obj = amdgpu_ras_find_obj(adev, &head->head); |
890 | |
891 | if (!obj || obj->attr_inuse) |
892 | return -EINVAL; |
893 | |
894 | get_obj(obj); |
895 | |
896 | memcpy(obj->fs_data.sysfs_name, |
897 | head->sysfs_name, |
898 | sizeof(obj->fs_data.sysfs_name)); |
899 | |
900 | obj->sysfs_attr = (struct device_attribute){ |
901 | .attr = { |
902 | .name = obj->fs_data.sysfs_name, |
903 | .mode = S_IRUGO, |
904 | }, |
905 | .show = amdgpu_ras_sysfs_read, |
906 | }; |
163def43 |
907 | sysfs_attr_init(&obj->sysfs_attr.attr); |
c030f2e4 |
908 | |
909 | if (sysfs_add_file_to_group(&adev->dev->kobj, |
910 | &obj->sysfs_attr.attr, |
911 | "ras")) { |
912 | put_obj(obj); |
913 | return -EINVAL; |
914 | } |
915 | |
916 | obj->attr_inuse = 1; |
917 | |
918 | return 0; |
919 | } |
920 | |
921 | int amdgpu_ras_sysfs_remove(struct amdgpu_device *adev, |
922 | struct ras_common_if *head) |
923 | { |
924 | struct ras_manager *obj = amdgpu_ras_find_obj(adev, head); |
925 | |
926 | if (!obj || !obj->attr_inuse) |
927 | return -EINVAL; |
928 | |
929 | sysfs_remove_file_from_group(&adev->dev->kobj, |
930 | &obj->sysfs_attr.attr, |
931 | "ras"); |
932 | obj->attr_inuse = 0; |
933 | put_obj(obj); |
934 | |
935 | return 0; |
936 | } |
937 | |
938 | static int amdgpu_ras_sysfs_remove_all(struct amdgpu_device *adev) |
939 | { |
940 | struct amdgpu_ras *con = amdgpu_ras_get_context(adev); |
941 | struct ras_manager *obj, *tmp; |
942 | |
943 | list_for_each_entry_safe(obj, tmp, &con->head, node) { |
944 | amdgpu_ras_sysfs_remove(adev, &obj->head); |
945 | } |
946 | |
947 | amdgpu_ras_sysfs_remove_feature_node(adev); |
948 | |
949 | return 0; |
950 | } |
951 | /* sysfs end */ |
952 | |
953 | /* debugfs begin */ |
36ea1bd2 |
954 | static int amdgpu_ras_debugfs_create_ctrl_node(struct amdgpu_device *adev) |
955 | { |
956 | struct amdgpu_ras *con = amdgpu_ras_get_context(adev); |
957 | struct drm_minor *minor = adev->ddev->primary; |
958 | struct dentry *root = minor->debugfs_root, *dir; |
959 | struct dentry *ent; |
960 | |
961 | dir = debugfs_create_dir("ras", root); |
962 | if (IS_ERR(dir)) |
963 | return -EINVAL; |
964 | |
965 | con->dir = dir; |
966 | |
967 | ent = debugfs_create_file("ras_ctrl", |
968 | S_IWUGO | S_IRUGO, con->dir, |
969 | adev, &amdgpu_ras_debugfs_ctrl_ops); |
970 | if (IS_ERR(ent)) { |
971 | debugfs_remove(con->dir); |
972 | return -EINVAL; |
973 | } |
974 | |
975 | con->ent = ent; |
976 | return 0; |
977 | } |
978 | |
c030f2e4 |
979 | int amdgpu_ras_debugfs_create(struct amdgpu_device *adev, |
980 | struct ras_fs_if *head) |
981 | { |
982 | struct amdgpu_ras *con = amdgpu_ras_get_context(adev); |
983 | struct ras_manager *obj = amdgpu_ras_find_obj(adev, &head->head); |
984 | struct dentry *ent; |
985 | |
986 | if (!obj || obj->ent) |
987 | return -EINVAL; |
988 | |
989 | get_obj(obj); |
990 | |
991 | memcpy(obj->fs_data.debugfs_name, |
992 | head->debugfs_name, |
993 | sizeof(obj->fs_data.debugfs_name)); |
994 | |
995 | ent = debugfs_create_file(obj->fs_data.debugfs_name, |
996 | S_IWUGO | S_IRUGO, con->dir, |
997 | obj, &amdgpu_ras_debugfs_ops); |
998 | |
999 | if (IS_ERR(ent)) |
1000 | return -EINVAL; |
1001 | |
1002 | obj->ent = ent; |
1003 | |
1004 | return 0; |
1005 | } |
1006 | |
1007 | int amdgpu_ras_debugfs_remove(struct amdgpu_device *adev, |
1008 | struct ras_common_if *head) |
1009 | { |
1010 | struct ras_manager *obj = amdgpu_ras_find_obj(adev, head); |
1011 | |
1012 | if (!obj || !obj->ent) |
1013 | return 0; |
1014 | |
1015 | debugfs_remove(obj->ent); |
1016 | obj->ent = NULL; |
1017 | put_obj(obj); |
1018 | |
1019 | return 0; |
1020 | } |
1021 | |
1022 | static int amdgpu_ras_debugfs_remove_all(struct amdgpu_device *adev) |
1023 | { |
1024 | struct amdgpu_ras *con = amdgpu_ras_get_context(adev); |
1025 | struct ras_manager *obj, *tmp; |
1026 | |
1027 | list_for_each_entry_safe(obj, tmp, &con->head, node) { |
1028 | amdgpu_ras_debugfs_remove(adev, &obj->head); |
1029 | } |
1030 | |
36ea1bd2 |
1031 | debugfs_remove(con->ent); |
c030f2e4 |
1032 | debugfs_remove(con->dir); |
1033 | con->dir = NULL; |
36ea1bd2 |
1034 | con->ent = NULL; |
c030f2e4 |
1035 | |
1036 | return 0; |
1037 | } |
1038 | /* debugfs end */ |
1039 | |
1040 | /* ras fs */ |
1041 | |
/*
 * Create the RAS sysfs feature node and the debugfs control node.
 *
 * The results of both helpers are ignored, so this function always
 * returns 0.
 */
static int amdgpu_ras_fs_init(struct amdgpu_device *adev)
{
	(void)amdgpu_ras_sysfs_create_feature_node(adev);
	(void)amdgpu_ras_debugfs_create_ctrl_node(adev);

	return 0;
}
1049 | |
/*
 * Remove all RAS debugfs nodes, then all RAS sysfs nodes.
 * Always returns 0.
 */
static int amdgpu_ras_fs_fini(struct amdgpu_device *adev)
{
	(void)amdgpu_ras_debugfs_remove_all(adev);
	(void)amdgpu_ras_sysfs_remove_all(adev);

	return 0;
}
1056 | /* ras fs end */ |
1057 | |
1058 | /* ih begin */ |
1059 | static void amdgpu_ras_interrupt_handler(struct ras_manager *obj) |
1060 | { |
1061 | struct ras_ih_data *data = &obj->ih_data; |
1062 | struct amdgpu_iv_entry entry; |
1063 | int ret; |
1064 | |
1065 | while (data->rptr != data->wptr) { |
1066 | rmb(); |
1067 | memcpy(&entry, &data->ring[data->rptr], |
1068 | data->element_size); |
1069 | |
1070 | wmb(); |
1071 | data->rptr = (data->aligned_element_size + |
1072 | data->rptr) % data->ring_size; |
1073 | |
1074 | /* Let IP handle its data, maybe we need get the output |
1075 | * from the callback to udpate the error type/count, etc |
1076 | */ |
1077 | if (data->cb) { |
1078 | ret = data->cb(obj->adev, &entry); |
1079 | /* ue will trigger an interrupt, and in that case |
1080 | * we need do a reset to recovery the whole system. |
1081 | * But leave IP do that recovery, here we just dispatch |
1082 | * the error. |
1083 | */ |
1084 | if (ret == AMDGPU_RAS_UE) { |
1085 | obj->err_data.ue_count++; |
1086 | } |
1087 | /* Might need get ce count by register, but not all IP |
1088 | * saves ce count, some IP just use one bit or two bits |
1089 | * to indicate ce happened. |
1090 | */ |
1091 | } |
1092 | } |
1093 | } |
1094 | |
1095 | static void amdgpu_ras_interrupt_process_handler(struct work_struct *work) |
1096 | { |
1097 | struct ras_ih_data *data = |
1098 | container_of(work, struct ras_ih_data, ih_work); |
1099 | struct ras_manager *obj = |
1100 | container_of(data, struct ras_manager, ih_data); |
1101 | |
1102 | amdgpu_ras_interrupt_handler(obj); |
1103 | } |
1104 | |
1105 | int amdgpu_ras_interrupt_dispatch(struct amdgpu_device *adev, |
1106 | struct ras_dispatch_if *info) |
1107 | { |
1108 | struct ras_manager *obj = amdgpu_ras_find_obj(adev, &info->head); |
1109 | struct ras_ih_data *data = &obj->ih_data; |
1110 | |
1111 | if (!obj) |
1112 | return -EINVAL; |
1113 | |
1114 | if (data->inuse == 0) |
1115 | return 0; |
1116 | |
1117 | /* Might be overflow... */ |
1118 | memcpy(&data->ring[data->wptr], info->entry, |
1119 | data->element_size); |
1120 | |
1121 | wmb(); |
1122 | data->wptr = (data->aligned_element_size + |
1123 | data->wptr) % data->ring_size; |
1124 | |
1125 | schedule_work(&data->ih_work); |
1126 | |
1127 | return 0; |
1128 | } |
1129 | |
1130 | int amdgpu_ras_interrupt_remove_handler(struct amdgpu_device *adev, |
1131 | struct ras_ih_if *info) |
1132 | { |
1133 | struct ras_manager *obj = amdgpu_ras_find_obj(adev, &info->head); |
1134 | struct ras_ih_data *data; |
1135 | |
1136 | if (!obj) |
1137 | return -EINVAL; |
1138 | |
1139 | data = &obj->ih_data; |
1140 | if (data->inuse == 0) |
1141 | return 0; |
1142 | |
1143 | cancel_work_sync(&data->ih_work); |
1144 | |
1145 | kfree(data->ring); |
1146 | memset(data, 0, sizeof(*data)); |
1147 | put_obj(obj); |
1148 | |
1149 | return 0; |
1150 | } |
1151 | |
1152 | int amdgpu_ras_interrupt_add_handler(struct amdgpu_device *adev, |
1153 | struct ras_ih_if *info) |
1154 | { |
1155 | struct ras_manager *obj = amdgpu_ras_find_obj(adev, &info->head); |
1156 | struct ras_ih_data *data; |
1157 | |
1158 | if (!obj) { |
1159 | /* in case we registe the IH before enable ras feature */ |
1160 | obj = amdgpu_ras_create_obj(adev, &info->head); |
1161 | if (!obj) |
1162 | return -EINVAL; |
1163 | } else |
1164 | get_obj(obj); |
1165 | |
1166 | data = &obj->ih_data; |
1167 | /* add the callback.etc */ |
1168 | *data = (struct ras_ih_data) { |
1169 | .inuse = 0, |
1170 | .cb = info->cb, |
1171 | .element_size = sizeof(struct amdgpu_iv_entry), |
1172 | .rptr = 0, |
1173 | .wptr = 0, |
1174 | }; |
1175 | |
1176 | INIT_WORK(&data->ih_work, amdgpu_ras_interrupt_process_handler); |
1177 | |
1178 | data->aligned_element_size = ALIGN(data->element_size, 8); |
1179 | /* the ring can store 64 iv entries. */ |
1180 | data->ring_size = 64 * data->aligned_element_size; |
1181 | data->ring = kmalloc(data->ring_size, GFP_KERNEL); |
1182 | if (!data->ring) { |
1183 | put_obj(obj); |
1184 | return -ENOMEM; |
1185 | } |
1186 | |
1187 | /* IH is ready */ |
1188 | data->inuse = 1; |
1189 | |
1190 | return 0; |
1191 | } |
1192 | |
1193 | static int amdgpu_ras_interrupt_remove_all(struct amdgpu_device *adev) |
1194 | { |
1195 | struct amdgpu_ras *con = amdgpu_ras_get_context(adev); |
1196 | struct ras_manager *obj, *tmp; |
1197 | |
1198 | list_for_each_entry_safe(obj, tmp, &con->head, node) { |
1199 | struct ras_ih_if info = { |
1200 | .head = obj->head, |
1201 | }; |
1202 | amdgpu_ras_interrupt_remove_handler(adev, &info); |
1203 | } |
1204 | |
1205 | return 0; |
1206 | } |
1207 | /* ih end */ |
1208 | |
1209 | /* recovery begin */ |
466b1793 |
1210 | |
1211 | /* return 0 on success. |
1212 | * caller need free bps. |
1213 | */ |
1214 | static int amdgpu_ras_badpages_read(struct amdgpu_device *adev, |
1215 | struct ras_badpage **bps, unsigned int *count) |
1216 | { |
1217 | struct amdgpu_ras *con = amdgpu_ras_get_context(adev); |
1218 | struct ras_err_handler_data *data; |
1219 | int i = 0; |
1220 | int ret = 0; |
1221 | |
1222 | if (!con || !con->eh_data || !bps || !count) |
1223 | return -EINVAL; |
1224 | |
1225 | mutex_lock(&con->recovery_lock); |
1226 | data = con->eh_data; |
1227 | if (!data || data->count == 0) { |
1228 | *bps = NULL; |
1229 | goto out; |
1230 | } |
1231 | |
1232 | *bps = kmalloc(sizeof(struct ras_badpage) * data->count, GFP_KERNEL); |
1233 | if (!*bps) { |
1234 | ret = -ENOMEM; |
1235 | goto out; |
1236 | } |
1237 | |
1238 | for (; i < data->count; i++) { |
1239 | (*bps)[i] = (struct ras_badpage){ |
1240 | .bp = data->bps[i].bp, |
1241 | .size = AMDGPU_GPU_PAGE_SIZE, |
1242 | .flags = 0, |
1243 | }; |
1244 | |
1245 | if (data->last_reserved <= i) |
1246 | (*bps)[i].flags = 1; |
1247 | else if (data->bps[i].bo == NULL) |
1248 | (*bps)[i].flags = 2; |
1249 | } |
1250 | |
1251 | *count = data->count; |
1252 | out: |
1253 | mutex_unlock(&con->recovery_lock); |
1254 | return ret; |
1255 | } |
1256 | |
c030f2e4 |
1257 | static void amdgpu_ras_do_recovery(struct work_struct *work) |
1258 | { |
1259 | struct amdgpu_ras *ras = |
1260 | container_of(work, struct amdgpu_ras, recovery_work); |
1261 | |
1262 | amdgpu_device_gpu_recover(ras->adev, 0); |
1263 | atomic_set(&ras->in_recovery, 0); |
1264 | } |
1265 | |
1266 | static int amdgpu_ras_release_vram(struct amdgpu_device *adev, |
1267 | struct amdgpu_bo **bo_ptr) |
1268 | { |
1269 | /* no need to free it actually. */ |
1270 | amdgpu_bo_free_kernel(bo_ptr, NULL, NULL); |
1271 | return 0; |
1272 | } |
1273 | |
1274 | /* reserve vram with size@offset */ |
1275 | static int amdgpu_ras_reserve_vram(struct amdgpu_device *adev, |
1276 | uint64_t offset, uint64_t size, |
1277 | struct amdgpu_bo **bo_ptr) |
1278 | { |
1279 | struct ttm_operation_ctx ctx = { false, false }; |
1280 | struct amdgpu_bo_param bp; |
1281 | int r = 0; |
1282 | int i; |
1283 | struct amdgpu_bo *bo; |
1284 | |
1285 | if (bo_ptr) |
1286 | *bo_ptr = NULL; |
1287 | memset(&bp, 0, sizeof(bp)); |
1288 | bp.size = size; |
1289 | bp.byte_align = PAGE_SIZE; |
1290 | bp.domain = AMDGPU_GEM_DOMAIN_VRAM; |
1291 | bp.flags = AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS | |
1292 | AMDGPU_GEM_CREATE_NO_CPU_ACCESS; |
1293 | bp.type = ttm_bo_type_kernel; |
1294 | bp.resv = NULL; |
1295 | |
1296 | r = amdgpu_bo_create(adev, &bp, &bo); |
1297 | if (r) |
1298 | return -EINVAL; |
1299 | |
1300 | r = amdgpu_bo_reserve(bo, false); |
1301 | if (r) |
1302 | goto error_reserve; |
1303 | |
1304 | offset = ALIGN(offset, PAGE_SIZE); |
1305 | for (i = 0; i < bo->placement.num_placement; ++i) { |
1306 | bo->placements[i].fpfn = offset >> PAGE_SHIFT; |
1307 | bo->placements[i].lpfn = (offset + size) >> PAGE_SHIFT; |
1308 | } |
1309 | |
1310 | ttm_bo_mem_put(&bo->tbo, &bo->tbo.mem); |
1311 | r = ttm_bo_mem_space(&bo->tbo, &bo->placement, &bo->tbo.mem, &ctx); |
1312 | if (r) |
1313 | goto error_pin; |
1314 | |
1315 | r = amdgpu_bo_pin_restricted(bo, |
1316 | AMDGPU_GEM_DOMAIN_VRAM, |
1317 | offset, |
1318 | offset + size); |
1319 | if (r) |
1320 | goto error_pin; |
1321 | |
1322 | if (bo_ptr) |
1323 | *bo_ptr = bo; |
1324 | |
1325 | amdgpu_bo_unreserve(bo); |
1326 | return r; |
1327 | |
1328 | error_pin: |
1329 | amdgpu_bo_unreserve(bo); |
1330 | error_reserve: |
1331 | amdgpu_bo_unref(&bo); |
1332 | return r; |
1333 | } |
1334 | |
1335 | /* alloc/realloc bps array */ |
1336 | static int amdgpu_ras_realloc_eh_data_space(struct amdgpu_device *adev, |
1337 | struct ras_err_handler_data *data, int pages) |
1338 | { |
1339 | unsigned int old_space = data->count + data->space_left; |
1340 | unsigned int new_space = old_space + pages; |
1341 | unsigned int align_space = ALIGN(new_space, 1024); |
1342 | void *tmp = kmalloc(align_space * sizeof(*data->bps), GFP_KERNEL); |
1343 | |
1344 | if (!tmp) |
1345 | return -ENOMEM; |
1346 | |
1347 | if (data->bps) { |
1348 | memcpy(tmp, data->bps, |
1349 | data->count * sizeof(*data->bps)); |
1350 | kfree(data->bps); |
1351 | } |
1352 | |
1353 | data->bps = tmp; |
1354 | data->space_left += align_space - old_space; |
1355 | return 0; |
1356 | } |
1357 | |
1358 | /* it deal with vram only. */ |
1359 | int amdgpu_ras_add_bad_pages(struct amdgpu_device *adev, |
1360 | unsigned long *bps, int pages) |
1361 | { |
1362 | struct amdgpu_ras *con = amdgpu_ras_get_context(adev); |
73aa8e1a |
1363 | struct ras_err_handler_data *data; |
c030f2e4 |
1364 | int i = pages; |
1365 | int ret = 0; |
1366 | |
73aa8e1a |
1367 | if (!con || !con->eh_data || !bps || pages <= 0) |
c030f2e4 |
1368 | return 0; |
1369 | |
1370 | mutex_lock(&con->recovery_lock); |
73aa8e1a |
1371 | data = con->eh_data; |
c030f2e4 |
1372 | if (!data) |
1373 | goto out; |
1374 | |
1375 | if (data->space_left <= pages) |
1376 | if (amdgpu_ras_realloc_eh_data_space(adev, data, pages)) { |
1377 | ret = -ENOMEM; |
1378 | goto out; |
1379 | } |
1380 | |
1381 | while (i--) |
1382 | data->bps[data->count++].bp = bps[i]; |
1383 | |
1384 | data->space_left -= pages; |
1385 | out: |
1386 | mutex_unlock(&con->recovery_lock); |
1387 | |
1388 | return ret; |
1389 | } |
1390 | |
1391 | /* called in gpu recovery/init */ |
1392 | int amdgpu_ras_reserve_bad_pages(struct amdgpu_device *adev) |
1393 | { |
1394 | struct amdgpu_ras *con = amdgpu_ras_get_context(adev); |
73aa8e1a |
1395 | struct ras_err_handler_data *data; |
c030f2e4 |
1396 | uint64_t bp; |
1397 | struct amdgpu_bo *bo; |
1398 | int i; |
1399 | |
73aa8e1a |
1400 | if (!con || !con->eh_data) |
c030f2e4 |
1401 | return 0; |
1402 | |
1403 | mutex_lock(&con->recovery_lock); |
73aa8e1a |
1404 | data = con->eh_data; |
1405 | if (!data) |
1406 | goto out; |
c030f2e4 |
1407 | /* reserve vram at driver post stage. */ |
1408 | for (i = data->last_reserved; i < data->count; i++) { |
1409 | bp = data->bps[i].bp; |
1410 | |
1411 | if (amdgpu_ras_reserve_vram(adev, bp << PAGE_SHIFT, |
1412 | PAGE_SIZE, &bo)) |
1413 | DRM_ERROR("RAS ERROR: reserve vram %llx fail\n", bp); |
1414 | |
1415 | data->bps[i].bo = bo; |
1416 | data->last_reserved = i + 1; |
1417 | } |
73aa8e1a |
1418 | out: |
c030f2e4 |
1419 | mutex_unlock(&con->recovery_lock); |
1420 | return 0; |
1421 | } |
1422 | |
1423 | /* called when driver unload */ |
1424 | static int amdgpu_ras_release_bad_pages(struct amdgpu_device *adev) |
1425 | { |
1426 | struct amdgpu_ras *con = amdgpu_ras_get_context(adev); |
73aa8e1a |
1427 | struct ras_err_handler_data *data; |
c030f2e4 |
1428 | struct amdgpu_bo *bo; |
1429 | int i; |
1430 | |
73aa8e1a |
1431 | if (!con || !con->eh_data) |
c030f2e4 |
1432 | return 0; |
1433 | |
1434 | mutex_lock(&con->recovery_lock); |
73aa8e1a |
1435 | data = con->eh_data; |
1436 | if (!data) |
1437 | goto out; |
1438 | |
c030f2e4 |
1439 | for (i = data->last_reserved - 1; i >= 0; i--) { |
1440 | bo = data->bps[i].bo; |
1441 | |
1442 | amdgpu_ras_release_vram(adev, &bo); |
1443 | |
1444 | data->bps[i].bo = bo; |
1445 | data->last_reserved = i; |
1446 | } |
73aa8e1a |
1447 | out: |
c030f2e4 |
1448 | mutex_unlock(&con->recovery_lock); |
1449 | return 0; |
1450 | } |
1451 | |
/*
 * Placeholder: persisting the bad page list is not implemented yet.
 * Always returns 0.
 */
static int amdgpu_ras_save_bad_pages(struct amdgpu_device *adev)
{
	/* TODO
	 * write the array to eeprom when SMU disabled.
	 */
	return 0;
}
1459 | |
/*
 * Placeholder: restoring a persisted bad page list is not implemented yet.
 * Always returns 0.
 */
static int amdgpu_ras_load_bad_pages(struct amdgpu_device *adev)
{
	/* TODO
	 * read the array from eeprom when SMU disabled.
	 */
	return 0;
}
1467 | |
1468 | static int amdgpu_ras_recovery_init(struct amdgpu_device *adev) |
1469 | { |
1470 | struct amdgpu_ras *con = amdgpu_ras_get_context(adev); |
1471 | struct ras_err_handler_data **data = &con->eh_data; |
1472 | |
1473 | *data = kmalloc(sizeof(**data), |
1474 | GFP_KERNEL|__GFP_ZERO); |
1475 | if (!*data) |
1476 | return -ENOMEM; |
1477 | |
1478 | mutex_init(&con->recovery_lock); |
1479 | INIT_WORK(&con->recovery_work, amdgpu_ras_do_recovery); |
1480 | atomic_set(&con->in_recovery, 0); |
1481 | con->adev = adev; |
1482 | |
1483 | amdgpu_ras_load_bad_pages(adev); |
1484 | amdgpu_ras_reserve_bad_pages(adev); |
1485 | |
1486 | return 0; |
1487 | } |
1488 | |
1489 | static int amdgpu_ras_recovery_fini(struct amdgpu_device *adev) |
1490 | { |
1491 | struct amdgpu_ras *con = amdgpu_ras_get_context(adev); |
1492 | struct ras_err_handler_data *data = con->eh_data; |
1493 | |
1494 | cancel_work_sync(&con->recovery_work); |
1495 | amdgpu_ras_save_bad_pages(adev); |
1496 | amdgpu_ras_release_bad_pages(adev); |
1497 | |
1498 | mutex_lock(&con->recovery_lock); |
1499 | con->eh_data = NULL; |
1500 | kfree(data->bps); |
1501 | kfree(data); |
1502 | mutex_unlock(&con->recovery_lock); |
1503 | |
1504 | return 0; |
1505 | } |
1506 | /* recovery end */ |
1507 | |
a564808e |
1508 | /* return 0 if ras will reset gpu and repost.*/ |
1509 | int amdgpu_ras_request_reset_on_boot(struct amdgpu_device *adev, |
1510 | unsigned int block) |
1511 | { |
1512 | struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); |
1513 | |
1514 | if (!ras) |
1515 | return -EINVAL; |
1516 | |
1517 | ras->flags |= AMDGPU_RAS_FLAG_INIT_NEED_RESET; |
1518 | return 0; |
1519 | } |
1520 | |
5caf466a |
1521 | /* |
1522 | * check hardware's ras ability which will be saved in hw_supported. |
1523 | * if hardware does not support ras, we can skip some ras initializtion and |
1524 | * forbid some ras operations from IP. |
1525 | * if software itself, say boot parameter, limit the ras ability. We still |
1526 | * need allow IP do some limited operations, like disable. In such case, |
1527 | * we have to initialize ras as normal. but need check if operation is |
1528 | * allowed or not in each function. |
1529 | */ |
1530 | static void amdgpu_ras_check_supported(struct amdgpu_device *adev, |
1531 | uint32_t *hw_supported, uint32_t *supported) |
c030f2e4 |
1532 | { |
5caf466a |
1533 | *hw_supported = 0; |
1534 | *supported = 0; |
c030f2e4 |
1535 | |
5caf466a |
1536 | if (amdgpu_sriov_vf(adev) || |
b404ae82 |
1537 | adev->asic_type != CHIP_VEGA20) |
5caf466a |
1538 | return; |
b404ae82 |
1539 | |
5d0f903f |
1540 | if (adev->is_atom_fw && |
1541 | (amdgpu_atomfirmware_mem_ecc_supported(adev) || |
1542 | amdgpu_atomfirmware_sram_ecc_supported(adev))) |
5caf466a |
1543 | *hw_supported = AMDGPU_RAS_BLOCK_MASK; |
b404ae82 |
1544 | |
5caf466a |
1545 | *supported = amdgpu_ras_enable == 0 ? |
1546 | 0 : *hw_supported & amdgpu_ras_mask; |
c030f2e4 |
1547 | } |
1548 | |
1549 | int amdgpu_ras_init(struct amdgpu_device *adev) |
1550 | { |
1551 | struct amdgpu_ras *con = amdgpu_ras_get_context(adev); |
c030f2e4 |
1552 | |
b404ae82 |
1553 | if (con) |
c030f2e4 |
1554 | return 0; |
1555 | |
1556 | con = kmalloc(sizeof(struct amdgpu_ras) + |
1557 | sizeof(struct ras_manager) * AMDGPU_RAS_BLOCK_COUNT, |
1558 | GFP_KERNEL|__GFP_ZERO); |
1559 | if (!con) |
1560 | return -ENOMEM; |
1561 | |
1562 | con->objs = (struct ras_manager *)(con + 1); |
1563 | |
1564 | amdgpu_ras_set_context(adev, con); |
1565 | |
5caf466a |
1566 | amdgpu_ras_check_supported(adev, &con->hw_supported, |
1567 | &con->supported); |
c030f2e4 |
1568 | con->features = 0; |
1569 | INIT_LIST_HEAD(&con->head); |
108c6a63 |
1570 | /* Might need get this flag from vbios. */ |
1571 | con->flags = RAS_DEFAULT_FLAGS; |
c030f2e4 |
1572 | |
1573 | if (amdgpu_ras_recovery_init(adev)) |
1574 | goto recovery_out; |
1575 | |
1576 | amdgpu_ras_mask &= AMDGPU_RAS_BLOCK_MASK; |
1577 | |
c030f2e4 |
1578 | if (amdgpu_ras_fs_init(adev)) |
1579 | goto fs_out; |
1580 | |
1581 | amdgpu_ras_self_test(adev); |
5d0f903f |
1582 | |
1583 | DRM_INFO("RAS INFO: ras initialized successfully, " |
1584 | "hardware ability[%x] ras_mask[%x]\n", |
1585 | con->hw_supported, con->supported); |
c030f2e4 |
1586 | return 0; |
1587 | fs_out: |
1588 | amdgpu_ras_recovery_fini(adev); |
1589 | recovery_out: |
1590 | amdgpu_ras_set_context(adev, NULL); |
1591 | kfree(con); |
1592 | |
1593 | return -EINVAL; |
1594 | } |
1595 | |
a564808e |
1596 | /* do some init work after IP late init as dependence. |
1597 | * TODO |
1598 | * gpu reset will re-enable ras, need fint out one way to run it again. |
1599 | * for now, if a gpu reset happened, unless IP enable its ras, the ras state |
1600 | * will be showed as disabled. |
1601 | */ |
108c6a63 |
1602 | void amdgpu_ras_post_init(struct amdgpu_device *adev) |
1603 | { |
1604 | struct amdgpu_ras *con = amdgpu_ras_get_context(adev); |
1605 | struct ras_manager *obj, *tmp; |
1606 | |
1607 | if (!con) |
1608 | return; |
1609 | |
108c6a63 |
1610 | if (con->flags & AMDGPU_RAS_FLAG_INIT_BY_VBIOS) { |
191051a1 |
1611 | /* Set up all other IPs which are not implemented. There is a |
1612 | * tricky thing that IP's actual ras error type should be |
1613 | * MULTI_UNCORRECTABLE, but as driver does not handle it, so |
1614 | * ERROR_NONE make sense anyway. |
1615 | */ |
1616 | amdgpu_ras_enable_all_features(adev, 1); |
1617 | |
1618 | /* We enable ras on all hw_supported block, but as boot |
1619 | * parameter might disable some of them and one or more IP has |
1620 | * not implemented yet. So we disable them on behalf. |
1621 | */ |
108c6a63 |
1622 | list_for_each_entry_safe(obj, tmp, &con->head, node) { |
1623 | if (!amdgpu_ras_is_supported(adev, obj->head.block)) { |
1624 | amdgpu_ras_feature_enable(adev, &obj->head, 0); |
1625 | /* there should be no any reference. */ |
1626 | WARN_ON(alive_obj(obj)); |
1627 | } |
191051a1 |
1628 | } |
108c6a63 |
1629 | } |
a564808e |
1630 | |
1631 | if (con->flags & AMDGPU_RAS_FLAG_INIT_NEED_RESET) { |
1632 | con->flags &= ~AMDGPU_RAS_FLAG_INIT_NEED_RESET; |
1633 | /* setup ras obj state as disabled. |
1634 | * for init_by_vbios case. |
1635 | * if we want to enable ras, just enable it in a normal way. |
1636 | * If we want do disable it, need setup ras obj as enabled, |
1637 | * then issue another TA disable cmd. |
1638 | * See feature_enable_on_boot |
1639 | */ |
1640 | amdgpu_ras_disable_all_features(adev, 1); |
1641 | amdgpu_ras_reset_gpu(adev, 0); |
1642 | } |
108c6a63 |
1643 | } |
1644 | |
c030f2e4 |
/*
 * Do the fini work that has to happen before the IPs are torn down:
 * disable ras on all IPs ahead of ip [hw/sw]fini, then release the recovery
 * state. Always returns 0; does nothing without a RAS context.
 */
int amdgpu_ras_pre_fini(struct amdgpu_device *adev)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);

	if (con) {
		amdgpu_ras_disable_all_features(adev, 0);
		amdgpu_ras_recovery_fini(adev);
	}

	return 0;
}
1658 | |
1659 | int amdgpu_ras_fini(struct amdgpu_device *adev) |
1660 | { |
1661 | struct amdgpu_ras *con = amdgpu_ras_get_context(adev); |
1662 | |
1663 | if (!con) |
1664 | return 0; |
1665 | |
1666 | amdgpu_ras_fs_fini(adev); |
1667 | amdgpu_ras_interrupt_remove_all(adev); |
1668 | |
1669 | WARN(con->features, "Feature mask is not cleared"); |
1670 | |
1671 | if (con->features) |
1672 | amdgpu_ras_disable_all_features(adev, 1); |
1673 | |
1674 | amdgpu_ras_set_context(adev, NULL); |
1675 | kfree(con); |
1676 | |
1677 | return 0; |
1678 | } |