Commit | Line | Data |
---|---|---|
fb30fc59 SL |
1 | /* |
2 | * Copyright 2018 Advanced Micro Devices, Inc. | |
3 | * | |
4 | * Permission is hereby granted, free of charge, to any person obtaining a | |
5 | * copy of this software and associated documentation files (the "Software"), | |
6 | * to deal in the Software without restriction, including without limitation | |
7 | * the rights to use, copy, modify, merge, publish, distribute, sublicense, | |
8 | * and/or sell copies of the Software, and to permit persons to whom the | |
9 | * Software is furnished to do so, subject to the following conditions: | |
10 | * | |
11 | * The above copyright notice and this permission notice shall be included in | |
12 | * all copies or substantial portions of the Software. | |
13 | * | |
14 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | |
15 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | |
16 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL | |
17 | * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR | |
18 | * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, | |
19 | * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR | |
20 | * OTHER DEALINGS IN THE SOFTWARE. | |
21 | * | |
22 | * | |
23 | */ | |
24 | #include <linux/list.h> | |
25 | #include "amdgpu.h" | |
5183411b | 26 | #include "amdgpu_xgmi.h" |
029fbd43 | 27 | #include "amdgpu_ras.h" |
18f36157 | 28 | #include "soc15.h" |
24f9aacf | 29 | #include "df/df_3_6_offset.h" |
18f36157 HZ |
30 | #include "xgmi/xgmi_4_0_0_smn.h" |
31 | #include "xgmi/xgmi_4_0_0_sh_mask.h" | |
32 | #include "wafl/wafl2_4_0_0_smn.h" | |
33 | #include "wafl/wafl2_4_0_0_sh_mask.h" | |
fb30fc59 | 34 | |
cfbb6b00 AG |
35 | #include "amdgpu_reset.h" |
36 | ||
3c4ff2dc JC |
37 | #define smnPCS_XGMI3X16_PCS_ERROR_STATUS 0x11a0020c |
38 | #define smnPCS_GOPX1_PCS_ERROR_STATUS 0x12200210 | |
39 | ||
fb30fc59 SL |
40 | static DEFINE_MUTEX(xgmi_mutex); |
41 | ||
fb30fc59 SL |
42 | #define AMDGPU_MAX_XGMI_DEVICE_PER_HIVE 4 |
43 | ||
d95e8e97 | 44 | static LIST_HEAD(xgmi_hive_list); |
fb30fc59 | 45 | |
18f36157 HZ |
/* Per-link XGMI PCS error status register addresses, Vega20 (2 links). */
static const int xgmi_pcs_err_status_reg_vg20[] = {
	smnXGMI0_PCS_GOPX16_PCS_ERROR_STATUS,
	smnXGMI0_PCS_GOPX16_PCS_ERROR_STATUS + 0x100000,
};

/* Per-link WAFL PCS error status register addresses, Vega20. */
static const int wafl_pcs_err_status_reg_vg20[] = {
	smnPCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS,
	smnPCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS + 0x100000,
};

/* Per-link XGMI PCS error status register addresses, Arcturus (6 links). */
static const int xgmi_pcs_err_status_reg_arct[] = {
	smnXGMI0_PCS_GOPX16_PCS_ERROR_STATUS,
	smnXGMI0_PCS_GOPX16_PCS_ERROR_STATUS + 0x100000,
	smnXGMI0_PCS_GOPX16_PCS_ERROR_STATUS + 0x500000,
	smnXGMI0_PCS_GOPX16_PCS_ERROR_STATUS + 0x600000,
	smnXGMI0_PCS_GOPX16_PCS_ERROR_STATUS + 0x700000,
	smnXGMI0_PCS_GOPX16_PCS_ERROR_STATUS + 0x800000,
};

/* same as vg20*/
static const int wafl_pcs_err_status_reg_arct[] = {
	smnPCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS,
	smnPCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS + 0x100000,
};

/* Per-link XGMI 3x16 PCS error status register addresses, Aldebaran (8 links). */
static const int xgmi3x16_pcs_err_status_reg_aldebaran[] = {
	smnPCS_XGMI3X16_PCS_ERROR_STATUS,
	smnPCS_XGMI3X16_PCS_ERROR_STATUS + 0x100000,
	smnPCS_XGMI3X16_PCS_ERROR_STATUS + 0x200000,
	smnPCS_XGMI3X16_PCS_ERROR_STATUS + 0x300000,
	smnPCS_XGMI3X16_PCS_ERROR_STATUS + 0x400000,
	smnPCS_XGMI3X16_PCS_ERROR_STATUS + 0x500000,
	smnPCS_XGMI3X16_PCS_ERROR_STATUS + 0x600000,
	smnPCS_XGMI3X16_PCS_ERROR_STATUS + 0x700000
};

/* NOTE(review): "walf" looks like a typo of "wafl"; kept as-is because the
 * identifier is referenced by amdgpu_xgmi_reset_ras_error_count() below. */
static const int walf_pcs_err_status_reg_aldebaran[] = {
	smnPCS_GOPX1_PCS_ERROR_STATUS,
	smnPCS_GOPX1_PCS_ERROR_STATUS + 0x100000
};
86 | ||
18f36157 HZ |
/* Name/bit-field table used to decode XGMI PCS error status register values
 * into human-readable RAS error reports. */
static const struct amdgpu_pcs_ras_field xgmi_pcs_ras_fields[] = {
	{"XGMI PCS DataLossErr",
	 SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, DataLossErr)},
	{"XGMI PCS TrainingErr",
	 SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, TrainingErr)},
	{"XGMI PCS CRCErr",
	 SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, CRCErr)},
	{"XGMI PCS BERExceededErr",
	 SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, BERExceededErr)},
	{"XGMI PCS TxMetaDataErr",
	 SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, TxMetaDataErr)},
	{"XGMI PCS ReplayBufParityErr",
	 SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, ReplayBufParityErr)},
	{"XGMI PCS DataParityErr",
	 SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, DataParityErr)},
	{"XGMI PCS ReplayFifoOverflowErr",
	 SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, ReplayFifoOverflowErr)},
	{"XGMI PCS ReplayFifoUnderflowErr",
	 SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, ReplayFifoUnderflowErr)},
	{"XGMI PCS ElasticFifoOverflowErr",
	 SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, ElasticFifoOverflowErr)},
	{"XGMI PCS DeskewErr",
	 SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, DeskewErr)},
	{"XGMI PCS DataStartupLimitErr",
	 SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, DataStartupLimitErr)},
	{"XGMI PCS FCInitTimeoutErr",
	 SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, FCInitTimeoutErr)},
	{"XGMI PCS RecoveryTimeoutErr",
	 SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, RecoveryTimeoutErr)},
	{"XGMI PCS ReadySerialTimeoutErr",
	 SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, ReadySerialTimeoutErr)},
	{"XGMI PCS ReadySerialAttemptErr",
	 SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, ReadySerialAttemptErr)},
	{"XGMI PCS RecoveryAttemptErr",
	 SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, RecoveryAttemptErr)},
	{"XGMI PCS RecoveryRelockAttemptErr",
	 SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, RecoveryRelockAttemptErr)},
};

/* Same decode table for the WAFL link PCS error status registers. */
static const struct amdgpu_pcs_ras_field wafl_pcs_ras_fields[] = {
	{"WAFL PCS DataLossErr",
	 SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, DataLossErr)},
	{"WAFL PCS TrainingErr",
	 SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, TrainingErr)},
	{"WAFL PCS CRCErr",
	 SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, CRCErr)},
	{"WAFL PCS BERExceededErr",
	 SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, BERExceededErr)},
	{"WAFL PCS TxMetaDataErr",
	 SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, TxMetaDataErr)},
	{"WAFL PCS ReplayBufParityErr",
	 SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, ReplayBufParityErr)},
	{"WAFL PCS DataParityErr",
	 SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, DataParityErr)},
	{"WAFL PCS ReplayFifoOverflowErr",
	 SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, ReplayFifoOverflowErr)},
	{"WAFL PCS ReplayFifoUnderflowErr",
	 SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, ReplayFifoUnderflowErr)},
	{"WAFL PCS ElasticFifoOverflowErr",
	 SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, ElasticFifoOverflowErr)},
	{"WAFL PCS DeskewErr",
	 SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, DeskewErr)},
	{"WAFL PCS DataStartupLimitErr",
	 SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, DataStartupLimitErr)},
	{"WAFL PCS FCInitTimeoutErr",
	 SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, FCInitTimeoutErr)},
	{"WAFL PCS RecoveryTimeoutErr",
	 SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, RecoveryTimeoutErr)},
	{"WAFL PCS ReadySerialTimeoutErr",
	 SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, ReadySerialTimeoutErr)},
	{"WAFL PCS ReadySerialAttemptErr",
	 SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, ReadySerialAttemptErr)},
	{"WAFL PCS RecoveryAttemptErr",
	 SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, RecoveryAttemptErr)},
	{"WAFL PCS RecoveryRelockAttemptErr",
	 SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, RecoveryRelockAttemptErr)},
};
164 | ||
1c1e53f7 TSD |
165 | /** |
166 | * DOC: AMDGPU XGMI Support | |
167 | * | |
168 | * XGMI is a high speed interconnect that joins multiple GPU cards | |
169 | * into a homogeneous memory space that is organized by a collective | |
170 | * hive ID and individual node IDs, both of which are 64-bit numbers. | |
171 | * | |
172 | * The file xgmi_device_id contains the unique per GPU device ID and | |
173 | * is stored in the /sys/class/drm/card${cardno}/device/ directory. | |
174 | * | |
175 | * Inside the device directory a sub-directory 'xgmi_hive_info' is | |
176 | * created which contains the hive ID and the list of nodes. | |
177 | * | |
178 | * The hive ID is stored in: | |
179 | * /sys/class/drm/card${cardno}/device/xgmi_hive_info/xgmi_hive_id | |
180 | * | |
181 | * The node information is stored in numbered directories: | |
182 | * /sys/class/drm/card${cardno}/device/xgmi_hive_info/node${nodeno}/xgmi_device_id | |
183 | * | |
184 | * Each device has their own xgmi_hive_info direction with a mirror | |
185 | * set of node sub-directories. | |
186 | * | |
187 | * The XGMI memory space is built by contiguously adding the power of | |
188 | * two padded VRAM space from each node to each other. | |
189 | * | |
190 | */ | |
191 | ||
d95e8e97 DL |
/* Read-only sysfs attribute exposing the 64-bit hive ID
 * (xgmi_hive_info/xgmi_hive_id). */
static struct attribute amdgpu_xgmi_hive_id = {
	.name = "xgmi_hive_id",
	.mode = S_IRUGO
};

/* NULL-terminated attribute list for the hive kobject. */
static struct attribute *amdgpu_xgmi_hive_attrs[] = {
	&amdgpu_xgmi_hive_id,
	NULL
};
ATTRIBUTE_GROUPS(amdgpu_xgmi_hive);
b1fa8c89 | 202 | |
d95e8e97 DL |
203 | static ssize_t amdgpu_xgmi_show_attrs(struct kobject *kobj, |
204 | struct attribute *attr, char *buf) | |
b1fa8c89 | 205 | { |
d95e8e97 DL |
206 | struct amdgpu_hive_info *hive = container_of( |
207 | kobj, struct amdgpu_hive_info, kobj); | |
b1fa8c89 | 208 | |
d95e8e97 DL |
209 | if (attr == &amdgpu_xgmi_hive_id) |
210 | return snprintf(buf, PAGE_SIZE, "%llu\n", hive->hive_id); | |
b1fa8c89 | 211 | |
d95e8e97 | 212 | return 0; |
b1fa8c89 AG |
213 | } |
214 | ||
d95e8e97 | 215 | static void amdgpu_xgmi_hive_release(struct kobject *kobj) |
b1fa8c89 | 216 | { |
d95e8e97 DL |
217 | struct amdgpu_hive_info *hive = container_of( |
218 | kobj, struct amdgpu_hive_info, kobj); | |
219 | ||
cfbb6b00 AG |
220 | amdgpu_reset_put_reset_domain(hive->reset_domain); |
221 | hive->reset_domain = NULL; | |
222 | ||
d95e8e97 DL |
223 | mutex_destroy(&hive->hive_lock); |
224 | kfree(hive); | |
b1fa8c89 AG |
225 | } |
226 | ||
d95e8e97 DL |
/* Read-only sysfs ops: all hive attributes go through amdgpu_xgmi_show_attrs. */
static const struct sysfs_ops amdgpu_xgmi_hive_ops = {
	.show = amdgpu_xgmi_show_attrs,
};

/* kobject type backing the per-hive xgmi_hive_info sysfs directory. */
struct kobj_type amdgpu_xgmi_hive_type = {
	.release = amdgpu_xgmi_hive_release,
	.sysfs_ops = &amdgpu_xgmi_hive_ops,
	.default_groups = amdgpu_xgmi_hive_groups,
};
236 | ||
b1fa8c89 AG |
237 | static ssize_t amdgpu_xgmi_show_device_id(struct device *dev, |
238 | struct device_attribute *attr, | |
239 | char *buf) | |
240 | { | |
241 | struct drm_device *ddev = dev_get_drvdata(dev); | |
1348969a | 242 | struct amdgpu_device *adev = drm_to_adev(ddev); |
b1fa8c89 | 243 | |
36000c7a | 244 | return sysfs_emit(buf, "%llu\n", adev->gmc.xgmi.node_id); |
b1fa8c89 AG |
245 | |
246 | } | |
247 | ||
24f9aacf JK |
/* Build a DF FICAA indirect-access command word for offset @o. */
#define AMDGPU_XGMI_SET_FICAA(o) ((o) | 0x456801)
/*
 * Sysfs ->show handler for xgmi_error: reads the accumulated xGMI error
 * count through the DF FICAA/FICAD indirect register interface, prints it,
 * then resets the counter.
 */
static ssize_t amdgpu_xgmi_show_error(struct device *dev,
				      struct device_attribute *attr,
				      char *buf)
{
	struct drm_device *ddev = dev_get_drvdata(dev);
	struct amdgpu_device *adev = drm_to_adev(ddev);
	uint32_t ficaa_pie_ctl_in, ficaa_pie_status_in;
	uint64_t fica_out;
	unsigned int error_count = 0;

	ficaa_pie_ctl_in = AMDGPU_XGMI_SET_FICAA(0x200);
	ficaa_pie_status_in = AMDGPU_XGMI_SET_FICAA(0x208);

	/* DF access callbacks are optional per-ASIC; bail out if absent. */
	if ((!adev->df.funcs) ||
	    (!adev->df.funcs->get_fica) ||
	    (!adev->df.funcs->set_fica))
		return -EINVAL;

	fica_out = adev->df.funcs->get_fica(adev, ficaa_pie_ctl_in);
	/* NOTE(review): 0x1f presumably means "counters enabled" in the
	 * control word — confirm against the DF register spec. */
	if (fica_out != 0x1f)
		pr_err("xGMI error counters not enabled!\n");

	fica_out = adev->df.funcs->get_fica(adev, ficaa_pie_status_in);

	/* Low 16 bits are a status/valid field; error bits live in [63:62]. */
	if ((fica_out & 0xffff) == 2)
		error_count = ((fica_out >> 62) & 0x1) + (fica_out >> 63);

	/* Reading is destructive here: clear the status after reporting. */
	adev->df.funcs->set_fica(adev, ficaa_pie_status_in, 0, 0);

	return sysfs_emit(buf, "%u\n", error_count);
}


static DEVICE_ATTR(xgmi_device_id, S_IRUGO, amdgpu_xgmi_show_device_id, NULL);
static DEVICE_ATTR(xgmi_error, S_IRUGO, amdgpu_xgmi_show_error, NULL);
b1fa8c89 AG |
284 | |
285 | static int amdgpu_xgmi_sysfs_add_dev_info(struct amdgpu_device *adev, | |
286 | struct amdgpu_hive_info *hive) | |
287 | { | |
288 | int ret = 0; | |
289 | char node[10] = { 0 }; | |
290 | ||
291 | /* Create xgmi device id file */ | |
292 | ret = device_create_file(adev->dev, &dev_attr_xgmi_device_id); | |
293 | if (ret) { | |
294 | dev_err(adev->dev, "XGMI: Failed to create device file xgmi_device_id\n"); | |
295 | return ret; | |
296 | } | |
297 | ||
24f9aacf JK |
298 | /* Create xgmi error file */ |
299 | ret = device_create_file(adev->dev, &dev_attr_xgmi_error); | |
300 | if (ret) | |
301 | pr_err("failed to create xgmi_error\n"); | |
302 | ||
303 | ||
b1fa8c89 | 304 | /* Create sysfs link to hive info folder on the first device */ |
d95e8e97 DL |
305 | if (hive->kobj.parent != (&adev->dev->kobj)) { |
306 | ret = sysfs_create_link(&adev->dev->kobj, &hive->kobj, | |
b1fa8c89 AG |
307 | "xgmi_hive_info"); |
308 | if (ret) { | |
309 | dev_err(adev->dev, "XGMI: Failed to create link to hive info"); | |
310 | goto remove_file; | |
311 | } | |
312 | } | |
313 | ||
d95e8e97 | 314 | sprintf(node, "node%d", atomic_read(&hive->number_devices)); |
b1fa8c89 | 315 | /* Create sysfs link form the hive folder to yourself */ |
d95e8e97 | 316 | ret = sysfs_create_link(&hive->kobj, &adev->dev->kobj, node); |
b1fa8c89 AG |
317 | if (ret) { |
318 | dev_err(adev->dev, "XGMI: Failed to create link from hive info"); | |
319 | goto remove_link; | |
320 | } | |
321 | ||
322 | goto success; | |
323 | ||
324 | ||
325 | remove_link: | |
4a580877 | 326 | sysfs_remove_link(&adev->dev->kobj, adev_to_drm(adev)->unique); |
b1fa8c89 AG |
327 | |
328 | remove_file: | |
329 | device_remove_file(adev->dev, &dev_attr_xgmi_device_id); | |
330 | ||
331 | success: | |
332 | return ret; | |
333 | } | |
334 | ||
335 | static void amdgpu_xgmi_sysfs_rem_dev_info(struct amdgpu_device *adev, | |
336 | struct amdgpu_hive_info *hive) | |
337 | { | |
a89b5dae JZ |
338 | char node[10]; |
339 | memset(node, 0, sizeof(node)); | |
340 | ||
b1fa8c89 | 341 | device_remove_file(adev->dev, &dev_attr_xgmi_device_id); |
a89b5dae JZ |
342 | device_remove_file(adev->dev, &dev_attr_xgmi_error); |
343 | ||
d95e8e97 | 344 | if (hive->kobj.parent != (&adev->dev->kobj)) |
a89b5dae JZ |
345 | sysfs_remove_link(&adev->dev->kobj,"xgmi_hive_info"); |
346 | ||
d95e8e97 DL |
347 | sprintf(node, "node%d", atomic_read(&hive->number_devices)); |
348 | sysfs_remove_link(&hive->kobj, node); | |
a89b5dae | 349 | |
b1fa8c89 AG |
350 | } |
351 | ||
352 | ||
353 | ||
d95e8e97 | 354 | struct amdgpu_hive_info *amdgpu_get_xgmi_hive(struct amdgpu_device *adev) |
fb30fc59 | 355 | { |
be8901c2 | 356 | struct amdgpu_hive_info *hive = NULL; |
d95e8e97 | 357 | int ret; |
fb30fc59 SL |
358 | |
359 | if (!adev->gmc.xgmi.hive_id) | |
360 | return NULL; | |
22d6575b | 361 | |
d95e8e97 DL |
362 | if (adev->hive) { |
363 | kobject_get(&adev->hive->kobj); | |
364 | return adev->hive; | |
365 | } | |
366 | ||
22d6575b TSD |
367 | mutex_lock(&xgmi_mutex); |
368 | ||
be8901c2 KW |
369 | list_for_each_entry(hive, &xgmi_hive_list, node) { |
370 | if (hive->hive_id == adev->gmc.xgmi.hive_id) | |
371 | goto pro_end; | |
fb30fc59 | 372 | } |
d95e8e97 DL |
373 | |
374 | hive = kzalloc(sizeof(*hive), GFP_KERNEL); | |
375 | if (!hive) { | |
376 | dev_err(adev->dev, "XGMI: allocation failed\n"); | |
377 | hive = NULL; | |
378 | goto pro_end; | |
22d6575b | 379 | } |
fb30fc59 SL |
380 | |
381 | /* initialize new hive if not exist */ | |
d95e8e97 DL |
382 | ret = kobject_init_and_add(&hive->kobj, |
383 | &amdgpu_xgmi_hive_type, | |
384 | &adev->dev->kobj, | |
385 | "%s", "xgmi_hive_info"); | |
386 | if (ret) { | |
387 | dev_err(adev->dev, "XGMI: failed initializing kobject for xgmi hive\n"); | |
7b833d68 | 388 | kobject_put(&hive->kobj); |
d95e8e97 DL |
389 | kfree(hive); |
390 | hive = NULL; | |
391 | goto pro_end; | |
b1fa8c89 AG |
392 | } |
393 | ||
cfbb6b00 AG |
394 | /** |
395 | * Avoid recreating reset domain when hive is reconstructed for the case | |
396 | * of reset the devices in the XGMI hive during probe for SRIOV | |
397 | * See https://www.spinics.net/lists/amd-gfx/msg58836.html | |
398 | */ | |
399 | if (adev->reset_domain->type != XGMI_HIVE) { | |
400 | hive->reset_domain = amdgpu_reset_create_reset_domain(XGMI_HIVE, "amdgpu-reset-hive"); | |
401 | if (!hive->reset_domain) { | |
402 | dev_err(adev->dev, "XGMI: failed initializing reset domain for xgmi hive\n"); | |
403 | ret = -ENOMEM; | |
404 | kobject_put(&hive->kobj); | |
405 | kfree(hive); | |
406 | hive = NULL; | |
407 | goto pro_end; | |
408 | } | |
409 | } else { | |
410 | amdgpu_reset_get_reset_domain(adev->reset_domain); | |
411 | hive->reset_domain = adev->reset_domain; | |
a4c63caf AG |
412 | } |
413 | ||
d95e8e97 DL |
414 | hive->hive_id = adev->gmc.xgmi.hive_id; |
415 | INIT_LIST_HEAD(&hive->device_list); | |
416 | INIT_LIST_HEAD(&hive->node); | |
417 | mutex_init(&hive->hive_lock); | |
d95e8e97 DL |
418 | atomic_set(&hive->number_devices, 0); |
419 | task_barrier_init(&hive->tb); | |
420 | hive->pstate = AMDGPU_XGMI_PSTATE_UNKNOWN; | |
421 | hive->hi_req_gpu = NULL; | |
a4c63caf | 422 | |
d84a430d JK |
423 | /* |
424 | * hive pstate on boot is high in vega20 so we have to go to low | |
425 | * pstate on after boot. | |
426 | */ | |
d95e8e97 DL |
427 | hive->hi_req_count = AMDGPU_MAX_XGMI_DEVICE_PER_HIVE; |
428 | list_add_tail(&hive->node, &xgmi_hive_list); | |
429 | ||
430 | pro_end: | |
431 | if (hive) | |
432 | kobject_get(&hive->kobj); | |
22d6575b | 433 | mutex_unlock(&xgmi_mutex); |
d95e8e97 DL |
434 | return hive; |
435 | } | |
ed2bf522 | 436 | |
d95e8e97 DL |
437 | void amdgpu_put_xgmi_hive(struct amdgpu_hive_info *hive) |
438 | { | |
439 | if (hive) | |
440 | kobject_put(&hive->kobj); | |
fb30fc59 SL |
441 | } |
442 | ||
df399b06 | 443 | int amdgpu_xgmi_set_pstate(struct amdgpu_device *adev, int pstate) |
444 | { | |
445 | int ret = 0; | |
a9f5f98f HZ |
446 | struct amdgpu_hive_info *hive; |
447 | struct amdgpu_device *request_adev; | |
d84a430d | 448 | bool is_hi_req = pstate == AMDGPU_XGMI_PSTATE_MAX_VEGA20; |
a9f5f98f | 449 | bool init_low; |
df399b06 | 450 | |
a9f5f98f HZ |
451 | hive = amdgpu_get_xgmi_hive(adev); |
452 | if (!hive) | |
453 | return 0; | |
454 | ||
455 | request_adev = hive->hi_req_gpu ? hive->hi_req_gpu : adev; | |
456 | init_low = hive->pstate == AMDGPU_XGMI_PSTATE_UNKNOWN; | |
d95e8e97 | 457 | amdgpu_put_xgmi_hive(hive); |
d84a430d | 458 | /* fw bug so temporarily disable pstate switching */ |
dfe31f25 JK |
459 | return 0; |
460 | ||
461 | if (!hive || adev->asic_type != CHIP_VEGA20) | |
df399b06 | 462 | return 0; |
463 | ||
f1403342 | 464 | mutex_lock(&hive->hive_lock); |
5c5b2ba0 | 465 | |
d84a430d JK |
466 | if (is_hi_req) |
467 | hive->hi_req_count++; | |
468 | else | |
469 | hive->hi_req_count--; | |
470 | ||
471 | /* | |
472 | * Vega20 only needs single peer to request pstate high for the hive to | |
473 | * go high but all peers must request pstate low for the hive to go low | |
474 | */ | |
475 | if (hive->pstate == pstate || | |
476 | (!is_hi_req && hive->hi_req_count && !init_low)) | |
cb5932f8 | 477 | goto out; |
93abb05f | 478 | |
d84a430d | 479 | dev_dbg(request_adev->dev, "Set xgmi pstate %d.\n", pstate); |
93abb05f | 480 | |
d84a430d | 481 | ret = amdgpu_dpm_set_xgmi_pstate(request_adev, pstate); |
5c5b2ba0 | 482 | if (ret) { |
d84a430d | 483 | dev_err(request_adev->dev, |
93abb05f | 484 | "XGMI: Set pstate failure on device %llx, hive %llx, ret %d", |
d84a430d JK |
485 | request_adev->gmc.xgmi.node_id, |
486 | request_adev->gmc.xgmi.hive_id, ret); | |
5c5b2ba0 EQ |
487 | goto out; |
488 | } | |
489 | ||
d84a430d JK |
490 | if (init_low) |
491 | hive->pstate = hive->hi_req_count ? | |
492 | hive->pstate : AMDGPU_XGMI_PSTATE_MIN; | |
493 | else { | |
5c5b2ba0 | 494 | hive->pstate = pstate; |
d84a430d JK |
495 | hive->hi_req_gpu = pstate != AMDGPU_XGMI_PSTATE_MIN ? |
496 | adev : NULL; | |
497 | } | |
5c5b2ba0 | 498 | out: |
f1403342 | 499 | mutex_unlock(&hive->hive_lock); |
df399b06 | 500 | return ret; |
501 | } | |
502 | ||
5183411b AG |
/*
 * Push the current node-count/topology info for @adev down to its PSP.
 * Called for every device in the hive whenever membership changes.
 * Returns the PSP call's status; failures are logged but not retried.
 */
int amdgpu_xgmi_update_topology(struct amdgpu_hive_info *hive, struct amdgpu_device *adev)
{
	int ret;

	/* Each psp need to set the latest topology */
	ret = psp_xgmi_set_topology_info(&adev->psp,
					 atomic_read(&hive->number_devices),
					 &adev->psp.xgmi_context.top_info);
	if (ret)
		dev_err(adev->dev,
			"XGMI: Set topology failure on device %llx, hive %llx, ret %d",
			adev->gmc.xgmi.node_id,
			adev->gmc.xgmi.hive_id, ret);

	return ret;
}
519 | ||
da361dd1 | 520 | |
4ac5617c JK |
521 | /* |
522 | * NOTE psp_xgmi_node_info.num_hops layout is as follows: | |
523 | * num_hops[7:6] = link type (0 = xGMI2, 1 = xGMI3, 2/3 = reserved) | |
524 | * num_hops[5:3] = reserved | |
525 | * num_hops[2:0] = number of hops | |
526 | */ | |
da361dd1 | 527 | int amdgpu_xgmi_get_hops_count(struct amdgpu_device *adev, |
528 | struct amdgpu_device *peer_adev) | |
529 | { | |
530 | struct psp_xgmi_topology_info *top = &adev->psp.xgmi_context.top_info; | |
4ac5617c | 531 | uint8_t num_hops_mask = 0x7; |
da361dd1 | 532 | int i; |
533 | ||
534 | for (i = 0 ; i < top->num_nodes; ++i) | |
535 | if (top->nodes[i].node_id == peer_adev->gmc.xgmi.node_id) | |
4ac5617c | 536 | return top->nodes[i].num_hops & num_hops_mask; |
da361dd1 | 537 | return -EINVAL; |
538 | } | |
539 | ||
3f46c4e9 JK |
540 | int amdgpu_xgmi_get_num_links(struct amdgpu_device *adev, |
541 | struct amdgpu_device *peer_adev) | |
542 | { | |
543 | struct psp_xgmi_topology_info *top = &adev->psp.xgmi_context.top_info; | |
544 | int i; | |
545 | ||
546 | for (i = 0 ; i < top->num_nodes; ++i) | |
547 | if (top->nodes[i].node_id == peer_adev->gmc.xgmi.node_id) | |
548 | return top->nodes[i].num_links; | |
549 | return -EINVAL; | |
550 | } | |
551 | ||
44357a1b JK |
/*
 * Devices that support extended data require the entire hive to initialize with
 * the shared memory buffer flag set.
 *
 * Hive locks and conditions apply - see amdgpu_xgmi_add_device
 */
static int amdgpu_xgmi_initialize_hive_get_data_partition(struct amdgpu_hive_info *hive,
							bool set_extended_data)
{
	struct amdgpu_device *tmp_adev;
	int ret;

	/* (re)initialize the PSP XGMI session on every hive member; stop at
	 * the first failure and report it to the caller */
	list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) {
		ret = psp_xgmi_initialize(&tmp_adev->psp, set_extended_data, false);
		if (ret) {
			dev_err(tmp_adev->dev,
				"XGMI: Failed to initialize xgmi session for data partition %i\n",
				set_extended_data);
			return ret;
		}

	}

	return 0;
}
577 | ||
fb30fc59 SL |
/*
 * Register @adev with its XGMI hive.
 *
 * Queries the hive/node IDs from PSP (or uses fixed placeholder IDs when
 * PSP is unavailable or a reset is pending), looks up/creates the hive,
 * adds the device to the hive's device list and propagates the updated
 * topology to every hive member.  On success adev->hive caches a hive
 * reference; on failure the reference taken here is dropped.
 */
int amdgpu_xgmi_add_device(struct amdgpu_device *adev)
{
	struct psp_xgmi_topology_info *top_info;
	struct amdgpu_hive_info *hive;
	struct amdgpu_xgmi *entry;
	struct amdgpu_device *tmp_adev = NULL;

	int count = 0, ret = 0;

	if (!adev->gmc.xgmi.supported)
		return 0;

	if (!adev->gmc.xgmi.pending_reset &&
	    amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_PSP)) {
		ret = psp_xgmi_initialize(&adev->psp, false, true);
		if (ret) {
			dev_err(adev->dev,
				"XGMI: Failed to initialize xgmi session\n");
			return ret;
		}

		ret = psp_xgmi_get_hive_id(&adev->psp, &adev->gmc.xgmi.hive_id);
		if (ret) {
			dev_err(adev->dev,
				"XGMI: Failed to get hive id\n");
			return ret;
		}

		ret = psp_xgmi_get_node_id(&adev->psp, &adev->gmc.xgmi.node_id);
		if (ret) {
			dev_err(adev->dev,
				"XGMI: Failed to get node id\n");
			return ret;
		}
	} else {
		/* no PSP: fall back to synthesized IDs derived from the
		 * physical node number */
		adev->gmc.xgmi.hive_id = 16;
		adev->gmc.xgmi.node_id = adev->gmc.xgmi.physical_node_id + 16;
	}

	/* takes a hive reference, released on the error path below */
	hive = amdgpu_get_xgmi_hive(adev);
	if (!hive) {
		ret = -EINVAL;
		dev_err(adev->dev,
			"XGMI: node 0x%llx, can not match hive 0x%llx in the hive list.\n",
			adev->gmc.xgmi.node_id, adev->gmc.xgmi.hive_id);
		goto exit;
	}
	mutex_lock(&hive->hive_lock);

	top_info = &adev->psp.xgmi_context.top_info;

	/* rebuild the local node-id table from the full hive member list */
	list_add_tail(&adev->gmc.xgmi.head, &hive->device_list);
	list_for_each_entry(entry, &hive->device_list, head)
		top_info->nodes[count++].node_id = entry->node_id;
	top_info->num_nodes = count;
	atomic_set(&hive->number_devices, count);

	task_barrier_add_task(&hive->tb);

	if (!adev->gmc.xgmi.pending_reset &&
	    amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_PSP)) {
		list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) {
			/* update node list for other device in the hive */
			if (tmp_adev != adev) {
				top_info = &tmp_adev->psp.xgmi_context.top_info;
				top_info->nodes[count - 1].node_id =
					adev->gmc.xgmi.node_id;
				top_info->num_nodes = count;
			}
			ret = amdgpu_xgmi_update_topology(hive, tmp_adev);
			if (ret)
				goto exit_unlock;
		}

		/* get latest topology info for each device from psp */
		list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) {
			ret = psp_xgmi_get_topology_info(&tmp_adev->psp, count,
					&tmp_adev->psp.xgmi_context.top_info, false);
			if (ret) {
				dev_err(tmp_adev->dev,
					"XGMI: Get topology failure on device %llx, hive %llx, ret %d",
					tmp_adev->gmc.xgmi.node_id,
					tmp_adev->gmc.xgmi.hive_id, ret);
				/* To do : continue with some node failed or disable the whole hive */
				goto exit_unlock;
			}
		}

		/* get topology again for hives that support extended data */
		if (adev->psp.xgmi_context.supports_extended_data) {

			/* initialize the hive to get extended data.  */
			ret = amdgpu_xgmi_initialize_hive_get_data_partition(hive, true);
			if (ret)
				goto exit_unlock;

			/* get the extended data. */
			list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) {
				ret = psp_xgmi_get_topology_info(&tmp_adev->psp, count,
						&tmp_adev->psp.xgmi_context.top_info, true);
				if (ret) {
					dev_err(tmp_adev->dev,
						"XGMI: Get topology for extended data failure on device %llx, hive %llx, ret %d",
						tmp_adev->gmc.xgmi.node_id,
						tmp_adev->gmc.xgmi.hive_id, ret);
					goto exit_unlock;
				}
			}

			/* initialize the hive to get non-extended data for the next round. */
			ret = amdgpu_xgmi_initialize_hive_get_data_partition(hive, false);
			if (ret)
				goto exit_unlock;

		}
	}

	if (!ret && !adev->gmc.xgmi.pending_reset)
		ret = amdgpu_xgmi_sysfs_add_dev_info(adev, hive);

exit_unlock:
	mutex_unlock(&hive->hive_lock);
exit:
	if (!ret) {
		/* keep the reference obtained above cached on the device */
		adev->hive = hive;
		dev_info(adev->dev, "XGMI: Add node %d, hive 0x%llx.\n",
			 adev->gmc.xgmi.physical_node_id, adev->gmc.xgmi.hive_id);
	} else {
		/* hive may be NULL here; amdgpu_put_xgmi_hive handles that */
		amdgpu_put_xgmi_hive(hive);
		dev_err(adev->dev, "XGMI: Failed to add node %d, hive 0x%llx ret: %d\n",
			adev->gmc.xgmi.physical_node_id, adev->gmc.xgmi.hive_id,
			ret);
	}

	return ret;
}
a82400b5 | 714 | |
0b9d3760 | 715 | int amdgpu_xgmi_remove_device(struct amdgpu_device *adev) |
a82400b5 | 716 | { |
d95e8e97 | 717 | struct amdgpu_hive_info *hive = adev->hive; |
a82400b5 AG |
718 | |
719 | if (!adev->gmc.xgmi.supported) | |
0b9d3760 | 720 | return -EINVAL; |
a82400b5 | 721 | |
a82400b5 | 722 | if (!hive) |
0b9d3760 | 723 | return -EINVAL; |
a82400b5 | 724 | |
d95e8e97 | 725 | mutex_lock(&hive->hive_lock); |
a89b5dae JZ |
726 | task_barrier_rem_task(&hive->tb); |
727 | amdgpu_xgmi_sysfs_rem_dev_info(adev, hive); | |
d95e8e97 DL |
728 | if (hive->hi_req_gpu == adev) |
729 | hive->hi_req_gpu = NULL; | |
730 | list_del(&adev->gmc.xgmi.head); | |
a89b5dae JZ |
731 | mutex_unlock(&hive->hive_lock); |
732 | ||
d95e8e97 DL |
733 | amdgpu_put_xgmi_hive(hive); |
734 | adev->hive = NULL; | |
735 | ||
736 | if (atomic_dec_return(&hive->number_devices) == 0) { | |
737 | /* Remove the hive from global hive list */ | |
738 | mutex_lock(&xgmi_mutex); | |
739 | list_del(&hive->node); | |
740 | mutex_unlock(&xgmi_mutex); | |
741 | ||
742 | amdgpu_put_xgmi_hive(hive); | |
22d6575b | 743 | } |
0b9d3760 HZ |
744 | |
745 | return psp_xgmi_terminate(&adev->psp); | |
a82400b5 | 746 | } |
029fbd43 | 747 | |
4e9b1fa5 | 748 | static int amdgpu_xgmi_ras_late_init(struct amdgpu_device *adev, struct ras_common_if *ras_block) |
029fbd43 | 749 | { |
029fbd43 HZ |
750 | if (!adev->gmc.xgmi.supported || |
751 | adev->gmc.xgmi.num_physical_nodes == 0) | |
752 | return 0; | |
753 | ||
6c245386 | 754 | adev->gmc.xgmi.ras->ras_block.hw_ops->reset_ras_error_count(adev); |
66399248 | 755 | |
caae42f0 | 756 | return amdgpu_ras_block_late_init(adev, ras_block); |
029fbd43 | 757 | } |
be5b39d8 | 758 | |
19744f5f HZ |
759 | uint64_t amdgpu_xgmi_get_relative_phy_addr(struct amdgpu_device *adev, |
760 | uint64_t addr) | |
761 | { | |
890900fe HZ |
762 | struct amdgpu_xgmi *xgmi = &adev->gmc.xgmi; |
763 | return (addr + xgmi->physical_node_id * xgmi->node_segment_size); | |
19744f5f | 764 | } |
18f36157 | 765 | |
66399248 JC |
/* Clear one PCS error status register.
 * NOTE(review): writes all-ones then zero — looks like a
 * write-1-to-clear sequence followed by a reset of the register;
 * confirm against the PCS register specification.
 */
static void pcs_clear_status(struct amdgpu_device *adev, uint32_t pcs_status_reg)
{
	WREG32_PCIE(pcs_status_reg, 0xFFFFFFFF);
	WREG32_PCIE(pcs_status_reg, 0);
}
771 | ||
52137ca8 | 772 | static void amdgpu_xgmi_reset_ras_error_count(struct amdgpu_device *adev) |
66399248 JC |
773 | { |
774 | uint32_t i; | |
775 | ||
776 | switch (adev->asic_type) { | |
777 | case CHIP_ARCTURUS: | |
778 | for (i = 0; i < ARRAY_SIZE(xgmi_pcs_err_status_reg_arct); i++) | |
779 | pcs_clear_status(adev, | |
780 | xgmi_pcs_err_status_reg_arct[i]); | |
781 | break; | |
782 | case CHIP_VEGA20: | |
783 | for (i = 0; i < ARRAY_SIZE(xgmi_pcs_err_status_reg_vg20); i++) | |
784 | pcs_clear_status(adev, | |
785 | xgmi_pcs_err_status_reg_vg20[i]); | |
786 | break; | |
3c4ff2dc | 787 | case CHIP_ALDEBARAN: |
7513c9ff | 788 | for (i = 0; i < ARRAY_SIZE(xgmi3x16_pcs_err_status_reg_aldebaran); i++) |
3c4ff2dc | 789 | pcs_clear_status(adev, |
7513c9ff | 790 | xgmi3x16_pcs_err_status_reg_aldebaran[i]); |
3c4ff2dc JC |
791 | for (i = 0; i < ARRAY_SIZE(walf_pcs_err_status_reg_aldebaran); i++) |
792 | pcs_clear_status(adev, | |
793 | walf_pcs_err_status_reg_aldebaran[i]); | |
794 | break; | |
66399248 JC |
795 | default: |
796 | break; | |
797 | } | |
798 | } | |
799 | ||
18f36157 HZ |
800 | static int amdgpu_xgmi_query_pcs_error_status(struct amdgpu_device *adev, |
801 | uint32_t value, | |
802 | uint32_t *ue_count, | |
803 | uint32_t *ce_count, | |
804 | bool is_xgmi_pcs) | |
805 | { | |
806 | int i; | |
807 | int ue_cnt; | |
808 | ||
809 | if (is_xgmi_pcs) { | |
810 | /* query xgmi pcs error status, | |
811 | * only ue is supported */ | |
812 | for (i = 0; i < ARRAY_SIZE(xgmi_pcs_ras_fields); i ++) { | |
813 | ue_cnt = (value & | |
814 | xgmi_pcs_ras_fields[i].pcs_err_mask) >> | |
815 | xgmi_pcs_ras_fields[i].pcs_err_shift; | |
816 | if (ue_cnt) { | |
817 | dev_info(adev->dev, "%s detected\n", | |
818 | xgmi_pcs_ras_fields[i].err_name); | |
819 | *ue_count += ue_cnt; | |
820 | } | |
821 | } | |
822 | } else { | |
823 | /* query wafl pcs error status, | |
824 | * only ue is supported */ | |
825 | for (i = 0; i < ARRAY_SIZE(wafl_pcs_ras_fields); i++) { | |
826 | ue_cnt = (value & | |
827 | wafl_pcs_ras_fields[i].pcs_err_mask) >> | |
828 | wafl_pcs_ras_fields[i].pcs_err_shift; | |
829 | if (ue_cnt) { | |
830 | dev_info(adev->dev, "%s detected\n", | |
831 | wafl_pcs_ras_fields[i].err_name); | |
832 | *ue_count += ue_cnt; | |
833 | } | |
834 | } | |
835 | } | |
836 | ||
837 | return 0; | |
838 | } | |
839 | ||
6c245386 | 840 | static void amdgpu_xgmi_query_ras_error_count(struct amdgpu_device *adev, |
52137ca8 | 841 | void *ras_error_status) |
18f36157 HZ |
842 | { |
843 | struct ras_err_data *err_data = (struct ras_err_data *)ras_error_status; | |
844 | int i; | |
845 | uint32_t data; | |
846 | uint32_t ue_cnt = 0, ce_cnt = 0; | |
847 | ||
848 | if (!amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__XGMI_WAFL)) | |
6c245386 | 849 | return ; |
18f36157 HZ |
850 | |
851 | err_data->ue_count = 0; | |
852 | err_data->ce_count = 0; | |
853 | ||
854 | switch (adev->asic_type) { | |
a61f41b1 HZ |
855 | case CHIP_ARCTURUS: |
856 | /* check xgmi pcs error */ | |
857 | for (i = 0; i < ARRAY_SIZE(xgmi_pcs_err_status_reg_arct); i++) { | |
858 | data = RREG32_PCIE(xgmi_pcs_err_status_reg_arct[i]); | |
859 | if (data) | |
860 | amdgpu_xgmi_query_pcs_error_status(adev, | |
861 | data, &ue_cnt, &ce_cnt, true); | |
862 | } | |
863 | /* check wafl pcs error */ | |
864 | for (i = 0; i < ARRAY_SIZE(wafl_pcs_err_status_reg_arct); i++) { | |
865 | data = RREG32_PCIE(wafl_pcs_err_status_reg_arct[i]); | |
866 | if (data) | |
867 | amdgpu_xgmi_query_pcs_error_status(adev, | |
868 | data, &ue_cnt, &ce_cnt, false); | |
869 | } | |
870 | break; | |
18f36157 | 871 | case CHIP_VEGA20: |
18f36157 HZ |
872 | /* check xgmi pcs error */ |
873 | for (i = 0; i < ARRAY_SIZE(xgmi_pcs_err_status_reg_vg20); i++) { | |
874 | data = RREG32_PCIE(xgmi_pcs_err_status_reg_vg20[i]); | |
875 | if (data) | |
876 | amdgpu_xgmi_query_pcs_error_status(adev, | |
877 | data, &ue_cnt, &ce_cnt, true); | |
878 | } | |
879 | /* check wafl pcs error */ | |
880 | for (i = 0; i < ARRAY_SIZE(wafl_pcs_err_status_reg_vg20); i++) { | |
881 | data = RREG32_PCIE(wafl_pcs_err_status_reg_vg20[i]); | |
882 | if (data) | |
883 | amdgpu_xgmi_query_pcs_error_status(adev, | |
884 | data, &ue_cnt, &ce_cnt, false); | |
885 | } | |
886 | break; | |
3c4ff2dc | 887 | case CHIP_ALDEBARAN: |
3c4ff2dc JC |
888 | /* check xgmi3x16 pcs error */ |
889 | for (i = 0; i < ARRAY_SIZE(xgmi3x16_pcs_err_status_reg_aldebaran); i++) { | |
890 | data = RREG32_PCIE(xgmi3x16_pcs_err_status_reg_aldebaran[i]); | |
891 | if (data) | |
892 | amdgpu_xgmi_query_pcs_error_status(adev, | |
893 | data, &ue_cnt, &ce_cnt, true); | |
894 | } | |
895 | /* check wafl pcs error */ | |
896 | for (i = 0; i < ARRAY_SIZE(walf_pcs_err_status_reg_aldebaran); i++) { | |
897 | data = RREG32_PCIE(walf_pcs_err_status_reg_aldebaran[i]); | |
898 | if (data) | |
899 | amdgpu_xgmi_query_pcs_error_status(adev, | |
900 | data, &ue_cnt, &ce_cnt, false); | |
901 | } | |
902 | break; | |
f24d991b JC |
903 | default: |
904 | dev_warn(adev->dev, "XGMI RAS error query not supported"); | |
905 | break; | |
18f36157 HZ |
906 | } |
907 | ||
6c245386 | 908 | adev->gmc.xgmi.ras->ras_block.hw_ops->reset_ras_error_count(adev); |
66399248 | 909 | |
18f36157 HZ |
910 | err_data->ue_count += ue_cnt; |
911 | err_data->ce_count += ce_cnt; | |
18f36157 | 912 | } |
52137ca8 | 913 | |
22d4ba53 | 914 | /* Trigger XGMI/WAFL error */ |
915 | static int amdgpu_ras_error_inject_xgmi(struct amdgpu_device *adev, void *inject_if) | |
916 | { | |
917 | int ret = 0; | |
71b6c4a2 | 918 | struct ta_ras_trigger_error_input *block_info = |
919 | (struct ta_ras_trigger_error_input *)inject_if; | |
22d4ba53 | 920 | |
921 | if (amdgpu_dpm_set_df_cstate(adev, DF_CSTATE_DISALLOW)) | |
922 | dev_warn(adev->dev, "Failed to disallow df cstate"); | |
923 | ||
924 | if (amdgpu_dpm_allow_xgmi_power_down(adev, false)) | |
925 | dev_warn(adev->dev, "Failed to disallow XGMI power down"); | |
926 | ||
927 | ret = psp_ras_trigger_error(&adev->psp, block_info); | |
928 | ||
929 | if (amdgpu_ras_intr_triggered()) | |
930 | return ret; | |
931 | ||
932 | if (amdgpu_dpm_allow_xgmi_power_down(adev, true)) | |
933 | dev_warn(adev->dev, "Failed to allow XGMI power down"); | |
934 | ||
935 | if (amdgpu_dpm_set_df_cstate(adev, DF_CSTATE_ALLOW)) | |
936 | dev_warn(adev->dev, "Failed to allow df cstate"); | |
937 | ||
938 | return ret; | |
939 | } | |
940 | ||
/* Hardware-level RAS operations for the XGMI/WAFL block,
 * wired into the generic RAS framework via xgmi_ras below.
 */
struct amdgpu_ras_block_hw_ops  xgmi_ras_hw_ops = {
	.query_ras_error_count = amdgpu_xgmi_query_ras_error_count,
	.reset_ras_error_count = amdgpu_xgmi_reset_ras_error_count,
	.ras_error_inject = amdgpu_ras_error_inject_xgmi,
};
6c245386 | 946 | |
/* RAS block descriptor for XGMI/WAFL: identity ("xgmi_wafl",
 * multi-uncorrectable), hardware ops, and the late-init hook.
 */
struct amdgpu_xgmi_ras xgmi_ras = {
	.ras_block = {
		.ras_comm = {
			.name = "xgmi_wafl",
			.block = AMDGPU_RAS_BLOCK__XGMI_WAFL,
			.type = AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE,
		},
		.hw_ops = &xgmi_ras_hw_ops,
		.ras_late_init = amdgpu_xgmi_ras_late_init,
	},
};