Commit | Line | Data |
---|---|---|
fb30fc59 SL |
1 | /* |
2 | * Copyright 2018 Advanced Micro Devices, Inc. | |
3 | * | |
4 | * Permission is hereby granted, free of charge, to any person obtaining a | |
5 | * copy of this software and associated documentation files (the "Software"), | |
6 | * to deal in the Software without restriction, including without limitation | |
7 | * the rights to use, copy, modify, merge, publish, distribute, sublicense, | |
8 | * and/or sell copies of the Software, and to permit persons to whom the | |
9 | * Software is furnished to do so, subject to the following conditions: | |
10 | * | |
11 | * The above copyright notice and this permission notice shall be included in | |
12 | * all copies or substantial portions of the Software. | |
13 | * | |
14 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | |
15 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | |
16 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL | |
17 | * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR | |
18 | * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, | |
19 | * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR | |
20 | * OTHER DEALINGS IN THE SOFTWARE. | |
21 | * | |
22 | * | |
23 | */ | |
24 | #include <linux/list.h> | |
25 | #include "amdgpu.h" | |
5183411b | 26 | #include "amdgpu_xgmi.h" |
93abb05f | 27 | #include "amdgpu_smu.h" |
029fbd43 | 28 | #include "amdgpu_ras.h" |
18f36157 | 29 | #include "soc15.h" |
24f9aacf | 30 | #include "df/df_3_6_offset.h" |
18f36157 HZ |
31 | #include "xgmi/xgmi_4_0_0_smn.h" |
32 | #include "xgmi/xgmi_4_0_0_sh_mask.h" | |
33 | #include "wafl/wafl2_4_0_0_smn.h" | |
34 | #include "wafl/wafl2_4_0_0_sh_mask.h" | |
fb30fc59 SL |
35 | |
/* Serializes all access to the global hive table (xgmi_hives/hive_count). */
static DEFINE_MUTEX(xgmi_mutex);

#define AMDGPU_MAX_XGMI_HIVE 8
#define AMDGPU_MAX_XGMI_DEVICE_PER_HIVE 4

/* Statically allocated hive descriptors; slots are handed out in order
 * by amdgpu_get_xgmi_hive() and are never returned. */
static struct amdgpu_hive_info xgmi_hives[AMDGPU_MAX_XGMI_HIVE];
static unsigned hive_count = 0;
43 | ||
18f36157 HZ |
/* Vega20 XGMI PCS error-status register addresses, one per link
 * instance (instances are 0x100000 apart in SMN space). */
static const int xgmi_pcs_err_status_reg_vg20[] = {
	smnXGMI0_PCS_GOPX16_PCS_ERROR_STATUS,
	smnXGMI0_PCS_GOPX16_PCS_ERROR_STATUS + 0x100000,
};

/* Vega20 WAFL PCS error-status register addresses. */
static const int wafl_pcs_err_status_reg_vg20[] = {
	smnPCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS,
	smnPCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS + 0x100000,
};
53 | ||
a61f41b1 HZ |
/* Arcturus XGMI PCS error-status register addresses — six link
 * instances (note the gap: there is no +0x200000..+0x400000 entry). */
static const int xgmi_pcs_err_status_reg_arct[] = {
	smnXGMI0_PCS_GOPX16_PCS_ERROR_STATUS,
	smnXGMI0_PCS_GOPX16_PCS_ERROR_STATUS + 0x100000,
	smnXGMI0_PCS_GOPX16_PCS_ERROR_STATUS + 0x500000,
	smnXGMI0_PCS_GOPX16_PCS_ERROR_STATUS + 0x600000,
	smnXGMI0_PCS_GOPX16_PCS_ERROR_STATUS + 0x700000,
	smnXGMI0_PCS_GOPX16_PCS_ERROR_STATUS + 0x800000,
};

/* same as vg20 */
static const int wafl_pcs_err_status_reg_arct[] = {
	smnPCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS,
	smnPCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS + 0x100000,
};
68 | ||
18f36157 HZ |
/* Name + mask/shift for each error field of the XGMI PCS error-status
 * register; consumed by amdgpu_xgmi_query_pcs_error_status(). */
static const struct amdgpu_pcs_ras_field xgmi_pcs_ras_fields[] = {
	{"XGMI PCS DataLossErr",
	 SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, DataLossErr)},
	{"XGMI PCS TrainingErr",
	 SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, TrainingErr)},
	{"XGMI PCS CRCErr",
	 SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, CRCErr)},
	{"XGMI PCS BERExceededErr",
	 SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, BERExceededErr)},
	{"XGMI PCS TxMetaDataErr",
	 SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, TxMetaDataErr)},
	{"XGMI PCS ReplayBufParityErr",
	 SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, ReplayBufParityErr)},
	{"XGMI PCS DataParityErr",
	 SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, DataParityErr)},
	{"XGMI PCS ReplayFifoOverflowErr",
	 SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, ReplayFifoOverflowErr)},
	{"XGMI PCS ReplayFifoUnderflowErr",
	 SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, ReplayFifoUnderflowErr)},
	{"XGMI PCS ElasticFifoOverflowErr",
	 SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, ElasticFifoOverflowErr)},
	{"XGMI PCS DeskewErr",
	 SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, DeskewErr)},
	{"XGMI PCS DataStartupLimitErr",
	 SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, DataStartupLimitErr)},
	{"XGMI PCS FCInitTimeoutErr",
	 SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, FCInitTimeoutErr)},
	{"XGMI PCS RecoveryTimeoutErr",
	 SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, RecoveryTimeoutErr)},
	{"XGMI PCS ReadySerialTimeoutErr",
	 SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, ReadySerialTimeoutErr)},
	{"XGMI PCS ReadySerialAttemptErr",
	 SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, ReadySerialAttemptErr)},
	{"XGMI PCS RecoveryAttemptErr",
	 SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, RecoveryAttemptErr)},
	{"XGMI PCS RecoveryRelockAttemptErr",
	 SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, RecoveryRelockAttemptErr)},
};
107 | ||
/* Same field layout as xgmi_pcs_ras_fields, but for the WAFL PCS
 * error-status register. */
static const struct amdgpu_pcs_ras_field wafl_pcs_ras_fields[] = {
	{"WAFL PCS DataLossErr",
	 SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, DataLossErr)},
	{"WAFL PCS TrainingErr",
	 SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, TrainingErr)},
	{"WAFL PCS CRCErr",
	 SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, CRCErr)},
	{"WAFL PCS BERExceededErr",
	 SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, BERExceededErr)},
	{"WAFL PCS TxMetaDataErr",
	 SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, TxMetaDataErr)},
	{"WAFL PCS ReplayBufParityErr",
	 SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, ReplayBufParityErr)},
	{"WAFL PCS DataParityErr",
	 SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, DataParityErr)},
	{"WAFL PCS ReplayFifoOverflowErr",
	 SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, ReplayFifoOverflowErr)},
	{"WAFL PCS ReplayFifoUnderflowErr",
	 SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, ReplayFifoUnderflowErr)},
	{"WAFL PCS ElasticFifoOverflowErr",
	 SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, ElasticFifoOverflowErr)},
	{"WAFL PCS DeskewErr",
	 SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, DeskewErr)},
	{"WAFL PCS DataStartupLimitErr",
	 SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, DataStartupLimitErr)},
	{"WAFL PCS FCInitTimeoutErr",
	 SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, FCInitTimeoutErr)},
	{"WAFL PCS RecoveryTimeoutErr",
	 SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, RecoveryTimeoutErr)},
	{"WAFL PCS ReadySerialTimeoutErr",
	 SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, ReadySerialTimeoutErr)},
	{"WAFL PCS ReadySerialAttemptErr",
	 SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, ReadySerialAttemptErr)},
	{"WAFL PCS RecoveryAttemptErr",
	 SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, RecoveryAttemptErr)},
	{"WAFL PCS RecoveryRelockAttemptErr",
	 SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, RecoveryRelockAttemptErr)},
};
146 | ||
ed2bf522 AG |
/*
 * amdgpu_xgmi_hive_try_lock - return an opaque per-hive token.
 *
 * The address of the hive's device list is used by callers as a
 * stable, unique identifier for the hive (e.g. as a lock key);
 * no locking is performed here despite the name.
 */
void *amdgpu_xgmi_hive_try_lock(struct amdgpu_hive_info *hive)
{
	return &hive->device_list;
}
151 | ||
1c1e53f7 TSD |
152 | /** |
153 | * DOC: AMDGPU XGMI Support | |
154 | * | |
155 | * XGMI is a high speed interconnect that joins multiple GPU cards | |
156 | * into a homogeneous memory space that is organized by a collective | |
157 | * hive ID and individual node IDs, both of which are 64-bit numbers. | |
158 | * | |
159 | * The file xgmi_device_id contains the unique per GPU device ID and | |
160 | * is stored in the /sys/class/drm/card${cardno}/device/ directory. | |
161 | * | |
162 | * Inside the device directory a sub-directory 'xgmi_hive_info' is | |
163 | * created which contains the hive ID and the list of nodes. | |
164 | * | |
165 | * The hive ID is stored in: | |
166 | * /sys/class/drm/card${cardno}/device/xgmi_hive_info/xgmi_hive_id | |
167 | * | |
168 | * The node information is stored in numbered directories: | |
169 | * /sys/class/drm/card${cardno}/device/xgmi_hive_info/node${nodeno}/xgmi_device_id | |
170 | * | |
 * Each device has its own xgmi_hive_info directory with a mirror
 * set of node sub-directories.
173 | * | |
174 | * The XGMI memory space is built by contiguously adding the power of | |
175 | * two padded VRAM space from each node to each other. | |
176 | * | |
177 | */ | |
178 | ||
179 | ||
b1fa8c89 AG |
180 | static ssize_t amdgpu_xgmi_show_hive_id(struct device *dev, |
181 | struct device_attribute *attr, char *buf) | |
182 | { | |
183 | struct amdgpu_hive_info *hive = | |
184 | container_of(attr, struct amdgpu_hive_info, dev_attr); | |
185 | ||
186 | return snprintf(buf, PAGE_SIZE, "%llu\n", hive->hive_id); | |
187 | } | |
188 | ||
/*
 * amdgpu_xgmi_sysfs_create - create the per-hive "xgmi_hive_info" kobject
 * and its "xgmi_hive_id" attribute under the given device's sysfs dir.
 *
 * Returns 0 on success, -EINVAL on allocation or attribute failure.
 * On failure hive->kobj is left NULL.
 */
static int amdgpu_xgmi_sysfs_create(struct amdgpu_device *adev,
				    struct amdgpu_hive_info *hive)
{
	int ret = 0;

	/* Creating twice for the same hive would leak the first kobject. */
	if (WARN_ON(hive->kobj))
		return -EINVAL;

	hive->kobj = kobject_create_and_add("xgmi_hive_info", &adev->dev->kobj);
	if (!hive->kobj) {
		dev_err(adev->dev, "XGMI: Failed to allocate sysfs entry!\n");
		return -EINVAL;
	}

	/* The attribute lives in the hive struct so the show handler can
	 * recover the hive via container_of(). */
	hive->dev_attr = (struct device_attribute) {
		.attr = {
			.name = "xgmi_hive_id",
			.mode = S_IRUGO,

		},
		.show = amdgpu_xgmi_show_hive_id,
	};

	ret = sysfs_create_file(hive->kobj, &hive->dev_attr.attr);
	if (ret) {
		dev_err(adev->dev, "XGMI: Failed to create device file xgmi_hive_id\n");
		/* Full teardown: unlink from sysfs, then drop the reference. */
		kobject_del(hive->kobj);
		kobject_put(hive->kobj);
		hive->kobj = NULL;
	}

	return ret;
}
222 | ||
/*
 * amdgpu_xgmi_sysfs_destroy - inverse of amdgpu_xgmi_sysfs_create:
 * remove the hive-id attribute and release the hive kobject.
 */
static void amdgpu_xgmi_sysfs_destroy(struct amdgpu_device *adev,
				      struct amdgpu_hive_info *hive)
{
	sysfs_remove_file(hive->kobj, &hive->dev_attr.attr);
	kobject_del(hive->kobj);
	kobject_put(hive->kobj);
	hive->kobj = NULL;
}
231 | ||
232 | static ssize_t amdgpu_xgmi_show_device_id(struct device *dev, | |
233 | struct device_attribute *attr, | |
234 | char *buf) | |
235 | { | |
236 | struct drm_device *ddev = dev_get_drvdata(dev); | |
237 | struct amdgpu_device *adev = ddev->dev_private; | |
238 | ||
239 | return snprintf(buf, PAGE_SIZE, "%llu\n", adev->gmc.xgmi.node_id); | |
240 | ||
241 | } | |
242 | ||
24f9aacf JK |
/* Build a FICAA (indirect DF register access) address for offset (o).
 * NOTE(review): 0x456801 encodes the FICAA access-enable/instance bits —
 * confirm against the DF 3.6 register spec. */
#define AMDGPU_XGMI_SET_FICAA(o) ((o) | 0x456801)
/*
 * sysfs show handler: read and report the xGMI error count via the DF
 * FICAA indirect-access mechanism, then clear the status register so
 * the count restarts from zero.
 */
static ssize_t amdgpu_xgmi_show_error(struct device *dev,
				      struct device_attribute *attr,
				      char *buf)
{
	struct drm_device *ddev = dev_get_drvdata(dev);
	struct amdgpu_device *adev = ddev->dev_private;
	uint32_t ficaa_pie_ctl_in, ficaa_pie_status_in;
	uint64_t fica_out;
	unsigned int error_count = 0;

	ficaa_pie_ctl_in = AMDGPU_XGMI_SET_FICAA(0x200);
	ficaa_pie_status_in = AMDGPU_XGMI_SET_FICAA(0x208);

	/* Control register must read 0x1f when error counters are enabled —
	 * presumably an enable mask; verify against DF documentation. */
	fica_out = adev->df.funcs->get_fica(adev, ficaa_pie_ctl_in);
	if (fica_out != 0x1f)
		pr_err("xGMI error counters not enabled!\n");

	fica_out = adev->df.funcs->get_fica(adev, ficaa_pie_status_in);

	/* Low 16 bits == 2 appears to mean "status valid"; the two top
	 * bits each contribute one error — TODO confirm semantics. */
	if ((fica_out & 0xffff) == 2)
		error_count = ((fica_out >> 62) & 0x1) + (fica_out >> 63);

	/* Write zero back to reset the accumulated count. */
	adev->df.funcs->set_fica(adev, ficaa_pie_status_in, 0, 0);

	return snprintf(buf, PAGE_SIZE, "%d\n", error_count);
}
270 | ||
271 | ||
/* Read-only sysfs attributes exposed per GPU device. */
static DEVICE_ATTR(xgmi_device_id, S_IRUGO, amdgpu_xgmi_show_device_id, NULL);
static DEVICE_ATTR(xgmi_error, S_IRUGO, amdgpu_xgmi_show_error, NULL);
b1fa8c89 AG |
274 | |
275 | static int amdgpu_xgmi_sysfs_add_dev_info(struct amdgpu_device *adev, | |
276 | struct amdgpu_hive_info *hive) | |
277 | { | |
278 | int ret = 0; | |
279 | char node[10] = { 0 }; | |
280 | ||
281 | /* Create xgmi device id file */ | |
282 | ret = device_create_file(adev->dev, &dev_attr_xgmi_device_id); | |
283 | if (ret) { | |
284 | dev_err(adev->dev, "XGMI: Failed to create device file xgmi_device_id\n"); | |
285 | return ret; | |
286 | } | |
287 | ||
24f9aacf JK |
288 | /* Create xgmi error file */ |
289 | ret = device_create_file(adev->dev, &dev_attr_xgmi_error); | |
290 | if (ret) | |
291 | pr_err("failed to create xgmi_error\n"); | |
292 | ||
293 | ||
b1fa8c89 AG |
294 | /* Create sysfs link to hive info folder on the first device */ |
295 | if (adev != hive->adev) { | |
296 | ret = sysfs_create_link(&adev->dev->kobj, hive->kobj, | |
297 | "xgmi_hive_info"); | |
298 | if (ret) { | |
299 | dev_err(adev->dev, "XGMI: Failed to create link to hive info"); | |
300 | goto remove_file; | |
301 | } | |
302 | } | |
303 | ||
304 | sprintf(node, "node%d", hive->number_devices); | |
305 | /* Create sysfs link form the hive folder to yourself */ | |
306 | ret = sysfs_create_link(hive->kobj, &adev->dev->kobj, node); | |
307 | if (ret) { | |
308 | dev_err(adev->dev, "XGMI: Failed to create link from hive info"); | |
309 | goto remove_link; | |
310 | } | |
311 | ||
312 | goto success; | |
313 | ||
314 | ||
315 | remove_link: | |
316 | sysfs_remove_link(&adev->dev->kobj, adev->ddev->unique); | |
317 | ||
318 | remove_file: | |
319 | device_remove_file(adev->dev, &dev_attr_xgmi_device_id); | |
320 | ||
321 | success: | |
322 | return ret; | |
323 | } | |
324 | ||
325 | static void amdgpu_xgmi_sysfs_rem_dev_info(struct amdgpu_device *adev, | |
326 | struct amdgpu_hive_info *hive) | |
327 | { | |
328 | device_remove_file(adev->dev, &dev_attr_xgmi_device_id); | |
329 | sysfs_remove_link(&adev->dev->kobj, adev->ddev->unique); | |
330 | sysfs_remove_link(hive->kobj, adev->ddev->unique); | |
331 | } | |
332 | ||
333 | ||
334 | ||
/*
 * amdgpu_get_xgmi_hive - look up (or lazily create) the hive descriptor
 * matching the device's hive ID.
 *
 * @adev: device whose gmc.xgmi.hive_id selects the hive
 * @lock: when non-zero, return with hive->hive_lock held
 *
 * Returns the hive, or NULL if the device has no hive ID, the hive
 * table is full, or sysfs creation for a new hive fails.
 */
struct amdgpu_hive_info *amdgpu_get_xgmi_hive(struct amdgpu_device *adev, int lock)
{
	int i;
	struct amdgpu_hive_info *tmp;

	/* A zero hive ID means this device is not part of any hive. */
	if (!adev->gmc.xgmi.hive_id)
		return NULL;

	mutex_lock(&xgmi_mutex);

	/* Fast path: hive already known. */
	for (i = 0 ; i < hive_count; ++i) {
		tmp = &xgmi_hives[i];
		if (tmp->hive_id == adev->gmc.xgmi.hive_id) {
			/* Take the per-hive lock before dropping the table
			 * lock so the hive cannot change underneath us. */
			if (lock)
				mutex_lock(&tmp->hive_lock);
			mutex_unlock(&xgmi_mutex);
			return tmp;
		}
	}
	/* Table exhausted: cannot track another hive. */
	if (i >= AMDGPU_MAX_XGMI_HIVE) {
		mutex_unlock(&xgmi_mutex);
		return NULL;
	}

	/* initialize new hive if not exist */
	tmp = &xgmi_hives[hive_count++];

	/* NOTE(review): on sysfs failure the slot stays consumed
	 * (hive_count was already incremented) with a zero hive_id —
	 * harmless for lookups but the slot is leaked. */
	if (amdgpu_xgmi_sysfs_create(adev, tmp)) {
		mutex_unlock(&xgmi_mutex);
		return NULL;
	}

	/* First device to join becomes the hive's anchor device. */
	tmp->adev = adev;
	tmp->hive_id = adev->gmc.xgmi.hive_id;
	INIT_LIST_HEAD(&tmp->device_list);
	mutex_init(&tmp->hive_lock);
	mutex_init(&tmp->reset_lock);
	task_barrier_init(&tmp->tb);

	if (lock)
		mutex_lock(&tmp->hive_lock);
	tmp->pstate = -1;	/* pstate unknown until first set_pstate call */
	mutex_unlock(&xgmi_mutex);

	return tmp;
}
381 | ||
/*
 * amdgpu_xgmi_set_pstate - request an XGMI link pstate for this device
 * and track when the whole hive has converged to it.
 *
 * Returns 0 on success (or when the device is not in a hive), or the
 * error from amdgpu_dpm_set_xgmi_pstate().
 */
int amdgpu_xgmi_set_pstate(struct amdgpu_device *adev, int pstate)
{
	int ret = 0;
	struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev, 0);
	struct amdgpu_device *tmp_adev;
	bool update_hive_pstate = true;
	/* Vega20-only special case: a non-zero pstate request counts as
	 * "high" and is propagated eagerly below. */
	bool is_high_pstate = pstate && adev->asic_type == CHIP_VEGA20;

	if (!hive)
		return 0;

	mutex_lock(&hive->hive_lock);

	/* Hive already at the requested pstate: only sync this device's
	 * cached value (for the high-pstate case) and bail out. */
	if (hive->pstate == pstate) {
		adev->pstate = is_high_pstate ? pstate : adev->pstate;
		goto out;
	}

	dev_dbg(adev->dev, "Set xgmi pstate %d.\n", pstate);

	ret = amdgpu_dpm_set_xgmi_pstate(adev, pstate);
	if (ret) {
		dev_err(adev->dev,
			"XGMI: Set pstate failure on device %llx, hive %llx, ret %d",
			adev->gmc.xgmi.node_id,
			adev->gmc.xgmi.hive_id, ret);
		goto out;
	}

	/* Update device pstate */
	adev->pstate = pstate;

	/*
	 * Update the hive pstate only when all devices of the hive
	 * are in the same pstate
	 */
	list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) {
		if (tmp_adev->pstate != adev->pstate) {
			update_hive_pstate = false;
			break;
		}
	}
	if (update_hive_pstate || is_high_pstate)
		hive->pstate = pstate;

out:
	mutex_unlock(&hive->hive_lock);

	return ret;
}
432 | ||
5183411b AG |
433 | int amdgpu_xgmi_update_topology(struct amdgpu_hive_info *hive, struct amdgpu_device *adev) |
434 | { | |
435 | int ret = -EINVAL; | |
436 | ||
437 | /* Each psp need to set the latest topology */ | |
438 | ret = psp_xgmi_set_topology_info(&adev->psp, | |
439 | hive->number_devices, | |
da361dd1 | 440 | &adev->psp.xgmi_context.top_info); |
5183411b AG |
441 | if (ret) |
442 | dev_err(adev->dev, | |
443 | "XGMI: Set topology failure on device %llx, hive %llx, ret %d", | |
444 | adev->gmc.xgmi.node_id, | |
445 | adev->gmc.xgmi.hive_id, ret); | |
5183411b AG |
446 | |
447 | return ret; | |
448 | } | |
449 | ||
da361dd1 | 450 | |
451 | int amdgpu_xgmi_get_hops_count(struct amdgpu_device *adev, | |
452 | struct amdgpu_device *peer_adev) | |
453 | { | |
454 | struct psp_xgmi_topology_info *top = &adev->psp.xgmi_context.top_info; | |
455 | int i; | |
456 | ||
457 | for (i = 0 ; i < top->num_nodes; ++i) | |
458 | if (top->nodes[i].node_id == peer_adev->gmc.xgmi.node_id) | |
459 | return top->nodes[i].num_hops; | |
460 | return -EINVAL; | |
461 | } | |
462 | ||
fb30fc59 SL |
463 | int amdgpu_xgmi_add_device(struct amdgpu_device *adev) |
464 | { | |
da361dd1 | 465 | struct psp_xgmi_topology_info *top_info; |
fb30fc59 SL |
466 | struct amdgpu_hive_info *hive; |
467 | struct amdgpu_xgmi *entry; | |
5183411b | 468 | struct amdgpu_device *tmp_adev = NULL; |
fb30fc59 | 469 | |
75b2fce2 | 470 | int count = 0, ret = 0; |
fb30fc59 | 471 | |
47622ba0 | 472 | if (!adev->gmc.xgmi.supported) |
fb30fc59 | 473 | return 0; |
47622ba0 | 474 | |
2f2eab3a | 475 | if (amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_PSP)) { |
0b9d3760 HZ |
476 | ret = psp_xgmi_initialize(&adev->psp); |
477 | if (ret) { | |
478 | dev_err(adev->dev, | |
479 | "XGMI: Failed to initialize xgmi session\n"); | |
480 | return ret; | |
481 | } | |
482 | ||
2f2eab3a OZ |
483 | ret = psp_xgmi_get_hive_id(&adev->psp, &adev->gmc.xgmi.hive_id); |
484 | if (ret) { | |
485 | dev_err(adev->dev, | |
486 | "XGMI: Failed to get hive id\n"); | |
487 | return ret; | |
488 | } | |
379c237e | 489 | |
2f2eab3a OZ |
490 | ret = psp_xgmi_get_node_id(&adev->psp, &adev->gmc.xgmi.node_id); |
491 | if (ret) { | |
492 | dev_err(adev->dev, | |
493 | "XGMI: Failed to get node id\n"); | |
494 | return ret; | |
495 | } | |
496 | } else { | |
497 | adev->gmc.xgmi.hive_id = 16; | |
498 | adev->gmc.xgmi.node_id = adev->gmc.xgmi.physical_node_id + 16; | |
379c237e | 499 | } |
fb30fc59 | 500 | |
22d6575b | 501 | hive = amdgpu_get_xgmi_hive(adev, 1); |
36ca09a0 | 502 | if (!hive) { |
503 | ret = -EINVAL; | |
504 | dev_err(adev->dev, | |
c1219b94 | 505 | "XGMI: node 0x%llx, can not match hive 0x%llx in the hive list.\n", |
36ca09a0 | 506 | adev->gmc.xgmi.node_id, adev->gmc.xgmi.hive_id); |
fb30fc59 | 507 | goto exit; |
36ca09a0 | 508 | } |
fb30fc59 | 509 | |
5c5b2ba0 EQ |
510 | /* Set default device pstate */ |
511 | adev->pstate = -1; | |
512 | ||
da361dd1 | 513 | top_info = &adev->psp.xgmi_context.top_info; |
5183411b | 514 | |
fb30fc59 SL |
515 | list_add_tail(&adev->gmc.xgmi.head, &hive->device_list); |
516 | list_for_each_entry(entry, &hive->device_list, head) | |
da361dd1 | 517 | top_info->nodes[count++].node_id = entry->node_id; |
e008299e | 518 | top_info->num_nodes = count; |
5183411b | 519 | hive->number_devices = count; |
fb30fc59 | 520 | |
f33a8770 AG |
521 | task_barrier_add_task(&hive->tb); |
522 | ||
75b2fce2 LM |
523 | if (amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_PSP)) { |
524 | list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) { | |
525 | /* update node list for other device in the hive */ | |
526 | if (tmp_adev != adev) { | |
527 | top_info = &tmp_adev->psp.xgmi_context.top_info; | |
528 | top_info->nodes[count - 1].node_id = | |
529 | adev->gmc.xgmi.node_id; | |
530 | top_info->num_nodes = count; | |
531 | } | |
532 | ret = amdgpu_xgmi_update_topology(hive, tmp_adev); | |
533 | if (ret) | |
534 | goto exit; | |
e008299e | 535 | } |
e008299e | 536 | |
75b2fce2 LM |
537 | /* get latest topology info for each device from psp */ |
538 | list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) { | |
539 | ret = psp_xgmi_get_topology_info(&tmp_adev->psp, count, | |
540 | &tmp_adev->psp.xgmi_context.top_info); | |
541 | if (ret) { | |
542 | dev_err(tmp_adev->dev, | |
543 | "XGMI: Get topology failure on device %llx, hive %llx, ret %d", | |
544 | tmp_adev->gmc.xgmi.node_id, | |
545 | tmp_adev->gmc.xgmi.hive_id, ret); | |
546 | /* To do : continue with some node failed or disable the whole hive */ | |
547 | goto exit; | |
548 | } | |
a82c1566 | 549 | } |
fb30fc59 | 550 | } |
a82c1566 | 551 | |
b1fa8c89 AG |
552 | if (!ret) |
553 | ret = amdgpu_xgmi_sysfs_add_dev_info(adev, hive); | |
554 | ||
e008299e | 555 | |
556 | mutex_unlock(&hive->hive_lock); | |
557 | exit: | |
b1fa8c89 AG |
558 | if (!ret) |
559 | dev_info(adev->dev, "XGMI: Add node %d, hive 0x%llx.\n", | |
560 | adev->gmc.xgmi.physical_node_id, adev->gmc.xgmi.hive_id); | |
561 | else | |
562 | dev_err(adev->dev, "XGMI: Failed to add node %d, hive 0x%llx ret: %d\n", | |
563 | adev->gmc.xgmi.physical_node_id, adev->gmc.xgmi.hive_id, | |
564 | ret); | |
565 | ||
fb30fc59 SL |
566 | return ret; |
567 | } | |
a82400b5 | 568 | |
/*
 * amdgpu_xgmi_remove_device - detach a device from its hive and
 * terminate its PSP XGMI session.
 *
 * Returns the psp_xgmi_terminate() result, or -EINVAL if the device
 * does not support XGMI or no hive is found.
 */
int amdgpu_xgmi_remove_device(struct amdgpu_device *adev)
{
	struct amdgpu_hive_info *hive;

	if (!adev->gmc.xgmi.supported)
		return -EINVAL;

	/* Returns with hive->hive_lock held (lock == 1). */
	hive = amdgpu_get_xgmi_hive(adev, 1);
	if (!hive)
		return -EINVAL;

	/*
	 * NOTE(review): this post-decrement test is only true when
	 * number_devices is already 0 — i.e. the destroy branch runs one
	 * call *after* the last device left, and underflows the count.
	 * The destroy branch also destroys hive_lock while it is still
	 * held, and the device is never removed from hive->device_list.
	 * Verify against later upstream rework before relying on this.
	 */
	if (!(hive->number_devices--)) {
		amdgpu_xgmi_sysfs_destroy(adev, hive);
		mutex_destroy(&hive->hive_lock);
		mutex_destroy(&hive->reset_lock);
	} else {
		task_barrier_rem_task(&hive->tb);
		amdgpu_xgmi_sysfs_rem_dev_info(adev, hive);
		mutex_unlock(&hive->hive_lock);
	}

	return psp_xgmi_terminate(&adev->psp);
}
029fbd43 HZ |
592 | |
593 | int amdgpu_xgmi_ras_late_init(struct amdgpu_device *adev) | |
594 | { | |
595 | int r; | |
596 | struct ras_ih_if ih_info = { | |
597 | .cb = NULL, | |
598 | }; | |
599 | struct ras_fs_if fs_info = { | |
600 | .sysfs_name = "xgmi_wafl_err_count", | |
029fbd43 HZ |
601 | }; |
602 | ||
603 | if (!adev->gmc.xgmi.supported || | |
604 | adev->gmc.xgmi.num_physical_nodes == 0) | |
605 | return 0; | |
606 | ||
607 | if (!adev->gmc.xgmi.ras_if) { | |
608 | adev->gmc.xgmi.ras_if = kmalloc(sizeof(struct ras_common_if), GFP_KERNEL); | |
609 | if (!adev->gmc.xgmi.ras_if) | |
610 | return -ENOMEM; | |
611 | adev->gmc.xgmi.ras_if->block = AMDGPU_RAS_BLOCK__XGMI_WAFL; | |
612 | adev->gmc.xgmi.ras_if->type = AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE; | |
613 | adev->gmc.xgmi.ras_if->sub_block_index = 0; | |
614 | strcpy(adev->gmc.xgmi.ras_if->name, "xgmi_wafl"); | |
615 | } | |
616 | ih_info.head = fs_info.head = *adev->gmc.xgmi.ras_if; | |
617 | r = amdgpu_ras_late_init(adev, adev->gmc.xgmi.ras_if, | |
618 | &fs_info, &ih_info); | |
619 | if (r || !amdgpu_ras_is_supported(adev, adev->gmc.xgmi.ras_if->block)) { | |
620 | kfree(adev->gmc.xgmi.ras_if); | |
621 | adev->gmc.xgmi.ras_if = NULL; | |
622 | } | |
623 | ||
624 | return r; | |
625 | } | |
be5b39d8 TZ |
626 | |
627 | void amdgpu_xgmi_ras_fini(struct amdgpu_device *adev) | |
628 | { | |
629 | if (amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__XGMI_WAFL) && | |
630 | adev->gmc.xgmi.ras_if) { | |
631 | struct ras_common_if *ras_if = adev->gmc.xgmi.ras_if; | |
632 | struct ras_ih_if ih_info = { | |
633 | .cb = NULL, | |
634 | }; | |
635 | ||
636 | amdgpu_ras_late_fini(adev, ras_if, &ih_info); | |
637 | kfree(ras_if); | |
638 | } | |
639 | } | |
19744f5f HZ |
640 | |
/*
 * amdgpu_xgmi_get_relative_phy_addr - convert a node-local address to a
 * hive-relative physical address by adding this node's DRAM base.
 *
 * DF-Cstate must be disallowed around the DF register reads; on any
 * failure the input address is returned unchanged.
 */
uint64_t amdgpu_xgmi_get_relative_phy_addr(struct amdgpu_device *adev,
					   uint64_t addr)
{
	uint32_t df_inst_id;
	uint64_t dram_base_addr = 0;
	const struct amdgpu_df_funcs *df_funcs = adev->df.funcs;

	/* Both DF callbacks are required for the translation. */
	if ((!df_funcs) ||
	    (!df_funcs->get_df_inst_id) ||
	    (!df_funcs->get_dram_base_addr)) {
		dev_warn(adev->dev,
			 "XGMI: relative phy_addr algorithm is not supported\n");
		return addr;
	}

	/* DF registers may be unreadable while DF-Cstate is allowed. */
	if (amdgpu_dpm_set_df_cstate(adev, DF_CSTATE_DISALLOW)) {
		dev_warn(adev->dev,
			 "failed to disable DF-Cstate, DF register may not be accessible\n");
		return addr;
	}

	df_inst_id = df_funcs->get_df_inst_id(adev);
	dram_base_addr = df_funcs->get_dram_base_addr(adev, df_inst_id);

	/* Re-enable even if the reads succeeded; warn-only on failure. */
	if (amdgpu_dpm_set_df_cstate(adev, DF_CSTATE_ALLOW))
		dev_warn(adev->dev, "failed to enable DF-Cstate\n");

	return addr + dram_base_addr;
}
18f36157 HZ |
670 | |
671 | static int amdgpu_xgmi_query_pcs_error_status(struct amdgpu_device *adev, | |
672 | uint32_t value, | |
673 | uint32_t *ue_count, | |
674 | uint32_t *ce_count, | |
675 | bool is_xgmi_pcs) | |
676 | { | |
677 | int i; | |
678 | int ue_cnt; | |
679 | ||
680 | if (is_xgmi_pcs) { | |
681 | /* query xgmi pcs error status, | |
682 | * only ue is supported */ | |
683 | for (i = 0; i < ARRAY_SIZE(xgmi_pcs_ras_fields); i ++) { | |
684 | ue_cnt = (value & | |
685 | xgmi_pcs_ras_fields[i].pcs_err_mask) >> | |
686 | xgmi_pcs_ras_fields[i].pcs_err_shift; | |
687 | if (ue_cnt) { | |
688 | dev_info(adev->dev, "%s detected\n", | |
689 | xgmi_pcs_ras_fields[i].err_name); | |
690 | *ue_count += ue_cnt; | |
691 | } | |
692 | } | |
693 | } else { | |
694 | /* query wafl pcs error status, | |
695 | * only ue is supported */ | |
696 | for (i = 0; i < ARRAY_SIZE(wafl_pcs_ras_fields); i++) { | |
697 | ue_cnt = (value & | |
698 | wafl_pcs_ras_fields[i].pcs_err_mask) >> | |
699 | wafl_pcs_ras_fields[i].pcs_err_shift; | |
700 | if (ue_cnt) { | |
701 | dev_info(adev->dev, "%s detected\n", | |
702 | wafl_pcs_ras_fields[i].err_name); | |
703 | *ue_count += ue_cnt; | |
704 | } | |
705 | } | |
706 | } | |
707 | ||
708 | return 0; | |
709 | } | |
710 | ||
/*
 * amdgpu_xgmi_query_ras_error_count - read every XGMI and WAFL PCS
 * error-status register for the ASIC and fill in the RAS error counts.
 *
 * @ras_error_status: struct ras_err_data * cast through void *; its
 *                    ue_count/ce_count are reset and then accumulated.
 *
 * Returns 0 on success, -EINVAL when the XGMI/WAFL RAS block is not
 * supported on this device.
 */
int amdgpu_xgmi_query_ras_error_count(struct amdgpu_device *adev,
				      void *ras_error_status)
{
	struct ras_err_data *err_data = (struct ras_err_data *)ras_error_status;
	int i;
	uint32_t data;
	uint32_t ue_cnt = 0, ce_cnt = 0;

	if (!amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__XGMI_WAFL))
		return -EINVAL;

	err_data->ue_count = 0;
	err_data->ce_count = 0;

	/* Register lists differ per ASIC; Vega20's are also the default. */
	switch (adev->asic_type) {
	case CHIP_ARCTURUS:
		/* check xgmi pcs error */
		for (i = 0; i < ARRAY_SIZE(xgmi_pcs_err_status_reg_arct); i++) {
			data = RREG32_PCIE(xgmi_pcs_err_status_reg_arct[i]);
			if (data)
				amdgpu_xgmi_query_pcs_error_status(adev,
						data, &ue_cnt, &ce_cnt, true);
		}
		/* check wafl pcs error */
		for (i = 0; i < ARRAY_SIZE(wafl_pcs_err_status_reg_arct); i++) {
			data = RREG32_PCIE(wafl_pcs_err_status_reg_arct[i]);
			if (data)
				amdgpu_xgmi_query_pcs_error_status(adev,
						data, &ue_cnt, &ce_cnt, false);
		}
		break;
	case CHIP_VEGA20:
	default:
		/* check xgmi pcs error */
		for (i = 0; i < ARRAY_SIZE(xgmi_pcs_err_status_reg_vg20); i++) {
			data = RREG32_PCIE(xgmi_pcs_err_status_reg_vg20[i]);
			if (data)
				amdgpu_xgmi_query_pcs_error_status(adev,
						data, &ue_cnt, &ce_cnt, true);
		}
		/* check wafl pcs error */
		for (i = 0; i < ARRAY_SIZE(wafl_pcs_err_status_reg_vg20); i++) {
			data = RREG32_PCIE(wafl_pcs_err_status_reg_vg20[i]);
			if (data)
				amdgpu_xgmi_query_pcs_error_status(adev,
						data, &ue_cnt, &ce_cnt, false);
		}
		break;
	}

	/* ce_cnt stays 0 today (UE-only registers) but is kept for symmetry. */
	err_data->ue_count += ue_cnt;
	err_data->ce_count += ce_cnt;

	return 0;
}