Commit | Line | Data |
---|---|---|
fb30fc59 SL |
1 | /* |
2 | * Copyright 2018 Advanced Micro Devices, Inc. | |
3 | * | |
4 | * Permission is hereby granted, free of charge, to any person obtaining a | |
5 | * copy of this software and associated documentation files (the "Software"), | |
6 | * to deal in the Software without restriction, including without limitation | |
7 | * the rights to use, copy, modify, merge, publish, distribute, sublicense, | |
8 | * and/or sell copies of the Software, and to permit persons to whom the | |
9 | * Software is furnished to do so, subject to the following conditions: | |
10 | * | |
11 | * The above copyright notice and this permission notice shall be included in | |
12 | * all copies or substantial portions of the Software. | |
13 | * | |
14 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | |
15 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | |
16 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL | |
17 | * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR | |
18 | * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, | |
19 | * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR | |
20 | * OTHER DEALINGS IN THE SOFTWARE. | |
21 | * | |
22 | * | |
23 | */ | |
24 | #include <linux/list.h> | |
25 | #include "amdgpu.h" | |
5183411b | 26 | #include "amdgpu_xgmi.h" |
93abb05f | 27 | #include "amdgpu_smu.h" |
029fbd43 | 28 | #include "amdgpu_ras.h" |
18f36157 | 29 | #include "soc15.h" |
24f9aacf | 30 | #include "df/df_3_6_offset.h" |
18f36157 HZ |
31 | #include "xgmi/xgmi_4_0_0_smn.h" |
32 | #include "xgmi/xgmi_4_0_0_sh_mask.h" | |
33 | #include "wafl/wafl2_4_0_0_smn.h" | |
34 | #include "wafl/wafl2_4_0_0_sh_mask.h" | |
fb30fc59 SL |
35 | |
/*
 * Value a new hive's hi_req_count starts from (see amdgpu_get_xgmi_hive()).
 * NOTE(review): the name suggests this is also the maximum hive size - confirm.
 */
#define AMDGPU_MAX_XGMI_DEVICE_PER_HIVE		4

/* Held while xgmi_hive_list is searched or modified. */
static DEFINE_MUTEX(xgmi_mutex);

/* Global list of hives, linked through amdgpu_hive_info::node. */
static LIST_HEAD(xgmi_hive_list);
fb30fc59 | 41 | |
18f36157 HZ |
42 | static const int xgmi_pcs_err_status_reg_vg20[] = { |
43 | smnXGMI0_PCS_GOPX16_PCS_ERROR_STATUS, | |
44 | smnXGMI0_PCS_GOPX16_PCS_ERROR_STATUS + 0x100000, | |
45 | }; | |
46 | ||
47 | static const int wafl_pcs_err_status_reg_vg20[] = { | |
48 | smnPCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, | |
49 | smnPCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS + 0x100000, | |
50 | }; | |
51 | ||
a61f41b1 HZ |
52 | static const int xgmi_pcs_err_status_reg_arct[] = { |
53 | smnXGMI0_PCS_GOPX16_PCS_ERROR_STATUS, | |
54 | smnXGMI0_PCS_GOPX16_PCS_ERROR_STATUS + 0x100000, | |
55 | smnXGMI0_PCS_GOPX16_PCS_ERROR_STATUS + 0x500000, | |
56 | smnXGMI0_PCS_GOPX16_PCS_ERROR_STATUS + 0x600000, | |
57 | smnXGMI0_PCS_GOPX16_PCS_ERROR_STATUS + 0x700000, | |
58 | smnXGMI0_PCS_GOPX16_PCS_ERROR_STATUS + 0x800000, | |
59 | }; | |
60 | ||
61 | /* same as vg20*/ | |
62 | static const int wafl_pcs_err_status_reg_arct[] = { | |
63 | smnPCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, | |
64 | smnPCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS + 0x100000, | |
65 | }; | |
66 | ||
18f36157 HZ |
67 | static const struct amdgpu_pcs_ras_field xgmi_pcs_ras_fields[] = { |
68 | {"XGMI PCS DataLossErr", | |
69 | SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, DataLossErr)}, | |
70 | {"XGMI PCS TrainingErr", | |
71 | SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, TrainingErr)}, | |
72 | {"XGMI PCS CRCErr", | |
73 | SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, CRCErr)}, | |
74 | {"XGMI PCS BERExceededErr", | |
75 | SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, BERExceededErr)}, | |
76 | {"XGMI PCS TxMetaDataErr", | |
77 | SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, TxMetaDataErr)}, | |
78 | {"XGMI PCS ReplayBufParityErr", | |
79 | SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, ReplayBufParityErr)}, | |
80 | {"XGMI PCS DataParityErr", | |
81 | SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, DataParityErr)}, | |
82 | {"XGMI PCS ReplayFifoOverflowErr", | |
83 | SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, ReplayFifoOverflowErr)}, | |
84 | {"XGMI PCS ReplayFifoUnderflowErr", | |
85 | SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, ReplayFifoUnderflowErr)}, | |
86 | {"XGMI PCS ElasticFifoOverflowErr", | |
87 | SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, ElasticFifoOverflowErr)}, | |
88 | {"XGMI PCS DeskewErr", | |
89 | SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, DeskewErr)}, | |
90 | {"XGMI PCS DataStartupLimitErr", | |
91 | SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, DataStartupLimitErr)}, | |
92 | {"XGMI PCS FCInitTimeoutErr", | |
93 | SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, FCInitTimeoutErr)}, | |
94 | {"XGMI PCS RecoveryTimeoutErr", | |
95 | SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, RecoveryTimeoutErr)}, | |
96 | {"XGMI PCS ReadySerialTimeoutErr", | |
97 | SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, ReadySerialTimeoutErr)}, | |
98 | {"XGMI PCS ReadySerialAttemptErr", | |
99 | SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, ReadySerialAttemptErr)}, | |
100 | {"XGMI PCS RecoveryAttemptErr", | |
101 | SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, RecoveryAttemptErr)}, | |
102 | {"XGMI PCS RecoveryRelockAttemptErr", | |
103 | SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, RecoveryRelockAttemptErr)}, | |
104 | }; | |
105 | ||
106 | static const struct amdgpu_pcs_ras_field wafl_pcs_ras_fields[] = { | |
107 | {"WAFL PCS DataLossErr", | |
108 | SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, DataLossErr)}, | |
109 | {"WAFL PCS TrainingErr", | |
110 | SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, TrainingErr)}, | |
111 | {"WAFL PCS CRCErr", | |
112 | SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, CRCErr)}, | |
113 | {"WAFL PCS BERExceededErr", | |
114 | SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, BERExceededErr)}, | |
115 | {"WAFL PCS TxMetaDataErr", | |
116 | SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, TxMetaDataErr)}, | |
117 | {"WAFL PCS ReplayBufParityErr", | |
118 | SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, ReplayBufParityErr)}, | |
119 | {"WAFL PCS DataParityErr", | |
120 | SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, DataParityErr)}, | |
121 | {"WAFL PCS ReplayFifoOverflowErr", | |
122 | SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, ReplayFifoOverflowErr)}, | |
123 | {"WAFL PCS ReplayFifoUnderflowErr", | |
124 | SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, ReplayFifoUnderflowErr)}, | |
125 | {"WAFL PCS ElasticFifoOverflowErr", | |
126 | SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, ElasticFifoOverflowErr)}, | |
127 | {"WAFL PCS DeskewErr", | |
128 | SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, DeskewErr)}, | |
129 | {"WAFL PCS DataStartupLimitErr", | |
130 | SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, DataStartupLimitErr)}, | |
131 | {"WAFL PCS FCInitTimeoutErr", | |
132 | SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, FCInitTimeoutErr)}, | |
133 | {"WAFL PCS RecoveryTimeoutErr", | |
134 | SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, RecoveryTimeoutErr)}, | |
135 | {"WAFL PCS ReadySerialTimeoutErr", | |
136 | SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, ReadySerialTimeoutErr)}, | |
137 | {"WAFL PCS ReadySerialAttemptErr", | |
138 | SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, ReadySerialAttemptErr)}, | |
139 | {"WAFL PCS RecoveryAttemptErr", | |
140 | SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, RecoveryAttemptErr)}, | |
141 | {"WAFL PCS RecoveryRelockAttemptErr", | |
142 | SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, RecoveryRelockAttemptErr)}, | |
143 | }; | |
144 | ||
1c1e53f7 TSD |
145 | /** |
146 | * DOC: AMDGPU XGMI Support | |
147 | * | |
148 | * XGMI is a high speed interconnect that joins multiple GPU cards | |
149 | * into a homogeneous memory space that is organized by a collective | |
150 | * hive ID and individual node IDs, both of which are 64-bit numbers. | |
151 | * | |
152 | * The file xgmi_device_id contains the unique per GPU device ID and | |
153 | * is stored in the /sys/class/drm/card${cardno}/device/ directory. | |
154 | * | |
155 | * Inside the device directory a sub-directory 'xgmi_hive_info' is | |
156 | * created which contains the hive ID and the list of nodes. | |
157 | * | |
158 | * The hive ID is stored in: | |
159 | * /sys/class/drm/card${cardno}/device/xgmi_hive_info/xgmi_hive_id | |
160 | * | |
161 | * The node information is stored in numbered directories: | |
162 | * /sys/class/drm/card${cardno}/device/xgmi_hive_info/node${nodeno}/xgmi_device_id | |
163 | * | |
164 | * Each device has its own xgmi_hive_info directory with a mirror | |
165 | * set of node sub-directories. | |
166 | * | |
167 | * The XGMI memory space is built by contiguously adding the power of | |
168 | * two padded VRAM space from each node to each other. | |
169 | * | |
170 | */ | |
171 | ||
d95e8e97 DL |
172 | static struct attribute amdgpu_xgmi_hive_id = { |
173 | .name = "xgmi_hive_id", | |
174 | .mode = S_IRUGO | |
175 | }; | |
1c1e53f7 | 176 | |
d95e8e97 DL |
177 | static struct attribute *amdgpu_xgmi_hive_attrs[] = { |
178 | &amdgpu_xgmi_hive_id, | |
179 | NULL | |
180 | }; | |
b1fa8c89 | 181 | |
d95e8e97 DL |
182 | static ssize_t amdgpu_xgmi_show_attrs(struct kobject *kobj, |
183 | struct attribute *attr, char *buf) | |
b1fa8c89 | 184 | { |
d95e8e97 DL |
185 | struct amdgpu_hive_info *hive = container_of( |
186 | kobj, struct amdgpu_hive_info, kobj); | |
b1fa8c89 | 187 | |
d95e8e97 DL |
188 | if (attr == &amdgpu_xgmi_hive_id) |
189 | return snprintf(buf, PAGE_SIZE, "%llu\n", hive->hive_id); | |
b1fa8c89 | 190 | |
d95e8e97 | 191 | return 0; |
b1fa8c89 AG |
192 | } |
193 | ||
d95e8e97 | 194 | static void amdgpu_xgmi_hive_release(struct kobject *kobj) |
b1fa8c89 | 195 | { |
d95e8e97 DL |
196 | struct amdgpu_hive_info *hive = container_of( |
197 | kobj, struct amdgpu_hive_info, kobj); | |
198 | ||
199 | mutex_destroy(&hive->hive_lock); | |
200 | kfree(hive); | |
b1fa8c89 AG |
201 | } |
202 | ||
d95e8e97 DL |
203 | static const struct sysfs_ops amdgpu_xgmi_hive_ops = { |
204 | .show = amdgpu_xgmi_show_attrs, | |
205 | }; | |
206 | ||
207 | struct kobj_type amdgpu_xgmi_hive_type = { | |
208 | .release = amdgpu_xgmi_hive_release, | |
209 | .sysfs_ops = &amdgpu_xgmi_hive_ops, | |
210 | .default_attrs = amdgpu_xgmi_hive_attrs, | |
211 | }; | |
212 | ||
b1fa8c89 AG |
213 | static ssize_t amdgpu_xgmi_show_device_id(struct device *dev, |
214 | struct device_attribute *attr, | |
215 | char *buf) | |
216 | { | |
217 | struct drm_device *ddev = dev_get_drvdata(dev); | |
1348969a | 218 | struct amdgpu_device *adev = drm_to_adev(ddev); |
b1fa8c89 AG |
219 | |
220 | return snprintf(buf, PAGE_SIZE, "%llu\n", adev->gmc.xgmi.node_id); | |
221 | ||
222 | } | |
223 | ||
24f9aacf JK |
224 | #define AMDGPU_XGMI_SET_FICAA(o) ((o) | 0x456801) |
225 | static ssize_t amdgpu_xgmi_show_error(struct device *dev, | |
226 | struct device_attribute *attr, | |
227 | char *buf) | |
228 | { | |
229 | struct drm_device *ddev = dev_get_drvdata(dev); | |
1348969a | 230 | struct amdgpu_device *adev = drm_to_adev(ddev); |
24f9aacf JK |
231 | uint32_t ficaa_pie_ctl_in, ficaa_pie_status_in; |
232 | uint64_t fica_out; | |
233 | unsigned int error_count = 0; | |
234 | ||
235 | ficaa_pie_ctl_in = AMDGPU_XGMI_SET_FICAA(0x200); | |
236 | ficaa_pie_status_in = AMDGPU_XGMI_SET_FICAA(0x208); | |
b1fa8c89 | 237 | |
bdf84a80 | 238 | fica_out = adev->df.funcs->get_fica(adev, ficaa_pie_ctl_in); |
24f9aacf JK |
239 | if (fica_out != 0x1f) |
240 | pr_err("xGMI error counters not enabled!\n"); | |
241 | ||
bdf84a80 | 242 | fica_out = adev->df.funcs->get_fica(adev, ficaa_pie_status_in); |
24f9aacf JK |
243 | |
244 | if ((fica_out & 0xffff) == 2) | |
245 | error_count = ((fica_out >> 62) & 0x1) + (fica_out >> 63); | |
b1fa8c89 | 246 | |
bdf84a80 | 247 | adev->df.funcs->set_fica(adev, ficaa_pie_status_in, 0, 0); |
24f9aacf | 248 | |
73e34336 | 249 | return snprintf(buf, PAGE_SIZE, "%u\n", error_count); |
24f9aacf JK |
250 | } |
251 | ||
252 | ||
253 | static DEVICE_ATTR(xgmi_device_id, S_IRUGO, amdgpu_xgmi_show_device_id, NULL); | |
254 | static DEVICE_ATTR(xgmi_error, S_IRUGO, amdgpu_xgmi_show_error, NULL); | |
b1fa8c89 AG |
255 | |
256 | static int amdgpu_xgmi_sysfs_add_dev_info(struct amdgpu_device *adev, | |
257 | struct amdgpu_hive_info *hive) | |
258 | { | |
259 | int ret = 0; | |
260 | char node[10] = { 0 }; | |
261 | ||
262 | /* Create xgmi device id file */ | |
263 | ret = device_create_file(adev->dev, &dev_attr_xgmi_device_id); | |
264 | if (ret) { | |
265 | dev_err(adev->dev, "XGMI: Failed to create device file xgmi_device_id\n"); | |
266 | return ret; | |
267 | } | |
268 | ||
24f9aacf JK |
269 | /* Create xgmi error file */ |
270 | ret = device_create_file(adev->dev, &dev_attr_xgmi_error); | |
271 | if (ret) | |
272 | pr_err("failed to create xgmi_error\n"); | |
273 | ||
274 | ||
b1fa8c89 | 275 | /* Create sysfs link to hive info folder on the first device */ |
d95e8e97 DL |
276 | if (hive->kobj.parent != (&adev->dev->kobj)) { |
277 | ret = sysfs_create_link(&adev->dev->kobj, &hive->kobj, | |
b1fa8c89 AG |
278 | "xgmi_hive_info"); |
279 | if (ret) { | |
280 | dev_err(adev->dev, "XGMI: Failed to create link to hive info"); | |
281 | goto remove_file; | |
282 | } | |
283 | } | |
284 | ||
d95e8e97 | 285 | sprintf(node, "node%d", atomic_read(&hive->number_devices)); |
b1fa8c89 | 286 | /* Create sysfs link form the hive folder to yourself */ |
d95e8e97 | 287 | ret = sysfs_create_link(&hive->kobj, &adev->dev->kobj, node); |
b1fa8c89 AG |
288 | if (ret) { |
289 | dev_err(adev->dev, "XGMI: Failed to create link from hive info"); | |
290 | goto remove_link; | |
291 | } | |
292 | ||
293 | goto success; | |
294 | ||
295 | ||
296 | remove_link: | |
4a580877 | 297 | sysfs_remove_link(&adev->dev->kobj, adev_to_drm(adev)->unique); |
b1fa8c89 AG |
298 | |
299 | remove_file: | |
300 | device_remove_file(adev->dev, &dev_attr_xgmi_device_id); | |
301 | ||
302 | success: | |
303 | return ret; | |
304 | } | |
305 | ||
306 | static void amdgpu_xgmi_sysfs_rem_dev_info(struct amdgpu_device *adev, | |
307 | struct amdgpu_hive_info *hive) | |
308 | { | |
a89b5dae JZ |
309 | char node[10]; |
310 | memset(node, 0, sizeof(node)); | |
311 | ||
b1fa8c89 | 312 | device_remove_file(adev->dev, &dev_attr_xgmi_device_id); |
a89b5dae JZ |
313 | device_remove_file(adev->dev, &dev_attr_xgmi_error); |
314 | ||
d95e8e97 | 315 | if (hive->kobj.parent != (&adev->dev->kobj)) |
a89b5dae JZ |
316 | sysfs_remove_link(&adev->dev->kobj,"xgmi_hive_info"); |
317 | ||
d95e8e97 DL |
318 | sprintf(node, "node%d", atomic_read(&hive->number_devices)); |
319 | sysfs_remove_link(&hive->kobj, node); | |
a89b5dae | 320 | |
b1fa8c89 AG |
321 | } |
322 | ||
323 | ||
324 | ||
d95e8e97 | 325 | struct amdgpu_hive_info *amdgpu_get_xgmi_hive(struct amdgpu_device *adev) |
fb30fc59 | 326 | { |
d95e8e97 DL |
327 | struct amdgpu_hive_info *hive = NULL, *tmp = NULL; |
328 | int ret; | |
fb30fc59 SL |
329 | |
330 | if (!adev->gmc.xgmi.hive_id) | |
331 | return NULL; | |
22d6575b | 332 | |
d95e8e97 DL |
333 | if (adev->hive) { |
334 | kobject_get(&adev->hive->kobj); | |
335 | return adev->hive; | |
336 | } | |
337 | ||
22d6575b TSD |
338 | mutex_lock(&xgmi_mutex); |
339 | ||
d95e8e97 DL |
340 | if (!list_empty(&xgmi_hive_list)) { |
341 | list_for_each_entry_safe(hive, tmp, &xgmi_hive_list, node) { | |
342 | if (hive->hive_id == adev->gmc.xgmi.hive_id) | |
343 | goto pro_end; | |
22d6575b | 344 | } |
fb30fc59 | 345 | } |
d95e8e97 DL |
346 | |
347 | hive = kzalloc(sizeof(*hive), GFP_KERNEL); | |
348 | if (!hive) { | |
349 | dev_err(adev->dev, "XGMI: allocation failed\n"); | |
350 | hive = NULL; | |
351 | goto pro_end; | |
22d6575b | 352 | } |
fb30fc59 SL |
353 | |
354 | /* initialize new hive if not exist */ | |
d95e8e97 DL |
355 | ret = kobject_init_and_add(&hive->kobj, |
356 | &amdgpu_xgmi_hive_type, | |
357 | &adev->dev->kobj, | |
358 | "%s", "xgmi_hive_info"); | |
359 | if (ret) { | |
360 | dev_err(adev->dev, "XGMI: failed initializing kobject for xgmi hive\n"); | |
361 | kfree(hive); | |
362 | hive = NULL; | |
363 | goto pro_end; | |
b1fa8c89 AG |
364 | } |
365 | ||
d95e8e97 DL |
366 | hive->hive_id = adev->gmc.xgmi.hive_id; |
367 | INIT_LIST_HEAD(&hive->device_list); | |
368 | INIT_LIST_HEAD(&hive->node); | |
369 | mutex_init(&hive->hive_lock); | |
370 | atomic_set(&hive->in_reset, 0); | |
371 | atomic_set(&hive->number_devices, 0); | |
372 | task_barrier_init(&hive->tb); | |
373 | hive->pstate = AMDGPU_XGMI_PSTATE_UNKNOWN; | |
374 | hive->hi_req_gpu = NULL; | |
d84a430d JK |
375 | /* |
376 | * hive pstate on boot is high in vega20 so we have to go to low | |
377 | * pstate on after boot. | |
378 | */ | |
d95e8e97 DL |
379 | hive->hi_req_count = AMDGPU_MAX_XGMI_DEVICE_PER_HIVE; |
380 | list_add_tail(&hive->node, &xgmi_hive_list); | |
381 | ||
382 | pro_end: | |
383 | if (hive) | |
384 | kobject_get(&hive->kobj); | |
22d6575b | 385 | mutex_unlock(&xgmi_mutex); |
d95e8e97 DL |
386 | return hive; |
387 | } | |
ed2bf522 | 388 | |
d95e8e97 DL |
389 | void amdgpu_put_xgmi_hive(struct amdgpu_hive_info *hive) |
390 | { | |
391 | if (hive) | |
392 | kobject_put(&hive->kobj); | |
fb30fc59 SL |
393 | } |
394 | ||
df399b06 | 395 | int amdgpu_xgmi_set_pstate(struct amdgpu_device *adev, int pstate) |
396 | { | |
397 | int ret = 0; | |
a9f5f98f HZ |
398 | struct amdgpu_hive_info *hive; |
399 | struct amdgpu_device *request_adev; | |
d84a430d | 400 | bool is_hi_req = pstate == AMDGPU_XGMI_PSTATE_MAX_VEGA20; |
a9f5f98f | 401 | bool init_low; |
df399b06 | 402 | |
a9f5f98f HZ |
403 | hive = amdgpu_get_xgmi_hive(adev); |
404 | if (!hive) | |
405 | return 0; | |
406 | ||
407 | request_adev = hive->hi_req_gpu ? hive->hi_req_gpu : adev; | |
408 | init_low = hive->pstate == AMDGPU_XGMI_PSTATE_UNKNOWN; | |
d95e8e97 | 409 | amdgpu_put_xgmi_hive(hive); |
d84a430d | 410 | /* fw bug so temporarily disable pstate switching */ |
dfe31f25 JK |
411 | return 0; |
412 | ||
413 | if (!hive || adev->asic_type != CHIP_VEGA20) | |
df399b06 | 414 | return 0; |
415 | ||
f1403342 | 416 | mutex_lock(&hive->hive_lock); |
5c5b2ba0 | 417 | |
d84a430d JK |
418 | if (is_hi_req) |
419 | hive->hi_req_count++; | |
420 | else | |
421 | hive->hi_req_count--; | |
422 | ||
423 | /* | |
424 | * Vega20 only needs single peer to request pstate high for the hive to | |
425 | * go high but all peers must request pstate low for the hive to go low | |
426 | */ | |
427 | if (hive->pstate == pstate || | |
428 | (!is_hi_req && hive->hi_req_count && !init_low)) | |
cb5932f8 | 429 | goto out; |
93abb05f | 430 | |
d84a430d | 431 | dev_dbg(request_adev->dev, "Set xgmi pstate %d.\n", pstate); |
93abb05f | 432 | |
d84a430d | 433 | ret = amdgpu_dpm_set_xgmi_pstate(request_adev, pstate); |
5c5b2ba0 | 434 | if (ret) { |
d84a430d | 435 | dev_err(request_adev->dev, |
93abb05f | 436 | "XGMI: Set pstate failure on device %llx, hive %llx, ret %d", |
d84a430d JK |
437 | request_adev->gmc.xgmi.node_id, |
438 | request_adev->gmc.xgmi.hive_id, ret); | |
5c5b2ba0 EQ |
439 | goto out; |
440 | } | |
441 | ||
d84a430d JK |
442 | if (init_low) |
443 | hive->pstate = hive->hi_req_count ? | |
444 | hive->pstate : AMDGPU_XGMI_PSTATE_MIN; | |
445 | else { | |
5c5b2ba0 | 446 | hive->pstate = pstate; |
d84a430d JK |
447 | hive->hi_req_gpu = pstate != AMDGPU_XGMI_PSTATE_MIN ? |
448 | adev : NULL; | |
449 | } | |
5c5b2ba0 | 450 | out: |
f1403342 | 451 | mutex_unlock(&hive->hive_lock); |
df399b06 | 452 | return ret; |
453 | } | |
454 | ||
5183411b AG |
455 | int amdgpu_xgmi_update_topology(struct amdgpu_hive_info *hive, struct amdgpu_device *adev) |
456 | { | |
29c1ec24 | 457 | int ret; |
5183411b AG |
458 | |
459 | /* Each psp need to set the latest topology */ | |
460 | ret = psp_xgmi_set_topology_info(&adev->psp, | |
d95e8e97 | 461 | atomic_read(&hive->number_devices), |
da361dd1 | 462 | &adev->psp.xgmi_context.top_info); |
5183411b AG |
463 | if (ret) |
464 | dev_err(adev->dev, | |
465 | "XGMI: Set topology failure on device %llx, hive %llx, ret %d", | |
466 | adev->gmc.xgmi.node_id, | |
467 | adev->gmc.xgmi.hive_id, ret); | |
5183411b AG |
468 | |
469 | return ret; | |
470 | } | |
471 | ||
da361dd1 | 472 | |
473 | int amdgpu_xgmi_get_hops_count(struct amdgpu_device *adev, | |
474 | struct amdgpu_device *peer_adev) | |
475 | { | |
476 | struct psp_xgmi_topology_info *top = &adev->psp.xgmi_context.top_info; | |
477 | int i; | |
478 | ||
479 | for (i = 0 ; i < top->num_nodes; ++i) | |
480 | if (top->nodes[i].node_id == peer_adev->gmc.xgmi.node_id) | |
481 | return top->nodes[i].num_hops; | |
482 | return -EINVAL; | |
483 | } | |
484 | ||
fb30fc59 SL |
485 | int amdgpu_xgmi_add_device(struct amdgpu_device *adev) |
486 | { | |
da361dd1 | 487 | struct psp_xgmi_topology_info *top_info; |
fb30fc59 SL |
488 | struct amdgpu_hive_info *hive; |
489 | struct amdgpu_xgmi *entry; | |
5183411b | 490 | struct amdgpu_device *tmp_adev = NULL; |
fb30fc59 | 491 | |
75b2fce2 | 492 | int count = 0, ret = 0; |
fb30fc59 | 493 | |
47622ba0 | 494 | if (!adev->gmc.xgmi.supported) |
fb30fc59 | 495 | return 0; |
47622ba0 | 496 | |
2f2eab3a | 497 | if (amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_PSP)) { |
0b9d3760 HZ |
498 | ret = psp_xgmi_initialize(&adev->psp); |
499 | if (ret) { | |
500 | dev_err(adev->dev, | |
501 | "XGMI: Failed to initialize xgmi session\n"); | |
502 | return ret; | |
503 | } | |
504 | ||
2f2eab3a OZ |
505 | ret = psp_xgmi_get_hive_id(&adev->psp, &adev->gmc.xgmi.hive_id); |
506 | if (ret) { | |
507 | dev_err(adev->dev, | |
508 | "XGMI: Failed to get hive id\n"); | |
509 | return ret; | |
510 | } | |
379c237e | 511 | |
2f2eab3a OZ |
512 | ret = psp_xgmi_get_node_id(&adev->psp, &adev->gmc.xgmi.node_id); |
513 | if (ret) { | |
514 | dev_err(adev->dev, | |
515 | "XGMI: Failed to get node id\n"); | |
516 | return ret; | |
517 | } | |
518 | } else { | |
519 | adev->gmc.xgmi.hive_id = 16; | |
520 | adev->gmc.xgmi.node_id = adev->gmc.xgmi.physical_node_id + 16; | |
379c237e | 521 | } |
fb30fc59 | 522 | |
d95e8e97 | 523 | hive = amdgpu_get_xgmi_hive(adev); |
36ca09a0 | 524 | if (!hive) { |
525 | ret = -EINVAL; | |
526 | dev_err(adev->dev, | |
c1219b94 | 527 | "XGMI: node 0x%llx, can not match hive 0x%llx in the hive list.\n", |
36ca09a0 | 528 | adev->gmc.xgmi.node_id, adev->gmc.xgmi.hive_id); |
fb30fc59 | 529 | goto exit; |
36ca09a0 | 530 | } |
d95e8e97 | 531 | mutex_lock(&hive->hive_lock); |
fb30fc59 | 532 | |
da361dd1 | 533 | top_info = &adev->psp.xgmi_context.top_info; |
5183411b | 534 | |
fb30fc59 SL |
535 | list_add_tail(&adev->gmc.xgmi.head, &hive->device_list); |
536 | list_for_each_entry(entry, &hive->device_list, head) | |
da361dd1 | 537 | top_info->nodes[count++].node_id = entry->node_id; |
e008299e | 538 | top_info->num_nodes = count; |
d95e8e97 | 539 | atomic_set(&hive->number_devices, count); |
fb30fc59 | 540 | |
f33a8770 AG |
541 | task_barrier_add_task(&hive->tb); |
542 | ||
75b2fce2 LM |
543 | if (amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_PSP)) { |
544 | list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) { | |
545 | /* update node list for other device in the hive */ | |
546 | if (tmp_adev != adev) { | |
547 | top_info = &tmp_adev->psp.xgmi_context.top_info; | |
548 | top_info->nodes[count - 1].node_id = | |
549 | adev->gmc.xgmi.node_id; | |
550 | top_info->num_nodes = count; | |
551 | } | |
552 | ret = amdgpu_xgmi_update_topology(hive, tmp_adev); | |
553 | if (ret) | |
94561899 | 554 | goto exit_unlock; |
e008299e | 555 | } |
e008299e | 556 | |
75b2fce2 LM |
557 | /* get latest topology info for each device from psp */ |
558 | list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) { | |
559 | ret = psp_xgmi_get_topology_info(&tmp_adev->psp, count, | |
560 | &tmp_adev->psp.xgmi_context.top_info); | |
561 | if (ret) { | |
562 | dev_err(tmp_adev->dev, | |
563 | "XGMI: Get topology failure on device %llx, hive %llx, ret %d", | |
564 | tmp_adev->gmc.xgmi.node_id, | |
565 | tmp_adev->gmc.xgmi.hive_id, ret); | |
566 | /* To do : continue with some node failed or disable the whole hive */ | |
94561899 | 567 | goto exit_unlock; |
75b2fce2 | 568 | } |
a82c1566 | 569 | } |
fb30fc59 | 570 | } |
a82c1566 | 571 | |
b1fa8c89 AG |
572 | if (!ret) |
573 | ret = amdgpu_xgmi_sysfs_add_dev_info(adev, hive); | |
574 | ||
94561899 | 575 | exit_unlock: |
e008299e | 576 | mutex_unlock(&hive->hive_lock); |
577 | exit: | |
d95e8e97 DL |
578 | if (!ret) { |
579 | adev->hive = hive; | |
b1fa8c89 AG |
580 | dev_info(adev->dev, "XGMI: Add node %d, hive 0x%llx.\n", |
581 | adev->gmc.xgmi.physical_node_id, adev->gmc.xgmi.hive_id); | |
d95e8e97 DL |
582 | } else { |
583 | amdgpu_put_xgmi_hive(hive); | |
b1fa8c89 AG |
584 | dev_err(adev->dev, "XGMI: Failed to add node %d, hive 0x%llx ret: %d\n", |
585 | adev->gmc.xgmi.physical_node_id, adev->gmc.xgmi.hive_id, | |
586 | ret); | |
d95e8e97 | 587 | } |
b1fa8c89 | 588 | |
fb30fc59 SL |
589 | return ret; |
590 | } | |
a82400b5 | 591 | |
0b9d3760 | 592 | int amdgpu_xgmi_remove_device(struct amdgpu_device *adev) |
a82400b5 | 593 | { |
d95e8e97 | 594 | struct amdgpu_hive_info *hive = adev->hive; |
a82400b5 AG |
595 | |
596 | if (!adev->gmc.xgmi.supported) | |
0b9d3760 | 597 | return -EINVAL; |
a82400b5 | 598 | |
a82400b5 | 599 | if (!hive) |
0b9d3760 | 600 | return -EINVAL; |
a82400b5 | 601 | |
d95e8e97 | 602 | mutex_lock(&hive->hive_lock); |
a89b5dae JZ |
603 | task_barrier_rem_task(&hive->tb); |
604 | amdgpu_xgmi_sysfs_rem_dev_info(adev, hive); | |
d95e8e97 DL |
605 | if (hive->hi_req_gpu == adev) |
606 | hive->hi_req_gpu = NULL; | |
607 | list_del(&adev->gmc.xgmi.head); | |
a89b5dae JZ |
608 | mutex_unlock(&hive->hive_lock); |
609 | ||
d95e8e97 DL |
610 | amdgpu_put_xgmi_hive(hive); |
611 | adev->hive = NULL; | |
612 | ||
613 | if (atomic_dec_return(&hive->number_devices) == 0) { | |
614 | /* Remove the hive from global hive list */ | |
615 | mutex_lock(&xgmi_mutex); | |
616 | list_del(&hive->node); | |
617 | mutex_unlock(&xgmi_mutex); | |
618 | ||
619 | amdgpu_put_xgmi_hive(hive); | |
22d6575b | 620 | } |
0b9d3760 HZ |
621 | |
622 | return psp_xgmi_terminate(&adev->psp); | |
a82400b5 | 623 | } |
029fbd43 HZ |
624 | |
625 | int amdgpu_xgmi_ras_late_init(struct amdgpu_device *adev) | |
626 | { | |
627 | int r; | |
628 | struct ras_ih_if ih_info = { | |
629 | .cb = NULL, | |
630 | }; | |
631 | struct ras_fs_if fs_info = { | |
632 | .sysfs_name = "xgmi_wafl_err_count", | |
029fbd43 HZ |
633 | }; |
634 | ||
635 | if (!adev->gmc.xgmi.supported || | |
636 | adev->gmc.xgmi.num_physical_nodes == 0) | |
637 | return 0; | |
638 | ||
66399248 JC |
639 | amdgpu_xgmi_reset_ras_error_count(adev); |
640 | ||
029fbd43 HZ |
641 | if (!adev->gmc.xgmi.ras_if) { |
642 | adev->gmc.xgmi.ras_if = kmalloc(sizeof(struct ras_common_if), GFP_KERNEL); | |
643 | if (!adev->gmc.xgmi.ras_if) | |
644 | return -ENOMEM; | |
645 | adev->gmc.xgmi.ras_if->block = AMDGPU_RAS_BLOCK__XGMI_WAFL; | |
646 | adev->gmc.xgmi.ras_if->type = AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE; | |
647 | adev->gmc.xgmi.ras_if->sub_block_index = 0; | |
648 | strcpy(adev->gmc.xgmi.ras_if->name, "xgmi_wafl"); | |
649 | } | |
650 | ih_info.head = fs_info.head = *adev->gmc.xgmi.ras_if; | |
651 | r = amdgpu_ras_late_init(adev, adev->gmc.xgmi.ras_if, | |
652 | &fs_info, &ih_info); | |
653 | if (r || !amdgpu_ras_is_supported(adev, adev->gmc.xgmi.ras_if->block)) { | |
654 | kfree(adev->gmc.xgmi.ras_if); | |
655 | adev->gmc.xgmi.ras_if = NULL; | |
656 | } | |
657 | ||
658 | return r; | |
659 | } | |
be5b39d8 TZ |
660 | |
661 | void amdgpu_xgmi_ras_fini(struct amdgpu_device *adev) | |
662 | { | |
663 | if (amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__XGMI_WAFL) && | |
664 | adev->gmc.xgmi.ras_if) { | |
665 | struct ras_common_if *ras_if = adev->gmc.xgmi.ras_if; | |
666 | struct ras_ih_if ih_info = { | |
667 | .cb = NULL, | |
668 | }; | |
669 | ||
670 | amdgpu_ras_late_fini(adev, ras_if, &ih_info); | |
671 | kfree(ras_if); | |
672 | } | |
673 | } | |
19744f5f HZ |
674 | |
675 | uint64_t amdgpu_xgmi_get_relative_phy_addr(struct amdgpu_device *adev, | |
676 | uint64_t addr) | |
677 | { | |
890900fe HZ |
678 | struct amdgpu_xgmi *xgmi = &adev->gmc.xgmi; |
679 | return (addr + xgmi->physical_node_id * xgmi->node_segment_size); | |
19744f5f | 680 | } |
18f36157 | 681 | |
66399248 JC |
682 | static void pcs_clear_status(struct amdgpu_device *adev, uint32_t pcs_status_reg) |
683 | { | |
684 | WREG32_PCIE(pcs_status_reg, 0xFFFFFFFF); | |
685 | WREG32_PCIE(pcs_status_reg, 0); | |
686 | } | |
687 | ||
688 | void amdgpu_xgmi_reset_ras_error_count(struct amdgpu_device *adev) | |
689 | { | |
690 | uint32_t i; | |
691 | ||
692 | switch (adev->asic_type) { | |
693 | case CHIP_ARCTURUS: | |
694 | for (i = 0; i < ARRAY_SIZE(xgmi_pcs_err_status_reg_arct); i++) | |
695 | pcs_clear_status(adev, | |
696 | xgmi_pcs_err_status_reg_arct[i]); | |
697 | break; | |
698 | case CHIP_VEGA20: | |
699 | for (i = 0; i < ARRAY_SIZE(xgmi_pcs_err_status_reg_vg20); i++) | |
700 | pcs_clear_status(adev, | |
701 | xgmi_pcs_err_status_reg_vg20[i]); | |
702 | break; | |
703 | default: | |
704 | break; | |
705 | } | |
706 | } | |
707 | ||
18f36157 HZ |
708 | static int amdgpu_xgmi_query_pcs_error_status(struct amdgpu_device *adev, |
709 | uint32_t value, | |
710 | uint32_t *ue_count, | |
711 | uint32_t *ce_count, | |
712 | bool is_xgmi_pcs) | |
713 | { | |
714 | int i; | |
715 | int ue_cnt; | |
716 | ||
717 | if (is_xgmi_pcs) { | |
718 | /* query xgmi pcs error status, | |
719 | * only ue is supported */ | |
720 | for (i = 0; i < ARRAY_SIZE(xgmi_pcs_ras_fields); i ++) { | |
721 | ue_cnt = (value & | |
722 | xgmi_pcs_ras_fields[i].pcs_err_mask) >> | |
723 | xgmi_pcs_ras_fields[i].pcs_err_shift; | |
724 | if (ue_cnt) { | |
725 | dev_info(adev->dev, "%s detected\n", | |
726 | xgmi_pcs_ras_fields[i].err_name); | |
727 | *ue_count += ue_cnt; | |
728 | } | |
729 | } | |
730 | } else { | |
731 | /* query wafl pcs error status, | |
732 | * only ue is supported */ | |
733 | for (i = 0; i < ARRAY_SIZE(wafl_pcs_ras_fields); i++) { | |
734 | ue_cnt = (value & | |
735 | wafl_pcs_ras_fields[i].pcs_err_mask) >> | |
736 | wafl_pcs_ras_fields[i].pcs_err_shift; | |
737 | if (ue_cnt) { | |
738 | dev_info(adev->dev, "%s detected\n", | |
739 | wafl_pcs_ras_fields[i].err_name); | |
740 | *ue_count += ue_cnt; | |
741 | } | |
742 | } | |
743 | } | |
744 | ||
745 | return 0; | |
746 | } | |
747 | ||
748 | int amdgpu_xgmi_query_ras_error_count(struct amdgpu_device *adev, | |
749 | void *ras_error_status) | |
750 | { | |
751 | struct ras_err_data *err_data = (struct ras_err_data *)ras_error_status; | |
752 | int i; | |
753 | uint32_t data; | |
754 | uint32_t ue_cnt = 0, ce_cnt = 0; | |
755 | ||
756 | if (!amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__XGMI_WAFL)) | |
757 | return -EINVAL; | |
758 | ||
759 | err_data->ue_count = 0; | |
760 | err_data->ce_count = 0; | |
761 | ||
762 | switch (adev->asic_type) { | |
a61f41b1 HZ |
763 | case CHIP_ARCTURUS: |
764 | /* check xgmi pcs error */ | |
765 | for (i = 0; i < ARRAY_SIZE(xgmi_pcs_err_status_reg_arct); i++) { | |
766 | data = RREG32_PCIE(xgmi_pcs_err_status_reg_arct[i]); | |
767 | if (data) | |
768 | amdgpu_xgmi_query_pcs_error_status(adev, | |
769 | data, &ue_cnt, &ce_cnt, true); | |
770 | } | |
771 | /* check wafl pcs error */ | |
772 | for (i = 0; i < ARRAY_SIZE(wafl_pcs_err_status_reg_arct); i++) { | |
773 | data = RREG32_PCIE(wafl_pcs_err_status_reg_arct[i]); | |
774 | if (data) | |
775 | amdgpu_xgmi_query_pcs_error_status(adev, | |
776 | data, &ue_cnt, &ce_cnt, false); | |
777 | } | |
778 | break; | |
18f36157 HZ |
779 | case CHIP_VEGA20: |
780 | default: | |
781 | /* check xgmi pcs error */ | |
782 | for (i = 0; i < ARRAY_SIZE(xgmi_pcs_err_status_reg_vg20); i++) { | |
783 | data = RREG32_PCIE(xgmi_pcs_err_status_reg_vg20[i]); | |
784 | if (data) | |
785 | amdgpu_xgmi_query_pcs_error_status(adev, | |
786 | data, &ue_cnt, &ce_cnt, true); | |
787 | } | |
788 | /* check wafl pcs error */ | |
789 | for (i = 0; i < ARRAY_SIZE(wafl_pcs_err_status_reg_vg20); i++) { | |
790 | data = RREG32_PCIE(wafl_pcs_err_status_reg_vg20[i]); | |
791 | if (data) | |
792 | amdgpu_xgmi_query_pcs_error_status(adev, | |
793 | data, &ue_cnt, &ce_cnt, false); | |
794 | } | |
795 | break; | |
796 | } | |
797 | ||
66399248 JC |
798 | amdgpu_xgmi_reset_ras_error_count(adev); |
799 | ||
18f36157 HZ |
800 | err_data->ue_count += ue_cnt; |
801 | err_data->ce_count += ce_cnt; | |
802 | ||
803 | return 0; | |
804 | } |