Commit | Line | Data |
---|---|---|
fb30fc59 SL |
1 | /* |
2 | * Copyright 2018 Advanced Micro Devices, Inc. | |
3 | * | |
4 | * Permission is hereby granted, free of charge, to any person obtaining a | |
5 | * copy of this software and associated documentation files (the "Software"), | |
6 | * to deal in the Software without restriction, including without limitation | |
7 | * the rights to use, copy, modify, merge, publish, distribute, sublicense, | |
8 | * and/or sell copies of the Software, and to permit persons to whom the | |
9 | * Software is furnished to do so, subject to the following conditions: | |
10 | * | |
11 | * The above copyright notice and this permission notice shall be included in | |
12 | * all copies or substantial portions of the Software. | |
13 | * | |
14 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | |
15 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | |
16 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL | |
17 | * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR | |
18 | * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, | |
19 | * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR | |
20 | * OTHER DEALINGS IN THE SOFTWARE. | |
21 | * | |
22 | * | |
23 | */ | |
24 | #include <linux/list.h> | |
25 | #include "amdgpu.h" | |
5183411b | 26 | #include "amdgpu_xgmi.h" |
93abb05f | 27 | #include "amdgpu_smu.h" |
fb30fc59 SL |
28 | |
29 | ||
30 | static DEFINE_MUTEX(xgmi_mutex); | |
31 | ||
32 | #define AMDGPU_MAX_XGMI_HIVE 8 | |
33 | #define AMDGPU_MAX_XGMI_DEVICE_PER_HIVE 4 | |
34 | ||
fb30fc59 SL |
35 | static struct amdgpu_hive_info xgmi_hives[AMDGPU_MAX_XGMI_HIVE]; |
36 | static unsigned hive_count = 0; | |
37 | ||
ed2bf522 AG |
38 | void *amdgpu_xgmi_hive_try_lock(struct amdgpu_hive_info *hive) |
39 | { | |
40 | return &hive->device_list; | |
41 | } | |
42 | ||
1c1e53f7 TSD |
43 | /** |
44 | * DOC: AMDGPU XGMI Support | |
45 | * | |
46 | * XGMI is a high speed interconnect that joins multiple GPU cards | |
47 | * into a homogeneous memory space that is organized by a collective | |
48 | * hive ID and individual node IDs, both of which are 64-bit numbers. | |
49 | * | |
50 | * The file xgmi_device_id contains the unique per GPU device ID and | |
51 | * is stored in the /sys/class/drm/card${cardno}/device/ directory. | |
52 | * | |
53 | * Inside the device directory a sub-directory 'xgmi_hive_info' is | |
54 | * created which contains the hive ID and the list of nodes. | |
55 | * | |
56 | * The hive ID is stored in: | |
57 | * /sys/class/drm/card${cardno}/device/xgmi_hive_info/xgmi_hive_id | |
58 | * | |
59 | * The node information is stored in numbered directories: | |
60 | * /sys/class/drm/card${cardno}/device/xgmi_hive_info/node${nodeno}/xgmi_device_id | |
61 | * | |
62 | * Each device has its own xgmi_hive_info directory with a mirror | |
63 | * set of node sub-directories. | |
64 | * | |
65 | * The XGMI memory space is built by contiguously adding the power of | |
66 | * two padded VRAM space from each node to each other. | |
67 | * | |
68 | */ | |
69 | ||
70 | ||
b1fa8c89 AG |
71 | static ssize_t amdgpu_xgmi_show_hive_id(struct device *dev, |
72 | struct device_attribute *attr, char *buf) | |
73 | { | |
74 | struct amdgpu_hive_info *hive = | |
75 | container_of(attr, struct amdgpu_hive_info, dev_attr); | |
76 | ||
77 | return snprintf(buf, PAGE_SIZE, "%llu\n", hive->hive_id); | |
78 | } | |
79 | ||
80 | static int amdgpu_xgmi_sysfs_create(struct amdgpu_device *adev, | |
81 | struct amdgpu_hive_info *hive) | |
82 | { | |
83 | int ret = 0; | |
84 | ||
85 | if (WARN_ON(hive->kobj)) | |
86 | return -EINVAL; | |
87 | ||
88 | hive->kobj = kobject_create_and_add("xgmi_hive_info", &adev->dev->kobj); | |
89 | if (!hive->kobj) { | |
90 | dev_err(adev->dev, "XGMI: Failed to allocate sysfs entry!\n"); | |
91 | return -EINVAL; | |
92 | } | |
93 | ||
94 | hive->dev_attr = (struct device_attribute) { | |
95 | .attr = { | |
96 | .name = "xgmi_hive_id", | |
97 | .mode = S_IRUGO, | |
98 | ||
99 | }, | |
100 | .show = amdgpu_xgmi_show_hive_id, | |
101 | }; | |
102 | ||
103 | ret = sysfs_create_file(hive->kobj, &hive->dev_attr.attr); | |
104 | if (ret) { | |
105 | dev_err(adev->dev, "XGMI: Failed to create device file xgmi_hive_id\n"); | |
106 | kobject_del(hive->kobj); | |
107 | kobject_put(hive->kobj); | |
108 | hive->kobj = NULL; | |
109 | } | |
110 | ||
111 | return ret; | |
112 | } | |
113 | ||
114 | static void amdgpu_xgmi_sysfs_destroy(struct amdgpu_device *adev, | |
115 | struct amdgpu_hive_info *hive) | |
116 | { | |
117 | sysfs_remove_file(hive->kobj, &hive->dev_attr.attr); | |
118 | kobject_del(hive->kobj); | |
119 | kobject_put(hive->kobj); | |
120 | hive->kobj = NULL; | |
121 | } | |
122 | ||
123 | static ssize_t amdgpu_xgmi_show_device_id(struct device *dev, | |
124 | struct device_attribute *attr, | |
125 | char *buf) | |
126 | { | |
127 | struct drm_device *ddev = dev_get_drvdata(dev); | |
128 | struct amdgpu_device *adev = ddev->dev_private; | |
129 | ||
130 | return snprintf(buf, PAGE_SIZE, "%llu\n", adev->gmc.xgmi.node_id); | |
131 | ||
132 | } | |
133 | ||
134 | ||
135 | static DEVICE_ATTR(xgmi_device_id, S_IRUGO, amdgpu_xgmi_show_device_id, NULL); | |
136 | ||
137 | ||
138 | static int amdgpu_xgmi_sysfs_add_dev_info(struct amdgpu_device *adev, | |
139 | struct amdgpu_hive_info *hive) | |
140 | { | |
141 | int ret = 0; | |
142 | char node[10] = { 0 }; | |
143 | ||
144 | /* Create xgmi device id file */ | |
145 | ret = device_create_file(adev->dev, &dev_attr_xgmi_device_id); | |
146 | if (ret) { | |
147 | dev_err(adev->dev, "XGMI: Failed to create device file xgmi_device_id\n"); | |
148 | return ret; | |
149 | } | |
150 | ||
151 | /* Create sysfs link to hive info folder on the first device */ | |
152 | if (adev != hive->adev) { | |
153 | ret = sysfs_create_link(&adev->dev->kobj, hive->kobj, | |
154 | "xgmi_hive_info"); | |
155 | if (ret) { | |
156 | dev_err(adev->dev, "XGMI: Failed to create link to hive info"); | |
157 | goto remove_file; | |
158 | } | |
159 | } | |
160 | ||
161 | sprintf(node, "node%d", hive->number_devices); | |
162 | /* Create sysfs link form the hive folder to yourself */ | |
163 | ret = sysfs_create_link(hive->kobj, &adev->dev->kobj, node); | |
164 | if (ret) { | |
165 | dev_err(adev->dev, "XGMI: Failed to create link from hive info"); | |
166 | goto remove_link; | |
167 | } | |
168 | ||
169 | goto success; | |
170 | ||
171 | ||
172 | remove_link: | |
173 | sysfs_remove_link(&adev->dev->kobj, adev->ddev->unique); | |
174 | ||
175 | remove_file: | |
176 | device_remove_file(adev->dev, &dev_attr_xgmi_device_id); | |
177 | ||
178 | success: | |
179 | return ret; | |
180 | } | |
181 | ||
182 | static void amdgpu_xgmi_sysfs_rem_dev_info(struct amdgpu_device *adev, | |
183 | struct amdgpu_hive_info *hive) | |
184 | { | |
185 | device_remove_file(adev->dev, &dev_attr_xgmi_device_id); | |
186 | sysfs_remove_link(&adev->dev->kobj, adev->ddev->unique); | |
187 | sysfs_remove_link(hive->kobj, adev->ddev->unique); | |
188 | } | |
189 | ||
190 | ||
191 | ||
22d6575b | 192 | struct amdgpu_hive_info *amdgpu_get_xgmi_hive(struct amdgpu_device *adev, int lock) |
fb30fc59 SL |
193 | { |
194 | int i; | |
195 | struct amdgpu_hive_info *tmp; | |
196 | ||
197 | if (!adev->gmc.xgmi.hive_id) | |
198 | return NULL; | |
22d6575b TSD |
199 | |
200 | mutex_lock(&xgmi_mutex); | |
201 | ||
fb30fc59 SL |
202 | for (i = 0 ; i < hive_count; ++i) { |
203 | tmp = &xgmi_hives[i]; | |
22d6575b TSD |
204 | if (tmp->hive_id == adev->gmc.xgmi.hive_id) { |
205 | if (lock) | |
206 | mutex_lock(&tmp->hive_lock); | |
207 | mutex_unlock(&xgmi_mutex); | |
fb30fc59 | 208 | return tmp; |
22d6575b | 209 | } |
fb30fc59 | 210 | } |
22d6575b TSD |
211 | if (i >= AMDGPU_MAX_XGMI_HIVE) { |
212 | mutex_unlock(&xgmi_mutex); | |
fb30fc59 | 213 | return NULL; |
22d6575b | 214 | } |
fb30fc59 SL |
215 | |
216 | /* initialize new hive if not exist */ | |
217 | tmp = &xgmi_hives[hive_count++]; | |
b1fa8c89 AG |
218 | |
219 | if (amdgpu_xgmi_sysfs_create(adev, tmp)) { | |
220 | mutex_unlock(&xgmi_mutex); | |
221 | return NULL; | |
222 | } | |
223 | ||
224 | tmp->adev = adev; | |
fb30fc59 SL |
225 | tmp->hive_id = adev->gmc.xgmi.hive_id; |
226 | INIT_LIST_HEAD(&tmp->device_list); | |
ed2bf522 | 227 | mutex_init(&tmp->hive_lock); |
22d6575b | 228 | mutex_init(&tmp->reset_lock); |
b1fa8c89 | 229 | |
22d6575b TSD |
230 | if (lock) |
231 | mutex_lock(&tmp->hive_lock); | |
df399b06 | 232 | tmp->pstate = -1; |
22d6575b | 233 | mutex_unlock(&xgmi_mutex); |
ed2bf522 | 234 | |
fb30fc59 SL |
235 | return tmp; |
236 | } | |
237 | ||
df399b06 | 238 | int amdgpu_xgmi_set_pstate(struct amdgpu_device *adev, int pstate) |
239 | { | |
240 | int ret = 0; | |
241 | struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev, 0); | |
242 | ||
243 | if (!hive) | |
244 | return 0; | |
245 | ||
246 | if (hive->pstate == pstate) | |
247 | return 0; | |
93abb05f | 248 | |
249 | dev_dbg(adev->dev, "Set xgmi pstate %d.\n", pstate); | |
250 | ||
251 | if (is_support_sw_smu(adev)) | |
252 | ret = smu_set_xgmi_pstate(&adev->smu, pstate); | |
253 | if (ret) | |
254 | dev_err(adev->dev, | |
255 | "XGMI: Set pstate failure on device %llx, hive %llx, ret %d", | |
256 | adev->gmc.xgmi.node_id, | |
257 | adev->gmc.xgmi.hive_id, ret); | |
258 | ||
df399b06 | 259 | return ret; |
260 | } | |
261 | ||
5183411b AG |
262 | int amdgpu_xgmi_update_topology(struct amdgpu_hive_info *hive, struct amdgpu_device *adev) |
263 | { | |
264 | int ret = -EINVAL; | |
265 | ||
266 | /* Each psp need to set the latest topology */ | |
267 | ret = psp_xgmi_set_topology_info(&adev->psp, | |
268 | hive->number_devices, | |
da361dd1 | 269 | &adev->psp.xgmi_context.top_info); |
5183411b AG |
270 | if (ret) |
271 | dev_err(adev->dev, | |
272 | "XGMI: Set topology failure on device %llx, hive %llx, ret %d", | |
273 | adev->gmc.xgmi.node_id, | |
274 | adev->gmc.xgmi.hive_id, ret); | |
5183411b AG |
275 | |
276 | return ret; | |
277 | } | |
278 | ||
da361dd1 | 279 | |
280 | int amdgpu_xgmi_get_hops_count(struct amdgpu_device *adev, | |
281 | struct amdgpu_device *peer_adev) | |
282 | { | |
283 | struct psp_xgmi_topology_info *top = &adev->psp.xgmi_context.top_info; | |
284 | int i; | |
285 | ||
286 | for (i = 0 ; i < top->num_nodes; ++i) | |
287 | if (top->nodes[i].node_id == peer_adev->gmc.xgmi.node_id) | |
288 | return top->nodes[i].num_hops; | |
289 | return -EINVAL; | |
290 | } | |
291 | ||
fb30fc59 SL |
292 | int amdgpu_xgmi_add_device(struct amdgpu_device *adev) |
293 | { | |
da361dd1 | 294 | struct psp_xgmi_topology_info *top_info; |
fb30fc59 SL |
295 | struct amdgpu_hive_info *hive; |
296 | struct amdgpu_xgmi *entry; | |
5183411b | 297 | struct amdgpu_device *tmp_adev = NULL; |
fb30fc59 SL |
298 | |
299 | int count = 0, ret = -EINVAL; | |
300 | ||
47622ba0 | 301 | if (!adev->gmc.xgmi.supported) |
fb30fc59 | 302 | return 0; |
47622ba0 | 303 | |
379c237e EQ |
304 | ret = psp_xgmi_get_node_id(&adev->psp, &adev->gmc.xgmi.node_id); |
305 | if (ret) { | |
306 | dev_err(adev->dev, | |
307 | "XGMI: Failed to get node id\n"); | |
308 | return ret; | |
309 | } | |
310 | ||
311 | ret = psp_xgmi_get_hive_id(&adev->psp, &adev->gmc.xgmi.hive_id); | |
312 | if (ret) { | |
313 | dev_err(adev->dev, | |
314 | "XGMI: Failed to get hive id\n"); | |
315 | return ret; | |
316 | } | |
fb30fc59 | 317 | |
22d6575b | 318 | hive = amdgpu_get_xgmi_hive(adev, 1); |
36ca09a0 | 319 | if (!hive) { |
320 | ret = -EINVAL; | |
321 | dev_err(adev->dev, | |
c1219b94 | 322 | "XGMI: node 0x%llx, can not match hive 0x%llx in the hive list.\n", |
36ca09a0 | 323 | adev->gmc.xgmi.node_id, adev->gmc.xgmi.hive_id); |
fb30fc59 | 324 | goto exit; |
36ca09a0 | 325 | } |
fb30fc59 | 326 | |
da361dd1 | 327 | top_info = &adev->psp.xgmi_context.top_info; |
5183411b | 328 | |
fb30fc59 SL |
329 | list_add_tail(&adev->gmc.xgmi.head, &hive->device_list); |
330 | list_for_each_entry(entry, &hive->device_list, head) | |
da361dd1 | 331 | top_info->nodes[count++].node_id = entry->node_id; |
e008299e | 332 | top_info->num_nodes = count; |
5183411b | 333 | hive->number_devices = count; |
fb30fc59 | 334 | |
a82c1566 | 335 | list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) { |
e008299e | 336 | /* update node list for other device in the hive */ |
337 | if (tmp_adev != adev) { | |
338 | top_info = &tmp_adev->psp.xgmi_context.top_info; | |
339 | top_info->nodes[count - 1].node_id = adev->gmc.xgmi.node_id; | |
340 | top_info->num_nodes = count; | |
341 | } | |
342 | ret = amdgpu_xgmi_update_topology(hive, tmp_adev); | |
343 | if (ret) | |
344 | goto exit; | |
345 | } | |
346 | ||
347 | /* get latest topology info for each device from psp */ | |
348 | list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) { | |
349 | ret = psp_xgmi_get_topology_info(&tmp_adev->psp, count, | |
350 | &tmp_adev->psp.xgmi_context.top_info); | |
a82c1566 | 351 | if (ret) { |
352 | dev_err(tmp_adev->dev, | |
353 | "XGMI: Get topology failure on device %llx, hive %llx, ret %d", | |
354 | tmp_adev->gmc.xgmi.node_id, | |
355 | tmp_adev->gmc.xgmi.hive_id, ret); | |
356 | /* To do : continue with some node failed or disable the whole hive */ | |
e008299e | 357 | goto exit; |
a82c1566 | 358 | } |
fb30fc59 | 359 | } |
a82c1566 | 360 | |
b1fa8c89 AG |
361 | if (!ret) |
362 | ret = amdgpu_xgmi_sysfs_add_dev_info(adev, hive); | |
363 | ||
e008299e | 364 | |
365 | mutex_unlock(&hive->hive_lock); | |
366 | exit: | |
b1fa8c89 AG |
367 | if (!ret) |
368 | dev_info(adev->dev, "XGMI: Add node %d, hive 0x%llx.\n", | |
369 | adev->gmc.xgmi.physical_node_id, adev->gmc.xgmi.hive_id); | |
370 | else | |
371 | dev_err(adev->dev, "XGMI: Failed to add node %d, hive 0x%llx ret: %d\n", | |
372 | adev->gmc.xgmi.physical_node_id, adev->gmc.xgmi.hive_id, | |
373 | ret); | |
374 | ||
fb30fc59 SL |
375 | return ret; |
376 | } | |
a82400b5 AG |
377 | |
378 | void amdgpu_xgmi_remove_device(struct amdgpu_device *adev) | |
379 | { | |
380 | struct amdgpu_hive_info *hive; | |
381 | ||
382 | if (!adev->gmc.xgmi.supported) | |
383 | return; | |
384 | ||
22d6575b | 385 | hive = amdgpu_get_xgmi_hive(adev, 1); |
a82400b5 | 386 | if (!hive) |
22d6575b | 387 | return; |
a82400b5 | 388 | |
22d6575b | 389 | if (!(hive->number_devices--)) { |
b1fa8c89 | 390 | amdgpu_xgmi_sysfs_destroy(adev, hive); |
a82400b5 | 391 | mutex_destroy(&hive->hive_lock); |
22d6575b TSD |
392 | mutex_destroy(&hive->reset_lock); |
393 | } else { | |
b1fa8c89 | 394 | amdgpu_xgmi_sysfs_rem_dev_info(adev, hive); |
22d6575b TSD |
395 | mutex_unlock(&hive->hive_lock); |
396 | } | |
a82400b5 | 397 | } |