Commit | Line | Data |
---|---|---|
174de876 FK |
1 | /* |
2 | * Copyright 2015-2017 Advanced Micro Devices, Inc. | |
3 | * | |
4 | * Permission is hereby granted, free of charge, to any person obtaining a | |
5 | * copy of this software and associated documentation files (the "Software"), | |
6 | * to deal in the Software without restriction, including without limitation | |
7 | * the rights to use, copy, modify, merge, publish, distribute, sublicense, | |
8 | * and/or sell copies of the Software, and to permit persons to whom the | |
9 | * Software is furnished to do so, subject to the following conditions: | |
10 | * | |
11 | * The above copyright notice and this permission notice shall be included in | |
12 | * all copies or substantial portions of the Software. | |
13 | * | |
14 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | |
15 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | |
16 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL | |
17 | * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR | |
18 | * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, | |
19 | * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR | |
20 | * OTHER DEALINGS IN THE SOFTWARE. | |
21 | */ | |
3a87177e HK |
22 | |
23 | #include <linux/pci.h> | |
174de876 FK |
24 | #include <linux/acpi.h> |
25 | #include "kfd_crat.h" | |
520b8fb7 | 26 | #include "kfd_priv.h" |
174de876 | 27 | #include "kfd_topology.h" |
64d1c3a4 | 28 | #include "kfd_iommu.h" |
5b87245f | 29 | #include "amdgpu_amdkfd.h" |
174de876 | 30 | |
3a87177e HK |
/* GPU Processor ID base for dGPUs for which VCRAT needs to be created.
 * GPU processor IDs are expressed with Bit[31]=1.
 * The base is set to 0x8000_0000 + 0x1000 to avoid collision with GPU IDs
 * used in the CRAT.
 */
static uint32_t gpu_processor_id_low = 0x80001000;

/* Return the next available gpu_processor_id and advance the counter by
 * @total_cu_count so that the next GPU is handed a disjoint ID range.
 * @total_cu_count - Total CUs present in the GPU including ones
 *		     masked off
 *
 * The ID is staged in an unsigned local: every value handed out has
 * Bit[31] set and therefore does not fit in a signed int.
 */
static inline unsigned int get_and_inc_gpu_processor_id(
				unsigned int total_cu_count)
{
	unsigned int current_id = gpu_processor_id_low;

	gpu_processor_id_low += total_cu_count;
	return current_id;
}
50 | ||
/* One row of a static, per-ASIC table describing a GPU cache */
struct kfd_gpu_cache_info {
	uint32_t	cache_size;
	uint32_t	cache_level;
	/* CRAT_CACHE_FLAGS_* bits */
	uint32_t	flags;
	/* Number of Compute Units sharing this cache; 1 means the cache
	 * is not shared
	 */
	uint32_t	num_cu_shared;
};
61 | ||
62 | static struct kfd_gpu_cache_info kaveri_cache_info[] = { | |
63 | { | |
64 | /* TCP L1 Cache per CU */ | |
65 | .cache_size = 16, | |
66 | .cache_level = 1, | |
67 | .flags = (CRAT_CACHE_FLAGS_ENABLED | | |
68 | CRAT_CACHE_FLAGS_DATA_CACHE | | |
69 | CRAT_CACHE_FLAGS_SIMD_CACHE), | |
70 | .num_cu_shared = 1, | |
71 | ||
72 | }, | |
73 | { | |
74 | /* Scalar L1 Instruction Cache (in SQC module) per bank */ | |
75 | .cache_size = 16, | |
76 | .cache_level = 1, | |
77 | .flags = (CRAT_CACHE_FLAGS_ENABLED | | |
78 | CRAT_CACHE_FLAGS_INST_CACHE | | |
79 | CRAT_CACHE_FLAGS_SIMD_CACHE), | |
80 | .num_cu_shared = 2, | |
81 | }, | |
82 | { | |
83 | /* Scalar L1 Data Cache (in SQC module) per bank */ | |
84 | .cache_size = 8, | |
85 | .cache_level = 1, | |
86 | .flags = (CRAT_CACHE_FLAGS_ENABLED | | |
87 | CRAT_CACHE_FLAGS_DATA_CACHE | | |
88 | CRAT_CACHE_FLAGS_SIMD_CACHE), | |
89 | .num_cu_shared = 2, | |
90 | }, | |
91 | ||
92 | /* TODO: Add L2 Cache information */ | |
93 | }; | |
94 | ||
95 | ||
96 | static struct kfd_gpu_cache_info carrizo_cache_info[] = { | |
97 | { | |
98 | /* TCP L1 Cache per CU */ | |
99 | .cache_size = 16, | |
100 | .cache_level = 1, | |
101 | .flags = (CRAT_CACHE_FLAGS_ENABLED | | |
102 | CRAT_CACHE_FLAGS_DATA_CACHE | | |
103 | CRAT_CACHE_FLAGS_SIMD_CACHE), | |
104 | .num_cu_shared = 1, | |
105 | }, | |
106 | { | |
107 | /* Scalar L1 Instruction Cache (in SQC module) per bank */ | |
108 | .cache_size = 8, | |
109 | .cache_level = 1, | |
110 | .flags = (CRAT_CACHE_FLAGS_ENABLED | | |
111 | CRAT_CACHE_FLAGS_INST_CACHE | | |
112 | CRAT_CACHE_FLAGS_SIMD_CACHE), | |
113 | .num_cu_shared = 4, | |
114 | }, | |
115 | { | |
116 | /* Scalar L1 Data Cache (in SQC module) per bank. */ | |
117 | .cache_size = 4, | |
118 | .cache_level = 1, | |
119 | .flags = (CRAT_CACHE_FLAGS_ENABLED | | |
120 | CRAT_CACHE_FLAGS_DATA_CACHE | | |
121 | CRAT_CACHE_FLAGS_SIMD_CACHE), | |
122 | .num_cu_shared = 4, | |
123 | }, | |
124 | ||
125 | /* TODO: Add L2 Cache information */ | |
126 | }; | |
127 | ||
/* Per-ASIC aliases onto the two cache tables defined in this file.
 * NOTE: If more information is added to struct kfd_gpu_cache_info in the
 * future, the following ASICs may need a separate table.
 */
#define hawaii_cache_info	kaveri_cache_info
#define tonga_cache_info	carrizo_cache_info
#define fiji_cache_info		carrizo_cache_info
#define polaris10_cache_info	carrizo_cache_info
#define polaris11_cache_info	carrizo_cache_info
#define polaris12_cache_info	carrizo_cache_info
#define vegam_cache_info	carrizo_cache_info
/* TODO - check & update Vega10 cache details */
#define vega10_cache_info	carrizo_cache_info
#define raven_cache_info	carrizo_cache_info
#define renoir_cache_info	carrizo_cache_info
/* TODO - check & update Navi10 cache details */
#define navi10_cache_info	carrizo_cache_info
3a87177e | 144 | |
174de876 FK |
145 | static void kfd_populated_cu_info_cpu(struct kfd_topology_device *dev, |
146 | struct crat_subtype_computeunit *cu) | |
147 | { | |
148 | dev->node_props.cpu_cores_count = cu->num_cpu_cores; | |
149 | dev->node_props.cpu_core_id_base = cu->processor_id_low; | |
150 | if (cu->hsa_capability & CRAT_CU_FLAGS_IOMMU_PRESENT) | |
151 | dev->node_props.capability |= HSA_CAP_ATS_PRESENT; | |
152 | ||
42aa8793 | 153 | pr_debug("CU CPU: cores=%d id_base=%d\n", cu->num_cpu_cores, |
174de876 FK |
154 | cu->processor_id_low); |
155 | } | |
156 | ||
157 | static void kfd_populated_cu_info_gpu(struct kfd_topology_device *dev, | |
158 | struct crat_subtype_computeunit *cu) | |
159 | { | |
160 | dev->node_props.simd_id_base = cu->processor_id_low; | |
161 | dev->node_props.simd_count = cu->num_simd_cores; | |
162 | dev->node_props.lds_size_in_kb = cu->lds_size_in_kb; | |
163 | dev->node_props.max_waves_per_simd = cu->max_waves_simd; | |
164 | dev->node_props.wave_front_size = cu->wave_front_size; | |
3a87177e | 165 | dev->node_props.array_count = cu->array_count; |
174de876 FK |
166 | dev->node_props.cu_per_simd_array = cu->num_cu_per_array; |
167 | dev->node_props.simd_per_cu = cu->num_simd_per_cu; | |
168 | dev->node_props.max_slots_scratch_cu = cu->max_slots_scatch_cu; | |
169 | if (cu->hsa_capability & CRAT_CU_FLAGS_HOT_PLUGGABLE) | |
170 | dev->node_props.capability |= HSA_CAP_HOT_PLUGGABLE; | |
42aa8793 | 171 | pr_debug("CU GPU: id_base=%d\n", cu->processor_id_low); |
174de876 FK |
172 | } |
173 | ||
4f449311 HK |
174 | /* kfd_parse_subtype_cu - parse compute unit subtypes and attach it to correct |
175 | * topology device present in the device_list | |
176 | */ | |
177 | static int kfd_parse_subtype_cu(struct crat_subtype_computeunit *cu, | |
178 | struct list_head *device_list) | |
174de876 FK |
179 | { |
180 | struct kfd_topology_device *dev; | |
174de876 | 181 | |
42aa8793 | 182 | pr_debug("Found CU entry in CRAT table with proximity_domain=%d caps=%x\n", |
174de876 | 183 | cu->proximity_domain, cu->hsa_capability); |
4f449311 HK |
184 | list_for_each_entry(dev, device_list, list) { |
185 | if (cu->proximity_domain == dev->proximity_domain) { | |
174de876 FK |
186 | if (cu->flags & CRAT_CU_FLAGS_CPU_PRESENT) |
187 | kfd_populated_cu_info_cpu(dev, cu); | |
188 | ||
189 | if (cu->flags & CRAT_CU_FLAGS_GPU_PRESENT) | |
190 | kfd_populated_cu_info_gpu(dev, cu); | |
191 | break; | |
192 | } | |
174de876 FK |
193 | } |
194 | ||
195 | return 0; | |
196 | } | |
197 | ||
f3ed5df8 YZ |
198 | static struct kfd_mem_properties * |
199 | find_subtype_mem(uint32_t heap_type, uint32_t flags, uint32_t width, | |
200 | struct kfd_topology_device *dev) | |
201 | { | |
202 | struct kfd_mem_properties *props; | |
203 | ||
204 | list_for_each_entry(props, &dev->mem_props, list) { | |
205 | if (props->heap_type == heap_type | |
206 | && props->flags == flags | |
207 | && props->width == width) | |
208 | return props; | |
209 | } | |
210 | ||
211 | return NULL; | |
212 | } | |
4f449311 HK |
213 | /* kfd_parse_subtype_mem - parse memory subtypes and attach it to correct |
214 | * topology device present in the device_list | |
174de876 | 215 | */ |
4f449311 HK |
216 | static int kfd_parse_subtype_mem(struct crat_subtype_memory *mem, |
217 | struct list_head *device_list) | |
174de876 FK |
218 | { |
219 | struct kfd_mem_properties *props; | |
220 | struct kfd_topology_device *dev; | |
f3ed5df8 YZ |
221 | uint32_t heap_type; |
222 | uint64_t size_in_bytes; | |
223 | uint32_t flags = 0; | |
224 | uint32_t width; | |
174de876 | 225 | |
42aa8793 | 226 | pr_debug("Found memory entry in CRAT table with proximity_domain=%d\n", |
174de876 | 227 | mem->proximity_domain); |
4f449311 HK |
228 | list_for_each_entry(dev, device_list, list) { |
229 | if (mem->proximity_domain == dev->proximity_domain) { | |
3a87177e HK |
230 | /* We're on GPU node */ |
231 | if (dev->node_props.cpu_cores_count == 0) { | |
232 | /* APU */ | |
233 | if (mem->visibility_type == 0) | |
f3ed5df8 | 234 | heap_type = |
3a87177e HK |
235 | HSA_MEM_HEAP_TYPE_FB_PRIVATE; |
236 | /* dGPU */ | |
237 | else | |
f3ed5df8 | 238 | heap_type = mem->visibility_type; |
3a87177e | 239 | } else |
f3ed5df8 | 240 | heap_type = HSA_MEM_HEAP_TYPE_SYSTEM; |
174de876 FK |
241 | |
242 | if (mem->flags & CRAT_MEM_FLAGS_HOT_PLUGGABLE) | |
f3ed5df8 | 243 | flags |= HSA_MEM_FLAGS_HOT_PLUGGABLE; |
174de876 | 244 | if (mem->flags & CRAT_MEM_FLAGS_NON_VOLATILE) |
f3ed5df8 | 245 | flags |= HSA_MEM_FLAGS_NON_VOLATILE; |
174de876 | 246 | |
f3ed5df8 | 247 | size_in_bytes = |
174de876 FK |
248 | ((uint64_t)mem->length_high << 32) + |
249 | mem->length_low; | |
f3ed5df8 YZ |
250 | width = mem->width; |
251 | ||
252 | /* Multiple banks of the same type are aggregated into | |
253 | * one. User mode doesn't care about multiple physical | |
254 | * memory segments. It's managed as a single virtual | |
255 | * heap for user mode. | |
256 | */ | |
257 | props = find_subtype_mem(heap_type, flags, width, dev); | |
258 | if (props) { | |
259 | props->size_in_bytes += size_in_bytes; | |
260 | break; | |
261 | } | |
262 | ||
263 | props = kfd_alloc_struct(props); | |
264 | if (!props) | |
265 | return -ENOMEM; | |
266 | ||
267 | props->heap_type = heap_type; | |
268 | props->flags = flags; | |
269 | props->size_in_bytes = size_in_bytes; | |
270 | props->width = width; | |
174de876 | 271 | |
175b9263 | 272 | dev->node_props.mem_banks_count++; |
174de876 FK |
273 | list_add_tail(&props->list, &dev->mem_props); |
274 | ||
275 | break; | |
276 | } | |
174de876 FK |
277 | } |
278 | ||
279 | return 0; | |
280 | } | |
281 | ||
4f449311 HK |
282 | /* kfd_parse_subtype_cache - parse cache subtypes and attach it to correct |
283 | * topology device present in the device_list | |
174de876 | 284 | */ |
4f449311 HK |
285 | static int kfd_parse_subtype_cache(struct crat_subtype_cache *cache, |
286 | struct list_head *device_list) | |
174de876 FK |
287 | { |
288 | struct kfd_cache_properties *props; | |
289 | struct kfd_topology_device *dev; | |
290 | uint32_t id; | |
3a87177e | 291 | uint32_t total_num_of_cu; |
174de876 FK |
292 | |
293 | id = cache->processor_id_low; | |
294 | ||
42aa8793 | 295 | pr_debug("Found cache entry in CRAT table with processor_id=%d\n", id); |
3a87177e HK |
296 | list_for_each_entry(dev, device_list, list) { |
297 | total_num_of_cu = (dev->node_props.array_count * | |
298 | dev->node_props.cu_per_simd_array); | |
299 | ||
300 | /* Cache infomration in CRAT doesn't have proximity_domain | |
301 | * information as it is associated with a CPU core or GPU | |
302 | * Compute Unit. So map the cache using CPU core Id or SIMD | |
303 | * (GPU) ID. | |
304 | * TODO: This works because currently we can safely assume that | |
305 | * Compute Units are parsed before caches are parsed. In | |
306 | * future, remove this dependency | |
307 | */ | |
308 | if ((id >= dev->node_props.cpu_core_id_base && | |
309 | id <= dev->node_props.cpu_core_id_base + | |
310 | dev->node_props.cpu_cores_count) || | |
311 | (id >= dev->node_props.simd_id_base && | |
312 | id < dev->node_props.simd_id_base + | |
313 | total_num_of_cu)) { | |
174de876 FK |
314 | props = kfd_alloc_struct(props); |
315 | if (!props) | |
316 | return -ENOMEM; | |
317 | ||
318 | props->processor_id_low = id; | |
319 | props->cache_level = cache->cache_level; | |
320 | props->cache_size = cache->cache_size; | |
321 | props->cacheline_size = cache->cache_line_size; | |
322 | props->cachelines_per_tag = cache->lines_per_tag; | |
323 | props->cache_assoc = cache->associativity; | |
324 | props->cache_latency = cache->cache_latency; | |
3a87177e HK |
325 | memcpy(props->sibling_map, cache->sibling_map, |
326 | sizeof(props->sibling_map)); | |
174de876 FK |
327 | |
328 | if (cache->flags & CRAT_CACHE_FLAGS_DATA_CACHE) | |
329 | props->cache_type |= HSA_CACHE_TYPE_DATA; | |
330 | if (cache->flags & CRAT_CACHE_FLAGS_INST_CACHE) | |
331 | props->cache_type |= HSA_CACHE_TYPE_INSTRUCTION; | |
332 | if (cache->flags & CRAT_CACHE_FLAGS_CPU_CACHE) | |
333 | props->cache_type |= HSA_CACHE_TYPE_CPU; | |
334 | if (cache->flags & CRAT_CACHE_FLAGS_SIMD_CACHE) | |
335 | props->cache_type |= HSA_CACHE_TYPE_HSACU; | |
336 | ||
337 | dev->cache_count++; | |
338 | dev->node_props.caches_count++; | |
339 | list_add_tail(&props->list, &dev->cache_props); | |
340 | ||
341 | break; | |
342 | } | |
3a87177e | 343 | } |
174de876 FK |
344 | |
345 | return 0; | |
346 | } | |
347 | ||
4f449311 HK |
348 | /* kfd_parse_subtype_iolink - parse iolink subtypes and attach it to correct |
349 | * topology device present in the device_list | |
174de876 | 350 | */ |
4f449311 HK |
351 | static int kfd_parse_subtype_iolink(struct crat_subtype_iolink *iolink, |
352 | struct list_head *device_list) | |
174de876 | 353 | { |
3a87177e | 354 | struct kfd_iolink_properties *props = NULL, *props2; |
ae9a25ae | 355 | struct kfd_topology_device *dev, *to_dev; |
174de876 FK |
356 | uint32_t id_from; |
357 | uint32_t id_to; | |
358 | ||
359 | id_from = iolink->proximity_domain_from; | |
360 | id_to = iolink->proximity_domain_to; | |
361 | ||
67f7cf9f | 362 | pr_debug("Found IO link entry in CRAT table with id_from=%d, id_to %d\n", |
363 | id_from, id_to); | |
4f449311 HK |
364 | list_for_each_entry(dev, device_list, list) { |
365 | if (id_from == dev->proximity_domain) { | |
174de876 FK |
366 | props = kfd_alloc_struct(props); |
367 | if (!props) | |
368 | return -ENOMEM; | |
369 | ||
370 | props->node_from = id_from; | |
371 | props->node_to = id_to; | |
372 | props->ver_maj = iolink->version_major; | |
373 | props->ver_min = iolink->version_minor; | |
3a87177e | 374 | props->iolink_type = iolink->io_interface_type; |
174de876 | 375 | |
3a87177e HK |
376 | if (props->iolink_type == CRAT_IOLINK_TYPE_PCIEXPRESS) |
377 | props->weight = 20; | |
ae9a25ae | 378 | else if (props->iolink_type == CRAT_IOLINK_TYPE_XGMI) |
0fb0df03 | 379 | props->weight = 15 * iolink->num_hops_xgmi; |
3a87177e HK |
380 | else |
381 | props->weight = node_distance(id_from, id_to); | |
174de876 FK |
382 | |
383 | props->min_latency = iolink->minimum_latency; | |
384 | props->max_latency = iolink->maximum_latency; | |
385 | props->min_bandwidth = iolink->minimum_bandwidth_mbs; | |
386 | props->max_bandwidth = iolink->maximum_bandwidth_mbs; | |
387 | props->rec_transfer_size = | |
388 | iolink->recommended_transfer_size; | |
389 | ||
390 | dev->io_link_count++; | |
391 | dev->node_props.io_links_count++; | |
392 | list_add_tail(&props->list, &dev->io_link_props); | |
174de876 FK |
393 | break; |
394 | } | |
174de876 FK |
395 | } |
396 | ||
3a87177e HK |
397 | /* CPU topology is created before GPUs are detected, so CPU->GPU |
398 | * links are not built at that time. If a PCIe type is discovered, it | |
399 | * means a GPU is detected and we are adding GPU->CPU to the topology. | |
67f7cf9f | 400 | * At this time, also add the corresponded CPU->GPU link if GPU |
401 | * is large bar. | |
ae9a25ae SL |
402 | * For xGMI, we only added the link with one direction in the crat |
403 | * table, add corresponded reversed direction link now. | |
3a87177e | 404 | */ |
67f7cf9f | 405 | if (props && (iolink->flags & CRAT_IOLINK_FLAGS_BI_DIRECTIONAL)) { |
ae9a25ae SL |
406 | to_dev = kfd_topology_device_by_proximity_domain(id_to); |
407 | if (!to_dev) | |
3a87177e HK |
408 | return -ENODEV; |
409 | /* same everything but the other direction */ | |
410 | props2 = kmemdup(props, sizeof(*props2), GFP_KERNEL); | |
411 | props2->node_from = id_to; | |
412 | props2->node_to = id_from; | |
413 | props2->kobj = NULL; | |
ae9a25ae SL |
414 | to_dev->io_link_count++; |
415 | to_dev->node_props.io_links_count++; | |
416 | list_add_tail(&props2->list, &to_dev->io_link_props); | |
3a87177e HK |
417 | } |
418 | ||
174de876 FK |
419 | return 0; |
420 | } | |
421 | ||
4f449311 HK |
422 | /* kfd_parse_subtype - parse subtypes and attach it to correct topology device |
423 | * present in the device_list | |
424 | * @sub_type_hdr - subtype section of crat_image | |
425 | * @device_list - list of topology devices present in this crat_image | |
426 | */ | |
427 | static int kfd_parse_subtype(struct crat_subtype_generic *sub_type_hdr, | |
428 | struct list_head *device_list) | |
174de876 FK |
429 | { |
430 | struct crat_subtype_computeunit *cu; | |
431 | struct crat_subtype_memory *mem; | |
432 | struct crat_subtype_cache *cache; | |
433 | struct crat_subtype_iolink *iolink; | |
434 | int ret = 0; | |
435 | ||
436 | switch (sub_type_hdr->type) { | |
437 | case CRAT_SUBTYPE_COMPUTEUNIT_AFFINITY: | |
438 | cu = (struct crat_subtype_computeunit *)sub_type_hdr; | |
4f449311 | 439 | ret = kfd_parse_subtype_cu(cu, device_list); |
174de876 FK |
440 | break; |
441 | case CRAT_SUBTYPE_MEMORY_AFFINITY: | |
442 | mem = (struct crat_subtype_memory *)sub_type_hdr; | |
4f449311 | 443 | ret = kfd_parse_subtype_mem(mem, device_list); |
174de876 FK |
444 | break; |
445 | case CRAT_SUBTYPE_CACHE_AFFINITY: | |
446 | cache = (struct crat_subtype_cache *)sub_type_hdr; | |
4f449311 | 447 | ret = kfd_parse_subtype_cache(cache, device_list); |
174de876 FK |
448 | break; |
449 | case CRAT_SUBTYPE_TLB_AFFINITY: | |
450 | /* | |
451 | * For now, nothing to do here | |
452 | */ | |
42aa8793 | 453 | pr_debug("Found TLB entry in CRAT table (not processing)\n"); |
174de876 FK |
454 | break; |
455 | case CRAT_SUBTYPE_CCOMPUTE_AFFINITY: | |
456 | /* | |
457 | * For now, nothing to do here | |
458 | */ | |
42aa8793 | 459 | pr_debug("Found CCOMPUTE entry in CRAT table (not processing)\n"); |
174de876 FK |
460 | break; |
461 | case CRAT_SUBTYPE_IOLINK_AFFINITY: | |
462 | iolink = (struct crat_subtype_iolink *)sub_type_hdr; | |
4f449311 | 463 | ret = kfd_parse_subtype_iolink(iolink, device_list); |
174de876 FK |
464 | break; |
465 | default: | |
466 | pr_warn("Unknown subtype %d in CRAT\n", | |
467 | sub_type_hdr->type); | |
468 | } | |
469 | ||
470 | return ret; | |
471 | } | |
472 | ||
4f449311 HK |
473 | /* kfd_parse_crat_table - parse CRAT table. For each node present in CRAT |
474 | * create a kfd_topology_device and add in to device_list. Also parse | |
475 | * CRAT subtypes and attach it to appropriate kfd_topology_device | |
476 | * @crat_image - input image containing CRAT | |
477 | * @device_list - [OUT] list of kfd_topology_device generated after | |
478 | * parsing crat_image | |
479 | * @proximity_domain - Proximity domain of the first device in the table | |
480 | * | |
481 | * Return - 0 if successful else -ve value | |
482 | */ | |
483 | int kfd_parse_crat_table(void *crat_image, struct list_head *device_list, | |
484 | uint32_t proximity_domain) | |
174de876 | 485 | { |
520b8fb7 | 486 | struct kfd_topology_device *top_dev = NULL; |
174de876 FK |
487 | struct crat_subtype_generic *sub_type_hdr; |
488 | uint16_t node_id; | |
4f449311 | 489 | int ret = 0; |
174de876 FK |
490 | struct crat_header *crat_table = (struct crat_header *)crat_image; |
491 | uint16_t num_nodes; | |
492 | uint32_t image_len; | |
493 | ||
494 | if (!crat_image) | |
495 | return -EINVAL; | |
496 | ||
4f449311 HK |
497 | if (!list_empty(device_list)) { |
498 | pr_warn("Error device list should be empty\n"); | |
499 | return -EINVAL; | |
500 | } | |
501 | ||
174de876 FK |
502 | num_nodes = crat_table->num_domains; |
503 | image_len = crat_table->length; | |
504 | ||
505 | pr_info("Parsing CRAT table with %d nodes\n", num_nodes); | |
506 | ||
507 | for (node_id = 0; node_id < num_nodes; node_id++) { | |
4f449311 HK |
508 | top_dev = kfd_create_topology_device(device_list); |
509 | if (!top_dev) | |
510 | break; | |
511 | top_dev->proximity_domain = proximity_domain++; | |
512 | } | |
513 | ||
514 | if (!top_dev) { | |
515 | ret = -ENOMEM; | |
516 | goto err; | |
174de876 FK |
517 | } |
518 | ||
520b8fb7 FK |
519 | memcpy(top_dev->oem_id, crat_table->oem_id, CRAT_OEMID_LENGTH); |
520 | memcpy(top_dev->oem_table_id, crat_table->oem_table_id, | |
521 | CRAT_OEMTABLEID_LENGTH); | |
522 | top_dev->oem_revision = crat_table->oem_revision; | |
174de876 FK |
523 | |
524 | sub_type_hdr = (struct crat_subtype_generic *)(crat_table+1); | |
525 | while ((char *)sub_type_hdr + sizeof(struct crat_subtype_generic) < | |
526 | ((char *)crat_image) + image_len) { | |
527 | if (sub_type_hdr->flags & CRAT_SUBTYPE_FLAGS_ENABLED) { | |
4f449311 HK |
528 | ret = kfd_parse_subtype(sub_type_hdr, device_list); |
529 | if (ret) | |
530 | break; | |
174de876 FK |
531 | } |
532 | ||
533 | sub_type_hdr = (typeof(sub_type_hdr))((char *)sub_type_hdr + | |
534 | sub_type_hdr->length); | |
535 | } | |
536 | ||
4f449311 HK |
537 | err: |
538 | if (ret) | |
539 | kfd_release_topology_device_list(device_list); | |
174de876 | 540 | |
4f449311 | 541 | return ret; |
174de876 FK |
542 | } |
543 | ||
3a87177e HK |
544 | /* Helper function. See kfd_fill_gpu_cache_info for parameter description */ |
545 | static int fill_in_pcache(struct crat_subtype_cache *pcache, | |
546 | struct kfd_gpu_cache_info *pcache_info, | |
547 | struct kfd_cu_info *cu_info, | |
548 | int mem_available, | |
549 | int cu_bitmask, | |
550 | int cache_type, unsigned int cu_processor_id, | |
551 | int cu_block) | |
552 | { | |
553 | unsigned int cu_sibling_map_mask; | |
554 | int first_active_cu; | |
555 | ||
556 | /* First check if enough memory is available */ | |
557 | if (sizeof(struct crat_subtype_cache) > mem_available) | |
558 | return -ENOMEM; | |
559 | ||
560 | cu_sibling_map_mask = cu_bitmask; | |
561 | cu_sibling_map_mask >>= cu_block; | |
562 | cu_sibling_map_mask &= | |
563 | ((1 << pcache_info[cache_type].num_cu_shared) - 1); | |
564 | first_active_cu = ffs(cu_sibling_map_mask); | |
565 | ||
566 | /* CU could be inactive. In case of shared cache find the first active | |
567 | * CU. and incase of non-shared cache check if the CU is inactive. If | |
568 | * inactive active skip it | |
569 | */ | |
570 | if (first_active_cu) { | |
571 | memset(pcache, 0, sizeof(struct crat_subtype_cache)); | |
572 | pcache->type = CRAT_SUBTYPE_CACHE_AFFINITY; | |
573 | pcache->length = sizeof(struct crat_subtype_cache); | |
574 | pcache->flags = pcache_info[cache_type].flags; | |
575 | pcache->processor_id_low = cu_processor_id | |
576 | + (first_active_cu - 1); | |
577 | pcache->cache_level = pcache_info[cache_type].cache_level; | |
578 | pcache->cache_size = pcache_info[cache_type].cache_size; | |
579 | ||
580 | /* Sibling map is w.r.t processor_id_low, so shift out | |
581 | * inactive CU | |
582 | */ | |
583 | cu_sibling_map_mask = | |
584 | cu_sibling_map_mask >> (first_active_cu - 1); | |
585 | ||
586 | pcache->sibling_map[0] = (uint8_t)(cu_sibling_map_mask & 0xFF); | |
587 | pcache->sibling_map[1] = | |
588 | (uint8_t)((cu_sibling_map_mask >> 8) & 0xFF); | |
589 | pcache->sibling_map[2] = | |
590 | (uint8_t)((cu_sibling_map_mask >> 16) & 0xFF); | |
591 | pcache->sibling_map[3] = | |
592 | (uint8_t)((cu_sibling_map_mask >> 24) & 0xFF); | |
593 | return 0; | |
594 | } | |
595 | return 1; | |
596 | } | |
597 | ||
598 | /* kfd_fill_gpu_cache_info - Fill GPU cache info using kfd_gpu_cache_info | |
599 | * tables | |
600 | * | |
601 | * @kdev - [IN] GPU device | |
602 | * @gpu_processor_id - [IN] GPU processor ID to which these caches | |
603 | * associate | |
604 | * @available_size - [IN] Amount of memory available in pcache | |
605 | * @cu_info - [IN] Compute Unit info obtained from KGD | |
606 | * @pcache - [OUT] memory into which cache data is to be filled in. | |
607 | * @size_filled - [OUT] amount of data used up in pcache. | |
608 | * @num_of_entries - [OUT] number of caches added | |
609 | */ | |
610 | static int kfd_fill_gpu_cache_info(struct kfd_dev *kdev, | |
611 | int gpu_processor_id, | |
612 | int available_size, | |
613 | struct kfd_cu_info *cu_info, | |
614 | struct crat_subtype_cache *pcache, | |
615 | int *size_filled, | |
616 | int *num_of_entries) | |
617 | { | |
618 | struct kfd_gpu_cache_info *pcache_info; | |
619 | int num_of_cache_types = 0; | |
620 | int i, j, k; | |
621 | int ct = 0; | |
622 | int mem_available = available_size; | |
623 | unsigned int cu_processor_id; | |
624 | int ret; | |
625 | ||
626 | switch (kdev->device_info->asic_family) { | |
627 | case CHIP_KAVERI: | |
628 | pcache_info = kaveri_cache_info; | |
629 | num_of_cache_types = ARRAY_SIZE(kaveri_cache_info); | |
630 | break; | |
631 | case CHIP_HAWAII: | |
632 | pcache_info = hawaii_cache_info; | |
633 | num_of_cache_types = ARRAY_SIZE(hawaii_cache_info); | |
634 | break; | |
635 | case CHIP_CARRIZO: | |
636 | pcache_info = carrizo_cache_info; | |
637 | num_of_cache_types = ARRAY_SIZE(carrizo_cache_info); | |
638 | break; | |
639 | case CHIP_TONGA: | |
640 | pcache_info = tonga_cache_info; | |
641 | num_of_cache_types = ARRAY_SIZE(tonga_cache_info); | |
642 | break; | |
643 | case CHIP_FIJI: | |
644 | pcache_info = fiji_cache_info; | |
645 | num_of_cache_types = ARRAY_SIZE(fiji_cache_info); | |
646 | break; | |
647 | case CHIP_POLARIS10: | |
648 | pcache_info = polaris10_cache_info; | |
649 | num_of_cache_types = ARRAY_SIZE(polaris10_cache_info); | |
650 | break; | |
651 | case CHIP_POLARIS11: | |
652 | pcache_info = polaris11_cache_info; | |
653 | num_of_cache_types = ARRAY_SIZE(polaris11_cache_info); | |
654 | break; | |
846a44d7 GB |
655 | case CHIP_POLARIS12: |
656 | pcache_info = polaris12_cache_info; | |
657 | num_of_cache_types = ARRAY_SIZE(polaris12_cache_info); | |
658 | break; | |
ed81cd6e KR |
659 | case CHIP_VEGAM: |
660 | pcache_info = vegam_cache_info; | |
661 | num_of_cache_types = ARRAY_SIZE(vegam_cache_info); | |
662 | break; | |
389056e5 | 663 | case CHIP_VEGA10: |
846a44d7 | 664 | case CHIP_VEGA12: |
22a3a294 | 665 | case CHIP_VEGA20: |
49adcf8a | 666 | case CHIP_ARCTURUS: |
389056e5 FK |
667 | pcache_info = vega10_cache_info; |
668 | num_of_cache_types = ARRAY_SIZE(vega10_cache_info); | |
669 | break; | |
670 | case CHIP_RAVEN: | |
671 | pcache_info = raven_cache_info; | |
672 | num_of_cache_types = ARRAY_SIZE(raven_cache_info); | |
737298d1 | 673 | break; |
a8d42f17 HR |
674 | case CHIP_RENOIR: |
675 | pcache_info = renoir_cache_info; | |
676 | num_of_cache_types = ARRAY_SIZE(renoir_cache_info); | |
677 | break; | |
14328aa5 | 678 | case CHIP_NAVI10: |
0e94b564 | 679 | case CHIP_NAVI12: |
8099ae40 | 680 | case CHIP_NAVI14: |
14328aa5 PC |
681 | pcache_info = navi10_cache_info; |
682 | num_of_cache_types = ARRAY_SIZE(navi10_cache_info); | |
389056e5 | 683 | break; |
3a87177e HK |
684 | default: |
685 | return -EINVAL; | |
686 | } | |
687 | ||
688 | *size_filled = 0; | |
689 | *num_of_entries = 0; | |
690 | ||
691 | /* For each type of cache listed in the kfd_gpu_cache_info table, | |
692 | * go through all available Compute Units. | |
693 | * The [i,j,k] loop will | |
694 | * if kfd_gpu_cache_info.num_cu_shared = 1 | |
695 | * will parse through all available CU | |
696 | * If (kfd_gpu_cache_info.num_cu_shared != 1) | |
697 | * then it will consider only one CU from | |
698 | * the shared unit | |
699 | */ | |
700 | ||
701 | for (ct = 0; ct < num_of_cache_types; ct++) { | |
702 | cu_processor_id = gpu_processor_id; | |
703 | for (i = 0; i < cu_info->num_shader_engines; i++) { | |
704 | for (j = 0; j < cu_info->num_shader_arrays_per_engine; | |
705 | j++) { | |
706 | for (k = 0; k < cu_info->num_cu_per_sh; | |
707 | k += pcache_info[ct].num_cu_shared) { | |
708 | ||
709 | ret = fill_in_pcache(pcache, | |
710 | pcache_info, | |
711 | cu_info, | |
712 | mem_available, | |
713 | cu_info->cu_bitmap[i][j], | |
714 | ct, | |
715 | cu_processor_id, | |
716 | k); | |
717 | ||
718 | if (ret < 0) | |
719 | break; | |
720 | ||
721 | if (!ret) { | |
722 | pcache++; | |
723 | (*num_of_entries)++; | |
724 | mem_available -= | |
725 | sizeof(*pcache); | |
726 | (*size_filled) += | |
727 | sizeof(*pcache); | |
728 | } | |
729 | ||
730 | /* Move to next CU block */ | |
731 | cu_processor_id += | |
732 | pcache_info[ct].num_cu_shared; | |
733 | } | |
734 | } | |
735 | } | |
736 | } | |
737 | ||
738 | pr_debug("Added [%d] GPU cache entries\n", *num_of_entries); | |
739 | ||
740 | return 0; | |
741 | } | |
742 | ||
8e05247d HK |
743 | /* |
744 | * kfd_create_crat_image_acpi - Allocates memory for CRAT image and | |
745 | * copies CRAT from ACPI (if available). | |
746 | * NOTE: Call kfd_destroy_crat_image to free CRAT image memory | |
747 | * | |
748 | * @crat_image: CRAT read from ACPI. If no CRAT in ACPI then | |
749 | * crat_image will be NULL | |
750 | * @size: [OUT] size of crat_image | |
751 | * | |
752 | * Return 0 if successful else return error code | |
753 | */ | |
754 | int kfd_create_crat_image_acpi(void **crat_image, size_t *size) | |
174de876 FK |
755 | { |
756 | struct acpi_table_header *crat_table; | |
757 | acpi_status status; | |
8e05247d | 758 | void *pcrat_image; |
174de876 | 759 | |
8e05247d | 760 | if (!crat_image) |
174de876 FK |
761 | return -EINVAL; |
762 | ||
8e05247d HK |
763 | *crat_image = NULL; |
764 | ||
765 | /* Fetch the CRAT table from ACPI */ | |
174de876 FK |
766 | status = acpi_get_table(CRAT_SIGNATURE, 0, &crat_table); |
767 | if (status == AE_NOT_FOUND) { | |
768 | pr_warn("CRAT table not found\n"); | |
769 | return -ENODATA; | |
770 | } else if (ACPI_FAILURE(status)) { | |
771 | const char *err = acpi_format_exception(status); | |
772 | ||
773 | pr_err("CRAT table error: %s\n", err); | |
774 | return -EINVAL; | |
775 | } | |
776 | ||
ebcfd1e2 FK |
777 | if (ignore_crat) { |
778 | pr_info("CRAT table disabled by module option\n"); | |
779 | return -ENODATA; | |
780 | } | |
781 | ||
6dfeb11a | 782 | pcrat_image = kmemdup(crat_table, crat_table->length, GFP_KERNEL); |
8e05247d HK |
783 | if (!pcrat_image) |
784 | return -ENOMEM; | |
785 | ||
8e05247d | 786 | *crat_image = pcrat_image; |
174de876 FK |
787 | *size = crat_table->length; |
788 | ||
789 | return 0; | |
790 | } | |
8e05247d | 791 | |
520b8fb7 FK |
/* Memory required to create a Virtual CRAT.
 * There is no easy way to predict how much memory is needed, so the fixed
 * amounts below are allocated for the CPU and the GPU Virtual CRAT. They are
 * expected to cover all known configurations. To be safe, the code filling
 * in the image also checks the remaining space so it does not write past the
 * end.
 */
#define VCRAT_SIZE_FOR_CPU	(2 * PAGE_SIZE)
#define VCRAT_SIZE_FOR_GPU	(4 * PAGE_SIZE)
800 | |
801 | /* kfd_fill_cu_for_cpu - Fill in Compute info for the given CPU NUMA node | |
802 | * | |
803 | * @numa_node_id: CPU NUMA node id | |
804 | * @avail_size: Available size in the memory | |
805 | * @sub_type_hdr: Memory into which compute info will be filled in | |
806 | * | |
807 | * Return 0 if successful else return -ve value | |
808 | */ | |
809 | static int kfd_fill_cu_for_cpu(int numa_node_id, int *avail_size, | |
810 | int proximity_domain, | |
811 | struct crat_subtype_computeunit *sub_type_hdr) | |
812 | { | |
813 | const struct cpumask *cpumask; | |
814 | ||
815 | *avail_size -= sizeof(struct crat_subtype_computeunit); | |
816 | if (*avail_size < 0) | |
817 | return -ENOMEM; | |
818 | ||
819 | memset(sub_type_hdr, 0, sizeof(struct crat_subtype_computeunit)); | |
820 | ||
821 | /* Fill in subtype header data */ | |
822 | sub_type_hdr->type = CRAT_SUBTYPE_COMPUTEUNIT_AFFINITY; | |
823 | sub_type_hdr->length = sizeof(struct crat_subtype_computeunit); | |
824 | sub_type_hdr->flags = CRAT_SUBTYPE_FLAGS_ENABLED; | |
825 | ||
826 | cpumask = cpumask_of_node(numa_node_id); | |
827 | ||
828 | /* Fill in CU data */ | |
829 | sub_type_hdr->flags |= CRAT_CU_FLAGS_CPU_PRESENT; | |
830 | sub_type_hdr->proximity_domain = proximity_domain; | |
831 | sub_type_hdr->processor_id_low = kfd_numa_node_to_apic_id(numa_node_id); | |
832 | if (sub_type_hdr->processor_id_low == -1) | |
833 | return -EINVAL; | |
834 | ||
835 | sub_type_hdr->num_cpu_cores = cpumask_weight(cpumask); | |
836 | ||
837 | return 0; | |
838 | } | |
839 | ||
840 | /* kfd_fill_mem_info_for_cpu - Fill in Memory info for the given CPU NUMA node | |
841 | * | |
842 | * @numa_node_id: CPU NUMA node id | |
843 | * @avail_size: Available size in the memory | |
844 | * @sub_type_hdr: Memory into which compute info will be filled in | |
845 | * | |
846 | * Return 0 if successful else return -ve value | |
847 | */ | |
848 | static int kfd_fill_mem_info_for_cpu(int numa_node_id, int *avail_size, | |
849 | int proximity_domain, | |
850 | struct crat_subtype_memory *sub_type_hdr) | |
851 | { | |
852 | uint64_t mem_in_bytes = 0; | |
853 | pg_data_t *pgdat; | |
854 | int zone_type; | |
855 | ||
856 | *avail_size -= sizeof(struct crat_subtype_memory); | |
857 | if (*avail_size < 0) | |
858 | return -ENOMEM; | |
859 | ||
860 | memset(sub_type_hdr, 0, sizeof(struct crat_subtype_memory)); | |
861 | ||
862 | /* Fill in subtype header data */ | |
863 | sub_type_hdr->type = CRAT_SUBTYPE_MEMORY_AFFINITY; | |
864 | sub_type_hdr->length = sizeof(struct crat_subtype_memory); | |
865 | sub_type_hdr->flags = CRAT_SUBTYPE_FLAGS_ENABLED; | |
866 | ||
867 | /* Fill in Memory Subunit data */ | |
868 | ||
869 | /* Unlike si_meminfo, si_meminfo_node is not exported. So | |
870 | * the following lines are duplicated from si_meminfo_node | |
871 | * function | |
872 | */ | |
873 | pgdat = NODE_DATA(numa_node_id); | |
874 | for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) | |
9705bea5 | 875 | mem_in_bytes += zone_managed_pages(&pgdat->node_zones[zone_type]); |
520b8fb7 FK |
876 | mem_in_bytes <<= PAGE_SHIFT; |
877 | ||
878 | sub_type_hdr->length_low = lower_32_bits(mem_in_bytes); | |
879 | sub_type_hdr->length_high = upper_32_bits(mem_in_bytes); | |
880 | sub_type_hdr->proximity_domain = proximity_domain; | |
881 | ||
882 | return 0; | |
883 | } | |
884 | ||
#ifdef CONFIG_X86_64
/* kfd_fill_iolink_info_for_cpu - Emit IO link entries from one CPU NUMA
 *		node to every other online node
 *
 *	@numa_node_id: CPU NUMA node id the links originate from
 *	@avail_size: [IN/OUT] bytes still free in the image; reduced by the
 *		size of one io link entry per link written
 *	@num_entries: [OUT] number of io link entries written
 *	@sub_type_hdr: Memory into which the entries are written back to back
 *
 *	Return 0 if successful else return -ENOMEM when an entry does not fit
 */
static int kfd_fill_iolink_info_for_cpu(int numa_node_id, int *avail_size,
				uint32_t *num_entries,
				struct crat_subtype_iolink *sub_type_hdr)
{
	struct crat_subtype_iolink *link = sub_type_hdr;
	/* Interface type is chosen from the vendor of CPU 0 */
	const uint8_t link_type =
		(cpu_data(0).x86_vendor == X86_VENDOR_AMD) ?
			CRAT_IOLINK_TYPE_HYPERTRANSPORT :
			CRAT_IOLINK_TYPE_QPI_1_1;
	int peer;

	*num_entries = 0;

	/* One link per online node other than the node itself */
	for_each_online_node(peer) {
		if (peer == numa_node_id)
			continue;

		*avail_size -= sizeof(*link);
		if (*avail_size < 0)
			return -ENOMEM;

		memset(link, 0, sizeof(*link));

		/* Subtype header */
		link->type = CRAT_SUBTYPE_IOLINK_AFFINITY;
		link->length = sizeof(*link);
		link->flags = CRAT_SUBTYPE_FLAGS_ENABLED;

		/* IO link payload */
		link->proximity_domain_from = numa_node_id;
		link->proximity_domain_to = peer;
		link->io_interface_type = link_type;

		link++;
		(*num_entries)++;
	}

	return 0;
}
#endif
520b8fb7 FK |
929 | |
930 | /* kfd_create_vcrat_image_cpu - Create Virtual CRAT for CPU | |
931 | * | |
932 | * @pcrat_image: Fill in VCRAT for CPU | |
933 | * @size: [IN] allocated size of crat_image. | |
934 | * [OUT] actual size of data filled in crat_image | |
935 | */ | |
936 | static int kfd_create_vcrat_image_cpu(void *pcrat_image, size_t *size) | |
937 | { | |
938 | struct crat_header *crat_table = (struct crat_header *)pcrat_image; | |
939 | struct acpi_table_header *acpi_table; | |
940 | acpi_status status; | |
941 | struct crat_subtype_generic *sub_type_hdr; | |
942 | int avail_size = *size; | |
943 | int numa_node_id; | |
d1c234e2 | 944 | #ifdef CONFIG_X86_64 |
520b8fb7 | 945 | uint32_t entries = 0; |
d1c234e2 | 946 | #endif |
520b8fb7 FK |
947 | int ret = 0; |
948 | ||
949 | if (!pcrat_image || avail_size < VCRAT_SIZE_FOR_CPU) | |
950 | return -EINVAL; | |
951 | ||
952 | /* Fill in CRAT Header. | |
953 | * Modify length and total_entries as subunits are added. | |
954 | */ | |
955 | avail_size -= sizeof(struct crat_header); | |
956 | if (avail_size < 0) | |
957 | return -ENOMEM; | |
958 | ||
959 | memset(crat_table, 0, sizeof(struct crat_header)); | |
960 | memcpy(&crat_table->signature, CRAT_SIGNATURE, | |
961 | sizeof(crat_table->signature)); | |
962 | crat_table->length = sizeof(struct crat_header); | |
963 | ||
964 | status = acpi_get_table("DSDT", 0, &acpi_table); | |
48a44387 | 965 | if (status != AE_OK) |
520b8fb7 FK |
966 | pr_warn("DSDT table not found for OEM information\n"); |
967 | else { | |
968 | crat_table->oem_revision = acpi_table->revision; | |
969 | memcpy(crat_table->oem_id, acpi_table->oem_id, | |
970 | CRAT_OEMID_LENGTH); | |
971 | memcpy(crat_table->oem_table_id, acpi_table->oem_table_id, | |
972 | CRAT_OEMTABLEID_LENGTH); | |
973 | } | |
974 | crat_table->total_entries = 0; | |
975 | crat_table->num_domains = 0; | |
976 | ||
977 | sub_type_hdr = (struct crat_subtype_generic *)(crat_table+1); | |
978 | ||
979 | for_each_online_node(numa_node_id) { | |
980 | if (kfd_numa_node_to_apic_id(numa_node_id) == -1) | |
981 | continue; | |
982 | ||
983 | /* Fill in Subtype: Compute Unit */ | |
984 | ret = kfd_fill_cu_for_cpu(numa_node_id, &avail_size, | |
985 | crat_table->num_domains, | |
986 | (struct crat_subtype_computeunit *)sub_type_hdr); | |
987 | if (ret < 0) | |
988 | return ret; | |
989 | crat_table->length += sub_type_hdr->length; | |
990 | crat_table->total_entries++; | |
991 | ||
992 | sub_type_hdr = (typeof(sub_type_hdr))((char *)sub_type_hdr + | |
993 | sub_type_hdr->length); | |
994 | ||
995 | /* Fill in Subtype: Memory */ | |
996 | ret = kfd_fill_mem_info_for_cpu(numa_node_id, &avail_size, | |
997 | crat_table->num_domains, | |
998 | (struct crat_subtype_memory *)sub_type_hdr); | |
999 | if (ret < 0) | |
1000 | return ret; | |
1001 | crat_table->length += sub_type_hdr->length; | |
1002 | crat_table->total_entries++; | |
1003 | ||
1004 | sub_type_hdr = (typeof(sub_type_hdr))((char *)sub_type_hdr + | |
1005 | sub_type_hdr->length); | |
1006 | ||
1007 | /* Fill in Subtype: IO Link */ | |
d1c234e2 | 1008 | #ifdef CONFIG_X86_64 |
520b8fb7 FK |
1009 | ret = kfd_fill_iolink_info_for_cpu(numa_node_id, &avail_size, |
1010 | &entries, | |
1011 | (struct crat_subtype_iolink *)sub_type_hdr); | |
1012 | if (ret < 0) | |
1013 | return ret; | |
1014 | crat_table->length += (sub_type_hdr->length * entries); | |
1015 | crat_table->total_entries += entries; | |
1016 | ||
1017 | sub_type_hdr = (typeof(sub_type_hdr))((char *)sub_type_hdr + | |
1018 | sub_type_hdr->length * entries); | |
d1c234e2 FK |
1019 | #else |
1020 | pr_info("IO link not available for non x86 platforms\n"); | |
1021 | #endif | |
520b8fb7 FK |
1022 | |
1023 | crat_table->num_domains++; | |
1024 | } | |
1025 | ||
1026 | /* TODO: Add cache Subtype for CPU. | |
1027 | * Currently, CPU cache information is available in function | |
1028 | * detect_cache_attributes(cpu) defined in the file | |
1029 | * ./arch/x86/kernel/cpu/intel_cacheinfo.c. This function is not | |
1030 | * exported and to get the same information the code needs to be | |
1031 | * duplicated. | |
1032 | */ | |
1033 | ||
1034 | *size = crat_table->length; | |
1035 | pr_info("Virtual CRAT table created for CPU\n"); | |
1036 | ||
1037 | return 0; | |
1038 | } | |
1039 | ||
3a87177e HK |
1040 | static int kfd_fill_gpu_memory_affinity(int *avail_size, |
1041 | struct kfd_dev *kdev, uint8_t type, uint64_t size, | |
1042 | struct crat_subtype_memory *sub_type_hdr, | |
1043 | uint32_t proximity_domain, | |
1044 | const struct kfd_local_mem_info *local_mem_info) | |
1045 | { | |
1046 | *avail_size -= sizeof(struct crat_subtype_memory); | |
1047 | if (*avail_size < 0) | |
1048 | return -ENOMEM; | |
1049 | ||
1050 | memset((void *)sub_type_hdr, 0, sizeof(struct crat_subtype_memory)); | |
1051 | sub_type_hdr->type = CRAT_SUBTYPE_MEMORY_AFFINITY; | |
1052 | sub_type_hdr->length = sizeof(struct crat_subtype_memory); | |
1053 | sub_type_hdr->flags |= CRAT_SUBTYPE_FLAGS_ENABLED; | |
1054 | ||
1055 | sub_type_hdr->proximity_domain = proximity_domain; | |
1056 | ||
1057 | pr_debug("Fill gpu memory affinity - type 0x%x size 0x%llx\n", | |
1058 | type, size); | |
1059 | ||
1060 | sub_type_hdr->length_low = lower_32_bits(size); | |
1061 | sub_type_hdr->length_high = upper_32_bits(size); | |
1062 | ||
1063 | sub_type_hdr->width = local_mem_info->vram_width; | |
1064 | sub_type_hdr->visibility_type = type; | |
1065 | ||
1066 | return 0; | |
1067 | } | |
1068 | ||
1069 | /* kfd_fill_gpu_direct_io_link - Fill in direct io link from GPU | |
1070 | * to its NUMA node | |
1071 | * @avail_size: Available size in the memory | |
1072 | * @kdev - [IN] GPU device | |
1073 | * @sub_type_hdr: Memory into which io link info will be filled in | |
1074 | * @proximity_domain - proximity domain of the GPU node | |
1075 | * | |
1076 | * Return 0 if successful else return -ve value | |
1077 | */ | |
ae9a25ae | 1078 | static int kfd_fill_gpu_direct_io_link_to_cpu(int *avail_size, |
3a87177e HK |
1079 | struct kfd_dev *kdev, |
1080 | struct crat_subtype_iolink *sub_type_hdr, | |
1081 | uint32_t proximity_domain) | |
1082 | { | |
1083 | *avail_size -= sizeof(struct crat_subtype_iolink); | |
1084 | if (*avail_size < 0) | |
1085 | return -ENOMEM; | |
1086 | ||
1087 | memset((void *)sub_type_hdr, 0, sizeof(struct crat_subtype_iolink)); | |
1088 | ||
1089 | /* Fill in subtype header data */ | |
1090 | sub_type_hdr->type = CRAT_SUBTYPE_IOLINK_AFFINITY; | |
1091 | sub_type_hdr->length = sizeof(struct crat_subtype_iolink); | |
1092 | sub_type_hdr->flags |= CRAT_SUBTYPE_FLAGS_ENABLED; | |
67f7cf9f | 1093 | if (kfd_dev_is_large_bar(kdev)) |
1094 | sub_type_hdr->flags |= CRAT_IOLINK_FLAGS_BI_DIRECTIONAL; | |
3a87177e HK |
1095 | |
1096 | /* Fill in IOLINK subtype. | |
1097 | * TODO: Fill-in other fields of iolink subtype | |
1098 | */ | |
1099 | sub_type_hdr->io_interface_type = CRAT_IOLINK_TYPE_PCIEXPRESS; | |
1100 | sub_type_hdr->proximity_domain_from = proximity_domain; | |
1101 | #ifdef CONFIG_NUMA | |
1102 | if (kdev->pdev->dev.numa_node == NUMA_NO_NODE) | |
1103 | sub_type_hdr->proximity_domain_to = 0; | |
1104 | else | |
1105 | sub_type_hdr->proximity_domain_to = kdev->pdev->dev.numa_node; | |
1106 | #else | |
1107 | sub_type_hdr->proximity_domain_to = 0; | |
1108 | #endif | |
1109 | return 0; | |
1110 | } | |
1111 | ||
ae9a25ae SL |
1112 | static int kfd_fill_gpu_xgmi_link_to_gpu(int *avail_size, |
1113 | struct kfd_dev *kdev, | |
0fb0df03 | 1114 | struct kfd_dev *peer_kdev, |
ae9a25ae SL |
1115 | struct crat_subtype_iolink *sub_type_hdr, |
1116 | uint32_t proximity_domain_from, | |
1117 | uint32_t proximity_domain_to) | |
1118 | { | |
1119 | *avail_size -= sizeof(struct crat_subtype_iolink); | |
1120 | if (*avail_size < 0) | |
1121 | return -ENOMEM; | |
1122 | ||
1123 | memset((void *)sub_type_hdr, 0, sizeof(struct crat_subtype_iolink)); | |
1124 | ||
1125 | sub_type_hdr->type = CRAT_SUBTYPE_IOLINK_AFFINITY; | |
1126 | sub_type_hdr->length = sizeof(struct crat_subtype_iolink); | |
67f7cf9f | 1127 | sub_type_hdr->flags |= CRAT_SUBTYPE_FLAGS_ENABLED | |
1128 | CRAT_IOLINK_FLAGS_BI_DIRECTIONAL; | |
ae9a25ae SL |
1129 | |
1130 | sub_type_hdr->io_interface_type = CRAT_IOLINK_TYPE_XGMI; | |
1131 | sub_type_hdr->proximity_domain_from = proximity_domain_from; | |
1132 | sub_type_hdr->proximity_domain_to = proximity_domain_to; | |
0fb0df03 | 1133 | sub_type_hdr->num_hops_xgmi = |
1134 | amdgpu_amdkfd_get_xgmi_hops_count(kdev->kgd, peer_kdev->kgd); | |
ae9a25ae SL |
1135 | return 0; |
1136 | } | |
1137 | ||
3a87177e HK |
1138 | /* kfd_create_vcrat_image_gpu - Create Virtual CRAT for CPU |
1139 | * | |
1140 | * @pcrat_image: Fill in VCRAT for GPU | |
1141 | * @size: [IN] allocated size of crat_image. | |
1142 | * [OUT] actual size of data filled in crat_image | |
1143 | */ | |
1144 | static int kfd_create_vcrat_image_gpu(void *pcrat_image, | |
1145 | size_t *size, struct kfd_dev *kdev, | |
1146 | uint32_t proximity_domain) | |
1147 | { | |
1148 | struct crat_header *crat_table = (struct crat_header *)pcrat_image; | |
1149 | struct crat_subtype_generic *sub_type_hdr; | |
ae9a25ae SL |
1150 | struct kfd_local_mem_info local_mem_info; |
1151 | struct kfd_topology_device *peer_dev; | |
3a87177e HK |
1152 | struct crat_subtype_computeunit *cu; |
1153 | struct kfd_cu_info cu_info; | |
3a87177e HK |
1154 | int avail_size = *size; |
1155 | uint32_t total_num_of_cu; | |
1156 | int num_of_cache_entries = 0; | |
1157 | int cache_mem_filled = 0; | |
ae9a25ae | 1158 | uint32_t nid = 0; |
3a87177e | 1159 | int ret = 0; |
3a87177e HK |
1160 | |
1161 | if (!pcrat_image || avail_size < VCRAT_SIZE_FOR_GPU) | |
1162 | return -EINVAL; | |
1163 | ||
1164 | /* Fill the CRAT Header. | |
1165 | * Modify length and total_entries as subunits are added. | |
1166 | */ | |
1167 | avail_size -= sizeof(struct crat_header); | |
1168 | if (avail_size < 0) | |
1169 | return -ENOMEM; | |
1170 | ||
1171 | memset(crat_table, 0, sizeof(struct crat_header)); | |
1172 | ||
1173 | memcpy(&crat_table->signature, CRAT_SIGNATURE, | |
1174 | sizeof(crat_table->signature)); | |
1175 | /* Change length as we add more subtypes*/ | |
1176 | crat_table->length = sizeof(struct crat_header); | |
1177 | crat_table->num_domains = 1; | |
1178 | crat_table->total_entries = 0; | |
1179 | ||
1180 | /* Fill in Subtype: Compute Unit | |
1181 | * First fill in the sub type header and then sub type data | |
1182 | */ | |
1183 | avail_size -= sizeof(struct crat_subtype_computeunit); | |
1184 | if (avail_size < 0) | |
1185 | return -ENOMEM; | |
1186 | ||
1187 | sub_type_hdr = (struct crat_subtype_generic *)(crat_table + 1); | |
1188 | memset(sub_type_hdr, 0, sizeof(struct crat_subtype_computeunit)); | |
1189 | ||
1190 | sub_type_hdr->type = CRAT_SUBTYPE_COMPUTEUNIT_AFFINITY; | |
1191 | sub_type_hdr->length = sizeof(struct crat_subtype_computeunit); | |
1192 | sub_type_hdr->flags = CRAT_SUBTYPE_FLAGS_ENABLED; | |
1193 | ||
1194 | /* Fill CU subtype data */ | |
1195 | cu = (struct crat_subtype_computeunit *)sub_type_hdr; | |
1196 | cu->flags |= CRAT_CU_FLAGS_GPU_PRESENT; | |
1197 | cu->proximity_domain = proximity_domain; | |
1198 | ||
7cd52c91 | 1199 | amdgpu_amdkfd_get_cu_info(kdev->kgd, &cu_info); |
3a87177e HK |
1200 | cu->num_simd_per_cu = cu_info.simd_per_cu; |
1201 | cu->num_simd_cores = cu_info.simd_per_cu * cu_info.cu_active_number; | |
1202 | cu->max_waves_simd = cu_info.max_waves_per_simd; | |
1203 | ||
1204 | cu->wave_front_size = cu_info.wave_front_size; | |
1205 | cu->array_count = cu_info.num_shader_arrays_per_engine * | |
1206 | cu_info.num_shader_engines; | |
1207 | total_num_of_cu = (cu->array_count * cu_info.num_cu_per_sh); | |
1208 | cu->processor_id_low = get_and_inc_gpu_processor_id(total_num_of_cu); | |
1209 | cu->num_cu_per_array = cu_info.num_cu_per_sh; | |
1210 | cu->max_slots_scatch_cu = cu_info.max_scratch_slots_per_cu; | |
1211 | cu->num_banks = cu_info.num_shader_engines; | |
1212 | cu->lds_size_in_kb = cu_info.lds_size; | |
1213 | ||
1214 | cu->hsa_capability = 0; | |
1215 | ||
1216 | /* Check if this node supports IOMMU. During parsing this flag will | |
1217 | * translate to HSA_CAP_ATS_PRESENT | |
1218 | */ | |
64d1c3a4 FK |
1219 | if (!kfd_iommu_check_device(kdev)) |
1220 | cu->hsa_capability |= CRAT_CU_FLAGS_IOMMU_PRESENT; | |
3a87177e HK |
1221 | |
1222 | crat_table->length += sub_type_hdr->length; | |
1223 | crat_table->total_entries++; | |
1224 | ||
1225 | /* Fill in Subtype: Memory. Only on systems with large BAR (no | |
1226 | * private FB), report memory as public. On other systems | |
1227 | * report the total FB size (public+private) as a single | |
1228 | * private heap. | |
1229 | */ | |
7cd52c91 | 1230 | amdgpu_amdkfd_get_local_mem_info(kdev->kgd, &local_mem_info); |
3a87177e HK |
1231 | sub_type_hdr = (typeof(sub_type_hdr))((char *)sub_type_hdr + |
1232 | sub_type_hdr->length); | |
1233 | ||
374200b1 FK |
1234 | if (debug_largebar) |
1235 | local_mem_info.local_mem_size_private = 0; | |
1236 | ||
3a87177e HK |
1237 | if (local_mem_info.local_mem_size_private == 0) |
1238 | ret = kfd_fill_gpu_memory_affinity(&avail_size, | |
1239 | kdev, HSA_MEM_HEAP_TYPE_FB_PUBLIC, | |
1240 | local_mem_info.local_mem_size_public, | |
1241 | (struct crat_subtype_memory *)sub_type_hdr, | |
1242 | proximity_domain, | |
1243 | &local_mem_info); | |
1244 | else | |
1245 | ret = kfd_fill_gpu_memory_affinity(&avail_size, | |
1246 | kdev, HSA_MEM_HEAP_TYPE_FB_PRIVATE, | |
1247 | local_mem_info.local_mem_size_public + | |
1248 | local_mem_info.local_mem_size_private, | |
1249 | (struct crat_subtype_memory *)sub_type_hdr, | |
1250 | proximity_domain, | |
1251 | &local_mem_info); | |
1252 | if (ret < 0) | |
1253 | return ret; | |
1254 | ||
1255 | crat_table->length += sizeof(struct crat_subtype_memory); | |
1256 | crat_table->total_entries++; | |
1257 | ||
1258 | /* TODO: Fill in cache information. This information is NOT readily | |
1259 | * available in KGD | |
1260 | */ | |
1261 | sub_type_hdr = (typeof(sub_type_hdr))((char *)sub_type_hdr + | |
1262 | sub_type_hdr->length); | |
1263 | ret = kfd_fill_gpu_cache_info(kdev, cu->processor_id_low, | |
1264 | avail_size, | |
1265 | &cu_info, | |
1266 | (struct crat_subtype_cache *)sub_type_hdr, | |
1267 | &cache_mem_filled, | |
1268 | &num_of_cache_entries); | |
1269 | ||
1270 | if (ret < 0) | |
1271 | return ret; | |
1272 | ||
1273 | crat_table->length += cache_mem_filled; | |
1274 | crat_table->total_entries += num_of_cache_entries; | |
1275 | avail_size -= cache_mem_filled; | |
1276 | ||
1277 | /* Fill in Subtype: IO_LINKS | |
1278 | * Only direct links are added here which is Link from GPU to | |
1279 | * to its NUMA node. Indirect links are added by userspace. | |
1280 | */ | |
1281 | sub_type_hdr = (typeof(sub_type_hdr))((char *)sub_type_hdr + | |
1282 | cache_mem_filled); | |
ae9a25ae | 1283 | ret = kfd_fill_gpu_direct_io_link_to_cpu(&avail_size, kdev, |
3a87177e HK |
1284 | (struct crat_subtype_iolink *)sub_type_hdr, proximity_domain); |
1285 | ||
1286 | if (ret < 0) | |
1287 | return ret; | |
1288 | ||
1289 | crat_table->length += sub_type_hdr->length; | |
1290 | crat_table->total_entries++; | |
1291 | ||
ae9a25ae SL |
1292 | |
1293 | /* Fill in Subtype: IO_LINKS | |
1294 | * Direct links from GPU to other GPUs through xGMI. | |
1295 | * We will loop GPUs that already be processed (with lower value | |
1296 | * of proximity_domain), add the link for the GPUs with same | |
1297 | * hive id (from this GPU to other GPU) . The reversed iolink | |
1298 | * (from other GPU to this GPU) will be added | |
1299 | * in kfd_parse_subtype_iolink. | |
1300 | */ | |
1301 | if (kdev->hive_id) { | |
1302 | for (nid = 0; nid < proximity_domain; ++nid) { | |
1303 | peer_dev = kfd_topology_device_by_proximity_domain(nid); | |
1304 | if (!peer_dev->gpu) | |
1305 | continue; | |
1306 | if (peer_dev->gpu->hive_id != kdev->hive_id) | |
1307 | continue; | |
1308 | sub_type_hdr = (typeof(sub_type_hdr))( | |
1309 | (char *)sub_type_hdr + | |
1310 | sizeof(struct crat_subtype_iolink)); | |
1311 | ret = kfd_fill_gpu_xgmi_link_to_gpu( | |
0fb0df03 | 1312 | &avail_size, kdev, peer_dev->gpu, |
ae9a25ae SL |
1313 | (struct crat_subtype_iolink *)sub_type_hdr, |
1314 | proximity_domain, nid); | |
1315 | if (ret < 0) | |
1316 | return ret; | |
1317 | crat_table->length += sub_type_hdr->length; | |
1318 | crat_table->total_entries++; | |
1319 | } | |
1320 | } | |
3a87177e HK |
1321 | *size = crat_table->length; |
1322 | pr_info("Virtual CRAT table created for GPU\n"); | |
1323 | ||
1324 | return ret; | |
1325 | } | |
1326 | ||
520b8fb7 FK |
1327 | /* kfd_create_crat_image_virtual - Allocates memory for CRAT image and |
1328 | * creates a Virtual CRAT (VCRAT) image | |
1329 | * | |
1330 | * NOTE: Call kfd_destroy_crat_image to free CRAT image memory | |
1331 | * | |
1332 | * @crat_image: VCRAT image created because ACPI does not have a | |
1333 | * CRAT for this device | |
1334 | * @size: [OUT] size of virtual crat_image | |
1335 | * @flags: COMPUTE_UNIT_CPU - Create VCRAT for CPU device | |
1336 | * COMPUTE_UNIT_GPU - Create VCRAT for GPU | |
1337 | * (COMPUTE_UNIT_CPU | COMPUTE_UNIT_GPU) - Create VCRAT for APU | |
1338 | * -- this option is not currently implemented. | |
1339 | * The assumption is that all AMD APUs will have CRAT | |
1340 | * @kdev: Valid kfd_device required if flags contain COMPUTE_UNIT_GPU | |
1341 | * | |
1342 | * Return 0 if successful else return -ve value | |
1343 | */ | |
1344 | int kfd_create_crat_image_virtual(void **crat_image, size_t *size, | |
1345 | int flags, struct kfd_dev *kdev, | |
1346 | uint32_t proximity_domain) | |
1347 | { | |
1348 | void *pcrat_image = NULL; | |
1349 | int ret = 0; | |
1350 | ||
1351 | if (!crat_image) | |
1352 | return -EINVAL; | |
1353 | ||
1354 | *crat_image = NULL; | |
1355 | ||
1356 | /* Allocate one VCRAT_SIZE_FOR_CPU for CPU virtual CRAT image and | |
1357 | * VCRAT_SIZE_FOR_GPU for GPU virtual CRAT image. This should cover | |
1358 | * all the current conditions. A check is put not to overwrite beyond | |
1359 | * allocated size | |
1360 | */ | |
1361 | switch (flags) { | |
1362 | case COMPUTE_UNIT_CPU: | |
1363 | pcrat_image = kmalloc(VCRAT_SIZE_FOR_CPU, GFP_KERNEL); | |
1364 | if (!pcrat_image) | |
1365 | return -ENOMEM; | |
1366 | *size = VCRAT_SIZE_FOR_CPU; | |
1367 | ret = kfd_create_vcrat_image_cpu(pcrat_image, size); | |
1368 | break; | |
1369 | case COMPUTE_UNIT_GPU: | |
3a87177e HK |
1370 | if (!kdev) |
1371 | return -EINVAL; | |
1372 | pcrat_image = kmalloc(VCRAT_SIZE_FOR_GPU, GFP_KERNEL); | |
1373 | if (!pcrat_image) | |
1374 | return -ENOMEM; | |
1375 | *size = VCRAT_SIZE_FOR_GPU; | |
1376 | ret = kfd_create_vcrat_image_gpu(pcrat_image, size, kdev, | |
1377 | proximity_domain); | |
520b8fb7 FK |
1378 | break; |
1379 | case (COMPUTE_UNIT_CPU | COMPUTE_UNIT_GPU): | |
1380 | /* TODO: */ | |
1381 | ret = -EINVAL; | |
1382 | pr_err("VCRAT not implemented for APU\n"); | |
1383 | break; | |
1384 | default: | |
1385 | ret = -EINVAL; | |
1386 | } | |
1387 | ||
1388 | if (!ret) | |
1389 | *crat_image = pcrat_image; | |
1390 | else | |
1391 | kfree(pcrat_image); | |
1392 | ||
1393 | return ret; | |
1394 | } | |
1395 | ||
1396 | ||
/* kfd_destroy_crat_image - Free a CRAT image buffer
 *
 *	@crat_image: [IN] - crat_image from kfd_create_crat_image_xxx(..)
 *
 *	The pointer is handed straight to kfree().
 */
void kfd_destroy_crat_image(void *crat_image)
{
	kfree(crat_image);
}