Commit | Line | Data |
---|---|---|
b2441318 | 1 | // SPDX-License-Identifier: GPL-2.0 |
b8ef9172 TH |
2 | /* |
3 | * NUMA emulation | |
4 | */ | |
5 | #include <linux/kernel.h> | |
6 | #include <linux/errno.h> | |
7 | #include <linux/topology.h> | |
8 | #include <linux/memblock.h> | |
1b7e03ef | 9 | #include <linux/bootmem.h> |
b8ef9172 TH |
10 | #include <asm/dma.h> |
11 | ||
12 | #include "numa_internal.h" | |
13 | ||
148f9bb8 | 14 | static int emu_nid_to_phys[MAX_NUMNODES]; |
b8ef9172 TH |
15 | static char *emu_cmdline __initdata; |
16 | ||
17 | void __init numa_emu_cmdline(char *str) | |
18 | { | |
19 | emu_cmdline = str; | |
20 | } | |
21 | ||
22 | static int __init emu_find_memblk_by_nid(int nid, const struct numa_meminfo *mi) | |
23 | { | |
24 | int i; | |
25 | ||
26 | for (i = 0; i < mi->nr_blks; i++) | |
27 | if (mi->blk[i].nid == nid) | |
28 | return i; | |
29 | return -ENOENT; | |
30 | } | |
31 | ||
e37aade3 | 32 | static u64 __init mem_hole_size(u64 start, u64 end) |
474b881b TH |
33 | { |
34 | unsigned long start_pfn = PFN_UP(start); | |
35 | unsigned long end_pfn = PFN_DOWN(end); | |
36 | ||
37 | if (start_pfn < end_pfn) | |
38 | return PFN_PHYS(absent_pages_in_range(start_pfn, end_pfn)); | |
39 | return 0; | |
40 | } | |
41 | ||
b8ef9172 TH |
42 | /* |
43 | * Sets up nid to range from @start to @end. The return value is -errno if | |
44 | * something went wrong, 0 otherwise. | |
45 | */ | |
46 | static int __init emu_setup_memblk(struct numa_meminfo *ei, | |
47 | struct numa_meminfo *pi, | |
48 | int nid, int phys_blk, u64 size) | |
49 | { | |
50 | struct numa_memblk *eb = &ei->blk[ei->nr_blks]; | |
51 | struct numa_memblk *pb = &pi->blk[phys_blk]; | |
52 | ||
53 | if (ei->nr_blks >= NR_NODE_MEMBLKS) { | |
54 | pr_err("NUMA: Too many emulated memblks, failing emulation\n"); | |
55 | return -EINVAL; | |
56 | } | |
57 | ||
58 | ei->nr_blks++; | |
59 | eb->start = pb->start; | |
60 | eb->end = pb->start + size; | |
61 | eb->nid = nid; | |
62 | ||
63 | if (emu_nid_to_phys[nid] == NUMA_NO_NODE) | |
d71b5a73 | 64 | emu_nid_to_phys[nid] = nid; |
b8ef9172 TH |
65 | |
66 | pb->start += size; | |
67 | if (pb->start >= pb->end) { | |
68 | WARN_ON_ONCE(pb->start > pb->end); | |
69 | numa_remove_memblk_from(phys_blk, pi); | |
70 | } | |
71 | ||
365811d6 BH |
72 | printk(KERN_INFO "Faking node %d at [mem %#018Lx-%#018Lx] (%LuMB)\n", |
73 | nid, eb->start, eb->end - 1, (eb->end - eb->start) >> 20); | |
b8ef9172 TH |
74 | return 0; |
75 | } | |
76 | ||
77 | /* | |
78 | * Sets up nr_nodes fake nodes interleaved over physical nodes ranging from addr | |
d80a9eb3 WY |
79 | * to max_addr. |
80 | * | |
81 | * Returns zero on success or negative on error. | |
b8ef9172 TH |
82 | */ |
83 | static int __init split_nodes_interleave(struct numa_meminfo *ei, | |
84 | struct numa_meminfo *pi, | |
85 | u64 addr, u64 max_addr, int nr_nodes) | |
86 | { | |
d80a9eb3 | 87 | nodemask_t physnode_mask = numa_nodes_parsed; |
b8ef9172 TH |
88 | u64 size; |
89 | int big; | |
90 | int nid = 0; | |
91 | int i, ret; | |
92 | ||
93 | if (nr_nodes <= 0) | |
94 | return -1; | |
95 | if (nr_nodes > MAX_NUMNODES) { | |
96 | pr_info("numa=fake=%d too large, reducing to %d\n", | |
97 | nr_nodes, MAX_NUMNODES); | |
98 | nr_nodes = MAX_NUMNODES; | |
99 | } | |
100 | ||
1b7e03ef TH |
101 | /* |
102 | * Calculate target node size. x86_32 freaks on __udivdi3() so do | |
103 | * the division in ulong number of pages and convert back. | |
104 | */ | |
474b881b | 105 | size = max_addr - addr - mem_hole_size(addr, max_addr); |
1b7e03ef TH |
106 | size = PFN_PHYS((unsigned long)(size >> PAGE_SHIFT) / nr_nodes); |
107 | ||
b8ef9172 TH |
108 | /* |
109 | * Calculate the number of big nodes that can be allocated as a result | |
110 | * of consolidating the remainder. | |
111 | */ | |
112 | big = ((size & ~FAKE_NODE_MIN_HASH_MASK) * nr_nodes) / | |
113 | FAKE_NODE_MIN_SIZE; | |
114 | ||
115 | size &= FAKE_NODE_MIN_HASH_MASK; | |
116 | if (!size) { | |
117 | pr_err("Not enough memory for each node. " | |
118 | "NUMA emulation disabled.\n"); | |
119 | return -1; | |
120 | } | |
121 | ||
b8ef9172 TH |
122 | /* |
123 | * Continue to fill physical nodes with fake nodes until there is no | |
124 | * memory left on any of them. | |
125 | */ | |
126 | while (nodes_weight(physnode_mask)) { | |
127 | for_each_node_mask(i, physnode_mask) { | |
128 | u64 dma32_end = PFN_PHYS(MAX_DMA32_PFN); | |
129 | u64 start, limit, end; | |
130 | int phys_blk; | |
131 | ||
132 | phys_blk = emu_find_memblk_by_nid(i, pi); | |
133 | if (phys_blk < 0) { | |
134 | node_clear(i, physnode_mask); | |
135 | continue; | |
136 | } | |
137 | start = pi->blk[phys_blk].start; | |
138 | limit = pi->blk[phys_blk].end; | |
139 | end = start + size; | |
140 | ||
141 | if (nid < big) | |
142 | end += FAKE_NODE_MIN_SIZE; | |
143 | ||
144 | /* | |
145 | * Continue to add memory to this fake node if its | |
146 | * non-reserved memory is less than the per-node size. | |
147 | */ | |
474b881b | 148 | while (end - start - mem_hole_size(start, end) < size) { |
b8ef9172 TH |
149 | end += FAKE_NODE_MIN_SIZE; |
150 | if (end > limit) { | |
151 | end = limit; | |
152 | break; | |
153 | } | |
154 | } | |
155 | ||
156 | /* | |
157 | * If there won't be at least FAKE_NODE_MIN_SIZE of | |
158 | * non-reserved memory in ZONE_DMA32 for the next node, | |
159 | * this one must extend to the boundary. | |
160 | */ | |
161 | if (end < dma32_end && dma32_end - end - | |
474b881b | 162 | mem_hole_size(end, dma32_end) < FAKE_NODE_MIN_SIZE) |
b8ef9172 TH |
163 | end = dma32_end; |
164 | ||
165 | /* | |
166 | * If there won't be enough non-reserved memory for the | |
167 | * next node, this one must extend to the end of the | |
168 | * physical node. | |
169 | */ | |
474b881b | 170 | if (limit - end - mem_hole_size(end, limit) < size) |
b8ef9172 TH |
171 | end = limit; |
172 | ||
173 | ret = emu_setup_memblk(ei, pi, nid++ % nr_nodes, | |
174 | phys_blk, | |
175 | min(end, limit) - start); | |
176 | if (ret < 0) | |
177 | return ret; | |
178 | } | |
179 | } | |
180 | return 0; | |
181 | } | |
182 | ||
183 | /* | |
184 | * Returns the end address of a node so that there is at least `size' amount of | |
185 | * non-reserved memory or `max_addr' is reached. | |
186 | */ | |
187 | static u64 __init find_end_of_node(u64 start, u64 max_addr, u64 size) | |
188 | { | |
189 | u64 end = start + size; | |
190 | ||
474b881b | 191 | while (end - start - mem_hole_size(start, end) < size) { |
b8ef9172 TH |
192 | end += FAKE_NODE_MIN_SIZE; |
193 | if (end > max_addr) { | |
194 | end = max_addr; | |
195 | break; | |
196 | } | |
197 | } | |
198 | return end; | |
199 | } | |
200 | ||
201 | /* | |
202 | * Sets up fake nodes of `size' interleaved over physical nodes ranging from | |
d80a9eb3 WY |
203 | * `addr' to `max_addr'. |
204 | * | |
205 | * Returns zero on success or negative on error. | |
b8ef9172 TH |
206 | */ |
207 | static int __init split_nodes_size_interleave(struct numa_meminfo *ei, | |
208 | struct numa_meminfo *pi, | |
209 | u64 addr, u64 max_addr, u64 size) | |
210 | { | |
d80a9eb3 | 211 | nodemask_t physnode_mask = numa_nodes_parsed; |
b8ef9172 TH |
212 | u64 min_size; |
213 | int nid = 0; | |
214 | int i, ret; | |
215 | ||
216 | if (!size) | |
217 | return -1; | |
218 | /* | |
219 | * The limit on emulated nodes is MAX_NUMNODES, so the size per node is | |
220 | * increased accordingly if the requested size is too small. This | |
221 | * creates a uniform distribution of node sizes across the entire | |
222 | * machine (but not necessarily over physical nodes). | |
223 | */ | |
474b881b | 224 | min_size = (max_addr - addr - mem_hole_size(addr, max_addr)) / MAX_NUMNODES; |
b8ef9172 TH |
225 | min_size = max(min_size, FAKE_NODE_MIN_SIZE); |
226 | if ((min_size & FAKE_NODE_MIN_HASH_MASK) < min_size) | |
227 | min_size = (min_size + FAKE_NODE_MIN_SIZE) & | |
228 | FAKE_NODE_MIN_HASH_MASK; | |
229 | if (size < min_size) { | |
230 | pr_err("Fake node size %LuMB too small, increasing to %LuMB\n", | |
231 | size >> 20, min_size >> 20); | |
232 | size = min_size; | |
233 | } | |
234 | size &= FAKE_NODE_MIN_HASH_MASK; | |
235 | ||
b8ef9172 TH |
236 | /* |
237 | * Fill physical nodes with fake nodes of size until there is no memory | |
238 | * left on any of them. | |
239 | */ | |
240 | while (nodes_weight(physnode_mask)) { | |
241 | for_each_node_mask(i, physnode_mask) { | |
1b7e03ef | 242 | u64 dma32_end = PFN_PHYS(MAX_DMA32_PFN); |
b8ef9172 TH |
243 | u64 start, limit, end; |
244 | int phys_blk; | |
245 | ||
246 | phys_blk = emu_find_memblk_by_nid(i, pi); | |
247 | if (phys_blk < 0) { | |
248 | node_clear(i, physnode_mask); | |
249 | continue; | |
250 | } | |
251 | start = pi->blk[phys_blk].start; | |
252 | limit = pi->blk[phys_blk].end; | |
253 | ||
254 | end = find_end_of_node(start, limit, size); | |
255 | /* | |
256 | * If there won't be at least FAKE_NODE_MIN_SIZE of | |
257 | * non-reserved memory in ZONE_DMA32 for the next node, | |
258 | * this one must extend to the boundary. | |
259 | */ | |
260 | if (end < dma32_end && dma32_end - end - | |
474b881b | 261 | mem_hole_size(end, dma32_end) < FAKE_NODE_MIN_SIZE) |
b8ef9172 TH |
262 | end = dma32_end; |
263 | ||
264 | /* | |
265 | * If there won't be enough non-reserved memory for the | |
266 | * next node, this one must extend to the end of the | |
267 | * physical node. | |
268 | */ | |
474b881b | 269 | if (limit - end - mem_hole_size(end, limit) < size) |
b8ef9172 TH |
270 | end = limit; |
271 | ||
272 | ret = emu_setup_memblk(ei, pi, nid++ % MAX_NUMNODES, | |
273 | phys_blk, | |
274 | min(end, limit) - start); | |
275 | if (ret < 0) | |
276 | return ret; | |
277 | } | |
278 | } | |
279 | return 0; | |
280 | } | |
281 | ||
158f424f WY |
282 | int __init setup_emu2phys_nid(int *dfl_phys_nid) |
283 | { | |
284 | int i, max_emu_nid = 0; | |
285 | ||
286 | *dfl_phys_nid = NUMA_NO_NODE; | |
287 | for (i = 0; i < ARRAY_SIZE(emu_nid_to_phys); i++) { | |
288 | if (emu_nid_to_phys[i] != NUMA_NO_NODE) { | |
289 | max_emu_nid = i; | |
290 | if (*dfl_phys_nid == NUMA_NO_NODE) | |
291 | *dfl_phys_nid = emu_nid_to_phys[i]; | |
292 | } | |
293 | } | |
294 | ||
295 | return max_emu_nid; | |
296 | } | |
297 | ||
90e6b677 TH |
298 | /** |
299 | * numa_emulation - Emulate NUMA nodes | |
300 | * @numa_meminfo: NUMA configuration to massage | |
301 | * @numa_dist_cnt: The size of the physical NUMA distance table | |
302 | * | |
303 | * Emulate NUMA nodes according to the numa=fake kernel parameter. | |
304 | * @numa_meminfo contains the physical memory configuration and is modified | |
305 | * to reflect the emulated configuration on success. @numa_dist_cnt is | |
306 | * used to determine the size of the physical distance table. | |
307 | * | |
308 | * On success, the following modifications are made. | |
309 | * | |
310 | * - @numa_meminfo is updated to reflect the emulated nodes. | |
311 | * | |
312 | * - __apicid_to_node[] is updated such that APIC IDs are mapped to the | |
313 | * emulated nodes. | |
314 | * | |
315 | * - NUMA distance table is rebuilt to represent distances between emulated | |
316 | * nodes. The distances are determined considering how emulated nodes | |
317 | * are mapped to physical nodes and match the actual distances. | |
318 | * | |
319 | * - emu_nid_to_phys[] reflects how emulated nodes are mapped to physical | |
320 | * nodes. This is used by numa_add_cpu() and numa_remove_cpu(). | |
321 | * | |
322 | * If emulation is not enabled or fails, emu_nid_to_phys[] is filled with | |
323 | * identity mapping and no other modification is made. | |
b8ef9172 TH |
324 | */ |
325 | void __init numa_emulation(struct numa_meminfo *numa_meminfo, int numa_dist_cnt) | |
326 | { | |
327 | static struct numa_meminfo ei __initdata; | |
328 | static struct numa_meminfo pi __initdata; | |
1b7e03ef | 329 | const u64 max_addr = PFN_PHYS(max_pfn); |
b8ef9172 | 330 | u8 *phys_dist = NULL; |
ce003330 | 331 | size_t phys_size = numa_dist_cnt * numa_dist_cnt * sizeof(phys_dist[0]); |
56396e68 | 332 | int max_emu_nid, dfl_phys_nid; |
b8ef9172 TH |
333 | int i, j, ret; |
334 | ||
335 | if (!emu_cmdline) | |
336 | goto no_emu; | |
337 | ||
338 | memset(&ei, 0, sizeof(ei)); | |
339 | pi = *numa_meminfo; | |
340 | ||
341 | for (i = 0; i < MAX_NUMNODES; i++) | |
342 | emu_nid_to_phys[i] = NUMA_NO_NODE; | |
343 | ||
344 | /* | |
345 | * If the numa=fake command-line contains a 'M' or 'G', it represents | |
346 | * the fixed node size. Otherwise, if it is just a single number N, | |
347 | * split the system RAM into N fake nodes. | |
348 | */ | |
349 | if (strchr(emu_cmdline, 'M') || strchr(emu_cmdline, 'G')) { | |
350 | u64 size; | |
351 | ||
352 | size = memparse(emu_cmdline, &emu_cmdline); | |
353 | ret = split_nodes_size_interleave(&ei, &pi, 0, max_addr, size); | |
354 | } else { | |
355 | unsigned long n; | |
356 | ||
94c0dd32 | 357 | n = simple_strtoul(emu_cmdline, &emu_cmdline, 0); |
b8ef9172 TH |
358 | ret = split_nodes_interleave(&ei, &pi, 0, max_addr, n); |
359 | } | |
94c0dd32 PZ |
360 | if (*emu_cmdline == ':') |
361 | emu_cmdline++; | |
b8ef9172 TH |
362 | |
363 | if (ret < 0) | |
364 | goto no_emu; | |
365 | ||
366 | if (numa_cleanup_meminfo(&ei) < 0) { | |
367 | pr_warning("NUMA: Warning: constructed meminfo invalid, disabling emulation\n"); | |
368 | goto no_emu; | |
369 | } | |
370 | ||
ce003330 | 371 | /* copy the physical distance table */ |
b8ef9172 | 372 | if (numa_dist_cnt) { |
b8ef9172 TH |
373 | u64 phys; |
374 | ||
1b7e03ef | 375 | phys = memblock_find_in_range(0, PFN_PHYS(max_pfn_mapped), |
ce003330 | 376 | phys_size, PAGE_SIZE); |
1f5026a7 | 377 | if (!phys) { |
b8ef9172 TH |
378 | pr_warning("NUMA: Warning: can't allocate copy of distance table, disabling emulation\n"); |
379 | goto no_emu; | |
380 | } | |
24aa0788 | 381 | memblock_reserve(phys, phys_size); |
b8ef9172 TH |
382 | phys_dist = __va(phys); |
383 | ||
384 | for (i = 0; i < numa_dist_cnt; i++) | |
385 | for (j = 0; j < numa_dist_cnt; j++) | |
386 | phys_dist[i * numa_dist_cnt + j] = | |
387 | node_distance(i, j); | |
388 | } | |
389 | ||
56396e68 TH |
390 | /* |
391 | * Determine the max emulated nid and the default phys nid to use | |
392 | * for unmapped nodes. | |
393 | */ | |
158f424f | 394 | max_emu_nid = setup_emu2phys_nid(&dfl_phys_nid); |
078a1989 | 395 | |
b8ef9172 TH |
396 | /* commit */ |
397 | *numa_meminfo = ei; | |
398 | ||
4f167201 WY |
399 | /* Make sure numa_nodes_parsed only contains emulated nodes */ |
400 | nodes_clear(numa_nodes_parsed); | |
401 | for (i = 0; i < ARRAY_SIZE(ei.blk); i++) | |
402 | if (ei.blk[i].start != ei.blk[i].end && | |
403 | ei.blk[i].nid != NUMA_NO_NODE) | |
404 | node_set(ei.blk[i].nid, numa_nodes_parsed); | |
405 | ||
b8ef9172 TH |
406 | /* |
407 | * Transform __apicid_to_node table to use emulated nids by | |
408 | * reverse-mapping phys_nid. The maps should always exist but fall | |
409 | * back to zero just in case. | |
410 | */ | |
411 | for (i = 0; i < ARRAY_SIZE(__apicid_to_node); i++) { | |
412 | if (__apicid_to_node[i] == NUMA_NO_NODE) | |
413 | continue; | |
414 | for (j = 0; j < ARRAY_SIZE(emu_nid_to_phys); j++) | |
415 | if (__apicid_to_node[i] == emu_nid_to_phys[j]) | |
416 | break; | |
417 | __apicid_to_node[i] = j < ARRAY_SIZE(emu_nid_to_phys) ? j : 0; | |
418 | } | |
419 | ||
420 | /* make sure all emulated nodes are mapped to a physical node */ | |
421 | for (i = 0; i < ARRAY_SIZE(emu_nid_to_phys); i++) | |
422 | if (emu_nid_to_phys[i] == NUMA_NO_NODE) | |
078a1989 | 423 | emu_nid_to_phys[i] = dfl_phys_nid; |
b8ef9172 | 424 | |
56396e68 | 425 | /* transform distance table */ |
b8ef9172 | 426 | numa_reset_distance(); |
56396e68 TH |
427 | for (i = 0; i < max_emu_nid + 1; i++) { |
428 | for (j = 0; j < max_emu_nid + 1; j++) { | |
b8ef9172 TH |
429 | int physi = emu_nid_to_phys[i]; |
430 | int physj = emu_nid_to_phys[j]; | |
431 | int dist; | |
432 | ||
94c0dd32 PZ |
433 | if (get_option(&emu_cmdline, &dist) == 2) |
434 | ; | |
435 | else if (physi >= numa_dist_cnt || physj >= numa_dist_cnt) | |
b8ef9172 TH |
436 | dist = physi == physj ? |
437 | LOCAL_DISTANCE : REMOTE_DISTANCE; | |
438 | else | |
439 | dist = phys_dist[physi * numa_dist_cnt + physj]; | |
440 | ||
441 | numa_set_distance(i, j, dist); | |
442 | } | |
443 | } | |
ce003330 YL |
444 | |
445 | /* free the copied physical distance table */ | |
446 | if (phys_dist) | |
24aa0788 | 447 | memblock_free(__pa(phys_dist), phys_size); |
b8ef9172 TH |
448 | return; |
449 | ||
450 | no_emu: | |
451 | /* No emulation. Build identity emu_nid_to_phys[] for numa_add_cpu() */ | |
452 | for (i = 0; i < ARRAY_SIZE(emu_nid_to_phys); i++) | |
453 | emu_nid_to_phys[i] = i; | |
454 | } | |
455 | ||
456 | #ifndef CONFIG_DEBUG_PER_CPU_MAPS | |
148f9bb8 | 457 | void numa_add_cpu(int cpu) |
b8ef9172 TH |
458 | { |
459 | int physnid, nid; | |
460 | ||
51b361b4 | 461 | nid = early_cpu_to_node(cpu); |
b8ef9172 TH |
462 | BUG_ON(nid == NUMA_NO_NODE || !node_online(nid)); |
463 | ||
464 | physnid = emu_nid_to_phys[nid]; | |
465 | ||
466 | /* | |
467 | * Map the cpu to each emulated node that is allocated on the physical | |
468 | * node of the cpu's apic id. | |
469 | */ | |
470 | for_each_online_node(nid) | |
471 | if (emu_nid_to_phys[nid] == physnid) | |
472 | cpumask_set_cpu(cpu, node_to_cpumask_map[nid]); | |
473 | } | |
474 | ||
148f9bb8 | 475 | void numa_remove_cpu(int cpu) |
b8ef9172 TH |
476 | { |
477 | int i; | |
478 | ||
479 | for_each_online_node(i) | |
480 | cpumask_clear_cpu(cpu, node_to_cpumask_map[i]); | |
481 | } | |
482 | #else /* !CONFIG_DEBUG_PER_CPU_MAPS */ | |
148f9bb8 | 483 | static void numa_set_cpumask(int cpu, bool enable) |
b8ef9172 | 484 | { |
7a6c6547 | 485 | int nid, physnid; |
b8ef9172 TH |
486 | |
487 | nid = early_cpu_to_node(cpu); | |
488 | if (nid == NUMA_NO_NODE) { | |
489 | /* early_cpu_to_node() already emits a warning and trace */ | |
490 | return; | |
491 | } | |
492 | ||
493 | physnid = emu_nid_to_phys[nid]; | |
494 | ||
7a6c6547 | 495 | for_each_online_node(nid) { |
b8ef9172 TH |
496 | if (emu_nid_to_phys[nid] != physnid) |
497 | continue; | |
498 | ||
7a6c6547 | 499 | debug_cpumask_set_cpu(cpu, nid, enable); |
b8ef9172 TH |
500 | } |
501 | } | |
502 | ||
148f9bb8 | 503 | void numa_add_cpu(int cpu) |
b8ef9172 | 504 | { |
7a6c6547 | 505 | numa_set_cpumask(cpu, true); |
b8ef9172 TH |
506 | } |
507 | ||
148f9bb8 | 508 | void numa_remove_cpu(int cpu) |
b8ef9172 | 509 | { |
7a6c6547 | 510 | numa_set_cpumask(cpu, false); |
b8ef9172 TH |
511 | } |
512 | #endif /* !CONFIG_DEBUG_PER_CPU_MAPS */ |