Commit | Line | Data |
---|---|---|
b8ef9172 TH |
1 | /* |
2 | * NUMA emulation | |
3 | */ | |
4 | #include <linux/kernel.h> | |
5 | #include <linux/errno.h> | |
6 | #include <linux/topology.h> | |
7 | #include <linux/memblock.h> | |
1b7e03ef | 8 | #include <linux/bootmem.h> |
b8ef9172 TH |
9 | #include <asm/dma.h> |
10 | ||
11 | #include "numa_internal.h" | |
12 | ||
/*
 * Maps each emulated node id to the physical node id it is backed by.
 * Entries are NUMA_NO_NODE while numa_emulation() is building the mapping.
 */
static int emu_nid_to_phys[MAX_NUMNODES];
/*
 * Emulation argument string handed over by numa_emu_cmdline(); NULL when no
 * emulation was requested.  numa_emulation() advances it while parsing.
 */
static char *emu_cmdline __initdata;
15 | ||
/*
 * Remember the emulation argument string @str so that numa_emulation() can
 * parse it later.  The string is not copied; only the pointer is stored.
 */
void __init numa_emu_cmdline(char *str)
{
	emu_cmdline = str;
}
20 | ||
21 | static int __init emu_find_memblk_by_nid(int nid, const struct numa_meminfo *mi) | |
22 | { | |
23 | int i; | |
24 | ||
25 | for (i = 0; i < mi->nr_blks; i++) | |
26 | if (mi->blk[i].nid == nid) | |
27 | return i; | |
28 | return -ENOENT; | |
29 | } | |
30 | ||
e37aade3 | 31 | static u64 __init mem_hole_size(u64 start, u64 end) |
474b881b TH |
32 | { |
33 | unsigned long start_pfn = PFN_UP(start); | |
34 | unsigned long end_pfn = PFN_DOWN(end); | |
35 | ||
36 | if (start_pfn < end_pfn) | |
37 | return PFN_PHYS(absent_pages_in_range(start_pfn, end_pfn)); | |
38 | return 0; | |
39 | } | |
40 | ||
b8ef9172 TH |
41 | /* |
42 | * Sets up nid to range from @start to @end. The return value is -errno if | |
43 | * something went wrong, 0 otherwise. | |
44 | */ | |
45 | static int __init emu_setup_memblk(struct numa_meminfo *ei, | |
46 | struct numa_meminfo *pi, | |
47 | int nid, int phys_blk, u64 size) | |
48 | { | |
49 | struct numa_memblk *eb = &ei->blk[ei->nr_blks]; | |
50 | struct numa_memblk *pb = &pi->blk[phys_blk]; | |
51 | ||
52 | if (ei->nr_blks >= NR_NODE_MEMBLKS) { | |
53 | pr_err("NUMA: Too many emulated memblks, failing emulation\n"); | |
54 | return -EINVAL; | |
55 | } | |
56 | ||
57 | ei->nr_blks++; | |
58 | eb->start = pb->start; | |
59 | eb->end = pb->start + size; | |
60 | eb->nid = nid; | |
61 | ||
62 | if (emu_nid_to_phys[nid] == NUMA_NO_NODE) | |
d71b5a73 | 63 | emu_nid_to_phys[nid] = nid; |
b8ef9172 TH |
64 | |
65 | pb->start += size; | |
66 | if (pb->start >= pb->end) { | |
67 | WARN_ON_ONCE(pb->start > pb->end); | |
68 | numa_remove_memblk_from(phys_blk, pi); | |
69 | } | |
70 | ||
365811d6 BH |
71 | printk(KERN_INFO "Faking node %d at [mem %#018Lx-%#018Lx] (%LuMB)\n", |
72 | nid, eb->start, eb->end - 1, (eb->end - eb->start) >> 20); | |
b8ef9172 TH |
73 | return 0; |
74 | } | |
75 | ||
76 | /* | |
77 | * Sets up nr_nodes fake nodes interleaved over physical nodes ranging from addr | |
78 | * to max_addr. The return value is the number of nodes allocated. | |
79 | */ | |
80 | static int __init split_nodes_interleave(struct numa_meminfo *ei, | |
81 | struct numa_meminfo *pi, | |
82 | u64 addr, u64 max_addr, int nr_nodes) | |
83 | { | |
84 | nodemask_t physnode_mask = NODE_MASK_NONE; | |
85 | u64 size; | |
86 | int big; | |
87 | int nid = 0; | |
88 | int i, ret; | |
89 | ||
90 | if (nr_nodes <= 0) | |
91 | return -1; | |
92 | if (nr_nodes > MAX_NUMNODES) { | |
93 | pr_info("numa=fake=%d too large, reducing to %d\n", | |
94 | nr_nodes, MAX_NUMNODES); | |
95 | nr_nodes = MAX_NUMNODES; | |
96 | } | |
97 | ||
1b7e03ef TH |
98 | /* |
99 | * Calculate target node size. x86_32 freaks on __udivdi3() so do | |
100 | * the division in ulong number of pages and convert back. | |
101 | */ | |
474b881b | 102 | size = max_addr - addr - mem_hole_size(addr, max_addr); |
1b7e03ef TH |
103 | size = PFN_PHYS((unsigned long)(size >> PAGE_SHIFT) / nr_nodes); |
104 | ||
b8ef9172 TH |
105 | /* |
106 | * Calculate the number of big nodes that can be allocated as a result | |
107 | * of consolidating the remainder. | |
108 | */ | |
109 | big = ((size & ~FAKE_NODE_MIN_HASH_MASK) * nr_nodes) / | |
110 | FAKE_NODE_MIN_SIZE; | |
111 | ||
112 | size &= FAKE_NODE_MIN_HASH_MASK; | |
113 | if (!size) { | |
114 | pr_err("Not enough memory for each node. " | |
115 | "NUMA emulation disabled.\n"); | |
116 | return -1; | |
117 | } | |
118 | ||
119 | for (i = 0; i < pi->nr_blks; i++) | |
120 | node_set(pi->blk[i].nid, physnode_mask); | |
121 | ||
122 | /* | |
123 | * Continue to fill physical nodes with fake nodes until there is no | |
124 | * memory left on any of them. | |
125 | */ | |
126 | while (nodes_weight(physnode_mask)) { | |
127 | for_each_node_mask(i, physnode_mask) { | |
128 | u64 dma32_end = PFN_PHYS(MAX_DMA32_PFN); | |
129 | u64 start, limit, end; | |
130 | int phys_blk; | |
131 | ||
132 | phys_blk = emu_find_memblk_by_nid(i, pi); | |
133 | if (phys_blk < 0) { | |
134 | node_clear(i, physnode_mask); | |
135 | continue; | |
136 | } | |
137 | start = pi->blk[phys_blk].start; | |
138 | limit = pi->blk[phys_blk].end; | |
139 | end = start + size; | |
140 | ||
141 | if (nid < big) | |
142 | end += FAKE_NODE_MIN_SIZE; | |
143 | ||
144 | /* | |
145 | * Continue to add memory to this fake node if its | |
146 | * non-reserved memory is less than the per-node size. | |
147 | */ | |
474b881b | 148 | while (end - start - mem_hole_size(start, end) < size) { |
b8ef9172 TH |
149 | end += FAKE_NODE_MIN_SIZE; |
150 | if (end > limit) { | |
151 | end = limit; | |
152 | break; | |
153 | } | |
154 | } | |
155 | ||
156 | /* | |
157 | * If there won't be at least FAKE_NODE_MIN_SIZE of | |
158 | * non-reserved memory in ZONE_DMA32 for the next node, | |
159 | * this one must extend to the boundary. | |
160 | */ | |
161 | if (end < dma32_end && dma32_end - end - | |
474b881b | 162 | mem_hole_size(end, dma32_end) < FAKE_NODE_MIN_SIZE) |
b8ef9172 TH |
163 | end = dma32_end; |
164 | ||
165 | /* | |
166 | * If there won't be enough non-reserved memory for the | |
167 | * next node, this one must extend to the end of the | |
168 | * physical node. | |
169 | */ | |
474b881b | 170 | if (limit - end - mem_hole_size(end, limit) < size) |
b8ef9172 TH |
171 | end = limit; |
172 | ||
173 | ret = emu_setup_memblk(ei, pi, nid++ % nr_nodes, | |
174 | phys_blk, | |
175 | min(end, limit) - start); | |
176 | if (ret < 0) | |
177 | return ret; | |
178 | } | |
179 | } | |
180 | return 0; | |
181 | } | |
182 | ||
183 | /* | |
184 | * Returns the end address of a node so that there is at least `size' amount of | |
185 | * non-reserved memory or `max_addr' is reached. | |
186 | */ | |
187 | static u64 __init find_end_of_node(u64 start, u64 max_addr, u64 size) | |
188 | { | |
189 | u64 end = start + size; | |
190 | ||
474b881b | 191 | while (end - start - mem_hole_size(start, end) < size) { |
b8ef9172 TH |
192 | end += FAKE_NODE_MIN_SIZE; |
193 | if (end > max_addr) { | |
194 | end = max_addr; | |
195 | break; | |
196 | } | |
197 | } | |
198 | return end; | |
199 | } | |
200 | ||
201 | /* | |
202 | * Sets up fake nodes of `size' interleaved over physical nodes ranging from | |
203 | * `addr' to `max_addr'. The return value is the number of nodes allocated. | |
204 | */ | |
205 | static int __init split_nodes_size_interleave(struct numa_meminfo *ei, | |
206 | struct numa_meminfo *pi, | |
207 | u64 addr, u64 max_addr, u64 size) | |
208 | { | |
209 | nodemask_t physnode_mask = NODE_MASK_NONE; | |
210 | u64 min_size; | |
211 | int nid = 0; | |
212 | int i, ret; | |
213 | ||
214 | if (!size) | |
215 | return -1; | |
216 | /* | |
217 | * The limit on emulated nodes is MAX_NUMNODES, so the size per node is | |
218 | * increased accordingly if the requested size is too small. This | |
219 | * creates a uniform distribution of node sizes across the entire | |
220 | * machine (but not necessarily over physical nodes). | |
221 | */ | |
474b881b | 222 | min_size = (max_addr - addr - mem_hole_size(addr, max_addr)) / MAX_NUMNODES; |
b8ef9172 TH |
223 | min_size = max(min_size, FAKE_NODE_MIN_SIZE); |
224 | if ((min_size & FAKE_NODE_MIN_HASH_MASK) < min_size) | |
225 | min_size = (min_size + FAKE_NODE_MIN_SIZE) & | |
226 | FAKE_NODE_MIN_HASH_MASK; | |
227 | if (size < min_size) { | |
228 | pr_err("Fake node size %LuMB too small, increasing to %LuMB\n", | |
229 | size >> 20, min_size >> 20); | |
230 | size = min_size; | |
231 | } | |
232 | size &= FAKE_NODE_MIN_HASH_MASK; | |
233 | ||
234 | for (i = 0; i < pi->nr_blks; i++) | |
235 | node_set(pi->blk[i].nid, physnode_mask); | |
236 | ||
237 | /* | |
238 | * Fill physical nodes with fake nodes of size until there is no memory | |
239 | * left on any of them. | |
240 | */ | |
241 | while (nodes_weight(physnode_mask)) { | |
242 | for_each_node_mask(i, physnode_mask) { | |
1b7e03ef | 243 | u64 dma32_end = PFN_PHYS(MAX_DMA32_PFN); |
b8ef9172 TH |
244 | u64 start, limit, end; |
245 | int phys_blk; | |
246 | ||
247 | phys_blk = emu_find_memblk_by_nid(i, pi); | |
248 | if (phys_blk < 0) { | |
249 | node_clear(i, physnode_mask); | |
250 | continue; | |
251 | } | |
252 | start = pi->blk[phys_blk].start; | |
253 | limit = pi->blk[phys_blk].end; | |
254 | ||
255 | end = find_end_of_node(start, limit, size); | |
256 | /* | |
257 | * If there won't be at least FAKE_NODE_MIN_SIZE of | |
258 | * non-reserved memory in ZONE_DMA32 for the next node, | |
259 | * this one must extend to the boundary. | |
260 | */ | |
261 | if (end < dma32_end && dma32_end - end - | |
474b881b | 262 | mem_hole_size(end, dma32_end) < FAKE_NODE_MIN_SIZE) |
b8ef9172 TH |
263 | end = dma32_end; |
264 | ||
265 | /* | |
266 | * If there won't be enough non-reserved memory for the | |
267 | * next node, this one must extend to the end of the | |
268 | * physical node. | |
269 | */ | |
474b881b | 270 | if (limit - end - mem_hole_size(end, limit) < size) |
b8ef9172 TH |
271 | end = limit; |
272 | ||
273 | ret = emu_setup_memblk(ei, pi, nid++ % MAX_NUMNODES, | |
274 | phys_blk, | |
275 | min(end, limit) - start); | |
276 | if (ret < 0) | |
277 | return ret; | |
278 | } | |
279 | } | |
280 | return 0; | |
281 | } | |
282 | ||
90e6b677 TH |
283 | /** |
284 | * numa_emulation - Emulate NUMA nodes | |
285 | * @numa_meminfo: NUMA configuration to massage | |
286 | * @numa_dist_cnt: The size of the physical NUMA distance table | |
287 | * | |
288 | * Emulate NUMA nodes according to the numa=fake kernel parameter. | |
289 | * @numa_meminfo contains the physical memory configuration and is modified | |
290 | * to reflect the emulated configuration on success. @numa_dist_cnt is | |
291 | * used to determine the size of the physical distance table. | |
292 | * | |
293 | * On success, the following modifications are made. | |
294 | * | |
295 | * - @numa_meminfo is updated to reflect the emulated nodes. | |
296 | * | |
297 | * - __apicid_to_node[] is updated such that APIC IDs are mapped to the | |
298 | * emulated nodes. | |
299 | * | |
300 | * - NUMA distance table is rebuilt to represent distances between emulated | |
301 | * nodes. The distances are determined considering how emulated nodes | |
302 | * are mapped to physical nodes and match the actual distances. | |
303 | * | |
304 | * - emu_nid_to_phys[] reflects how emulated nodes are mapped to physical | |
305 | * nodes. This is used by numa_add_cpu() and numa_remove_cpu(). | |
306 | * | |
307 | * If emulation is not enabled or fails, emu_nid_to_phys[] is filled with | |
308 | * identity mapping and no other modification is made. | |
b8ef9172 TH |
309 | */ |
310 | void __init numa_emulation(struct numa_meminfo *numa_meminfo, int numa_dist_cnt) | |
311 | { | |
312 | static struct numa_meminfo ei __initdata; | |
313 | static struct numa_meminfo pi __initdata; | |
1b7e03ef | 314 | const u64 max_addr = PFN_PHYS(max_pfn); |
b8ef9172 | 315 | u8 *phys_dist = NULL; |
ce003330 | 316 | size_t phys_size = numa_dist_cnt * numa_dist_cnt * sizeof(phys_dist[0]); |
56396e68 | 317 | int max_emu_nid, dfl_phys_nid; |
b8ef9172 TH |
318 | int i, j, ret; |
319 | ||
320 | if (!emu_cmdline) | |
321 | goto no_emu; | |
322 | ||
323 | memset(&ei, 0, sizeof(ei)); | |
324 | pi = *numa_meminfo; | |
325 | ||
326 | for (i = 0; i < MAX_NUMNODES; i++) | |
327 | emu_nid_to_phys[i] = NUMA_NO_NODE; | |
328 | ||
329 | /* | |
330 | * If the numa=fake command-line contains a 'M' or 'G', it represents | |
331 | * the fixed node size. Otherwise, if it is just a single number N, | |
332 | * split the system RAM into N fake nodes. | |
333 | */ | |
334 | if (strchr(emu_cmdline, 'M') || strchr(emu_cmdline, 'G')) { | |
335 | u64 size; | |
336 | ||
337 | size = memparse(emu_cmdline, &emu_cmdline); | |
338 | ret = split_nodes_size_interleave(&ei, &pi, 0, max_addr, size); | |
339 | } else { | |
340 | unsigned long n; | |
341 | ||
94c0dd32 | 342 | n = simple_strtoul(emu_cmdline, &emu_cmdline, 0); |
b8ef9172 TH |
343 | ret = split_nodes_interleave(&ei, &pi, 0, max_addr, n); |
344 | } | |
94c0dd32 PZ |
345 | if (*emu_cmdline == ':') |
346 | emu_cmdline++; | |
b8ef9172 TH |
347 | |
348 | if (ret < 0) | |
349 | goto no_emu; | |
350 | ||
351 | if (numa_cleanup_meminfo(&ei) < 0) { | |
352 | pr_warning("NUMA: Warning: constructed meminfo invalid, disabling emulation\n"); | |
353 | goto no_emu; | |
354 | } | |
355 | ||
ce003330 | 356 | /* copy the physical distance table */ |
b8ef9172 | 357 | if (numa_dist_cnt) { |
b8ef9172 TH |
358 | u64 phys; |
359 | ||
1b7e03ef | 360 | phys = memblock_find_in_range(0, PFN_PHYS(max_pfn_mapped), |
ce003330 | 361 | phys_size, PAGE_SIZE); |
1f5026a7 | 362 | if (!phys) { |
b8ef9172 TH |
363 | pr_warning("NUMA: Warning: can't allocate copy of distance table, disabling emulation\n"); |
364 | goto no_emu; | |
365 | } | |
24aa0788 | 366 | memblock_reserve(phys, phys_size); |
b8ef9172 TH |
367 | phys_dist = __va(phys); |
368 | ||
369 | for (i = 0; i < numa_dist_cnt; i++) | |
370 | for (j = 0; j < numa_dist_cnt; j++) | |
371 | phys_dist[i * numa_dist_cnt + j] = | |
372 | node_distance(i, j); | |
373 | } | |
374 | ||
56396e68 TH |
375 | /* |
376 | * Determine the max emulated nid and the default phys nid to use | |
377 | * for unmapped nodes. | |
378 | */ | |
379 | max_emu_nid = 0; | |
078a1989 TH |
380 | dfl_phys_nid = NUMA_NO_NODE; |
381 | for (i = 0; i < ARRAY_SIZE(emu_nid_to_phys); i++) { | |
382 | if (emu_nid_to_phys[i] != NUMA_NO_NODE) { | |
56396e68 TH |
383 | max_emu_nid = i; |
384 | if (dfl_phys_nid == NUMA_NO_NODE) | |
385 | dfl_phys_nid = emu_nid_to_phys[i]; | |
078a1989 TH |
386 | } |
387 | } | |
388 | if (dfl_phys_nid == NUMA_NO_NODE) { | |
389 | pr_warning("NUMA: Warning: can't determine default physical node, disabling emulation\n"); | |
390 | goto no_emu; | |
391 | } | |
392 | ||
b8ef9172 TH |
393 | /* commit */ |
394 | *numa_meminfo = ei; | |
395 | ||
396 | /* | |
397 | * Transform __apicid_to_node table to use emulated nids by | |
398 | * reverse-mapping phys_nid. The maps should always exist but fall | |
399 | * back to zero just in case. | |
400 | */ | |
401 | for (i = 0; i < ARRAY_SIZE(__apicid_to_node); i++) { | |
402 | if (__apicid_to_node[i] == NUMA_NO_NODE) | |
403 | continue; | |
404 | for (j = 0; j < ARRAY_SIZE(emu_nid_to_phys); j++) | |
405 | if (__apicid_to_node[i] == emu_nid_to_phys[j]) | |
406 | break; | |
407 | __apicid_to_node[i] = j < ARRAY_SIZE(emu_nid_to_phys) ? j : 0; | |
408 | } | |
409 | ||
410 | /* make sure all emulated nodes are mapped to a physical node */ | |
411 | for (i = 0; i < ARRAY_SIZE(emu_nid_to_phys); i++) | |
412 | if (emu_nid_to_phys[i] == NUMA_NO_NODE) | |
078a1989 | 413 | emu_nid_to_phys[i] = dfl_phys_nid; |
b8ef9172 | 414 | |
56396e68 | 415 | /* transform distance table */ |
b8ef9172 | 416 | numa_reset_distance(); |
56396e68 TH |
417 | for (i = 0; i < max_emu_nid + 1; i++) { |
418 | for (j = 0; j < max_emu_nid + 1; j++) { | |
b8ef9172 TH |
419 | int physi = emu_nid_to_phys[i]; |
420 | int physj = emu_nid_to_phys[j]; | |
421 | int dist; | |
422 | ||
94c0dd32 PZ |
423 | if (get_option(&emu_cmdline, &dist) == 2) |
424 | ; | |
425 | else if (physi >= numa_dist_cnt || physj >= numa_dist_cnt) | |
b8ef9172 TH |
426 | dist = physi == physj ? |
427 | LOCAL_DISTANCE : REMOTE_DISTANCE; | |
428 | else | |
429 | dist = phys_dist[physi * numa_dist_cnt + physj]; | |
430 | ||
431 | numa_set_distance(i, j, dist); | |
432 | } | |
433 | } | |
ce003330 YL |
434 | |
435 | /* free the copied physical distance table */ | |
436 | if (phys_dist) | |
24aa0788 | 437 | memblock_free(__pa(phys_dist), phys_size); |
b8ef9172 TH |
438 | return; |
439 | ||
440 | no_emu: | |
441 | /* No emulation. Build identity emu_nid_to_phys[] for numa_add_cpu() */ | |
442 | for (i = 0; i < ARRAY_SIZE(emu_nid_to_phys); i++) | |
443 | emu_nid_to_phys[i] = i; | |
444 | } | |
445 | ||
446 | #ifndef CONFIG_DEBUG_PER_CPU_MAPS | |
148f9bb8 | 447 | void numa_add_cpu(int cpu) |
b8ef9172 TH |
448 | { |
449 | int physnid, nid; | |
450 | ||
51b361b4 | 451 | nid = early_cpu_to_node(cpu); |
b8ef9172 TH |
452 | BUG_ON(nid == NUMA_NO_NODE || !node_online(nid)); |
453 | ||
454 | physnid = emu_nid_to_phys[nid]; | |
455 | ||
456 | /* | |
457 | * Map the cpu to each emulated node that is allocated on the physical | |
458 | * node of the cpu's apic id. | |
459 | */ | |
460 | for_each_online_node(nid) | |
461 | if (emu_nid_to_phys[nid] == physnid) | |
462 | cpumask_set_cpu(cpu, node_to_cpumask_map[nid]); | |
463 | } | |
464 | ||
148f9bb8 | 465 | void numa_remove_cpu(int cpu) |
b8ef9172 TH |
466 | { |
467 | int i; | |
468 | ||
469 | for_each_online_node(i) | |
470 | cpumask_clear_cpu(cpu, node_to_cpumask_map[i]); | |
471 | } | |
472 | #else /* !CONFIG_DEBUG_PER_CPU_MAPS */ | |
148f9bb8 | 473 | static void numa_set_cpumask(int cpu, bool enable) |
b8ef9172 | 474 | { |
7a6c6547 | 475 | int nid, physnid; |
b8ef9172 TH |
476 | |
477 | nid = early_cpu_to_node(cpu); | |
478 | if (nid == NUMA_NO_NODE) { | |
479 | /* early_cpu_to_node() already emits a warning and trace */ | |
480 | return; | |
481 | } | |
482 | ||
483 | physnid = emu_nid_to_phys[nid]; | |
484 | ||
7a6c6547 | 485 | for_each_online_node(nid) { |
b8ef9172 TH |
486 | if (emu_nid_to_phys[nid] != physnid) |
487 | continue; | |
488 | ||
7a6c6547 | 489 | debug_cpumask_set_cpu(cpu, nid, enable); |
b8ef9172 TH |
490 | } |
491 | } | |
492 | ||
/* CONFIG_DEBUG_PER_CPU_MAPS variant: add @cpu via numa_set_cpumask(). */
void numa_add_cpu(int cpu)
{
	numa_set_cpumask(cpu, true);
}
497 | ||
/* CONFIG_DEBUG_PER_CPU_MAPS variant: remove @cpu via numa_set_cpumask(). */
void numa_remove_cpu(int cpu)
{
	numa_set_cpumask(cpu, false);
}
502 | #endif /* !CONFIG_DEBUG_PER_CPU_MAPS */ |