// SPDX-License-Identifier: GPL-2.0-only
/* Common code for 32 and 64-bit NUMA */
#include <linux/acpi.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/of.h>
#include <linux/string.h>
#include <linux/init.h>
#include <linux/memblock.h>
#include <linux/mmzone.h>
#include <linux/ctype.h>
#include <linux/nodemask.h>
#include <linux/sched.h>
#include <linux/topology.h>
#include <linux/sort.h>

#include <asm/e820/api.h>
#include <asm/proto.h>
#include <asm/dma.h>
#include <asm/amd_nb.h>

#include "numa_internal.h"

int numa_off;
nodemask_t numa_nodes_parsed __initdata;

struct pglist_data *node_data[MAX_NUMNODES] __read_mostly;
EXPORT_SYMBOL(node_data);

static struct numa_meminfo numa_meminfo __initdata_or_meminfo;
static struct numa_meminfo numa_reserved_meminfo __initdata_or_meminfo;

static int numa_distance_cnt;
static u8 *numa_distance;

static __init int numa_setup(char *opt)
{
	if (!opt)
		return -EINVAL;
	if (!strncmp(opt, "off", 3))
		numa_off = 1;
	if (!strncmp(opt, "fake=", 5))
		return numa_emu_cmdline(opt + 5);
	if (!strncmp(opt, "noacpi", 6))
		disable_srat();
	if (!strncmp(opt, "nohmat", 6))
		disable_hmat();
	return 0;
}
early_param("numa", numa_setup);
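
/*
 * Example kernel command-line usage of the options parsed above
 * (illustrative values, not an exhaustive list):
 *
 *	numa=off	disable NUMA handling altogether
 *	numa=fake=4	split the machine into 4 emulated nodes
 *	numa=noacpi	ignore the ACPI SRAT table
 *	numa=nohmat	ignore the ACPI HMAT table
 */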

/*
 * apicid, cpu, node mappings
 */
s16 __apicid_to_node[MAX_LOCAL_APIC] = {
	[0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE
};

int numa_cpu_node(int cpu)
{
	u32 apicid = early_per_cpu(x86_cpu_to_apicid, cpu);

	if (apicid != BAD_APICID)
		return __apicid_to_node[apicid];
	return NUMA_NO_NODE;
}

cpumask_var_t node_to_cpumask_map[MAX_NUMNODES];
EXPORT_SYMBOL(node_to_cpumask_map);

/*
 * Map cpu index to node index
 */
DEFINE_EARLY_PER_CPU(int, x86_cpu_to_node_map, NUMA_NO_NODE);
EXPORT_EARLY_PER_CPU_SYMBOL(x86_cpu_to_node_map);

void numa_set_node(int cpu, int node)
{
	int *cpu_to_node_map = early_per_cpu_ptr(x86_cpu_to_node_map);

	/* early setting, no percpu area yet */
	if (cpu_to_node_map) {
		cpu_to_node_map[cpu] = node;
		return;
	}

#ifdef CONFIG_DEBUG_PER_CPU_MAPS
	if (cpu >= nr_cpu_ids || !cpu_possible(cpu)) {
		printk(KERN_ERR "numa_set_node: invalid cpu# (%d)\n", cpu);
		dump_stack();
		return;
	}
#endif
	per_cpu(x86_cpu_to_node_map, cpu) = node;

	set_cpu_numa_node(cpu, node);
}

void numa_clear_node(int cpu)
{
	numa_set_node(cpu, NUMA_NO_NODE);
}

/*
 * Allocate node_to_cpumask_map based on number of available nodes
 * Requires node_possible_map to be valid.
 *
 * Note: cpumask_of_node() is not valid until after this is done.
 * (Use CONFIG_DEBUG_PER_CPU_MAPS to check this.)
 */
void __init setup_node_to_cpumask_map(void)
{
	unsigned int node;

	/* setup nr_node_ids if not done yet */
	if (nr_node_ids == MAX_NUMNODES)
		setup_nr_node_ids();

	/* allocate the map */
	for (node = 0; node < nr_node_ids; node++)
		alloc_bootmem_cpumask_var(&node_to_cpumask_map[node]);

	/* cpumask_of_node() will now work */
	pr_debug("Node to cpumask map for %u nodes\n", nr_node_ids);
}

static int __init numa_add_memblk_to(int nid, u64 start, u64 end,
				     struct numa_meminfo *mi)
{
	/* ignore zero length blks */
	if (start == end)
		return 0;

	/* whine about and ignore invalid blks */
	if (start > end || nid < 0 || nid >= MAX_NUMNODES) {
		pr_warn("Warning: invalid memblk node %d [mem %#010Lx-%#010Lx]\n",
			nid, start, end - 1);
		return 0;
	}

	if (mi->nr_blks >= NR_NODE_MEMBLKS) {
		pr_err("too many memblk ranges\n");
		return -EINVAL;
	}

	mi->blk[mi->nr_blks].start = start;
	mi->blk[mi->nr_blks].end = end;
	mi->blk[mi->nr_blks].nid = nid;
	mi->nr_blks++;
	return 0;
}

/**
 * numa_remove_memblk_from - Remove one numa_memblk from a numa_meminfo
 * @idx: Index of memblk to remove
 * @mi: numa_meminfo to remove memblk from
 *
 * Remove @idx'th numa_memblk from @mi by shifting @mi->blk[] and
 * decrementing @mi->nr_blks.
 */
void __init numa_remove_memblk_from(int idx, struct numa_meminfo *mi)
{
	mi->nr_blks--;
	memmove(&mi->blk[idx], &mi->blk[idx + 1],
		(mi->nr_blks - idx) * sizeof(mi->blk[0]));
}
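
/*
 * E.g. removing index 1 from blocks {A, B, C} shifts C down one slot,
 * leaving {A, C} and nr_blks == 2.  The vacated tail slot is not
 * cleared here; numa_cleanup_meminfo() zeroes all unused slots.
 */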

/**
 * numa_move_tail_memblk - Move a numa_memblk from one numa_meminfo to another
 * @dst: numa_meminfo to append block to
 * @idx: Index of memblk to remove
 * @src: numa_meminfo to remove memblk from
 */
static void __init numa_move_tail_memblk(struct numa_meminfo *dst, int idx,
					 struct numa_meminfo *src)
{
	dst->blk[dst->nr_blks++] = src->blk[idx];
	numa_remove_memblk_from(idx, src);
}

/**
 * numa_add_memblk - Add one numa_memblk to numa_meminfo
 * @nid: NUMA node ID of the new memblk
 * @start: Start address of the new memblk
 * @end: End address of the new memblk
 *
 * Add a new memblk to the default numa_meminfo.
 *
 * RETURNS:
 * 0 on success, -errno on failure.
 */
int __init numa_add_memblk(int nid, u64 start, u64 end)
{
	return numa_add_memblk_to(nid, start, end, &numa_meminfo);
}
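
/*
 * A sketch of typical usage: the ACPI SRAT parser registers each memory
 * affinity entry it finds with something like
 *
 *	numa_add_memblk(node, ma->base_address,
 *			ma->base_address + ma->length);
 *
 * where 'ma' stands in for the affinity-entry pointer; see
 * acpi_numa_memory_affinity_init() for the real call site.
 */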

/* Allocate NODE_DATA for a node on the local memory */
static void __init alloc_node_data(int nid)
{
	const size_t nd_size = roundup(sizeof(pg_data_t), PAGE_SIZE);
	u64 nd_pa;
	void *nd;
	int tnid;

	/*
	 * Allocate node data.  Try node-local memory and then any node.
	 * Never allocate in DMA zone.
	 */
	nd_pa = memblock_phys_alloc_try_nid(nd_size, SMP_CACHE_BYTES, nid);
	if (!nd_pa) {
		pr_err("Cannot find %zu bytes in any node (initial node: %d)\n",
		       nd_size, nid);
		return;
	}
	nd = __va(nd_pa);

	/* report and initialize */
	printk(KERN_INFO "NODE_DATA(%d) allocated [mem %#010Lx-%#010Lx]\n", nid,
	       nd_pa, nd_pa + nd_size - 1);
	tnid = early_pfn_to_nid(nd_pa >> PAGE_SHIFT);
	if (tnid != nid)
		printk(KERN_INFO "    NODE_DATA(%d) on node %d\n", nid, tnid);

	node_data[nid] = nd;
	memset(NODE_DATA(nid), 0, sizeof(pg_data_t));

	node_set_online(nid);
}

/**
 * numa_cleanup_meminfo - Cleanup a numa_meminfo
 * @mi: numa_meminfo to clean up
 *
 * Sanitize @mi by merging and removing unnecessary memblks.  Also check for
 * conflicts and clear unused memblks.
 *
 * RETURNS:
 * 0 on success, -errno on failure.
 */
int __init numa_cleanup_meminfo(struct numa_meminfo *mi)
{
	const u64 low = 0;
	const u64 high = PFN_PHYS(max_pfn);
	int i, j, k;

	/* first, trim all entries */
	for (i = 0; i < mi->nr_blks; i++) {
		struct numa_memblk *bi = &mi->blk[i];

		/* move / save reserved memory ranges */
		if (!memblock_overlaps_region(&memblock.memory,
					      bi->start, bi->end - bi->start)) {
			numa_move_tail_memblk(&numa_reserved_meminfo, i--, mi);
			continue;
		}

		/* make sure all non-reserved blocks are inside the limits */
		bi->start = max(bi->start, low);

		/* preserve info for non-RAM areas above 'max_pfn': */
		if (bi->end > high) {
			numa_add_memblk_to(bi->nid, high, bi->end,
					   &numa_reserved_meminfo);
			bi->end = high;
		}

		/* and there's no empty block */
		if (bi->start >= bi->end)
			numa_remove_memblk_from(i--, mi);
	}

	/* merge neighboring / overlapping entries */
	for (i = 0; i < mi->nr_blks; i++) {
		struct numa_memblk *bi = &mi->blk[i];

		for (j = i + 1; j < mi->nr_blks; j++) {
			struct numa_memblk *bj = &mi->blk[j];
			u64 start, end;

			/*
			 * See whether there are overlapping blocks.  Whine
			 * about but allow overlaps of the same nid.  They
			 * will be merged below.
			 */
			if (bi->end > bj->start && bi->start < bj->end) {
				if (bi->nid != bj->nid) {
					pr_err("node %d [mem %#010Lx-%#010Lx] overlaps with node %d [mem %#010Lx-%#010Lx]\n",
					       bi->nid, bi->start, bi->end - 1,
					       bj->nid, bj->start, bj->end - 1);
					return -EINVAL;
				}
				pr_warn("Warning: node %d [mem %#010Lx-%#010Lx] overlaps with itself [mem %#010Lx-%#010Lx]\n",
					bi->nid, bi->start, bi->end - 1,
					bj->start, bj->end - 1);
			}

			/*
			 * Join together blocks on the same node, holes
			 * between which don't overlap with memory on other
			 * nodes.
			 */
			if (bi->nid != bj->nid)
				continue;
			start = min(bi->start, bj->start);
			end = max(bi->end, bj->end);
			for (k = 0; k < mi->nr_blks; k++) {
				struct numa_memblk *bk = &mi->blk[k];

				if (bi->nid == bk->nid)
					continue;
				if (start < bk->end && end > bk->start)
					break;
			}
			if (k < mi->nr_blks)
				continue;
			printk(KERN_INFO "NUMA: Node %d [mem %#010Lx-%#010Lx] + [mem %#010Lx-%#010Lx] -> [mem %#010Lx-%#010Lx]\n",
			       bi->nid, bi->start, bi->end - 1, bj->start,
			       bj->end - 1, start, end - 1);
			bi->start = start;
			bi->end = end;
			numa_remove_memblk_from(j--, mi);
		}
	}

	/* clear unused ones */
	for (i = mi->nr_blks; i < ARRAY_SIZE(mi->blk); i++) {
		mi->blk[i].start = mi->blk[i].end = 0;
		mi->blk[i].nid = NUMA_NO_NODE;
	}

	return 0;
}
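
/*
 * Worked example (hypothetical layout): two node-0 blocks
 * [0x0000-0x1000) and [0x1000-0x2000) are merged into one
 * [0x0000-0x2000) block, whereas an overlap between different nodes,
 * e.g. node 0 [0x0000-0x1800) vs. node 1 [0x1000-0x2000), makes the
 * cleanup fail with -EINVAL.
 */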

/*
 * Set the nodes that have memory in @mi in *@nodemask.
 */
static void __init numa_nodemask_from_meminfo(nodemask_t *nodemask,
					      const struct numa_meminfo *mi)
{
	int i;

	for (i = 0; i < ARRAY_SIZE(mi->blk); i++)
		if (mi->blk[i].start != mi->blk[i].end &&
		    mi->blk[i].nid != NUMA_NO_NODE)
			node_set(mi->blk[i].nid, *nodemask);
}

/**
 * numa_reset_distance - Reset NUMA distance table
 *
 * The current table is freed.  The next numa_set_distance() call will
 * create a new one.
 */
void __init numa_reset_distance(void)
{
	size_t size = numa_distance_cnt * numa_distance_cnt * sizeof(numa_distance[0]);

	/* numa_distance could be 1LU marking allocation failure, test cnt */
	if (numa_distance_cnt)
		memblock_free(numa_distance, size);
	numa_distance_cnt = 0;
	numa_distance = NULL;	/* enable table creation */
}

static int __init numa_alloc_distance(void)
{
	nodemask_t nodes_parsed;
	size_t size;
	int i, j, cnt = 0;
	u64 phys;

	/* size the new table and allocate it */
	nodes_parsed = numa_nodes_parsed;
	numa_nodemask_from_meminfo(&nodes_parsed, &numa_meminfo);

	for_each_node_mask(i, nodes_parsed)
		cnt = i;
	cnt++;
	size = cnt * cnt * sizeof(numa_distance[0]);

	phys = memblock_phys_alloc_range(size, PAGE_SIZE, 0,
					 PFN_PHYS(max_pfn_mapped));
	if (!phys) {
		pr_warn("Warning: can't allocate distance table!\n");
		/* don't retry until explicitly reset */
		numa_distance = (void *)1LU;
		return -ENOMEM;
	}

	numa_distance = __va(phys);
	numa_distance_cnt = cnt;

	/* fill with the default distances */
	for (i = 0; i < cnt; i++)
		for (j = 0; j < cnt; j++)
			numa_distance[i * cnt + j] = i == j ?
				LOCAL_DISTANCE : REMOTE_DISTANCE;
	printk(KERN_DEBUG "NUMA: Initialized distance table, cnt=%d\n", cnt);

	return 0;
}

/**
 * numa_set_distance - Set NUMA distance from one node to another
 * @from: the 'from' node to set distance
 * @to: the 'to' node to set distance
 * @distance: NUMA distance
 *
 * Set the distance from node @from to @to to @distance.  If the distance
 * table doesn't exist, one large enough to accommodate all the currently
 * known nodes will be created.
 *
 * If such a table cannot be allocated, a warning is printed and further
 * calls are ignored until the distance table is reset with
 * numa_reset_distance().
 *
 * If @from or @to is higher than the highest known node or lower than zero
 * at the time of table creation, or @distance doesn't make sense, the call
 * is ignored.
 * This is to allow simplification of specific NUMA config implementations.
 */
void __init numa_set_distance(int from, int to, int distance)
{
	if (!numa_distance && numa_alloc_distance() < 0)
		return;

	if (from >= numa_distance_cnt || to >= numa_distance_cnt ||
	    from < 0 || to < 0) {
		pr_warn_once("Warning: node ids are out of bound, from=%d to=%d distance=%d\n",
			     from, to, distance);
		return;
	}

	if ((u8)distance != distance ||
	    (from == to && distance != LOCAL_DISTANCE)) {
		pr_warn_once("Warning: invalid distance parameter, from=%d to=%d distance=%d\n",
			     from, to, distance);
		return;
	}

	numa_distance[from * numa_distance_cnt + to] = distance;
}

int __node_distance(int from, int to)
{
	if (from >= numa_distance_cnt || to >= numa_distance_cnt)
		return from == to ? LOCAL_DISTANCE : REMOTE_DISTANCE;
	return numa_distance[from * numa_distance_cnt + to];
}
EXPORT_SYMBOL(__node_distance);
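
/*
 * Illustrative example (invented SLIT values): on a two-node box,
 * firmware distances {10, 21; 21, 10} would be loaded as
 *
 *	numa_set_distance(0, 0, 10);	(10 == LOCAL_DISTANCE)
 *	numa_set_distance(0, 1, 21);
 *	numa_set_distance(1, 0, 21);
 *	numa_set_distance(1, 1, 10);
 *
 * after which __node_distance(0, 1) returns 21.
 */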

/*
 * Mark all currently memblock-reserved physical memory (which covers the
 * kernel's own memory ranges) as hot-unpluggable.
 */
static void __init numa_clear_kernel_node_hotplug(void)
{
	nodemask_t reserved_nodemask = NODE_MASK_NONE;
	struct memblock_region *mb_region;
	int i;

	/*
	 * We have to do some preprocessing of memblock regions, to
	 * make them suitable for reservation.
	 *
	 * At this time, all memory regions reserved by memblock are
	 * used by the kernel, but those regions are not split up
	 * along node boundaries yet, and don't necessarily have their
	 * node ID set yet either.
	 *
	 * So iterate over all memory known to the x86 architecture,
	 * and use those ranges to set the nid in memblock.reserved.
	 * This will split up the memblock regions along node
	 * boundaries and will set the node IDs as well.
	 */
	for (i = 0; i < numa_meminfo.nr_blks; i++) {
		struct numa_memblk *mb = numa_meminfo.blk + i;
		int ret;

		ret = memblock_set_node(mb->start, mb->end - mb->start, &memblock.reserved, mb->nid);
		WARN_ON_ONCE(ret);
	}

	/*
	 * Now go over all reserved memblock regions, to construct a
	 * node mask of all kernel reserved memory areas.
	 *
	 * [ Note, when booting with mem=nn[kMG] or in a kdump kernel,
	 *   numa_meminfo might not include all memblock.reserved
	 *   memory ranges, because quirks such as trim_snb_memory()
	 *   reserve specific pages for Sandy Bridge graphics. ]
	 */
	for_each_reserved_mem_region(mb_region) {
		int nid = memblock_get_region_node(mb_region);

		if (nid != MAX_NUMNODES)
			node_set(nid, reserved_nodemask);
	}

	/*
	 * Finally, clear the MEMBLOCK_HOTPLUG flag for all memory
	 * belonging to the reserved node mask.
	 *
	 * Note that this will include memory regions that reside
	 * on nodes that contain kernel memory - entire nodes
	 * become hot-unpluggable:
	 */
	for (i = 0; i < numa_meminfo.nr_blks; i++) {
		struct numa_memblk *mb = numa_meminfo.blk + i;

		if (!node_isset(mb->nid, reserved_nodemask))
			continue;

		memblock_clear_hotplug(mb->start, mb->end - mb->start);
	}
}

static int __init numa_register_memblks(struct numa_meminfo *mi)
{
	int i, nid;

	/* Account for nodes with cpus and no memory */
	node_possible_map = numa_nodes_parsed;
	numa_nodemask_from_meminfo(&node_possible_map, mi);
	if (WARN_ON(nodes_empty(node_possible_map)))
		return -EINVAL;

	for (i = 0; i < mi->nr_blks; i++) {
		struct numa_memblk *mb = &mi->blk[i];

		memblock_set_node(mb->start, mb->end - mb->start,
				  &memblock.memory, mb->nid);
	}

	/*
	 * Very early on, the kernel has to use some memory, e.g. for
	 * loading the kernel image.  We cannot prevent this anyway, so
	 * any node the kernel resides on must stay un-hotpluggable.
	 *
	 * By the time we get here, allocating the node data won't fail.
	 */
	numa_clear_kernel_node_hotplug();

	/*
	 * If the sections array is going to be used for pfn -> nid
	 * mapping, check whether its granularity is fine enough.
	 */
	if (IS_ENABLED(NODE_NOT_IN_PAGE_FLAGS)) {
		unsigned long pfn_align = node_map_pfn_alignment();

		if (pfn_align && pfn_align < PAGES_PER_SECTION) {
			pr_warn("Node alignment %LuMB < min %LuMB, rejecting NUMA config\n",
				PFN_PHYS(pfn_align) >> 20,
				PFN_PHYS(PAGES_PER_SECTION) >> 20);
			return -EINVAL;
		}
	}

	if (!memblock_validate_numa_coverage(SZ_1M))
		return -EINVAL;

	/* Finally register nodes. */
	for_each_node_mask(nid, node_possible_map) {
		u64 start = PFN_PHYS(max_pfn);
		u64 end = 0;

		for (i = 0; i < mi->nr_blks; i++) {
			if (nid != mi->blk[i].nid)
				continue;
			start = min(mi->blk[i].start, start);
			end = max(mi->blk[i].end, end);
		}

		if (start >= end)
			continue;

		alloc_node_data(nid);
	}

	/* Dump memblock with node info and return. */
	memblock_dump_all();
	return 0;
}

/*
 * There are unfortunately some poorly designed mainboards around that
 * only connect memory to a single CPU.  This breaks the 1:1 cpu->node
 * mapping.  To avoid this fill in the mapping for all possible CPUs,
 * as the number of CPUs is not known yet.  We round robin the existing
 * nodes.
 */
static void __init numa_init_array(void)
{
	int rr, i;

	rr = first_node(node_online_map);
	for (i = 0; i < nr_cpu_ids; i++) {
		if (early_cpu_to_node(i) != NUMA_NO_NODE)
			continue;
		numa_set_node(i, rr);
		rr = next_node_in(rr, node_online_map);
	}
}
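
/*
 * E.g. with nodes 0 and 1 online and CPUs 2 and 3 lacking an
 * apicid->node entry, the loop above puts CPU 2 on node 0 and CPU 3 on
 * node 1 (hypothetical topology, for illustration only).
 */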

static int __init numa_init(int (*init_func)(void))
{
	int i;
	int ret;

	for (i = 0; i < MAX_LOCAL_APIC; i++)
		set_apicid_to_node(i, NUMA_NO_NODE);

	nodes_clear(numa_nodes_parsed);
	nodes_clear(node_possible_map);
	nodes_clear(node_online_map);
	memset(&numa_meminfo, 0, sizeof(numa_meminfo));
	WARN_ON(memblock_set_node(0, ULLONG_MAX, &memblock.memory,
				  MAX_NUMNODES));
	WARN_ON(memblock_set_node(0, ULLONG_MAX, &memblock.reserved,
				  MAX_NUMNODES));
	/* In case that parsing SRAT failed. */
	WARN_ON(memblock_clear_hotplug(0, ULLONG_MAX));
	numa_reset_distance();

	ret = init_func();
	if (ret < 0)
		return ret;

	/*
	 * We reset memblock back to the top-down direction here because
	 * if we configured ACPI_NUMA, we have parsed SRAT in
	 * init_func().  It is OK to have the reset here even if we
	 * didn't configure ACPI_NUMA, or if the ACPI NUMA init fails
	 * and falls back to the dummy NUMA init.
	 */
	memblock_set_bottom_up(false);

	ret = numa_cleanup_meminfo(&numa_meminfo);
	if (ret < 0)
		return ret;

	numa_emulation(&numa_meminfo, numa_distance_cnt);

	ret = numa_register_memblks(&numa_meminfo);
	if (ret < 0)
		return ret;

	for (i = 0; i < nr_cpu_ids; i++) {
		int nid = early_cpu_to_node(i);

		if (nid == NUMA_NO_NODE)
			continue;
		if (!node_online(nid))
			numa_clear_node(i);
	}
	numa_init_array();

	return 0;
}

/**
 * dummy_numa_init - Fallback dummy NUMA init
 *
 * Used if there's no underlying NUMA architecture, NUMA initialization
 * fails, or NUMA is disabled on the command line.
 *
 * Must online at least one node and add memory blocks that cover all
 * allowed memory.  This function must not fail.
 */
static int __init dummy_numa_init(void)
{
	printk(KERN_INFO "%s\n",
	       numa_off ? "NUMA turned off" : "No NUMA configuration found");
	printk(KERN_INFO "Faking a node at [mem %#018Lx-%#018Lx]\n",
	       0LLU, PFN_PHYS(max_pfn) - 1);

	node_set(0, numa_nodes_parsed);
	numa_add_memblk(0, 0, PFN_PHYS(max_pfn));

	return 0;
}

/**
 * x86_numa_init - Initialize NUMA
 *
 * Try each configured NUMA initialization method until one succeeds.  The
 * last fallback is a dummy single-node config encompassing the whole
 * memory, which never fails.
 */
void __init x86_numa_init(void)
{
	if (!numa_off) {
#ifdef CONFIG_ACPI_NUMA
		if (!numa_init(x86_acpi_numa_init))
			return;
#endif
#ifdef CONFIG_AMD_NUMA
		if (!numa_init(amd_numa_init))
			return;
#endif
		if (acpi_disabled && !numa_init(of_numa_init))
			return;
	}

	numa_init(dummy_numa_init);
}
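
/*
 * The probe order above: ACPI SRAT (x86_acpi_numa_init), then the AMD
 * northbridge topology (amd_numa_init), then devicetree (of_numa_init,
 * only when ACPI is disabled), and finally the dummy single-node setup.
 */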

/*
 * A node may exist which has one or more Generic Initiators but no CPUs and no
 * memory.
 *
 * This function must be called after init_cpu_to_node(), to ensure that any
 * memoryless CPU nodes have already been brought online, and before the
 * node_data[nid] is needed for zone list setup in build_all_zonelists().
 *
 * When this function is called, any nodes containing either memory and/or CPUs
 * will already be online and there is no need to do anything extra, even if
 * they also contain one or more Generic Initiators.
 */
void __init init_gi_nodes(void)
{
	int nid;

	/*
	 * Exclude this node from
	 *   bringup_nonboot_cpus
	 *     cpu_up
	 *       __try_online_node
	 *         register_one_node
	 * because node_subsys is not initialized yet.
	 * TODO remove dependency on node_online
	 */
	for_each_node_state(nid, N_GENERIC_INITIATOR)
		if (!node_online(nid))
			node_set_online(nid);
}

/*
 * Setup early cpu_to_node.
 *
 * Populate cpu_to_node[] only if the x86_cpu_to_apicid[] and
 * apicid_to_node[] tables have valid entries for a CPU.
 * This means we skip cpu_to_node[] initialisation for NUMA
 * emulation and the faked-node case (when running a kernel compiled
 * for NUMA on a non-NUMA box), which is OK as cpu_to_node[]
 * is already initialized in a round robin manner in numa_init_array(),
 * prior to this call, and this initialization is good enough
 * for the fake NUMA cases.
 *
 * Called before the per_cpu areas are setup.
 */
void __init init_cpu_to_node(void)
{
	int cpu;
	u32 *cpu_to_apicid = early_per_cpu_ptr(x86_cpu_to_apicid);

	BUG_ON(cpu_to_apicid == NULL);

	for_each_possible_cpu(cpu) {
		int node = numa_cpu_node(cpu);

		if (node == NUMA_NO_NODE)
			continue;

		/*
		 * Exclude this node from
		 *   bringup_nonboot_cpus
		 *     cpu_up
		 *       __try_online_node
		 *         register_one_node
		 * because node_subsys is not initialized yet.
		 * TODO remove dependency on node_online
		 */
		if (!node_online(node))
			node_set_online(node);

		numa_set_node(cpu, node);
	}
}

#ifndef CONFIG_DEBUG_PER_CPU_MAPS

# ifndef CONFIG_NUMA_EMU
void numa_add_cpu(int cpu)
{
	cpumask_set_cpu(cpu, node_to_cpumask_map[early_cpu_to_node(cpu)]);
}

void numa_remove_cpu(int cpu)
{
	cpumask_clear_cpu(cpu, node_to_cpumask_map[early_cpu_to_node(cpu)]);
}
# endif	/* !CONFIG_NUMA_EMU */

#else	/* !CONFIG_DEBUG_PER_CPU_MAPS */

int __cpu_to_node(int cpu)
{
	if (early_per_cpu_ptr(x86_cpu_to_node_map)) {
		printk(KERN_WARNING
		       "cpu_to_node(%d): usage too early!\n", cpu);
		dump_stack();
		return early_per_cpu_ptr(x86_cpu_to_node_map)[cpu];
	}
	return per_cpu(x86_cpu_to_node_map, cpu);
}
EXPORT_SYMBOL(__cpu_to_node);

/*
 * Same function as cpu_to_node() but used if called before the
 * per_cpu areas are setup.
 */
int early_cpu_to_node(int cpu)
{
	if (early_per_cpu_ptr(x86_cpu_to_node_map))
		return early_per_cpu_ptr(x86_cpu_to_node_map)[cpu];

	if (!cpu_possible(cpu)) {
		printk(KERN_WARNING
		       "early_cpu_to_node(%d): no per_cpu area!\n", cpu);
		dump_stack();
		return NUMA_NO_NODE;
	}
	return per_cpu(x86_cpu_to_node_map, cpu);
}

void debug_cpumask_set_cpu(int cpu, int node, bool enable)
{
	struct cpumask *mask;

	if (node == NUMA_NO_NODE) {
		/* early_cpu_to_node() already emits a warning and trace */
		return;
	}
	mask = node_to_cpumask_map[node];
	if (!cpumask_available(mask)) {
		pr_err("node_to_cpumask_map[%i] NULL\n", node);
		dump_stack();
		return;
	}

	if (enable)
		cpumask_set_cpu(cpu, mask);
	else
		cpumask_clear_cpu(cpu, mask);

	printk(KERN_DEBUG "%s cpu %d node %d: mask now %*pbl\n",
	       enable ? "numa_add_cpu" : "numa_remove_cpu",
	       cpu, node, cpumask_pr_args(mask));
}

# ifndef CONFIG_NUMA_EMU
static void numa_set_cpumask(int cpu, bool enable)
{
	debug_cpumask_set_cpu(cpu, early_cpu_to_node(cpu), enable);
}

void numa_add_cpu(int cpu)
{
	numa_set_cpumask(cpu, true);
}

void numa_remove_cpu(int cpu)
{
	numa_set_cpumask(cpu, false);
}
# endif	/* !CONFIG_NUMA_EMU */

/*
 * Returns a pointer to the bitmask of CPUs on Node 'node'.
 */
const struct cpumask *cpumask_of_node(int node)
{
	if ((unsigned)node >= nr_node_ids) {
		printk(KERN_WARNING
		       "cpumask_of_node(%d): (unsigned)node >= nr_node_ids(%u)\n",
		       node, nr_node_ids);
		dump_stack();
		return cpu_none_mask;
	}
	if (!cpumask_available(node_to_cpumask_map[node])) {
		printk(KERN_WARNING
		       "cpumask_of_node(%d): no node_to_cpumask_map!\n",
		       node);
		dump_stack();
		return cpu_online_mask;
	}
	return node_to_cpumask_map[node];
}
EXPORT_SYMBOL(cpumask_of_node);

#endif	/* !CONFIG_DEBUG_PER_CPU_MAPS */

#ifdef CONFIG_NUMA_KEEP_MEMINFO
static int meminfo_to_nid(struct numa_meminfo *mi, u64 start)
{
	int i;

	for (i = 0; i < mi->nr_blks; i++)
		if (mi->blk[i].start <= start && mi->blk[i].end > start)
			return mi->blk[i].nid;
	return NUMA_NO_NODE;
}

int phys_to_target_node(phys_addr_t start)
{
	int nid = meminfo_to_nid(&numa_meminfo, start);

	/*
	 * Prefer online nodes, but if reserved memory might be
	 * hot-added continue the search with reserved ranges.
	 */
	if (nid != NUMA_NO_NODE)
		return nid;

	return meminfo_to_nid(&numa_reserved_meminfo, start);
}
EXPORT_SYMBOL_GPL(phys_to_target_node);
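
/*
 * Illustrative use (hypothetical driver code): a driver that hot-adds
 * previously reserved memory can look up the node to online it on:
 *
 *	nid = phys_to_target_node(res->start);
 *	if (nid == NUMA_NO_NODE)
 *		nid = memory_add_physaddr_to_nid(res->start);
 */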

int memory_add_physaddr_to_nid(u64 start)
{
	int nid = meminfo_to_nid(&numa_meminfo, start);

	if (nid == NUMA_NO_NODE)
		nid = numa_meminfo.blk[0].nid;
	return nid;
}
EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid);
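
/*
 * Compare helper for sort(): orders numa_memblk pointers by ->start.
 * The (a > b) - (a < b) idiom yields -1/0/1 directly, avoiding the
 * truncation/overflow a plain u64 subtraction narrowed to int could
 * produce.
 */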
static int __init cmp_memblk(const void *a, const void *b)
{
	const struct numa_memblk *ma = *(const struct numa_memblk **)a;
	const struct numa_memblk *mb = *(const struct numa_memblk **)b;

	return (ma->start > mb->start) - (ma->start < mb->start);
}

static struct numa_memblk *numa_memblk_list[NR_NODE_MEMBLKS] __initdata;

/**
 * numa_fill_memblks - Fill gaps in numa_meminfo memblks
 * @start: address to begin fill
 * @end: address to end fill
 *
 * Find and extend numa_meminfo memblks to cover the physical
 * address range @start-@end
 *
 * RETURNS:
 * 0 : Success
 * NUMA_NO_MEMBLK : No memblks exist in address range @start-@end
 */
int __init numa_fill_memblks(u64 start, u64 end)
{
	struct numa_memblk **blk = &numa_memblk_list[0];
	struct numa_meminfo *mi = &numa_meminfo;
	int count = 0;
	u64 prev_end;

	/*
	 * Create a list of pointers to numa_meminfo memblks that
	 * overlap start, end.  The list is used to make in-place
	 * changes that fill out the numa_meminfo memblks.
	 */
	for (int i = 0; i < mi->nr_blks; i++) {
		struct numa_memblk *bi = &mi->blk[i];

		if (memblock_addrs_overlap(start, end - start, bi->start,
					   bi->end - bi->start)) {
			blk[count] = &mi->blk[i];
			count++;
		}
	}
	if (!count)
		return NUMA_NO_MEMBLK;

	/* Sort the list of pointers in memblk->start order */
	sort(&blk[0], count, sizeof(blk[0]), cmp_memblk, NULL);

	/* Make sure the first/last memblks include start/end */
	blk[0]->start = min(blk[0]->start, start);
	blk[count - 1]->end = max(blk[count - 1]->end, end);

	/*
	 * Fill any gaps by tracking the previous memblk's end address
	 * and backfilling to it if needed.
	 */
	prev_end = blk[0]->end;
	for (int i = 1; i < count; i++) {
		struct numa_memblk *curr = blk[i];

		if (prev_end >= curr->start) {
			if (prev_end < curr->end)
				prev_end = curr->end;
		} else {
			curr->start = prev_end;
			prev_end = curr->end;
		}
	}
	return 0;
}
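
/*
 * Worked example (hypothetical addresses): with memblks
 * [0x0000-0x1000) and [0x3000-0x4000) on file, a call to
 * numa_fill_memblks(0x0000, 0x4000) pulls the second block's start
 * down to the first block's end, leaving [0x0000-0x1000) and
 * [0x1000-0x4000) with no gap in between.
 */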

#endif	/* CONFIG_NUMA_KEEP_MEMINFO */