x86_64: fix mm.txt documentation
[linux-2.6-block.git] / arch / x86 / mm / srat_64.c
CommitLineData
1da177e4
LT
1/*
2 * ACPI 3.0 based NUMA setup
3 * Copyright 2004 Andi Kleen, SuSE Labs.
4 *
5 * Reads the ACPI SRAT table to figure out what memory belongs to which CPUs.
6 *
7 * Called from acpi_numa_init while reading the SRAT and SLIT tables.
8 * Assumes all memory regions belonging to a single proximity domain
9 * are in one chunk. Holes between them will be included in the node.
10 */
11
12#include <linux/kernel.h>
13#include <linux/acpi.h>
14#include <linux/mmzone.h>
15#include <linux/bitmap.h>
16#include <linux/module.h>
17#include <linux/topology.h>
68a3a7fe
AK
18#include <linux/bootmem.h>
19#include <linux/mm.h>
1da177e4
LT
20#include <asm/proto.h>
21#include <asm/numa.h>
8a6fdd3e 22#include <asm/e820.h>
a65d1d64 23#include <asm/genapic.h>
1da177e4 24
c31fbb1a
AK
/*
 * SRAT usability state: bad_srat() sets it to -1, accepting a CPU affinity
 * entry sets it to 1, and acpi_scan_nodes() refuses to run unless it is > 0.
 */
25int acpi_numa __initdata;
26
1da177e4
LT
/* SLIT accepted by acpi_numa_slit_init(); NULL makes __node_distance() fall back. */
27static struct acpi_table_slit *acpi_slit;
28
/* Nodes for which acpi_numa_memory_affinity_init() recorded a memory range. */
29static nodemask_t nodes_parsed __initdata;
/* Per-node address span, widened as SRAT memory entries for the node arrive. */
abe059e7 30static struct bootnode nodes[MAX_NUMNODES] __initdata;
/*
 * Per-node hot-add area maintained by reserve_hotadd().  Unlike nodes[] this
 * is not __initdata; memory_add_physaddr_to_nid() (a non-__init function)
 * reads it.
 */
4942e998 31static struct bootnode nodes_add[MAX_NUMNODES];
/*
 * Tested by cutoff_node() and srat_reserve_add_area(); the only assignment
 * visible in this view of the file is the reset to 0 in bad_srat().
 */
68a3a7fe 32static int found_add_area __initdata;
/* Not referenced by any other code visible in this view of the file. */
fad7906d 33int hotadd_percent __initdata = 0;
1da177e4 34
6ec6e0d9
SS
/*
 * Every accepted SRAT memory entry is appended here: node_memblk_range[i] is
 * the address range and memblk_nodeid[i] the node it belongs to, for
 * i < num_node_memblks.
 */
35static int num_node_memblks __initdata;
36static struct bootnode node_memblk_range[NR_NODE_MEMBLKS] __initdata;
37static int memblk_nodeid[NR_NODE_MEMBLKS] __initdata;
38
9391a3f9
AK
39/* Too small nodes confuse the VM badly. Usually they result
40 from BIOS bugs. */
/* Minimum span in bytes (4MB) for a node or hot-add area to be kept. */
41#define NODE_MIN_SIZE (4*1024*1024)
42
1da177e4
LT
43static __init int setup_node(int pxm)
44{
762834e8 45 return acpi_map_pxm_to_node(pxm);
1da177e4
LT
46}
47
6ec6e0d9 48static __init int conflicting_memblks(unsigned long start, unsigned long end)
1da177e4
LT
49{
50 int i;
6ec6e0d9
SS
51 for (i = 0; i < num_node_memblks; i++) {
52 struct bootnode *nd = &node_memblk_range[i];
1da177e4
LT
53 if (nd->start == nd->end)
54 continue;
55 if (nd->end > start && nd->start < end)
6ec6e0d9 56 return memblk_nodeid[i];
1da177e4 57 if (nd->end == end && nd->start == start)
6ec6e0d9 58 return memblk_nodeid[i];
1da177e4
LT
59 }
60 return -1;
61}
62
63static __init void cutoff_node(int i, unsigned long start, unsigned long end)
64{
abe059e7 65 struct bootnode *nd = &nodes[i];
68a3a7fe
AK
66
67 if (found_add_area)
68 return;
69
1da177e4
LT
70 if (nd->start < start) {
71 nd->start = start;
72 if (nd->end < nd->start)
73 nd->start = nd->end;
74 }
75 if (nd->end > end) {
1da177e4
LT
76 nd->end = end;
77 if (nd->start > nd->end)
78 nd->start = nd->end;
79 }
80}
81
82static __init void bad_srat(void)
83{
2bce2b54 84 int i;
1da177e4
LT
85 printk(KERN_ERR "SRAT: SRAT not used.\n");
86 acpi_numa = -1;
fad7906d 87 found_add_area = 0;
2bce2b54
AK
88 for (i = 0; i < MAX_LOCAL_APIC; i++)
89 apicid_to_node[i] = NUMA_NO_NODE;
68a3a7fe
AK
90 for (i = 0; i < MAX_NUMNODES; i++)
91 nodes_add[i].start = nodes[i].end = 0;
5cb248ab 92 remove_all_active_ranges();
1da177e4
LT
93}
94
95static __init inline int srat_disabled(void)
96{
97 return numa_off || acpi_numa < 0;
98}
99
1584b89c
AK
100/*
101 * A lot of BIOS fill in 10 (= no distance) everywhere. This messes
102 * up the NUMA heuristics which wants the local node to have a smaller
103 * distance than the others.
104 * Do some quick checks here and only use the SLIT if it passes.
105 */
106static __init int slit_valid(struct acpi_table_slit *slit)
107{
108 int i, j;
15a58ed1 109 int d = slit->locality_count;
1584b89c
AK
110 for (i = 0; i < d; i++) {
111 for (j = 0; j < d; j++) {
112 u8 val = slit->entry[d*i + j];
113 if (i == j) {
a2e212da 114 if (val != LOCAL_DISTANCE)
1584b89c 115 return 0;
a2e212da 116 } else if (val <= LOCAL_DISTANCE)
1584b89c
AK
117 return 0;
118 }
119 }
120 return 1;
121}
122
1da177e4
LT
123/* Callback for SLIT parsing */
124void __init acpi_numa_slit_init(struct acpi_table_slit *slit)
125{
1584b89c
AK
126 if (!slit_valid(slit)) {
127 printk(KERN_INFO "ACPI: SLIT table looks invalid. Not used.\n");
128 return;
129 }
1da177e4
LT
130 acpi_slit = slit;
131}
132
133/* Callback for Proximity Domain -> LAPIC mapping */
134void __init
15a58ed1 135acpi_numa_processor_affinity_init(struct acpi_srat_cpu_affinity *pa)
1da177e4
LT
136{
137 int pxm, node;
ef97001f 138 int apic_id;
139
d22fe808
AK
140 if (srat_disabled())
141 return;
15a58ed1 142 if (pa->header.length != sizeof(struct acpi_srat_cpu_affinity)) {
fad7906d 143 bad_srat();
d22fe808
AK
144 return;
145 }
15a58ed1 146 if ((pa->flags & ACPI_SRAT_CPU_ENABLED) == 0)
1da177e4 147 return;
15a58ed1 148 pxm = pa->proximity_domain_lo;
1da177e4
LT
149 node = setup_node(pxm);
150 if (node < 0) {
151 printk(KERN_ERR "SRAT: Too many proximity domains %x\n", pxm);
152 bad_srat();
153 return;
154 }
beafe91f 155
a65d1d64
JS
156 if (is_uv_system())
157 apic_id = (pa->apic_id << 8) | pa->local_sapic_eid;
158 else
159 apic_id = pa->apic_id;
ef97001f 160 apicid_to_node[apic_id] = node;
1da177e4 161 acpi_numa = 1;
0b07e984 162 printk(KERN_INFO "SRAT: PXM %u -> APIC %u -> Node %u\n",
ef97001f 163 pxm, apic_id, node);
1da177e4
LT
164}
165
/* Stub: always returns -1.  reserve_hotadd() passes this value on to its caller. */
static int update_end_of_memory(unsigned long end)
{
	return -1;
}

/* Stub: always returns 1, so reserve_hotadd() never rejects an area as too large. */
static int hotadd_enough_memory(struct bootnode *nd)
{
	return 1;
}

/*
 * Returns 1 when CONFIG_MEMORY_HOTPLUG_SPARSE is set and 0 otherwise.
 * acpi_numa_memory_affinity_init() skips hot-pluggable SRAT ranges when
 * this returns 0.
 */
static inline int save_add_info(void)
{
#ifdef CONFIG_MEMORY_HOTPLUG_SPARSE
	return 1;
#else
	return 0;
#endif
}
68a3a7fe 173/*
71efa8fd 174 * Update nodes_add and decide if to include add are in the zone.
ab4a574e 175 * Both SPARSE and RESERVE need nodes_add information.
676b1855 176 * This code supports one contiguous hot add area per node.
68a3a7fe 177 */
d01b9ad5
SR
178static int __init
179reserve_hotadd(int node, unsigned long start, unsigned long end)
68a3a7fe
AK
180{
181 unsigned long s_pfn = start >> PAGE_SHIFT;
182 unsigned long e_pfn = end >> PAGE_SHIFT;
71efa8fd 183 int ret = 0, changed = 0;
68a3a7fe
AK
184 struct bootnode *nd = &nodes_add[node];
185
186 /* I had some trouble with strange memory hotadd regions breaking
187 the boot. Be very strict here and reject anything unexpected.
188 If you want working memory hotadd write correct SRATs.
189
190 The node size check is a basic sanity check to guard against
191 mistakes */
192 if ((signed long)(end - start) < NODE_MIN_SIZE) {
193 printk(KERN_ERR "SRAT: Hotplug area too small\n");
194 return -1;
195 }
196
197 /* This check might be a bit too strict, but I'm keeping it for now. */
5cb248ab 198 if (absent_pages_in_range(s_pfn, e_pfn) != e_pfn - s_pfn) {
9c7cd687
MG
199 printk(KERN_ERR
200 "SRAT: Hotplug area %lu -> %lu has existing memory\n",
201 s_pfn, e_pfn);
68a3a7fe
AK
202 return -1;
203 }
204
205 if (!hotadd_enough_memory(&nodes_add[node])) {
206 printk(KERN_ERR "SRAT: Hotplug area too large\n");
207 return -1;
208 }
209
210 /* Looks good */
211
68a3a7fe 212 if (nd->start == nd->end) {
15a58ed1
AS
213 nd->start = start;
214 nd->end = end;
68a3a7fe 215 changed = 1;
15a58ed1
AS
216 } else {
217 if (nd->start == end) {
218 nd->start = start;
68a3a7fe
AK
219 changed = 1;
220 }
15a58ed1
AS
221 if (nd->end == start) {
222 nd->end = end;
68a3a7fe
AK
223 changed = 1;
224 }
225 if (!changed)
226 printk(KERN_ERR "SRAT: Hotplug zone not continuous. Partly ignored\n");
15a58ed1 227 }
68a3a7fe 228
71efa8fd 229 ret = update_end_of_memory(nd->end);
68a3a7fe
AK
230
231 if (changed)
232 printk(KERN_INFO "SRAT: hot plug zone found %Lx - %Lx\n", nd->start, nd->end);
71efa8fd 233 return ret;
68a3a7fe 234}
68a3a7fe 235
1da177e4
LT
236/* Callback for parsing of the Proximity Domain <-> Memory Area mappings */
237void __init
15a58ed1 238acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma)
1da177e4 239{
68a3a7fe 240 struct bootnode *nd, oldnode;
1da177e4
LT
241 unsigned long start, end;
242 int node, pxm;
243 int i;
244
d22fe808 245 if (srat_disabled())
1da177e4 246 return;
15a58ed1 247 if (ma->header.length != sizeof(struct acpi_srat_mem_affinity)) {
d22fe808
AK
248 bad_srat();
249 return;
250 }
15a58ed1 251 if ((ma->flags & ACPI_SRAT_MEM_ENABLED) == 0)
d22fe808 252 return;
15a58ed1
AS
253
254 if ((ma->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE) && !save_add_info())
68a3a7fe 255 return;
15a58ed1
AS
256 start = ma->base_address;
257 end = start + ma->length;
1da177e4
LT
258 pxm = ma->proximity_domain;
259 node = setup_node(pxm);
260 if (node < 0) {
261 printk(KERN_ERR "SRAT: Too many proximity domains.\n");
262 bad_srat();
263 return;
264 }
6ec6e0d9 265 i = conflicting_memblks(start, end);
05d1fa4b
AK
266 if (i == node) {
267 printk(KERN_WARNING
268 "SRAT: Warning: PXM %d (%lx-%lx) overlaps with itself (%Lx-%Lx)\n",
269 pxm, start, end, nodes[i].start, nodes[i].end);
270 } else if (i >= 0) {
1da177e4 271 printk(KERN_ERR
05d1fa4b
AK
272 "SRAT: PXM %d (%lx-%lx) overlaps with PXM %d (%Lx-%Lx)\n",
273 pxm, start, end, node_to_pxm(i),
274 nodes[i].start, nodes[i].end);
1da177e4
LT
275 bad_srat();
276 return;
277 }
278 nd = &nodes[node];
68a3a7fe 279 oldnode = *nd;
1da177e4
LT
280 if (!node_test_and_set(node, nodes_parsed)) {
281 nd->start = start;
282 nd->end = end;
283 } else {
284 if (start < nd->start)
285 nd->start = start;
286 if (nd->end < end)
287 nd->end = end;
288 }
68a3a7fe 289
6ec6e0d9
SS
290 printk(KERN_INFO "SRAT: Node %u PXM %u %lx-%lx\n", node, pxm,
291 start, end);
292 e820_register_active_regions(node, start >> PAGE_SHIFT,
293 end >> PAGE_SHIFT);
fb01439c
MG
294 push_node_boundaries(node, nd->start >> PAGE_SHIFT,
295 nd->end >> PAGE_SHIFT);
68a3a7fe 296
15a58ed1
AS
297 if ((ma->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE) &&
298 (reserve_hotadd(node, start, end) < 0)) {
68a3a7fe
AK
299 /* Ignore hotadd region. Undo damage */
300 printk(KERN_NOTICE "SRAT: Hotplug region ignored\n");
301 *nd = oldnode;
302 if ((nd->start | nd->end) == 0)
303 node_clear(node, nodes_parsed);
304 }
6ec6e0d9
SS
305
306 node_memblk_range[num_node_memblks].start = start;
307 node_memblk_range[num_node_memblks].end = end;
308 memblk_nodeid[num_node_memblks] = node;
309 num_node_memblks++;
1da177e4
LT
310}
311
8a6fdd3e
AK
312/* Sanity check to catch more bad SRATs (they are amazingly common).
313 Make sure the PXMs cover all memory. */
3484d798 314static int __init nodes_cover_memory(const struct bootnode *nodes)
8a6fdd3e
AK
315{
316 int i;
317 unsigned long pxmram, e820ram;
318
319 pxmram = 0;
320 for_each_node_mask(i, nodes_parsed) {
321 unsigned long s = nodes[i].start >> PAGE_SHIFT;
322 unsigned long e = nodes[i].end >> PAGE_SHIFT;
323 pxmram += e - s;
5cb248ab 324 pxmram -= absent_pages_in_range(s, e);
68a3a7fe
AK
325 if ((long)pxmram < 0)
326 pxmram = 0;
8a6fdd3e
AK
327 }
328
5cb248ab 329 e820ram = end_pfn - absent_pages_in_range(0, end_pfn);
fdb9df94
AK
330 /* We seem to lose 3 pages somewhere. Allow a bit of slack. */
331 if ((long)(e820ram - pxmram) >= 1*1024*1024) {
8a6fdd3e
AK
332 printk(KERN_ERR
333 "SRAT: PXMs only cover %luMB of your %luMB e820 RAM. Not used.\n",
334 (pxmram << PAGE_SHIFT) >> 20,
335 (e820ram << PAGE_SHIFT) >> 20);
336 return 0;
337 }
338 return 1;
339}
340
1e296f57 341static void __init unparse_node(int node)
9391a3f9
AK
342{
343 int i;
344 node_clear(node, nodes_parsed);
345 for (i = 0; i < MAX_LOCAL_APIC; i++) {
346 if (apicid_to_node[i] == node)
347 apicid_to_node[i] = NUMA_NO_NODE;
348 }
349}
350
1da177e4
LT
351void __init acpi_numa_arch_fixup(void) {}
352
353/* Use the information discovered above to actually set up the nodes. */
354int __init acpi_scan_nodes(unsigned long start, unsigned long end)
355{
356 int i;
8a6fdd3e 357
ae2c6dcf
DR
358 if (acpi_numa <= 0)
359 return -1;
360
e58e0d03 361 /* First clean up the node list */
9391a3f9 362 for (i = 0; i < MAX_NUMNODES; i++) {
15a58ed1 363 cutoff_node(i, start, end);
693e3c56
MT
364 /*
365 * don't confuse VM with a node that doesn't have the
366 * minimum memory.
367 */
368 if (nodes[i].end &&
369 (nodes[i].end - nodes[i].start) < NODE_MIN_SIZE) {
9391a3f9 370 unparse_node(i);
0d015324
DY
371 node_set_offline(i);
372 }
e58e0d03
AK
373 }
374
3484d798 375 if (!nodes_cover_memory(nodes)) {
8a6fdd3e
AK
376 bad_srat();
377 return -1;
378 }
379
6ec6e0d9
SS
380 memnode_shift = compute_hash_shift(node_memblk_range, num_node_memblks,
381 memblk_nodeid);
1da177e4
LT
382 if (memnode_shift < 0) {
383 printk(KERN_ERR
384 "SRAT: No NUMA node hash function found. Contact maintainer\n");
385 bad_srat();
386 return -1;
387 }
e58e0d03 388
e3f1caee
SS
389 node_possible_map = nodes_parsed;
390
e58e0d03 391 /* Finally register nodes */
e3f1caee 392 for_each_node_mask(i, node_possible_map)
1da177e4 393 setup_node_bootmem(i, nodes[i].start, nodes[i].end);
a8062231
AK
394 /* Try again in case setup_node_bootmem missed one due
395 to missing bootmem */
e3f1caee 396 for_each_node_mask(i, node_possible_map)
a8062231
AK
397 if (!node_online(i))
398 setup_node_bootmem(i, nodes[i].start, nodes[i].end);
399
15a58ed1 400 for (i = 0; i < NR_CPUS; i++) {
0164fe16
MT
401 int node = early_cpu_to_node(i);
402
834beda1 403 if (node == NUMA_NO_NODE)
1da177e4 404 continue;
834beda1 405 if (!node_isset(node, node_possible_map))
69d81fcd 406 numa_set_node(i, NUMA_NO_NODE);
1da177e4
LT
407 }
408 numa_init_array();
409 return 0;
410}
411
3484d798 412#ifdef CONFIG_NUMA_EMU
/*
 * Proximity domain chosen for each emulated node by acpi_fake_nodes().
 * Entries stay PXM_INVAL for emulated nodes that could not be matched to
 * a real node.
 */
ef97001f 413static int fake_node_to_pxm_map[MAX_NUMNODES] __initdata = {
414 [0 ... MAX_NUMNODES-1] = PXM_INVAL
415};
/*
 * APIC id -> emulated node id, built by acpi_fake_nodes() and then copied
 * over apicid_to_node[].  Entries default to NUMA_NO_NODE.
 */
602a54a8 416static s16 fake_apicid_to_node[MAX_LOCAL_APIC] __initdata = {
ef97001f 417 [0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE
418};
3484d798
DR
419static int __init find_node_by_addr(unsigned long addr)
420{
421 int ret = NUMA_NO_NODE;
422 int i;
423
424 for_each_node_mask(i, nodes_parsed) {
425 /*
426 * Find the real node that this emulated node appears on. For
427 * the sake of simplicity, we only use a real node's starting
428 * address to determine which emulated node it appears on.
429 */
430 if (addr >= nodes[i].start && addr < nodes[i].end) {
431 ret = i;
432 break;
433 }
434 }
9a1b62fe 435 return ret;
3484d798
DR
436}
437
438/*
439 * In NUMA emulation, we need to setup proximity domain (_PXM) to node ID
440 * mappings that respect the real ACPI topology but reflect our emulated
441 * environment. For each emulated node, we find which real node it appears on
442 * and create PXM to NID mappings for those fake nodes which mirror that
443 * locality. SLIT will now represent the correct distances between emulated
444 * nodes as a result of the real topology.
445 */
446void __init acpi_fake_nodes(const struct bootnode *fake_nodes, int num_nodes)
447{
08705b89 448 int i, j;
3484d798
DR
449
450 printk(KERN_INFO "Faking PXM affinity for fake nodes on real "
451 "topology.\n");
452 for (i = 0; i < num_nodes; i++) {
453 int nid, pxm;
454
455 nid = find_node_by_addr(fake_nodes[i].start);
456 if (nid == NUMA_NO_NODE)
457 continue;
458 pxm = node_to_pxm(nid);
459 if (pxm == PXM_INVAL)
460 continue;
461 fake_node_to_pxm_map[i] = pxm;
08705b89
DR
462 /*
463 * For each apicid_to_node mapping that exists for this real
464 * node, it must now point to the fake node ID.
465 */
466 for (j = 0; j < MAX_LOCAL_APIC; j++)
467 if (apicid_to_node[j] == nid)
468 fake_apicid_to_node[j] = i;
3484d798
DR
469 }
470 for (i = 0; i < num_nodes; i++)
471 __acpi_map_pxm_to_node(fake_node_to_pxm_map[i], i);
08705b89 472 memcpy(apicid_to_node, fake_apicid_to_node, sizeof(apicid_to_node));
3484d798
DR
473
474 nodes_clear(nodes_parsed);
475 for (i = 0; i < num_nodes; i++)
476 if (fake_nodes[i].start != fake_nodes[i].end)
477 node_set(i, nodes_parsed);
478 WARN_ON(!nodes_cover_memory(fake_nodes));
479}
480
/*
 * Without a SLIT, two nodes count as local to each other when they map to
 * the same proximity domain.  Returns 1 in that case, 0 otherwise.
 */
static int null_slit_node_compare(int a, int b)
{
	int pxm_a = node_to_pxm(a);
	int pxm_b = node_to_pxm(b);

	return pxm_a == pxm_b;
}
485#else
/*
 * Without a SLIT and without NUMA emulation, a node is local only to
 * itself.  Returns 1 when @a and @b are the same node id, 0 otherwise.
 */
static int null_slit_node_compare(int a, int b)
{
	return (a == b) ? 1 : 0;
}
490#endif /* CONFIG_NUMA_EMU */
491
68a3a7fe
AK
492void __init srat_reserve_add_area(int nodeid)
493{
494 if (found_add_area && nodes_add[nodeid].end) {
495 u64 total_mb;
496
497 printk(KERN_INFO "SRAT: Reserving hot-add memory space "
498 "for node %d at %Lx-%Lx\n",
499 nodeid, nodes_add[nodeid].start, nodes_add[nodeid].end);
500 total_mb = (nodes_add[nodeid].end - nodes_add[nodeid].start)
501 >> PAGE_SHIFT;
502 total_mb *= sizeof(struct page);
503 total_mb >>= 20;
504 printk(KERN_INFO "SRAT: This will cost you %Lu MB of "
505 "pre-allocated memory.\n", (unsigned long long)total_mb);
506 reserve_bootmem_node(NODE_DATA(nodeid), nodes_add[nodeid].start,
72a7fe39
BW
507 nodes_add[nodeid].end - nodes_add[nodeid].start,
508 BOOTMEM_DEFAULT);
68a3a7fe
AK
509 }
510}
511
1da177e4
LT
512int __node_distance(int a, int b)
513{
514 int index;
515
516 if (!acpi_slit)
3484d798
DR
517 return null_slit_node_compare(a, b) ? LOCAL_DISTANCE :
518 REMOTE_DISTANCE;
15a58ed1 519 index = acpi_slit->locality_count * node_to_pxm(a);
1da177e4
LT
520 return acpi_slit->entry[index + node_to_pxm(b)];
521}
522
523EXPORT_SYMBOL(__node_distance);
4942e998
KM
524
525int memory_add_physaddr_to_nid(u64 start)
526{
527 int i, ret = 0;
528
529 for_each_node(i)
530 if (nodes_add[i].start <= start && nodes_add[i].end > start)
531 ret = i;
532
533 return ret;
534}
8c2676a5
KM
535EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid);
536