x86: e820: user-defined memory maps: remove the range instead of update it to reserved
arch/x86/mm/srat_64.c [linux-2.6-block.git]
/*
 * ACPI 3.0 based NUMA setup
 * Copyright 2004 Andi Kleen, SuSE Labs.
 *
 * Reads the ACPI SRAT table to figure out what memory belongs to which CPUs.
 *
 * Called from acpi_numa_init while reading the SRAT and SLIT tables.
 * Assumes all memory regions belonging to a single proximity domain
 * are in one chunk. Holes between them will be included in the node.
 */

#include <linux/kernel.h>
#include <linux/acpi.h>
#include <linux/mmzone.h>
#include <linux/bitmap.h>
#include <linux/module.h>
#include <linux/topology.h>
#include <linux/bootmem.h>
#include <linux/mm.h>
#include <asm/proto.h>
#include <asm/numa.h>
#include <asm/e820.h>
#include <asm/genapic.h>

int acpi_numa __initdata;

static struct acpi_table_slit *acpi_slit;

static nodemask_t nodes_parsed __initdata;
static struct bootnode nodes[MAX_NUMNODES] __initdata;
static struct bootnode nodes_add[MAX_NUMNODES];
static int found_add_area __initdata;
int hotadd_percent __initdata = 0;

static int num_node_memblks __initdata;
static struct bootnode node_memblk_range[NR_NODE_MEMBLKS] __initdata;
static int memblk_nodeid[NR_NODE_MEMBLKS] __initdata;

/* Too small nodes confuse the VM badly. Usually they result
   from BIOS bugs. */
#define NODE_MIN_SIZE (4*1024*1024)

static __init int setup_node(int pxm)
{
	return acpi_map_pxm_to_node(pxm);
}

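/* Return the node id of an already-parsed memory block that overlaps
   [start, end), or -1 if there is no conflict. */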
static __init int conflicting_memblks(unsigned long start, unsigned long end)
{
	int i;
	for (i = 0; i < num_node_memblks; i++) {
		struct bootnode *nd = &node_memblk_range[i];
		if (nd->start == nd->end)
			continue;
		if (nd->end > start && nd->start < end)
			return memblk_nodeid[i];
		if (nd->end == end && nd->start == start)
			return memblk_nodeid[i];
	}
	return -1;
}

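/* Clamp node i's range to [start, end). Skipped once a hotplug add area
   has been parsed. */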
static __init void cutoff_node(int i, unsigned long start, unsigned long end)
{
	struct bootnode *nd = &nodes[i];

	if (found_add_area)
		return;

	if (nd->start < start) {
		nd->start = start;
		if (nd->end < nd->start)
			nd->start = nd->end;
	}
	if (nd->end > end) {
		nd->end = end;
		if (nd->start > nd->end)
			nd->start = nd->end;
	}
}

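/* Give up on the SRAT: reset everything derived from it so boot falls
   back to non-SRAT NUMA setup. */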
static __init void bad_srat(void)
{
	int i;
	printk(KERN_ERR "SRAT: SRAT not used.\n");
	acpi_numa = -1;
	found_add_area = 0;
	for (i = 0; i < MAX_LOCAL_APIC; i++)
		apicid_to_node[i] = NUMA_NO_NODE;
	for (i = 0; i < MAX_NUMNODES; i++)
		nodes_add[i].start = nodes[i].end = 0;
	remove_all_active_ranges();
}

static __init inline int srat_disabled(void)
{
	return numa_off || acpi_numa < 0;
}

/* Callback for SLIT parsing */
void __init acpi_numa_slit_init(struct acpi_table_slit *slit)
{
	acpi_slit = slit;
}

/* Callback for Proximity Domain -> LAPIC mapping */
void __init
acpi_numa_processor_affinity_init(struct acpi_srat_cpu_affinity *pa)
{
	int pxm, node;
	int apic_id;

	if (srat_disabled())
		return;
	if (pa->header.length != sizeof(struct acpi_srat_cpu_affinity)) {
		bad_srat();
		return;
	}
	if ((pa->flags & ACPI_SRAT_CPU_ENABLED) == 0)
		return;
	pxm = pa->proximity_domain_lo;
	node = setup_node(pxm);
	if (node < 0) {
		printk(KERN_ERR "SRAT: Too many proximity domains %x\n", pxm);
		bad_srat();
		return;
	}

	if (is_uv_system())
		apic_id = (pa->apic_id << 8) | pa->local_sapic_eid;
	else
		apic_id = pa->apic_id;
	apicid_to_node[apic_id] = node;
	acpi_numa = 1;
	printk(KERN_INFO "SRAT: PXM %u -> APIC %u -> Node %u\n",
	       pxm, apic_id, node);
}

static int update_end_of_memory(unsigned long end) {return -1;}
static int hotadd_enough_memory(struct bootnode *nd) {return 1;}
#ifdef CONFIG_MEMORY_HOTPLUG_SPARSE
static inline int save_add_info(void) {return 1;}
#else
static inline int save_add_info(void) {return 0;}
#endif
/*
 * Update nodes_add and decide whether to include the add area in the zone.
 * Both SPARSE and RESERVE need nodes_add information.
 * This code supports one contiguous hot add area per node.
 */
static int __init
reserve_hotadd(int node, unsigned long start, unsigned long end)
{
	unsigned long s_pfn = start >> PAGE_SHIFT;
	unsigned long e_pfn = end >> PAGE_SHIFT;
	int ret = 0, changed = 0;
	struct bootnode *nd = &nodes_add[node];

	/* I had some trouble with strange memory hotadd regions breaking
	   the boot. Be very strict here and reject anything unexpected.
	   If you want working memory hotadd write correct SRATs.

	   The node size check is a basic sanity check to guard against
	   mistakes */
	if ((signed long)(end - start) < NODE_MIN_SIZE) {
		printk(KERN_ERR "SRAT: Hotplug area too small\n");
		return -1;
	}

	/* This check might be a bit too strict, but I'm keeping it for now. */
	if (absent_pages_in_range(s_pfn, e_pfn) != e_pfn - s_pfn) {
		printk(KERN_ERR
			"SRAT: Hotplug area %lu -> %lu has existing memory\n",
			s_pfn, e_pfn);
		return -1;
	}

	if (!hotadd_enough_memory(&nodes_add[node])) {
		printk(KERN_ERR "SRAT: Hotplug area too large\n");
		return -1;
	}

	/* Looks good */

	if (nd->start == nd->end) {
		nd->start = start;
		nd->end = end;
		changed = 1;
	} else {
		if (nd->start == end) {
			nd->start = start;
			changed = 1;
		}
		if (nd->end == start) {
			nd->end = end;
			changed = 1;
		}
		if (!changed)
			printk(KERN_ERR "SRAT: Hotplug zone not continuous. Partly ignored\n");
	}

	ret = update_end_of_memory(nd->end);

	if (changed)
		printk(KERN_INFO "SRAT: hot plug zone found %Lx - %Lx\n", nd->start, nd->end);
	return ret;
}

/* Callback for parsing of the Proximity Domain <-> Memory Area mappings */
void __init
acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma)
{
	struct bootnode *nd, oldnode;
	unsigned long start, end;
	int node, pxm;
	int i;

	if (srat_disabled())
		return;
	if (ma->header.length != sizeof(struct acpi_srat_mem_affinity)) {
		bad_srat();
		return;
	}
	if ((ma->flags & ACPI_SRAT_MEM_ENABLED) == 0)
		return;

	if ((ma->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE) && !save_add_info())
		return;
	start = ma->base_address;
	end = start + ma->length;
	pxm = ma->proximity_domain;
	node = setup_node(pxm);
	if (node < 0) {
		printk(KERN_ERR "SRAT: Too many proximity domains.\n");
		bad_srat();
		return;
	}
	i = conflicting_memblks(start, end);
	if (i == node) {
		printk(KERN_WARNING
		"SRAT: Warning: PXM %d (%lx-%lx) overlaps with itself (%Lx-%Lx)\n",
			pxm, start, end, nodes[i].start, nodes[i].end);
	} else if (i >= 0) {
		printk(KERN_ERR
		       "SRAT: PXM %d (%lx-%lx) overlaps with PXM %d (%Lx-%Lx)\n",
		       pxm, start, end, node_to_pxm(i),
		       nodes[i].start, nodes[i].end);
		bad_srat();
		return;
	}
	nd = &nodes[node];
	oldnode = *nd;
	if (!node_test_and_set(node, nodes_parsed)) {
		nd->start = start;
		nd->end = end;
	} else {
		if (start < nd->start)
			nd->start = start;
		if (nd->end < end)
			nd->end = end;
	}

	printk(KERN_INFO "SRAT: Node %u PXM %u %lx-%lx\n", node, pxm,
	       start, end);
	e820_register_active_regions(node, start >> PAGE_SHIFT,
				     end >> PAGE_SHIFT);
	push_node_boundaries(node, nd->start >> PAGE_SHIFT,
			     nd->end >> PAGE_SHIFT);

	if ((ma->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE) &&
	    (reserve_hotadd(node, start, end) < 0)) {
		/* Ignore hotadd region. Undo damage */
		printk(KERN_NOTICE "SRAT: Hotplug region ignored\n");
		*nd = oldnode;
		if ((nd->start | nd->end) == 0)
			node_clear(node, nodes_parsed);
	}

	node_memblk_range[num_node_memblks].start = start;
	node_memblk_range[num_node_memblks].end = end;
	memblk_nodeid[num_node_memblks] = node;
	num_node_memblks++;
}

/* Sanity check to catch more bad SRATs (they are amazingly common).
   Make sure the PXMs cover all memory. */
static int __init nodes_cover_memory(const struct bootnode *nodes)
{
	int i;
	unsigned long pxmram, e820ram;

	pxmram = 0;
	for_each_node_mask(i, nodes_parsed) {
		unsigned long s = nodes[i].start >> PAGE_SHIFT;
		unsigned long e = nodes[i].end >> PAGE_SHIFT;
		pxmram += e - s;
		pxmram -= absent_pages_in_range(s, e);
		if ((long)pxmram < 0)
			pxmram = 0;
	}

	e820ram = max_pfn - absent_pages_in_range(0, max_pfn);
	/* We seem to lose 3 pages somewhere. Allow a bit of slack. */
	if ((long)(e820ram - pxmram) >= 1*1024*1024) {
		printk(KERN_ERR
		"SRAT: PXMs only cover %luMB of your %luMB e820 RAM. Not used.\n",
			(pxmram << PAGE_SHIFT) >> 20,
			(e820ram << PAGE_SHIFT) >> 20);
		return 0;
	}
	return 1;
}

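/* Drop a node that turned out to be unusable: forget that it was parsed
   and detach any APIC ids that were pointing at it. */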
static void __init unparse_node(int node)
{
	int i;
	node_clear(node, nodes_parsed);
	for (i = 0; i < MAX_LOCAL_APIC; i++) {
		if (apicid_to_node[i] == node)
			apicid_to_node[i] = NUMA_NO_NODE;
	}
}

void __init acpi_numa_arch_fixup(void) {}

/* Use the information discovered above to actually set up the nodes. */
int __init acpi_scan_nodes(unsigned long start, unsigned long end)
{
	int i;

	if (acpi_numa <= 0)
		return -1;

	/* First clean up the node list */
	for (i = 0; i < MAX_NUMNODES; i++) {
		cutoff_node(i, start, end);
		/*
		 * don't confuse VM with a node that doesn't have the
		 * minimum memory.
		 */
		if (nodes[i].end &&
		    (nodes[i].end - nodes[i].start) < NODE_MIN_SIZE) {
			unparse_node(i);
			node_set_offline(i);
		}
	}

	if (!nodes_cover_memory(nodes)) {
		bad_srat();
		return -1;
	}

	memnode_shift = compute_hash_shift(node_memblk_range, num_node_memblks,
					   memblk_nodeid);
	if (memnode_shift < 0) {
		printk(KERN_ERR
		     "SRAT: No NUMA node hash function found. Contact maintainer\n");
		bad_srat();
		return -1;
	}

	node_possible_map = nodes_parsed;

	/* Finally register nodes */
	for_each_node_mask(i, node_possible_map)
		setup_node_bootmem(i, nodes[i].start, nodes[i].end);
	/* Try again in case setup_node_bootmem missed one due
	   to missing bootmem */
	for_each_node_mask(i, node_possible_map)
		if (!node_online(i))
			setup_node_bootmem(i, nodes[i].start, nodes[i].end);

	for (i = 0; i < NR_CPUS; i++) {
		int node = early_cpu_to_node(i);

		if (node == NUMA_NO_NODE)
			continue;
		if (!node_isset(node, node_possible_map))
			numa_clear_node(i);
	}
	numa_init_array();
	return 0;
}

#ifdef CONFIG_NUMA_EMU
static int fake_node_to_pxm_map[MAX_NUMNODES] __initdata = {
	[0 ... MAX_NUMNODES-1] = PXM_INVAL
};
static s16 fake_apicid_to_node[MAX_LOCAL_APIC] __initdata = {
	[0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE
};
static int __init find_node_by_addr(unsigned long addr)
{
	int ret = NUMA_NO_NODE;
	int i;

	for_each_node_mask(i, nodes_parsed) {
		/*
		 * Find the real node that this emulated node appears on. For
		 * the sake of simplicity, we only use a real node's starting
		 * address to determine which emulated node it appears on.
		 */
		if (addr >= nodes[i].start && addr < nodes[i].end) {
			ret = i;
			break;
		}
	}
	return ret;
}

/*
 * In NUMA emulation, we need to setup proximity domain (_PXM) to node ID
 * mappings that respect the real ACPI topology but reflect our emulated
 * environment. For each emulated node, we find which real node it appears on
 * and create PXM to NID mappings for those fake nodes which mirror that
 * locality. SLIT will now represent the correct distances between emulated
 * nodes as a result of the real topology.
 */
void __init acpi_fake_nodes(const struct bootnode *fake_nodes, int num_nodes)
{
	int i, j;

	printk(KERN_INFO "Faking PXM affinity for fake nodes on real "
			 "topology.\n");
	for (i = 0; i < num_nodes; i++) {
		int nid, pxm;

		nid = find_node_by_addr(fake_nodes[i].start);
		if (nid == NUMA_NO_NODE)
			continue;
		pxm = node_to_pxm(nid);
		if (pxm == PXM_INVAL)
			continue;
		fake_node_to_pxm_map[i] = pxm;
		/*
		 * For each apicid_to_node mapping that exists for this real
		 * node, it must now point to the fake node ID.
		 */
		for (j = 0; j < MAX_LOCAL_APIC; j++)
			if (apicid_to_node[j] == nid)
				fake_apicid_to_node[j] = i;
	}
	for (i = 0; i < num_nodes; i++)
		__acpi_map_pxm_to_node(fake_node_to_pxm_map[i], i);
	memcpy(apicid_to_node, fake_apicid_to_node, sizeof(apicid_to_node));

	nodes_clear(nodes_parsed);
	for (i = 0; i < num_nodes; i++)
		if (fake_nodes[i].start != fake_nodes[i].end)
			node_set(i, nodes_parsed);
	WARN_ON(!nodes_cover_memory(fake_nodes));
}

static int null_slit_node_compare(int a, int b)
{
	return node_to_pxm(a) == node_to_pxm(b);
}
#else
static int null_slit_node_compare(int a, int b)
{
	return a == b;
}
#endif /* CONFIG_NUMA_EMU */

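/* Reserve the hot-add window recorded in nodes_add[] for a node in bootmem,
   so the early allocator does not hand out that not-yet-present range. */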
void __init srat_reserve_add_area(int nodeid)
{
	if (found_add_area && nodes_add[nodeid].end) {
		u64 total_mb;

		printk(KERN_INFO "SRAT: Reserving hot-add memory space "
				"for node %d at %Lx-%Lx\n",
			nodeid, nodes_add[nodeid].start, nodes_add[nodeid].end);
		total_mb = (nodes_add[nodeid].end - nodes_add[nodeid].start)
					>> PAGE_SHIFT;
		total_mb *= sizeof(struct page);
		total_mb >>= 20;
		printk(KERN_INFO "SRAT: This will cost you %Lu MB of "
				"pre-allocated memory.\n", (unsigned long long)total_mb);
		reserve_bootmem_node(NODE_DATA(nodeid), nodes_add[nodeid].start,
			       nodes_add[nodeid].end - nodes_add[nodeid].start,
			       BOOTMEM_DEFAULT);
	}
}

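/* Return the SLIT distance between two nodes. Without a SLIT, report
   LOCAL_DISTANCE for the same PXM (or node) and REMOTE_DISTANCE otherwise. */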
int __node_distance(int a, int b)
{
	int index;

	if (!acpi_slit)
		return null_slit_node_compare(a, b) ? LOCAL_DISTANCE :
						      REMOTE_DISTANCE;
	index = acpi_slit->locality_count * node_to_pxm(a);
	return acpi_slit->entry[index + node_to_pxm(b)];
}

EXPORT_SYMBOL(__node_distance);

#if defined(CONFIG_MEMORY_HOTPLUG_SPARSE) || defined(CONFIG_ACPI_HOTPLUG_MEMORY)
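/* Map a hot-added physical address to the node whose recorded hot-add
   range contains it; defaults to node 0 if no range matches. */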
int memory_add_physaddr_to_nid(u64 start)
{
	int i, ret = 0;

	for_each_node(i)
		if (nodes_add[i].start <= start && nodes_add[i].end > start)
			ret = i;

	return ret;
}
EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid);
#endif