powerpc/mm: Consolidate numa_enable check and min_common_depth check
[linux-2.6-block.git] arch/powerpc/mm/numa.c
/*
 * pSeries NUMA support
 *
 * Copyright (C) 2002 Anton Blanchard <anton@au.ibm.com>, IBM
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version
 * 2 of the License, or (at your option) any later version.
 */
#define pr_fmt(fmt) "numa: " fmt

#include <linux/threads.h>
#include <linux/memblock.h>
#include <linux/init.h>
#include <linux/mm.h>
#include <linux/mmzone.h>
#include <linux/export.h>
#include <linux/nodemask.h>
#include <linux/cpu.h>
#include <linux/notifier.h>
#include <linux/of.h>
#include <linux/pfn.h>
#include <linux/cpuset.h>
#include <linux/node.h>
#include <linux/stop_machine.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/uaccess.h>
#include <linux/slab.h>
#include <asm/cputhreads.h>
#include <asm/sparsemem.h>
#include <asm/prom.h>
#include <asm/smp.h>
#include <asm/topology.h>
#include <asm/firmware.h>
#include <asm/paca.h>
#include <asm/hvcall.h>
#include <asm/setup.h>
#include <asm/vdso.h>
#include <asm/drmem.h>

static int numa_enabled = 1;

static char *cmdline __initdata;

static int numa_debug;
#define dbg(args...) if (numa_debug) { printk(KERN_INFO args); }

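/*
 * Per-file mapping state: numa_cpu_lookup_table maps a logical CPU to its
 * NUMA node, node_to_cpumask_map holds the reverse mapping as a cpumask
 * per node, and node_data is the pg_data_t for each node.
 */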
int numa_cpu_lookup_table[NR_CPUS];
cpumask_var_t node_to_cpumask_map[MAX_NUMNODES];
struct pglist_data *node_data[MAX_NUMNODES];

EXPORT_SYMBOL(numa_cpu_lookup_table);
EXPORT_SYMBOL(node_to_cpumask_map);
EXPORT_SYMBOL(node_data);

static int min_common_depth;
static int n_mem_addr_cells, n_mem_size_cells;
static int form1_affinity;

#define MAX_DISTANCE_REF_POINTS 4
static int distance_ref_points_depth;
static const __be32 *distance_ref_points;
static int distance_lookup_table[MAX_NUMNODES][MAX_DISTANCE_REF_POINTS];

/*
 * Allocate node_to_cpumask_map based on number of available nodes
 * Requires node_possible_map to be valid.
 *
 * Note: cpumask_of_node() is not valid until after this is done.
 */
static void __init setup_node_to_cpumask_map(void)
{
	unsigned int node;

	/* setup nr_node_ids if not done yet */
	if (nr_node_ids == MAX_NUMNODES)
		setup_nr_node_ids();

	/* allocate the map */
	for_each_node(node)
		alloc_bootmem_cpumask_var(&node_to_cpumask_map[node]);

	/* cpumask_of_node() will now work */
	dbg("Node to cpumask map for %u nodes\n", nr_node_ids);
}

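/*
 * "numa=fake=" on the kernel command line (saved in 'cmdline' by
 * early_numa()) is a list of ascending memory boundaries; each boundary
 * crossed while scanning memory creates a new fake NUMA node.
 */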
static int __init fake_numa_create_new_node(unsigned long end_pfn,
						unsigned int *nid)
{
	unsigned long long mem;
	char *p = cmdline;
	static unsigned int fake_nid;
	static unsigned long long curr_boundary;

	/*
	 * Modify node id, iff we started creating NUMA nodes
	 * We want to continue from where we left off the last time
	 */
	if (fake_nid)
		*nid = fake_nid;
	/*
	 * In case there are no more arguments to parse, the
	 * node_id should be the same as the last fake node id
	 * (we've handled this above).
	 */
	if (!p)
		return 0;

	mem = memparse(p, &p);
	if (!mem)
		return 0;

	if (mem < curr_boundary)
		return 0;

	curr_boundary = mem;

	if ((end_pfn << PAGE_SHIFT) > mem) {
		/*
		 * Skip commas and spaces
		 */
		while (*p == ',' || *p == ' ' || *p == '\t')
			p++;

		cmdline = p;
		fake_nid++;
		*nid = fake_nid;
		dbg("created new fake_node with id %d\n", fake_nid);
		return 1;
	}
	return 0;
}

static void reset_numa_cpu_lookup_table(void)
{
	unsigned int cpu;

	for_each_possible_cpu(cpu)
		numa_cpu_lookup_table[cpu] = -1;
}

static void map_cpu_to_node(int cpu, int node)
{
	update_numa_cpu_lookup_table(cpu, node);

	dbg("adding cpu %d to node %d\n", cpu, node);

	if (!(cpumask_test_cpu(cpu, node_to_cpumask_map[node])))
		cpumask_set_cpu(cpu, node_to_cpumask_map[node]);
}

#if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_PPC_SPLPAR)
static void unmap_cpu_from_node(unsigned long cpu)
{
	int node = numa_cpu_lookup_table[cpu];

	dbg("removing cpu %lu from node %d\n", cpu, node);

	if (cpumask_test_cpu(cpu, node_to_cpumask_map[node])) {
		cpumask_clear_cpu(cpu, node_to_cpumask_map[node]);
	} else {
		printk(KERN_ERR "WARNING: cpu %lu not found in node %d\n",
		       cpu, node);
	}
}
#endif /* CONFIG_HOTPLUG_CPU || CONFIG_PPC_SPLPAR */

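/*
 * Count how many associativity reference points differ between two CPUs'
 * ibm,associativity arrays before the first shared domain is found;
 * 0 means they already share the most significant listed domain.
 */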
int cpu_distance(__be32 *cpu1_assoc, __be32 *cpu2_assoc)
{
	int dist = 0;

	int i, index;

	for (i = 0; i < distance_ref_points_depth; i++) {
		index = be32_to_cpu(distance_ref_points[i]);
		if (cpu1_assoc[index] == cpu2_assoc[index])
			break;
		dist++;
	}

	return dist;
}

/* must hold reference to node during call */
static const __be32 *of_get_associativity(struct device_node *dev)
{
	return of_get_property(dev, "ibm,associativity", NULL);
}

int __node_distance(int a, int b)
{
	int i;
	int distance = LOCAL_DISTANCE;

	if (!form1_affinity)
		return ((a == b) ? LOCAL_DISTANCE : REMOTE_DISTANCE);

	for (i = 0; i < distance_ref_points_depth; i++) {
		if (distance_lookup_table[a][i] == distance_lookup_table[b][i])
			break;

		/* Double the distance for each NUMA level */
		distance *= 2;
	}

	return distance;
}
EXPORT_SYMBOL(__node_distance);

static void initialize_distance_lookup_table(int nid,
		const __be32 *associativity)
{
	int i;

	if (!form1_affinity)
		return;

	for (i = 0; i < distance_ref_points_depth; i++) {
		const __be32 *entry;

		entry = &associativity[be32_to_cpu(distance_ref_points[i]) - 1];
		distance_lookup_table[nid][i] = of_read_number(entry, 1);
	}
}

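/*
 * An ibm,associativity property is an array of cells: the first cell is
 * the number of entries that follow, and the rest are domain ids ordered
 * from the most significant level down. min_common_depth selects which
 * entry is used as the Linux node id.
 */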
/* Returns nid in the range [0..MAX_NUMNODES-1], or -1 if no useful numa
 * info is found.
 */
static int associativity_to_nid(const __be32 *associativity)
{
	int nid = NUMA_NO_NODE;

	if (!numa_enabled)
		goto out;

	if (of_read_number(associativity, 1) >= min_common_depth)
		nid = of_read_number(&associativity[min_common_depth], 1);

	/* POWER4 LPAR uses 0xffff as invalid node */
	if (nid == 0xffff || nid >= MAX_NUMNODES)
		nid = NUMA_NO_NODE;

	if (nid > 0 &&
	    of_read_number(associativity, 1) >= distance_ref_points_depth) {
		/*
		 * Skip the length field and send start of associativity array
		 */
		initialize_distance_lookup_table(nid, associativity + 1);
	}

out:
	return nid;
}

/* Returns the nid associated with the given device tree node,
 * or -1 if not found.
 */
static int of_node_to_nid_single(struct device_node *device)
{
	int nid = NUMA_NO_NODE;
	const __be32 *tmp;

	tmp = of_get_associativity(device);
	if (tmp)
		nid = associativity_to_nid(tmp);
	return nid;
}

/* Walk the device tree upwards, looking for an associativity id */
int of_node_to_nid(struct device_node *device)
{
	int nid = NUMA_NO_NODE;

	of_node_get(device);
	while (device) {
		nid = of_node_to_nid_single(device);
		if (nid != -1)
			break;

		device = of_get_next_parent(device);
	}
	of_node_put(device);

	return nid;
}
EXPORT_SYMBOL(of_node_to_nid);

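/*
 * Parse ibm,associativity-reference-points to decide which associativity
 * level acts as the NUMA node boundary. The returned depth is cached in
 * min_common_depth by parse_numa_properties().
 */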
static int __init find_min_common_depth(void)
{
	int depth;
	struct device_node *root;

	if (firmware_has_feature(FW_FEATURE_OPAL))
		root = of_find_node_by_path("/ibm,opal");
	else
		root = of_find_node_by_path("/rtas");
	if (!root)
		root = of_find_node_by_path("/");

	/*
	 * This property is a set of 32-bit integers, each representing
	 * an index into the ibm,associativity nodes.
	 *
	 * With form 0 affinity the first integer is for an SMP configuration
	 * (should be all 0's) and the second is for a normal NUMA
	 * configuration. We have only one level of NUMA.
	 *
	 * With form 1 affinity the first integer is the most significant
	 * NUMA boundary and the following are progressively less significant
	 * boundaries. There can be more than one level of NUMA.
	 */
	distance_ref_points = of_get_property(root,
					"ibm,associativity-reference-points",
					&distance_ref_points_depth);

	if (!distance_ref_points) {
		dbg("NUMA: ibm,associativity-reference-points not found.\n");
		goto err;
	}

	distance_ref_points_depth /= sizeof(int);

	if (firmware_has_feature(FW_FEATURE_OPAL) ||
	    firmware_has_feature(FW_FEATURE_TYPE1_AFFINITY)) {
		dbg("Using form 1 affinity\n");
		form1_affinity = 1;
	}

	if (form1_affinity) {
		depth = of_read_number(distance_ref_points, 1);
	} else {
		if (distance_ref_points_depth < 2) {
			printk(KERN_WARNING "NUMA: "
				"short ibm,associativity-reference-points\n");
			goto err;
		}

		depth = of_read_number(&distance_ref_points[1], 1);
	}

	/*
	 * Warn and cap if the hardware supports more than
	 * MAX_DISTANCE_REF_POINTS domains.
	 */
	if (distance_ref_points_depth > MAX_DISTANCE_REF_POINTS) {
		printk(KERN_WARNING "NUMA: distance array capped at "
			"%d entries\n", MAX_DISTANCE_REF_POINTS);
		distance_ref_points_depth = MAX_DISTANCE_REF_POINTS;
	}

	of_node_put(root);
	return depth;

err:
	of_node_put(root);
	return -1;
}

static void __init get_n_mem_cells(int *n_addr_cells, int *n_size_cells)
{
	struct device_node *memory = NULL;

	memory = of_find_node_by_type(memory, "memory");
	if (!memory)
		panic("numa.c: No memory nodes found!");

	*n_addr_cells = of_n_addr_cells(memory);
	*n_size_cells = of_n_size_cells(memory);
	of_node_put(memory);
}

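/*
 * Device tree addresses and sizes can span several 32-bit cells; combine
 * 'n' big-endian cells from *buf into one value and advance the buffer.
 */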
static unsigned long read_n_cells(int n, const __be32 **buf)
{
	unsigned long result = 0;

	while (n--) {
		result = (result << 32) | of_read_number(*buf, 1);
		(*buf)++;
	}
	return result;
}

struct assoc_arrays {
	u32	n_arrays;
	u32	array_sz;
	const __be32 *arrays;
};

/*
 * Retrieve and validate the list of associativity arrays for drconf
 * memory from the ibm,associativity-lookup-arrays property of the
 * device tree.
 *
 * The layout of the ibm,associativity-lookup-arrays property is a number N
 * indicating the number of associativity arrays, followed by a number M
 * indicating the size of each associativity array, followed by a list
 * of N associativity arrays.
 */
static int of_get_assoc_arrays(struct assoc_arrays *aa)
{
	struct device_node *memory;
	const __be32 *prop;
	u32 len;

	memory = of_find_node_by_path("/ibm,dynamic-reconfiguration-memory");
	if (!memory)
		return -1;

	prop = of_get_property(memory, "ibm,associativity-lookup-arrays", &len);
	if (!prop || len < 2 * sizeof(unsigned int)) {
		of_node_put(memory);
		return -1;
	}

	aa->n_arrays = of_read_number(prop++, 1);
	aa->array_sz = of_read_number(prop++, 1);

	of_node_put(memory);

	/* Now that we know the number of arrays and size of each array,
	 * revalidate the size of the property read in.
	 */
	if (len < (aa->n_arrays * aa->array_sz + 2) * sizeof(unsigned int))
		return -1;

	aa->arrays = prop;
	return 0;
}

/*
 * This is like of_node_to_nid_single() for memory represented in the
 * ibm,dynamic-reconfiguration-memory node.
 */
static int of_drconf_to_nid_single(struct drmem_lmb *lmb)
{
	struct assoc_arrays aa = { .arrays = NULL };
	int default_nid = NUMA_NO_NODE;
	int nid = default_nid;
	int rc, index;

	if ((min_common_depth < 0) || !numa_enabled)
		return default_nid;

	rc = of_get_assoc_arrays(&aa);
	if (rc)
		return default_nid;

	if (min_common_depth <= aa.array_sz &&
	    !(lmb->flags & DRCONF_MEM_AI_INVALID) && lmb->aa_index < aa.n_arrays) {
		index = lmb->aa_index * aa.array_sz + min_common_depth - 1;
		nid = of_read_number(&aa.arrays[index], 1);

		if (nid == 0xffff || nid >= MAX_NUMNODES)
			nid = default_nid;

		if (nid > 0) {
			index = lmb->aa_index * aa.array_sz;
			initialize_distance_lookup_table(nid,
							&aa.arrays[index]);
		}
	}

	return nid;
}

/*
 * Figure out to which domain a cpu belongs and stick it there.
 * Return the id of the domain used.
 */
static int numa_setup_cpu(unsigned long lcpu)
{
	int nid = NUMA_NO_NODE;
	struct device_node *cpu;

	/*
	 * If a valid cpu-to-node mapping is already available, use it
	 * directly instead of querying the firmware, since it represents
	 * the most recent mapping notified to us by the platform (eg: VPHN).
	 */
	if ((nid = numa_cpu_lookup_table[lcpu]) >= 0) {
		map_cpu_to_node(lcpu, nid);
		return nid;
	}

	cpu = of_get_cpu_node(lcpu, NULL);

	if (!cpu) {
		WARN_ON(1);
		if (cpu_present(lcpu))
			goto out_present;
		else
			goto out;
	}

	nid = of_node_to_nid_single(cpu);

out_present:
	if (nid < 0 || !node_possible(nid))
		nid = first_online_node;

	map_cpu_to_node(lcpu, nid);
	of_node_put(cpu);
out:
	return nid;
}

static void verify_cpu_node_mapping(int cpu, int node)
{
	int base, sibling, i;

	/* Verify that all the threads in the core belong to the same node */
	base = cpu_first_thread_sibling(cpu);

	for (i = 0; i < threads_per_core; i++) {
		sibling = base + i;

		if (sibling == cpu || cpu_is_offline(sibling))
			continue;

		if (cpu_to_node(sibling) != node) {
			WARN(1, "CPU thread siblings %d and %d don't belong"
				" to the same node!\n", cpu, sibling);
			break;
		}
	}
}

/* Must run before sched domains notifier. */
static int ppc_numa_cpu_prepare(unsigned int cpu)
{
	int nid;

	nid = numa_setup_cpu(cpu);
	verify_cpu_node_mapping(cpu, nid);
	return 0;
}

static int ppc_numa_cpu_dead(unsigned int cpu)
{
#ifdef CONFIG_HOTPLUG_CPU
	unmap_cpu_from_node(cpu);
#endif
	return 0;
}

/*
 * Check and possibly modify a memory region to enforce the memory limit.
 *
 * Returns the size the region should have to enforce the memory limit.
 * This will either be the original value of size, a truncated value,
 * or zero. If the returned value of size is 0 the region should be
 * discarded as it lies wholly above the memory limit.
 */
static unsigned long __init numa_enforce_memory_limit(unsigned long start,
						      unsigned long size)
{
	/*
	 * We use memblock_end_of_DRAM() in here instead of memory_limit because
	 * we've already adjusted it for the limit and it takes care of
	 * having memory holes below the limit. Also, in the case of
	 * iommu_is_off, memory_limit is not set but is implicitly enforced.
	 */

	if (start + size <= memblock_end_of_DRAM())
		return size;

	if (start >= memblock_end_of_DRAM())
		return 0;

	return memblock_end_of_DRAM() - start;
}

/*
 * Reads the counter for a given entry in
 * linux,drconf-usable-memory property
 */
static inline int __init read_usm_ranges(const __be32 **usm)
{
	/*
	 * For each lmb in ibm,dynamic-memory a corresponding
	 * entry in linux,drconf-usable-memory property contains
	 * a counter followed by that many (base, size) tuples.
	 * Read the counter from linux,drconf-usable-memory.
	 */
	return read_n_cells(n_mem_size_cells, usm);
}

/*
 * Extract NUMA information from the ibm,dynamic-reconfiguration-memory
 * node. This assumes n_mem_{addr,size}_cells have been set.
 */
static void __init numa_setup_drmem_lmb(struct drmem_lmb *lmb,
					const __be32 **usm)
{
	unsigned int ranges, is_kexec_kdump = 0;
	unsigned long base, size, sz;
	int nid;

	/*
	 * Skip this block if the reserved bit is set in flags (0x80)
	 * or if the block is not assigned to this partition (0x8)
	 */
	if ((lmb->flags & DRCONF_MEM_RESERVED)
	    || !(lmb->flags & DRCONF_MEM_ASSIGNED))
		return;

	if (*usm)
		is_kexec_kdump = 1;

	base = lmb->base_addr;
	size = drmem_lmb_size();
	ranges = 1;

	if (is_kexec_kdump) {
		ranges = read_usm_ranges(usm);
		if (!ranges) /* there are no (base, size) tuples */
			return;
	}

	do {
		if (is_kexec_kdump) {
			base = read_n_cells(n_mem_addr_cells, usm);
			size = read_n_cells(n_mem_size_cells, usm);
		}

		nid = of_drconf_to_nid_single(lmb);
		fake_numa_create_new_node(((base + size) >> PAGE_SHIFT),
					  &nid);
		node_set_online(nid);
		sz = numa_enforce_memory_limit(base, size);
		if (sz)
			memblock_set_node(base, sz, &memblock.memory, nid);
	} while (--ranges);
}

static int __init parse_numa_properties(void)
{
	struct device_node *memory;
	int default_nid = 0;
	unsigned long i;

	if (numa_enabled == 0) {
		printk(KERN_WARNING "NUMA disabled by user\n");
		return -1;
	}

	min_common_depth = find_min_common_depth();

	if (min_common_depth < 0) {
		/*
		 * If we fail to parse min_common_depth from the device tree,
		 * mark NUMA as disabled and boot with NUMA disabled.
		 */
		numa_enabled = false;
		return min_common_depth;
	}

	dbg("NUMA associativity depth for CPU/Memory: %d\n", min_common_depth);

	/*
	 * Even though we connect cpus to numa domains later in SMP
	 * init, we need to know the node ids now. This is because
	 * each node to be onlined must have NODE_DATA etc backing it.
	 */
	for_each_present_cpu(i) {
		struct device_node *cpu;
		int nid;

		cpu = of_get_cpu_node(i, NULL);
		BUG_ON(!cpu);
		nid = of_node_to_nid_single(cpu);
		of_node_put(cpu);

		/*
		 * Don't fall back to default_nid yet -- we will plug
		 * cpus into nodes once the memory scan has discovered
		 * the topology.
		 */
		if (nid < 0)
			continue;
		node_set_online(nid);
	}

	get_n_mem_cells(&n_mem_addr_cells, &n_mem_size_cells);

	for_each_node_by_type(memory, "memory") {
		unsigned long start;
		unsigned long size;
		int nid;
		int ranges;
		const __be32 *memcell_buf;
		unsigned int len;

		memcell_buf = of_get_property(memory,
			"linux,usable-memory", &len);
		if (!memcell_buf || len <= 0)
			memcell_buf = of_get_property(memory, "reg", &len);
		if (!memcell_buf || len <= 0)
			continue;

		/* ranges in cell */
		ranges = (len >> 2) / (n_mem_addr_cells + n_mem_size_cells);
new_range:
		/* these are order-sensitive, and modify the buffer pointer */
		start = read_n_cells(n_mem_addr_cells, &memcell_buf);
		size = read_n_cells(n_mem_size_cells, &memcell_buf);

		/*
		 * Assumption: either all memory nodes or none will
		 * have associativity properties. If none, then
		 * everything goes to default_nid.
		 */
		nid = of_node_to_nid_single(memory);
		if (nid < 0)
			nid = default_nid;

		fake_numa_create_new_node(((start + size) >> PAGE_SHIFT), &nid);
		node_set_online(nid);

		size = numa_enforce_memory_limit(start, size);
		if (size)
			memblock_set_node(start, size, &memblock.memory, nid);

		if (--ranges)
			goto new_range;
	}

	/*
	 * Now do the same thing for each MEMBLOCK listed in the
	 * ibm,dynamic-memory property in the
	 * ibm,dynamic-reconfiguration-memory node.
	 */
	memory = of_find_node_by_path("/ibm,dynamic-reconfiguration-memory");
	if (memory) {
		walk_drmem_lmbs(memory, numa_setup_drmem_lmb);
		of_node_put(memory);
	}

	return 0;
}

static void __init setup_nonnuma(void)
{
	unsigned long top_of_ram = memblock_end_of_DRAM();
	unsigned long total_ram = memblock_phys_mem_size();
	unsigned long start_pfn, end_pfn;
	unsigned int nid = 0;
	struct memblock_region *reg;

	printk(KERN_DEBUG "Top of RAM: 0x%lx, Total RAM: 0x%lx\n",
	       top_of_ram, total_ram);
	printk(KERN_DEBUG "Memory hole size: %ldMB\n",
	       (top_of_ram - total_ram) >> 20);

	for_each_memblock(memory, reg) {
		start_pfn = memblock_region_memory_base_pfn(reg);
		end_pfn = memblock_region_memory_end_pfn(reg);

		fake_numa_create_new_node(end_pfn, &nid);
		memblock_set_node(PFN_PHYS(start_pfn),
				  PFN_PHYS(end_pfn - start_pfn),
				  &memblock.memory, nid);
		node_set_online(nid);
	}
}

void __init dump_numa_cpu_topology(void)
{
	unsigned int node;
	unsigned int cpu, count;

	if (!numa_enabled)
		return;

	for_each_online_node(node) {
		pr_info("Node %d CPUs:", node);

		count = 0;
		/*
		 * If we used a CPU iterator here we would miss printing
		 * the holes in the cpumap.
		 */
		for (cpu = 0; cpu < nr_cpu_ids; cpu++) {
			if (cpumask_test_cpu(cpu,
					node_to_cpumask_map[node])) {
				if (count == 0)
					pr_cont(" %u", cpu);
				++count;
			} else {
				if (count > 1)
					pr_cont("-%u", cpu - 1);
				count = 0;
			}
		}

		if (count > 1)
			pr_cont("-%u", nr_cpu_ids - 1);
		pr_cont("\n");
	}
}

/* Initialize NODE_DATA for a node on the local memory */
static void __init setup_node_data(int nid, u64 start_pfn, u64 end_pfn)
{
	u64 spanned_pages = end_pfn - start_pfn;
	const size_t nd_size = roundup(sizeof(pg_data_t), SMP_CACHE_BYTES);
	u64 nd_pa;
	void *nd;
	int tnid;

	nd_pa = memblock_phys_alloc_try_nid(nd_size, SMP_CACHE_BYTES, nid);
	if (!nd_pa)
		panic("Cannot allocate %zu bytes for node %d data\n",
		      nd_size, nid);

	nd = __va(nd_pa);

	/* report and initialize */
	pr_info("  NODE_DATA [mem %#010Lx-%#010Lx]\n",
		nd_pa, nd_pa + nd_size - 1);
	tnid = early_pfn_to_nid(nd_pa >> PAGE_SHIFT);
	if (tnid != nid)
		pr_info("    NODE_DATA(%d) on node %d\n", nid, tnid);

	node_data[nid] = nd;
	memset(NODE_DATA(nid), 0, sizeof(pg_data_t));
	NODE_DATA(nid)->node_id = nid;
	NODE_DATA(nid)->node_start_pfn = start_pfn;
	NODE_DATA(nid)->node_spanned_pages = spanned_pages;
}

static void __init find_possible_nodes(void)
{
	struct device_node *rtas;
	u32 numnodes, i;

	if (!numa_enabled)
		return;

	rtas = of_find_node_by_path("/rtas");
	if (!rtas)
		return;

	if (of_property_read_u32_index(rtas,
				"ibm,max-associativity-domains",
				min_common_depth, &numnodes))
		goto out;

	for (i = 0; i < numnodes; i++) {
		if (!node_possible(i))
			node_set(i, node_possible_map);
	}

out:
	of_node_put(rtas);
}

void __init mem_topology_setup(void)
{
	int cpu;

	if (parse_numa_properties())
		setup_nonnuma();

	/*
	 * Modify the set of possible NUMA nodes to reflect information
	 * available about the set of online nodes, and the set of nodes
	 * that we expect to make use of for this platform's affinity
	 * calculations.
	 */
	nodes_and(node_possible_map, node_possible_map, node_online_map);

	find_possible_nodes();

	setup_node_to_cpumask_map();

	reset_numa_cpu_lookup_table();

	for_each_present_cpu(cpu)
		numa_setup_cpu(cpu);
}

void __init initmem_init(void)
{
	int nid;

	max_low_pfn = memblock_end_of_DRAM() >> PAGE_SHIFT;
	max_pfn = max_low_pfn;

	memblock_dump_all();

	for_each_online_node(nid) {
		unsigned long start_pfn, end_pfn;

		get_pfn_range_for_nid(nid, &start_pfn, &end_pfn);
		setup_node_data(nid, start_pfn, end_pfn);
		sparse_memory_present_with_active_regions(nid);
	}

	sparse_init();

	/*
	 * We need the numa_cpu_lookup_table to be accurate for all CPUs,
	 * even before we online them, so that we can use cpu_to_{node,mem}
	 * early in boot, cf. smp_prepare_cpus().
	 * _nocalls() + manual invocation is used because cpuhp is not yet
	 * initialized for the boot CPU.
	 */
	cpuhp_setup_state_nocalls(CPUHP_POWER_NUMA_PREPARE, "powerpc/numa:prepare",
				  ppc_numa_cpu_prepare, ppc_numa_cpu_dead);
}

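/*
 * Parse "numa=" from the kernel command line: "off" disables NUMA,
 * "debug" enables the dbg() messages above, and "fake=<sizes>" hands
 * the rest of the option to fake_numa_create_new_node().
 */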
static int __init early_numa(char *p)
{
	if (!p)
		return 0;

	if (strstr(p, "off"))
		numa_enabled = 0;

	if (strstr(p, "debug"))
		numa_debug = 1;

	p = strstr(p, "fake=");
	if (p)
		cmdline = p + strlen("fake=");

	return 0;
}
early_param("numa", early_numa);

/*
 * The platform can inform us through one of several mechanisms
 * (post-migration device tree updates, PRRN or VPHN) that the NUMA
 * assignment of a resource has changed. This controls whether we act
 * on that. Disabled by default.
 */
static bool topology_updates_enabled;

static int __init early_topology_updates(char *p)
{
	if (!p)
		return 0;

	if (!strcmp(p, "on")) {
		pr_warn("Caution: enabling topology updates\n");
		topology_updates_enabled = true;
	}

	return 0;
}
early_param("topology_updates", early_topology_updates);

#ifdef CONFIG_MEMORY_HOTPLUG
/*
 * Find the node associated with a hot added memory section for
 * memory represented in the device tree by the property
 * ibm,dynamic-reconfiguration-memory/ibm,dynamic-memory.
 */
static int hot_add_drconf_scn_to_nid(unsigned long scn_addr)
{
	struct drmem_lmb *lmb;
	unsigned long lmb_size;
	int nid = NUMA_NO_NODE;

	lmb_size = drmem_lmb_size();

	for_each_drmem_lmb(lmb) {
		/* skip this block if it is reserved or not assigned to
		 * this partition */
		if ((lmb->flags & DRCONF_MEM_RESERVED)
		    || !(lmb->flags & DRCONF_MEM_ASSIGNED))
			continue;

		if ((scn_addr < lmb->base_addr)
		    || (scn_addr >= (lmb->base_addr + lmb_size)))
			continue;

		nid = of_drconf_to_nid_single(lmb);
		break;
	}

	return nid;
}

/*
 * Find the node associated with a hot added memory section for memory
 * represented in the device tree as a node (i.e. memory@XXXX) for
 * each memblock.
 */
static int hot_add_node_scn_to_nid(unsigned long scn_addr)
{
	struct device_node *memory;
	int nid = NUMA_NO_NODE;

	for_each_node_by_type(memory, "memory") {
		unsigned long start, size;
		int ranges;
		const __be32 *memcell_buf;
		unsigned int len;

		memcell_buf = of_get_property(memory, "reg", &len);
		if (!memcell_buf || len <= 0)
			continue;

		/* ranges in cell */
		ranges = (len >> 2) / (n_mem_addr_cells + n_mem_size_cells);

		while (ranges--) {
			start = read_n_cells(n_mem_addr_cells, &memcell_buf);
			size = read_n_cells(n_mem_size_cells, &memcell_buf);

			if ((scn_addr < start) || (scn_addr >= (start + size)))
				continue;

			nid = of_node_to_nid_single(memory);
			break;
		}

		if (nid >= 0)
			break;
	}

	of_node_put(memory);

	return nid;
}

/*
 * Find the node associated with a hot added memory section. Section
 * corresponds to a SPARSEMEM section, not a MEMBLOCK. It is assumed that
 * sections are fully contained within a single MEMBLOCK.
 */
int hot_add_scn_to_nid(unsigned long scn_addr)
{
	struct device_node *memory = NULL;
	int nid;

	if (!numa_enabled)
		return first_online_node;

	memory = of_find_node_by_path("/ibm,dynamic-reconfiguration-memory");
	if (memory) {
		nid = hot_add_drconf_scn_to_nid(scn_addr);
		of_node_put(memory);
	} else {
		nid = hot_add_node_scn_to_nid(scn_addr);
	}

	if (nid < 0 || !node_possible(nid))
		nid = first_online_node;

	return nid;
}

static u64 hot_add_drconf_memory_max(void)
{
	struct device_node *memory = NULL;
	struct device_node *dn = NULL;
	const __be64 *lrdr = NULL;

	dn = of_find_node_by_path("/rtas");
	if (dn) {
		lrdr = of_get_property(dn, "ibm,lrdr-capacity", NULL);
		of_node_put(dn);
		if (lrdr)
			return be64_to_cpup(lrdr);
	}

	memory = of_find_node_by_path("/ibm,dynamic-reconfiguration-memory");
	if (memory) {
		of_node_put(memory);
		return drmem_lmb_memory_max();
	}
	return 0;
}

/*
 * memory_hotplug_max - return max address of memory that may be added
 *
 * This is currently only used on systems that support drconfig memory
 * hotplug.
 */
u64 memory_hotplug_max(void)
{
	return max(hot_add_drconf_memory_max(), memblock_end_of_DRAM());
}
#endif /* CONFIG_MEMORY_HOTPLUG */

/* Virtual Processor Home Node (VPHN) support */
#ifdef CONFIG_PPC_SPLPAR
struct topology_update_data {
	struct topology_update_data *next;
	unsigned int cpu;
	int old_nid;
	int new_nid;
};

#define TOPOLOGY_DEF_TIMER_SECS	60

static u8 vphn_cpu_change_counts[NR_CPUS][MAX_DISTANCE_REF_POINTS];
static cpumask_t cpu_associativity_changes_mask;
static int vphn_enabled;
static int prrn_enabled;
static void reset_topology_timer(void);
static int topology_timer_secs = 1;
static int topology_inited;

/*
 * Change polling interval for associativity changes.
 */
int timed_topology_update(int nsecs)
{
	if (vphn_enabled) {
		if (nsecs > 0)
			topology_timer_secs = nsecs;
		else
			topology_timer_secs = TOPOLOGY_DEF_TIMER_SECS;

		reset_topology_timer();
	}

	return 0;
}

/*
 * Store the current values of the associativity change counters reported
 * by the hypervisor.
 */
static void setup_cpu_associativity_change_counters(void)
{
	int cpu;

	/* The VPHN feature supports a maximum of 8 reference points */
	BUILD_BUG_ON(MAX_DISTANCE_REF_POINTS > 8);

	for_each_possible_cpu(cpu) {
		int i;
		u8 *counts = vphn_cpu_change_counts[cpu];
		volatile u8 *hypervisor_counts = lppaca_of(cpu).vphn_assoc_counts;

		for (i = 0; i < distance_ref_points_depth; i++)
			counts[i] = hypervisor_counts[i];
	}
}

/*
 * The hypervisor maintains a set of 8 associativity change counters in
 * the VPA of each cpu that correspond to the associativity levels in the
 * ibm,associativity-reference-points property. When an associativity
 * level changes, the corresponding counter is incremented.
 *
 * Set a bit in cpu_associativity_changes_mask for each cpu whose home
 * node associativity levels have changed.
 *
 * Returns the number of cpus with unhandled associativity changes.
 */
static int update_cpu_associativity_changes_mask(void)
{
	int cpu;
	cpumask_t *changes = &cpu_associativity_changes_mask;

	for_each_possible_cpu(cpu) {
		int i, changed = 0;
		u8 *counts = vphn_cpu_change_counts[cpu];
		volatile u8 *hypervisor_counts = lppaca_of(cpu).vphn_assoc_counts;

		for (i = 0; i < distance_ref_points_depth; i++) {
			if (hypervisor_counts[i] != counts[i]) {
				counts[i] = hypervisor_counts[i];
				changed = 1;
			}
		}
		if (changed) {
			cpumask_or(changes, changes, cpu_sibling_mask(cpu));
			cpu = cpu_last_thread_sibling(cpu);
		}
	}

	return cpumask_weight(changes);
}

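/*
 * VPHN is the H_HOME_NODE_ASSOCIATIVITY hcall: the hypervisor reports the
 * associativity of a virtual processor's current home node, which can
 * change as the partition is migrated or rebalanced.
 */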
/*
 * Retrieve the new associativity information for a virtual processor's
 * home node.
 */
static long vphn_get_associativity(unsigned long cpu,
					__be32 *associativity)
{
	long rc;

	rc = hcall_vphn(get_hard_smp_processor_id(cpu),
			VPHN_FLAG_VCPU, associativity);

	switch (rc) {
	case H_FUNCTION:
		printk_once(KERN_INFO
			"VPHN is not supported. Disabling polling...\n");
		stop_topology_update();
		break;
	case H_HARDWARE:
		printk(KERN_ERR
			"hcall_vphn() experienced a hardware fault "
			"preventing VPHN. Disabling polling...\n");
		stop_topology_update();
		break;
	case H_SUCCESS:
		dbg("VPHN hcall succeeded. Reset polling...\n");
		timed_topology_update(0);
		break;
	}

	return rc;
}

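/*
 * Query the CPU's current home node via VPHN and make sure that node is
 * online and has NODE_DATA before returning it; fall back to
 * first_online_node when the lookup or onlining fails.
 */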
int find_and_online_cpu_nid(int cpu)
{
	__be32 associativity[VPHN_ASSOC_BUFSIZE] = {0};
	int new_nid;

	/* Use associativity from first thread for all siblings */
	if (vphn_get_associativity(cpu, associativity))
		return cpu_to_node(cpu);

	new_nid = associativity_to_nid(associativity);
	if (new_nid < 0 || !node_possible(new_nid))
		new_nid = first_online_node;

	if (NODE_DATA(new_nid) == NULL) {
#ifdef CONFIG_MEMORY_HOTPLUG
		/*
		 * Need to ensure that NODE_DATA is initialized for a node from
		 * available memory (see memblock_alloc_try_nid). If unable to
		 * init the node, then default to nearest node that has memory
		 * installed. Skip onlining a node if the subsystems are not
		 * yet initialized.
		 */
		if (!topology_inited || try_online_node(new_nid))
			new_nid = first_online_node;
#else
		/*
		 * Default to using the nearest node that has memory installed.
		 * Otherwise, it would be necessary to patch the kernel MM code
		 * to deal with more memoryless-node error conditions.
		 */
		new_nid = first_online_node;
#endif
	}

	pr_debug("%s:%d cpu %d nid %d\n", __FUNCTION__, __LINE__,
		cpu, new_nid);
	return new_nid;
}

/*
 * Update the CPU maps and sysfs entries for a single CPU when its NUMA
 * characteristics change. This function doesn't perform any locking and is
 * only safe to call from stop_machine().
 */
static int update_cpu_topology(void *data)
{
	struct topology_update_data *update;
	unsigned long cpu;

	if (!data)
		return -EINVAL;

	cpu = smp_processor_id();

	for (update = data; update; update = update->next) {
		int new_nid = update->new_nid;
		if (cpu != update->cpu)
			continue;

		unmap_cpu_from_node(cpu);
		map_cpu_to_node(cpu, new_nid);
		set_cpu_numa_node(cpu, new_nid);
		set_cpu_numa_mem(cpu, local_memory_node(new_nid));
		vdso_getcpu_init();
	}

	return 0;
}

static int update_lookup_table(void *data)
{
	struct topology_update_data *update;

	if (!data)
		return -EINVAL;

	/*
	 * Upon topology update, the numa-cpu lookup table needs to be updated
	 * for all threads in the core, including offline CPUs, to ensure that
	 * future hotplug operations respect the cpu-to-node associativity
	 * properly.
	 */
	for (update = data; update; update = update->next) {
		int nid, base, j;

		nid = update->new_nid;
		base = cpu_first_thread_sibling(update->cpu);

		for (j = 0; j < threads_per_core; j++) {
			update_numa_cpu_lookup_table(base + j, nid);
		}
	}

	return 0;
}

/*
 * Update the node maps and sysfs entries for each cpu whose home node
 * has changed. Returns 1 when the topology has changed, and 0 otherwise.
 *
 * cpus_locked says whether we already hold cpu_hotplug_lock.
 */
int numa_update_cpu_topology(bool cpus_locked)
{
	unsigned int cpu, sibling, changed = 0;
	struct topology_update_data *updates, *ud;
	cpumask_t updated_cpus;
	struct device *dev;
	int weight, new_nid, i = 0;

	if (!prrn_enabled && !vphn_enabled && topology_inited)
		return 0;

	weight = cpumask_weight(&cpu_associativity_changes_mask);
	if (!weight)
		return 0;

	updates = kcalloc(weight, sizeof(*updates), GFP_KERNEL);
	if (!updates)
		return 0;

	cpumask_clear(&updated_cpus);

	for_each_cpu(cpu, &cpu_associativity_changes_mask) {
		/*
		 * If siblings aren't flagged for changes, updates list
		 * will be too short. Skip on this update and set for next
		 * update.
		 */
		if (!cpumask_subset(cpu_sibling_mask(cpu),
					&cpu_associativity_changes_mask)) {
			pr_info("Sibling bits not set for associativity "
					"change, cpu%d\n", cpu);
			cpumask_or(&cpu_associativity_changes_mask,
					&cpu_associativity_changes_mask,
					cpu_sibling_mask(cpu));
			cpu = cpu_last_thread_sibling(cpu);
			continue;
		}

		new_nid = find_and_online_cpu_nid(cpu);

		if (new_nid == numa_cpu_lookup_table[cpu]) {
			cpumask_andnot(&cpu_associativity_changes_mask,
					&cpu_associativity_changes_mask,
					cpu_sibling_mask(cpu));
			dbg("Assoc chg gives same node %d for cpu%d\n",
					new_nid, cpu);
			cpu = cpu_last_thread_sibling(cpu);
			continue;
		}

		for_each_cpu(sibling, cpu_sibling_mask(cpu)) {
			ud = &updates[i++];
			ud->next = &updates[i];
			ud->cpu = sibling;
			ud->new_nid = new_nid;
			ud->old_nid = numa_cpu_lookup_table[sibling];
			cpumask_set_cpu(sibling, &updated_cpus);
		}
		cpu = cpu_last_thread_sibling(cpu);
	}

	/*
	 * Prevent processing of 'updates' from overflowing array
	 * where last entry filled in a 'next' pointer.
	 */
	if (i)
		updates[i-1].next = NULL;

	pr_debug("Topology update for the following CPUs:\n");
	if (cpumask_weight(&updated_cpus)) {
		for (ud = &updates[0]; ud; ud = ud->next) {
			pr_debug("cpu %d moving from node %d "
					  "to %d\n", ud->cpu,
					  ud->old_nid, ud->new_nid);
		}
	}

	/*
	 * In cases where we have nothing to update (because the updates list
	 * is too short or because the new topology is same as the old one),
	 * skip invoking update_cpu_topology() via stop-machine(). This is
	 * necessary (and not just a fast-path optimization) since stop-machine
	 * can end up electing a random CPU to run update_cpu_topology(), and
	 * thus trick us into setting up incorrect cpu-node mappings (since
	 * 'updates' is kzalloc()'ed).
	 *
	 * And for the similar reason, we will skip all the following updating.
	 */
	if (!cpumask_weight(&updated_cpus))
		goto out;

	if (cpus_locked)
		stop_machine_cpuslocked(update_cpu_topology, &updates[0],
					&updated_cpus);
	else
		stop_machine(update_cpu_topology, &updates[0], &updated_cpus);

	/*
	 * Update the numa-cpu lookup table with the new mappings, even for
	 * offline CPUs. It is best to perform this update from the stop-
	 * machine context.
	 */
	if (cpus_locked)
		stop_machine_cpuslocked(update_lookup_table, &updates[0],
					cpumask_of(raw_smp_processor_id()));
	else
		stop_machine(update_lookup_table, &updates[0],
			     cpumask_of(raw_smp_processor_id()));

	for (ud = &updates[0]; ud; ud = ud->next) {
		unregister_cpu_under_node(ud->cpu, ud->old_nid);
		register_cpu_under_node(ud->cpu, ud->new_nid);

		dev = get_cpu_device(ud->cpu);
		if (dev)
			kobject_uevent(&dev->kobj, KOBJ_CHANGE);
		cpumask_clear_cpu(ud->cpu, &cpu_associativity_changes_mask);
		changed = 1;
	}

out:
	kfree(updates);
	return changed;
}

1444
3e401f7a
TJB
1445int arch_update_cpu_topology(void)
1446{
3e401f7a
TJB
1447 return numa_update_cpu_topology(true);
1448}
1449
9eff1a38
JL
1450static void topology_work_fn(struct work_struct *work)
1451{
1452 rebuild_sched_domains();
1453}
1454static DECLARE_WORK(topology_work, topology_work_fn);
1455
ec32dd66 1456static void topology_schedule_update(void)
9eff1a38
JL
1457{
1458 schedule_work(&topology_work);
1459}
1460
df7e828c 1461static void topology_timer_fn(struct timer_list *unused)
9eff1a38 1462{
5d88aa85 1463 if (prrn_enabled && cpumask_weight(&cpu_associativity_changes_mask))
9eff1a38 1464 topology_schedule_update();
5d88aa85
JL
1465 else if (vphn_enabled) {
1466 if (update_cpu_associativity_changes_mask() > 0)
1467 topology_schedule_update();
1468 reset_topology_timer();
1469 }
9eff1a38 1470}
df7e828c 1471static struct timer_list topology_timer;
9eff1a38 1472
5d88aa85 1473static void reset_topology_timer(void)
9eff1a38 1474{
8604895a
MB
1475 if (vphn_enabled)
1476 mod_timer(&topology_timer, jiffies + topology_timer_secs * HZ);
9eff1a38
JL
1477}
1478
601abdc3
NF
1479#ifdef CONFIG_SMP
1480
5d88aa85
JL
1481static int dt_update_callback(struct notifier_block *nb,
1482 unsigned long action, void *data)
1483{
f5242e5a 1484 struct of_reconfig_data *update = data;
5d88aa85
JL
1485 int rc = NOTIFY_DONE;
1486
1487 switch (action) {
5d88aa85 1488 case OF_RECONFIG_UPDATE_PROPERTY:
e5480bdc 1489 if (of_node_is_type(update->dn, "cpu") &&
30c05350 1490 !of_prop_cmp(update->prop->name, "ibm,associativity")) {
5d88aa85
JL
1491 u32 core_id;
1492 of_property_read_u32(update->dn, "reg", &core_id);
81b61324 1493 rc = dlpar_cpu_readd(core_id);
5d88aa85
JL
1494 rc = NOTIFY_OK;
1495 }
1496 break;
1497 }
1498
1499 return rc;
9eff1a38
JL
1500}
1501
5d88aa85
JL
1502static struct notifier_block dt_update_nb = {
1503 .notifier_call = dt_update_callback,
1504};
1505
601abdc3
NF
1506#endif
1507
/*
 * Start polling for associativity changes.
 */
int start_topology_update(void)
{
	int rc = 0;

	if (!topology_updates_enabled)
		return 0;

	if (firmware_has_feature(FW_FEATURE_PRRN)) {
		if (!prrn_enabled) {
			prrn_enabled = 1;
#ifdef CONFIG_SMP
			rc = of_reconfig_notifier_register(&dt_update_nb);
#endif
		}
	}
	if (firmware_has_feature(FW_FEATURE_VPHN) &&
		   lppaca_shared_proc(get_lppaca())) {
		if (!vphn_enabled) {
			vphn_enabled = 1;
			setup_cpu_associativity_change_counters();
			timer_setup(&topology_timer, topology_timer_fn,
				    TIMER_DEFERRABLE);
			reset_topology_timer();
		}
	}

	pr_info("Starting topology update%s%s\n",
		(prrn_enabled ? " prrn_enabled" : ""),
		(vphn_enabled ? " vphn_enabled" : ""));

	return rc;
}

/*
 * Disable polling for VPHN associativity changes.
 */
int stop_topology_update(void)
{
	int rc = 0;

	if (!topology_updates_enabled)
		return 0;

	if (prrn_enabled) {
		prrn_enabled = 0;
#ifdef CONFIG_SMP
		rc = of_reconfig_notifier_unregister(&dt_update_nb);
#endif
	}
	if (vphn_enabled) {
		vphn_enabled = 0;
		rc = del_timer_sync(&topology_timer);
	}

	pr_info("Stopping topology update\n");

	return rc;
}

int prrn_is_enabled(void)
{
	return prrn_enabled;
}

void __init shared_proc_topology_init(void)
{
	if (lppaca_shared_proc(get_lppaca())) {
		bitmap_fill(cpumask_bits(&cpu_associativity_changes_mask),
			    nr_cpumask_bits);
		numa_update_cpu_topology(false);
	}
}

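/*
 * /proc/powerpc/topology_updates: reading reports whether VPHN/PRRN
 * polling is active; writing "on" or "off" starts or stops it.
 */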
static int topology_read(struct seq_file *file, void *v)
{
	if (vphn_enabled || prrn_enabled)
		seq_puts(file, "on\n");
	else
		seq_puts(file, "off\n");

	return 0;
}

static int topology_open(struct inode *inode, struct file *file)
{
	return single_open(file, topology_read, NULL);
}

static ssize_t topology_write(struct file *file, const char __user *buf,
			      size_t count, loff_t *off)
{
	char kbuf[4]; /* "on" or "off" plus null. */
	int read_len;

	read_len = count < 3 ? count : 3;
	if (copy_from_user(kbuf, buf, read_len))
		return -EINVAL;

	kbuf[read_len] = '\0';

	if (!strncmp(kbuf, "on", 2)) {
		topology_updates_enabled = true;
		start_topology_update();
	} else if (!strncmp(kbuf, "off", 3)) {
		stop_topology_update();
		topology_updates_enabled = false;
	} else
		return -EINVAL;

	return count;
}

static const struct file_operations topology_ops = {
	.read = seq_read,
	.write = topology_write,
	.open = topology_open,
	.release = single_release
};

static int topology_update_init(void)
{
	start_topology_update();

	if (vphn_enabled)
		topology_schedule_update();

	if (!proc_create("powerpc/topology_updates", 0644, NULL, &topology_ops))
		return -ENOMEM;

	topology_inited = 1;
	return 0;
}
device_initcall(topology_update_init);
#endif /* CONFIG_PPC_SPLPAR */