treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 152
[linux-block.git] / arch / powerpc / mm / numa.c
CommitLineData
2874c5fd 1// SPDX-License-Identifier: GPL-2.0-or-later
1da177e4
LT
2/*
3 * pSeries NUMA support
4 *
5 * Copyright (C) 2002 Anton Blanchard <anton@au.ibm.com>, IBM
1da177e4 6 */
2d73bae1
NA
7#define pr_fmt(fmt) "numa: " fmt
8
1da177e4 9#include <linux/threads.h>
57c8a661 10#include <linux/memblock.h>
1da177e4
LT
11#include <linux/init.h>
12#include <linux/mm.h>
13#include <linux/mmzone.h>
4b16f8e2 14#include <linux/export.h>
1da177e4
LT
15#include <linux/nodemask.h>
16#include <linux/cpu.h>
17#include <linux/notifier.h>
6df1646e 18#include <linux/of.h>
06eccea6 19#include <linux/pfn.h>
9eff1a38
JL
20#include <linux/cpuset.h>
21#include <linux/node.h>
30c05350 22#include <linux/stop_machine.h>
e04fa612
NF
23#include <linux/proc_fs.h>
24#include <linux/seq_file.h>
25#include <linux/uaccess.h>
191a7120 26#include <linux/slab.h>
3be7db6a 27#include <asm/cputhreads.h>
45fb6cea 28#include <asm/sparsemem.h>
d9b2b2a2 29#include <asm/prom.h>
2249ca9d 30#include <asm/smp.h>
d4edc5b6 31#include <asm/topology.h>
9eff1a38
JL
32#include <asm/firmware.h>
33#include <asm/paca.h>
39bf990e 34#include <asm/hvcall.h>
ae3a197e 35#include <asm/setup.h>
176bbf14 36#include <asm/vdso.h>
514a9cb3 37#include <asm/drmem.h>
1da177e4
LT
38
39static int numa_enabled = 1;
40
1daa6d08
BS
41static char *cmdline __initdata;
42
1da177e4
LT
43static int numa_debug;
/*
 * Debug print helper: emits the message at KERN_INFO only when numa_debug
 * is non-zero.  Wrapped in do { } while (0) so that "dbg(...);" is exactly
 * one statement and cannot capture an "else" that follows it.
 */
#define dbg(args...) do { if (numa_debug) printk(KERN_INFO args); } while (0)
45
45fb6cea 46int numa_cpu_lookup_table[NR_CPUS];
25863de0 47cpumask_var_t node_to_cpumask_map[MAX_NUMNODES];
1da177e4 48struct pglist_data *node_data[MAX_NUMNODES];
45fb6cea
AB
49
50EXPORT_SYMBOL(numa_cpu_lookup_table);
25863de0 51EXPORT_SYMBOL(node_to_cpumask_map);
45fb6cea
AB
52EXPORT_SYMBOL(node_data);
53
1da177e4 54static int min_common_depth;
237a0989 55static int n_mem_addr_cells, n_mem_size_cells;
41eab6f8
AB
56static int form1_affinity;
57
58#define MAX_DISTANCE_REF_POINTS 4
59static int distance_ref_points_depth;
b08a2a12 60static const __be32 *distance_ref_points;
41eab6f8 61static int distance_lookup_table[MAX_NUMNODES][MAX_DISTANCE_REF_POINTS];
1da177e4 62
25863de0
AB
63/*
64 * Allocate node_to_cpumask_map based on number of available nodes
65 * Requires node_possible_map to be valid.
66 *
9512938b 67 * Note: cpumask_of_node() is not valid until after this is done.
25863de0
AB
68 */
69static void __init setup_node_to_cpumask_map(void)
70{
f9d531b8 71 unsigned int node;
25863de0
AB
72
73 /* setup nr_node_ids if not done yet */
f9d531b8
CS
74 if (nr_node_ids == MAX_NUMNODES)
75 setup_nr_node_ids();
25863de0
AB
76
77 /* allocate the map */
c118baf8 78 for_each_node(node)
25863de0
AB
79 alloc_bootmem_cpumask_var(&node_to_cpumask_map[node]);
80
81 /* cpumask_of_node() will now work */
b9726c26 82 dbg("Node to cpumask map for %u nodes\n", nr_node_ids);
25863de0
AB
83}
84
55671f3c 85static int __init fake_numa_create_new_node(unsigned long end_pfn,
1daa6d08
BS
86 unsigned int *nid)
87{
88 unsigned long long mem;
89 char *p = cmdline;
90 static unsigned int fake_nid;
91 static unsigned long long curr_boundary;
92
93 /*
94 * Modify node id, iff we started creating NUMA nodes
95 * We want to continue from where we left of the last time
96 */
97 if (fake_nid)
98 *nid = fake_nid;
99 /*
100 * In case there are no more arguments to parse, the
101 * node_id should be the same as the last fake node id
102 * (we've handled this above).
103 */
104 if (!p)
105 return 0;
106
107 mem = memparse(p, &p);
108 if (!mem)
109 return 0;
110
111 if (mem < curr_boundary)
112 return 0;
113
114 curr_boundary = mem;
115
116 if ((end_pfn << PAGE_SHIFT) > mem) {
117 /*
118 * Skip commas and spaces
119 */
120 while (*p == ',' || *p == ' ' || *p == '\t')
121 p++;
122
123 cmdline = p;
124 fake_nid++;
125 *nid = fake_nid;
126 dbg("created new fake_node with id %d\n", fake_nid);
127 return 1;
128 }
129 return 0;
130}
131
d4edc5b6
SB
132static void reset_numa_cpu_lookup_table(void)
133{
134 unsigned int cpu;
135
136 for_each_possible_cpu(cpu)
137 numa_cpu_lookup_table[cpu] = -1;
138}
139
d4edc5b6
SB
140static void map_cpu_to_node(int cpu, int node)
141{
142 update_numa_cpu_lookup_table(cpu, node);
45fb6cea 143
bf4b85b0
NL
144 dbg("adding cpu %d to node %d\n", cpu, node);
145
25863de0
AB
146 if (!(cpumask_test_cpu(cpu, node_to_cpumask_map[node])))
147 cpumask_set_cpu(cpu, node_to_cpumask_map[node]);
1da177e4
LT
148}
149
#if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_PPC_SPLPAR)
/*
 * Remove @cpu from the cpumask of the node recorded for it in
 * numa_cpu_lookup_table.
 *
 * A negative table entry (the value reset_numa_cpu_lookup_table() stores)
 * means no node is recorded for the cpu.  That case is reported through the
 * same error message as a cpu missing from its node's mask, rather than
 * being used as an index into node_to_cpumask_map.
 */
static void unmap_cpu_from_node(unsigned long cpu)
{
	int node = numa_cpu_lookup_table[cpu];

	dbg("removing cpu %lu from node %d\n", cpu, node);

	if (node >= 0 && cpumask_test_cpu(cpu, node_to_cpumask_map[node])) {
		cpumask_clear_cpu(cpu, node_to_cpumask_map[node]);
	} else {
		printk(KERN_ERR "WARNING: cpu %lu not found in node %d\n",
		       cpu, node);
	}
}
#endif /* CONFIG_HOTPLUG_CPU || CONFIG_PPC_SPLPAR */
1da177e4 165
1da177e4 166/* must hold reference to node during call */
b08a2a12 167static const __be32 *of_get_associativity(struct device_node *dev)
1da177e4 168{
e2eb6392 169 return of_get_property(dev, "ibm,associativity", NULL);
1da177e4
LT
170}
171
41eab6f8
AB
172int __node_distance(int a, int b)
173{
174 int i;
175 int distance = LOCAL_DISTANCE;
176
177 if (!form1_affinity)
7122beee 178 return ((a == b) ? LOCAL_DISTANCE : REMOTE_DISTANCE);
41eab6f8
AB
179
180 for (i = 0; i < distance_ref_points_depth; i++) {
181 if (distance_lookup_table[a][i] == distance_lookup_table[b][i])
182 break;
183
184 /* Double the distance for each NUMA level */
185 distance *= 2;
186 }
187
188 return distance;
189}
12c743eb 190EXPORT_SYMBOL(__node_distance);
41eab6f8
AB
191
192static void initialize_distance_lookup_table(int nid,
b08a2a12 193 const __be32 *associativity)
41eab6f8
AB
194{
195 int i;
196
197 if (!form1_affinity)
198 return;
199
200 for (i = 0; i < distance_ref_points_depth; i++) {
b08a2a12
AP
201 const __be32 *entry;
202
1d805440 203 entry = &associativity[be32_to_cpu(distance_ref_points[i]) - 1];
b08a2a12 204 distance_lookup_table[nid][i] = of_read_number(entry, 1);
41eab6f8
AB
205 }
206}
207
482ec7c4
NL
208/* Returns nid in the range [0..MAX_NUMNODES-1], or -1 if no useful numa
209 * info is found.
210 */
b08a2a12 211static int associativity_to_nid(const __be32 *associativity)
1da177e4 212{
98fa15f3 213 int nid = NUMA_NO_NODE;
1da177e4
LT
214
215 if (min_common_depth == -1)
482ec7c4 216 goto out;
1da177e4 217
b08a2a12
AP
218 if (of_read_number(associativity, 1) >= min_common_depth)
219 nid = of_read_number(&associativity[min_common_depth], 1);
bc16a759
NL
220
221 /* POWER4 LPAR uses 0xffff as invalid node */
482ec7c4 222 if (nid == 0xffff || nid >= MAX_NUMNODES)
98fa15f3 223 nid = NUMA_NO_NODE;
41eab6f8 224
b08a2a12 225 if (nid > 0 &&
1d805440
ND
226 of_read_number(associativity, 1) >= distance_ref_points_depth) {
227 /*
228 * Skip the length field and send start of associativity array
229 */
230 initialize_distance_lookup_table(nid, associativity + 1);
231 }
41eab6f8 232
482ec7c4 233out:
cf950b7a 234 return nid;
1da177e4
LT
235}
236
9eff1a38
JL
237/* Returns the nid associated with the given device tree node,
238 * or -1 if not found.
239 */
240static int of_node_to_nid_single(struct device_node *device)
241{
98fa15f3 242 int nid = NUMA_NO_NODE;
b08a2a12 243 const __be32 *tmp;
9eff1a38
JL
244
245 tmp = of_get_associativity(device);
246 if (tmp)
247 nid = associativity_to_nid(tmp);
248 return nid;
249}
250
953039c8
JK
251/* Walk the device tree upwards, looking for an associativity id */
252int of_node_to_nid(struct device_node *device)
253{
98fa15f3 254 int nid = NUMA_NO_NODE;
953039c8
JK
255
256 of_node_get(device);
257 while (device) {
258 nid = of_node_to_nid_single(device);
259 if (nid != -1)
260 break;
261
1def3758 262 device = of_get_next_parent(device);
953039c8
JK
263 }
264 of_node_put(device);
265
266 return nid;
267}
be9ba9ff 268EXPORT_SYMBOL(of_node_to_nid);
953039c8 269
1da177e4
LT
270static int __init find_min_common_depth(void)
271{
41eab6f8 272 int depth;
e70606eb 273 struct device_node *root;
1da177e4 274
1c8ee733
DS
275 if (firmware_has_feature(FW_FEATURE_OPAL))
276 root = of_find_node_by_path("/ibm,opal");
277 else
278 root = of_find_node_by_path("/rtas");
e70606eb
ME
279 if (!root)
280 root = of_find_node_by_path("/");
1da177e4
LT
281
282 /*
41eab6f8
AB
283 * This property is a set of 32-bit integers, each representing
284 * an index into the ibm,associativity nodes.
285 *
286 * With form 0 affinity the first integer is for an SMP configuration
287 * (should be all 0's) and the second is for a normal NUMA
288 * configuration. We have only one level of NUMA.
289 *
290 * With form 1 affinity the first integer is the most significant
291 * NUMA boundary and the following are progressively less significant
292 * boundaries. There can be more than one level of NUMA.
1da177e4 293 */
e70606eb 294 distance_ref_points = of_get_property(root,
41eab6f8
AB
295 "ibm,associativity-reference-points",
296 &distance_ref_points_depth);
297
298 if (!distance_ref_points) {
299 dbg("NUMA: ibm,associativity-reference-points not found.\n");
300 goto err;
301 }
302
303 distance_ref_points_depth /= sizeof(int);
1da177e4 304
8002b0c5
NF
305 if (firmware_has_feature(FW_FEATURE_OPAL) ||
306 firmware_has_feature(FW_FEATURE_TYPE1_AFFINITY)) {
307 dbg("Using form 1 affinity\n");
1c8ee733 308 form1_affinity = 1;
4b83c330
AB
309 }
310
41eab6f8 311 if (form1_affinity) {
b08a2a12 312 depth = of_read_number(distance_ref_points, 1);
1da177e4 313 } else {
41eab6f8
AB
314 if (distance_ref_points_depth < 2) {
315 printk(KERN_WARNING "NUMA: "
316 "short ibm,associativity-reference-points\n");
317 goto err;
318 }
319
b08a2a12 320 depth = of_read_number(&distance_ref_points[1], 1);
1da177e4 321 }
1da177e4 322
41eab6f8
AB
323 /*
324 * Warn and cap if the hardware supports more than
325 * MAX_DISTANCE_REF_POINTS domains.
326 */
327 if (distance_ref_points_depth > MAX_DISTANCE_REF_POINTS) {
328 printk(KERN_WARNING "NUMA: distance array capped at "
329 "%d entries\n", MAX_DISTANCE_REF_POINTS);
330 distance_ref_points_depth = MAX_DISTANCE_REF_POINTS;
331 }
332
e70606eb 333 of_node_put(root);
1da177e4 334 return depth;
41eab6f8
AB
335
336err:
e70606eb 337 of_node_put(root);
41eab6f8 338 return -1;
1da177e4
LT
339}
340
84c9fdd1 341static void __init get_n_mem_cells(int *n_addr_cells, int *n_size_cells)
1da177e4
LT
342{
343 struct device_node *memory = NULL;
1da177e4
LT
344
345 memory = of_find_node_by_type(memory, "memory");
54c23310 346 if (!memory)
84c9fdd1 347 panic("numa.c: No memory nodes found!");
54c23310 348
a8bda5dd 349 *n_addr_cells = of_n_addr_cells(memory);
9213feea 350 *n_size_cells = of_n_size_cells(memory);
84c9fdd1 351 of_node_put(memory);
1da177e4
LT
352}
353
b08a2a12 354static unsigned long read_n_cells(int n, const __be32 **buf)
1da177e4
LT
355{
356 unsigned long result = 0;
357
358 while (n--) {
b08a2a12 359 result = (result << 32) | of_read_number(*buf, 1);
1da177e4
LT
360 (*buf)++;
361 }
362 return result;
363}
364
8342681d
NF
365struct assoc_arrays {
366 u32 n_arrays;
367 u32 array_sz;
b08a2a12 368 const __be32 *arrays;
8342681d
NF
369};
370
371/*
25985edc 372 * Retrieve and validate the list of associativity arrays for drconf
8342681d
NF
373 * memory from the ibm,associativity-lookup-arrays property of the
374 * device tree..
375 *
376 * The layout of the ibm,associativity-lookup-arrays property is a number N
377 * indicating the number of associativity arrays, followed by a number M
378 * indicating the size of each associativity array, followed by a list
379 * of N associativity arrays.
380 */
35f80deb 381static int of_get_assoc_arrays(struct assoc_arrays *aa)
8342681d 382{
35f80deb 383 struct device_node *memory;
b08a2a12 384 const __be32 *prop;
8342681d
NF
385 u32 len;
386
35f80deb
NF
387 memory = of_find_node_by_path("/ibm,dynamic-reconfiguration-memory");
388 if (!memory)
389 return -1;
390
8342681d 391 prop = of_get_property(memory, "ibm,associativity-lookup-arrays", &len);
35f80deb
NF
392 if (!prop || len < 2 * sizeof(unsigned int)) {
393 of_node_put(memory);
8342681d 394 return -1;
35f80deb 395 }
8342681d 396
b08a2a12
AP
397 aa->n_arrays = of_read_number(prop++, 1);
398 aa->array_sz = of_read_number(prop++, 1);
8342681d 399
35f80deb
NF
400 of_node_put(memory);
401
42b2aa86 402 /* Now that we know the number of arrays and size of each array,
8342681d
NF
403 * revalidate the size of the property read in.
404 */
405 if (len < (aa->n_arrays * aa->array_sz + 2) * sizeof(unsigned int))
406 return -1;
407
408 aa->arrays = prop;
409 return 0;
410}
411
412/*
413 * This is like of_node_to_nid_single() for memory represented in the
414 * ibm,dynamic-reconfiguration-memory node.
415 */
514a9cb3 416static int of_drconf_to_nid_single(struct drmem_lmb *lmb)
8342681d 417{
b88fc309 418 struct assoc_arrays aa = { .arrays = NULL };
8342681d
NF
419 int default_nid = 0;
420 int nid = default_nid;
b88fc309
NF
421 int rc, index;
422
423 rc = of_get_assoc_arrays(&aa);
424 if (rc)
425 return default_nid;
8342681d 426
b88fc309 427 if (min_common_depth > 0 && min_common_depth <= aa.array_sz &&
514a9cb3
NF
428 !(lmb->flags & DRCONF_MEM_AI_INVALID) &&
429 lmb->aa_index < aa.n_arrays) {
430 index = lmb->aa_index * aa.array_sz + min_common_depth - 1;
b88fc309 431 nid = of_read_number(&aa.arrays[index], 1);
8342681d
NF
432
433 if (nid == 0xffff || nid >= MAX_NUMNODES)
434 nid = default_nid;
1d805440
ND
435
436 if (nid > 0) {
514a9cb3 437 index = lmb->aa_index * aa.array_sz;
1d805440 438 initialize_distance_lookup_table(nid,
b88fc309 439 &aa.arrays[index]);
1d805440 440 }
8342681d
NF
441 }
442
443 return nid;
444}
445
1da177e4
LT
446/*
447 * Figure out to which domain a cpu belongs and stick it there.
448 * Return the id of the domain used.
449 */
061d19f2 450static int numa_setup_cpu(unsigned long lcpu)
1da177e4 451{
98fa15f3 452 int nid = NUMA_NO_NODE;
d4edc5b6
SB
453 struct device_node *cpu;
454
455 /*
456 * If a valid cpu-to-node mapping is already available, use it
457 * directly instead of querying the firmware, since it represents
458 * the most recent mapping notified to us by the platform (eg: VPHN).
459 */
460 if ((nid = numa_cpu_lookup_table[lcpu]) >= 0) {
461 map_cpu_to_node(lcpu, nid);
462 return nid;
463 }
464
465 cpu = of_get_cpu_node(lcpu, NULL);
1da177e4
LT
466
467 if (!cpu) {
468 WARN_ON(1);
297cf502
LZ
469 if (cpu_present(lcpu))
470 goto out_present;
471 else
472 goto out;
1da177e4
LT
473 }
474
953039c8 475 nid = of_node_to_nid_single(cpu);
1da177e4 476
297cf502 477out_present:
ea05ba7c 478 if (nid < 0 || !node_possible(nid))
72c33688 479 nid = first_online_node;
1da177e4 480
297cf502 481 map_cpu_to_node(lcpu, nid);
1da177e4 482 of_node_put(cpu);
297cf502 483out:
cf950b7a 484 return nid;
1da177e4
LT
485}
486
68fb18aa
SB
487static void verify_cpu_node_mapping(int cpu, int node)
488{
489 int base, sibling, i;
490
491 /* Verify that all the threads in the core belong to the same node */
492 base = cpu_first_thread_sibling(cpu);
493
494 for (i = 0; i < threads_per_core; i++) {
495 sibling = base + i;
496
497 if (sibling == cpu || cpu_is_offline(sibling))
498 continue;
499
500 if (cpu_to_node(sibling) != node) {
501 WARN(1, "CPU thread siblings %d and %d don't belong"
502 " to the same node!\n", cpu, sibling);
503 break;
504 }
505 }
506}
507
bdab88e0
SAS
/*
 * Assign @cpu to its node, then check that its core siblings agree.
 * Must run before sched domains notifier.  Always returns 0.
 */
static int ppc_numa_cpu_prepare(unsigned int cpu)
{
	int nid = numa_setup_cpu(cpu);

	verify_cpu_node_mapping(cpu, nid);

	return 0;
}
517
/*
 * Counterpart of ppc_numa_cpu_prepare(): drop @cpu from its node's cpumask.
 * Does nothing without CONFIG_HOTPLUG_CPU.  Always returns 0.
 */
static int ppc_numa_cpu_dead(unsigned int cpu)
{
#ifdef CONFIG_HOTPLUG_CPU
	unmap_cpu_from_node(cpu);
#endif

	return 0;
}
525
526/*
527 * Check and possibly modify a memory region to enforce the memory limit.
528 *
529 * Returns the size the region should have to enforce the memory limit.
530 * This will either be the original value of size, a truncated value,
531 * or zero. If the returned value of size is 0 the region should be
25985edc 532 * discarded as it lies wholly above the memory limit.
1da177e4 533 */
45fb6cea
AB
534static unsigned long __init numa_enforce_memory_limit(unsigned long start,
535 unsigned long size)
1da177e4
LT
536{
537 /*
95f72d1e 538 * We use memblock_end_of_DRAM() in here instead of memory_limit because
1da177e4 539 * we've already adjusted it for the limit and it takes care of
fe55249d
MM
540 * having memory holes below the limit. Also, in the case of
541 * iommu_is_off, memory_limit is not set but is implicitly enforced.
1da177e4 542 */
1da177e4 543
95f72d1e 544 if (start + size <= memblock_end_of_DRAM())
1da177e4
LT
545 return size;
546
95f72d1e 547 if (start >= memblock_end_of_DRAM())
1da177e4
LT
548 return 0;
549
95f72d1e 550 return memblock_end_of_DRAM() - start;
1da177e4
LT
551}
552
cf00085d
C
553/*
554 * Reads the counter for a given entry in
555 * linux,drconf-usable-memory property
556 */
b08a2a12 557static inline int __init read_usm_ranges(const __be32 **usm)
cf00085d
C
558{
559 /*
3fdfd990 560 * For each lmb in ibm,dynamic-memory a corresponding
cf00085d
C
561 * entry in linux,drconf-usable-memory property contains
562 * a counter followed by that many (base, size) duple.
563 * read the counter from linux,drconf-usable-memory
564 */
565 return read_n_cells(n_mem_size_cells, usm);
566}
567
0204568a
PM
568/*
569 * Extract NUMA information from the ibm,dynamic-reconfiguration-memory
570 * node. This assumes n_mem_{addr,size}_cells have been set.
571 */
514a9cb3
NF
572static void __init numa_setup_drmem_lmb(struct drmem_lmb *lmb,
573 const __be32 **usm)
0204568a 574{
514a9cb3
NF
575 unsigned int ranges, is_kexec_kdump = 0;
576 unsigned long base, size, sz;
8342681d 577 int nid;
8342681d 578
514a9cb3
NF
579 /*
580 * Skip this block if the reserved bit is set in flags (0x80)
581 * or if the block is not assigned to this partition (0x8)
582 */
583 if ((lmb->flags & DRCONF_MEM_RESERVED)
584 || !(lmb->flags & DRCONF_MEM_ASSIGNED))
8342681d
NF
585 return;
586
514a9cb3 587 if (*usm)
cf00085d
C
588 is_kexec_kdump = 1;
589
514a9cb3
NF
590 base = lmb->base_addr;
591 size = drmem_lmb_size();
592 ranges = 1;
8342681d 593
514a9cb3
NF
594 if (is_kexec_kdump) {
595 ranges = read_usm_ranges(usm);
596 if (!ranges) /* there are no (base, size) duple */
597 return;
598 }
8342681d 599
514a9cb3 600 do {
cf00085d 601 if (is_kexec_kdump) {
514a9cb3
NF
602 base = read_n_cells(n_mem_addr_cells, usm);
603 size = read_n_cells(n_mem_size_cells, usm);
cf00085d 604 }
514a9cb3
NF
605
606 nid = of_drconf_to_nid_single(lmb);
607 fake_numa_create_new_node(((base + size) >> PAGE_SHIFT),
608 &nid);
609 node_set_online(nid);
610 sz = numa_enforce_memory_limit(base, size);
611 if (sz)
612 memblock_set_node(base, sz, &memblock.memory, nid);
613 } while (--ranges);
0204568a
PM
614}
615
1da177e4
LT
616static int __init parse_numa_properties(void)
617{
94db7c5e 618 struct device_node *memory;
482ec7c4 619 int default_nid = 0;
1da177e4
LT
620 unsigned long i;
621
622 if (numa_enabled == 0) {
623 printk(KERN_WARNING "NUMA disabled by user\n");
624 return -1;
625 }
626
1da177e4
LT
627 min_common_depth = find_min_common_depth();
628
1da177e4
LT
629 if (min_common_depth < 0)
630 return min_common_depth;
631
bf4b85b0
NL
632 dbg("NUMA associativity depth for CPU/Memory: %d\n", min_common_depth);
633
1da177e4 634 /*
482ec7c4
NL
635 * Even though we connect cpus to numa domains later in SMP
636 * init, we need to know the node ids now. This is because
637 * each node to be onlined must have NODE_DATA etc backing it.
1da177e4 638 */
482ec7c4 639 for_each_present_cpu(i) {
dfbe93a2 640 struct device_node *cpu;
cf950b7a 641 int nid;
1da177e4 642
8b16cd23 643 cpu = of_get_cpu_node(i, NULL);
482ec7c4 644 BUG_ON(!cpu);
953039c8 645 nid = of_node_to_nid_single(cpu);
482ec7c4 646 of_node_put(cpu);
1da177e4 647
482ec7c4
NL
648 /*
649 * Don't fall back to default_nid yet -- we will plug
650 * cpus into nodes once the memory scan has discovered
651 * the topology.
652 */
653 if (nid < 0)
654 continue;
655 node_set_online(nid);
1da177e4
LT
656 }
657
237a0989 658 get_n_mem_cells(&n_mem_addr_cells, &n_mem_size_cells);
94db7c5e
AB
659
660 for_each_node_by_type(memory, "memory") {
1da177e4
LT
661 unsigned long start;
662 unsigned long size;
cf950b7a 663 int nid;
1da177e4 664 int ranges;
b08a2a12 665 const __be32 *memcell_buf;
1da177e4
LT
666 unsigned int len;
667
e2eb6392 668 memcell_buf = of_get_property(memory,
ba759485
ME
669 "linux,usable-memory", &len);
670 if (!memcell_buf || len <= 0)
e2eb6392 671 memcell_buf = of_get_property(memory, "reg", &len);
1da177e4
LT
672 if (!memcell_buf || len <= 0)
673 continue;
674
cc5d0189
BH
675 /* ranges in cell */
676 ranges = (len >> 2) / (n_mem_addr_cells + n_mem_size_cells);
1da177e4
LT
677new_range:
678 /* these are order-sensitive, and modify the buffer pointer */
237a0989
MK
679 start = read_n_cells(n_mem_addr_cells, &memcell_buf);
680 size = read_n_cells(n_mem_size_cells, &memcell_buf);
1da177e4 681
482ec7c4
NL
682 /*
683 * Assumption: either all memory nodes or none will
684 * have associativity properties. If none, then
685 * everything goes to default_nid.
686 */
953039c8 687 nid = of_node_to_nid_single(memory);
482ec7c4
NL
688 if (nid < 0)
689 nid = default_nid;
1daa6d08
BS
690
691 fake_numa_create_new_node(((start + size) >> PAGE_SHIFT), &nid);
482ec7c4 692 node_set_online(nid);
1da177e4 693
7656cd8e
RA
694 size = numa_enforce_memory_limit(start, size);
695 if (size)
696 memblock_set_node(start, size, &memblock.memory, nid);
1da177e4
LT
697
698 if (--ranges)
699 goto new_range;
700 }
701
0204568a 702 /*
dfbe93a2
AB
703 * Now do the same thing for each MEMBLOCK listed in the
704 * ibm,dynamic-memory property in the
705 * ibm,dynamic-reconfiguration-memory node.
0204568a
PM
706 */
707 memory = of_find_node_by_path("/ibm,dynamic-reconfiguration-memory");
514a9cb3
NF
708 if (memory) {
709 walk_drmem_lmbs(memory, numa_setup_drmem_lmb);
710 of_node_put(memory);
711 }
0204568a 712
1da177e4
LT
713 return 0;
714}
715
716static void __init setup_nonnuma(void)
717{
95f72d1e
YL
718 unsigned long top_of_ram = memblock_end_of_DRAM();
719 unsigned long total_ram = memblock_phys_mem_size();
c67c3cb4 720 unsigned long start_pfn, end_pfn;
28be7072
BH
721 unsigned int nid = 0;
722 struct memblock_region *reg;
1da177e4 723
e110b281 724 printk(KERN_DEBUG "Top of RAM: 0x%lx, Total RAM: 0x%lx\n",
1da177e4 725 top_of_ram, total_ram);
e110b281 726 printk(KERN_DEBUG "Memory hole size: %ldMB\n",
1da177e4
LT
727 (top_of_ram - total_ram) >> 20);
728
28be7072 729 for_each_memblock(memory, reg) {
c7fc2de0
YL
730 start_pfn = memblock_region_memory_base_pfn(reg);
731 end_pfn = memblock_region_memory_end_pfn(reg);
1daa6d08
BS
732
733 fake_numa_create_new_node(end_pfn, &nid);
1d7cfe18 734 memblock_set_node(PFN_PHYS(start_pfn),
e7e8de59
TC
735 PFN_PHYS(end_pfn - start_pfn),
736 &memblock.memory, nid);
1daa6d08 737 node_set_online(nid);
c67c3cb4 738 }
1da177e4
LT
739}
740
4b703a23
AB
741void __init dump_numa_cpu_topology(void)
742{
743 unsigned int node;
744 unsigned int cpu, count;
745
746 if (min_common_depth == -1 || !numa_enabled)
747 return;
748
749 for_each_online_node(node) {
8467801c 750 pr_info("Node %d CPUs:", node);
4b703a23
AB
751
752 count = 0;
753 /*
754 * If we used a CPU iterator here we would miss printing
755 * the holes in the cpumap.
756 */
25863de0
AB
757 for (cpu = 0; cpu < nr_cpu_ids; cpu++) {
758 if (cpumask_test_cpu(cpu,
759 node_to_cpumask_map[node])) {
4b703a23 760 if (count == 0)
8467801c 761 pr_cont(" %u", cpu);
4b703a23
AB
762 ++count;
763 } else {
764 if (count > 1)
8467801c 765 pr_cont("-%u", cpu - 1);
4b703a23
AB
766 count = 0;
767 }
768 }
769
770 if (count > 1)
8467801c
AK
771 pr_cont("-%u", nr_cpu_ids - 1);
772 pr_cont("\n");
4b703a23
AB
773 }
774}
775
10239733
AB
776/* Initialize NODE_DATA for a node on the local memory */
777static void __init setup_node_data(int nid, u64 start_pfn, u64 end_pfn)
4a618669 778{
10239733
AB
779 u64 spanned_pages = end_pfn - start_pfn;
780 const size_t nd_size = roundup(sizeof(pg_data_t), SMP_CACHE_BYTES);
781 u64 nd_pa;
782 void *nd;
783 int tnid;
4a618669 784
9a8dd708 785 nd_pa = memblock_phys_alloc_try_nid(nd_size, SMP_CACHE_BYTES, nid);
33755574
MR
786 if (!nd_pa)
787 panic("Cannot allocate %zu bytes for node %d data\n",
788 nd_size, nid);
789
10239733 790 nd = __va(nd_pa);
4a618669 791
10239733
AB
792 /* report and initialize */
793 pr_info(" NODE_DATA [mem %#010Lx-%#010Lx]\n",
794 nd_pa, nd_pa + nd_size - 1);
795 tnid = early_pfn_to_nid(nd_pa >> PAGE_SHIFT);
796 if (tnid != nid)
797 pr_info(" NODE_DATA(%d) on node %d\n", nid, tnid);
4a618669 798
10239733
AB
799 node_data[nid] = nd;
800 memset(NODE_DATA(nid), 0, sizeof(pg_data_t));
801 NODE_DATA(nid)->node_id = nid;
802 NODE_DATA(nid)->node_start_pfn = start_pfn;
803 NODE_DATA(nid)->node_spanned_pages = spanned_pages;
804}
4a618669 805
a346137e
MB
806static void __init find_possible_nodes(void)
807{
808 struct device_node *rtas;
809 u32 numnodes, i;
810
811 if (min_common_depth <= 0)
812 return;
813
814 rtas = of_find_node_by_path("/rtas");
815 if (!rtas)
816 return;
817
818 if (of_property_read_u32_index(rtas,
819 "ibm,max-associativity-domains",
820 min_common_depth, &numnodes))
821 goto out;
822
823 for (i = 0; i < numnodes; i++) {
ea05ba7c 824 if (!node_possible(i))
a346137e 825 node_set(i, node_possible_map);
a346137e
MB
826 }
827
828out:
829 of_node_put(rtas);
830}
831
9bd9be00 832void __init mem_topology_setup(void)
1da177e4 833{
9bd9be00 834 int cpu;
1da177e4
LT
835
836 if (parse_numa_properties())
837 setup_nonnuma();
1da177e4 838
3af229f2 839 /*
a346137e
MB
840 * Modify the set of possible NUMA nodes to reflect information
841 * available about the set of online nodes, and the set of nodes
842 * that we expect to make use of for this platform's affinity
843 * calculations.
3af229f2
NA
844 */
845 nodes_and(node_possible_map, node_possible_map, node_online_map);
846
a346137e
MB
847 find_possible_nodes();
848
9bd9be00
NP
849 setup_node_to_cpumask_map();
850
851 reset_numa_cpu_lookup_table();
852
853 for_each_present_cpu(cpu)
854 numa_setup_cpu(cpu);
855}
856
857void __init initmem_init(void)
858{
859 int nid;
860
861 max_low_pfn = memblock_end_of_DRAM() >> PAGE_SHIFT;
862 max_pfn = max_low_pfn;
863
864 memblock_dump_all();
865
1da177e4 866 for_each_online_node(nid) {
c67c3cb4 867 unsigned long start_pfn, end_pfn;
1da177e4 868
c67c3cb4 869 get_pfn_range_for_nid(nid, &start_pfn, &end_pfn);
10239733 870 setup_node_data(nid, start_pfn, end_pfn);
8f64e1f2 871 sparse_memory_present_with_active_regions(nid);
4a618669 872 }
d3f6204a 873
21098b9e 874 sparse_init();
25863de0 875
2fabf084
NA
876 /*
877 * We need the numa_cpu_lookup_table to be accurate for all CPUs,
878 * even before we online them, so that we can use cpu_to_{node,mem}
879 * early in boot, cf. smp_prepare_cpus().
bdab88e0
SAS
880 * _nocalls() + manual invocation is used because cpuhp is not yet
881 * initialized for the boot CPU.
2fabf084 882 */
73c1b41e 883 cpuhp_setup_state_nocalls(CPUHP_POWER_NUMA_PREPARE, "powerpc/numa:prepare",
bdab88e0 884 ppc_numa_cpu_prepare, ppc_numa_cpu_dead);
1da177e4
LT
885}
886
1da177e4
LT
887static int __init early_numa(char *p)
888{
889 if (!p)
890 return 0;
891
892 if (strstr(p, "off"))
893 numa_enabled = 0;
894
895 if (strstr(p, "debug"))
896 numa_debug = 1;
897
1daa6d08
BS
898 p = strstr(p, "fake=");
899 if (p)
900 cmdline = p + strlen("fake=");
901
1da177e4
LT
902 return 0;
903}
904early_param("numa", early_numa);
237a0989 905
558f8649
NL
906/*
907 * The platform can inform us through one of several mechanisms
908 * (post-migration device tree updates, PRRN or VPHN) that the NUMA
909 * assignment of a resource has changed. This controls whether we act
910 * on that. Disabled by default.
911 */
912static bool topology_updates_enabled;
2d73bae1
NA
913
914static int __init early_topology_updates(char *p)
915{
916 if (!p)
917 return 0;
918
558f8649
NL
919 if (!strcmp(p, "on")) {
920 pr_warn("Caution: enabling topology updates\n");
921 topology_updates_enabled = true;
2d73bae1
NA
922 }
923
924 return 0;
925}
926early_param("topology_updates", early_topology_updates);
927
237a0989 928#ifdef CONFIG_MEMORY_HOTPLUG
0db9360a 929/*
0f16ef7f
NF
930 * Find the node associated with a hot added memory section for
931 * memory represented in the device tree by the property
932 * ibm,dynamic-reconfiguration-memory/ibm,dynamic-memory.
0db9360a 933 */
514a9cb3 934static int hot_add_drconf_scn_to_nid(unsigned long scn_addr)
0db9360a 935{
514a9cb3 936 struct drmem_lmb *lmb;
3fdfd990 937 unsigned long lmb_size;
98fa15f3 938 int nid = NUMA_NO_NODE;
0db9360a 939
514a9cb3 940 lmb_size = drmem_lmb_size();
0db9360a 941
514a9cb3 942 for_each_drmem_lmb(lmb) {
0db9360a
NF
943 /* skip this block if it is reserved or not assigned to
944 * this partition */
514a9cb3
NF
945 if ((lmb->flags & DRCONF_MEM_RESERVED)
946 || !(lmb->flags & DRCONF_MEM_ASSIGNED))
0db9360a
NF
947 continue;
948
514a9cb3
NF
949 if ((scn_addr < lmb->base_addr)
950 || (scn_addr >= (lmb->base_addr + lmb_size)))
0f16ef7f
NF
951 continue;
952
514a9cb3 953 nid = of_drconf_to_nid_single(lmb);
0f16ef7f
NF
954 break;
955 }
956
957 return nid;
958}
959
960/*
961 * Find the node associated with a hot added memory section for memory
962 * represented in the device tree as a node (i.e. memory@XXXX) for
95f72d1e 963 * each memblock.
0f16ef7f 964 */
ec32dd66 965static int hot_add_node_scn_to_nid(unsigned long scn_addr)
0f16ef7f 966{
94db7c5e 967 struct device_node *memory;
98fa15f3 968 int nid = NUMA_NO_NODE;
0f16ef7f 969
94db7c5e 970 for_each_node_by_type(memory, "memory") {
0f16ef7f
NF
971 unsigned long start, size;
972 int ranges;
b08a2a12 973 const __be32 *memcell_buf;
0f16ef7f
NF
974 unsigned int len;
975
976 memcell_buf = of_get_property(memory, "reg", &len);
977 if (!memcell_buf || len <= 0)
978 continue;
979
980 /* ranges in cell */
981 ranges = (len >> 2) / (n_mem_addr_cells + n_mem_size_cells);
982
983 while (ranges--) {
984 start = read_n_cells(n_mem_addr_cells, &memcell_buf);
985 size = read_n_cells(n_mem_size_cells, &memcell_buf);
986
987 if ((scn_addr < start) || (scn_addr >= (start + size)))
988 continue;
989
990 nid = of_node_to_nid_single(memory);
991 break;
992 }
0db9360a 993
0f16ef7f
NF
994 if (nid >= 0)
995 break;
0db9360a
NF
996 }
997
60831842
AB
998 of_node_put(memory);
999
0f16ef7f 1000 return nid;
0db9360a
NF
1001}
1002
237a0989
MK
1003/*
1004 * Find the node associated with a hot added memory section. Section
95f72d1e
YL
1005 * corresponds to a SPARSEMEM section, not an MEMBLOCK. It is assumed that
1006 * sections are fully contained within a single MEMBLOCK.
237a0989
MK
1007 */
1008int hot_add_scn_to_nid(unsigned long scn_addr)
1009{
1010 struct device_node *memory = NULL;
4a3bac4e 1011 int nid;
237a0989
MK
1012
1013 if (!numa_enabled || (min_common_depth < 0))
72c33688 1014 return first_online_node;
0db9360a
NF
1015
1016 memory = of_find_node_by_path("/ibm,dynamic-reconfiguration-memory");
1017 if (memory) {
514a9cb3 1018 nid = hot_add_drconf_scn_to_nid(scn_addr);
0db9360a 1019 of_node_put(memory);
0f16ef7f
NF
1020 } else {
1021 nid = hot_add_node_scn_to_nid(scn_addr);
0db9360a 1022 }
237a0989 1023
2a8628d4 1024 if (nid < 0 || !node_possible(nid))
72c33688 1025 nid = first_online_node;
237a0989 1026
0f16ef7f 1027 return nid;
237a0989 1028}
0f16ef7f 1029
cd34206e
NA
1030static u64 hot_add_drconf_memory_max(void)
1031{
e70bd3ae 1032 struct device_node *memory = NULL;
45b64ee6 1033 struct device_node *dn = NULL;
45b64ee6 1034 const __be64 *lrdr = NULL;
45b64ee6
BR
1035
1036 dn = of_find_node_by_path("/rtas");
1037 if (dn) {
1038 lrdr = of_get_property(dn, "ibm,lrdr-capacity", NULL);
1039 of_node_put(dn);
1040 if (lrdr)
1041 return be64_to_cpup(lrdr);
1042 }
cd34206e 1043
e70bd3ae
BR
1044 memory = of_find_node_by_path("/ibm,dynamic-reconfiguration-memory");
1045 if (memory) {
e70bd3ae 1046 of_node_put(memory);
514a9cb3 1047 return drmem_lmb_memory_max();
e70bd3ae 1048 }
45b64ee6 1049 return 0;
cd34206e
NA
1050}
1051
1052/*
1053 * memory_hotplug_max - return max address of memory that may be added
1054 *
1055 * This is currently only used on systems that support drconfig memory
1056 * hotplug.
1057 */
1058u64 memory_hotplug_max(void)
1059{
1060 return max(hot_add_drconf_memory_max(), memblock_end_of_DRAM());
1061}
237a0989 1062#endif /* CONFIG_MEMORY_HOTPLUG */
9eff1a38 1063
bd03403a 1064/* Virtual Processor Home Node (VPHN) support */
39bf990e 1065#ifdef CONFIG_PPC_SPLPAR
4b6cfb2a 1066
47d99948 1067#include "book3s64/vphn.h"
4b6cfb2a 1068
30c05350
NF
/*
 * One pending cpu -> node reassignment.  Entries are chained through
 * 'next'; the last entry of a chain has next == NULL.
 */
struct topology_update_data {
	struct topology_update_data *next;	/* following entry, or NULL */
	unsigned int cpu;			/* cpu being moved */
	int old_nid;				/* node the cpu is leaving */
	int new_nid;				/* node the cpu is joining */
};
1075
cee5405d
MB
1076#define TOPOLOGY_DEF_TIMER_SECS 60
1077
5de16699 1078static u8 vphn_cpu_change_counts[NR_CPUS][MAX_DISTANCE_REF_POINTS];
9eff1a38
JL
1079static cpumask_t cpu_associativity_changes_mask;
1080static int vphn_enabled;
5d88aa85
JL
1081static int prrn_enabled;
1082static void reset_topology_timer(void);
cee5405d 1083static int topology_timer_secs = 1;
17f444c0 1084static int topology_inited;
9eff1a38 1085
cee5405d
MB
1086/*
1087 * Change polling interval for associativity changes.
1088 */
1089int timed_topology_update(int nsecs)
1090{
1091 if (vphn_enabled) {
1092 if (nsecs > 0)
1093 topology_timer_secs = nsecs;
1094 else
1095 topology_timer_secs = TOPOLOGY_DEF_TIMER_SECS;
1096
1097 reset_topology_timer();
1098 }
1099
1100 return 0;
1101}
9eff1a38
JL
1102
1103/*
1104 * Store the current values of the associativity change counters in the
1105 * hypervisor.
1106 */
1107static void setup_cpu_associativity_change_counters(void)
1108{
cd9d6cc7 1109 int cpu;
9eff1a38 1110
5de16699
AB
1111 /* The VPHN feature supports a maximum of 8 reference points */
1112 BUILD_BUG_ON(MAX_DISTANCE_REF_POINTS > 8);
1113
9eff1a38 1114 for_each_possible_cpu(cpu) {
cd9d6cc7 1115 int i;
9eff1a38 1116 u8 *counts = vphn_cpu_change_counts[cpu];
499dcd41 1117 volatile u8 *hypervisor_counts = lppaca_of(cpu).vphn_assoc_counts;
9eff1a38 1118
5de16699 1119 for (i = 0; i < distance_ref_points_depth; i++)
9eff1a38 1120 counts[i] = hypervisor_counts[i];
9eff1a38
JL
1121 }
1122}
1123
1124/*
1125 * The hypervisor maintains a set of 8 associativity change counters in
1126 * the VPA of each cpu that correspond to the associativity levels in the
1127 * ibm,associativity-reference-points property. When an associativity
1128 * level changes, the corresponding counter is incremented.
1129 *
1130 * Set a bit in cpu_associativity_changes_mask for each cpu whose home
1131 * node associativity levels have changed.
1132 *
1133 * Returns the number of cpus with unhandled associativity changes.
1134 */
1135static int update_cpu_associativity_changes_mask(void)
1136{
5d88aa85 1137 int cpu;
9eff1a38
JL
1138 cpumask_t *changes = &cpu_associativity_changes_mask;
1139
9eff1a38
JL
1140 for_each_possible_cpu(cpu) {
1141 int i, changed = 0;
1142 u8 *counts = vphn_cpu_change_counts[cpu];
499dcd41 1143 volatile u8 *hypervisor_counts = lppaca_of(cpu).vphn_assoc_counts;
9eff1a38 1144
5de16699 1145 for (i = 0; i < distance_ref_points_depth; i++) {
d69043e8 1146 if (hypervisor_counts[i] != counts[i]) {
9eff1a38
JL
1147 counts[i] = hypervisor_counts[i];
1148 changed = 1;
1149 }
1150 }
1151 if (changed) {
3be7db6a
RJ
1152 cpumask_or(changes, changes, cpu_sibling_mask(cpu));
1153 cpu = cpu_last_thread_sibling(cpu);
9eff1a38
JL
1154 }
1155 }
1156
5d88aa85 1157 return cpumask_weight(changes);
9eff1a38
JL
1158}
1159
9eff1a38
JL
1160/*
1161 * Retrieve the new associativity information for a virtual processor's
1162 * home node.
1163 */
b08a2a12 1164static long hcall_vphn(unsigned long cpu, __be32 *associativity)
9eff1a38 1165{
cd9d6cc7 1166 long rc;
9eff1a38
JL
1167 long retbuf[PLPAR_HCALL9_BUFSIZE] = {0};
1168 u64 flags = 1;
1169 int hwcpu = get_hard_smp_processor_id(cpu);
1170
1171 rc = plpar_hcall9(H_HOME_NODE_ASSOCIATIVITY, retbuf, flags, hwcpu);
1172 vphn_unpack_associativity(retbuf, associativity);
1173
1174 return rc;
1175}
1176
1177static long vphn_get_associativity(unsigned long cpu,
b08a2a12 1178 __be32 *associativity)
9eff1a38 1179{
cd9d6cc7 1180 long rc;
9eff1a38
JL
1181
1182 rc = hcall_vphn(cpu, associativity);
1183
1184 switch (rc) {
1185 case H_FUNCTION:
437ccdc8 1186 printk_once(KERN_INFO
9eff1a38
JL
1187 "VPHN is not supported. Disabling polling...\n");
1188 stop_topology_update();
1189 break;
1190 case H_HARDWARE:
1191 printk(KERN_ERR
1192 "hcall_vphn() experienced a hardware fault "
1193 "preventing VPHN. Disabling polling...\n");
1194 stop_topology_update();
17f444c0
MB
1195 break;
1196 case H_SUCCESS:
1197 dbg("VPHN hcall succeeded. Reset polling...\n");
cee5405d 1198 timed_topology_update(0);
17f444c0 1199 break;
9eff1a38
JL
1200 }
1201
1202 return rc;
1203}
1204
e67e02a5 1205int find_and_online_cpu_nid(int cpu)
ea05ba7c
MB
1206{
1207 __be32 associativity[VPHN_ASSOC_BUFSIZE] = {0};
1208 int new_nid;
1209
1210 /* Use associativity from first thread for all siblings */
2483ef05
SD
1211 if (vphn_get_associativity(cpu, associativity))
1212 return cpu_to_node(cpu);
1213
ea05ba7c
MB
1214 new_nid = associativity_to_nid(associativity);
1215 if (new_nid < 0 || !node_possible(new_nid))
1216 new_nid = first_online_node;
1217
1218 if (NODE_DATA(new_nid) == NULL) {
1219#ifdef CONFIG_MEMORY_HOTPLUG
1220 /*
1221 * Need to ensure that NODE_DATA is initialized for a node from
1222 * available memory (see memblock_alloc_try_nid). If unable to
1223 * init the node, then default to nearest node that has memory
ac1788cc
SD
1224 * installed. Skip onlining a node if the subsystems are not
1225 * yet initialized.
ea05ba7c 1226 */
ac1788cc 1227 if (!topology_inited || try_online_node(new_nid))
ea05ba7c
MB
1228 new_nid = first_online_node;
1229#else
1230 /*
1231 * Default to using the nearest node that has memory installed.
1232 * Otherwise, it would be necessary to patch the kernel MM code
1233 * to deal with more memoryless-node error conditions.
1234 */
1235 new_nid = first_online_node;
1236#endif
1237 }
1238
e67e02a5
MB
1239 pr_debug("%s:%d cpu %d nid %d\n", __FUNCTION__, __LINE__,
1240 cpu, new_nid);
ea05ba7c
MB
1241 return new_nid;
1242}
1243
30c05350
NF
1244/*
1245 * Update the CPU maps and sysfs entries for a single CPU when its NUMA
1246 * characteristics change. This function doesn't perform any locking and is
1247 * only safe to call from stop_machine().
1248 */
1249static int update_cpu_topology(void *data)
1250{
1251 struct topology_update_data *update;
1252 unsigned long cpu;
1253
1254 if (!data)
1255 return -EINVAL;
1256
3be7db6a 1257 cpu = smp_processor_id();
30c05350
NF
1258
1259 for (update = data; update; update = update->next) {
2c0a33f9 1260 int new_nid = update->new_nid;
30c05350
NF
1261 if (cpu != update->cpu)
1262 continue;
1263
49f8d8c0 1264 unmap_cpu_from_node(cpu);
2c0a33f9
NA
1265 map_cpu_to_node(cpu, new_nid);
1266 set_cpu_numa_node(cpu, new_nid);
1267 set_cpu_numa_mem(cpu, local_memory_node(new_nid));
176bbf14 1268 vdso_getcpu_init();
30c05350
NF
1269 }
1270
1271 return 0;
1272}
1273
d4edc5b6
SB
1274static int update_lookup_table(void *data)
1275{
1276 struct topology_update_data *update;
1277
1278 if (!data)
1279 return -EINVAL;
1280
1281 /*
1282 * Upon topology update, the numa-cpu lookup table needs to be updated
1283 * for all threads in the core, including offline CPUs, to ensure that
1284 * future hotplug operations respect the cpu-to-node associativity
1285 * properly.
1286 */
1287 for (update = data; update; update = update->next) {
1288 int nid, base, j;
1289
1290 nid = update->new_nid;
1291 base = cpu_first_thread_sibling(update->cpu);
1292
1293 for (j = 0; j < threads_per_core; j++) {
1294 update_numa_cpu_lookup_table(base + j, nid);
1295 }
1296 }
1297
1298 return 0;
1299}
1300
9eff1a38
JL
1301/*
1302 * Update the node maps and sysfs entries for each cpu whose home node
79c5fceb 1303 * has changed. Returns 1 when the topology has changed, and 0 otherwise.
3e401f7a
TJB
1304 *
1305 * cpus_locked says whether we already hold cpu_hotplug_lock.
9eff1a38 1306 */
3e401f7a 1307int numa_update_cpu_topology(bool cpus_locked)
9eff1a38 1308{
3be7db6a 1309 unsigned int cpu, sibling, changed = 0;
30c05350 1310 struct topology_update_data *updates, *ud;
176bbf14 1311 cpumask_t updated_cpus;
8a25a2fd 1312 struct device *dev;
3be7db6a 1313 int weight, new_nid, i = 0;
9eff1a38 1314
2ea62630 1315 if (!prrn_enabled && !vphn_enabled && topology_inited)
2d73bae1
NA
1316 return 0;
1317
30c05350
NF
1318 weight = cpumask_weight(&cpu_associativity_changes_mask);
1319 if (!weight)
1320 return 0;
1321
6396bb22 1322 updates = kcalloc(weight, sizeof(*updates), GFP_KERNEL);
30c05350
NF
1323 if (!updates)
1324 return 0;
9eff1a38 1325
176bbf14
JL
1326 cpumask_clear(&updated_cpus);
1327
5d88aa85 1328 for_each_cpu(cpu, &cpu_associativity_changes_mask) {
3be7db6a
RJ
1329 /*
1330 * If siblings aren't flagged for changes, updates list
1331 * will be too short. Skip on this update and set for next
1332 * update.
1333 */
1334 if (!cpumask_subset(cpu_sibling_mask(cpu),
1335 &cpu_associativity_changes_mask)) {
1336 pr_info("Sibling bits not set for associativity "
1337 "change, cpu%d\n", cpu);
1338 cpumask_or(&cpu_associativity_changes_mask,
1339 &cpu_associativity_changes_mask,
1340 cpu_sibling_mask(cpu));
1341 cpu = cpu_last_thread_sibling(cpu);
1342 continue;
1343 }
9eff1a38 1344
ea05ba7c 1345 new_nid = find_and_online_cpu_nid(cpu);
3be7db6a
RJ
1346
1347 if (new_nid == numa_cpu_lookup_table[cpu]) {
1348 cpumask_andnot(&cpu_associativity_changes_mask,
1349 &cpu_associativity_changes_mask,
1350 cpu_sibling_mask(cpu));
17f444c0
MB
1351 dbg("Assoc chg gives same node %d for cpu%d\n",
1352 new_nid, cpu);
3be7db6a
RJ
1353 cpu = cpu_last_thread_sibling(cpu);
1354 continue;
1355 }
9eff1a38 1356
3be7db6a
RJ
1357 for_each_cpu(sibling, cpu_sibling_mask(cpu)) {
1358 ud = &updates[i++];
8bc93149 1359 ud->next = &updates[i];
3be7db6a
RJ
1360 ud->cpu = sibling;
1361 ud->new_nid = new_nid;
1362 ud->old_nid = numa_cpu_lookup_table[sibling];
1363 cpumask_set_cpu(sibling, &updated_cpus);
3be7db6a
RJ
1364 }
1365 cpu = cpu_last_thread_sibling(cpu);
30c05350
NF
1366 }
1367
8bc93149
MB
1368 /*
1369 * Prevent processing of 'updates' from overflowing array
1370 * where last entry filled in a 'next' pointer.
1371 */
1372 if (i)
1373 updates[i-1].next = NULL;
1374
2d73bae1
NA
1375 pr_debug("Topology update for the following CPUs:\n");
1376 if (cpumask_weight(&updated_cpus)) {
1377 for (ud = &updates[0]; ud; ud = ud->next) {
1378 pr_debug("cpu %d moving from node %d "
1379 "to %d\n", ud->cpu,
1380 ud->old_nid, ud->new_nid);
1381 }
1382 }
1383
9a013361
MW
1384 /*
1385 * In cases where we have nothing to update (because the updates list
1386 * is too short or because the new topology is same as the old one),
1387 * skip invoking update_cpu_topology() via stop-machine(). This is
1388 * necessary (and not just a fast-path optimization) since stop-machine
1389 * can end up electing a random CPU to run update_cpu_topology(), and
1390 * thus trick us into setting up incorrect cpu-node mappings (since
1391 * 'updates' is kzalloc()'ed).
1392 *
1393 * And for the similar reason, we will skip all the following updating.
1394 */
1395 if (!cpumask_weight(&updated_cpus))
1396 goto out;
1397
3e401f7a
TJB
1398 if (cpus_locked)
1399 stop_machine_cpuslocked(update_cpu_topology, &updates[0],
1400 &updated_cpus);
1401 else
1402 stop_machine(update_cpu_topology, &updates[0], &updated_cpus);
30c05350 1403
d4edc5b6
SB
1404 /*
1405 * Update the numa-cpu lookup table with the new mappings, even for
1406 * offline CPUs. It is best to perform this update from the stop-
1407 * machine context.
1408 */
3e401f7a
TJB
1409 if (cpus_locked)
1410 stop_machine_cpuslocked(update_lookup_table, &updates[0],
d4edc5b6 1411 cpumask_of(raw_smp_processor_id()));
3e401f7a
TJB
1412 else
1413 stop_machine(update_lookup_table, &updates[0],
1414 cpumask_of(raw_smp_processor_id()));
d4edc5b6 1415
30c05350 1416 for (ud = &updates[0]; ud; ud = ud->next) {
dd023217
NF
1417 unregister_cpu_under_node(ud->cpu, ud->old_nid);
1418 register_cpu_under_node(ud->cpu, ud->new_nid);
1419
30c05350 1420 dev = get_cpu_device(ud->cpu);
8a25a2fd
KS
1421 if (dev)
1422 kobject_uevent(&dev->kobj, KOBJ_CHANGE);
30c05350 1423 cpumask_clear_cpu(ud->cpu, &cpu_associativity_changes_mask);
79c5fceb 1424 changed = 1;
9eff1a38
JL
1425 }
1426
9a013361 1427out:
30c05350 1428 kfree(updates);
79c5fceb 1429 return changed;
9eff1a38
JL
1430}
1431
3e401f7a
TJB
1432int arch_update_cpu_topology(void)
1433{
3e401f7a
TJB
1434 return numa_update_cpu_topology(true);
1435}
1436
9eff1a38
JL
/* Body of topology_work: calls rebuild_sched_domains(). */
static void topology_work_fn(struct work_struct *work)
{
	rebuild_sched_domains();
}
static DECLARE_WORK(topology_work, topology_work_fn);
1442
ec32dd66 1443static void topology_schedule_update(void)
9eff1a38
JL
1444{
1445 schedule_work(&topology_work);
1446}
1447
df7e828c 1448static void topology_timer_fn(struct timer_list *unused)
9eff1a38 1449{
5d88aa85 1450 if (prrn_enabled && cpumask_weight(&cpu_associativity_changes_mask))
9eff1a38 1451 topology_schedule_update();
5d88aa85
JL
1452 else if (vphn_enabled) {
1453 if (update_cpu_associativity_changes_mask() > 0)
1454 topology_schedule_update();
1455 reset_topology_timer();
1456 }
9eff1a38 1457}
df7e828c 1458static struct timer_list topology_timer;
9eff1a38 1459
5d88aa85 1460static void reset_topology_timer(void)
9eff1a38 1461{
8604895a
MB
1462 if (vphn_enabled)
1463 mod_timer(&topology_timer, jiffies + topology_timer_secs * HZ);
9eff1a38
JL
1464}
1465
601abdc3
NF
1466#ifdef CONFIG_SMP
1467
5d88aa85
JL
1468static int dt_update_callback(struct notifier_block *nb,
1469 unsigned long action, void *data)
1470{
f5242e5a 1471 struct of_reconfig_data *update = data;
5d88aa85
JL
1472 int rc = NOTIFY_DONE;
1473
1474 switch (action) {
5d88aa85 1475 case OF_RECONFIG_UPDATE_PROPERTY:
e5480bdc 1476 if (of_node_is_type(update->dn, "cpu") &&
30c05350 1477 !of_prop_cmp(update->prop->name, "ibm,associativity")) {
5d88aa85
JL
1478 u32 core_id;
1479 of_property_read_u32(update->dn, "reg", &core_id);
81b61324 1480 rc = dlpar_cpu_readd(core_id);
5d88aa85
JL
1481 rc = NOTIFY_OK;
1482 }
1483 break;
1484 }
1485
1486 return rc;
9eff1a38
JL
1487}
1488
5d88aa85
JL
1489static struct notifier_block dt_update_nb = {
1490 .notifier_call = dt_update_callback,
1491};
1492
601abdc3
NF
1493#endif
1494
9eff1a38 1495/*
5d88aa85 1496 * Start polling for associativity changes.
9eff1a38
JL
1497 */
1498int start_topology_update(void)
1499{
1500 int rc = 0;
1501
2d4d9b30
NL
1502 if (!topology_updates_enabled)
1503 return 0;
1504
5d88aa85
JL
1505 if (firmware_has_feature(FW_FEATURE_PRRN)) {
1506 if (!prrn_enabled) {
1507 prrn_enabled = 1;
601abdc3 1508#ifdef CONFIG_SMP
5d88aa85 1509 rc = of_reconfig_notifier_register(&dt_update_nb);
601abdc3 1510#endif
5d88aa85 1511 }
a3496e91
MB
1512 }
1513 if (firmware_has_feature(FW_FEATURE_VPHN) &&
f13c13a0 1514 lppaca_shared_proc(get_lppaca())) {
5d88aa85 1515 if (!vphn_enabled) {
5d88aa85
JL
1516 vphn_enabled = 1;
1517 setup_cpu_associativity_change_counters();
df7e828c
KC
1518 timer_setup(&topology_timer, topology_timer_fn,
1519 TIMER_DEFERRABLE);
5d88aa85
JL
1520 reset_topology_timer();
1521 }
9eff1a38
JL
1522 }
1523
65b9fdad
MB
1524 pr_info("Starting topology update%s%s\n",
1525 (prrn_enabled ? " prrn_enabled" : ""),
1526 (vphn_enabled ? " vphn_enabled" : ""));
1527
9eff1a38
JL
1528 return rc;
1529}
9eff1a38
JL
1530
1531/*
1532 * Disable polling for VPHN associativity changes.
1533 */
1534int stop_topology_update(void)
1535{
5d88aa85
JL
1536 int rc = 0;
1537
2d4d9b30
NL
1538 if (!topology_updates_enabled)
1539 return 0;
1540
5d88aa85
JL
1541 if (prrn_enabled) {
1542 prrn_enabled = 0;
601abdc3 1543#ifdef CONFIG_SMP
5d88aa85 1544 rc = of_reconfig_notifier_unregister(&dt_update_nb);
601abdc3 1545#endif
a3496e91
MB
1546 }
1547 if (vphn_enabled) {
5d88aa85
JL
1548 vphn_enabled = 0;
1549 rc = del_timer_sync(&topology_timer);
1550 }
1551
65b9fdad
MB
1552 pr_info("Stopping topology update\n");
1553
5d88aa85 1554 return rc;
9eff1a38 1555}
e04fa612
NF
1556
1557int prrn_is_enabled(void)
1558{
1559 return prrn_enabled;
1560}
1561
2ea62630
SD
1562void __init shared_proc_topology_init(void)
1563{
1564 if (lppaca_shared_proc(get_lppaca())) {
1565 bitmap_fill(cpumask_bits(&cpu_associativity_changes_mask),
1566 nr_cpumask_bits);
1567 numa_update_cpu_topology(false);
1568 }
1569}
1570
e04fa612
NF
1571static int topology_read(struct seq_file *file, void *v)
1572{
1573 if (vphn_enabled || prrn_enabled)
1574 seq_puts(file, "on\n");
1575 else
1576 seq_puts(file, "off\n");
1577
1578 return 0;
1579}
1580
1581static int topology_open(struct inode *inode, struct file *file)
1582{
1583 return single_open(file, topology_read, NULL);
1584}
1585
1586static ssize_t topology_write(struct file *file, const char __user *buf,
1587 size_t count, loff_t *off)
1588{
1589 char kbuf[4]; /* "on" or "off" plus null. */
1590 int read_len;
1591
1592 read_len = count < 3 ? count : 3;
1593 if (copy_from_user(kbuf, buf, read_len))
1594 return -EINVAL;
1595
1596 kbuf[read_len] = '\0';
1597
2d4d9b30
NL
1598 if (!strncmp(kbuf, "on", 2)) {
1599 topology_updates_enabled = true;
e04fa612 1600 start_topology_update();
2d4d9b30 1601 } else if (!strncmp(kbuf, "off", 3)) {
e04fa612 1602 stop_topology_update();
2d4d9b30
NL
1603 topology_updates_enabled = false;
1604 } else
e04fa612
NF
1605 return -EINVAL;
1606
1607 return count;
1608}
1609
1610static const struct file_operations topology_ops = {
1611 .read = seq_read,
1612 .write = topology_write,
1613 .open = topology_open,
1614 .release = single_release
1615};
1616
1617static int topology_update_init(void)
1618{
2d4d9b30 1619 start_topology_update();
2d73bae1 1620
17f444c0
MB
1621 if (vphn_enabled)
1622 topology_schedule_update();
1623
2d15b9b4
NA
1624 if (!proc_create("powerpc/topology_updates", 0644, NULL, &topology_ops))
1625 return -ENOMEM;
e04fa612 1626
17f444c0 1627 topology_inited = 1;
e04fa612 1628 return 0;
9eff1a38 1629}
e04fa612 1630device_initcall(topology_update_init);
39bf990e 1631#endif /* CONFIG_PPC_SPLPAR */