457c8996 1// SPDX-License-Identifier: GPL-2.0-only
2/*
3 * mm_init.c - Memory initialisation verification and debugging
4 *
5 * Copyright 2008 IBM Corporation, 2008
6 * Author Mel Gorman <mel@csn.ul.ie>
7 *
8 */
9#include <linux/kernel.h>
10#include <linux/init.h>
ff7ea79c 11#include <linux/kobject.h>
b95f1b31 12#include <linux/export.h>
13#include <linux/memory.h>
14#include <linux/notifier.h>
7e18adb4 15#include <linux/sched.h>
56f3547b 16#include <linux/mman.h>
17#include <linux/memblock.h>
18#include <linux/page-isolation.h>
19#include <linux/padata.h>
20#include <linux/nmi.h>
21#include <linux/buffer_head.h>
22#include <linux/kmemleak.h>
23#include <linux/kfence.h>
24#include <linux/page_ext.h>
25#include <linux/pti.h>
26#include <linux/pgtable.h>
27#include <linux/swap.h>
28#include <linux/cma.h>
7ea6ec4c 29#include <linux/crash_dump.h>
f6bec26c 30#include <linux/execmem.h>
708614e6 31#include "internal.h"
d5d2c02a 32#include "slab.h"
9420f89d 33#include "shuffle.h"
6b74ab97 34
35#include <asm/setup.h>
36
5e9426ab 37#ifdef CONFIG_DEBUG_MEMORY_INIT
194e8151 38int __meminitdata mminit_loglevel;
6b74ab97 39
68ad8df4 40/* The zonelists are simply reported; validation is manual. */
0e2342c7 41void __init mminit_verify_zonelist(void)
42{
43 int nid;
44
45 if (mminit_loglevel < MMINIT_VERIFY)
46 return;
47
48 for_each_online_node(nid) {
49 pg_data_t *pgdat = NODE_DATA(nid);
50 struct zone *zone;
51 struct zoneref *z;
52 struct zonelist *zonelist;
53 int i, listid, zoneid;
54
e46b893d 55 BUILD_BUG_ON(MAX_ZONELISTS > 2);
56 for (i = 0; i < MAX_ZONELISTS * MAX_NR_ZONES; i++) {
57
58 /* Identify the zone and nodelist */
59 zoneid = i % MAX_NR_ZONES;
60 listid = i / MAX_NR_ZONES;
61 zonelist = &pgdat->node_zonelists[listid];
62 zone = &pgdat->node_zones[zoneid];
63 if (!populated_zone(zone))
64 continue;
65
66 /* Print information about the zonelist */
67 printk(KERN_DEBUG "mminit::zonelist %s %d:%s = ",
68 listid > 0 ? "thisnode" : "general", nid,
69 zone->name);
70
71 /* Iterate the zonelist */
72 for_each_zone_zonelist(zone, z, zonelist, zoneid)
73 pr_cont("%d:%s ", zone_to_nid(zone), zone->name);
1170532b 74 pr_cont("\n");
75 }
76 }
77}
78
79void __init mminit_verify_pageflags_layout(void)
80{
81 int shift, width;
82 unsigned long or_mask, add_mask;
83
daee07bf 84 shift = BITS_PER_LONG;
86fea8b4 85 width = shift - SECTIONS_WIDTH - NODES_WIDTH - ZONES_WIDTH
ec1c86b2 86 - LAST_CPUPID_SHIFT - KASAN_TAG_WIDTH - LRU_GEN_WIDTH - LRU_REFS_WIDTH;
708614e6 87 mminit_dprintk(MMINIT_TRACE, "pageflags_layout_widths",
ec1c86b2 88 "Section %d Node %d Zone %d Lastcpupid %d Kasantag %d Gen %d Tier %d Flags %d\n",
89 SECTIONS_WIDTH,
90 NODES_WIDTH,
91 ZONES_WIDTH,
90572890 92 LAST_CPUPID_WIDTH,
86fea8b4 93 KASAN_TAG_WIDTH,
94 LRU_GEN_WIDTH,
95 LRU_REFS_WIDTH,
96 NR_PAGEFLAGS);
97 mminit_dprintk(MMINIT_TRACE, "pageflags_layout_shifts",
86fea8b4 98 "Section %d Node %d Zone %d Lastcpupid %d Kasantag %d\n",
708614e6 99 SECTIONS_SHIFT,
708614e6 100 NODES_SHIFT,
a4e1b4c6 101 ZONES_SHIFT,
102 LAST_CPUPID_SHIFT,
103 KASAN_TAG_WIDTH);
a4e1b4c6 104 mminit_dprintk(MMINIT_TRACE, "pageflags_layout_pgshifts",
86fea8b4 105 "Section %lu Node %lu Zone %lu Lastcpupid %lu Kasantag %lu\n",
106 (unsigned long)SECTIONS_PGSHIFT,
107 (unsigned long)NODES_PGSHIFT,
a4e1b4c6 108 (unsigned long)ZONES_PGSHIFT,
109 (unsigned long)LAST_CPUPID_PGSHIFT,
110 (unsigned long)KASAN_TAG_PGSHIFT);
111 mminit_dprintk(MMINIT_TRACE, "pageflags_layout_nodezoneid",
112 "Node/Zone ID: %lu -> %lu\n",
113 (unsigned long)(ZONEID_PGOFF + ZONEID_SHIFT),
114 (unsigned long)ZONEID_PGOFF);
708614e6 115 mminit_dprintk(MMINIT_TRACE, "pageflags_layout_usage",
a4e1b4c6 116 "location: %d -> %d layout %d -> %d unused %d -> %d page-flags\n",
117 shift, width, width, NR_PAGEFLAGS, NR_PAGEFLAGS, 0);
118#ifdef NODE_NOT_IN_PAGE_FLAGS
119 mminit_dprintk(MMINIT_TRACE, "pageflags_layout_nodeflags",
120 "Node not in page flags");
121#endif
90572890 122#ifdef LAST_CPUPID_NOT_IN_PAGE_FLAGS
a4e1b4c6 123 mminit_dprintk(MMINIT_TRACE, "pageflags_layout_nodeflags",
90572890 124 "Last cpupid not in page flags");
a4e1b4c6 125#endif
126
127 if (SECTIONS_WIDTH) {
128 shift -= SECTIONS_WIDTH;
129 BUG_ON(shift != SECTIONS_PGSHIFT);
130 }
131 if (NODES_WIDTH) {
132 shift -= NODES_WIDTH;
133 BUG_ON(shift != NODES_PGSHIFT);
134 }
135 if (ZONES_WIDTH) {
136 shift -= ZONES_WIDTH;
137 BUG_ON(shift != ZONES_PGSHIFT);
138 }
139
140 /* Check for bitmask overlaps */
141 or_mask = (ZONES_MASK << ZONES_PGSHIFT) |
142 (NODES_MASK << NODES_PGSHIFT) |
143 (SECTIONS_MASK << SECTIONS_PGSHIFT);
144 add_mask = (ZONES_MASK << ZONES_PGSHIFT) +
145 (NODES_MASK << NODES_PGSHIFT) +
146 (SECTIONS_MASK << SECTIONS_PGSHIFT);
147 BUG_ON(or_mask != add_mask);
148}
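/*
 * Editor's note, not part of the original source: a worked example of the
 * shift checks above, assuming hypothetical widths SECTIONS_WIDTH = 0,
 * NODES_WIDTH = 6 and ZONES_WIDTH = 3 on a 64-bit build (BITS_PER_LONG = 64).
 * The SECTIONS check is skipped because its width is zero; then
 * shift = 64 - 6 = 58, so NODES_PGSHIFT must be 58, and
 * shift = 58 - 3 = 55, so ZONES_PGSHIFT must be 55. The fields are carved
 * off the top of page->flags in the order sections, nodes, zones, and the
 * final or_mask/add_mask comparison verifies that none of the bit fields
 * overlap.
 */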
149
150static __init int set_mminit_loglevel(char *str)
151{
152 get_option(&str, &mminit_loglevel);
153 return 0;
154}
155early_param("mminit_loglevel", set_mminit_loglevel);
5e9426ab 156#endif /* CONFIG_DEBUG_MEMORY_INIT */
157
158struct kobject *mm_kobj;
ff7ea79c 159
160#ifdef CONFIG_SMP
161s32 vm_committed_as_batch = 32;
162
56f3547b 163void mm_compute_batch(int overcommit_policy)
164{
165 u64 memsized_batch;
166 s32 nr = num_present_cpus();
167 s32 batch = max_t(s32, nr*2, 32);
168 unsigned long ram_pages = totalram_pages();
169
170 /*
171 * For policy OVERCOMMIT_NEVER, set batch size to 0.4% of
172 * (total memory/#cpus), and lift it to 25% for other policies
 173 * to ease the possible lock contention for percpu_counter
174 * vm_committed_as, while the max limit is INT_MAX
175 */
176 if (overcommit_policy == OVERCOMMIT_NEVER)
177 memsized_batch = min_t(u64, ram_pages/nr/256, INT_MAX);
178 else
179 memsized_batch = min_t(u64, ram_pages/nr/4, INT_MAX);
180
181 vm_committed_as_batch = max_t(s32, memsized_batch, batch);
182}
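/*
 * Editor's note, not part of the original source: a worked example of the
 * batch sizing above, assuming 4 KiB pages. With 16 GiB of RAM
 * (totalram_pages() == 4194304) and 8 present CPUs:
 *   OVERCOMMIT_NEVER:  4194304 / 8 / 256 = 2048 pages per CPU
 *   other policies:    4194304 / 8 / 4   = 131072 pages per CPU
 * and the result is never allowed to drop below max(nr * 2, 32) = 32.
 */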
183
184static int __meminit mm_compute_batch_notifier(struct notifier_block *self,
185 unsigned long action, void *arg)
186{
187 switch (action) {
188 case MEM_ONLINE:
189 case MEM_OFFLINE:
56f3547b 190 mm_compute_batch(sysctl_overcommit_memory);
01359eb2 191 break;
192 default:
193 break;
194 }
195 return NOTIFY_OK;
196}
197
198static int __init mm_compute_batch_init(void)
199{
56f3547b 200 mm_compute_batch(sysctl_overcommit_memory);
1eeaa4fd 201 hotplug_memory_notifier(mm_compute_batch_notifier, MM_COMPUTE_BATCH_PRI);
202 return 0;
203}
204
205__initcall(mm_compute_batch_init);
206
207#endif
208
209static int __init mm_sysfs_init(void)
210{
211 mm_kobj = kobject_create_and_add("mm", kernel_kobj);
212 if (!mm_kobj)
213 return -ENOMEM;
214
215 return 0;
216}
e82cb95d 217postcore_initcall(mm_sysfs_init);
218
219static unsigned long arch_zone_lowest_possible_pfn[MAX_NR_ZONES] __initdata;
220static unsigned long arch_zone_highest_possible_pfn[MAX_NR_ZONES] __initdata;
221static unsigned long zone_movable_pfn[MAX_NUMNODES] __initdata;
222
223static unsigned long required_kernelcore __initdata;
224static unsigned long required_kernelcore_percent __initdata;
225static unsigned long required_movablecore __initdata;
226static unsigned long required_movablecore_percent __initdata;
227
228static unsigned long nr_kernel_pages __initdata;
229static unsigned long nr_all_pages __initdata;
230static unsigned long dma_reserve __initdata;
231
de57807e 232static bool deferred_struct_pages __meminitdata;
233
234static DEFINE_PER_CPU(struct per_cpu_nodestat, boot_nodestats);
235
236static int __init cmdline_parse_core(char *p, unsigned long *core,
237 unsigned long *percent)
238{
239 unsigned long long coremem;
240 char *endptr;
241
242 if (!p)
243 return -EINVAL;
244
245 /* Value may be a percentage of total memory, otherwise bytes */
246 coremem = simple_strtoull(p, &endptr, 0);
247 if (*endptr == '%') {
248 /* Paranoid check for percent values greater than 100 */
249 WARN_ON(coremem > 100);
250
251 *percent = coremem;
252 } else {
253 coremem = memparse(p, &p);
254 /* Paranoid check that UL is enough for the coremem value */
255 WARN_ON((coremem >> PAGE_SHIFT) > ULONG_MAX);
256
257 *core = coremem >> PAGE_SHIFT;
258 *percent = 0UL;
259 }
260 return 0;
261}
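/*
 * Editor's note, not part of the original source: two illustrative
 * (hypothetical) command lines and how this parser handles them, assuming
 * 4 KiB pages:
 *   kernelcore=512M -> memparse() yields 536870912 bytes,
 *                      *core = 536870912 >> 12 = 131072 pages, *percent = 0
 *   kernelcore=30%  -> *percent = 30, *core is left untouched; the page
 *                      count is derived later from totalpages in
 *                      find_zone_movable_pfns_for_nodes().
 */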
262
263bool mirrored_kernelcore __initdata_memblock;
264
265/*
266 * kernelcore=size sets the amount of memory for use for allocations that
267 * cannot be reclaimed or migrated.
268 */
269static int __init cmdline_parse_kernelcore(char *p)
270{
271 /* parse kernelcore=mirror */
272 if (parse_option_str(p, "mirror")) {
273 mirrored_kernelcore = true;
274 return 0;
275 }
276
277 return cmdline_parse_core(p, &required_kernelcore,
278 &required_kernelcore_percent);
279}
280early_param("kernelcore", cmdline_parse_kernelcore);
281
282/*
283 * movablecore=size sets the amount of memory for use for allocations that
284 * can be reclaimed or migrated.
285 */
286static int __init cmdline_parse_movablecore(char *p)
287{
288 return cmdline_parse_core(p, &required_movablecore,
289 &required_movablecore_percent);
290}
291early_param("movablecore", cmdline_parse_movablecore);
292
293/*
294 * early_calculate_totalpages()
295 * Sum pages in active regions for movable zone.
296 * Populate N_MEMORY for calculating usable_nodes.
297 */
298static unsigned long __init early_calculate_totalpages(void)
299{
300 unsigned long totalpages = 0;
301 unsigned long start_pfn, end_pfn;
302 int i, nid;
303
304 for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) {
305 unsigned long pages = end_pfn - start_pfn;
306
307 totalpages += pages;
308 if (pages)
309 node_set_state(nid, N_MEMORY);
310 }
311 return totalpages;
312}
313
314/*
315 * This finds a zone that can be used for ZONE_MOVABLE pages. The
 316 * assumption is made that zones within a node are ordered in monotonically
317 * increasing memory addresses so that the "highest" populated zone is used
318 */
319static void __init find_usable_zone_for_movable(void)
320{
321 int zone_index;
322 for (zone_index = MAX_NR_ZONES - 1; zone_index >= 0; zone_index--) {
323 if (zone_index == ZONE_MOVABLE)
324 continue;
325
326 if (arch_zone_highest_possible_pfn[zone_index] >
327 arch_zone_lowest_possible_pfn[zone_index])
328 break;
329 }
330
331 VM_BUG_ON(zone_index == -1);
332 movable_zone = zone_index;
333}
334
335/*
 336 * Find the PFN at which the Movable zone begins in each node. Kernel memory
337 * is spread evenly between nodes as long as the nodes have enough
338 * memory. When they don't, some nodes will have more kernelcore than
339 * others
340 */
341static void __init find_zone_movable_pfns_for_nodes(void)
342{
343 int i, nid;
344 unsigned long usable_startpfn;
345 unsigned long kernelcore_node, kernelcore_remaining;
 346 /* save the state before borrowing the nodemask */
347 nodemask_t saved_node_state = node_states[N_MEMORY];
348 unsigned long totalpages = early_calculate_totalpages();
349 int usable_nodes = nodes_weight(node_states[N_MEMORY]);
350 struct memblock_region *r;
351
352 /* Need to find movable_zone earlier when movable_node is specified. */
353 find_usable_zone_for_movable();
354
355 /*
356 * If movable_node is specified, ignore kernelcore and movablecore
357 * options.
358 */
359 if (movable_node_is_enabled()) {
360 for_each_mem_region(r) {
361 if (!memblock_is_hotpluggable(r))
362 continue;
363
364 nid = memblock_get_region_node(r);
365
366 usable_startpfn = PFN_DOWN(r->base);
367 zone_movable_pfn[nid] = zone_movable_pfn[nid] ?
368 min(usable_startpfn, zone_movable_pfn[nid]) :
369 usable_startpfn;
370 }
371
372 goto out2;
373 }
374
375 /*
376 * If kernelcore=mirror is specified, ignore movablecore option
377 */
378 if (mirrored_kernelcore) {
379 bool mem_below_4gb_not_mirrored = false;
380
381 if (!memblock_has_mirror()) {
382 pr_warn("The system has no mirror memory, ignore kernelcore=mirror.\n");
383 goto out;
384 }
385
386 if (is_kdump_kernel()) {
387 pr_warn("The system is under kdump, ignore kernelcore=mirror.\n");
388 goto out;
389 }
390
391 for_each_mem_region(r) {
392 if (memblock_is_mirror(r))
393 continue;
394
395 nid = memblock_get_region_node(r);
396
397 usable_startpfn = memblock_region_memory_base_pfn(r);
398
399 if (usable_startpfn < PHYS_PFN(SZ_4G)) {
400 mem_below_4gb_not_mirrored = true;
401 continue;
402 }
403
404 zone_movable_pfn[nid] = zone_movable_pfn[nid] ?
405 min(usable_startpfn, zone_movable_pfn[nid]) :
406 usable_startpfn;
407 }
408
409 if (mem_below_4gb_not_mirrored)
410 pr_warn("This configuration results in unmirrored kernel memory.\n");
411
412 goto out2;
413 }
414
415 /*
416 * If kernelcore=nn% or movablecore=nn% was specified, calculate the
417 * amount of necessary memory.
418 */
419 if (required_kernelcore_percent)
420 required_kernelcore = (totalpages * 100 * required_kernelcore_percent) /
421 10000UL;
422 if (required_movablecore_percent)
423 required_movablecore = (totalpages * 100 * required_movablecore_percent) /
424 10000UL;
425
426 /*
 427 * If movablecore= was specified, calculate the size of kernelcore
 428 * it corresponds to, so that memory usable for
429 * any allocation type is evenly spread. If both kernelcore
430 * and movablecore are specified, then the value of kernelcore
431 * will be used for required_kernelcore if it's greater than
432 * what movablecore would have allowed.
433 */
434 if (required_movablecore) {
435 unsigned long corepages;
436
437 /*
438 * Round-up so that ZONE_MOVABLE is at least as large as what
439 * was requested by the user
440 */
441 required_movablecore =
442 roundup(required_movablecore, MAX_ORDER_NR_PAGES);
443 required_movablecore = min(totalpages, required_movablecore);
444 corepages = totalpages - required_movablecore;
445
446 required_kernelcore = max(required_kernelcore, corepages);
447 }
448
449 /*
450 * If kernelcore was not specified or kernelcore size is larger
451 * than totalpages, there is no ZONE_MOVABLE.
452 */
453 if (!required_kernelcore || required_kernelcore >= totalpages)
454 goto out;
455
456 /* usable_startpfn is the lowest possible pfn ZONE_MOVABLE can be at */
457 usable_startpfn = arch_zone_lowest_possible_pfn[movable_zone];
458
459restart:
460 /* Spread kernelcore memory as evenly as possible throughout nodes */
461 kernelcore_node = required_kernelcore / usable_nodes;
462 for_each_node_state(nid, N_MEMORY) {
463 unsigned long start_pfn, end_pfn;
464
465 /*
466 * Recalculate kernelcore_node if the division per node
467 * now exceeds what is necessary to satisfy the requested
468 * amount of memory for the kernel
469 */
470 if (required_kernelcore < kernelcore_node)
471 kernelcore_node = required_kernelcore / usable_nodes;
472
473 /*
474 * As the map is walked, we track how much memory is usable
475 * by the kernel using kernelcore_remaining. When it is
476 * 0, the rest of the node is usable by ZONE_MOVABLE
477 */
478 kernelcore_remaining = kernelcore_node;
479
480 /* Go through each range of PFNs within this node */
481 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
482 unsigned long size_pages;
483
484 start_pfn = max(start_pfn, zone_movable_pfn[nid]);
485 if (start_pfn >= end_pfn)
486 continue;
487
488 /* Account for what is only usable for kernelcore */
489 if (start_pfn < usable_startpfn) {
490 unsigned long kernel_pages;
491 kernel_pages = min(end_pfn, usable_startpfn)
492 - start_pfn;
493
494 kernelcore_remaining -= min(kernel_pages,
495 kernelcore_remaining);
496 required_kernelcore -= min(kernel_pages,
497 required_kernelcore);
498
499 /* Continue if range is now fully accounted */
500 if (end_pfn <= usable_startpfn) {
501
502 /*
503 * Push zone_movable_pfn to the end so
504 * that if we have to rebalance
505 * kernelcore across nodes, we will
506 * not double account here
507 */
508 zone_movable_pfn[nid] = end_pfn;
509 continue;
510 }
511 start_pfn = usable_startpfn;
512 }
513
514 /*
515 * The usable PFN range for ZONE_MOVABLE is from
516 * start_pfn->end_pfn. Calculate size_pages as the
517 * number of pages used as kernelcore
518 */
519 size_pages = end_pfn - start_pfn;
520 if (size_pages > kernelcore_remaining)
521 size_pages = kernelcore_remaining;
522 zone_movable_pfn[nid] = start_pfn + size_pages;
523
524 /*
525 * Some kernelcore has been met, update counts and
526 * break if the kernelcore for this node has been
527 * satisfied
528 */
529 required_kernelcore -= min(required_kernelcore,
530 size_pages);
531 kernelcore_remaining -= size_pages;
532 if (!kernelcore_remaining)
533 break;
534 }
535 }
536
537 /*
538 * If there is still required_kernelcore, we do another pass with one
539 * less node in the count. This will push zone_movable_pfn[nid] further
540 * along on the nodes that still have memory until kernelcore is
541 * satisfied
542 */
543 usable_nodes--;
544 if (usable_nodes && required_kernelcore > usable_nodes)
545 goto restart;
546
547out2:
548 /* Align start of ZONE_MOVABLE on all nids to MAX_ORDER_NR_PAGES */
549 for (nid = 0; nid < MAX_NUMNODES; nid++) {
550 unsigned long start_pfn, end_pfn;
551
552 zone_movable_pfn[nid] =
553 roundup(zone_movable_pfn[nid], MAX_ORDER_NR_PAGES);
554
555 get_pfn_range_for_nid(nid, &start_pfn, &end_pfn);
556 if (zone_movable_pfn[nid] >= end_pfn)
557 zone_movable_pfn[nid] = 0;
558 }
559
560out:
561 /* restore the node_state */
562 node_states[N_MEMORY] = saved_node_state;
563}
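/*
 * Editor's note, not part of the original source: a simplified example of
 * the spreading logic above. Assume two nodes with 4 GiB each (so
 * totalpages = 2097152 with 4 KiB pages), all of it in the zone selected by
 * find_usable_zone_for_movable(), and kernelcore=2G (524288 pages). Then
 * kernelcore_node = 524288 / 2 = 262144, each node keeps its first 262144
 * pages (1 GiB) for the kernel zones, and zone_movable_pfn[nid] ends up
 * 1 GiB above each node's start (rounded up to MAX_ORDER_NR_PAGES), with
 * the remaining 3 GiB per node going to ZONE_MOVABLE.
 */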
564
fde1c4ec 565void __meminit __init_single_page(struct page *page, unsigned long pfn,
566 unsigned long zone, int nid)
567{
568 mm_zero_struct_page(page);
569 set_page_links(page, zone, nid, pfn);
570 init_page_count(page);
571 page_mapcount_reset(page);
572 page_cpupid_reset_last(page);
573 page_kasan_tag_reset(page);
574
575 INIT_LIST_HEAD(&page->lru);
576#ifdef WANT_PAGE_VIRTUAL
577 /* The shift won't overflow because ZONE_NORMAL is below 4G. */
578 if (!is_highmem_idx(zone))
579 set_page_address(page, __va(pfn << PAGE_SHIFT));
580#endif
581}
582
583#ifdef CONFIG_NUMA
584/*
585 * During memory init memblocks map pfns to nids. The search is expensive and
586 * this caches recent lookups. The implementation of __early_pfn_to_nid
587 * treats start/end as pfns.
588 */
589struct mminit_pfnnid_cache {
590 unsigned long last_start;
591 unsigned long last_end;
592 int last_nid;
593};
594
595static struct mminit_pfnnid_cache early_pfnnid_cache __meminitdata;
596
597/*
598 * Required by SPARSEMEM. Given a PFN, return what node the PFN is on.
599 */
600static int __meminit __early_pfn_to_nid(unsigned long pfn,
601 struct mminit_pfnnid_cache *state)
602{
603 unsigned long start_pfn, end_pfn;
604 int nid;
605
606 if (state->last_start <= pfn && pfn < state->last_end)
607 return state->last_nid;
608
609 nid = memblock_search_pfn_nid(pfn, &start_pfn, &end_pfn);
610 if (nid != NUMA_NO_NODE) {
611 state->last_start = start_pfn;
612 state->last_end = end_pfn;
613 state->last_nid = nid;
614 }
615
616 return nid;
617}
618
619int __meminit early_pfn_to_nid(unsigned long pfn)
620{
621 static DEFINE_SPINLOCK(early_pfn_lock);
622 int nid;
623
624 spin_lock(&early_pfn_lock);
625 nid = __early_pfn_to_nid(pfn, &early_pfnnid_cache);
626 if (nid < 0)
627 nid = first_online_node;
628 spin_unlock(&early_pfn_lock);
629
630 return nid;
631}
632
633int hashdist = HASHDIST_DEFAULT;
634
635static int __init set_hashdist(char *str)
636{
637 if (!str)
638 return 0;
639 hashdist = simple_strtoul(str, &str, 0);
640 return 1;
641}
642__setup("hashdist=", set_hashdist);
643
644static inline void fixup_hashdist(void)
645{
646 if (num_node_state(N_MEMORY) == 1)
647 hashdist = 0;
648}
649#else
650static inline void fixup_hashdist(void) {}
651#endif /* CONFIG_NUMA */
652
653#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
654static inline void pgdat_set_deferred_range(pg_data_t *pgdat)
655{
656 pgdat->first_deferred_pfn = ULONG_MAX;
657}
658
659/* Returns true if the struct page for the pfn is initialised */
61167ad5 660static inline bool __meminit early_page_initialised(unsigned long pfn, int nid)
9420f89d 661{
662 if (node_online(nid) && pfn >= NODE_DATA(nid)->first_deferred_pfn)
663 return false;
664
665 return true;
666}
667
668/*
669 * Returns true when the remaining initialisation should be deferred until
670 * later in the boot cycle when it can be parallelised.
671 */
672static bool __meminit
673defer_init(int nid, unsigned long pfn, unsigned long end_pfn)
674{
675 static unsigned long prev_end_pfn, nr_initialised;
676
677 if (early_page_ext_enabled())
678 return false;
679 /*
 680 * prev_end_pfn is a static that caches the end of the previous zone.
 681 * No locking is needed because this is called very early in boot, before smp_init.
682 */
683 if (prev_end_pfn != end_pfn) {
684 prev_end_pfn = end_pfn;
685 nr_initialised = 0;
686 }
687
688 /* Always populate low zones for address-constrained allocations */
689 if (end_pfn < pgdat_end_pfn(NODE_DATA(nid)))
690 return false;
691
692 if (NODE_DATA(nid)->first_deferred_pfn != ULONG_MAX)
693 return true;
694 /*
695 * We start only with one section of pages, more pages are added as
696 * needed until the rest of deferred pages are initialized.
697 */
698 nr_initialised++;
699 if ((nr_initialised > PAGES_PER_SECTION) &&
700 (pfn & (PAGES_PER_SECTION - 1)) == 0) {
701 NODE_DATA(nid)->first_deferred_pfn = pfn;
702 return true;
703 }
704 return false;
705}
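/*
 * Editor's note, not part of the original source: the practical effect of
 * the checks above is that only the lower zones plus roughly one section's
 * worth of pages of the node's last zone (PAGES_PER_SECTION, e.g. 32768
 * pages / 128 MiB assuming 4 KiB pages and 128 MiB sections) are
 * initialised synchronously at boot; everything past the recorded
 * first_deferred_pfn is left for the deferred struct page initialisation
 * that runs later in boot.
 */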
706
61167ad5 707static void __meminit init_reserved_page(unsigned long pfn, int nid)
708{
709 pg_data_t *pgdat;
61167ad5 710 int zid;
9420f89d 711
61167ad5 712 if (early_page_initialised(pfn, nid))
713 return;
714
715 pgdat = NODE_DATA(nid);
716
717 for (zid = 0; zid < MAX_NR_ZONES; zid++) {
718 struct zone *zone = &pgdat->node_zones[zid];
719
720 if (zone_spans_pfn(zone, pfn))
721 break;
722 }
723 __init_single_page(pfn_to_page(pfn), pfn, zid, nid);
724}
725#else
726static inline void pgdat_set_deferred_range(pg_data_t *pgdat) {}
727
61167ad5 728static inline bool early_page_initialised(unsigned long pfn, int nid)
729{
730 return true;
731}
732
733static inline bool defer_init(int nid, unsigned long pfn, unsigned long end_pfn)
734{
735 return false;
736}
737
61167ad5 738static inline void init_reserved_page(unsigned long pfn, int nid)
739{
740}
741#endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */
742
743/*
744 * Initialised pages do not have PageReserved set. This function is
745 * called for each range allocated by the bootmem allocator and
746 * marks the pages PageReserved. The remaining valid pages are later
747 * sent to the buddy page allocator.
748 */
749void __meminit reserve_bootmem_region(phys_addr_t start,
750 phys_addr_t end, int nid)
751{
752 unsigned long start_pfn = PFN_DOWN(start);
753 unsigned long end_pfn = PFN_UP(end);
754
755 for (; start_pfn < end_pfn; start_pfn++) {
756 if (pfn_valid(start_pfn)) {
757 struct page *page = pfn_to_page(start_pfn);
758
61167ad5 759 init_reserved_page(start_pfn, nid);
760
761 /* Avoid false-positive PageTail() */
762 INIT_LIST_HEAD(&page->lru);
763
764 /*
765 * no need for atomic set_bit because the struct
766 * page is not visible yet so nobody should
767 * access it yet.
768 */
769 __SetPageReserved(page);
770 }
771 }
772}
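/*
 * Editor's note, not part of the original source: because the start is
 * rounded down and the end rounded up, any page even partially covered by
 * a memblock reservation gets PageReserved. E.g. with 4 KiB pages,
 * reserve_bootmem_region(0x1000, 0x2800, nid) covers PFN_DOWN(0x1000) = 1
 * up to PFN_UP(0x2800) = 3 (exclusive), i.e. pages 1 and 2, even though
 * the reservation only uses half of page 2.
 */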
773
774/* If zone is ZONE_MOVABLE but memory is mirrored, it is an overlapped init */
775static bool __meminit
776overlap_memmap_init(unsigned long zone, unsigned long *pfn)
777{
778 static struct memblock_region *r;
779
780 if (mirrored_kernelcore && zone == ZONE_MOVABLE) {
781 if (!r || *pfn >= memblock_region_memory_end_pfn(r)) {
782 for_each_mem_region(r) {
783 if (*pfn < memblock_region_memory_end_pfn(r))
784 break;
785 }
786 }
787 if (*pfn >= memblock_region_memory_base_pfn(r) &&
788 memblock_is_mirror(r)) {
789 *pfn = memblock_region_memory_end_pfn(r);
790 return true;
791 }
792 }
793 return false;
794}
795
796/*
797 * Only struct pages that correspond to ranges defined by memblock.memory
798 * are zeroed and initialized by going through __init_single_page() during
799 * memmap_init_zone_range().
800 *
801 * But, there could be struct pages that correspond to holes in
802 * memblock.memory. This can happen because of the following reasons:
 803 * - physical memory bank size is not necessarily an exact multiple of the
804 * arbitrary section size
805 * - early reserved memory may not be listed in memblock.memory
ecf5dd1f 806 * - non-memory regions covered by the contiguous flatmem mapping
807 * - memory layouts defined with memmap= kernel parameter may not align
808 * nicely with memmap sections
809 *
810 * Explicitly initialize those struct pages so that:
811 * - PG_Reserved is set
812 * - zone and node links point to zone and node that span the page if the
813 * hole is in the middle of a zone
814 * - zone and node links point to adjacent zone/node if the hole falls on
815 * the zone boundary; the pages in such holes will be prepended to the
816 * zone/node above the hole except for the trailing pages in the last
817 * section that will be appended to the zone/node below.
818 */
819static void __init init_unavailable_range(unsigned long spfn,
820 unsigned long epfn,
821 int zone, int node)
822{
823 unsigned long pfn;
824 u64 pgcnt = 0;
825
826 for (pfn = spfn; pfn < epfn; pfn++) {
827 if (!pfn_valid(pageblock_start_pfn(pfn))) {
828 pfn = pageblock_end_pfn(pfn) - 1;
829 continue;
830 }
831 __init_single_page(pfn_to_page(pfn), pfn, zone, node);
832 __SetPageReserved(pfn_to_page(pfn));
833 pgcnt++;
834 }
835
836 if (pgcnt)
01846c6c 837 pr_info("On node %d, zone %s: %lld pages in unavailable ranges\n",
838 node, zone_names[zone], pgcnt);
839}
840
841/*
842 * Initially all pages are reserved - free ones are freed
843 * up by memblock_free_all() once the early boot process is
844 * done. Non-atomic initialization, single-pass.
845 *
846 * All aligned pageblocks are initialized to the specified migratetype
847 * (usually MIGRATE_MOVABLE). Besides setting the migratetype, no related
848 * zone stats (e.g., nr_isolate_pageblock) are touched.
849 */
850void __meminit memmap_init_range(unsigned long size, int nid, unsigned long zone,
851 unsigned long start_pfn, unsigned long zone_end_pfn,
852 enum meminit_context context,
853 struct vmem_altmap *altmap, int migratetype)
854{
855 unsigned long pfn, end_pfn = start_pfn + size;
856 struct page *page;
857
858 if (highest_memmap_pfn < end_pfn - 1)
859 highest_memmap_pfn = end_pfn - 1;
860
861#ifdef CONFIG_ZONE_DEVICE
862 /*
863 * Honor reservation requested by the driver for this ZONE_DEVICE
864 * memory. We limit the total number of pages to initialize to just
865 * those that might contain the memory mapping. We will defer the
866 * ZONE_DEVICE page initialization until after we have released
867 * the hotplug lock.
868 */
869 if (zone == ZONE_DEVICE) {
870 if (!altmap)
871 return;
872
873 if (start_pfn == altmap->base_pfn)
874 start_pfn += altmap->reserve;
875 end_pfn = altmap->base_pfn + vmem_altmap_offset(altmap);
876 }
877#endif
878
879 for (pfn = start_pfn; pfn < end_pfn; ) {
880 /*
881 * There can be holes in boot-time mem_map[]s handed to this
882 * function. They do not exist on hotplugged memory.
883 */
884 if (context == MEMINIT_EARLY) {
885 if (overlap_memmap_init(zone, &pfn))
886 continue;
887 if (defer_init(nid, pfn, zone_end_pfn)) {
888 deferred_struct_pages = true;
889 break;
890 }
891 }
892
893 page = pfn_to_page(pfn);
894 __init_single_page(page, pfn, zone, nid);
895 if (context == MEMINIT_HOTPLUG)
896 __SetPageReserved(page);
897
898 /*
899 * Usually, we want to mark the pageblock MIGRATE_MOVABLE,
900 * such that unmovable allocations won't be scattered all
901 * over the place during system boot.
902 */
903 if (pageblock_aligned(pfn)) {
904 set_pageblock_migratetype(page, migratetype);
905 cond_resched();
906 }
907 pfn++;
908 }
909}
910
911static void __init memmap_init_zone_range(struct zone *zone,
912 unsigned long start_pfn,
913 unsigned long end_pfn,
914 unsigned long *hole_pfn)
915{
916 unsigned long zone_start_pfn = zone->zone_start_pfn;
917 unsigned long zone_end_pfn = zone_start_pfn + zone->spanned_pages;
918 int nid = zone_to_nid(zone), zone_id = zone_idx(zone);
919
920 start_pfn = clamp(start_pfn, zone_start_pfn, zone_end_pfn);
921 end_pfn = clamp(end_pfn, zone_start_pfn, zone_end_pfn);
922
923 if (start_pfn >= end_pfn)
924 return;
925
926 memmap_init_range(end_pfn - start_pfn, nid, zone_id, start_pfn,
927 zone_end_pfn, MEMINIT_EARLY, NULL, MIGRATE_MOVABLE);
928
929 if (*hole_pfn < start_pfn)
930 init_unavailable_range(*hole_pfn, start_pfn, zone_id, nid);
931
932 *hole_pfn = end_pfn;
933}
934
935static void __init memmap_init(void)
936{
937 unsigned long start_pfn, end_pfn;
938 unsigned long hole_pfn = 0;
939 int i, j, zone_id = 0, nid;
940
941 for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) {
942 struct pglist_data *node = NODE_DATA(nid);
943
944 for (j = 0; j < MAX_NR_ZONES; j++) {
945 struct zone *zone = node->node_zones + j;
946
947 if (!populated_zone(zone))
948 continue;
949
950 memmap_init_zone_range(zone, start_pfn, end_pfn,
951 &hole_pfn);
952 zone_id = j;
953 }
954 }
955
956#ifdef CONFIG_SPARSEMEM
957 /*
 958 * Initialize the memory map for the hole in the range [memory_end,
959 * section_end].
960 * Append the pages in this hole to the highest zone in the last
961 * node.
962 * The call to init_unavailable_range() is outside the ifdef to
 963 * silence the compiler warning about zone_id set but not used;
964 * for FLATMEM it is a nop anyway
965 */
966 end_pfn = round_up(end_pfn, PAGES_PER_SECTION);
967 if (hole_pfn < end_pfn)
968#endif
969 init_unavailable_range(hole_pfn, end_pfn, zone_id, nid);
970}
971
972#ifdef CONFIG_ZONE_DEVICE
973static void __ref __init_zone_device_page(struct page *page, unsigned long pfn,
974 unsigned long zone_idx, int nid,
975 struct dev_pagemap *pgmap)
976{
977
978 __init_single_page(page, pfn, zone_idx, nid);
979
980 /*
981 * Mark page reserved as it will need to wait for onlining
982 * phase for it to be fully associated with a zone.
983 *
984 * We can use the non-atomic __set_bit operation for setting
985 * the flag as we are still initializing the pages.
986 */
987 __SetPageReserved(page);
988
989 /*
990 * ZONE_DEVICE pages union ->lru with a ->pgmap back pointer
991 * and zone_device_data. It is a bug if a ZONE_DEVICE page is
992 * ever freed or placed on a driver-private list.
993 */
994 page->pgmap = pgmap;
995 page->zone_device_data = NULL;
996
997 /*
998 * Mark the block movable so that blocks are reserved for
999 * movable at startup. This will force kernel allocations
1000 * to reserve their blocks rather than leaking throughout
1001 * the address space during boot when many long-lived
1002 * kernel allocations are made.
1003 *
1004 * Please note that MEMINIT_HOTPLUG path doesn't clear memmap
1005 * because this is done early in section_activate()
1006 */
1007 if (pageblock_aligned(pfn)) {
1008 set_pageblock_migratetype(page, MIGRATE_MOVABLE);
1009 cond_resched();
1010 }
1011
1012 /*
1013 * ZONE_DEVICE pages are released directly to the driver page allocator
1014 * which will set the page count to 1 when allocating the page.
1015 */
1016 if (pgmap->type == MEMORY_DEVICE_PRIVATE ||
1017 pgmap->type == MEMORY_DEVICE_COHERENT)
1018 set_page_count(page, 0);
1019}
1020
1021/*
 1022 * With compound page geometry and when struct pages are stored in RAM, most
 1023 * tail pages are reused. Consequently, the number of unique struct pages to
 1024 * initialize is a lot smaller than the total number of struct pages being
 1025 * mapped. This is a paired / mild layering violation with explicit knowledge
 1026 * of how the sparse_vmemmap internals handle compound pages in the absence
1027 * of an altmap. See vmemmap_populate_compound_pages().
1028 */
1029static inline unsigned long compound_nr_pages(struct vmem_altmap *altmap,
87a7ae75 1030 struct dev_pagemap *pgmap)
9420f89d 1031{
1032 if (!vmemmap_can_optimize(altmap, pgmap))
1033 return pgmap_vmemmap_nr(pgmap);
1034
c1a6c536 1035 return VMEMMAP_RESERVE_NR * (PAGE_SIZE / sizeof(struct page));
1036}
1037
1038static void __ref memmap_init_compound(struct page *head,
1039 unsigned long head_pfn,
1040 unsigned long zone_idx, int nid,
1041 struct dev_pagemap *pgmap,
1042 unsigned long nr_pages)
1043{
1044 unsigned long pfn, end_pfn = head_pfn + nr_pages;
1045 unsigned int order = pgmap->vmemmap_shift;
1046
1047 __SetPageHead(head);
1048 for (pfn = head_pfn + 1; pfn < end_pfn; pfn++) {
1049 struct page *page = pfn_to_page(pfn);
1050
1051 __init_zone_device_page(page, pfn, zone_idx, nid, pgmap);
1052 prep_compound_tail(head, pfn - head_pfn);
1053 set_page_count(page, 0);
1054
1055 /*
1056 * The first tail page stores important compound page info.
1057 * Call prep_compound_head() after the first tail page has
1058 * been initialized, to not have the data overwritten.
1059 */
1060 if (pfn == head_pfn + 1)
1061 prep_compound_head(head, order);
1062 }
1063}
1064
1065void __ref memmap_init_zone_device(struct zone *zone,
1066 unsigned long start_pfn,
1067 unsigned long nr_pages,
1068 struct dev_pagemap *pgmap)
1069{
1070 unsigned long pfn, end_pfn = start_pfn + nr_pages;
1071 struct pglist_data *pgdat = zone->zone_pgdat;
1072 struct vmem_altmap *altmap = pgmap_altmap(pgmap);
1073 unsigned int pfns_per_compound = pgmap_vmemmap_nr(pgmap);
1074 unsigned long zone_idx = zone_idx(zone);
1075 unsigned long start = jiffies;
1076 int nid = pgdat->node_id;
1077
1078 if (WARN_ON_ONCE(!pgmap || zone_idx != ZONE_DEVICE))
1079 return;
1080
1081 /*
1082 * The call to memmap_init should have already taken care
1083 * of the pages reserved for the memmap, so we can just jump to
1084 * the end of that region and start processing the device pages.
1085 */
1086 if (altmap) {
1087 start_pfn = altmap->base_pfn + vmem_altmap_offset(altmap);
1088 nr_pages = end_pfn - start_pfn;
1089 }
1090
1091 for (pfn = start_pfn; pfn < end_pfn; pfn += pfns_per_compound) {
1092 struct page *page = pfn_to_page(pfn);
1093
1094 __init_zone_device_page(page, pfn, zone_idx, nid, pgmap);
1095
1096 if (pfns_per_compound == 1)
1097 continue;
1098
1099 memmap_init_compound(page, pfn, zone_idx, nid, pgmap,
87a7ae75 1100 compound_nr_pages(altmap, pgmap));
1101 }
1102
dd31bad2 1103 pr_debug("%s initialised %lu pages in %ums\n", __func__,
1104 nr_pages, jiffies_to_msecs(jiffies - start));
1105}
1106#endif
1107
1108/*
1109 * The zone ranges provided by the architecture do not include ZONE_MOVABLE
 1110 * because it is sized independently of the architecture. Unlike the other zones,
1111 * the starting point for ZONE_MOVABLE is not fixed. It may be different
1112 * in each node depending on the size of each node and how evenly kernelcore
1113 * is distributed. This helper function adjusts the zone ranges
1114 * provided by the architecture for a given node by using the end of the
1115 * highest usable zone for ZONE_MOVABLE. This preserves the assumption that
 1116 * zones within a node are in order of monotonically increasing memory addresses
1117 */
1118static void __init adjust_zone_range_for_zone_movable(int nid,
1119 unsigned long zone_type,
1120 unsigned long node_end_pfn,
1121 unsigned long *zone_start_pfn,
1122 unsigned long *zone_end_pfn)
1123{
1124 /* Only adjust if ZONE_MOVABLE is on this node */
1125 if (zone_movable_pfn[nid]) {
1126 /* Size ZONE_MOVABLE */
1127 if (zone_type == ZONE_MOVABLE) {
1128 *zone_start_pfn = zone_movable_pfn[nid];
1129 *zone_end_pfn = min(node_end_pfn,
1130 arch_zone_highest_possible_pfn[movable_zone]);
1131
1132 /* Adjust for ZONE_MOVABLE starting within this range */
1133 } else if (!mirrored_kernelcore &&
1134 *zone_start_pfn < zone_movable_pfn[nid] &&
1135 *zone_end_pfn > zone_movable_pfn[nid]) {
1136 *zone_end_pfn = zone_movable_pfn[nid];
1137
1138 /* Check if this whole range is within ZONE_MOVABLE */
1139 } else if (*zone_start_pfn >= zone_movable_pfn[nid])
1140 *zone_start_pfn = *zone_end_pfn;
1141 }
1142}
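/*
 * Editor's note, not part of the original source: a hypothetical example of
 * the adjustment above. Suppose a node spans PFNs [0x100000, 0x500000) and
 * zone_movable_pfn[nid] = 0x400000. A ZONE_NORMAL range that originally
 * covered [0x200000, 0x500000) is clipped to [0x200000, 0x400000), while
 * ZONE_MOVABLE itself is reported as starting at 0x400000 and ending at
 * the node end (capped by the highest possible PFN of movable_zone).
 */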
1143
1144/*
1145 * Return the number of holes in a range on a node. If nid is MAX_NUMNODES,
1146 * then all holes in the requested range will be accounted for.
1147 */
1148unsigned long __init __absent_pages_in_range(int nid,
1149 unsigned long range_start_pfn,
1150 unsigned long range_end_pfn)
1151{
1152 unsigned long nr_absent = range_end_pfn - range_start_pfn;
1153 unsigned long start_pfn, end_pfn;
1154 int i;
1155
1156 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
1157 start_pfn = clamp(start_pfn, range_start_pfn, range_end_pfn);
1158 end_pfn = clamp(end_pfn, range_start_pfn, range_end_pfn);
1159 nr_absent -= end_pfn - start_pfn;
1160 }
1161 return nr_absent;
1162}
1163
1164/**
1165 * absent_pages_in_range - Return number of page frames in holes within a range
1166 * @start_pfn: The start PFN to start searching for holes
1167 * @end_pfn: The end PFN to stop searching for holes
1168 *
 1169 * Return: the number of page frames in memory holes within a range.
1170 */
1171unsigned long __init absent_pages_in_range(unsigned long start_pfn,
1172 unsigned long end_pfn)
1173{
1174 return __absent_pages_in_range(MAX_NUMNODES, start_pfn, end_pfn);
1175}
1176
1177/* Return the number of page frames in holes in a zone on a node */
1178static unsigned long __init zone_absent_pages_in_node(int nid,
1179 unsigned long zone_type,
1180 unsigned long zone_start_pfn,
1181 unsigned long zone_end_pfn)
9420f89d 1182{
1183 unsigned long nr_absent;
1184
1185 /* zone is empty, we don't have any absent pages */
1186 if (zone_start_pfn == zone_end_pfn)
1187 return 0;
1188
1189 nr_absent = __absent_pages_in_range(nid, zone_start_pfn, zone_end_pfn);
1190
1191 /*
1192 * ZONE_MOVABLE handling.
1193 * Treat pages to be ZONE_MOVABLE in ZONE_NORMAL as absent pages
1194 * and vice versa.
1195 */
1196 if (mirrored_kernelcore && zone_movable_pfn[nid]) {
1197 unsigned long start_pfn, end_pfn;
1198 struct memblock_region *r;
1199
1200 for_each_mem_region(r) {
1201 start_pfn = clamp(memblock_region_memory_base_pfn(r),
1202 zone_start_pfn, zone_end_pfn);
1203 end_pfn = clamp(memblock_region_memory_end_pfn(r),
1204 zone_start_pfn, zone_end_pfn);
1205
1206 if (zone_type == ZONE_MOVABLE &&
1207 memblock_is_mirror(r))
1208 nr_absent += end_pfn - start_pfn;
1209
1210 if (zone_type == ZONE_NORMAL &&
1211 !memblock_is_mirror(r))
1212 nr_absent += end_pfn - start_pfn;
1213 }
1214 }
1215
1216 return nr_absent;
1217}
1218
1219/*
1220 * Return the number of pages a zone spans in a node, including holes
1221 * present_pages = zone_spanned_pages_in_node() - zone_absent_pages_in_node()
1222 */
1223static unsigned long __init zone_spanned_pages_in_node(int nid,
1224 unsigned long zone_type,
1225 unsigned long node_start_pfn,
1226 unsigned long node_end_pfn,
1227 unsigned long *zone_start_pfn,
1228 unsigned long *zone_end_pfn)
1229{
1230 unsigned long zone_low = arch_zone_lowest_possible_pfn[zone_type];
1231 unsigned long zone_high = arch_zone_highest_possible_pfn[zone_type];
1232
1233 /* Get the start and end of the zone */
1234 *zone_start_pfn = clamp(node_start_pfn, zone_low, zone_high);
1235 *zone_end_pfn = clamp(node_end_pfn, zone_low, zone_high);
1236 adjust_zone_range_for_zone_movable(nid, zone_type, node_end_pfn,
1237 zone_start_pfn, zone_end_pfn);
1238
1239 /* Check that this node has pages within the zone's required range */
1240 if (*zone_end_pfn < node_start_pfn || *zone_start_pfn > node_end_pfn)
1241 return 0;
1242
1243 /* Move the zone boundaries inside the node if necessary */
1244 *zone_end_pfn = min(*zone_end_pfn, node_end_pfn);
1245 *zone_start_pfn = max(*zone_start_pfn, node_start_pfn);
1246
1247 /* Return the spanned pages */
1248 return *zone_end_pfn - *zone_start_pfn;
1249}
1250
1251static void __init reset_memoryless_node_totalpages(struct pglist_data *pgdat)
1252{
1253 struct zone *z;
1254
1255 for (z = pgdat->node_zones; z < pgdat->node_zones + MAX_NR_ZONES; z++) {
1256 z->zone_start_pfn = 0;
1257 z->spanned_pages = 0;
1258 z->present_pages = 0;
1259#if defined(CONFIG_MEMORY_HOTPLUG)
1260 z->present_early_pages = 0;
1261#endif
1262 }
1263
1264 pgdat->node_spanned_pages = 0;
1265 pgdat->node_present_pages = 0;
1266 pr_debug("On node %d totalpages: 0\n", pgdat->node_id);
1267}
1268
1269static void __init calculate_node_totalpages(struct pglist_data *pgdat,
1270 unsigned long node_start_pfn,
1271 unsigned long node_end_pfn)
1272{
1273 unsigned long realtotalpages = 0, totalpages = 0;
1274 enum zone_type i;
1275
1276 for (i = 0; i < MAX_NR_ZONES; i++) {
1277 struct zone *zone = pgdat->node_zones + i;
1278 unsigned long zone_start_pfn, zone_end_pfn;
1279 unsigned long spanned, absent;
1c2d252f 1280 unsigned long real_size;
1281
1282 spanned = zone_spanned_pages_in_node(pgdat->node_id, i,
1283 node_start_pfn,
1284 node_end_pfn,
1285 &zone_start_pfn,
1286 &zone_end_pfn);
1287 absent = zone_absent_pages_in_node(pgdat->node_id, i,
1288 zone_start_pfn,
1289 zone_end_pfn);
9420f89d 1290
1c2d252f 1291 real_size = spanned - absent;
9420f89d 1292
1c2d252f 1293 if (spanned)
1294 zone->zone_start_pfn = zone_start_pfn;
1295 else
1296 zone->zone_start_pfn = 0;
1c2d252f 1297 zone->spanned_pages = spanned;
1298 zone->present_pages = real_size;
1299#if defined(CONFIG_MEMORY_HOTPLUG)
1300 zone->present_early_pages = real_size;
1301#endif
1302
1c2d252f 1303 totalpages += spanned;
1304 realtotalpages += real_size;
1305 }
1306
1307 pgdat->node_spanned_pages = totalpages;
1308 pgdat->node_present_pages = realtotalpages;
1309 pr_debug("On node %d totalpages: %lu\n", pgdat->node_id, realtotalpages);
1310}
1311
1312static unsigned long __init calc_memmap_size(unsigned long spanned_pages,
1313 unsigned long present_pages)
1314{
1315 unsigned long pages = spanned_pages;
1316
1317 /*
1318 * Provide a more accurate estimation if there are holes within
1319 * the zone and SPARSEMEM is in use. If there are holes within the
1320 * zone, each populated memory region may cost us one or two extra
1321 * memmap pages due to alignment because memmap pages for each
 1322 * populated region may not be naturally aligned on a page boundary.
1323 * So the (present_pages >> 4) heuristic is a tradeoff for that.
1324 */
1325 if (spanned_pages > present_pages + (present_pages >> 4) &&
1326 IS_ENABLED(CONFIG_SPARSEMEM))
1327 pages = present_pages;
1328
1329 return PAGE_ALIGN(pages * sizeof(struct page)) >> PAGE_SHIFT;
1330}
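/*
 * Editor's note, not part of the original source: a worked example of the
 * estimate above, assuming 4 KiB pages and a 64-byte struct page. A zone
 * spanning 1 GiB (262144 pages) needs 262144 * 64 bytes = 16 MiB of
 * memmap, i.e. 4096 pages. If the zone is sparsely populated (spanned
 * pages exceed present pages by more than ~6%) and SPARSEMEM is enabled,
 * present_pages is used instead, since sparse memmap is only allocated
 * for populated sections.
 */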
1331
1332#ifdef CONFIG_TRANSPARENT_HUGEPAGE
1333static void pgdat_init_split_queue(struct pglist_data *pgdat)
1334{
1335 struct deferred_split *ds_queue = &pgdat->deferred_split_queue;
1336
1337 spin_lock_init(&ds_queue->split_queue_lock);
1338 INIT_LIST_HEAD(&ds_queue->split_queue);
1339 ds_queue->split_queue_len = 0;
1340}
1341#else
1342static void pgdat_init_split_queue(struct pglist_data *pgdat) {}
1343#endif
1344
1345#ifdef CONFIG_COMPACTION
1346static void pgdat_init_kcompactd(struct pglist_data *pgdat)
1347{
1348 init_waitqueue_head(&pgdat->kcompactd_wait);
1349}
1350#else
1351static void pgdat_init_kcompactd(struct pglist_data *pgdat) {}
1352#endif
1353
1354static void __meminit pgdat_init_internals(struct pglist_data *pgdat)
1355{
1356 int i;
1357
1358 pgdat_resize_init(pgdat);
1359 pgdat_kswapd_lock_init(pgdat);
1360
1361 pgdat_init_split_queue(pgdat);
1362 pgdat_init_kcompactd(pgdat);
1363
1364 init_waitqueue_head(&pgdat->kswapd_wait);
1365 init_waitqueue_head(&pgdat->pfmemalloc_wait);
1366
1367 for (i = 0; i < NR_VMSCAN_THROTTLE; i++)
1368 init_waitqueue_head(&pgdat->reclaim_wait[i]);
1369
1370 pgdat_page_ext_init(pgdat);
1371 lruvec_init(&pgdat->__lruvec);
1372}
1373
1374static void __meminit zone_init_internals(struct zone *zone, enum zone_type idx, int nid,
1375 unsigned long remaining_pages)
1376{
1377 atomic_long_set(&zone->managed_pages, remaining_pages);
1378 zone_set_nid(zone, nid);
1379 zone->name = zone_names[idx];
1380 zone->zone_pgdat = NODE_DATA(nid);
1381 spin_lock_init(&zone->lock);
1382 zone_seqlock_init(zone);
1383 zone_pcp_init(zone);
1384}
1385
1386static void __meminit zone_init_free_lists(struct zone *zone)
1387{
1388 unsigned int order, t;
1389 for_each_migratetype_order(order, t) {
1390 INIT_LIST_HEAD(&zone->free_area[order].free_list[t]);
1391 zone->free_area[order].nr_free = 0;
1392 }
1393
1394#ifdef CONFIG_UNACCEPTED_MEMORY
1395 INIT_LIST_HEAD(&zone->unaccepted_pages);
1396#endif
1397}
1398
1399void __meminit init_currently_empty_zone(struct zone *zone,
1400 unsigned long zone_start_pfn,
1401 unsigned long size)
1402{
1403 struct pglist_data *pgdat = zone->zone_pgdat;
1404 int zone_idx = zone_idx(zone) + 1;
1405
1406 if (zone_idx > pgdat->nr_zones)
1407 pgdat->nr_zones = zone_idx;
1408
1409 zone->zone_start_pfn = zone_start_pfn;
1410
1411 mminit_dprintk(MMINIT_TRACE, "memmap_init",
1412 "Initialising map node %d zone %lu pfns %lu -> %lu\n",
1413 pgdat->node_id,
1414 (unsigned long)zone_idx(zone),
1415 zone_start_pfn, (zone_start_pfn + size));
1416
1417 zone_init_free_lists(zone);
1418 zone->initialized = 1;
1419}
1420
1421#ifndef CONFIG_SPARSEMEM
1422/*
 1423 * Calculate the size of the zone->blockflags array, rounded to an unsigned long.
 1424 * Start by making sure zonesize is a multiple of pageblock_order by rounding
 1425 * up. Then use one NR_PAGEBLOCK_BITS worth of bits per pageblock, round what
 1426 * is now in bits up to the nearest long, and return the result in
 1427 * bytes.
1428 */
1429static unsigned long __init usemap_size(unsigned long zone_start_pfn, unsigned long zonesize)
1430{
1431 unsigned long usemapsize;
1432
1433 zonesize += zone_start_pfn & (pageblock_nr_pages-1);
1434 usemapsize = roundup(zonesize, pageblock_nr_pages);
1435 usemapsize = usemapsize >> pageblock_order;
1436 usemapsize *= NR_PAGEBLOCK_BITS;
daee07bf 1437 usemapsize = roundup(usemapsize, BITS_PER_LONG);
9420f89d 1438
daee07bf 1439 return usemapsize / BITS_PER_BYTE;
1440}
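/*
 * Editor's note, not part of the original source: a worked example of the
 * sizing above, assuming pageblock_order = 9 (512-page, 2 MiB pageblocks
 * with 4 KiB pages) and NR_PAGEBLOCK_BITS = 4. A zone of 1 GiB spans
 * 262144 pages = 512 pageblocks, so the bitmap needs 512 * 4 = 2048 bits,
 * which is already a multiple of BITS_PER_LONG and comes to 256 bytes.
 */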
1441
1442static void __ref setup_usemap(struct zone *zone)
1443{
1444 unsigned long usemapsize = usemap_size(zone->zone_start_pfn,
1445 zone->spanned_pages);
1446 zone->pageblock_flags = NULL;
1447 if (usemapsize) {
1448 zone->pageblock_flags =
1449 memblock_alloc_node(usemapsize, SMP_CACHE_BYTES,
1450 zone_to_nid(zone));
1451 if (!zone->pageblock_flags)
1452 panic("Failed to allocate %ld bytes for zone %s pageblock flags on node %d\n",
1453 usemapsize, zone->name, zone_to_nid(zone));
1454 }
1455}
1456#else
1457static inline void setup_usemap(struct zone *zone) {}
1458#endif /* CONFIG_SPARSEMEM */
1459
1460#ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE
1461
1462/* Initialise the number of pages represented by NR_PAGEBLOCK_BITS */
1463void __init set_pageblock_order(void)
1464{
5e0a760b 1465 unsigned int order = MAX_PAGE_ORDER;
1466
1467 /* Check that pageblock_nr_pages has not already been setup */
1468 if (pageblock_order)
1469 return;
1470
1471 /* Don't let pageblocks exceed the maximum allocation granularity. */
1472 if (HPAGE_SHIFT > PAGE_SHIFT && HUGETLB_PAGE_ORDER < order)
1473 order = HUGETLB_PAGE_ORDER;
1474
1475 /*
1476 * Assume the largest contiguous order of interest is a huge page.
e99fb98d 1477 * This value may be variable depending on boot parameters on powerpc.
1478 */
1479 pageblock_order = order;
1480}
1481#else /* CONFIG_HUGETLB_PAGE_SIZE_VARIABLE */
1482
1483/*
1484 * When CONFIG_HUGETLB_PAGE_SIZE_VARIABLE is not set, set_pageblock_order()
1485 * is unused as pageblock_order is set at compile-time. See
1486 * include/linux/pageblock-flags.h for the values of pageblock_order based on
1487 * the kernel config
1488 */
1489void __init set_pageblock_order(void)
1490{
1491}
1492
1493#endif /* CONFIG_HUGETLB_PAGE_SIZE_VARIABLE */
1494
1495/*
1496 * Set up the zone data structures
1497 * - init pgdat internals
1498 * - init all zones belonging to this node
1499 *
1500 * NOTE: this function is only called during memory hotplug
1501 */
1502#ifdef CONFIG_MEMORY_HOTPLUG
1503void __ref free_area_init_core_hotplug(struct pglist_data *pgdat)
1504{
1505 int nid = pgdat->node_id;
1506 enum zone_type z;
1507 int cpu;
1508
1509 pgdat_init_internals(pgdat);
1510
1511 if (pgdat->per_cpu_nodestats == &boot_nodestats)
1512 pgdat->per_cpu_nodestats = alloc_percpu(struct per_cpu_nodestat);
1513
1514 /*
1515 * Reset the nr_zones, order and highest_zoneidx before reuse.
1516 * Note that kswapd will init kswapd_highest_zoneidx properly
1517 * when it starts in the near future.
1518 */
1519 pgdat->nr_zones = 0;
1520 pgdat->kswapd_order = 0;
1521 pgdat->kswapd_highest_zoneidx = 0;
1522 pgdat->node_start_pfn = 0;
1523 pgdat->node_present_pages = 0;
1524
1525 for_each_online_cpu(cpu) {
1526 struct per_cpu_nodestat *p;
1527
1528 p = per_cpu_ptr(pgdat->per_cpu_nodestats, cpu);
1529 memset(p, 0, sizeof(*p));
1530 }
1531
1532 /*
1533 * When memory is hot-added, all the memory is in offline state. So
1534 * clear all zones' present_pages and managed_pages because they will
1535 * be updated in online_pages() and offline_pages().
1536 */
1537 for (z = 0; z < MAX_NR_ZONES; z++) {
1538 struct zone *zone = pgdat->node_zones + z;
1539
1540 zone->present_pages = 0;
1541 zone_init_internals(zone, z, nid, 0);
1542 }
1543}
1544#endif
1545
1546/*
1547 * Set up the zone data structures:
1548 * - mark all pages reserved
1549 * - mark all memory queues empty
1550 * - clear the memory bitmaps
1551 *
1552 * NOTE: pgdat should get zeroed by caller.
1553 * NOTE: this function is only called during early init.
1554 */
1555static void __init free_area_init_core(struct pglist_data *pgdat)
1556{
1557 enum zone_type j;
1558 int nid = pgdat->node_id;
1559
1560 pgdat_init_internals(pgdat);
1561 pgdat->per_cpu_nodestats = &boot_nodestats;
1562
1563 for (j = 0; j < MAX_NR_ZONES; j++) {
1564 struct zone *zone = pgdat->node_zones + j;
1565 unsigned long size, freesize, memmap_pages;
1566
1567 size = zone->spanned_pages;
1568 freesize = zone->present_pages;
1569
1570 /*
1571 * Adjust freesize so that it accounts for how much memory
1572 * is used by this zone for memmap. This affects the watermark
1573 * and per-cpu initialisations
1574 */
1575 memmap_pages = calc_memmap_size(size, freesize);
1576 if (!is_highmem_idx(j)) {
1577 if (freesize >= memmap_pages) {
1578 freesize -= memmap_pages;
1579 if (memmap_pages)
1580 pr_debug(" %s zone: %lu pages used for memmap\n",
1581 zone_names[j], memmap_pages);
1582 } else
1583 pr_warn(" %s zone: %lu memmap pages exceeds freesize %lu\n",
1584 zone_names[j], memmap_pages, freesize);
1585 }
1586
1587 /* Account for reserved pages */
1588 if (j == 0 && freesize > dma_reserve) {
1589 freesize -= dma_reserve;
1590 pr_debug(" %s zone: %lu pages reserved\n", zone_names[0], dma_reserve);
1591 }
1592
1593 if (!is_highmem_idx(j))
1594 nr_kernel_pages += freesize;
1595 /* Charge for highmem memmap if there are enough kernel pages */
1596 else if (nr_kernel_pages > memmap_pages * 2)
1597 nr_kernel_pages -= memmap_pages;
1598 nr_all_pages += freesize;
1599
1600 /*
 1601 * Set an approximate value for lowmem here; it will be adjusted
1602 * when the bootmem allocator frees pages into the buddy system.
1603 * And all highmem pages will be managed by the buddy system.
1604 */
1605 zone_init_internals(zone, j, nid, freesize);
1606
1607 if (!size)
1608 continue;
1609
1610 setup_usemap(zone);
1611 init_currently_empty_zone(zone, zone->zone_start_pfn, size);
1612 }
1613}
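/*
 * Editor's note, not part of the original source: a worked example of the
 * accounting above, assuming 4 KiB pages and a 64-byte struct page. A
 * fully populated 1 GiB ZONE_NORMAL has size = freesize = 262144 pages;
 * calc_memmap_size() returns 4096 pages of memmap, so freesize drops to
 * 258048 pages, which is what zone_init_internals() records as the
 * provisional managed_pages until the boot allocator releases its pages
 * to the buddy allocator.
 */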
1614
1615void __init *memmap_alloc(phys_addr_t size, phys_addr_t align,
1616 phys_addr_t min_addr, int nid, bool exact_nid)
1617{
1618 void *ptr;
1619
1620 if (exact_nid)
1621 ptr = memblock_alloc_exact_nid_raw(size, align, min_addr,
1622 MEMBLOCK_ALLOC_ACCESSIBLE,
1623 nid);
1624 else
1625 ptr = memblock_alloc_try_nid_raw(size, align, min_addr,
1626 MEMBLOCK_ALLOC_ACCESSIBLE,
1627 nid);
1628
1629 if (ptr && size > 0)
1630 page_init_poison(ptr, size);
1631
1632 return ptr;
1633}
1634
1635#ifdef CONFIG_FLATMEM
1636static void __init alloc_node_mem_map(struct pglist_data *pgdat)
1637{
1638 unsigned long start, offset, size, end;
1639 struct page *map;
1640
1641 /* Skip empty nodes */
1642 if (!pgdat->node_spanned_pages)
1643 return;
1644
1645 start = pgdat->node_start_pfn & ~(MAX_ORDER_NR_PAGES - 1);
1646 offset = pgdat->node_start_pfn - start;
9420f89d 1647 /*
5e0a760b 1648 * The zone's endpoints aren't required to be MAX_PAGE_ORDER
 1649 * aligned, but the node_mem_map endpoints must be, in order
1650 * for the buddy allocator to function correctly.
9420f89d 1651 */
1652 end = ALIGN(pgdat_end_pfn(pgdat), MAX_ORDER_NR_PAGES);
1653 size = (end - start) * sizeof(struct page);
1654 map = memmap_alloc(size, SMP_CACHE_BYTES, MEMBLOCK_LOW_LIMIT,
1655 pgdat->node_id, false);
1656 if (!map)
1657 panic("Failed to allocate %ld bytes for node %d memory map\n",
1658 size, pgdat->node_id);
1659 pgdat->node_mem_map = map + offset;
1660 pr_debug("%s: node %d, pgdat %08lx, node_mem_map %08lx\n",
1661 __func__, pgdat->node_id, (unsigned long)pgdat,
1662 (unsigned long)pgdat->node_mem_map);
1663#ifndef CONFIG_NUMA
1664 /* the global mem_map is just set as node 0's */
1665 if (pgdat == NODE_DATA(0)) {
1666 mem_map = NODE_DATA(0)->node_mem_map;
1667 if (page_to_pfn(mem_map) != pgdat->node_start_pfn)
1668 mem_map -= offset;
1669 }
1670#endif
1671}
1672#else
1673static inline void alloc_node_mem_map(struct pglist_data *pgdat) { }
1674#endif /* CONFIG_FLATMEM */
1675
1676/**
1677 * get_pfn_range_for_nid - Return the start and end page frames for a node
1678 * @nid: The nid to return the range for. If MAX_NUMNODES, the min and max PFN are returned.
1679 * @start_pfn: Passed by reference. On return, it will have the node start_pfn.
1680 * @end_pfn: Passed by reference. On return, it will have the node end_pfn.
1681 *
1682 * It returns the start and end page frame of a node based on information
1683 * provided by memblock_set_node(). If called for a node
3a29280a 1684 * with no available memory, the start and end PFNs will be 0.
1685 */
1686void __init get_pfn_range_for_nid(unsigned int nid,
1687 unsigned long *start_pfn, unsigned long *end_pfn)
1688{
1689 unsigned long this_start_pfn, this_end_pfn;
1690 int i;
1691
1692 *start_pfn = -1UL;
1693 *end_pfn = 0;
1694
1695 for_each_mem_pfn_range(i, nid, &this_start_pfn, &this_end_pfn, NULL) {
1696 *start_pfn = min(*start_pfn, this_start_pfn);
1697 *end_pfn = max(*end_pfn, this_end_pfn);
1698 }
1699
1700 if (*start_pfn == -1UL)
1701 *start_pfn = 0;
1702}
1703
1704static void __init free_area_init_node(int nid)
1705{
1706 pg_data_t *pgdat = NODE_DATA(nid);
1707 unsigned long start_pfn = 0;
1708 unsigned long end_pfn = 0;
1709
1710 /* pg_data_t should be reset to zero when it's allocated */
1711 WARN_ON(pgdat->nr_zones || pgdat->kswapd_highest_zoneidx);
1712
1713 get_pfn_range_for_nid(nid, &start_pfn, &end_pfn);
1714
1715 pgdat->node_id = nid;
1716 pgdat->node_start_pfn = start_pfn;
1717 pgdat->per_cpu_nodestats = NULL;
1718
1719 if (start_pfn != end_pfn) {
1720 pr_info("Initmem setup node %d [mem %#018Lx-%#018Lx]\n", nid,
1721 (u64)start_pfn << PAGE_SHIFT,
1722 end_pfn ? ((u64)end_pfn << PAGE_SHIFT) - 1 : 0);
1723
1724 calculate_node_totalpages(pgdat, start_pfn, end_pfn);
1725 } else {
1726 pr_info("Initmem setup node %d as memoryless\n", nid);
9420f89d 1727
1728 reset_memoryless_node_totalpages(pgdat);
1729 }
1730
1731 alloc_node_mem_map(pgdat);
1732 pgdat_set_deferred_range(pgdat);
1733
1734 free_area_init_core(pgdat);
1735 lru_gen_init_pgdat(pgdat);
1736}
1737
1738/* Any regular or high memory on that node ? */
b894da04 1739static void __init check_for_memory(pg_data_t *pgdat)
1740{
1741 enum zone_type zone_type;
1742
1743 for (zone_type = 0; zone_type <= ZONE_MOVABLE - 1; zone_type++) {
1744 struct zone *zone = &pgdat->node_zones[zone_type];
1745 if (populated_zone(zone)) {
1746 if (IS_ENABLED(CONFIG_HIGHMEM))
91ff4d75 1747 node_set_state(pgdat->node_id, N_HIGH_MEMORY);
9420f89d 1748 if (zone_type <= ZONE_NORMAL)
91ff4d75 1749 node_set_state(pgdat->node_id, N_NORMAL_MEMORY);
1750 break;
1751 }
1752 }
1753}
1754
1755#if MAX_NUMNODES > 1
1756/*
1757 * Figure out the number of possible node ids.
1758 */
1759void __init setup_nr_node_ids(void)
1760{
1761 unsigned int highest;
1762
1763 highest = find_last_bit(node_possible_map.bits, MAX_NUMNODES);
1764 nr_node_ids = highest + 1;
1765}
1766#endif
1767
1768/*
 1769 * Some architectures, e.g. ARC, may have ZONE_HIGHMEM below ZONE_NORMAL. For
 1770 * such cases we allow max_zone_pfn to be sorted in descending order.
1771 */
5f300fd5 1772static bool arch_has_descending_max_zone_pfns(void)
9420f89d 1773{
5f300fd5 1774 return IS_ENABLED(CONFIG_ARC) && !IS_ENABLED(CONFIG_ARC_HAS_PAE40);
1775}
1776
1777/**
1778 * free_area_init - Initialise all pg_data_t and zone data
1779 * @max_zone_pfn: an array of max PFNs for each zone
1780 *
1781 * This will call free_area_init_node() for each active node in the system.
1782 * Using the page ranges provided by memblock_set_node(), the size of each
 1783 * zone in each node and their holes is calculated. If the maximum PFNs of
 1784 * two adjacent zones match, the higher zone is assumed to be empty.
1785 * For example, if arch_max_dma_pfn == arch_max_dma32_pfn, it is assumed
1786 * that arch_max_dma32_pfn has no pages. It is also assumed that a zone
1787 * starts where the previous one ended. For example, ZONE_DMA32 starts
1788 * at arch_max_dma_pfn.
1789 */
1790void __init free_area_init(unsigned long *max_zone_pfn)
1791{
1792 unsigned long start_pfn, end_pfn;
1793 int i, nid, zone;
1794 bool descending;
1795
1796 /* Record where the zone boundaries are */
1797 memset(arch_zone_lowest_possible_pfn, 0,
1798 sizeof(arch_zone_lowest_possible_pfn));
1799 memset(arch_zone_highest_possible_pfn, 0,
1800 sizeof(arch_zone_highest_possible_pfn));
1801
1802 start_pfn = PHYS_PFN(memblock_start_of_DRAM());
1803 descending = arch_has_descending_max_zone_pfns();
1804
1805 for (i = 0; i < MAX_NR_ZONES; i++) {
1806 if (descending)
1807 zone = MAX_NR_ZONES - i - 1;
1808 else
1809 zone = i;
1810
1811 if (zone == ZONE_MOVABLE)
1812 continue;
1813
1814 end_pfn = max(max_zone_pfn[zone], start_pfn);
1815 arch_zone_lowest_possible_pfn[zone] = start_pfn;
1816 arch_zone_highest_possible_pfn[zone] = end_pfn;
1817
1818 start_pfn = end_pfn;
1819 }
1820
1821 /* Find the PFNs that ZONE_MOVABLE begins at in each node */
1822 memset(zone_movable_pfn, 0, sizeof(zone_movable_pfn));
1823 find_zone_movable_pfns_for_nodes();
1824
1825 /* Print out the zone ranges */
1826 pr_info("Zone ranges:\n");
1827 for (i = 0; i < MAX_NR_ZONES; i++) {
1828 if (i == ZONE_MOVABLE)
1829 continue;
1830 pr_info(" %-8s ", zone_names[i]);
1831 if (arch_zone_lowest_possible_pfn[i] ==
1832 arch_zone_highest_possible_pfn[i])
1833 pr_cont("empty\n");
1834 else
1835 pr_cont("[mem %#018Lx-%#018Lx]\n",
1836 (u64)arch_zone_lowest_possible_pfn[i]
1837 << PAGE_SHIFT,
1838 ((u64)arch_zone_highest_possible_pfn[i]
1839 << PAGE_SHIFT) - 1);
1840 }
1841
1842 /* Print out the PFNs ZONE_MOVABLE begins at in each node */
1843 pr_info("Movable zone start for each node\n");
1844 for (i = 0; i < MAX_NUMNODES; i++) {
1845 if (zone_movable_pfn[i])
1846 pr_info(" Node %d: %#018Lx\n", i,
1847 (u64)zone_movable_pfn[i] << PAGE_SHIFT);
1848 }
1849
1850 /*
1851 * Print out the early node map, and initialize the
1852 * subsection-map relative to active online memory ranges to
1853 * enable future "sub-section" extensions of the memory map.
1854 */
1855 pr_info("Early memory node ranges\n");
1856 for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) {
1857 pr_info(" node %3d: [mem %#018Lx-%#018Lx]\n", nid,
1858 (u64)start_pfn << PAGE_SHIFT,
1859 ((u64)end_pfn << PAGE_SHIFT) - 1);
1860 subsection_map_init(start_pfn, end_pfn - start_pfn);
1861 }
1862
1863 /* Initialise every node */
1864 mminit_verify_pageflags_layout();
1865 setup_nr_node_ids();
1866 set_pageblock_order();
1867
1868 for_each_node(nid) {
1869 pg_data_t *pgdat;
1870
1871 if (!node_online(nid)) {
1872 /* Allocator not initialized yet */
1873 pgdat = arch_alloc_nodedata(nid);
1874 if (!pgdat)
1875 panic("Cannot allocate %zuB for node %d.\n",
1876 sizeof(*pgdat), nid);
1877 arch_refresh_nodedata(nid, pgdat);
837c2ba5 1878 free_area_init_node(nid);
1879
1880 /*
1881 * We do not want to confuse userspace by sysfs
 1882 * files/directories for a node without any memory
1883 * attached to it, so this node is not marked as
1884 * N_MEMORY and not marked online so that no sysfs
1885 * hierarchy will be created via register_one_node for
1886 * it. The pgdat will get fully initialized by
1887 * hotadd_init_pgdat() when memory is hotplugged into
1888 * this node.
1889 */
1890 continue;
1891 }
1892
1893 pgdat = NODE_DATA(nid);
1894 free_area_init_node(nid);
1895
1896 /* Any memory on that node */
1897 if (pgdat->node_present_pages)
1898 node_set_state(nid, N_MEMORY);
91ff4d75 1899 check_for_memory(pgdat);
1900 }
1901
1902 memmap_init();
1903
1904 /* disable hash distribution for systems with a single node */
1905 fixup_hashdist();
1906}
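/*
 * Illustrative sketch, not part of the original file: the typical shape of an
 * architecture's zone sizing code, which fills max_zone_pfn[] and hands it to
 * free_area_init(). The function name and the ZONE_DMA32 cutoff are
 * assumptions for illustration; real limits are architecture specific.
 */
static void __init __maybe_unused example_zone_sizes_init(void)
{
	unsigned long max_zone_pfns[MAX_NR_ZONES] = { 0 };

#ifdef CONFIG_ZONE_DMA32
	/* Cap ZONE_DMA32 at the 4GiB boundary (or all memory, if less). */
	max_zone_pfns[ZONE_DMA32] = min(max_pfn, 1UL << (32 - PAGE_SHIFT));
#endif
	/* Everything else ends up in ZONE_NORMAL. */
	max_zone_pfns[ZONE_NORMAL] = max_pfn;

	free_area_init(max_zone_pfns);
}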
1907
1908/**
1909 * node_map_pfn_alignment - determine the maximum internode alignment
1910 *
1911 * This function should be called after node map is populated and sorted.
1912 * It calculates the maximum power of two alignment which can distinguish
1913 * all the nodes.
1914 *
1915 * For example, if all nodes are 1GiB and aligned to 1GiB, the return value
 1916 * would indicate 1GiB alignment with (1 << (30 - PAGE_SHIFT)). If the nodes
 1917 * are shifted by 256MiB, the returned alignment is 256MiB. Note that if only
 1918 * the last node is shifted, 1GiB is enough and this function will indicate so.
1919 *
1920 * This is used to test whether pfn -> nid mapping of the chosen memory
1921 * model has fine enough granularity to avoid incorrect mapping for the
1922 * populated node map.
1923 *
 1924 * Return: the determined alignment in PFNs. 0 if there is no alignment
1925 * requirement (single node).
1926 */
1927unsigned long __init node_map_pfn_alignment(void)
1928{
1929 unsigned long accl_mask = 0, last_end = 0;
1930 unsigned long start, end, mask;
1931 int last_nid = NUMA_NO_NODE;
1932 int i, nid;
1933
1934 for_each_mem_pfn_range(i, MAX_NUMNODES, &start, &end, &nid) {
1935 if (!start || last_nid < 0 || last_nid == nid) {
1936 last_nid = nid;
1937 last_end = end;
1938 continue;
1939 }
1940
1941 /*
1942 * Start with a mask granular enough to pin-point to the
1943 * start pfn and tick off bits one-by-one until it becomes
1944 * too coarse to separate the current node from the last.
1945 */
1946 mask = ~((1 << __ffs(start)) - 1);
1947 while (mask && last_end <= (start & (mask << 1)))
1948 mask <<= 1;
1949
1950 /* accumulate all internode masks */
1951 accl_mask |= mask;
1952 }
1953
1954 /* convert mask to number of pages */
1955 return ~accl_mask + 1;
1956}
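/*
 * Illustrative sketch, not part of the original file: how NUMA setup code can
 * use node_map_pfn_alignment() to check that the chosen memory model can
 * still tell nodes apart. This mirrors the check done on x86 and assumes
 * SPARSEMEM, where PAGES_PER_SECTION is defined.
 */
static bool __init __maybe_unused example_numa_alignment_ok(void)
{
	unsigned long pfn_align = node_map_pfn_alignment();

	if (pfn_align && pfn_align < PAGES_PER_SECTION) {
		pr_warn("Node alignment %luMB < min %luMB, rejecting NUMA config\n",
			(unsigned long)(PFN_PHYS(pfn_align) >> 20),
			(unsigned long)(PFN_PHYS(PAGES_PER_SECTION) >> 20));
		return false;
	}
	return true;
}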
1957
1958#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
1959static void __init deferred_free_range(unsigned long pfn,
1960 unsigned long nr_pages)
1961{
1962 struct page *page;
1963 unsigned long i;
1964
1965 if (!nr_pages)
1966 return;
1967
1968 page = pfn_to_page(pfn);
1969
1970 /* Free a large naturally-aligned chunk if possible */
1971 if (nr_pages == MAX_ORDER_NR_PAGES && IS_MAX_ORDER_ALIGNED(pfn)) {
1972 for (i = 0; i < nr_pages; i += pageblock_nr_pages)
1973 set_pageblock_migratetype(page + i, MIGRATE_MOVABLE);
5e0a760b 1974 __free_pages_core(page, MAX_PAGE_ORDER);
1975 return;
1976 }
1977
5e0a760b 1978 /* Accept chunks smaller than MAX_PAGE_ORDER upfront */
1979 accept_memory(PFN_PHYS(pfn), PFN_PHYS(pfn + nr_pages));
1980
1981 for (i = 0; i < nr_pages; i++, page++, pfn++) {
1982 if (pageblock_aligned(pfn))
1983 set_pageblock_migratetype(page, MIGRATE_MOVABLE);
1984 __free_pages_core(page, 0);
1985 }
1986}
1987
1988/* Completion tracking for deferred_init_memmap() threads */
1989static atomic_t pgdat_init_n_undone __initdata;
1990static __initdata DECLARE_COMPLETION(pgdat_init_all_done_comp);
1991
1992static inline void __init pgdat_init_report_one_done(void)
1993{
1994 if (atomic_dec_and_test(&pgdat_init_n_undone))
1995 complete(&pgdat_init_all_done_comp);
1996}
1997
1998/*
1999 * Returns true if page needs to be initialized or freed to buddy allocator.
2000 *
2001 * We check if a current MAX_PAGE_ORDER block is valid by only checking the
2002 * validity of the head pfn.
2003 */
2004static inline bool __init deferred_pfn_valid(unsigned long pfn)
2005{
3f6dac0f 2006 if (IS_MAX_ORDER_ALIGNED(pfn) && !pfn_valid(pfn))
2007 return false;
2008 return true;
2009}
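/*
 * Editorial note, not part of the original file: with the common 4K page size
 * and MAX_PAGE_ORDER of 10, MAX_ORDER_NR_PAGES is 1024, so a "block" here is
 * 4MiB and pfn_valid() runs once per 4MiB of deferred memory, not per page.
 */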
2010
2011/*
2012 * Free pages to buddy allocator. Try to free aligned pages in
3f6dac0f 2013 * MAX_ORDER_NR_PAGES sizes.
2014 */
2015static void __init deferred_free_pages(unsigned long pfn,
2016 unsigned long end_pfn)
2017{
2018 unsigned long nr_free = 0;
2019
2020 for (; pfn < end_pfn; pfn++) {
2021 if (!deferred_pfn_valid(pfn)) {
2022 deferred_free_range(pfn - nr_free, nr_free);
2023 nr_free = 0;
3f6dac0f 2024 } else if (IS_MAX_ORDER_ALIGNED(pfn)) {
2025 deferred_free_range(pfn - nr_free, nr_free);
2026 nr_free = 1;
2027 } else {
2028 nr_free++;
2029 }
2030 }
2031 /* Free the last block of pages to allocator */
2032 deferred_free_range(pfn - nr_free, nr_free);
2033}
2034
2035/*
2036 * Initialize struct pages. We minimize pfn page lookups and scheduler checks
3f6dac0f 2037 * by performing it only once every MAX_ORDER_NR_PAGES.
2038 * Return number of pages initialized.
2039 */
2040static unsigned long __init deferred_init_pages(struct zone *zone,
2041 unsigned long pfn,
2042 unsigned long end_pfn)
2043{
2044 int nid = zone_to_nid(zone);
2045 unsigned long nr_pages = 0;
2046 int zid = zone_idx(zone);
2047 struct page *page = NULL;
2048
2049 for (; pfn < end_pfn; pfn++) {
2050 if (!deferred_pfn_valid(pfn)) {
2051 page = NULL;
2052 continue;
3f6dac0f 2053 } else if (!page || IS_MAX_ORDER_ALIGNED(pfn)) {
2054 page = pfn_to_page(pfn);
2055 } else {
2056 page++;
2057 }
2058 __init_single_page(page, pfn, zid, nid);
2059 nr_pages++;
2060 }
 2061 return nr_pages;
2062}
2063
2064/*
2065 * This function is meant to pre-load the iterator for the zone init.
2066 * Specifically it walks through the ranges until we are caught up to the
2067 * first_init_pfn value and exits there. If we never encounter the value we
2068 * return false indicating there are no valid ranges left.
2069 */
2070static bool __init
2071deferred_init_mem_pfn_range_in_zone(u64 *i, struct zone *zone,
2072 unsigned long *spfn, unsigned long *epfn,
2073 unsigned long first_init_pfn)
2074{
2075 u64 j;
2076
2077 /*
2078 * Start out by walking through the ranges in this zone that have
2079 * already been initialized. We don't need to do anything with them
2080 * so we just need to flush them out of the system.
2081 */
2082 for_each_free_mem_pfn_range_in_zone(j, zone, spfn, epfn) {
2083 if (*epfn <= first_init_pfn)
2084 continue;
2085 if (*spfn < first_init_pfn)
2086 *spfn = first_init_pfn;
2087 *i = j;
2088 return true;
2089 }
2090
2091 return false;
2092}
2093
2094/*
2095 * Initialize and free pages. We do it in two loops: first we initialize
2096 * struct page, then free to buddy allocator, because while we are
2097 * freeing pages we can access pages that are ahead (computing buddy
2098 * page in __free_one_page()).
2099 *
2100 * In order to try and keep some memory in the cache we have the loop
2101 * broken along max page order boundaries. This way we will not cause
2102 * any issues with the buddy page computation.
2103 */
2104static unsigned long __init
2105deferred_init_maxorder(u64 *i, struct zone *zone, unsigned long *start_pfn,
2106 unsigned long *end_pfn)
2107{
2108 unsigned long mo_pfn = ALIGN(*start_pfn + 1, MAX_ORDER_NR_PAGES);
2109 unsigned long spfn = *start_pfn, epfn = *end_pfn;
2110 unsigned long nr_pages = 0;
2111 u64 j = *i;
2112
2113 /* First we loop through and initialize the page values */
2114 for_each_free_mem_pfn_range_in_zone_from(j, zone, start_pfn, end_pfn) {
2115 unsigned long t;
2116
2117 if (mo_pfn <= *start_pfn)
2118 break;
2119
2120 t = min(mo_pfn, *end_pfn);
2121 nr_pages += deferred_init_pages(zone, *start_pfn, t);
2122
2123 if (mo_pfn < *end_pfn) {
2124 *start_pfn = mo_pfn;
2125 break;
2126 }
2127 }
2128
2129 /* Reset values and now loop through freeing pages as needed */
2130 swap(j, *i);
2131
2132 for_each_free_mem_pfn_range_in_zone_from(j, zone, &spfn, &epfn) {
2133 unsigned long t;
2134
2135 if (mo_pfn <= spfn)
2136 break;
2137
2138 t = min(mo_pfn, epfn);
2139 deferred_free_pages(spfn, t);
2140
2141 if (mo_pfn <= epfn)
2142 break;
2143 }
2144
2145 return nr_pages;
2146}
2147
2148static void __init
2149deferred_init_memmap_chunk(unsigned long start_pfn, unsigned long end_pfn,
2150 void *arg)
2151{
2152 unsigned long spfn, epfn;
2153 struct zone *zone = arg;
2154 u64 i;
2155
2156 deferred_init_mem_pfn_range_in_zone(&i, zone, &spfn, &epfn, start_pfn);
2157
2158 /*
2159 * Initialize and free pages in MAX_PAGE_ORDER sized increments so that
2160 * we can avoid introducing any issues with the buddy allocator.
2161 */
2162 while (spfn < end_pfn) {
2163 deferred_init_maxorder(&i, zone, &spfn, &epfn);
2164 cond_resched();
2165 }
2166}
2167
2168/* An arch may override for more concurrency. */
2169__weak int __init
2170deferred_page_init_max_threads(const struct cpumask *node_cpumask)
2171{
2172 return 1;
2173}
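/*
 * Illustrative sketch, not part of the original file: an architecture that
 * wants more concurrency provides a non-weak definition in its own code
 * (outside this file), for example one thread per CPU of the node, similar
 * in spirit to the x86 override.
 */
__init int deferred_page_init_max_threads(const struct cpumask *node_cpumask)
{
	return max(cpumask_weight(node_cpumask), 1U);
}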
2174
2175/* Initialise remaining memory on a node */
2176static int __init deferred_init_memmap(void *data)
2177{
2178 pg_data_t *pgdat = data;
2179 const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id);
2180 unsigned long spfn = 0, epfn = 0;
2181 unsigned long first_init_pfn, flags;
2182 unsigned long start = jiffies;
2183 struct zone *zone;
2184 int zid, max_threads;
2185 u64 i;
2186
2187 /* Bind memory initialisation thread to a local node if possible */
2188 if (!cpumask_empty(cpumask))
2189 set_cpus_allowed_ptr(current, cpumask);
2190
2191 pgdat_resize_lock(pgdat, &flags);
2192 first_init_pfn = pgdat->first_deferred_pfn;
2193 if (first_init_pfn == ULONG_MAX) {
2194 pgdat_resize_unlock(pgdat, &flags);
2195 pgdat_init_report_one_done();
2196 return 0;
2197 }
2198
2199 /* Sanity check boundaries */
2200 BUG_ON(pgdat->first_deferred_pfn < pgdat->node_start_pfn);
2201 BUG_ON(pgdat->first_deferred_pfn > pgdat_end_pfn(pgdat));
2202 pgdat->first_deferred_pfn = ULONG_MAX;
2203
2204 /*
2205 * Once we unlock here, the zone cannot be grown anymore, thus if an
2206 * interrupt thread must allocate this early in boot, zone must be
2207 * pre-grown prior to start of deferred page initialization.
2208 */
2209 pgdat_resize_unlock(pgdat, &flags);
2210
2211 /* Only the highest zone is deferred so find it */
2212 for (zid = 0; zid < MAX_NR_ZONES; zid++) {
2213 zone = pgdat->node_zones + zid;
2214 if (first_init_pfn < zone_end_pfn(zone))
2215 break;
2216 }
2217
2218 /* If the zone is empty somebody else may have cleared out the zone */
2219 if (!deferred_init_mem_pfn_range_in_zone(&i, zone, &spfn, &epfn,
2220 first_init_pfn))
2221 goto zone_empty;
2222
2223 max_threads = deferred_page_init_max_threads(cpumask);
2224
2225 while (spfn < epfn) {
2226 unsigned long epfn_align = ALIGN(epfn, PAGES_PER_SECTION);
2227 struct padata_mt_job job = {
2228 .thread_fn = deferred_init_memmap_chunk,
2229 .fn_arg = zone,
2230 .start = spfn,
2231 .size = epfn_align - spfn,
2232 .align = PAGES_PER_SECTION,
2233 .min_chunk = PAGES_PER_SECTION,
2234 .max_threads = max_threads,
eb522866 2235 .numa_aware = false,
2236 };
2237
2238 padata_do_multithreaded(&job);
2239 deferred_init_mem_pfn_range_in_zone(&i, zone, &spfn, &epfn,
2240 epfn_align);
2241 }
2242zone_empty:
2243 /* Sanity check that the next zone really is unpopulated */
2244 WARN_ON(++zid < MAX_NR_ZONES && populated_zone(++zone));
2245
2246 pr_info("node %d deferred pages initialised in %ums\n",
2247 pgdat->node_id, jiffies_to_msecs(jiffies - start));
2248
2249 pgdat_init_report_one_done();
2250 return 0;
2251}
2252
2253/*
2254 * If this zone has deferred pages, try to grow it by initializing enough
2255 * deferred pages to satisfy the allocation specified by order, rounded up to
2256 * the nearest PAGES_PER_SECTION boundary. So we're adding memory in increments
2257 * of SECTION_SIZE bytes by initializing struct pages in increments of
2258 * PAGES_PER_SECTION * sizeof(struct page) bytes.
2259 *
2260 * Return true when zone was grown, otherwise return false. We return true even
2261 * when we grow less than requested, to let the caller decide if there are
2262 * enough pages to satisfy the allocation.
2263 *
2264 * Note: We use noinline because this function is needed only during boot, and
2265 * it is called from a __ref function _deferred_grow_zone. This way we are
2266 * making sure that it is not inlined into permanent text section.
2267 */
2268bool __init deferred_grow_zone(struct zone *zone, unsigned int order)
2269{
2270 unsigned long nr_pages_needed = ALIGN(1 << order, PAGES_PER_SECTION);
2271 pg_data_t *pgdat = zone->zone_pgdat;
2272 unsigned long first_deferred_pfn = pgdat->first_deferred_pfn;
2273 unsigned long spfn, epfn, flags;
2274 unsigned long nr_pages = 0;
2275 u64 i;
2276
2277 /* Only the last zone may have deferred pages */
2278 if (zone_end_pfn(zone) != pgdat_end_pfn(pgdat))
2279 return false;
2280
2281 pgdat_resize_lock(pgdat, &flags);
2282
2283 /*
2284 * If someone grew this zone while we were waiting for spinlock, return
2285 * true, as there might be enough pages already.
2286 */
2287 if (first_deferred_pfn != pgdat->first_deferred_pfn) {
2288 pgdat_resize_unlock(pgdat, &flags);
2289 return true;
2290 }
2291
2292 /* If the zone is empty somebody else may have cleared out the zone */
2293 if (!deferred_init_mem_pfn_range_in_zone(&i, zone, &spfn, &epfn,
2294 first_deferred_pfn)) {
2295 pgdat->first_deferred_pfn = ULONG_MAX;
2296 pgdat_resize_unlock(pgdat, &flags);
2297 /* Retry only once. */
2298 return first_deferred_pfn != ULONG_MAX;
2299 }
2300
2301 /*
5e0a760b 2302 * Initialize and free pages in MAX_PAGE_ORDER sized increments so
2303 * that we can avoid introducing any issues with the buddy
2304 * allocator.
2305 */
2306 while (spfn < epfn) {
2307 /* update our first deferred PFN for this section */
2308 first_deferred_pfn = spfn;
2309
2310 nr_pages += deferred_init_maxorder(&i, zone, &spfn, &epfn);
2311 touch_nmi_watchdog();
2312
2313 /* We should only stop along section boundaries */
2314 if ((first_deferred_pfn ^ spfn) < PAGES_PER_SECTION)
2315 continue;
2316
2317 /* If our quota has been met we can stop here */
2318 if (nr_pages >= nr_pages_needed)
2319 break;
2320 }
2321
2322 pgdat->first_deferred_pfn = spfn;
2323 pgdat_resize_unlock(pgdat, &flags);
2324
2325 return nr_pages > 0;
2326}
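/*
 * Illustrative sketch, not part of the original file: the page allocator is
 * expected to reach deferred_grow_zone() through a thin __ref wrapper
 * (roughly the _deferred_grow_zone() mentioned in the comment above), so the
 * call from permanent text into __init code is explicitly annotated:
 *
 *	static bool __ref _deferred_grow_zone(struct zone *zone,
 *					      unsigned int order)
 *	{
 *		return deferred_grow_zone(zone, order);
 *	}
 */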
2327
2328#endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */
2329
2330#ifdef CONFIG_CMA
2331void __init init_cma_reserved_pageblock(struct page *page)
2332{
2333 unsigned i = pageblock_nr_pages;
2334 struct page *p = page;
2335
2336 do {
2337 __ClearPageReserved(p);
2338 set_page_count(p, 0);
2339 } while (++p, --i);
2340
2341 set_pageblock_migratetype(page, MIGRATE_CMA);
2342 set_page_refcounted(page);
2343 __free_pages(page, pageblock_order);
2344
2345 adjust_managed_page_count(page, pageblock_nr_pages);
2346 page_zone(page)->cma_pages += pageblock_nr_pages;
2347}
2348#endif
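/*
 * Editorial note, not part of the original file: pageblock_order typically
 * matches the huge page order, e.g. 9 on x86-64 with 4K pages, so each CMA
 * pageblock released above is 512 pages (2MiB) handed to the buddy allocator
 * as a single MIGRATE_CMA unit.
 */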
2349
2350void set_zone_contiguous(struct zone *zone)
2351{
2352 unsigned long block_start_pfn = zone->zone_start_pfn;
2353 unsigned long block_end_pfn;
2354
2355 block_end_pfn = pageblock_end_pfn(block_start_pfn);
2356 for (; block_start_pfn < zone_end_pfn(zone);
2357 block_start_pfn = block_end_pfn,
2358 block_end_pfn += pageblock_nr_pages) {
2359
2360 block_end_pfn = min(block_end_pfn, zone_end_pfn(zone));
2361
2362 if (!__pageblock_pfn_to_page(block_start_pfn,
2363 block_end_pfn, zone))
2364 return;
2365 cond_resched();
2366 }
2367
2368 /* We confirm that there is no hole */
2369 zone->contiguous = true;
2370}
2371
2372void __init page_alloc_init_late(void)
2373{
2374 struct zone *zone;
2375 int nid;
2376
2377#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
2378
2379 /* There will be num_node_state(N_MEMORY) threads */
2380 atomic_set(&pgdat_init_n_undone, num_node_state(N_MEMORY));
2381 for_each_node_state(nid, N_MEMORY) {
2382 kthread_run(deferred_init_memmap, NODE_DATA(nid), "pgdatinit%d", nid);
2383 }
2384
2385 /* Block until all are initialised */
2386 wait_for_completion(&pgdat_init_all_done_comp);
2387
2388 /*
2389 * We initialized the rest of the deferred pages. Permanently disable
2390 * on-demand struct page initialization.
2391 */
2392 static_branch_disable(&deferred_pages);
2393
2394 /* Reinit limits that are based on free pages after the kernel is up */
2395 files_maxfiles_init();
2396#endif
2397
2398 buffer_init();
2399
2400 /* Discard memblock private memory */
2401 memblock_discard();
2402
2403 for_each_node_state(nid, N_MEMORY)
2404 shuffle_free_memory(NODE_DATA(nid));
2405
2406 for_each_populated_zone(zone)
2407 set_zone_contiguous(zone);
2408
2409 /* Initialize page ext after all struct pages are initialized. */
2410 if (deferred_struct_pages)
2411 page_ext_init();
2412
2413 page_alloc_sysctl_init();
2414}
2415
2416#ifndef __HAVE_ARCH_RESERVED_KERNEL_PAGES
2417/*
2418 * Returns the number of pages that arch has reserved but
2419 * is not known to alloc_large_system_hash().
2420 */
2421static unsigned long __init arch_reserved_kernel_pages(void)
2422{
2423 return 0;
2424}
2425#endif
2426
2427/*
2428 * Adaptive scale is meant to reduce sizes of hash tables on large memory
2429 * machines. As memory size is increased the scale is also increased but at
2430 * slower pace. Starting from ADAPT_SCALE_BASE (64G), every time memory
2431 * quadruples the scale is increased by one, which means the size of hash table
2432 * only doubles, instead of quadrupling as well.
2433 * Because 32-bit systems cannot have large physical memory, where this scaling
2434 * makes sense, it is disabled on such platforms.
2435 */
2436#if __BITS_PER_LONG > 32
2437#define ADAPT_SCALE_BASE (64ul << 30)
2438#define ADAPT_SCALE_SHIFT 2
2439#define ADAPT_SCALE_NPAGES (ADAPT_SCALE_BASE >> PAGE_SHIFT)
2440#endif
2441
2442/*
 2443 * allocate a large system hash table at boot (from memblock, vmalloc or the page allocator)
2444 * - it is assumed that the hash table must contain an exact power-of-2
2445 * quantity of entries
2446 * - limit is the number of hash buckets, not the total allocation size
2447 */
2448void *__init alloc_large_system_hash(const char *tablename,
2449 unsigned long bucketsize,
2450 unsigned long numentries,
2451 int scale,
2452 int flags,
2453 unsigned int *_hash_shift,
2454 unsigned int *_hash_mask,
2455 unsigned long low_limit,
2456 unsigned long high_limit)
2457{
2458 unsigned long long max = high_limit;
2459 unsigned long log2qty, size;
2460 void *table;
2461 gfp_t gfp_flags;
2462 bool virt;
2463 bool huge;
2464
2465 /* allow the kernel cmdline to have a say */
2466 if (!numentries) {
2467 /* round applicable memory size up to nearest megabyte */
2468 numentries = nr_kernel_pages;
2469 numentries -= arch_reserved_kernel_pages();
2470
 2471 /* Rounding up isn't necessary when PAGE_SIZE >= 1MB */
2472 if (PAGE_SIZE < SZ_1M)
2473 numentries = round_up(numentries, SZ_1M / PAGE_SIZE);
2474
2475#if __BITS_PER_LONG > 32
2476 if (!high_limit) {
2477 unsigned long adapt;
2478
2479 for (adapt = ADAPT_SCALE_NPAGES; adapt < numentries;
2480 adapt <<= ADAPT_SCALE_SHIFT)
2481 scale++;
2482 }
2483#endif
2484
2485 /* limit to 1 bucket per 2^scale bytes of low memory */
2486 if (scale > PAGE_SHIFT)
2487 numentries >>= (scale - PAGE_SHIFT);
2488 else
2489 numentries <<= (PAGE_SHIFT - scale);
2490
3fade62b 2491 if (unlikely((numentries * bucketsize) < PAGE_SIZE))
2492 numentries = PAGE_SIZE / bucketsize;
2493 }
2494 numentries = roundup_pow_of_two(numentries);
2495
2496 /* limit allocation size to 1/16 total memory by default */
2497 if (max == 0) {
2498 max = ((unsigned long long)nr_all_pages << PAGE_SHIFT) >> 4;
2499 do_div(max, bucketsize);
2500 }
2501 max = min(max, 0x80000000ULL);
2502
2503 if (numentries < low_limit)
2504 numentries = low_limit;
2505 if (numentries > max)
2506 numentries = max;
2507
2508 log2qty = ilog2(numentries);
2509
2510 gfp_flags = (flags & HASH_ZERO) ? GFP_ATOMIC | __GFP_ZERO : GFP_ATOMIC;
2511 do {
2512 virt = false;
2513 size = bucketsize << log2qty;
2514 if (flags & HASH_EARLY) {
2515 if (flags & HASH_ZERO)
2516 table = memblock_alloc(size, SMP_CACHE_BYTES);
2517 else
2518 table = memblock_alloc_raw(size,
2519 SMP_CACHE_BYTES);
5e0a760b 2520 } else if (get_order(size) > MAX_PAGE_ORDER || hashdist) {
2521 table = vmalloc_huge(size, gfp_flags);
2522 virt = true;
2523 if (table)
2524 huge = is_vm_area_hugepages(table);
2525 } else {
2526 /*
2527 * If bucketsize is not a power-of-two, we may free
2528 * some pages at the end of hash table which
2529 * alloc_pages_exact() automatically does
2530 */
2531 table = alloc_pages_exact(size, gfp_flags);
2532 kmemleak_alloc(table, size, 1, gfp_flags);
2533 }
2534 } while (!table && size > PAGE_SIZE && --log2qty);
2535
2536 if (!table)
2537 panic("Failed to allocate %s hash table\n", tablename);
2538
2539 pr_info("%s hash table entries: %ld (order: %d, %lu bytes, %s)\n",
2540 tablename, 1UL << log2qty, ilog2(size) - PAGE_SHIFT, size,
2541 virt ? (huge ? "vmalloc hugepage" : "vmalloc") : "linear");
2542
2543 if (_hash_shift)
2544 *_hash_shift = log2qty;
2545 if (_hash_mask)
2546 *_hash_mask = (1 << log2qty) - 1;
2547
2548 return table;
2549}
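/*
 * Illustrative sketch, not part of the original file: a typical boot-time
 * caller, modelled on how fs code sizes its hash tables. The table name,
 * variables and scale value are hypothetical.
 */
static struct hlist_head *example_hashtable;
static unsigned int example_hash_shift;

static void __init __maybe_unused example_hash_init(void)
{
	example_hashtable = alloc_large_system_hash("Example-cache",
						    sizeof(struct hlist_head),
						    0,         /* size from memory */
						    14,        /* 1 bucket per 16KB */
						    HASH_ZERO,
						    &example_hash_shift,
						    NULL,      /* mask not needed */
						    0, 0);     /* no explicit limits */
}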
2550
2551/**
2552 * set_dma_reserve - set the specified number of pages reserved in the first zone
2553 * @new_dma_reserve: The number of pages to mark reserved
2554 *
2555 * The per-cpu batchsize and zone watermarks are determined by managed_pages.
2556 * In the DMA zone, a significant percentage may be consumed by kernel image
2557 * and other unfreeable allocations which can skew the watermarks badly. This
2558 * function may optionally be used to account for unfreeable pages in the
2559 * first zone (e.g., ZONE_DMA). The effect will be lower watermarks and
2560 * smaller per-cpu batchsize.
2561 */
2562void __init set_dma_reserve(unsigned long new_dma_reserve)
2563{
2564 dma_reserve = new_dma_reserve;
2565}
2566
2567void __init memblock_free_pages(struct page *page, unsigned long pfn,
2568 unsigned int order)
2569{
2570
2571 if (IS_ENABLED(CONFIG_DEFERRED_STRUCT_PAGE_INIT)) {
2572 int nid = early_pfn_to_nid(pfn);
2573
2574 if (!early_page_initialised(pfn, nid))
2575 return;
2576 }
2577
2578 if (!kmsan_memblock_free_pages(page, order)) {
2579 /* KMSAN will take care of these pages. */
2580 return;
2581 }
2582 __free_pages_core(page, order);
2583}
2585DEFINE_STATIC_KEY_MAYBE(CONFIG_INIT_ON_ALLOC_DEFAULT_ON, init_on_alloc);
2586EXPORT_SYMBOL(init_on_alloc);
2587
2588DEFINE_STATIC_KEY_MAYBE(CONFIG_INIT_ON_FREE_DEFAULT_ON, init_on_free);
2589EXPORT_SYMBOL(init_on_free);
2590
2591static bool _init_on_alloc_enabled_early __read_mostly
2592 = IS_ENABLED(CONFIG_INIT_ON_ALLOC_DEFAULT_ON);
2593static int __init early_init_on_alloc(char *buf)
2594{
2595
2596 return kstrtobool(buf, &_init_on_alloc_enabled_early);
2597}
2598early_param("init_on_alloc", early_init_on_alloc);
2599
2600static bool _init_on_free_enabled_early __read_mostly
2601 = IS_ENABLED(CONFIG_INIT_ON_FREE_DEFAULT_ON);
2602static int __init early_init_on_free(char *buf)
2603{
2604 return kstrtobool(buf, &_init_on_free_enabled_early);
2605}
2606early_param("init_on_free", early_init_on_free);
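/*
 * Editorial note, not part of the original file: both knobs are boot
 * parameters, e.g. booting with "init_on_alloc=1 init_on_free=1" overrides
 * the CONFIG_INIT_ON_ALLOC_DEFAULT_ON / CONFIG_INIT_ON_FREE_DEFAULT_ON
 * defaults for that boot only.
 */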
2607
2608DEFINE_STATIC_KEY_MAYBE(CONFIG_DEBUG_VM, check_pages_enabled);
2609
2610/*
2611 * Enable static keys related to various memory debugging and hardening options.
2612 * Some override others, and depend on early params that are evaluated in the
2613 * order of appearance. So we need to first gather the full picture of what was
2614 * enabled, and then make decisions.
2615 */
2616static void __init mem_debugging_and_hardening_init(void)
2617{
2618 bool page_poisoning_requested = false;
2619 bool want_check_pages = false;
2620
2621#ifdef CONFIG_PAGE_POISONING
2622 /*
 2623 * Page poisoning stands in for debug page alloc on arches that lack
 2624 * native support. If either option is enabled, enable poisoning.
2625 */
2626 if (page_poisoning_enabled() ||
2627 (!IS_ENABLED(CONFIG_ARCH_SUPPORTS_DEBUG_PAGEALLOC) &&
2628 debug_pagealloc_enabled())) {
2629 static_branch_enable(&_page_poisoning_enabled);
2630 page_poisoning_requested = true;
2631 want_check_pages = true;
2632 }
2633#endif
2634
2635 if ((_init_on_alloc_enabled_early || _init_on_free_enabled_early) &&
2636 page_poisoning_requested) {
2637 pr_info("mem auto-init: CONFIG_PAGE_POISONING is on, "
2638 "will take precedence over init_on_alloc and init_on_free\n");
2639 _init_on_alloc_enabled_early = false;
2640 _init_on_free_enabled_early = false;
2641 }
2642
2643 if (_init_on_alloc_enabled_early) {
2644 want_check_pages = true;
2645 static_branch_enable(&init_on_alloc);
2646 } else {
2647 static_branch_disable(&init_on_alloc);
2648 }
2649
2650 if (_init_on_free_enabled_early) {
2651 want_check_pages = true;
2652 static_branch_enable(&init_on_free);
2653 } else {
2654 static_branch_disable(&init_on_free);
2655 }
2656
2657 if (IS_ENABLED(CONFIG_KMSAN) &&
2658 (_init_on_alloc_enabled_early || _init_on_free_enabled_early))
2659 pr_info("mem auto-init: please make sure init_on_alloc and init_on_free are disabled when running KMSAN\n");
2660
2661#ifdef CONFIG_DEBUG_PAGEALLOC
2662 if (debug_pagealloc_enabled()) {
2663 want_check_pages = true;
2664 static_branch_enable(&_debug_pagealloc_enabled);
2665
2666 if (debug_guardpage_minorder())
2667 static_branch_enable(&_debug_guardpage_enabled);
2668 }
2669#endif
2670
2671 /*
2672 * Any page debugging or hardening option also enables sanity checking
2673 * of struct pages being allocated or freed. With CONFIG_DEBUG_VM it's
2674 * enabled already.
2675 */
2676 if (!IS_ENABLED(CONFIG_DEBUG_VM) && want_check_pages)
2677 static_branch_enable(&check_pages_enabled);
2678}
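/*
 * Editorial note, not part of the original file: as implemented above,
 * booting with e.g. "page_poison=1 init_on_alloc=1" results in page poisoning
 * only; the init_on_alloc/init_on_free requests are dropped with an
 * informational message because poisoning already overwrites freed pages.
 */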
2679
2680/* Report memory auto-initialization states for this boot. */
2681static void __init report_meminit(void)
2682{
2683 const char *stack;
2684
2685 if (IS_ENABLED(CONFIG_INIT_STACK_ALL_PATTERN))
2686 stack = "all(pattern)";
2687 else if (IS_ENABLED(CONFIG_INIT_STACK_ALL_ZERO))
2688 stack = "all(zero)";
2689 else if (IS_ENABLED(CONFIG_GCC_PLUGIN_STRUCTLEAK_BYREF_ALL))
2690 stack = "byref_all(zero)";
2691 else if (IS_ENABLED(CONFIG_GCC_PLUGIN_STRUCTLEAK_BYREF))
2692 stack = "byref(zero)";
2693 else if (IS_ENABLED(CONFIG_GCC_PLUGIN_STRUCTLEAK_USER))
2694 stack = "__user(zero)";
2695 else
2696 stack = "off";
2697
2698 pr_info("mem auto-init: stack:%s, heap alloc:%s, heap free:%s\n",
2699 stack, want_init_on_alloc(GFP_KERNEL) ? "on" : "off",
2700 want_init_on_free() ? "on" : "off");
2701 if (want_init_on_free())
2702 pr_info("mem auto-init: clearing system memory may take some time...\n");
2703}
2704
2705static void __init mem_init_print_info(void)
2706{
2707 unsigned long physpages, codesize, datasize, rosize, bss_size;
2708 unsigned long init_code_size, init_data_size;
2709
2710 physpages = get_num_physpages();
2711 codesize = _etext - _stext;
2712 datasize = _edata - _sdata;
2713 rosize = __end_rodata - __start_rodata;
2714 bss_size = __bss_stop - __bss_start;
2715 init_data_size = __init_end - __init_begin;
2716 init_code_size = _einittext - _sinittext;
2717
2718 /*
2719 * Detect special cases and adjust section sizes accordingly:
2720 * 1) .init.* may be embedded into .data sections
2721 * 2) .init.text.* may be out of [__init_begin, __init_end],
2722 * please refer to arch/tile/kernel/vmlinux.lds.S.
2723 * 3) .rodata.* may be embedded into .text or .data sections.
2724 */
2725#define adj_init_size(start, end, size, pos, adj) \
2726 do { \
2727 if (&start[0] <= &pos[0] && &pos[0] < &end[0] && size > adj) \
2728 size -= adj; \
2729 } while (0)
2730
2731 adj_init_size(__init_begin, __init_end, init_data_size,
2732 _sinittext, init_code_size);
2733 adj_init_size(_stext, _etext, codesize, _sinittext, init_code_size);
2734 adj_init_size(_sdata, _edata, datasize, __init_begin, init_data_size);
2735 adj_init_size(_stext, _etext, codesize, __start_rodata, rosize);
2736 adj_init_size(_sdata, _edata, datasize, __start_rodata, rosize);
2737
2738#undef adj_init_size
2739
2740 pr_info("Memory: %luK/%luK available (%luK kernel code, %luK rwdata, %luK rodata, %luK init, %luK bss, %luK reserved, %luK cma-reserved"
2741#ifdef CONFIG_HIGHMEM
2742 ", %luK highmem"
2743#endif
2744 ")\n",
2745 K(nr_free_pages()), K(physpages),
2746 codesize / SZ_1K, datasize / SZ_1K, rosize / SZ_1K,
2747 (init_data_size + init_code_size) / SZ_1K, bss_size / SZ_1K,
2748 K(physpages - totalram_pages() - totalcma_pages),
2749 K(totalcma_pages)
2750#ifdef CONFIG_HIGHMEM
2751 , K(totalhigh_pages())
2752#endif
2753 );
2754}
2755
2756/*
2757 * Set up kernel memory allocators
2758 */
2759void __init mm_core_init(void)
2760{
2761 /* Initializations relying on SMP setup */
2762 build_all_zonelists(NULL);
2763 page_alloc_init_cpuhp();
2764
2765 /*
2766 * page_ext requires contiguous pages,
5e0a760b 2767 * bigger than MAX_PAGE_ORDER unless SPARSEMEM.
2768 */
2769 page_ext_init_flatmem();
f2fc4b44 2770 mem_debugging_and_hardening_init();
cabdf74e 2771 kfence_alloc_pool_and_metadata();
2772 report_meminit();
2773 kmsan_init_shadow();
2774 stack_depot_early_init();
2775 mem_init();
2776 mem_init_print_info();
2777 kmem_cache_init();
2778 /*
2779 * page_owner must be initialized after buddy is ready, and also after
2780 * slab is ready so that stack_depot_init() works properly
2781 */
2782 page_ext_init_flatmem_late();
2783 kmemleak_init();
2784 ptlock_cache_init();
2785 pgtable_cache_init();
2786 debug_objects_mem_init();
2787 vmalloc_init();
2788 /* If no deferred init page_ext now, as vmap is fully initialized */
2789 if (!deferred_struct_pages)
2790 page_ext_init();
2791 /* Should be run before the first non-init thread is created */
2792 init_espfix_bsp();
2793 /* Should be run after espfix64 is set up. */
2794 pti_init();
2795 kmsan_init_runtime();
2796 mm_cache_init();
f6bec26c 2797 execmem_init();
b7ec1bf3 2798}