mm/sparse-vmemmap.c
// SPDX-License-Identifier: GPL-2.0
/*
 * Virtual Memory Map support
 *
 * (C) 2007 sgi. Christoph Lameter.
 *
 * Virtual memory maps allow VM primitives pfn_to_page, page_to_pfn,
 * virt_to_page, page_address() to be implemented as a base offset
 * calculation without memory access.
 *
 * However, virtual mappings need a page table and TLBs. Many Linux
 * architectures already map their physical space using 1-1 mappings
 * via TLBs. For those arches the virtual memory map is essentially
 * for free if we use the same page size as the 1-1 mappings. In that
 * case the overhead consists of a few additional pages that are
 * allocated to create a view of memory for vmemmap.
 *
 * The architecture is expected to provide a vmemmap_populate() function
 * to instantiate the mapping.
 */
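/*
 * Editor's illustration (not part of this file): with the vmemmap laid out
 * as one virtually contiguous array of struct page starting at
 * VMEMMAP_START, the primitives above reduce to pointer arithmetic,
 * roughly as the generic CONFIG_SPARSEMEM_VMEMMAP memory model does it:
 *
 *	#define vmemmap			((struct page *)VMEMMAP_START)
 *	#define __pfn_to_page(pfn)	(vmemmap + (pfn))
 *	#define __page_to_pfn(page)	(unsigned long)((page) - vmemmap)
 */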
#include <linux/mm.h>
#include <linux/mmzone.h>
#include <linux/memblock.h>
#include <linux/memremap.h>
#include <linux/highmem.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
#include <linux/vmalloc.h>
#include <linux/sched.h>
#include <linux/pgtable.h>
#include <linux/bootmem_info.h>

#include <asm/dma.h>
#include <asm/pgalloc.h>
#include <asm/tlbflush.h>

#ifdef CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP
/**
 * struct vmemmap_remap_walk - walk vmemmap page table
 *
 * @remap_pte:		called for each lowest-level entry (PTE).
 * @nr_walked:		the number of PTEs walked.
 * @reuse_page:		the page which is reused for the tail vmemmap pages.
 * @reuse_addr:		the virtual address of the @reuse_page page.
 * @vmemmap_pages:	the list head of the vmemmap pages that can be freed
 *			or that pages are taken from when restoring.
 */
struct vmemmap_remap_walk {
	void (*remap_pte)(pte_t *pte, unsigned long addr,
			  struct vmemmap_remap_walk *walk);
	unsigned long nr_walked;
	struct page *reuse_page;
	unsigned long reuse_addr;
	struct list_head *vmemmap_pages;
};

static int __split_vmemmap_huge_pmd(pmd_t *pmd, unsigned long start)
{
	pmd_t __pmd;
	int i;
	unsigned long addr = start;
	struct page *page = pmd_page(*pmd);
	pte_t *pgtable = pte_alloc_one_kernel(&init_mm);

	if (!pgtable)
		return -ENOMEM;

	pmd_populate_kernel(&init_mm, &__pmd, pgtable);

	for (i = 0; i < PMD_SIZE / PAGE_SIZE; i++, addr += PAGE_SIZE) {
		pte_t entry, *pte;
		pgprot_t pgprot = PAGE_KERNEL;

		entry = mk_pte(page + i, pgprot);
		pte = pte_offset_kernel(&__pmd, addr);
		set_pte_at(&init_mm, addr, pte, entry);
	}

	spin_lock(&init_mm.page_table_lock);
	if (likely(pmd_leaf(*pmd))) {
		/*
		 * Higher order allocations from the buddy allocator must be
		 * able to be treated as independent small pages (as they can
		 * be freed individually).
		 */
		if (!PageReserved(page))
			split_page(page, get_order(PMD_SIZE));

		/* Make pte visible before pmd. See comment in pmd_install(). */
		smp_wmb();
		pmd_populate_kernel(&init_mm, pmd, pgtable);
		flush_tlb_kernel_range(start, start + PMD_SIZE);
	} else {
		pte_free_kernel(&init_mm, pgtable);
	}
	spin_unlock(&init_mm.page_table_lock);

	return 0;
}

static int split_vmemmap_huge_pmd(pmd_t *pmd, unsigned long start)
{
	int leaf;

	spin_lock(&init_mm.page_table_lock);
	leaf = pmd_leaf(*pmd);
	spin_unlock(&init_mm.page_table_lock);

	if (!leaf)
		return 0;

	return __split_vmemmap_huge_pmd(pmd, start);
}
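
/*
 * Editor's note (worked example, not from this file; assumes the common
 * x86-64 case of a 4KiB PAGE_SIZE and a 2MiB PMD_SIZE): the loop in
 * __split_vmemmap_huge_pmd() above fills PMD_SIZE / PAGE_SIZE = 512 PTEs
 * in the detached __pmd, one per base page of the former huge mapping,
 * and only then installs the new PTE table under init_mm.page_table_lock,
 * so a concurrent walker sees either the old leaf PMD or the fully
 * populated PTE table, never a partially filled one.
 */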

static void vmemmap_pte_range(pmd_t *pmd, unsigned long addr,
			      unsigned long end,
			      struct vmemmap_remap_walk *walk)
{
	pte_t *pte = pte_offset_kernel(pmd, addr);

	/*
	 * The reuse_page is found 'first' in the table walk before we start
	 * remapping (which is calling @walk->remap_pte).
	 */
	if (!walk->reuse_page) {
		walk->reuse_page = pte_page(*pte);
		/*
		 * Because the reuse address is part of the range that we are
		 * walking, skip the reuse address range.
		 */
		addr += PAGE_SIZE;
		pte++;
		walk->nr_walked++;
	}

	for (; addr != end; addr += PAGE_SIZE, pte++) {
		walk->remap_pte(pte, addr, walk);
		walk->nr_walked++;
	}
}

static int vmemmap_pmd_range(pud_t *pud, unsigned long addr,
			     unsigned long end,
			     struct vmemmap_remap_walk *walk)
{
	pmd_t *pmd;
	unsigned long next;

	pmd = pmd_offset(pud, addr);
	do {
		int ret;

		ret = split_vmemmap_huge_pmd(pmd, addr & PMD_MASK);
		if (ret)
			return ret;

		next = pmd_addr_end(addr, end);
		vmemmap_pte_range(pmd, addr, next, walk);
	} while (pmd++, addr = next, addr != end);

	return 0;
}

static int vmemmap_pud_range(p4d_t *p4d, unsigned long addr,
			     unsigned long end,
			     struct vmemmap_remap_walk *walk)
{
	pud_t *pud;
	unsigned long next;

	pud = pud_offset(p4d, addr);
	do {
		int ret;

		next = pud_addr_end(addr, end);
		ret = vmemmap_pmd_range(pud, addr, next, walk);
		if (ret)
			return ret;
	} while (pud++, addr = next, addr != end);

	return 0;
}

static int vmemmap_p4d_range(pgd_t *pgd, unsigned long addr,
			     unsigned long end,
			     struct vmemmap_remap_walk *walk)
{
	p4d_t *p4d;
	unsigned long next;

	p4d = p4d_offset(pgd, addr);
	do {
		int ret;

		next = p4d_addr_end(addr, end);
		ret = vmemmap_pud_range(p4d, addr, next, walk);
		if (ret)
			return ret;
	} while (p4d++, addr = next, addr != end);

	return 0;
}

static int vmemmap_remap_range(unsigned long start, unsigned long end,
			       struct vmemmap_remap_walk *walk)
{
	unsigned long addr = start;
	unsigned long next;
	pgd_t *pgd;

	VM_BUG_ON(!IS_ALIGNED(start, PAGE_SIZE));
	VM_BUG_ON(!IS_ALIGNED(end, PAGE_SIZE));

	pgd = pgd_offset_k(addr);
	do {
		int ret;

		next = pgd_addr_end(addr, end);
		ret = vmemmap_p4d_range(pgd, addr, next, walk);
		if (ret)
			return ret;
	} while (pgd++, addr = next, addr != end);

	/*
	 * We only change the mapping of the vmemmap virtual address range
	 * [@start + PAGE_SIZE, end), so we only need to flush the TLB which
	 * belongs to the range.
	 */
	flush_tlb_kernel_range(start + PAGE_SIZE, end);

	return 0;
}

/*
 * Free a vmemmap page. A vmemmap page can be allocated from the memblock
 * allocator or the buddy allocator. If the PG_reserved flag is set, it means
 * that it was allocated from the memblock allocator, so free it via
 * free_bootmem_page(). Otherwise, use __free_page().
 */
static inline void free_vmemmap_page(struct page *page)
{
	if (PageReserved(page))
		free_bootmem_page(page);
	else
		__free_page(page);
}

/* Free a list of the vmemmap pages */
static void free_vmemmap_page_list(struct list_head *list)
{
	struct page *page, *next;

	list_for_each_entry_safe(page, next, list, lru) {
		list_del(&page->lru);
		free_vmemmap_page(page);
	}
}

static void vmemmap_remap_pte(pte_t *pte, unsigned long addr,
			      struct vmemmap_remap_walk *walk)
{
	/*
	 * Remap the tail pages as read-only to catch illegal write operations
	 * to the tail pages.
	 */
	pgprot_t pgprot = PAGE_KERNEL_RO;
	pte_t entry = mk_pte(walk->reuse_page, pgprot);
	struct page *page = pte_page(*pte);

	list_add_tail(&page->lru, walk->vmemmap_pages);
	set_pte_at(&init_mm, addr, pte, entry);
}

/*
 * How many struct page structs need to be reset. When we reuse the head
 * struct page, the special metadata (e.g. page->flags or page->mapping)
 * cannot be copied to the tail struct page structs. The invalid value will
 * be checked in free_tail_pages_check(). In order to avoid the message of
 * "corrupted mapping in tail page", we need to reset at least 3 struct
 * pages (one head struct page and two tail struct pages).
 */
#define NR_RESET_STRUCT_PAGE	3

static inline void reset_struct_pages(struct page *start)
{
	int i;
	struct page *from = start + NR_RESET_STRUCT_PAGE;

	for (i = 0; i < NR_RESET_STRUCT_PAGE; i++)
		memcpy(start + i, from, sizeof(*from));
}

static void vmemmap_restore_pte(pte_t *pte, unsigned long addr,
				struct vmemmap_remap_walk *walk)
{
	pgprot_t pgprot = PAGE_KERNEL;
	struct page *page;
	void *to;

	BUG_ON(pte_page(*pte) != walk->reuse_page);

	page = list_first_entry(walk->vmemmap_pages, struct page, lru);
	list_del(&page->lru);
	to = page_to_virt(page);
	copy_page(to, (void *)walk->reuse_addr);
	reset_struct_pages(to);

	set_pte_at(&init_mm, addr, pte, mk_pte(page, pgprot));
}

/**
 * vmemmap_remap_free - remap the vmemmap virtual address range [@start, @end)
 *			to the page which @reuse is mapped to, then free the
 *			vmemmap pages which the range was mapped to.
 * @start:	start address of the vmemmap virtual address range that we want
 *		to remap.
 * @end:	end address of the vmemmap virtual address range that we want to
 *		remap.
 * @reuse:	reuse address.
 *
 * Return: %0 on success, negative error code otherwise.
 */
int vmemmap_remap_free(unsigned long start, unsigned long end,
		       unsigned long reuse)
{
	int ret;
	LIST_HEAD(vmemmap_pages);
	struct vmemmap_remap_walk walk = {
		.remap_pte	= vmemmap_remap_pte,
		.reuse_addr	= reuse,
		.vmemmap_pages	= &vmemmap_pages,
	};

	/*
	 * In order to make the remapping routine most efficient for huge
	 * pages, the vmemmap page table walking routine has the following
	 * rules (see more details in vmemmap_pte_range()):
	 *
	 * - The range [@start, @end) and the range [@reuse, @reuse + PAGE_SIZE)
	 *   should be contiguous.
	 * - The @reuse address is part of the range [@reuse, @end) that we are
	 *   walking which is passed to vmemmap_remap_range().
	 * - The @reuse address is the first in the complete range.
	 *
	 * So we need to make sure that @start and @reuse meet the above rules.
	 */
	BUG_ON(start - reuse != PAGE_SIZE);

	mmap_read_lock(&init_mm);
	ret = vmemmap_remap_range(reuse, end, &walk);
	if (ret && walk.nr_walked) {
		end = reuse + walk.nr_walked * PAGE_SIZE;
		/*
		 * vmemmap_pages contains pages from the previous
		 * vmemmap_remap_range call which failed. These
		 * are pages which were removed from the vmemmap.
		 * They will be restored in the following call.
		 */
		walk = (struct vmemmap_remap_walk) {
			.remap_pte	= vmemmap_restore_pte,
			.reuse_addr	= reuse,
			.vmemmap_pages	= &vmemmap_pages,
		};

		vmemmap_remap_range(reuse, end, &walk);
	}
	mmap_read_unlock(&init_mm);

	free_vmemmap_page_list(&vmemmap_pages);

	return ret;
}
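
/*
 * Editor's sketch (hypothetical caller, not part of this file): the comments
 * above pin down the contract @start - @reuse == PAGE_SIZE, with @reuse being
 * the first vmemmap page of the area and the pages in [@start, @end) being
 * the ones that get remapped and freed. A caller such as the HugeTLB vmemmap
 * optimization would therefore look roughly like this:
 */
#if 0	/* illustration only; the example_* name is made up */
static int example_free_tail_vmemmap(unsigned long vmemmap_start,
				     unsigned long nr_vmemmap_pages)
{
	unsigned long reuse = vmemmap_start;		/* page to keep */
	unsigned long start = reuse + PAGE_SIZE;	/* first page to free */
	unsigned long end = vmemmap_start + nr_vmemmap_pages * PAGE_SIZE;

	/* Remap [start, end) to the page backing @reuse and free the rest. */
	return vmemmap_remap_free(start, end, reuse);
}
#endif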

static int alloc_vmemmap_page_list(unsigned long start, unsigned long end,
				   gfp_t gfp_mask, struct list_head *list)
{
	unsigned long nr_pages = (end - start) >> PAGE_SHIFT;
	int nid = page_to_nid((struct page *)start);
	struct page *page, *next;

	while (nr_pages--) {
		page = alloc_pages_node(nid, gfp_mask, 0);
		if (!page)
			goto out;
		list_add_tail(&page->lru, list);
	}

	return 0;
out:
	list_for_each_entry_safe(page, next, list, lru)
		__free_pages(page, 0);
	return -ENOMEM;
}

/**
 * vmemmap_remap_alloc - remap the vmemmap virtual address range [@start, @end)
 *			 to pages which are taken from the freshly allocated
 *			 @vmemmap_pages list, one page per address in the range.
 * @start:	start address of the vmemmap virtual address range that we want
 *		to remap.
 * @end:	end address of the vmemmap virtual address range that we want to
 *		remap.
 * @reuse:	reuse address.
 * @gfp_mask:	GFP flag for allocating vmemmap pages.
 *
 * Return: %0 on success, negative error code otherwise.
 */
int vmemmap_remap_alloc(unsigned long start, unsigned long end,
			unsigned long reuse, gfp_t gfp_mask)
{
	LIST_HEAD(vmemmap_pages);
	struct vmemmap_remap_walk walk = {
		.remap_pte	= vmemmap_restore_pte,
		.reuse_addr	= reuse,
		.vmemmap_pages	= &vmemmap_pages,
	};

	/* See the comment in vmemmap_remap_free(). */
	BUG_ON(start - reuse != PAGE_SIZE);

	if (alloc_vmemmap_page_list(start, end, gfp_mask, &vmemmap_pages))
		return -ENOMEM;

	mmap_read_lock(&init_mm);
	vmemmap_remap_range(reuse, end, &walk);
	mmap_read_unlock(&init_mm);

	return 0;
}
#endif /* CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP */

/*
 * Allocate a block of memory to be used to back the virtual memory map
 * or to back the page tables that are used to create the mapping.
 * Uses the main allocators if they are available, else bootmem.
 */

static void * __ref __earlyonly_bootmem_alloc(int node,
				unsigned long size,
				unsigned long align,
				unsigned long goal)
{
	return memblock_alloc_try_nid_raw(size, align, goal,
					  MEMBLOCK_ALLOC_ACCESSIBLE, node);
}

void * __meminit vmemmap_alloc_block(unsigned long size, int node)
{
	/* If the main allocator is up use that, fallback to bootmem. */
	if (slab_is_available()) {
		gfp_t gfp_mask = GFP_KERNEL|__GFP_RETRY_MAYFAIL|__GFP_NOWARN;
		int order = get_order(size);
		static bool warned;
		struct page *page;

		page = alloc_pages_node(node, gfp_mask, order);
		if (page)
			return page_address(page);

		if (!warned) {
			warn_alloc(gfp_mask & ~__GFP_NOWARN, NULL,
				   "vmemmap alloc failure: order:%u", order);
			warned = true;
		}
		return NULL;
	} else
		return __earlyonly_bootmem_alloc(node, size, size,
				__pa(MAX_DMA_ADDRESS));
}

static void * __meminit altmap_alloc_block_buf(unsigned long size,
					       struct vmem_altmap *altmap);

/* need to make sure size is all the same during early stage */
void * __meminit vmemmap_alloc_block_buf(unsigned long size, int node,
					 struct vmem_altmap *altmap)
{
	void *ptr;

	if (altmap)
		return altmap_alloc_block_buf(size, altmap);

	ptr = sparse_buffer_alloc(size);
	if (!ptr)
		ptr = vmemmap_alloc_block(size, node);
	return ptr;
}

static unsigned long __meminit vmem_altmap_next_pfn(struct vmem_altmap *altmap)
{
	return altmap->base_pfn + altmap->reserve + altmap->alloc
		+ altmap->align;
}

static unsigned long __meminit vmem_altmap_nr_free(struct vmem_altmap *altmap)
{
	unsigned long allocated = altmap->alloc + altmap->align;

	if (altmap->free > allocated)
		return altmap->free - allocated;
	return 0;
}

static void * __meminit altmap_alloc_block_buf(unsigned long size,
					       struct vmem_altmap *altmap)
{
	unsigned long pfn, nr_pfns, nr_align;

	if (size & ~PAGE_MASK) {
		pr_warn_once("%s: allocations must be multiple of PAGE_SIZE (%ld)\n",
			     __func__, size);
		return NULL;
	}

	pfn = vmem_altmap_next_pfn(altmap);
	nr_pfns = size >> PAGE_SHIFT;
	nr_align = 1UL << find_first_bit(&nr_pfns, BITS_PER_LONG);
	nr_align = ALIGN(pfn, nr_align) - pfn;
	if (nr_pfns + nr_align > vmem_altmap_nr_free(altmap))
		return NULL;

	altmap->alloc += nr_pfns;
	altmap->align += nr_align;
	pfn += nr_align;

	pr_debug("%s: pfn: %#lx alloc: %ld align: %ld nr: %#lx\n",
		 __func__, pfn, altmap->alloc, altmap->align, nr_pfns);
	return __va(__pfn_to_phys(pfn));
}
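
/*
 * Editor's note (worked example, not from this file; assumes a 4KiB
 * PAGE_SIZE): for a 2MiB request, nr_pfns = 512 and find_first_bit()
 * returns the index of its lowest set bit (9), so the alignment unit is
 * 512 pfns; nr_align then becomes the padding needed to round the next
 * free altmap pfn up to a 512-pfn boundary, which lets the caller map the
 * block with a 2MiB huge page. For an odd nr_pfns the lowest set bit is
 * bit 0, the alignment unit is a single pfn and no padding is added.
 */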

void __meminit vmemmap_verify(pte_t *pte, int node,
				unsigned long start, unsigned long end)
{
	unsigned long pfn = pte_pfn(*pte);
	int actual_node = early_pfn_to_nid(pfn);

	if (node_distance(actual_node, node) > LOCAL_DISTANCE)
		pr_warn("[%lx-%lx] potential offnode page_structs\n",
			start, end - 1);
}

pte_t * __meminit vmemmap_pte_populate(pmd_t *pmd, unsigned long addr, int node,
				       struct vmem_altmap *altmap,
				       struct page *reuse)
{
	pte_t *pte = pte_offset_kernel(pmd, addr);
	if (pte_none(*pte)) {
		pte_t entry;
		void *p;

		if (!reuse) {
			p = vmemmap_alloc_block_buf(PAGE_SIZE, node, altmap);
			if (!p)
				return NULL;
		} else {
			/*
			 * When a PTE/PMD entry is freed from the init_mm
			 * there's a free_pages() call to this page allocated
			 * above. Thus this get_page() is paired with the
			 * put_page_testzero() on the freeing path.
			 * This can only be called by certain ZONE_DEVICE paths,
			 * and through vmemmap_populate_compound_pages() when
			 * slab is available.
			 */
			get_page(reuse);
			p = page_to_virt(reuse);
		}
		entry = pfn_pte(__pa(p) >> PAGE_SHIFT, PAGE_KERNEL);
		set_pte_at(&init_mm, addr, pte, entry);
	}
	return pte;
}
574
f7f99100
PT
575static void * __meminit vmemmap_alloc_block_zero(unsigned long size, int node)
576{
577 void *p = vmemmap_alloc_block(size, node);
578
579 if (!p)
580 return NULL;
581 memset(p, 0, size);
582
583 return p;
584}
585
29c71111 586pmd_t * __meminit vmemmap_pmd_populate(pud_t *pud, unsigned long addr, int node)
8f6aac41 587{
29c71111
AW
588 pmd_t *pmd = pmd_offset(pud, addr);
589 if (pmd_none(*pmd)) {
f7f99100 590 void *p = vmemmap_alloc_block_zero(PAGE_SIZE, node);
29c71111 591 if (!p)
9dce07f1 592 return NULL;
29c71111 593 pmd_populate_kernel(&init_mm, pmd, p);
8f6aac41 594 }
29c71111 595 return pmd;
8f6aac41 596}
8f6aac41 597
c2febafc 598pud_t * __meminit vmemmap_pud_populate(p4d_t *p4d, unsigned long addr, int node)
8f6aac41 599{
c2febafc 600 pud_t *pud = pud_offset(p4d, addr);
29c71111 601 if (pud_none(*pud)) {
f7f99100 602 void *p = vmemmap_alloc_block_zero(PAGE_SIZE, node);
29c71111 603 if (!p)
9dce07f1 604 return NULL;
29c71111
AW
605 pud_populate(&init_mm, pud, p);
606 }
607 return pud;
608}
8f6aac41 609
c2febafc
KS
610p4d_t * __meminit vmemmap_p4d_populate(pgd_t *pgd, unsigned long addr, int node)
611{
612 p4d_t *p4d = p4d_offset(pgd, addr);
613 if (p4d_none(*p4d)) {
f7f99100 614 void *p = vmemmap_alloc_block_zero(PAGE_SIZE, node);
c2febafc
KS
615 if (!p)
616 return NULL;
617 p4d_populate(&init_mm, p4d, p);
618 }
619 return p4d;
620}
621
29c71111
AW
622pgd_t * __meminit vmemmap_pgd_populate(unsigned long addr, int node)
623{
624 pgd_t *pgd = pgd_offset_k(addr);
625 if (pgd_none(*pgd)) {
f7f99100 626 void *p = vmemmap_alloc_block_zero(PAGE_SIZE, node);
29c71111 627 if (!p)
9dce07f1 628 return NULL;
29c71111 629 pgd_populate(&init_mm, pgd, p);
8f6aac41 630 }
29c71111 631 return pgd;
8f6aac41
CL
632}
633
2beea70a 634static pte_t * __meminit vmemmap_populate_address(unsigned long addr, int node,
4917f55b
JM
635 struct vmem_altmap *altmap,
636 struct page *reuse)
8f6aac41 637{
29c71111 638 pgd_t *pgd;
c2febafc 639 p4d_t *p4d;
29c71111
AW
640 pud_t *pud;
641 pmd_t *pmd;
642 pte_t *pte;
8f6aac41 643
2beea70a
JM
644 pgd = vmemmap_pgd_populate(addr, node);
645 if (!pgd)
646 return NULL;
647 p4d = vmemmap_p4d_populate(pgd, addr, node);
648 if (!p4d)
649 return NULL;
650 pud = vmemmap_pud_populate(p4d, addr, node);
651 if (!pud)
652 return NULL;
653 pmd = vmemmap_pmd_populate(pud, addr, node);
654 if (!pmd)
655 return NULL;
4917f55b 656 pte = vmemmap_pte_populate(pmd, addr, node, altmap, reuse);
2beea70a
JM
657 if (!pte)
658 return NULL;
659 vmemmap_verify(pte, node, addr, addr + PAGE_SIZE);
660
661 return pte;
662}

static int __meminit vmemmap_populate_range(unsigned long start,
					    unsigned long end, int node,
					    struct vmem_altmap *altmap,
					    struct page *reuse)
{
	unsigned long addr = start;
	pte_t *pte;

	for (; addr < end; addr += PAGE_SIZE) {
		pte = vmemmap_populate_address(addr, node, altmap, reuse);
		if (!pte)
			return -ENOMEM;
	}

	return 0;
}

int __meminit vmemmap_populate_basepages(unsigned long start, unsigned long end,
					 int node, struct vmem_altmap *altmap)
{
	return vmemmap_populate_range(start, end, node, altmap, NULL);
}

/*
 * For compound pages bigger than section size (e.g. x86 1G compound
 * pages with 2M subsection size) fill the rest of the sections as tail
 * pages.
 *
 * Note that memremap_pages() resets the @nr_range value and will increment
 * it after each successful range onlining. Thus the value of @nr_range
 * at section memmap populate time corresponds to the in-progress range
 * being onlined here.
 */
static bool __meminit reuse_compound_section(unsigned long start_pfn,
					     struct dev_pagemap *pgmap)
{
	unsigned long nr_pages = pgmap_vmemmap_nr(pgmap);
	unsigned long offset = start_pfn -
		PHYS_PFN(pgmap->ranges[pgmap->nr_range].start);

	return !IS_ALIGNED(offset, nr_pages) && nr_pages > PAGES_PER_SUBSECTION;
}

static pte_t * __meminit compound_section_tail_page(unsigned long addr)
{
	pte_t *pte;

	addr -= PAGE_SIZE;

	/*
	 * Assuming sections are populated sequentially, the previous section's
	 * page data can be reused.
	 */
	pte = pte_offset_kernel(pmd_off_k(addr), addr);
	if (!pte)
		return NULL;

	return pte;
}

static int __meminit vmemmap_populate_compound_pages(unsigned long start_pfn,
						     unsigned long start,
						     unsigned long end, int node,
						     struct dev_pagemap *pgmap)
{
	unsigned long size, addr;
	pte_t *pte;
	int rc;

	if (reuse_compound_section(start_pfn, pgmap)) {
		pte = compound_section_tail_page(start);
		if (!pte)
			return -ENOMEM;

		/*
		 * Reuse the page that was populated in the prior iteration
		 * with just tail struct pages.
		 */
		return vmemmap_populate_range(start, end, node, NULL,
					      pte_page(*pte));
	}

	size = min(end - start, pgmap_vmemmap_nr(pgmap) * sizeof(struct page));
	for (addr = start; addr < end; addr += size) {
		unsigned long next = addr, last = addr + size;

		/* Populate the head page's vmemmap page */
		pte = vmemmap_populate_address(addr, node, NULL, NULL);
		if (!pte)
			return -ENOMEM;

		/* Populate the tail pages' vmemmap page */
		next = addr + PAGE_SIZE;
		pte = vmemmap_populate_address(next, node, NULL, NULL);
		if (!pte)
			return -ENOMEM;

		/*
		 * Reuse the previous page for the rest of the tail pages.
		 * See the layout diagram in Documentation/vm/vmemmap_dedup.rst
		 */
		next += PAGE_SIZE;
		rc = vmemmap_populate_range(next, last, node, NULL,
					    pte_page(*pte));
		if (rc)
			return -ENOMEM;
	}

	return 0;
}
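
/*
 * Editor's note (worked example, not from this file; assumes a 4KiB
 * PAGE_SIZE and a 64-byte struct page): for 2MiB compound pages,
 * pgmap_vmemmap_nr() is 512, so size = 512 * 64 = 32KiB and the loop
 * above steps through the vmemmap one compound page at a time. Per step
 * it populates one head vmemmap page and one tail vmemmap page with
 * fresh memory, then maps the remaining 32KiB/4KiB - 2 = 6 vmemmap pages
 * onto that same tail page, i.e. 2 physical pages back 8 pages' worth of
 * vmemmap (see Documentation/vm/vmemmap_dedup.rst for the layout).
 */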

struct page * __meminit __populate_section_memmap(unsigned long pfn,
		unsigned long nr_pages, int nid, struct vmem_altmap *altmap,
		struct dev_pagemap *pgmap)
{
	unsigned long start = (unsigned long) pfn_to_page(pfn);
	unsigned long end = start + nr_pages * sizeof(struct page);
	int r;

	if (WARN_ON_ONCE(!IS_ALIGNED(pfn, PAGES_PER_SUBSECTION) ||
		!IS_ALIGNED(nr_pages, PAGES_PER_SUBSECTION)))
		return NULL;

	if (is_power_of_2(sizeof(struct page)) &&
	    pgmap && pgmap_vmemmap_nr(pgmap) > 1 && !altmap)
		r = vmemmap_populate_compound_pages(pfn, start, end, nid, pgmap);
	else
		r = vmemmap_populate(start, end, nid, altmap);

	if (r < 0)
		return NULL;

	return pfn_to_page(pfn);
}