mm/sparse: add vmemmap_*_hvo functions
author Frank van der Linden <fvdl@google.com>
Fri, 28 Feb 2025 18:29:15 +0000 (18:29 +0000)
committer Andrew Morton <akpm@linux-foundation.org>
Mon, 17 Mar 2025 05:06:28 +0000 (22:06 -0700)
Add a few functions to enable early HVO:

vmemmap_populate_hvo
vmemmap_undo_hvo
vmemmap_wrprotect_hvo

The populate and undo functions are expected to be used in early init,
from the sparse_init_nid_early() function.  The wrprotect function is
intended to be used later, once memmap initialization has finished.
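
A minimal sketch of the intended early-init usage (the caller shown here
is hypothetical and not part of this patch; the real wiring into
sparse_init_nid_early() comes in later commits):

	/*
	 * Try to map the vmemmap for a memblock-allocated gigantic page
	 * HVO-style, falling back to a regular mapping on failure.
	 * headsize is the size of the head portion of the vmemmap that
	 * stays individually mapped.
	 */
	if (vmemmap_populate_hvo(start, end, nid, headsize))
		return vmemmap_populate(start, end, nid, NULL);

	/*
	 * If the page later turns out to be unusable for HugeTLB (e.g.
	 * it spans multiple zones), the optimized mapping can be undone:
	 */
	vmemmap_undo_hvo(start, end, nid, headsize);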

To implement these functions, mostly reuse the existing compound pages
vmemmap logic used by DAX.  vmemmap_populate_address() has its arguments
changed a bit in this commit: the page structure passed in to be reused in
the mapping is replaced by a PFN and a flags argument.  The flag indicates
whether an extra ref should be taken on the vmemmap page containing the
head page structure.  Taking the ref is appropriate for DAX / ZONE_DEVICE,
but not for HugeTLB HVO.
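
For illustration, the call styles after this change (taken from the code
below) are:

	/* Allocate a fresh vmemmap page: no PFN to reuse, no flags. */
	pte = vmemmap_populate_address(addr, node, NULL, -1, 0);

	/* DAX / ZONE_DEVICE: reuse a PFN and take the extra reference. */
	rc = vmemmap_populate_range(next, last, node, NULL,
				    pte_pfn(ptep_get(pte)),
				    VMEMMAP_POPULATE_PAGEREF);

	/* HugeTLB HVO: reuse a PFN, but without the extra reference. */
	rc = vmemmap_populate_range(maddr, end, node, NULL,
				    pte_pfn(ptep_get(pte)), 0);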

The HugeTLB vmemmap optimization maps tail page structure pages read-only.
The vmemmap_wrprotect_hvo function that does this is implemented
separately, because it cannot be guaranteed that reserved page structures
will not be written to during memory initialization.  Even with
CONFIG_DEFERRED_STRUCT_PAGE_INIT, they might still be written to (if they
are at the bottom of a zone).  So, vmemmap_populate_hvo leaves the tail
page structure pages RW initially, and once memmap init is fully done,
vmemmap_wrprotect_hvo must be called to finish the job.
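
Schematically (hypothetical caller; as noted in the code below, TLB
flushing after the write protect is the caller's responsibility):

	/* After memmap init is fully done, the hugetlb code finishes up: */
	vmemmap_wrprotect_hvo(start, end, nid, headsize);
	flush_tlb_kernel_range(start, end);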

Subsequent commits will use these functions for early HugeTLB HVO.

Link: https://lkml.kernel.org/r/20250228182928.2645936-15-fvdl@google.com
Signed-off-by: Frank van der Linden <fvdl@google.com>
Cc: Alexander Gordeev <agordeev@linux.ibm.com>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Dan Carpenter <dan.carpenter@linaro.org>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Heiko Carstens <hca@linux.ibm.com>
Cc: Joao Martins <joao.m.martins@oracle.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Madhavan Srinivasan <maddy@linux.ibm.com>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Muchun Song <muchun.song@linux.dev>
Cc: Oscar Salvador <osalvador@suse.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Roman Gushchin (Cruise) <roman.gushchin@linux.dev>
Cc: Usama Arif <usamaarif642@gmail.com>
Cc: Vasily Gorbik <gor@linux.ibm.com>
Cc: Yu Zhao <yuzhao@google.com>
Cc: Zi Yan <ziy@nvidia.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
include/linux/mm.h
mm/sparse-vmemmap.c

index 03e4807bd911aff1021f4105ac9ab774f277ab61..9a74a3ee68bc76ff97663889fc4fc8c122cb694b 100644
@@ -3937,7 +3937,8 @@ p4d_t *vmemmap_p4d_populate(pgd_t *pgd, unsigned long addr, int node);
 pud_t *vmemmap_pud_populate(p4d_t *p4d, unsigned long addr, int node);
 pmd_t *vmemmap_pmd_populate(pud_t *pud, unsigned long addr, int node);
 pte_t *vmemmap_pte_populate(pmd_t *pmd, unsigned long addr, int node,
-                           struct vmem_altmap *altmap, struct page *reuse);
+                           struct vmem_altmap *altmap, unsigned long ptpfn,
+                           unsigned long flags);
 void *vmemmap_alloc_block(unsigned long size, int node);
 struct vmem_altmap;
 void *vmemmap_alloc_block_buf(unsigned long size, int node,
@@ -3953,6 +3954,12 @@ int vmemmap_populate_hugepages(unsigned long start, unsigned long end,
                               int node, struct vmem_altmap *altmap);
 int vmemmap_populate(unsigned long start, unsigned long end, int node,
                struct vmem_altmap *altmap);
+int vmemmap_populate_hvo(unsigned long start, unsigned long end, int node,
+                        unsigned long headsize);
+int vmemmap_undo_hvo(unsigned long start, unsigned long end, int node,
+                    unsigned long headsize);
+void vmemmap_wrprotect_hvo(unsigned long start, unsigned long end, int node,
+                         unsigned long headsize);
 void vmemmap_populate_print_last(void);
 #ifdef CONFIG_MEMORY_HOTPLUG
 void vmemmap_free(unsigned long start, unsigned long end,
index 8751c46c35e4c28acec09cebaf6ebec535cc0e06..8cc848c4b17c20c6b3120fcf89d090aae3e5f670 100644
 
 #include <asm/dma.h>
 #include <asm/pgalloc.h>
+#include <asm/tlbflush.h>
+
+/*
+ * Flags for vmemmap_populate_range and friends.
+ */
+/* Get a ref on the head page struct page, for ZONE_DEVICE compound pages */
+#define VMEMMAP_POPULATE_PAGEREF       0x0001
 
 #include "internal.h"
 
@@ -144,17 +151,18 @@ void __meminit vmemmap_verify(pte_t *pte, int node,
 
 pte_t * __meminit vmemmap_pte_populate(pmd_t *pmd, unsigned long addr, int node,
                                       struct vmem_altmap *altmap,
-                                      struct page *reuse)
+                                      unsigned long ptpfn, unsigned long flags)
 {
        pte_t *pte = pte_offset_kernel(pmd, addr);
        if (pte_none(ptep_get(pte))) {
                pte_t entry;
                void *p;
 
-               if (!reuse) {
+               if (ptpfn == (unsigned long)-1) {
                        p = vmemmap_alloc_block_buf(PAGE_SIZE, node, altmap);
                        if (!p)
                                return NULL;
+                       ptpfn = PHYS_PFN(__pa(p));
                } else {
                        /*
                         * When a PTE/PMD entry is freed from the init_mm
@@ -165,10 +173,10 @@ pte_t * __meminit vmemmap_pte_populate(pmd_t *pmd, unsigned long addr, int node,
                         * and through vmemmap_populate_compound_pages() when
                         * slab is available.
                         */
-                       get_page(reuse);
-                       p = page_to_virt(reuse);
+                       if (flags & VMEMMAP_POPULATE_PAGEREF)
+                               get_page(pfn_to_page(ptpfn));
                }
-               entry = pfn_pte(__pa(p) >> PAGE_SHIFT, PAGE_KERNEL);
+               entry = pfn_pte(ptpfn, PAGE_KERNEL);
                set_pte_at(&init_mm, addr, pte, entry);
        }
        return pte;
@@ -238,7 +246,8 @@ pgd_t * __meminit vmemmap_pgd_populate(unsigned long addr, int node)
 
 static pte_t * __meminit vmemmap_populate_address(unsigned long addr, int node,
                                              struct vmem_altmap *altmap,
-                                             struct page *reuse)
+                                             unsigned long ptpfn,
+                                             unsigned long flags)
 {
        pgd_t *pgd;
        p4d_t *p4d;
@@ -258,7 +267,7 @@ static pte_t * __meminit vmemmap_populate_address(unsigned long addr, int node,
        pmd = vmemmap_pmd_populate(pud, addr, node);
        if (!pmd)
                return NULL;
-       pte = vmemmap_pte_populate(pmd, addr, node, altmap, reuse);
+       pte = vmemmap_pte_populate(pmd, addr, node, altmap, ptpfn, flags);
        if (!pte)
                return NULL;
        vmemmap_verify(pte, node, addr, addr + PAGE_SIZE);
@@ -269,13 +278,15 @@ static pte_t * __meminit vmemmap_populate_address(unsigned long addr, int node,
 static int __meminit vmemmap_populate_range(unsigned long start,
                                            unsigned long end, int node,
                                            struct vmem_altmap *altmap,
-                                           struct page *reuse)
+                                           unsigned long ptpfn,
+                                           unsigned long flags)
 {
        unsigned long addr = start;
        pte_t *pte;
 
        for (; addr < end; addr += PAGE_SIZE) {
-               pte = vmemmap_populate_address(addr, node, altmap, reuse);
+               pte = vmemmap_populate_address(addr, node, altmap,
+                                              ptpfn, flags);
                if (!pte)
                        return -ENOMEM;
        }
@@ -286,7 +297,107 @@ static int __meminit vmemmap_populate_range(unsigned long start,
 int __meminit vmemmap_populate_basepages(unsigned long start, unsigned long end,
                                         int node, struct vmem_altmap *altmap)
 {
-       return vmemmap_populate_range(start, end, node, altmap, NULL);
+       return vmemmap_populate_range(start, end, node, altmap, -1, 0);
+}
+
+/*
+ * Undo populate_hvo, and replace it with a normal base page mapping.
+ * Used in memory init in case a HVO mapping needs to be undone.
+ *
+ * This can happen when it is discovered that a memblock allocated
+ * hugetlb page spans multiple zones, which can only be verified
+ * after zones have been initialized.
+ *
+ * We know that:
+ * 1) The first @headsize / PAGE_SIZE vmemmap pages were individually
+ *    allocated through memblock, and mapped.
+ *
+ * 2) The rest of the vmemmap pages are mirrors of the last head page.
+ */
+int __meminit vmemmap_undo_hvo(unsigned long addr, unsigned long end,
+                                     int node, unsigned long headsize)
+{
+       unsigned long maddr, pfn;
+       pte_t *pte;
+       int headpages;
+
+       /*
+        * Should only be called early in boot, so nothing will
+        * be accessing these page structures.
+        */
+       WARN_ON(!early_boot_irqs_disabled);
+
+       headpages = headsize >> PAGE_SHIFT;
+
+       /*
+        * Clear mirrored mappings for tail page structs.
+        */
+       for (maddr = addr + headsize; maddr < end; maddr += PAGE_SIZE) {
+               pte = virt_to_kpte(maddr);
+               pte_clear(&init_mm, maddr, pte);
+       }
+
+       /*
+        * Clear and free mappings for head page and first tail page
+        * structs.
+        */
+       for (maddr = addr; headpages-- > 0; maddr += PAGE_SIZE) {
+               pte = virt_to_kpte(maddr);
+               pfn = pte_pfn(ptep_get(pte));
+               pte_clear(&init_mm, maddr, pte);
+               memblock_phys_free(PFN_PHYS(pfn), PAGE_SIZE);
+       }
+
+       flush_tlb_kernel_range(addr, end);
+
+       return vmemmap_populate(addr, end, node, NULL);
+}
+
+/*
+ * Write protect the mirrored tail page structs for HVO. This will be
+ * called from the hugetlb code when gathering and initializing the
+ * memblock allocated gigantic pages. The write protect can't be
+ * done earlier, since it can't be guaranteed that the reserved
+ * page structures will not be written to during initialization,
+ * even if CONFIG_DEFERRED_STRUCT_PAGE_INIT is enabled.
+ *
+ * The PTEs are known to exist, and nothing else should be touching
+ * these pages. The caller is responsible for any TLB flushing.
+ */
+void vmemmap_wrprotect_hvo(unsigned long addr, unsigned long end,
+                                   int node, unsigned long headsize)
+{
+       unsigned long maddr;
+       pte_t *pte;
+
+       for (maddr = addr + headsize; maddr < end; maddr += PAGE_SIZE) {
+               pte = virt_to_kpte(maddr);
+               ptep_set_wrprotect(&init_mm, maddr, pte);
+       }
+}
+
+/*
+ * Populate vmemmap pages HVO-style. The first page contains the head
+ * page and needed tail pages, the other ones are mirrors of the first
+ * page.
+ */
+int __meminit vmemmap_populate_hvo(unsigned long addr, unsigned long end,
+                                      int node, unsigned long headsize)
+{
+       pte_t *pte;
+       unsigned long maddr;
+
+       for (maddr = addr; maddr < addr + headsize; maddr += PAGE_SIZE) {
+               pte = vmemmap_populate_address(maddr, node, NULL, -1, 0);
+               if (!pte)
+                       return -ENOMEM;
+       }
+
+       /*
+        * Reuse the last page struct page mapped above for the rest.
+        */
+       return vmemmap_populate_range(maddr, end, node, NULL,
+                                       pte_pfn(ptep_get(pte)), 0);
 }
 
 void __weak __meminit vmemmap_set_pmd(pmd_t *pmd, void *p, int node,
@@ -409,7 +520,8 @@ static int __meminit vmemmap_populate_compound_pages(unsigned long start_pfn,
                 * with just tail struct pages.
                 */
                return vmemmap_populate_range(start, end, node, NULL,
-                                             pte_page(ptep_get(pte)));
+                                             pte_pfn(ptep_get(pte)),
+                                             VMEMMAP_POPULATE_PAGEREF);
        }
 
        size = min(end - start, pgmap_vmemmap_nr(pgmap) * sizeof(struct page));
@@ -417,13 +529,13 @@ static int __meminit vmemmap_populate_compound_pages(unsigned long start_pfn,
                unsigned long next, last = addr + size;
 
                /* Populate the head page vmemmap page */
-               pte = vmemmap_populate_address(addr, node, NULL, NULL);
+               pte = vmemmap_populate_address(addr, node, NULL, -1, 0);
                if (!pte)
                        return -ENOMEM;
 
                /* Populate the tail pages vmemmap page */
                next = addr + PAGE_SIZE;
-               pte = vmemmap_populate_address(next, node, NULL, NULL);
+               pte = vmemmap_populate_address(next, node, NULL, -1, 0);
                if (!pte)
                        return -ENOMEM;
 
@@ -433,7 +545,8 @@ static int __meminit vmemmap_populate_compound_pages(unsigned long start_pfn,
                 */
                next += PAGE_SIZE;
                rc = vmemmap_populate_range(next, last, node, NULL,
-                                           pte_page(ptep_get(pte)));
+                                           pte_pfn(ptep_get(pte)),
+                                           VMEMMAP_POPULATE_PAGEREF);
                if (rc)
                        return -ENOMEM;
        }