Merge tag 'mm-stable-2024-05-17-19-19' of git://git.kernel.org/pub/scm/linux/kernel...
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index ce7be5c244429f71bc686399889fba7f4b6e1cf8..6be78e7d4f6e058ef1c72e54aa8fc2cadc76cc06 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -1517,7 +1517,7 @@ static void __destroy_compound_gigantic_folio(struct folio *folio,
        struct page *p;
 
        atomic_set(&folio->_entire_mapcount, 0);
-       atomic_set(&folio->_nr_pages_mapped, 0);
+       atomic_set(&folio->_large_mapcount, 0);
        atomic_set(&folio->_pincount, 0);
 
        for (i = 1; i < nr_pages; i++) {
@@ -1619,19 +1619,11 @@ static inline void destroy_compound_gigantic_folio(struct folio *folio,
                                                unsigned int order) { }
 #endif
 
-static inline void __clear_hugetlb_destructor(struct hstate *h,
-                                               struct folio *folio)
-{
-       lockdep_assert_held(&hugetlb_lock);
-
-       __folio_clear_hugetlb(folio);
-}
-
 /*
  * Remove hugetlb folio from lists.
- * If vmemmap exists for the folio, update dtor so that the folio appears
- * as just a compound page.  Otherwise, wait until after allocating vmemmap
- * to update dtor.
+ * If vmemmap exists for the folio, clear the hugetlb flag so that the
+ * folio appears as just a compound page.  Otherwise, wait until after
+ * allocating vmemmap to clear the flag.
  *
  * A reference is held on the folio, except in the case of demote.
  *
@@ -1662,12 +1654,12 @@ static void __remove_hugetlb_folio(struct hstate *h, struct folio *folio,
        }
 
        /*
-        * We can only clear the hugetlb destructor after allocating vmemmap
+        * We can only clear the hugetlb flag after allocating vmemmap
         * pages.  Otherwise, someone (memory error handling) may try to write
         * to tail struct pages.
         */
        if (!folio_test_hugetlb_vmemmap_optimized(folio))
-               __clear_hugetlb_destructor(h, folio);
+               __folio_clear_hugetlb(folio);
 
         /*
          * In the case of demote we do not ref count the page as it will soon
@@ -1734,14 +1726,14 @@ static void add_hugetlb_folio(struct hstate *h, struct folio *folio,
                 */
                return;
 
-       arch_clear_hugepage_flags(&folio->page);
+       arch_clear_hugetlb_flags(folio);
        enqueue_hugetlb_folio(h, folio);
 }
 
 static void __update_and_free_hugetlb_folio(struct hstate *h,
                                                struct folio *folio)
 {
-       bool clear_dtor = folio_test_hugetlb_vmemmap_optimized(folio);
+       bool clear_flag = folio_test_hugetlb_vmemmap_optimized(folio);
 
        if (hstate_is_gigantic(h) && !gigantic_page_runtime_supported())
                return;
@@ -1754,11 +1746,11 @@ static void __update_and_free_hugetlb_folio(struct hstate *h,
                return;
 
        /*
-        * If folio is not vmemmap optimized (!clear_dtor), then the folio
+        * If folio is not vmemmap optimized (!clear_flag), then the folio
         * is no longer identified as a hugetlb page.  hugetlb_vmemmap_restore_folio
         * can only be passed hugetlb pages and will BUG otherwise.
         */
-       if (clear_dtor && hugetlb_vmemmap_restore_folio(h, folio)) {
+       if (clear_flag && hugetlb_vmemmap_restore_folio(h, folio)) {
                spin_lock_irq(&hugetlb_lock);
                /*
                 * If we cannot allocate vmemmap pages, just refuse to free the
@@ -1779,11 +1771,11 @@ static void __update_and_free_hugetlb_folio(struct hstate *h,
 
        /*
         * If vmemmap pages were allocated above, then we need to clear the
-        * hugetlb destructor under the hugetlb lock.
+        * hugetlb flag under the hugetlb lock.
         */
        if (folio_test_hugetlb(folio)) {
                spin_lock_irq(&hugetlb_lock);
-               __clear_hugetlb_destructor(h, folio);
+               __folio_clear_hugetlb(folio);
                spin_unlock_irq(&hugetlb_lock);
        }
 
@@ -1796,7 +1788,8 @@ static void __update_and_free_hugetlb_folio(struct hstate *h,
                destroy_compound_gigantic_folio(folio, huge_page_order(h));
                free_gigantic_folio(folio, huge_page_order(h));
        } else {
-               __free_pages(&folio->page, huge_page_order(h));
+               INIT_LIST_HEAD(&folio->_deferred_list);
+               folio_put(folio);
        }
 }
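
An editorial aside, not part of the patch: taken together, the hunks above give the non-gigantic free path roughly the shape sketched below. The wrapper name is invented; every helper called in the body appears in the hunks above, and the error/requeue handling of the real code is elided.

static void free_one_hugetlb_folio_sketch(struct hstate *h, struct folio *folio)
{
        bool optimized = folio_test_hugetlb_vmemmap_optimized(folio);

        spin_lock_irq(&hugetlb_lock);
        /* list removal and accounting elided */
        if (!optimized)
                __folio_clear_hugetlb(folio);   /* tail struct pages exist, safe now */
        spin_unlock_irq(&hugetlb_lock);

        if (optimized) {
                if (hugetlb_vmemmap_restore_folio(h, folio))
                        return;                 /* no vmemmap; folio stays a hugetlb page */
                spin_lock_irq(&hugetlb_lock);
                __folio_clear_hugetlb(folio);   /* vmemmap restored, flag can go */
                spin_unlock_irq(&hugetlb_lock);
        }
        folio_put(folio);                       /* hand the folio back to the buddy allocator */
}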
 
@@ -1884,7 +1877,7 @@ static void bulk_vmemmap_restore_error(struct hstate *h,
                list_for_each_entry_safe(folio, t_folio, non_hvo_folios, lru) {
                        list_del(&folio->lru);
                        spin_lock_irq(&hugetlb_lock);
-                       __clear_hugetlb_destructor(h, folio);
+                       __folio_clear_hugetlb(folio);
                        spin_unlock_irq(&hugetlb_lock);
                        update_and_free_hugetlb_folio(h, folio, false);
                        cond_resched();
@@ -1909,7 +1902,7 @@ static void bulk_vmemmap_restore_error(struct hstate *h,
                        } else {
                                list_del(&folio->lru);
                                spin_lock_irq(&hugetlb_lock);
-                               __clear_hugetlb_destructor(h, folio);
+                               __folio_clear_hugetlb(folio);
                                spin_unlock_irq(&hugetlb_lock);
                                update_and_free_hugetlb_folio(h, folio, false);
                                cond_resched();
@@ -1942,14 +1935,14 @@ retry:
         * should only be pages on the non_hvo_folios list.
         * Do note that the non_hvo_folios list could be empty.
         * Without HVO enabled, ret will be 0 and there is no need to call
-        * __clear_hugetlb_destructor as this was done previously.
+        * __folio_clear_hugetlb as this was done previously.
         */
        VM_WARN_ON(!list_empty(folio_list));
        VM_WARN_ON(ret < 0);
        if (!list_empty(&non_hvo_folios) && ret) {
                spin_lock_irq(&hugetlb_lock);
                list_for_each_entry(folio, &non_hvo_folios, lru)
-                       __clear_hugetlb_destructor(h, folio);
+                       __folio_clear_hugetlb(folio);
                spin_unlock_irq(&hugetlb_lock);
        }
 
@@ -1974,7 +1967,7 @@ void free_huge_folio(struct folio *folio)
 {
        /*
         * Can't pass hstate in here because it is called from the
-        * compound page destructor.
+        * generic mm code.
         */
        struct hstate *h = folio_hstate(folio);
        int nid = folio_nid(folio);
@@ -2031,7 +2024,7 @@ void free_huge_folio(struct folio *folio)
                spin_unlock_irqrestore(&hugetlb_lock, flags);
                update_and_free_hugetlb_folio(h, folio, true);
        } else {
-               arch_clear_hugepage_flags(&folio->page);
+               arch_clear_hugetlb_flags(folio);
                enqueue_hugetlb_folio(h, folio);
                spin_unlock_irqrestore(&hugetlb_lock, flags);
        }
@@ -2124,10 +2117,10 @@ static bool __prep_compound_gigantic_folio(struct folio *folio,
                        set_compound_head(p, &folio->page);
        }
        __folio_set_head(folio);
-       /* we rely on prep_new_hugetlb_folio to set the destructor */
+       /* we rely on prep_new_hugetlb_folio to set the hugetlb flag */
        folio_set_order(folio, order);
        atomic_set(&folio->_entire_mapcount, -1);
-       atomic_set(&folio->_nr_pages_mapped, 0);
+       atomic_set(&folio->_large_mapcount, -1);
        atomic_set(&folio->_pincount, 0);
        return true;
 
@@ -2162,13 +2155,13 @@ static bool prep_compound_gigantic_folio_for_demote(struct folio *folio,
 /*
  * Find and lock address space (mapping) in write mode.
  *
- * Upon entry, the page is locked which means that page_mapping() is
+ * Upon entry, the folio is locked which means that folio_mapping() is
  * stable.  Due to locking order, we can only trylock_write.  If we can
  * not get the lock, simply return NULL to caller.
  */
-struct address_space *hugetlb_page_mapping_lock_write(struct page *hpage)
+struct address_space *hugetlb_folio_mapping_lock_write(struct folio *folio)
 {
-       struct address_space *mapping = page_mapping(hpage);
+       struct address_space *mapping = folio_mapping(folio);
 
        if (!mapping)
                return mapping;
@@ -2184,13 +2177,13 @@ static struct folio *alloc_buddy_hugetlb_folio(struct hstate *h,
                nodemask_t *node_alloc_noretry)
 {
        int order = huge_page_order(h);
-       struct page *page;
+       struct folio *folio;
        bool alloc_try_hard = true;
        bool retry = true;
 
        /*
-        * By default we always try hard to allocate the page with
-        * __GFP_RETRY_MAYFAIL flag.  However, if we are allocating pages in
+        * By default we always try hard to allocate the folio with
+        * __GFP_RETRY_MAYFAIL flag.  However, if we are allocating folios in
         * a loop (to adjust global huge page counts) and previous allocation
         * failed, do not continue to try hard on the same node.  Use the
         * node_alloc_noretry bitmap to manage this state information.
@@ -2203,43 +2196,42 @@ static struct folio *alloc_buddy_hugetlb_folio(struct hstate *h,
        if (nid == NUMA_NO_NODE)
                nid = numa_mem_id();
 retry:
-       page = __alloc_pages(gfp_mask, order, nid, nmask);
+       folio = __folio_alloc(gfp_mask, order, nid, nmask);
 
-       /* Freeze head page */
-       if (page && !page_ref_freeze(page, 1)) {
-               __free_pages(page, order);
+       if (folio && !folio_ref_freeze(folio, 1)) {
+               folio_put(folio);
                if (retry) {    /* retry once */
                        retry = false;
                        goto retry;
                }
                /* WOW!  twice in a row. */
-               pr_warn("HugeTLB head page unexpected inflated ref count\n");
-               page = NULL;
+               pr_warn("HugeTLB unexpected inflated folio ref count\n");
+               folio = NULL;
        }
 
        /*
-        * If we did not specify __GFP_RETRY_MAYFAIL, but still got a page this
-        * indicates an overall state change.  Clear bit so that we resume
-        * normal 'try hard' allocations.
+        * If we did not specify __GFP_RETRY_MAYFAIL, but still got a
+        * folio this indicates an overall state change.  Clear bit so
+        * that we resume normal 'try hard' allocations.
         */
-       if (node_alloc_noretry && page && !alloc_try_hard)
+       if (node_alloc_noretry && folio && !alloc_try_hard)
                node_clear(nid, *node_alloc_noretry);
 
        /*
-        * If we tried hard to get a page but failed, set bit so that
+        * If we tried hard to get a folio but failed, set bit so that
         * subsequent attempts will not try as hard until there is an
         * overall state change.
         */
-       if (node_alloc_noretry && !page && alloc_try_hard)
+       if (node_alloc_noretry && !folio && alloc_try_hard)
                node_set(nid, *node_alloc_noretry);
 
-       if (!page) {
+       if (!folio) {
                __count_vm_event(HTLB_BUDDY_PGALLOC_FAIL);
                return NULL;
        }
 
        __count_vm_event(HTLB_BUDDY_PGALLOC);
-       return page_folio(page);
+       return folio;
 }
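
The comments above describe the per-node "try hard" policy; condensed into one helper it looks roughly like this (editorial sketch, the helper name is invented, the bitmap and flag are the ones used by alloc_buddy_hugetlb_folio()):

static gfp_t hugetlb_alloc_gfp_sketch(gfp_t gfp_mask, int nid,
                                      nodemask_t *node_alloc_noretry)
{
        bool alloc_try_hard = true;

        /* A recent hard failure on this node: do not retry hard again. */
        if (node_alloc_noretry && node_isset(nid, *node_alloc_noretry))
                alloc_try_hard = false;

        if (alloc_try_hard)
                gfp_mask |= __GFP_RETRY_MAYFAIL;

        return gfp_mask;
}

After the allocation attempt, the hunk above clears the bit on an unexpected success and sets it on a hard failure, so the per-node state self-corrects as memory pressure changes.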
 
 static struct folio *__alloc_fresh_hugetlb_folio(struct hstate *h,
@@ -2385,8 +2377,8 @@ static struct folio *remove_pool_hugetlb_folio(struct hstate *h,
 }
 
 /*
- * Dissolve a given free hugepage into free buddy pages. This function does
- * nothing for in-use hugepages and non-hugepages.
+ * Dissolve a given free hugetlb folio into free buddy pages. This function
+ * does nothing for in-use hugetlb folios and non-hugetlb folios.
  * This function returns values like below:
  *
  *  -ENOMEM: failed to allocate vmemmap pages to free the freed hugepages
@@ -2398,10 +2390,9 @@ static struct folio *remove_pool_hugetlb_folio(struct hstate *h,
  *       0:  successfully dissolved free hugepages or the page is not a
  *           hugepage (considered as already dissolved)
  */
-int dissolve_free_huge_page(struct page *page)
+int dissolve_free_hugetlb_folio(struct folio *folio)
 {
        int rc = -EBUSY;
-       struct folio *folio = page_folio(page);
 
 retry:
        /* Not to disrupt normal path by vainly holding hugetlb_lock */
@@ -2478,13 +2469,13 @@ out:
  * make specified memory blocks removable from the system.
  * Note that this will dissolve a free gigantic hugepage completely, if any
  * part of it lies within the given range.
- * Also note that if dissolve_free_huge_page() returns with an error, all
- * free hugepages that were dissolved before that error are lost.
+ * Also note that if dissolve_free_hugetlb_folio() returns with an error, all
+ * free hugetlb folios that were dissolved before that error are lost.
  */
-int dissolve_free_huge_pages(unsigned long start_pfn, unsigned long end_pfn)
+int dissolve_free_hugetlb_folios(unsigned long start_pfn, unsigned long end_pfn)
 {
        unsigned long pfn;
-       struct page *page;
+       struct folio *folio;
        int rc = 0;
        unsigned int order;
        struct hstate *h;
@@ -2497,8 +2488,8 @@ int dissolve_free_huge_pages(unsigned long start_pfn, unsigned long end_pfn)
                order = min(order, huge_page_order(h));
 
        for (pfn = start_pfn; pfn < end_pfn; pfn += 1 << order) {
-               page = pfn_to_page(pfn);
-               rc = dissolve_free_huge_page(page);
+               folio = pfn_folio(pfn);
+               rc = dissolve_free_hugetlb_folio(folio);
                if (rc)
                        break;
        }
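
A hedged sketch of a memory-offlining style caller, to illustrate the return-value contract documented above; the function name is illustrative and the real offlining path does considerably more.

static int offline_range_sketch(unsigned long start_pfn, unsigned long end_pfn)
{
        int rc;

        /* Free hugetlb folios overlapping the range are dissolved into buddy pages. */
        rc = dissolve_free_hugetlb_folios(start_pfn, end_pfn);
        if (rc) {
                /*
                 * -ENOMEM or -EBUSY; note there is no rollback, folios
                 * dissolved before the failure stay dissolved.
                 */
                return rc;
        }
        return 0;       /* no free hugetlb folios remain in the range */
}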
@@ -2605,7 +2596,7 @@ struct folio *alloc_buddy_hugetlb_folio_with_mpol(struct hstate *h,
 
 /* folio migration callback function */
 struct folio *alloc_hugetlb_folio_nodemask(struct hstate *h, int preferred_nid,
-               nodemask_t *nmask, gfp_t gfp_mask)
+               nodemask_t *nmask, gfp_t gfp_mask, bool allow_alloc_fallback)
 {
        spin_lock_irq(&hugetlb_lock);
        if (available_huge_pages(h)) {
@@ -2620,6 +2611,10 @@ struct folio *alloc_hugetlb_folio_nodemask(struct hstate *h, int preferred_nid,
        }
        spin_unlock_irq(&hugetlb_lock);
 
+       /* We cannot fallback to other nodes, as we could break the per-node pool. */
+       if (!allow_alloc_fallback)
+               gfp_mask |= __GFP_THISNODE;
+
        return alloc_migrate_hugetlb_folio(h, gfp_mask, preferred_nid, nmask);
 }
 
@@ -5032,7 +5027,6 @@ static struct ctl_table hugetlb_table[] = {
                .mode           = 0644,
                .proc_handler   = hugetlb_overcommit_handler,
        },
-       { }
 };
 
 static void hugetlb_sysctl_init(void)
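
The dropped { } terminator works because the table is registered by explicit size rather than by sentinel; assuming hugetlb_sysctl_init() (whose signature is the context line above, body not shown in this diff) uses the usual registration helper, it amounts to:

        register_sysctl_init("vm", hugetlb_table);

register_sysctl_init() passes ARRAY_SIZE(hugetlb_table) down to the registration core, so no empty terminator entry is needed.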
@@ -5923,19 +5917,18 @@ static void unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma,
  * cannot race with other handlers or page migration.
  * Keep the pte_same checks anyway to make transition from the mutex easier.
  */
-static vm_fault_t hugetlb_wp(struct mm_struct *mm, struct vm_area_struct *vma,
-                      unsigned long address, pte_t *ptep, unsigned int flags,
-                      struct folio *pagecache_folio, spinlock_t *ptl,
+static vm_fault_t hugetlb_wp(struct folio *pagecache_folio,
                       struct vm_fault *vmf)
 {
-       const bool unshare = flags & FAULT_FLAG_UNSHARE;
-       pte_t pte = huge_ptep_get(ptep);
+       struct vm_area_struct *vma = vmf->vma;
+       struct mm_struct *mm = vma->vm_mm;
+       const bool unshare = vmf->flags & FAULT_FLAG_UNSHARE;
+       pte_t pte = huge_ptep_get(vmf->pte);
        struct hstate *h = hstate_vma(vma);
        struct folio *old_folio;
        struct folio *new_folio;
        int outside_reserve = 0;
        vm_fault_t ret = 0;
-       unsigned long haddr = address & huge_page_mask(h);
        struct mmu_notifier_range range;
 
        /*
@@ -5958,7 +5951,7 @@ static vm_fault_t hugetlb_wp(struct mm_struct *mm, struct vm_area_struct *vma,
 
        /* Let's take out MAP_SHARED mappings first. */
        if (vma->vm_flags & VM_MAYSHARE) {
-               set_huge_ptep_writable(vma, haddr, ptep);
+               set_huge_ptep_writable(vma, vmf->address, vmf->pte);
                return 0;
        }
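
For readers following the conversion of hugetlb_wp(), hugetlb_no_page() and hugetlb_fault() to struct vm_fault, the old locals and parameters map onto vmf fields as below (editorial summary, assembled from the surrounding hunks):

/*
 *   mm              -> vmf->vma->vm_mm
 *   vma             -> vmf->vma
 *   address         -> vmf->real_address   (original faulting address)
 *   haddr           -> vmf->address        (huge-page-aligned address)
 *   ptep            -> vmf->pte
 *   ptl             -> vmf->ptl
 *   entry / old_pte -> vmf->orig_pte
 *   idx             -> vmf->pgoff
 *   flags           -> vmf->flags
 */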
 
@@ -5970,6 +5963,13 @@ retry_avoidcopy:
        /*
         * If no-one else is actually using this page, we're the exclusive
         * owner and can reuse this page.
+        *
+        * Note that we don't rely on the (safer) folio refcount here, because
+        * copying the hugetlb folio when there are unexpected (temporary)
+        * folio references could harm simple fork()+exit() users when
+        * we run out of free hugetlb folios: we would have to kill processes
+        * in scenarios that used to work. As a side effect, there can still
+        * be leaks between processes, for example, with FOLL_GET users.
         */
        if (folio_mapcount(old_folio) == 1 && folio_test_anon(old_folio)) {
                if (!PageAnonExclusive(&old_folio->page)) {
@@ -5977,7 +5977,7 @@ retry_avoidcopy:
                        SetPageAnonExclusive(&old_folio->page);
                }
                if (likely(!unshare))
-                       set_huge_ptep_writable(vma, haddr, ptep);
+                       set_huge_ptep_writable(vma, vmf->address, vmf->pte);
 
                delayacct_wpcopy_end();
                return 0;
@@ -6004,8 +6004,8 @@ retry_avoidcopy:
         * Drop page table lock as buddy allocator may be called. It will
         * be acquired again before returning to the caller, as expected.
         */
-       spin_unlock(ptl);
-       new_folio = alloc_hugetlb_folio(vma, haddr, outside_reserve);
+       spin_unlock(vmf->ptl);
+       new_folio = alloc_hugetlb_folio(vma, vmf->address, outside_reserve);
 
        if (IS_ERR(new_folio)) {
                /*
@@ -6030,19 +6030,21 @@ retry_avoidcopy:
                         *
                         * Reacquire both after unmap operation.
                         */
-                       idx = vma_hugecache_offset(h, vma, haddr);
+                       idx = vma_hugecache_offset(h, vma, vmf->address);
                        hash = hugetlb_fault_mutex_hash(mapping, idx);
                        hugetlb_vma_unlock_read(vma);
                        mutex_unlock(&hugetlb_fault_mutex_table[hash]);
 
-                       unmap_ref_private(mm, vma, &old_folio->page, haddr);
+                       unmap_ref_private(mm, vma, &old_folio->page,
+                                       vmf->address);
 
                        mutex_lock(&hugetlb_fault_mutex_table[hash]);
                        hugetlb_vma_lock_read(vma);
-                       spin_lock(ptl);
-                       ptep = hugetlb_walk(vma, haddr, huge_page_size(h));
-                       if (likely(ptep &&
-                                  pte_same(huge_ptep_get(ptep), pte)))
+                       spin_lock(vmf->ptl);
+                       vmf->pte = hugetlb_walk(vma, vmf->address,
+                                       huge_page_size(h));
+                       if (likely(vmf->pte &&
+                                  pte_same(huge_ptep_get(vmf->pte), pte)))
                                goto retry_avoidcopy;
                        /*
                         * race occurs while re-acquiring page table
@@ -6064,37 +6066,38 @@ retry_avoidcopy:
        if (unlikely(ret))
                goto out_release_all;
 
-       if (copy_user_large_folio(new_folio, old_folio, address, vma)) {
-               ret = VM_FAULT_HWPOISON_LARGE;
+       if (copy_user_large_folio(new_folio, old_folio, vmf->real_address, vma)) {
+               ret = VM_FAULT_HWPOISON_LARGE | VM_FAULT_SET_HINDEX(hstate_index(h));
                goto out_release_all;
        }
        __folio_mark_uptodate(new_folio);
 
-       mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, haddr,
-                               haddr + huge_page_size(h));
+       mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, vmf->address,
+                               vmf->address + huge_page_size(h));
        mmu_notifier_invalidate_range_start(&range);
 
        /*
         * Retake the page table lock to check for racing updates
         * before the page tables are altered
         */
-       spin_lock(ptl);
-       ptep = hugetlb_walk(vma, haddr, huge_page_size(h));
-       if (likely(ptep && pte_same(huge_ptep_get(ptep), pte))) {
+       spin_lock(vmf->ptl);
+       vmf->pte = hugetlb_walk(vma, vmf->address, huge_page_size(h));
+       if (likely(vmf->pte && pte_same(huge_ptep_get(vmf->pte), pte))) {
                pte_t newpte = make_huge_pte(vma, &new_folio->page, !unshare);
 
                /* Break COW or unshare */
-               huge_ptep_clear_flush(vma, haddr, ptep);
+               huge_ptep_clear_flush(vma, vmf->address, vmf->pte);
                hugetlb_remove_rmap(old_folio);
-               hugetlb_add_new_anon_rmap(new_folio, vma, haddr);
+               hugetlb_add_new_anon_rmap(new_folio, vma, vmf->address);
                if (huge_pte_uffd_wp(pte))
                        newpte = huge_pte_mkuffd_wp(newpte);
-               set_huge_pte_at(mm, haddr, ptep, newpte, huge_page_size(h));
+               set_huge_pte_at(mm, vmf->address, vmf->pte, newpte,
+                               huge_page_size(h));
                folio_set_hugetlb_migratable(new_folio);
                /* Make the old page be freed below */
                new_folio = old_folio;
        }
-       spin_unlock(ptl);
+       spin_unlock(vmf->ptl);
        mmu_notifier_invalidate_range_end(&range);
 out_release_all:
        /*
@@ -6102,12 +6105,12 @@ out_release_all:
         * unshare)
         */
        if (new_folio != old_folio)
-               restore_reserve_on_error(h, vma, haddr, new_folio);
+               restore_reserve_on_error(h, vma, vmf->address, new_folio);
        folio_put(new_folio);
 out_release_old:
        folio_put(old_folio);
 
-       spin_lock(ptl); /* Caller expects lock to be held */
+       spin_lock(vmf->ptl); /* Caller expects lock to be held */
 
        delayacct_wpcopy_end();
        return ret;
@@ -6116,8 +6119,8 @@ out_release_old:
 /*
  * Return whether there is a pagecache page to back given address within VMA.
  */
-static bool hugetlbfs_pagecache_present(struct hstate *h,
-                       struct vm_area_struct *vma, unsigned long address)
+bool hugetlbfs_pagecache_present(struct hstate *h,
+                                struct vm_area_struct *vma, unsigned long address)
 {
        struct address_space *mapping = vma->vm_file->f_mapping;
        pgoff_t idx = linear_page_index(vma, address);
@@ -6193,23 +6196,19 @@ static bool hugetlb_pte_stable(struct hstate *h, struct mm_struct *mm,
        return same;
 }
 
-static vm_fault_t hugetlb_no_page(struct mm_struct *mm,
-                       struct vm_area_struct *vma,
-                       struct address_space *mapping, pgoff_t idx,
-                       unsigned long address, pte_t *ptep,
-                       pte_t old_pte, unsigned int flags,
+static vm_fault_t hugetlb_no_page(struct address_space *mapping,
                        struct vm_fault *vmf)
 {
+       struct vm_area_struct *vma = vmf->vma;
+       struct mm_struct *mm = vma->vm_mm;
        struct hstate *h = hstate_vma(vma);
        vm_fault_t ret = VM_FAULT_SIGBUS;
        int anon_rmap = 0;
        unsigned long size;
        struct folio *folio;
        pte_t new_pte;
-       spinlock_t *ptl;
-       unsigned long haddr = address & huge_page_mask(h);
        bool new_folio, new_pagecache_folio = false;
-       u32 hash = hugetlb_fault_mutex_hash(mapping, idx);
+       u32 hash = hugetlb_fault_mutex_hash(mapping, vmf->pgoff);
 
        /*
         * Currently, we are forced to kill the process in the event the
@@ -6228,10 +6227,10 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm,
         * before we get page_table_lock.
         */
        new_folio = false;
-       folio = filemap_lock_hugetlb_folio(h, mapping, idx);
+       folio = filemap_lock_hugetlb_folio(h, mapping, vmf->pgoff);
        if (IS_ERR(folio)) {
                size = i_size_read(mapping->host) >> huge_page_shift(h);
-               if (idx >= size)
+               if (vmf->pgoff >= size)
                        goto out;
                /* Check for page in userfault range */
                if (userfaultfd_missing(vma)) {
@@ -6252,7 +6251,7 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm,
                         * never happen on the page after UFFDIO_COPY has
                         * correctly installed the page and returned.
                         */
-                       if (!hugetlb_pte_stable(h, mm, ptep, old_pte)) {
+                       if (!hugetlb_pte_stable(h, mm, vmf->pte, vmf->orig_pte)) {
                                ret = 0;
                                goto out;
                        }
@@ -6267,7 +6266,7 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm,
                                goto out;
                }
 
-               folio = alloc_hugetlb_folio(vma, haddr, 0);
+               folio = alloc_hugetlb_folio(vma, vmf->address, 0);
                if (IS_ERR(folio)) {
                        /*
                         * Returning error will result in faulting task being
@@ -6281,18 +6280,20 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm,
                         * here.  Before returning error, get ptl and make
                         * sure there really is no pte entry.
                         */
-                       if (hugetlb_pte_stable(h, mm, ptep, old_pte))
+                       if (hugetlb_pte_stable(h, mm, vmf->pte, vmf->orig_pte))
                                ret = vmf_error(PTR_ERR(folio));
                        else
                                ret = 0;
                        goto out;
                }
-               clear_huge_page(&folio->page, address, pages_per_huge_page(h));
+               clear_huge_page(&folio->page, vmf->real_address,
+                               pages_per_huge_page(h));
                __folio_mark_uptodate(folio);
                new_folio = true;
 
                if (vma->vm_flags & VM_MAYSHARE) {
-                       int err = hugetlb_add_to_page_cache(folio, mapping, idx);
+                       int err = hugetlb_add_to_page_cache(folio, mapping,
+                                                       vmf->pgoff);
                        if (err) {
                                /*
                                 * err can't be -EEXIST which implies someone
@@ -6301,7 +6302,8 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm,
                                 * to the page cache. So it's safe to call
                                 * restore_reserve_on_error() here.
                                 */
-                               restore_reserve_on_error(h, vma, haddr, folio);
+                               restore_reserve_on_error(h, vma, vmf->address,
+                                                       folio);
                                folio_put(folio);
                                ret = VM_FAULT_SIGBUS;
                                goto out;
@@ -6328,7 +6330,7 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm,
                        folio_unlock(folio);
                        folio_put(folio);
                        /* See comment in userfaultfd_missing() block above */
-                       if (!hugetlb_pte_stable(h, mm, ptep, old_pte)) {
+                       if (!hugetlb_pte_stable(h, mm, vmf->pte, vmf->orig_pte)) {
                                ret = 0;
                                goto out;
                        }
@@ -6343,23 +6345,23 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm,
         * any allocations necessary to record that reservation occur outside
         * the spinlock.
         */
-       if ((flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) {
-               if (vma_needs_reservation(h, vma, haddr) < 0) {
+       if ((vmf->flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) {
+               if (vma_needs_reservation(h, vma, vmf->address) < 0) {
                        ret = VM_FAULT_OOM;
                        goto backout_unlocked;
                }
                /* Just decrements count, does not deallocate */
-               vma_end_reservation(h, vma, haddr);
+               vma_end_reservation(h, vma, vmf->address);
        }
 
-       ptl = huge_pte_lock(h, mm, ptep);
+       vmf->ptl = huge_pte_lock(h, mm, vmf->pte);
        ret = 0;
        /* If pte changed from under us, retry */
-       if (!pte_same(huge_ptep_get(ptep), old_pte))
+       if (!pte_same(huge_ptep_get(vmf->pte), vmf->orig_pte))
                goto backout;
 
        if (anon_rmap)
-               hugetlb_add_new_anon_rmap(folio, vma, haddr);
+               hugetlb_add_new_anon_rmap(folio, vma, vmf->address);
        else
                hugetlb_add_file_rmap(folio);
        new_pte = make_huge_pte(vma, &folio->page, ((vma->vm_flags & VM_WRITE)
@@ -6368,17 +6370,17 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm,
         * If this pte was previously wr-protected, keep it wr-protected even
         * if populated.
         */
-       if (unlikely(pte_marker_uffd_wp(old_pte)))
+       if (unlikely(pte_marker_uffd_wp(vmf->orig_pte)))
                new_pte = huge_pte_mkuffd_wp(new_pte);
-       set_huge_pte_at(mm, haddr, ptep, new_pte, huge_page_size(h));
+       set_huge_pte_at(mm, vmf->address, vmf->pte, new_pte, huge_page_size(h));
 
        hugetlb_count_add(pages_per_huge_page(h), mm);
-       if ((flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) {
+       if ((vmf->flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) {
                /* Optimization, do the COW without a second fault */
-               ret = hugetlb_wp(mm, vma, address, ptep, flags, folio, ptl, vmf);
+               ret = hugetlb_wp(folio, vmf);
        }
 
-       spin_unlock(ptl);
+       spin_unlock(vmf->ptl);
 
        /*
         * Only set hugetlb_migratable in newly allocated pages.  Existing pages
@@ -6395,10 +6397,10 @@ out:
        return ret;
 
 backout:
-       spin_unlock(ptl);
+       spin_unlock(vmf->ptl);
 backout_unlocked:
        if (new_folio && !new_pagecache_folio)
-               restore_reserve_on_error(h, vma, haddr, folio);
+               restore_reserve_on_error(h, vma, vmf->address, folio);
 
        folio_unlock(folio);
        folio_put(folio);
@@ -6432,8 +6434,6 @@ u32 hugetlb_fault_mutex_hash(struct address_space *mapping, pgoff_t idx)
 vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
                        unsigned long address, unsigned int flags)
 {
-       pte_t *ptep, entry;
-       spinlock_t *ptl;
        vm_fault_t ret;
        u32 hash;
        struct folio *folio = NULL;
@@ -6441,13 +6441,13 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
        struct hstate *h = hstate_vma(vma);
        struct address_space *mapping;
        int need_wait_lock = 0;
-       unsigned long haddr = address & huge_page_mask(h);
        struct vm_fault vmf = {
                .vma = vma,
-               .address = haddr,
+               .address = address & huge_page_mask(h),
                .real_address = address,
                .flags = flags,
-               .pgoff = vma_hugecache_offset(h, vma, haddr),
+               .pgoff = vma_hugecache_offset(h, vma,
+                               address & huge_page_mask(h)),
                /* TODO: Track hugetlb faults using vm_fault */
 
                /*
@@ -6467,25 +6467,26 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 
        /*
         * Acquire vma lock before calling huge_pte_alloc and hold
-        * until finished with ptep.  This prevents huge_pmd_unshare from
-        * being called elsewhere and making the ptep no longer valid.
+        * until finished with vmf.pte.  This prevents huge_pmd_unshare from
+        * being called elsewhere and making the vmf.pte no longer valid.
         */
        hugetlb_vma_lock_read(vma);
-       ptep = huge_pte_alloc(mm, vma, haddr, huge_page_size(h));
-       if (!ptep) {
+       vmf.pte = huge_pte_alloc(mm, vma, vmf.address, huge_page_size(h));
+       if (!vmf.pte) {
                hugetlb_vma_unlock_read(vma);
                mutex_unlock(&hugetlb_fault_mutex_table[hash]);
                return VM_FAULT_OOM;
        }
 
-       entry = huge_ptep_get(ptep);
-       if (huge_pte_none_mostly(entry)) {
-               if (is_pte_marker(entry)) {
+       vmf.orig_pte = huge_ptep_get(vmf.pte);
+       if (huge_pte_none_mostly(vmf.orig_pte)) {
+               if (is_pte_marker(vmf.orig_pte)) {
                        pte_marker marker =
-                               pte_marker_get(pte_to_swp_entry(entry));
+                               pte_marker_get(pte_to_swp_entry(vmf.orig_pte));
 
                        if (marker & PTE_MARKER_POISONED) {
-                               ret = VM_FAULT_HWPOISON_LARGE;
+                               ret = VM_FAULT_HWPOISON_LARGE |
+                                     VM_FAULT_SET_HINDEX(hstate_index(h));
                                goto out_mutex;
                        }
                }
@@ -6496,21 +6497,20 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
                 * hugetlb_no_page will drop vma lock and hugetlb fault
                 * mutex internally, which make us return immediately.
                 */
-               return hugetlb_no_page(mm, vma, mapping, vmf.pgoff, address,
-                                       ptep, entry, flags, &vmf);
+               return hugetlb_no_page(mapping, &vmf);
        }
 
        ret = 0;
 
        /*
-        * entry could be a migration/hwpoison entry at this point, so this
-        * check prevents the kernel from going below assuming that we have
-        * an active hugepage in pagecache. This goto expects the 2nd page
-        * fault, and is_hugetlb_entry_(migration|hwpoisoned) check will
-        * properly handle it.
+        * vmf.orig_pte could be a migration/hwpoison entry at this
+        * point, so this check prevents the kernel from going below assuming
+        * that we have an active hugepage in pagecache. This goto expects
+        * the 2nd page fault, and is_hugetlb_entry_(migration|hwpoisoned)
+        * check will properly handle it.
         */
-       if (!pte_present(entry)) {
-               if (unlikely(is_hugetlb_entry_migration(entry))) {
+       if (!pte_present(vmf.orig_pte)) {
+               if (unlikely(is_hugetlb_entry_migration(vmf.orig_pte))) {
                        /*
                         * Release the hugetlb fault lock now, but retain
                         * the vma lock, because it is needed to guard the
@@ -6519,9 +6519,9 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
                         * be released there.
                         */
                        mutex_unlock(&hugetlb_fault_mutex_table[hash]);
-                       migration_entry_wait_huge(vma, ptep);
+                       migration_entry_wait_huge(vma, vmf.pte);
                        return 0;
-               } else if (unlikely(is_hugetlb_entry_hwpoisoned(entry)))
+               } else if (unlikely(is_hugetlb_entry_hwpoisoned(vmf.orig_pte)))
                        ret = VM_FAULT_HWPOISON_LARGE |
                            VM_FAULT_SET_HINDEX(hstate_index(h));
                goto out_mutex;
@@ -6535,13 +6535,13 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
         * determine if a reservation has been consumed.
         */
        if ((flags & (FAULT_FLAG_WRITE|FAULT_FLAG_UNSHARE)) &&
-           !(vma->vm_flags & VM_MAYSHARE) && !huge_pte_write(entry)) {
-               if (vma_needs_reservation(h, vma, haddr) < 0) {
+           !(vma->vm_flags & VM_MAYSHARE) && !huge_pte_write(vmf.orig_pte)) {
+               if (vma_needs_reservation(h, vma, vmf.address) < 0) {
                        ret = VM_FAULT_OOM;
                        goto out_mutex;
                }
                /* Just decrements count, does not deallocate */
-               vma_end_reservation(h, vma, haddr);
+               vma_end_reservation(h, vma, vmf.address);
 
                pagecache_folio = filemap_lock_hugetlb_folio(h, mapping,
                                                             vmf.pgoff);
@@ -6549,17 +6549,17 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
                        pagecache_folio = NULL;
        }
 
-       ptl = huge_pte_lock(h, mm, ptep);
+       vmf.ptl = huge_pte_lock(h, mm, vmf.pte);
 
        /* Check for a racing update before calling hugetlb_wp() */
-       if (unlikely(!pte_same(entry, huge_ptep_get(ptep))))
+       if (unlikely(!pte_same(vmf.orig_pte, huge_ptep_get(vmf.pte))))
                goto out_ptl;
 
        /* Handle userfault-wp first, before trying to lock more pages */
-       if (userfaultfd_wp(vma) && huge_pte_uffd_wp(huge_ptep_get(ptep)) &&
-           (flags & FAULT_FLAG_WRITE) && !huge_pte_write(entry)) {
+       if (userfaultfd_wp(vma) && huge_pte_uffd_wp(huge_ptep_get(vmf.pte)) &&
+           (flags & FAULT_FLAG_WRITE) && !huge_pte_write(vmf.orig_pte)) {
                if (!userfaultfd_wp_async(vma)) {
-                       spin_unlock(ptl);
+                       spin_unlock(vmf.ptl);
                        if (pagecache_folio) {
                                folio_unlock(pagecache_folio);
                                folio_put(pagecache_folio);
@@ -6569,18 +6569,18 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
                        return handle_userfault(&vmf, VM_UFFD_WP);
                }
 
-               entry = huge_pte_clear_uffd_wp(entry);
-               set_huge_pte_at(mm, haddr, ptep, entry,
+               vmf.orig_pte = huge_pte_clear_uffd_wp(vmf.orig_pte);
+               set_huge_pte_at(mm, vmf.address, vmf.pte, vmf.orig_pte,
                                huge_page_size(hstate_vma(vma)));
                /* Fallthrough to CoW */
        }
 
        /*
-        * hugetlb_wp() requires page locks of pte_page(entry) and
+        * hugetlb_wp() requires page locks of pte_page(vmf.orig_pte) and
         * pagecache_folio, so here we need to take the former one
         * when folio != pagecache_folio or !pagecache_folio.
         */
-       folio = page_folio(pte_page(entry));
+       folio = page_folio(pte_page(vmf.orig_pte));
        if (folio != pagecache_folio)
                if (!folio_trylock(folio)) {
                        need_wait_lock = 1;
@@ -6590,24 +6590,23 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
        folio_get(folio);
 
        if (flags & (FAULT_FLAG_WRITE|FAULT_FLAG_UNSHARE)) {
-               if (!huge_pte_write(entry)) {
-                       ret = hugetlb_wp(mm, vma, address, ptep, flags,
-                                        pagecache_folio, ptl, &vmf);
+               if (!huge_pte_write(vmf.orig_pte)) {
+                       ret = hugetlb_wp(pagecache_folio, &vmf);
                        goto out_put_page;
                } else if (likely(flags & FAULT_FLAG_WRITE)) {
-                       entry = huge_pte_mkdirty(entry);
+                       vmf.orig_pte = huge_pte_mkdirty(vmf.orig_pte);
                }
        }
-       entry = pte_mkyoung(entry);
-       if (huge_ptep_set_access_flags(vma, haddr, ptep, entry,
+       vmf.orig_pte = pte_mkyoung(vmf.orig_pte);
+       if (huge_ptep_set_access_flags(vma, vmf.address, vmf.pte, vmf.orig_pte,
                                                flags & FAULT_FLAG_WRITE))
-               update_mmu_cache(vma, haddr, ptep);
+               update_mmu_cache(vma, vmf.address, vmf.pte);
 out_put_page:
        if (folio != pagecache_folio)
                folio_unlock(folio);
        folio_put(folio);
 out_ptl:
-       spin_unlock(ptl);
+       spin_unlock(vmf.ptl);
 
        if (pagecache_folio) {
                folio_unlock(pagecache_folio);
@@ -6643,7 +6642,13 @@ static struct folio *alloc_hugetlb_folio_vma(struct hstate *h,
 
        gfp_mask = htlb_alloc_mask(h);
        node = huge_node(vma, address, gfp_mask, &mpol, &nodemask);
-       folio = alloc_hugetlb_folio_nodemask(h, node, nodemask, gfp_mask);
+       /*
+        * This is used to allocate a temporary hugetlb to hold the copied
+        * content, which will then be copied again to the final hugetlb
+        * consuming a reservation. Set the alloc_fallback to false to indicate
+        * that breaking the per-node hugetlb pool is not allowed in this case.
+        */
+       folio = alloc_hugetlb_folio_nodemask(h, node, nodemask, gfp_mask, false);
        mpol_cond_put(mpol);
 
        return folio;
@@ -6873,77 +6878,6 @@ out_release_nounlock:
 }
 #endif /* CONFIG_USERFAULTFD */
 
-struct page *hugetlb_follow_page_mask(struct vm_area_struct *vma,
-                                     unsigned long address, unsigned int flags,
-                                     unsigned int *page_mask)
-{
-       struct hstate *h = hstate_vma(vma);
-       struct mm_struct *mm = vma->vm_mm;
-       unsigned long haddr = address & huge_page_mask(h);
-       struct page *page = NULL;
-       spinlock_t *ptl;
-       pte_t *pte, entry;
-       int ret;
-
-       hugetlb_vma_lock_read(vma);
-       pte = hugetlb_walk(vma, haddr, huge_page_size(h));
-       if (!pte)
-               goto out_unlock;
-
-       ptl = huge_pte_lock(h, mm, pte);
-       entry = huge_ptep_get(pte);
-       if (pte_present(entry)) {
-               page = pte_page(entry);
-
-               if (!huge_pte_write(entry)) {
-                       if (flags & FOLL_WRITE) {
-                               page = NULL;
-                               goto out;
-                       }
-
-                       if (gup_must_unshare(vma, flags, page)) {
-                               /* Tell the caller to do unsharing */
-                               page = ERR_PTR(-EMLINK);
-                               goto out;
-                       }
-               }
-
-               page = nth_page(page, ((address & ~huge_page_mask(h)) >> PAGE_SHIFT));
-
-               /*
-                * Note that page may be a sub-page, and with vmemmap
-                * optimizations the page struct may be read only.
-                * try_grab_page() will increase the ref count on the
-                * head page, so this will be OK.
-                *
-                * try_grab_page() should always be able to get the page here,
-                * because we hold the ptl lock and have verified pte_present().
-                */
-               ret = try_grab_page(page, flags);
-
-               if (WARN_ON_ONCE(ret)) {
-                       page = ERR_PTR(ret);
-                       goto out;
-               }
-
-               *page_mask = (1U << huge_page_order(h)) - 1;
-       }
-out:
-       spin_unlock(ptl);
-out_unlock:
-       hugetlb_vma_unlock_read(vma);
-
-       /*
-        * Fixup retval for dump requests: if pagecache doesn't exist,
-        * don't try to allocate a new page but just skip it.
-        */
-       if (!page && (flags & FOLL_DUMP) &&
-           !hugetlbfs_pagecache_present(h, vma, address))
-               page = ERR_PTR(-EFAULT);
-
-       return page;
-}
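
An editorial connection: removing hugetlb_follow_page_mask() is also why hugetlbfs_pagecache_present() loses its static qualifier earlier in this diff. Its FOLL_DUMP fixup, condensed below from the removed body, presumably has to be performed by the generic follow-page/GUP code outside this file from now on.

        /* Condensed from the removed function: dump requests for a hole with
         * no pagecache backing are skipped instead of being faulted in. */
        if (!page && (flags & FOLL_DUMP) &&
            !hugetlbfs_pagecache_present(h, vma, address))
                page = ERR_PTR(-EFAULT);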
-
 long hugetlb_change_protection(struct vm_area_struct *vma,
                unsigned long address, unsigned long end,
                pgprot_t newprot, unsigned long cp_flags)
@@ -7867,9 +7801,9 @@ void __init hugetlb_cma_reserve(int order)
                 * huge page demotion.
                 */
                res = cma_declare_contiguous_nid(0, size, 0,
-                                               PAGE_SIZE << HUGETLB_PAGE_ORDER,
-                                                0, false, name,
-                                                &hugetlb_cma[nid], nid);
+                                       PAGE_SIZE << order,
+                                       HUGETLB_PAGE_ORDER, false, name,
+                                       &hugetlb_cma[nid], nid);
                if (res) {
                        pr_warn("hugetlb_cma: reservation failed: err %d, node %d",
                                res, nid);
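
A worked example of the new alignment arithmetic (editorial; x86-64 values assumed):

/*
 *   order         = PUD_SHIFT - PAGE_SHIFT = 18   (1 GiB gigantic pages)
 *   alignment     = PAGE_SIZE << order     = 1 GiB
 *   order_per_bit = HUGETLB_PAGE_ORDER     = 9    (one CMA bitmap bit per 2 MiB)
 *
 * The reserved area stays aligned for gigantic pages, while the bitmap
 * granularity matches the PMD-sized pages produced by huge page demotion,
 * as the comment above intends.
 */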