Merge tag 'mm-stable-2024-05-17-19-19' of git://git.kernel.org/pub/scm/linux/kernel...
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index ce7be5c244429f71bc686399889fba7f4b6e1cf8..6be78e7d4f6e058ef1c72e54aa8fc2cadc76cc06 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -1517,7 +1517,7 @@ static void __destroy_compound_gigantic_folio(struct folio *folio,
        struct page *p;
 
        atomic_set(&folio->_entire_mapcount, 0);
-       atomic_set(&folio->_nr_pages_mapped, 0);
+       atomic_set(&folio->_large_mapcount, 0);
        atomic_set(&folio->_pincount, 0);
 
        for (i = 1; i < nr_pages; i++) {
@@ -1619,19 +1619,11 @@ static inline void destroy_compound_gigantic_folio(struct folio *folio,
                                                unsigned int order) { }
 #endif
 
-static inline void __clear_hugetlb_destructor(struct hstate *h,
-                                               struct folio *folio)
-{
-       lockdep_assert_held(&hugetlb_lock);
-
-       __folio_clear_hugetlb(folio);
-}
-
 /*
  * Remove hugetlb folio from lists.
- * If vmemmap exists for the folio, update dtor so that the folio appears
- * as just a compound page.  Otherwise, wait until after allocating vmemmap
- * to update dtor.
+ * If vmemmap exists for the folio, clear the hugetlb flag so that the
+ * folio appears as just a compound page.  Otherwise, wait until after
+ * allocating vmemmap to clear the flag.
  *
  * A reference is held on the folio, except in the case of demote.
  *
@@ -1662,12 +1654,12 @@ static void __remove_hugetlb_folio(struct hstate *h, struct folio *folio,
        }
 
        /*
-        * We can only clear the hugetlb destructor after allocating vmemmap
+        * We can only clear the hugetlb flag after allocating vmemmap
         * pages.  Otherwise, someone (memory error handling) may try to write
         * to tail struct pages.
         */
        if (!folio_test_hugetlb_vmemmap_optimized(folio))
-               __clear_hugetlb_destructor(h, folio);
+               __folio_clear_hugetlb(folio);
 
         /*
          * In the case of demote we do not ref count the page as it will soon
@@ -1734,14 +1726,14 @@ static void add_hugetlb_folio(struct hstate *h, struct folio *folio,
                 */
                return;
 
-       arch_clear_hugepage_flags(&folio->page);
+       arch_clear_hugetlb_flags(folio);
        enqueue_hugetlb_folio(h, folio);
 }
 
 static void __update_and_free_hugetlb_folio(struct hstate *h,
                                                struct folio *folio)
 {
-       bool clear_dtor = folio_test_hugetlb_vmemmap_optimized(folio);
+       bool clear_flag = folio_test_hugetlb_vmemmap_optimized(folio);
 
        if (hstate_is_gigantic(h) && !gigantic_page_runtime_supported())
                return;
@@ -1754,11 +1746,11 @@ static void __update_and_free_hugetlb_folio(struct hstate *h,
                return;
 
        /*
-        * If folio is not vmemmap optimized (!clear_dtor), then the folio
+        * If folio is not vmemmap optimized (!clear_flag), then the folio
         * is no longer identified as a hugetlb page.  hugetlb_vmemmap_restore_folio
         * can only be passed hugetlb pages and will BUG otherwise.
         */
-       if (clear_dtor && hugetlb_vmemmap_restore_folio(h, folio)) {
+       if (clear_flag && hugetlb_vmemmap_restore_folio(h, folio)) {
                spin_lock_irq(&hugetlb_lock);
                /*
                 * If we cannot allocate vmemmap pages, just refuse to free the
@@ -1779,11 +1771,11 @@ static void __update_and_free_hugetlb_folio(struct hstate *h,
 
        /*
         * If vmemmap pages were allocated above, then we need to clear the
-        * hugetlb destructor under the hugetlb lock.
+        * hugetlb flag under the hugetlb lock.
         */
        if (folio_test_hugetlb(folio)) {
                spin_lock_irq(&hugetlb_lock);
-               __clear_hugetlb_destructor(h, folio);
+               __folio_clear_hugetlb(folio);
                spin_unlock_irq(&hugetlb_lock);
        }
 
@@ -1796,7 +1788,8 @@ static void __update_and_free_hugetlb_folio(struct hstate *h,
                destroy_compound_gigantic_folio(folio, huge_page_order(h));
                free_gigantic_folio(folio, huge_page_order(h));
        } else {
-               __free_pages(&folio->page, huge_page_order(h));
+               INIT_LIST_HEAD(&folio->_deferred_list);
+               folio_put(folio);
        }
 }
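
An editorial aside, not part of the patch: taken together, the hunks above give the non-gigantic free path roughly the shape sketched below. The wrapper name is invented; every helper called in the body appears in the hunks above, and the error/requeue handling of the real code is elided.

static void free_one_hugetlb_folio_sketch(struct hstate *h, struct folio *folio)
{
        bool optimized = folio_test_hugetlb_vmemmap_optimized(folio);

        spin_lock_irq(&hugetlb_lock);
        /* list removal and accounting elided */
        if (!optimized)
                __folio_clear_hugetlb(folio);   /* tail struct pages exist, safe now */
        spin_unlock_irq(&hugetlb_lock);

        if (optimized) {
                if (hugetlb_vmemmap_restore_folio(h, folio))
                        return;                 /* no vmemmap; folio stays a hugetlb page */
                spin_lock_irq(&hugetlb_lock);
                __folio_clear_hugetlb(folio);   /* vmemmap restored, flag can go */
                spin_unlock_irq(&hugetlb_lock);
        }
        folio_put(folio);                       /* hand the folio back to the buddy allocator */
}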
 
@@ -1884,7 +1877,7 @@ static void bulk_vmemmap_restore_error(struct hstate *h,
                list_for_each_entry_safe(folio, t_folio, non_hvo_folios, lru) {
                        list_del(&folio->lru);
                        spin_lock_irq(&hugetlb_lock);
-                       __clear_hugetlb_destructor(h, folio);
+                       __folio_clear_hugetlb(folio);
                        spin_unlock_irq(&hugetlb_lock);
                        update_and_free_hugetlb_folio(h, folio, false);
                        cond_resched();
@@ -1909,7 +1902,7 @@ static void bulk_vmemmap_restore_error(struct hstate *h,
                        } else {
                                list_del(&folio->lru);
                                spin_lock_irq(&hugetlb_lock);
-                               __clear_hugetlb_destructor(h, folio);
+                               __folio_clear_hugetlb(folio);
                                spin_unlock_irq(&hugetlb_lock);
                                update_and_free_hugetlb_folio(h, folio, false);
                                cond_resched();
@@ -1942,14 +1935,14 @@ retry:
         * should only be pages on the non_hvo_folios list.
         * Do note that the non_hvo_folios list could be empty.
         * Without HVO enabled, ret will be 0 and there is no need to call
-        * __clear_hugetlb_destructor as this was done previously.
+        * __folio_clear_hugetlb as this was done previously.
         */
        VM_WARN_ON(!list_empty(folio_list));
        VM_WARN_ON(ret < 0);
        if (!list_empty(&non_hvo_folios) && ret) {
                spin_lock_irq(&hugetlb_lock);
                list_for_each_entry(folio, &non_hvo_folios, lru)
-                       __clear_hugetlb_destructor(h, folio);
+                       __folio_clear_hugetlb(folio);
                spin_unlock_irq(&hugetlb_lock);
        }
 
@@ -1974,7 +1967,7 @@ void free_huge_folio(struct folio *folio)
 {
        /*
         * Can't pass hstate in here because it is called from the
-        * compound page destructor.
+        * generic mm code.
         */
        struct hstate *h = folio_hstate(folio);
        int nid = folio_nid(folio);
@@ -2031,7 +2024,7 @@ void free_huge_folio(struct folio *folio)
                spin_unlock_irqrestore(&hugetlb_lock, flags);
                update_and_free_hugetlb_folio(h, folio, true);
        } else {
-               arch_clear_hugepage_flags(&folio->page);
+               arch_clear_hugetlb_flags(folio);
                enqueue_hugetlb_folio(h, folio);
                spin_unlock_irqrestore(&hugetlb_lock, flags);
        }
@@ -2124,10 +2117,10 @@ static bool __prep_compound_gigantic_folio(struct folio *folio,
                        set_compound_head(p, &folio->page);
        }
        __folio_set_head(folio);
-       /* we rely on prep_new_hugetlb_folio to set the destructor */
+       /* we rely on prep_new_hugetlb_folio to set the hugetlb flag */
        folio_set_order(folio, order);
        atomic_set(&folio->_entire_mapcount, -1);
-       atomic_set(&folio->_nr_pages_mapped, 0);
+       atomic_set(&folio->_large_mapcount, -1);
        atomic_set(&folio->_pincount, 0);
        return true;
 
@@ -2162,13 +2155,13 @@ static bool prep_compound_gigantic_folio_for_demote(struct folio *folio,
 /*
  * Find and lock address space (mapping) in write mode.
  *
- * Upon entry, the page is locked which means that page_mapping() is
+ * Upon entry, the folio is locked which means that folio_mapping() is
  * stable.  Due to locking order, we can only trylock_write.  If we can
  * not get the lock, simply return NULL to caller.
  */
-struct address_space *hugetlb_page_mapping_lock_write(struct page *hpage)
+struct address_space *hugetlb_folio_mapping_lock_write(struct folio *folio)
 {
-       struct address_space *mapping = page_mapping(hpage);
+       struct address_space *mapping = folio_mapping(folio);
 
        if (!mapping)
                return mapping;
@@ -2184,13 +2177,13 @@ static struct folio *alloc_buddy_hugetlb_folio(struct hstate *h,
                nodemask_t *node_alloc_noretry)
 {
        int order = huge_page_order(h);
-       struct page *page;
+       struct folio *folio;
        bool alloc_try_hard = true;
        bool retry = true;
 
        /*
-        * By default we always try hard to allocate the page with
-        * __GFP_RETRY_MAYFAIL flag.  However, if we are allocating pages in
+        * By default we always try hard to allocate the folio with
+        * __GFP_RETRY_MAYFAIL flag.  However, if we are allocating folios in
         * a loop (to adjust global huge page counts) and previous allocation
         * failed, do not continue to try hard on the same node.  Use the
         * node_alloc_noretry bitmap to manage this state information.
@@ -2203,43 +2196,42 @@ static struct folio *alloc_buddy_hugetlb_folio(struct hstate *h,
        if (nid == NUMA_NO_NODE)
                nid = numa_mem_id();
 retry:
-       page = __alloc_pages(gfp_mask, order, nid, nmask);
+       folio = __folio_alloc(gfp_mask, order, nid, nmask);
 
-       /* Freeze head page */
-       if (page && !page_ref_freeze(page, 1)) {
-               __free_pages(page, order);
+       if (folio && !folio_ref_freeze(folio, 1)) {
+               folio_put(folio);
                if (retry) {    /* retry once */
                        retry = false;
                        goto retry;
                }
                /* WOW!  twice in a row. */
-               pr_warn("HugeTLB head page unexpected inflated ref count\n");
-               page = NULL;
+               pr_warn("HugeTLB unexpected inflated folio ref count\n");
+               folio = NULL;
        }
 
        /*
-        * If we did not specify __GFP_RETRY_MAYFAIL, but still got a page this
-        * indicates an overall state change.  Clear bit so that we resume
-        * normal 'try hard' allocations.
+        * If we did not specify __GFP_RETRY_MAYFAIL, but still got a
+        * folio this indicates an overall state change.  Clear bit so
+        * that we resume normal 'try hard' allocations.
         */
-       if (node_alloc_noretry && page && !alloc_try_hard)
+       if (node_alloc_noretry && folio && !alloc_try_hard)
                node_clear(nid, *node_alloc_noretry);
 
        /*
-        * If we tried hard to get a page but failed, set bit so that
+        * If we tried hard to get a folio but failed, set bit so that
         * subsequent attempts will not try as hard until there is an
         * overall state change.
         */
-       if (node_alloc_noretry && !page && alloc_try_hard)
+       if (node_alloc_noretry && !folio && alloc_try_hard)
                node_set(nid, *node_alloc_noretry);
 
-       if (!page) {
+       if (!folio) {
                __count_vm_event(HTLB_BUDDY_PGALLOC_FAIL);
                return NULL;
        }
 
        __count_vm_event(HTLB_BUDDY_PGALLOC);
-       return page_folio(page);
+       return folio;
 }
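
The comments above describe the per-node "try hard" policy; condensed into one helper it looks roughly like this (editorial sketch, the helper name is invented, the bitmap and flag are the ones used by alloc_buddy_hugetlb_folio()):

static gfp_t hugetlb_alloc_gfp_sketch(gfp_t gfp_mask, int nid,
                                      nodemask_t *node_alloc_noretry)
{
        bool alloc_try_hard = true;

        /* A recent hard failure on this node: do not retry hard again. */
        if (node_alloc_noretry && node_isset(nid, *node_alloc_noretry))
                alloc_try_hard = false;

        if (alloc_try_hard)
                gfp_mask |= __GFP_RETRY_MAYFAIL;

        return gfp_mask;
}

After the allocation attempt, the hunk above clears the bit on an unexpected success and sets it on a hard failure, so the per-node state self-corrects as memory pressure changes.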
 
 static struct folio *__alloc_fresh_hugetlb_folio(struct hstate *h,
@@ -2385,8 +2377,8 @@ static struct folio *remove_pool_hugetlb_folio(struct hstate *h,
 }
 
 /*
- * Dissolve a given free hugepage into free buddy pages. This function does
- * nothing for in-use hugepages and non-hugepages.
+ * Dissolve a given free hugetlb folio into free buddy pages. This function
+ * does nothing for in-use hugetlb folios and non-hugetlb folios.
  * This function returns values like below:
  *
  *  -ENOMEM: failed to allocate vmemmap pages to free the freed hugepages
@@ -2398,10 +2390,9 @@ static struct folio *remove_pool_hugetlb_folio(struct hstate *h,
  *       0:  successfully dissolved free hugepages or the page is not a
  *           hugepage (considered as already dissolved)
  */
-int dissolve_free_huge_page(struct page *page)
+int dissolve_free_hugetlb_folio(struct folio *folio)
 {
        int rc = -EBUSY;
-       struct folio *folio = page_folio(page);
 
 retry:
        /* Not to disrupt normal path by vainly holding hugetlb_lock */
@@ -2478,13 +2469,13 @@ out:
  * make specified memory blocks removable from the system.
  * Note that this will dissolve a free gigantic hugepage completely, if any
  * part of it lies within the given range.
- * Also note that if dissolve_free_huge_page() returns with an error, all
- * free hugepages that were dissolved before that error are lost.
+ * Also note that if dissolve_free_hugetlb_folio() returns with an error, all
+ * free hugetlb folios that were dissolved before that error are lost.
  */
-int dissolve_free_huge_pages(unsigned long start_pfn, unsigned long end_pfn)
+int dissolve_free_hugetlb_folios(unsigned long start_pfn, unsigned long end_pfn)
 {
        unsigned long pfn;
-       struct page *page;
+       struct folio *folio;
        int rc = 0;
        unsigned int order;
        struct hstate *h;
@@ -2497,8 +2488,8 @@ int dissolve_free_huge_pages(unsigned long start_pfn, unsigned long end_pfn)
                order = min(order, huge_page_order(h));
 
        for (pfn = start_pfn; pfn < end_pfn; pfn += 1 << order) {
-               page = pfn_to_page(pfn);
-               rc = dissolve_free_huge_page(page);
+               folio = pfn_folio(pfn);
+               rc = dissolve_free_hugetlb_folio(folio);
                if (rc)
                        break;
        }
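
A hedged sketch of a memory-offlining style caller, to illustrate the return-value contract documented above; the function name is illustrative and the real offlining path does considerably more.

static int offline_range_sketch(unsigned long start_pfn, unsigned long end_pfn)
{
        int rc;

        /* Free hugetlb folios overlapping the range are dissolved into buddy pages. */
        rc = dissolve_free_hugetlb_folios(start_pfn, end_pfn);
        if (rc) {
                /*
                 * -ENOMEM or -EBUSY; note there is no rollback, folios
                 * dissolved before the failure stay dissolved.
                 */
                return rc;
        }
        return 0;       /* no free hugetlb folios remain in the range */
}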
@@ -2605,7 +2596,7 @@ struct folio *alloc_buddy_hugetlb_folio_with_mpol(struct hstate *h,
 
 /* folio migration callback function */
 struct folio *alloc_hugetlb_folio_nodemask(struct hstate *h, int preferred_nid,
-               nodemask_t *nmask, gfp_t gfp_mask)
+               nodemask_t *nmask, gfp_t gfp_mask, bool allow_alloc_fallback)
 {
        spin_lock_irq(&hugetlb_lock);
        if (available_huge_pages(h)) {
@@ -2620,6 +2611,10 @@ struct folio *alloc_hugetlb_folio_nodemask(struct hstate *h, int preferred_nid,
        }
        spin_unlock_irq(&hugetlb_lock);
 
+       /* We cannot fallback to other nodes, as we could break the per-node pool. */
+       if (!allow_alloc_fallback)
+               gfp_mask |= __GFP_THISNODE;
+
        return alloc_migrate_hugetlb_folio(h, gfp_mask, preferred_nid, nmask);
 }
 
@@ -5032,7 +5027,6 @@ static struct ctl_table hugetlb_table[] = {
                .mode           = 0644,
                .proc_handler   = hugetlb_overcommit_handler,
        },
-       { }
 };
 
 static void hugetlb_sysctl_init(void)
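
The dropped { } terminator works because the table is registered by explicit size rather than by sentinel; assuming hugetlb_sysctl_init() (whose signature is the context line above, body not shown in this diff) uses the usual registration helper, it amounts to:

        register_sysctl_init("vm", hugetlb_table);

register_sysctl_init() passes ARRAY_SIZE(hugetlb_table) down to the registration core, so no empty terminator entry is needed.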
@@ -5923,19 +5917,18 @@ static void unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma,
  * cannot race with other handlers or page migration.
  * Keep the pte_same checks anyway to make transition from the mutex easier.
  */
-static vm_fault_t hugetlb_wp(struct mm_struct *mm, struct vm_area_struct *vma,
-                      unsigned long address, pte_t *ptep, unsigned int flags,
-                      struct folio *pagecache_folio, spinlock_t *ptl,
+static vm_fault_t hugetlb_wp(struct folio *pagecache_folio,
                       struct vm_fault *vmf)
 {
-       const bool unshare = flags & FAULT_FLAG_UNSHARE;
-       pte_t pte = huge_ptep_get(ptep);
+       struct vm_area_struct *vma = vmf->vma;
+       struct mm_struct *mm = vma->vm_mm;
+       const bool unshare = vmf->flags & FAULT_FLAG_UNSHARE;
+       pte_t pte = huge_ptep_get(vmf->pte);
        struct hstate *h = hstate_vma(vma);
        struct folio *old_folio;
        struct folio *new_folio;
        int outside_reserve = 0;
        vm_fault_t ret = 0;
-       unsigned long haddr = address & huge_page_mask(h);
        struct mmu_notifier_range range;
 
        /*
@@ -5958,7 +5951,7 @@ static vm_fault_t hugetlb_wp(struct mm_struct *mm, struct vm_area_struct *vma,
 
        /* Let's take out MAP_SHARED mappings first. */
        if (vma->vm_flags & VM_MAYSHARE) {
-               set_huge_ptep_writable(vma, haddr, ptep);
+               set_huge_ptep_writable(vma, vmf->address, vmf->pte);
                return 0;
        }
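
For readers following the conversion of hugetlb_wp(), hugetlb_no_page() and hugetlb_fault() to struct vm_fault, the old locals and parameters map onto vmf fields as below (editorial summary, assembled from the surrounding hunks):

/*
 *   mm              -> vmf->vma->vm_mm
 *   vma             -> vmf->vma
 *   address         -> vmf->real_address   (original faulting address)
 *   haddr           -> vmf->address        (huge-page-aligned address)
 *   ptep            -> vmf->pte
 *   ptl             -> vmf->ptl
 *   entry / old_pte -> vmf->orig_pte
 *   idx             -> vmf->pgoff
 *   flags           -> vmf->flags
 */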
 
@@ -5970,6 +5963,13 @@ retry_avoidcopy:
        /*
         * If no-one else is actually using this page, we're the exclusive
         * owner and can reuse this page.
+        *
+        * Note that we don't rely on the (safer) folio refcount here, because
+        * copying the hugetlb folio when there are unexpected (temporary)
+        * folio references could harm simple fork()+exit() users when
+        * we run out of free hugetlb folios: we would have to kill processes
+        * in scenarios that used to work. As a side effect, there can still
+        * be leaks between processes, for example, with FOLL_GET users.
         */
        if (folio_mapcount(old_folio) == 1 && folio_test_anon(old_folio)) {
                if (!PageAnonExclusive(&old_folio->page)) {
@@ -5977,7 +5977,7 @@ retry_avoidcopy:
                        SetPageAnonExclusive(&old_folio->page);
                }
                if (likely(!unshare))
-                       set_huge_ptep_writable(vma, haddr, ptep);
+                       set_huge_ptep_writable(vma, vmf->address, vmf->pte);
 
                delayacct_wpcopy_end();
                return 0;
@@ -6004,8 +6004,8 @@ retry_avoidcopy:
         * Drop page table lock as buddy allocator may be called. It will
         * be acquired again before returning to the caller, as expected.
         */
-       spin_unlock(ptl);
-       new_folio = alloc_hugetlb_folio(vma, haddr, outside_reserve);
+       spin_unlock(vmf->ptl);
+       new_folio = alloc_hugetlb_folio(vma, vmf->address, outside_reserve);
 
        if (IS_ERR(new_folio)) {
                /*
@@ -6030,19 +6030,21 @@ retry_avoidcopy:
                         *
                         * Reacquire both after unmap operation.
                         */
-                       idx = vma_hugecache_offset(h, vma, haddr);
+                       idx = vma_hugecache_offset(h, vma, vmf->address);
                        hash = hugetlb_fault_mutex_hash(mapping, idx);
                        hugetlb_vma_unlock_read(vma);
                        mutex_unlock(&hugetlb_fault_mutex_table[hash]);
 
-                       unmap_ref_private(mm, vma, &old_folio->page, haddr);
+                       unmap_ref_private(mm, vma, &old_folio->page,
+                                       vmf->address);
 
                        mutex_lock(&hugetlb_fault_mutex_table[hash]);
                        hugetlb_vma_lock_read(vma);
-                       spin_lock(ptl);
-                       ptep = hugetlb_walk(vma, haddr, huge_page_size(h));
-                       if (likely(ptep &&
-                                  pte_same(huge_ptep_get(ptep), pte)))
+                       spin_lock(vmf->ptl);
+                       vmf->pte = hugetlb_walk(vma, vmf->address,
+                                       huge_page_size(h));
+                       if (likely(vmf->pte &&
+                                  pte_same(huge_ptep_get(vmf->pte), pte)))
                                goto retry_avoidcopy;
                        /*
                         * race occurs while re-acquiring page table
@@ -6064,37 +6066,38 @@ retry_avoidcopy:
        if (unlikely(ret))
                goto out_release_all;
 
-       if (copy_user_large_folio(new_folio, old_folio, address, vma)) {
-               ret = VM_FAULT_HWPOISON_LARGE;
+       if (copy_user_large_folio(new_folio, old_folio, vmf->real_address, vma)) {
+               ret = VM_FAULT_HWPOISON_LARGE | VM_FAULT_SET_HINDEX(hstate_index(h));
                goto out_release_all;
        }
        __folio_mark_uptodate(new_folio);
 
-       mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, haddr,
-                               haddr + huge_page_size(h));
+       mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, vmf->address,
+                               vmf->address + huge_page_size(h));
        mmu_notifier_invalidate_range_start(&range);
 
        /*
         * Retake the page table lock to check for racing updates
         * before the page tables are altered
         */
-       spin_lock(ptl);
-       ptep = hugetlb_walk(vma, haddr, huge_page_size(h));
-       if (likely(ptep && pte_same(huge_ptep_get(ptep), pte))) {
+       spin_lock(vmf->ptl);
+       vmf->pte = hugetlb_walk(vma, vmf->address, huge_page_size(h));
+       if (likely(vmf->pte && pte_same(huge_ptep_get(vmf->pte), pte))) {
                pte_t newpte = make_huge_pte(vma, &new_folio->page, !unshare);
 
                /* Break COW or unshare */
-               huge_ptep_clear_flush(vma, haddr, ptep);
+               huge_ptep_clear_flush(vma, vmf->address, vmf->pte);
                hugetlb_remove_rmap(old_folio);
-               hugetlb_add_new_anon_rmap(new_folio, vma, haddr);
+               hugetlb_add_new_anon_rmap(new_folio, vma, vmf->address);
                if (huge_pte_uffd_wp(pte))
                        newpte = huge_pte_mkuffd_wp(newpte);
-               set_huge_pte_at(mm, haddr, ptep, newpte, huge_page_size(h));
+               set_huge_pte_at(mm, vmf->address, vmf->pte, newpte,
+                               huge_page_size(h));
                folio_set_hugetlb_migratable(new_folio);
                /* Make the old page be freed below */
                new_folio = old_folio;
        }
-       spin_unlock(ptl);
+       spin_unlock(vmf->ptl);
        mmu_notifier_invalidate_range_end(&range);
 out_release_all:
        /*
@@ -6102,12 +6105,12 @@ out_release_all:
         * unshare)
         */
        if (new_folio != old_folio)
-               restore_reserve_on_error(h, vma, haddr, new_folio);
+               restore_reserve_on_error(h, vma, vmf->address, new_folio);
        folio_put(new_folio);
 out_release_old:
        folio_put(old_folio);
 
-       spin_lock(ptl); /* Caller expects lock to be held */
+       spin_lock(vmf->ptl); /* Caller expects lock to be held */
 
        delayacct_wpcopy_end();
        return ret;
@@ -6116,8 +6119,8 @@ out_release_old:
 /*
  * Return whether there is a pagecache page to back given address within VMA.
  */
-static bool hugetlbfs_pagecache_present(struct hstate *h,
-                       struct vm_area_struct *vma, unsigned long address)
+bool hugetlbfs_pagecache_present(struct hstate *h,
+                                struct vm_area_struct *vma, unsigned long address)
 {
        struct address_space *mapping = vma->vm_file->f_mapping;
        pgoff_t idx = linear_page_index(vma, address);
@@ -6193,23 +6196,19 @@ static bool hugetlb_pte_stable(struct hstate *h, struct mm_struct *mm,
        return same;
 }
 
-static vm_fault_t hugetlb_no_page(struct mm_struct *mm,
-                       struct vm_area_struct *vma,
-                       struct address_space *mapping, pgoff_t idx,
-                       unsigned long address, pte_t *ptep,
-                       pte_t old_pte, unsigned int flags,
+static vm_fault_t hugetlb_no_page(struct address_space *mapping,
                        struct vm_fault *vmf)
 {
+       struct vm_area_struct *vma = vmf->vma;
+       struct mm_struct *mm = vma->vm_mm;
        struct hstate *h = hstate_vma(vma);
        vm_fault_t ret = VM_FAULT_SIGBUS;
        int anon_rmap = 0;
        unsigned long size;
        struct folio *folio;
        pte_t new_pte;
-       spinlock_t *ptl;
-       unsigned long haddr = address & huge_page_mask(h);
        bool new_folio, new_pagecache_folio = false;
-       u32 hash = hugetlb_fault_mutex_hash(mapping, idx);
+       u32 hash = hugetlb_fault_mutex_hash(mapping, vmf->pgoff);
 
        /*
         * Currently, we are forced to kill the process in the event the
@@ -6228,10 +6227,10 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm,
         * before we get page_table_lock.
         */
        new_folio = false;
-       folio = filemap_lock_hugetlb_folio(h, mapping, idx);
+       folio = filemap_lock_hugetlb_folio(h, mapping, vmf->pgoff);
        if (IS_ERR(folio)) {
                size = i_size_read(mapping->host) >> huge_page_shift(h);
-               if (idx >= size)
+               if (vmf->pgoff >= size)
                        goto out;
                /* Check for page in userfault range */
                if (userfaultfd_missing(vma)) {
@@ -6252,7 +6251,7 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm,
                         * never happen on the page after UFFDIO_COPY has
                         * correctly installed the page and returned.
                         */
-                       if (!hugetlb_pte_stable(h, mm, ptep, old_pte)) {
+                       if (!hugetlb_pte_stable(h, mm, vmf->pte, vmf->orig_pte)) {
                                ret = 0;
                                goto out;
                        }
@@ -6267,7 +6266,7 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm,
                                goto out;
                }
 
-               folio = alloc_hugetlb_folio(vma, haddr, 0);
+               folio = alloc_hugetlb_folio(vma, vmf->address, 0);
                if (IS_ERR(folio)) {
                        /*
                         * Returning error will result in faulting task being
@@ -6281,18 +6280,20 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm,
                         * here.  Before returning error, get ptl and make
                         * sure there really is no pte entry.
                         */
-                       if (hugetlb_pte_stable(h, mm, ptep, old_pte))
+                       if (hugetlb_pte_stable(h, mm, vmf->pte, vmf->orig_pte))
                                ret = vmf_error(PTR_ERR(folio));
                        else
                                ret = 0;
                        goto out;
                }
-               clear_huge_page(&folio->page, address, pages_per_huge_page(h));
+               clear_huge_page(&folio->page, vmf->real_address,
+                               pages_per_huge_page(h));
                __folio_mark_uptodate(folio);
                new_folio = true;
 
                if (vma->vm_flags & VM_MAYSHARE) {
-                       int err = hugetlb_add_to_page_cache(folio, mapping, idx);
+                       int err = hugetlb_add_to_page_cache(folio, mapping,
+                                                       vmf->pgoff);
                        if (err) {
                                /*
                                 * err can't be -EEXIST which implies someone
@@ -6301,7 +6302,8 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm,
                                 * to the page cache. So it's safe to call
                                 * restore_reserve_on_error() here.
                                 */
-                               restore_reserve_on_error(h, vma, haddr, folio);
+                               restore_reserve_on_error(h, vma, vmf->address,
+                                                       folio);
                                folio_put(folio);
                                ret = VM_FAULT_SIGBUS;
                                goto out;
@@ -6328,7 +6330,7 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm,
                        folio_unlock(folio);
                        folio_put(folio);
                        /* See comment in userfaultfd_missing() block above */
-                       if (!hugetlb_pte_stable(h, mm, ptep, old_pte)) {
+                       if (!hugetlb_pte_stable(h, mm, vmf->pte, vmf->orig_pte)) {
                                ret = 0;
                                goto out;
                        }
@@ -6343,23 +6345,23 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm,
         * any allocations necessary to record that reservation occur outside
         * the spinlock.
         */
-       if ((flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) {
-               if (vma_needs_reservation(h, vma, haddr) < 0) {
+       if ((vmf->flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) {
+               if (vma_needs_reservation(h, vma, vmf->address) < 0) {
                        ret = VM_FAULT_OOM;
                        goto backout_unlocked;
                }
                /* Just decrements count, does not deallocate */
-               vma_end_reservation(h, vma, haddr);
+               vma_end_reservation(h, vma, vmf->address);
        }
 
-       ptl = huge_pte_lock(h, mm, ptep);
+       vmf->ptl = huge_pte_lock(h, mm, vmf->pte);
        ret = 0;
        /* If pte changed from under us, retry */
-       if (!pte_same(huge_ptep_get(ptep), old_pte))
+       if (!pte_same(huge_ptep_get(vmf->pte), vmf->orig_pte))
                goto backout;
 
        if (anon_rmap)
-               hugetlb_add_new_anon_rmap(folio, vma, haddr);
+               hugetlb_add_new_anon_rmap(folio, vma, vmf->address);
        else
                hugetlb_add_file_rmap(folio);
        new_pte = make_huge_pte(vma, &folio->page, ((vma->vm_flags & VM_WRITE)
@@ -6368,17 +6370,17 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm,
         * If this pte was previously wr-protected, keep it wr-protected even
         * if populated.
         */
-       if (unlikely(pte_marker_uffd_wp(old_pte)))
+       if (unlikely(pte_marker_uffd_wp(vmf->orig_pte)))
                new_pte = huge_pte_mkuffd_wp(new_pte);
-       set_huge_pte_at(mm, haddr, ptep, new_pte, huge_page_size(h));
+       set_huge_pte_at(mm, vmf->address, vmf->pte, new_pte, huge_page_size(h));
 
        hugetlb_count_add(pages_per_huge_page(h), mm);
-       if ((flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) {
+       if ((vmf->flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) {
                /* Optimization, do the COW without a second fault */
-               ret = hugetlb_wp(mm, vma, address, ptep, flags, folio, ptl, vmf);
+               ret = hugetlb_wp(folio, vmf);
        }
 
-       spin_unlock(ptl);
+       spin_unlock(vmf->ptl);
 
        /*
         * Only set hugetlb_migratable in newly allocated pages.  Existing pages
@@ -6395,10 +6397,10 @@ out:
        return ret;
 
 backout:
-       spin_unlock(ptl);
+       spin_unlock(vmf->ptl);
 backout_unlocked:
        if (new_folio && !new_pagecache_folio)
-               restore_reserve_on_error(h, vma, haddr, folio);
+               restore_reserve_on_error(h, vma, vmf->address, folio);
 
        folio_unlock(folio);
        folio_put(folio);
@@ -6432,8 +6434,6 @@ u32 hugetlb_fault_mutex_hash(struct address_space *mapping, pgoff_t idx)
 vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
                        unsigned long address, unsigned int flags)
 {
-       pte_t *ptep, entry;
-       spinlock_t *ptl;
        vm_fault_t ret;
        u32 hash;
        struct folio *folio = NULL;
@@ -6441,13 +6441,13 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
        struct hstate *h = hstate_vma(vma);
        struct address_space *mapping;
        int need_wait_lock = 0;
-       unsigned long haddr = address & huge_page_mask(h);
        struct vm_fault vmf = {
                .vma = vma,
-               .address = haddr,
+               .address = address & huge_page_mask(h),
                .real_address = address,
                .flags = flags,
-               .pgoff = vma_hugecache_offset(h, vma, haddr),
+               .pgoff = vma_hugecache_offset(h, vma,
+                               address & huge_page_mask(h)),
                /* TODO: Track hugetlb faults using vm_fault */
 
                /*
@@ -6467,25 +6467,26 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 
        /*
         * Acquire vma lock before calling huge_pte_alloc and hold
-        * until finished with ptep.  This prevents huge_pmd_unshare from
-        * being called elsewhere and making the ptep no longer valid.
+        * until finished with vmf.pte.  This prevents huge_pmd_unshare from
+        * being called elsewhere and making the vmf.pte no longer valid.
         */
        hugetlb_vma_lock_read(vma);
-       ptep = huge_pte_alloc(mm, vma, haddr, huge_page_size(h));
-       if (!ptep) {
+       vmf.pte = huge_pte_alloc(mm, vma, vmf.address, huge_page_size(h));
+       if (!vmf.pte) {
                hugetlb_vma_unlock_read(vma);
                mutex_unlock(&hugetlb_fault_mutex_table[hash]);
                return VM_FAULT_OOM;
        }
 
-       entry = huge_ptep_get(ptep);
-       if (huge_pte_none_mostly(entry)) {
-               if (is_pte_marker(entry)) {
+       vmf.orig_pte = huge_ptep_get(vmf.pte);
+       if (huge_pte_none_mostly(vmf.orig_pte)) {
+               if (is_pte_marker(vmf.orig_pte)) {
                        pte_marker marker =
-                               pte_marker_get(pte_to_swp_entry(entry));
+                               pte_marker_get(pte_to_swp_entry(vmf.orig_pte));
 
                        if (marker & PTE_MARKER_POISONED) {
-                               ret = VM_FAULT_HWPOISON_LARGE;
+                               ret = VM_FAULT_HWPOISON_LARGE |
+                                     VM_FAULT_SET_HINDEX(hstate_index(h));
                                goto out_mutex;
                        }
                }
@@ -6496,21 +6497,20 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
                 * hugetlb_no_page will drop vma lock and hugetlb fault
                 * mutex internally, which make us return immediately.
                 */
-               return hugetlb_no_page(mm, vma, mapping, vmf.pgoff, address,
-                                       ptep, entry, flags, &vmf);
+               return hugetlb_no_page(mapping, &vmf);
        }
 
        ret = 0;
 
        /*
-        * entry could be a migration/hwpoison entry at this point, so this
-        * check prevents the kernel from going below assuming that we have
-        * an active hugepage in pagecache. This goto expects the 2nd page
-        * fault, and is_hugetlb_entry_(migration|hwpoisoned) check will
-        * properly handle it.
+        * vmf.orig_pte could be a migration/hwpoison entry at this
+        * point, so this check prevents the kernel from going below assuming
+        * that we have an active hugepage in pagecache. This goto expects
+        * the 2nd page fault, and is_hugetlb_entry_(migration|hwpoisoned)
+        * check will properly handle it.
         */
-       if (!pte_present(entry)) {
-               if (unlikely(is_hugetlb_entry_migration(entry))) {
+       if (!pte_present(vmf.orig_pte)) {
+               if (unlikely(is_hugetlb_entry_migration(vmf.orig_pte))) {
                        /*
                         * Release the hugetlb fault lock now, but retain
                         * the vma lock, because it is needed to guard the
@@ -6519,9 +6519,9 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
                         * be released there.
                         */
                        mutex_unlock(&hugetlb_fault_mutex_table[hash]);
-                       migration_entry_wait_huge(vma, ptep);
+                       migration_entry_wait_huge(vma, vmf.pte);
                        return 0;
-               } else if (unlikely(is_hugetlb_entry_hwpoisoned(entry)))
+               } else if (unlikely(is_hugetlb_entry_hwpoisoned(vmf.orig_pte)))
                        ret = VM_FAULT_HWPOISON_LARGE |
                            VM_FAULT_SET_HINDEX(hstate_index(h));
                goto out_mutex;
@@ -6535,13 +6535,13 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
         * determine if a reservation has been consumed.
         */
        if ((flags & (FAULT_FLAG_WRITE|FAULT_FLAG_UNSHARE)) &&
-           !(vma->vm_flags & VM_MAYSHARE) && !huge_pte_write(entry)) {
-               if (vma_needs_reservation(h, vma, haddr) < 0) {
+           !(vma->vm_flags & VM_MAYSHARE) && !huge_pte_write(vmf.orig_pte)) {
+               if (vma_needs_reservation(h, vma, vmf.address) < 0) {
                        ret = VM_FAULT_OOM;
                        goto out_mutex;
                }
                /* Just decrements count, does not deallocate */
-               vma_end_reservation(h, vma, haddr);
+               vma_end_reservation(h, vma, vmf.address);
 
                pagecache_folio = filemap_lock_hugetlb_folio(h, mapping,
                                                             vmf.pgoff);
@@ -6549,17 +6549,17 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
                        pagecache_folio = NULL;
        }
 
-       ptl = huge_pte_lock(h, mm, ptep);
+       vmf.ptl = huge_pte_lock(h, mm, vmf.pte);
 
        /* Check for a racing update before calling hugetlb_wp() */
-       if (unlikely(!pte_same(entry, huge_ptep_get(ptep))))
+       if (unlikely(!pte_same(vmf.orig_pte, huge_ptep_get(vmf.pte))))
                goto out_ptl;
 
        /* Handle userfault-wp first, before trying to lock more pages */
-       if (userfaultfd_wp(vma) && huge_pte_uffd_wp(huge_ptep_get(ptep)) &&
-           (flags & FAULT_FLAG_WRITE) && !huge_pte_write(entry)) {
+       if (userfaultfd_wp(vma) && huge_pte_uffd_wp(huge_ptep_get(vmf.pte)) &&
+           (flags & FAULT_FLAG_WRITE) && !huge_pte_write(vmf.orig_pte)) {
                if (!userfaultfd_wp_async(vma)) {
-                       spin_unlock(ptl);
+                       spin_unlock(vmf.ptl);
                        if (pagecache_folio) {
                                folio_unlock(pagecache_folio);
                                folio_put(pagecache_folio);
@@ -6569,18 +6569,18 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
                        return handle_userfault(&vmf, VM_UFFD_WP);
                }
 
-               entry = huge_pte_clear_uffd_wp(entry);
-               set_huge_pte_at(mm, haddr, ptep, entry,
+               vmf.orig_pte = huge_pte_clear_uffd_wp(vmf.orig_pte);
+               set_huge_pte_at(mm, vmf.address, vmf.pte, vmf.orig_pte,
                                huge_page_size(hstate_vma(vma)));
                /* Fallthrough to CoW */
        }
 
        /*
-        * hugetlb_wp() requires page locks of pte_page(entry) and
+        * hugetlb_wp() requires page locks of pte_page(vmf.orig_pte) and
         * pagecache_folio, so here we need to take the former one
         * when folio != pagecache_folio or !pagecache_folio.
         */
-       folio = page_folio(pte_page(entry));
+       folio = page_folio(pte_page(vmf.orig_pte));
        if (folio != pagecache_folio)
                if (!folio_trylock(folio)) {
                        need_wait_lock = 1;
@@ -6590,24 +6590,23 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
        folio_get(folio);
 
        if (flags & (FAULT_FLAG_WRITE|FAULT_FLAG_UNSHARE)) {
-               if (!huge_pte_write(entry)) {
-                       ret = hugetlb_wp(mm, vma, address, ptep, flags,
-                                        pagecache_folio, ptl, &vmf);
+               if (!huge_pte_write(vmf.orig_pte)) {
+                       ret = hugetlb_wp(pagecache_folio, &vmf);
                        goto out_put_page;
                } else if (likely(flags & FAULT_FLAG_WRITE)) {
-                       entry = huge_pte_mkdirty(entry);
+                       vmf.orig_pte = huge_pte_mkdirty(vmf.orig_pte);
                }
        }
-       entry = pte_mkyoung(entry);
-       if (huge_ptep_set_access_flags(vma, haddr, ptep, entry,
+       vmf.orig_pte = pte_mkyoung(vmf.orig_pte);
+       if (huge_ptep_set_access_flags(vma, vmf.address, vmf.pte, vmf.orig_pte,
                                                flags & FAULT_FLAG_WRITE))
-               update_mmu_cache(vma, haddr, ptep);
+               update_mmu_cache(vma, vmf.address, vmf.pte);
 out_put_page:
        if (folio != pagecache_folio)
                folio_unlock(folio);
        folio_put(folio);
 out_ptl:
-       spin_unlock(ptl);
+       spin_unlock(vmf.ptl);
 
        if (pagecache_folio) {
                folio_unlock(pagecache_folio);
@@ -6643,7 +6642,13 @@ static struct folio *alloc_hugetlb_folio_vma(struct hstate *h,
 
        gfp_mask = htlb_alloc_mask(h);
        node = huge_node(vma, address, gfp_mask, &mpol, &nodemask);
-       folio = alloc_hugetlb_folio_nodemask(h, node, nodemask, gfp_mask);
+       /*
+        * This is used to allocate a temporary hugetlb to hold the copied
+        * content, which will then be copied again to the final hugetlb
+        * consuming a reservation. Set the alloc_fallback to false to indicate
+        * that breaking the per-node hugetlb pool is not allowed in this case.
+        */
+       folio = alloc_hugetlb_folio_nodemask(h, node, nodemask, gfp_mask, false);
        mpol_cond_put(mpol);
 
        return folio;
@@ -6873,77 +6878,6 @@ out_release_nounlock:
 }
 #endif /* CONFIG_USERFAULTFD */
 
-struct page *hugetlb_follow_page_mask(struct vm_area_struct *vma,
-                                     unsigned long address, unsigned int flags,
-                                     unsigned int *page_mask)
-{
-       struct hstate *h = hstate_vma(vma);
-       struct mm_struct *mm = vma->vm_mm;
-       unsigned long haddr = address & huge_page_mask(h);
-       struct page *page = NULL;
-       spinlock_t *ptl;
-       pte_t *pte, entry;
-       int ret;
-
-       hugetlb_vma_lock_read(vma);
-       pte = hugetlb_walk(vma, haddr, huge_page_size(h));
-       if (!pte)
-               goto out_unlock;
-
-       ptl = huge_pte_lock(h, mm, pte);
-       entry = huge_ptep_get(pte);
-       if (pte_present(entry)) {
-               page = pte_page(entry);
-
-               if (!huge_pte_write(entry)) {
-                       if (flags & FOLL_WRITE) {
-                               page = NULL;
-                               goto out;
-                       }
-
-                       if (gup_must_unshare(vma, flags, page)) {
-                               /* Tell the caller to do unsharing */
-                               page = ERR_PTR(-EMLINK);
-                               goto out;
-                       }
-               }
-
-               page = nth_page(page, ((address & ~huge_page_mask(h)) >> PAGE_SHIFT));
-
-               /*
-                * Note that page may be a sub-page, and with vmemmap
-                * optimizations the page struct may be read only.
-                * try_grab_page() will increase the ref count on the
-                * head page, so this will be OK.
-                *
-                * try_grab_page() should always be able to get the page here,
-                * because we hold the ptl lock and have verified pte_present().
-                */
-               ret = try_grab_page(page, flags);
-
-               if (WARN_ON_ONCE(ret)) {
-                       page = ERR_PTR(ret);
-                       goto out;
-               }
-
-               *page_mask = (1U << huge_page_order(h)) - 1;
-       }
-out:
-       spin_unlock(ptl);
-out_unlock:
-       hugetlb_vma_unlock_read(vma);
-
-       /*
-        * Fixup retval for dump requests: if pagecache doesn't exist,
-        * don't try to allocate a new page but just skip it.
-        */
-       if (!page && (flags & FOLL_DUMP) &&
-           !hugetlbfs_pagecache_present(h, vma, address))
-               page = ERR_PTR(-EFAULT);
-
-       return page;
-}
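
An editorial connection: removing hugetlb_follow_page_mask() is also why hugetlbfs_pagecache_present() loses its static qualifier earlier in this diff. Its FOLL_DUMP fixup, condensed below from the removed body, presumably has to be performed by the generic follow-page/GUP code outside this file from now on.

        /* Condensed from the removed function: dump requests for a hole with
         * no pagecache backing are skipped instead of being faulted in. */
        if (!page && (flags & FOLL_DUMP) &&
            !hugetlbfs_pagecache_present(h, vma, address))
                page = ERR_PTR(-EFAULT);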
-
 long hugetlb_change_protection(struct vm_area_struct *vma,
                unsigned long address, unsigned long end,
                pgprot_t newprot, unsigned long cp_flags)
@@ -7867,9 +7801,9 @@ void __init hugetlb_cma_reserve(int order)
                 * huge page demotion.
                 */
                res = cma_declare_contiguous_nid(0, size, 0,
-                                               PAGE_SIZE << HUGETLB_PAGE_ORDER,
-                                                0, false, name,
-                                                &hugetlb_cma[nid], nid);
+                                       PAGE_SIZE << order,
+                                       HUGETLB_PAGE_ORDER, false, name,
+                                       &hugetlb_cma[nid], nid);
                if (res) {
                        pr_warn("hugetlb_cma: reservation failed: err %d, node %d",
                                res, nid);
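
A worked example of the new alignment arithmetic (editorial; x86-64 values assumed):

/*
 *   order         = PUD_SHIFT - PAGE_SHIFT = 18   (1 GiB gigantic pages)
 *   alignment     = PAGE_SIZE << order     = 1 GiB
 *   order_per_bit = HUGETLB_PAGE_ORDER     = 9    (one CMA bitmap bit per 2 MiB)
 *
 * The reserved area stays aligned for gigantic pages, while the bitmap
 * granularity matches the PMD-sized pages produced by huge page demotion,
 * as the comment above intends.
 */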