mm/hugetlb: fix races when looking up a CONT-PTE/PMD size hugetlb page
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index a18c071c294e35c33823edf0f21dd3094afd031b..9564bf817e6a8d92999d2653304abf406a3f96d2 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -66,12 +66,6 @@ static bool hugetlb_cma_page(struct page *page, unsigned int order)
 #endif
 static unsigned long hugetlb_cma_size __initdata;
 
-/*
- * Minimum page order among possible hugepage sizes, set to a proper value
- * at boot time.
- */
-static unsigned int minimum_order __read_mostly = UINT_MAX;
-
 __initdata LIST_HEAD(huge_boot_pages);
 
 /* for command line parsing */
@@ -1135,7 +1129,7 @@ static struct page *dequeue_huge_page_node_exact(struct hstate *h, int nid)
 
        lockdep_assert_held(&hugetlb_lock);
        list_for_each_entry(page, &h->hugepage_freelists[nid], lru) {
-               if (pin && !is_pinnable_page(page))
+               if (pin && !is_longterm_pinnable_page(page))
                        continue;
 
                if (PageHWPoison(page))
@@ -1541,7 +1535,14 @@ static void __update_and_free_page(struct hstate *h, struct page *page)
        if (hstate_is_gigantic(h) && !gigantic_page_runtime_supported())
                return;
 
-       if (hugetlb_vmemmap_alloc(h, page)) {
+       /*
+        * If we don't know which subpages are hwpoisoned, we can't free
+        * the hugepage, so it's leaked intentionally.
+        */
+       if (HPageRawHwpUnreliable(page))
+               return;
+
+       if (hugetlb_vmemmap_restore(h, page)) {
                spin_lock_irq(&hugetlb_lock);
                /*
                 * If we cannot allocate vmemmap pages, just refuse to free the
@@ -1553,6 +1554,13 @@ static void __update_and_free_page(struct hstate *h, struct page *page)
                return;
        }
 
+       /*
+        * Move PageHWPoison flag from head page to the raw error pages,
+        * which makes any healthy subpages reusable.
+        */
+       if (unlikely(PageHWPoison(page)))
+               hugetlb_clear_page_hwpoison(page);
+
        for (i = 0; i < pages_per_huge_page(h);
             i++, subpage = mem_map_next(subpage, page, i)) {
                subpage->flags &= ~(1 << PG_locked | 1 << PG_error |
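
The HWPoison flag now moves from the head page to the actual faulty
subpages right before the hugepage is dismantled, so healthy subpages
return to the allocator while only the bad ones stay quarantined. A
simplified sketch of hugetlb_clear_page_hwpoison() as it looked in
mm/memory-failure.c at the time (raw error pages are tracked on a
per-hugepage llist of struct raw_hwp_page; details trimmed):

	void hugetlb_clear_page_hwpoison(struct page *hpage)
	{
		struct llist_node *t, *tnode, *head;

		if (HPageRawHwpUnreliable(hpage))
			return;
		ClearPageHWPoison(hpage);
		head = llist_del_all(raw_hwp_list_head(hpage));
		llist_for_each_safe(tnode, t, head) {
			struct raw_hwp_page *p =
				container_of(tnode, struct raw_hwp_page, node);

			/* move the flag to the raw error subpage */
			SetPageHWPoison(p->page);
			kfree(p);
		}
	}
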
@@ -1618,7 +1626,7 @@ static DECLARE_WORK(free_hpage_work, free_hpage_workfn);
 
 static inline void flush_free_hpage_work(struct hstate *h)
 {
-       if (hugetlb_optimize_vmemmap_pages(h))
+       if (hugetlb_vmemmap_optimizable(h))
                flush_work(&free_hpage_work);
 }
 
@@ -1740,7 +1748,7 @@ static void __prep_account_new_huge_page(struct hstate *h, int nid)
 
 static void __prep_new_huge_page(struct hstate *h, struct page *page)
 {
-       hugetlb_vmemmap_free(h, page);
+       hugetlb_vmemmap_optimize(h, page);
        INIT_LIST_HEAD(&page->lru);
        set_compound_page_dtor(page, HUGETLB_PAGE_DTOR);
        hugetlb_set_page_subpool(page, NULL);
@@ -2113,17 +2121,8 @@ retry:
                 * Attempt to allocate vmemmap here so that we can take
                 * appropriate action on failure.
                 */
-               rc = hugetlb_vmemmap_alloc(h, head);
+               rc = hugetlb_vmemmap_restore(h, head);
                if (!rc) {
-                       /*
-                        * Move PageHWPoison flag from head page to the raw
-                        * error page, which makes any subpages rather than
-                        * the error page reusable.
-                        */
-                       if (PageHWPoison(head) && page != head) {
-                               SetPageHWPoison(page);
-                               ClearPageHWPoison(head);
-                       }
                        update_and_free_page(h, head, false);
                } else {
                        spin_lock_irq(&hugetlb_lock);
@@ -2152,11 +2151,17 @@ int dissolve_free_huge_pages(unsigned long start_pfn, unsigned long end_pfn)
        unsigned long pfn;
        struct page *page;
        int rc = 0;
+       unsigned int order;
+       struct hstate *h;
 
        if (!hugepages_supported())
                return rc;
 
-       for (pfn = start_pfn; pfn < end_pfn; pfn += 1 << minimum_order) {
+       order = huge_page_order(&default_hstate);
+       for_each_hstate(h)
+               order = min(order, huge_page_order(h));
+
+       for (pfn = start_pfn; pfn < end_pfn; pfn += 1 << order) {
                page = pfn_to_page(pfn);
                rc = dissolve_free_huge_page(page);
                if (rc)
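
With the global minimum_order gone, the scan step is derived per call
instead. Worked example on x86-64 with both hstates configured: 2 MiB
pages are order 9 and 1 GiB pages are order 18, so order = min(9, 18)
= 9 and the loop advances 1 << 9 = 512 pfns (2 MiB) per iteration,
exactly what the cached minimum_order used to give.
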
@@ -2432,8 +2437,7 @@ static void return_unused_surplus_pages(struct hstate *h,
        /* Uncommit the reservation */
        h->resv_huge_pages -= unused_resv_pages;
 
-       /* Cannot return gigantic pages currently */
-       if (hstate_is_gigantic(h))
+       if (hstate_is_gigantic(h) && !gigantic_page_runtime_supported())
                goto out;
 
        /*
@@ -2766,8 +2770,7 @@ retry:
                 * Fail with -EBUSY if not possible.
                 */
                spin_unlock_irq(&hugetlb_lock);
-               if (!isolate_huge_page(old_page, list))
-                       ret = -EBUSY;
+               ret = isolate_hugetlb(old_page, list);
                spin_lock_irq(&hugetlb_lock);
                goto free_new;
        } else if (!HPageFreed(old_page)) {
@@ -2843,7 +2846,7 @@ int isolate_or_dissolve_huge_page(struct page *page, struct list_head *list)
        if (hstate_is_gigantic(h))
                return -ENOMEM;
 
-       if (page_count(head) && isolate_huge_page(head, list))
+       if (page_count(head) && !isolate_hugetlb(head, list))
                ret = 0;
        else if (!page_count(head))
                ret = alloc_and_dissolve_huge_page(h, head, list);
@@ -3149,9 +3152,6 @@ static void __init hugetlb_init_hstates(void)
        struct hstate *h, *h2;
 
        for_each_hstate(h) {
-               if (minimum_order > huge_page_order(h))
-                       minimum_order = huge_page_order(h);
-
                /* oversize hugepages were init'ed in early boot */
                if (!hstate_is_gigantic(h))
                        hugetlb_hstate_alloc_pages(h);
@@ -3176,7 +3176,6 @@ static void __init hugetlb_init_hstates(void)
                                h->demote_order = h2->order;
                }
        }
-       VM_BUG_ON(minimum_order == UINT_MAX);
 }
 
 static void __init report_hugepages(void)
@@ -3187,8 +3186,10 @@ static void __init report_hugepages(void)
                char buf[32];
 
                string_get_size(huge_page_size(h), 1, STRING_UNITS_2, buf, 32);
-               pr_info("HugeTLB registered %s page size, pre-allocated %ld pages\n",
+               pr_info("HugeTLB: registered %s page size, pre-allocated %ld pages\n",
                        buf, h->free_huge_pages);
+               pr_info("HugeTLB: %d KiB vmemmap can be freed for a %s page\n",
+                       hugetlb_vmemmap_optimizable_size(h) / SZ_1K, buf);
        }
 }
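
Boot output thus gains a per-size line about vmemmap savings. For
example, on x86-64 with hugepages=512 on the command line (values
illustrative, derived from the format strings above):

	HugeTLB: registered 2.00 MiB page size, pre-allocated 512 pages
	HugeTLB: 28 KiB vmemmap can be freed for a 2.00 MiB page
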
 
@@ -3419,6 +3420,7 @@ static int demote_free_huge_page(struct hstate *h, struct page *page)
 {
        int i, nid = page_to_nid(page);
        struct hstate *target_hstate;
+       struct page *subpage;
        int rc = 0;
 
        target_hstate = size_to_hstate(PAGE_SIZE << h->demote_order);
@@ -3426,7 +3428,7 @@ static int demote_free_huge_page(struct hstate *h, struct page *page)
        remove_hugetlb_page_for_demote(h, page, false);
        spin_unlock_irq(&hugetlb_lock);
 
-       rc = hugetlb_vmemmap_alloc(h, page);
+       rc = hugetlb_vmemmap_restore(h, page);
        if (rc) {
                /* Allocation of vmemmap failed, we cannot demote the page */
                spin_lock_irq(&hugetlb_lock);
@@ -3452,15 +3454,16 @@ static int demote_free_huge_page(struct hstate *h, struct page *page)
        mutex_lock(&target_hstate->resize_lock);
        for (i = 0; i < pages_per_huge_page(h);
                                i += pages_per_huge_page(target_hstate)) {
+               subpage = nth_page(page, i);
                if (hstate_is_gigantic(target_hstate))
-                       prep_compound_gigantic_page_for_demote(page + i,
+                       prep_compound_gigantic_page_for_demote(subpage,
                                                        target_hstate->order);
                else
-                       prep_compound_page(page + i, target_hstate->order);
-               set_page_private(page + i, 0);
-               set_page_refcounted(page + i);
-               prep_new_huge_page(target_hstate, page + i, nid);
-               put_page(page + i);
+                       prep_compound_page(subpage, target_hstate->order);
+               set_page_private(subpage, 0);
+               set_page_refcounted(subpage);
+               prep_new_huge_page(target_hstate, subpage, nid);
+               put_page(subpage);
        }
        mutex_unlock(&target_hstate->resize_lock);
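
Using nth_page() instead of `page + i` matters for gigantic pages:
with CONFIG_SPARSEMEM and without CONFIG_SPARSEMEM_VMEMMAP, struct
pages are only virtually contiguous within a memory section, and a
gigantic page can span sections. For reference, the definition in
include/linux/mm.h:

	#if defined(CONFIG_SPARSEMEM) && !defined(CONFIG_SPARSEMEM_VMEMMAP)
	#define nth_page(page,n) pfn_to_page(page_to_pfn((page)) + (n))
	#else
	#define nth_page(page,n) ((page) + (n))
	#endif
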
 
@@ -4116,7 +4119,6 @@ void __init hugetlb_add_hstate(unsigned int order)
        h->next_nid_to_free = first_memory_node;
        snprintf(h->name, HSTATE_NAME_LEN, "hugepages-%lukB",
                                        huge_page_size(h)/1024);
-       hugetlb_vmemmap_init(h);
 
        parsed_hstate = h;
 }
@@ -4482,22 +4484,20 @@ int hugetlb_report_node_meminfo(char *buf, int len, int nid)
                             nid, h->surplus_huge_pages_node[nid]);
 }
 
-void hugetlb_show_meminfo(void)
+void hugetlb_show_meminfo_node(int nid)
 {
        struct hstate *h;
-       int nid;
 
        if (!hugepages_supported())
                return;
 
-       for_each_node_state(nid, N_MEMORY)
-               for_each_hstate(h)
-                       pr_info("Node %d hugepages_total=%u hugepages_free=%u hugepages_surp=%u hugepages_size=%lukB\n",
-                               nid,
-                               h->nr_huge_pages_node[nid],
-                               h->free_huge_pages_node[nid],
-                               h->surplus_huge_pages_node[nid],
-                               huge_page_size(h) / SZ_1K);
+       for_each_hstate(h)
+               printk("Node %d hugepages_total=%u hugepages_free=%u hugepages_surp=%u hugepages_size=%lukB\n",
+                       nid,
+                       h->nr_huge_pages_node[nid],
+                       h->free_huge_pages_node[nid],
+                       h->surplus_huge_pages_node[nid],
+                       huge_page_size(h) / SZ_1K);
 }
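
The new nid parameter lets show_mem() print hugetlb counters only for
the nodes it is already reporting on, instead of dumping every node on
every report. A sketch of the intended call site in mm/page_alloc.c
(simplified; surrounding loop assumed from that file):

	for_each_online_node(nid) {
		if (show_mem_node_skip(filter, nid, nodemask))
			continue;

		hugetlb_show_meminfo_node(nid);
	}
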
 
 void hugetlb_report_usage(struct seq_file *m, struct mm_struct *mm)
@@ -4732,6 +4732,7 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
        unsigned long npages = pages_per_huge_page(h);
        struct address_space *mapping = src_vma->vm_file->f_mapping;
        struct mmu_notifier_range range;
+       unsigned long last_addr_mask;
        int ret = 0;
 
        if (cow) {
@@ -4751,11 +4752,14 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
                i_mmap_lock_read(mapping);
        }
 
+       last_addr_mask = hugetlb_mask_last_page(h);
        for (addr = src_vma->vm_start; addr < src_vma->vm_end; addr += sz) {
                spinlock_t *src_ptl, *dst_ptl;
                src_pte = huge_pte_offset(src, addr, sz);
-               if (!src_pte)
+               if (!src_pte) {
+                       addr |= last_addr_mask;
                        continue;
+               }
                dst_pte = huge_pte_alloc(dst, dst_vma, addr, sz);
                if (!dst_pte) {
                        ret = -ENOMEM;
@@ -4772,8 +4776,10 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
                 * after taking the lock below.
                 */
                dst_entry = huge_ptep_get(dst_pte);
-               if ((dst_pte == src_pte) || !huge_pte_none(dst_entry))
+               if ((dst_pte == src_pte) || !huge_pte_none(dst_entry)) {
+                       addr |= last_addr_mask;
                        continue;
+               }
 
                dst_ptl = huge_pte_lock(h, dst, dst_pte);
                src_ptl = huge_pte_lockptr(h, src, src_pte);
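
The `addr |= last_addr_mask` pattern replaces the in-place address
update that huge_pmd_unshare() used to do through its pointer argument
(see the huge_pmd_unshare() hunks below). Worked example for 2 MiB
pages on x86-64: hugetlb_mask_last_page() returns PUD_SIZE - PMD_SIZE
= 0x40000000 - 0x200000 = 0x3fe00000. When huge_pte_offset() returns
NULL at a PUD-aligned addr, `addr |= 0x3fe00000` moves addr to the
last 2 MiB slot of that 1 GiB region, and the loop's `addr += sz` then
crosses into the next PUD: one iteration instead of 512 for an empty
1 GiB stretch. The OR is safe because a NULL lookup means nothing is
mapped between addr and the end of that page-table page.
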
@@ -4808,12 +4814,11 @@ again:
                                entry = swp_entry_to_pte(swp_entry);
                                if (userfaultfd_wp(src_vma) && uffd_wp)
                                        entry = huge_pte_mkuffd_wp(entry);
-                               set_huge_swap_pte_at(src, addr, src_pte,
-                                                    entry, sz);
+                               set_huge_pte_at(src, addr, src_pte, entry);
                        }
                        if (!userfaultfd_wp(dst_vma) && uffd_wp)
                                entry = huge_pte_clear_uffd_wp(entry);
-                       set_huge_swap_pte_at(dst, addr, dst_pte, entry, sz);
+                       set_huge_pte_at(dst, addr, dst_pte, entry);
                } else if (unlikely(is_pte_marker(entry))) {
                        /*
                         * We copy the pte marker only if the dst vma has
@@ -4880,7 +4885,7 @@ again:
                                 * table protection not changing it to point
                                 * to a new page.
                                 *
-                                * See Documentation/vm/mmu_notifier.rst
+                                * See Documentation/mm/mmu_notifier.rst
                                 */
                                huge_ptep_set_wrprotect(src, addr, src_pte);
                                entry = huge_pte_wrprotect(entry);
@@ -4939,7 +4944,7 @@ int move_hugetlb_page_tables(struct vm_area_struct *vma,
        unsigned long sz = huge_page_size(h);
        struct mm_struct *mm = vma->vm_mm;
        unsigned long old_end = old_addr + len;
-       unsigned long old_addr_copy;
+       unsigned long last_addr_mask;
        pte_t *src_pte, *dst_pte;
        struct mmu_notifier_range range;
        bool shared_pmd = false;
@@ -4954,23 +4959,23 @@ int move_hugetlb_page_tables(struct vm_area_struct *vma,
        flush_cache_range(vma, range.start, range.end);
 
        mmu_notifier_invalidate_range_start(&range);
+       last_addr_mask = hugetlb_mask_last_page(h);
        /* Prevent race with file truncation */
        i_mmap_lock_write(mapping);
        for (; old_addr < old_end; old_addr += sz, new_addr += sz) {
                src_pte = huge_pte_offset(mm, old_addr, sz);
-               if (!src_pte)
+               if (!src_pte) {
+                       old_addr |= last_addr_mask;
+                       new_addr |= last_addr_mask;
                        continue;
+               }
                if (huge_pte_none(huge_ptep_get(src_pte)))
                        continue;
 
-               /* old_addr arg to huge_pmd_unshare() is a pointer and so the
-                * arg may be modified. Pass a copy instead to preserve the
-                * value in old_addr.
-                */
-               old_addr_copy = old_addr;
-
-               if (huge_pmd_unshare(mm, vma, &old_addr_copy, src_pte)) {
+               if (huge_pmd_unshare(mm, vma, old_addr, src_pte)) {
                        shared_pmd = true;
+                       old_addr |= last_addr_mask;
+                       new_addr |= last_addr_mask;
                        continue;
                }
 
@@ -5004,6 +5009,7 @@ static void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct
        struct hstate *h = hstate_vma(vma);
        unsigned long sz = huge_page_size(h);
        struct mmu_notifier_range range;
+       unsigned long last_addr_mask;
        bool force_flush = false;
 
        WARN_ON(!is_vm_hugetlb_page(vma));
@@ -5024,17 +5030,21 @@ static void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct
                                end);
        adjust_range_if_pmd_sharing_possible(vma, &range.start, &range.end);
        mmu_notifier_invalidate_range_start(&range);
+       last_addr_mask = hugetlb_mask_last_page(h);
        address = start;
        for (; address < end; address += sz) {
                ptep = huge_pte_offset(mm, address, sz);
-               if (!ptep)
+               if (!ptep) {
+                       address |= last_addr_mask;
                        continue;
+               }
 
                ptl = huge_pte_lock(h, mm, ptep);
-               if (huge_pmd_unshare(mm, vma, &address, ptep)) {
+               if (huge_pmd_unshare(mm, vma, address, ptep)) {
                        spin_unlock(ptl);
                        tlb_flush_pmd_range(tlb, address & PUD_MASK, PUD_SIZE);
                        force_flush = true;
+                       address |= last_addr_mask;
                        continue;
                }
 
@@ -5233,6 +5243,21 @@ static vm_fault_t hugetlb_wp(struct mm_struct *mm, struct vm_area_struct *vma,
        VM_BUG_ON(unshare && (flags & FOLL_WRITE));
        VM_BUG_ON(!unshare && !(flags & FOLL_WRITE));
 
+       /*
+        * hugetlb does not support FOLL_FORCE-style write faults that keep the
+        * PTE mapped R/O such as maybe_mkwrite() would do.
+        */
+       if (WARN_ON_ONCE(!unshare && !(vma->vm_flags & VM_WRITE)))
+               return VM_FAULT_SIGSEGV;
+
+       /* Let's take out MAP_SHARED mappings first. */
+       if (vma->vm_flags & VM_MAYSHARE) {
+               if (unlikely(unshare))
+                       return 0;
+               set_huge_ptep_writable(vma, haddr, ptep);
+               return 0;
+       }
+
        pte = huge_ptep_get(ptep);
        old_page = pte_page(pte);
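
For MAP_SHARED there is never a COW copy to make: a write fault (or an
unshare request, which is meaningless here) only needs to mark the PTE
writable. For reference, the existing helper called above looks like
this in mm/hugetlb.c:

	static void set_huge_ptep_writable(struct vm_area_struct *vma,
					   unsigned long address, pte_t *ptep)
	{
		pte_t entry;

		entry = huge_pte_mkwrite(huge_pte_mkdirty(huge_ptep_get(ptep)));
		if (huge_ptep_set_access_flags(vma, address, ptep, entry, 1))
			update_mmu_cache(vma, address, ptep);
	}
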
 
@@ -5419,19 +5444,25 @@ static bool hugetlbfs_pagecache_present(struct hstate *h,
 int huge_add_to_page_cache(struct page *page, struct address_space *mapping,
                           pgoff_t idx)
 {
+       struct folio *folio = page_folio(page);
        struct inode *inode = mapping->host;
        struct hstate *h = hstate_inode(inode);
-       int err = add_to_page_cache(page, mapping, idx, GFP_KERNEL);
+       int err;
 
-       if (err)
+       __folio_set_locked(folio);
+       err = __filemap_add_folio(mapping, folio, idx, GFP_KERNEL, NULL);
+
+       if (unlikely(err)) {
+               __folio_clear_locked(folio);
                return err;
+       }
        ClearHPageRestoreReserve(page);
 
        /*
-        * set page dirty so that it will not be removed from cache/file
+        * mark folio dirty so that it will not be removed from cache/file
         * by non-hugetlbfs specific code paths.
         */
-       set_page_dirty(page);
+       folio_mark_dirty(folio);
 
        spin_lock(&inode->i_lock);
        inode->i_blocks += blocks_per_huge_page(h);
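
Open-coding the page-cache add lets the error path clear the lock bit
non-atomically: the folio is freshly allocated and not yet visible to
anyone else, so __folio_set_locked()/__folio_clear_locked() suffice and
no wake-up is needed. For comparison, the replaced add_to_page_cache()
wrapper did roughly the following (sketch of the pre-folio API):

	__SetPageLocked(page);
	err = add_to_page_cache_locked(page, mapping, offset, gfp_mask);
	if (unlikely(err))
		__ClearPageLocked(page);
	return err;
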
@@ -5708,7 +5739,7 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
                 */
                entry = huge_ptep_get(ptep);
                if (unlikely(is_hugetlb_entry_migration(entry))) {
-                       migration_entry_wait_huge(vma, mm, ptep);
+                       migration_entry_wait_huge(vma, ptep);
                        return 0;
                } else if (unlikely(is_hugetlb_entry_hwpoisoned(entry)))
                        return VM_FAULT_HWPOISON_LARGE |
@@ -5767,12 +5798,11 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
         * If we are going to COW/unshare the mapping later, we examine the
         * pending reservations for this page now. This will ensure that any
         * allocations necessary to record that reservation occur outside the
-        * spinlock. For private mappings, we also lookup the pagecache
-        * page now as it is used to determine if a reservation has been
-        * consumed.
+        * spinlock. Also lookup the pagecache page now as it is used to
+        * determine if a reservation has been consumed.
         */
        if ((flags & (FAULT_FLAG_WRITE|FAULT_FLAG_UNSHARE)) &&
-           !huge_pte_write(entry)) {
+           !(vma->vm_flags & VM_MAYSHARE) && !huge_pte_write(entry)) {
                if (vma_needs_reservation(h, vma, haddr) < 0) {
                        ret = VM_FAULT_OOM;
                        goto out_mutex;
@@ -5780,9 +5810,7 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
                /* Just decrements count, does not deallocate */
                vma_end_reservation(h, vma, haddr);
 
-               if (!(vma->vm_flags & VM_MAYSHARE))
-                       pagecache_page = hugetlbfs_pagecache_page(h,
-                                                               vma, haddr);
+               pagecache_page = hugetlbfs_pagecache_page(h, vma, haddr);
        }
 
        ptl = huge_pte_lock(h, mm, ptep);
@@ -6015,7 +6043,7 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm,
        if (!huge_pte_none_mostly(huge_ptep_get(dst_pte)))
                goto out_release_unlock;
 
-       if (vm_shared) {
+       if (page_in_pagecache) {
                page_dup_file_rmap(page, true);
        } else {
                ClearHPageRestoreReserve(page);
@@ -6046,8 +6074,6 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm,
 
        set_huge_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte);
 
-       (void)huge_ptep_set_access_flags(dst_vma, dst_addr, dst_pte, _dst_pte,
-                                       dst_vma->vm_flags & VM_WRITE);
        hugetlb_count_add(pages_per_huge_page(h), dst_mm);
 
        /* No need to invalidate - it was non-present before */
@@ -6299,6 +6325,7 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
        unsigned long pages = 0, psize = huge_page_size(h);
        bool shared_pmd = false;
        struct mmu_notifier_range range;
+       unsigned long last_addr_mask;
        bool uffd_wp = cp_flags & MM_CP_UFFD_WP;
        bool uffd_wp_resolve = cp_flags & MM_CP_UFFD_WP_RESOLVE;
 
@@ -6315,14 +6342,17 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
        flush_cache_range(vma, range.start, range.end);
 
        mmu_notifier_invalidate_range_start(&range);
+       last_addr_mask = hugetlb_mask_last_page(h);
        i_mmap_lock_write(vma->vm_file->f_mapping);
        for (; address < end; address += psize) {
                spinlock_t *ptl;
                ptep = huge_pte_offset(mm, address, psize);
-               if (!ptep)
+               if (!ptep) {
+                       address |= last_addr_mask;
                        continue;
+               }
                ptl = huge_pte_lock(h, mm, ptep);
-               if (huge_pmd_unshare(mm, vma, &address, ptep)) {
+               if (huge_pmd_unshare(mm, vma, address, ptep)) {
                        /*
                         * When uffd-wp is enabled on the vma, unshare
                         * shouldn't happen at all.  Warn about it if it
@@ -6332,6 +6362,7 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
                        pages++;
                        spin_unlock(ptl);
                        shared_pmd = true;
+                       address |= last_addr_mask;
                        continue;
                }
                pte = huge_ptep_get(ptep);
@@ -6357,8 +6388,7 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
                                        newpte = pte_swp_mkuffd_wp(newpte);
                                else if (uffd_wp_resolve)
                                        newpte = pte_swp_clear_uffd_wp(newpte);
-                               set_huge_swap_pte_at(mm, address, ptep,
-                                                    newpte, psize);
+                               set_huge_pte_at(mm, address, ptep, newpte);
                                pages++;
                        }
                        spin_unlock(ptl);
@@ -6409,7 +6439,7 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
         * No need to call mmu_notifier_invalidate_range() we are downgrading
         * page table protection not changing it to point to a new page.
         *
-        * See Documentation/vm/mmu_notifier.rst
+        * See Documentation/mm/mmu_notifier.rst
         */
        i_mmap_unlock_write(vma->vm_file->f_mapping);
        mmu_notifier_invalidate_range_end(&range);
@@ -6755,11 +6785,11 @@ out:
  *         0 the underlying pte page is not shared, or it is the last user
  */
 int huge_pmd_unshare(struct mm_struct *mm, struct vm_area_struct *vma,
-                                       unsigned long *addr, pte_t *ptep)
+                                       unsigned long addr, pte_t *ptep)
 {
-       pgd_t *pgd = pgd_offset(mm, *addr);
-       p4d_t *p4d = p4d_offset(pgd, *addr);
-       pud_t *pud = pud_offset(p4d, *addr);
+       pgd_t *pgd = pgd_offset(mm, addr);
+       p4d_t *p4d = p4d_offset(pgd, addr);
+       pud_t *pud = pud_offset(p4d, addr);
 
        i_mmap_assert_write_locked(vma->vm_file->f_mapping);
        BUG_ON(page_count(virt_to_page(ptep)) == 0);
@@ -6769,14 +6799,6 @@ int huge_pmd_unshare(struct mm_struct *mm, struct vm_area_struct *vma,
        pud_clear(pud);
        put_page(virt_to_page(ptep));
        mm_dec_nr_pmds(mm);
-       /*
-        * This update of passed address optimizes loops sequentially
-        * processing addresses in increments of huge page size (PMD_SIZE
-        * in this case).  By clearing the pud, a PUD_SIZE area is unmapped.
-        * Update address to the 'last page' in the cleared area so that
-        * calling loop can move to first page past this area.
-        */
-       *addr |= PUD_SIZE - PMD_SIZE;
        return 1;
 }
 
@@ -6788,7 +6810,7 @@ pte_t *huge_pmd_share(struct mm_struct *mm, struct vm_area_struct *vma,
 }
 
 int huge_pmd_unshare(struct mm_struct *mm, struct vm_area_struct *vma,
-                               unsigned long *addr, pte_t *ptep)
+                               unsigned long addr, pte_t *ptep)
 {
        return 0;
 }
@@ -6871,6 +6893,37 @@ pte_t *huge_pte_offset(struct mm_struct *mm,
        return (pte_t *)pmd;
 }
 
+/*
+ * Return a mask that can be used to update an address to the last huge
+ * page in a page table page mapping size.  Used to skip non-present
+ * page table entries when linearly scanning address ranges.  Architectures
+ * with unique huge page to page table relationships can define their own
+ * version of this routine.
+ */
+unsigned long hugetlb_mask_last_page(struct hstate *h)
+{
+       unsigned long hp_size = huge_page_size(h);
+
+       if (hp_size == PUD_SIZE)
+               return P4D_SIZE - PUD_SIZE;
+       else if (hp_size == PMD_SIZE)
+               return PUD_SIZE - PMD_SIZE;
+       else
+               return 0UL;
+}
+
+#else
+
+/* See description above.  Architectures can provide their own version. */
+__weak unsigned long hugetlb_mask_last_page(struct hstate *h)
+{
+#ifdef CONFIG_ARCH_WANT_HUGE_PMD_SHARE
+       if (huge_page_size(h) == PMD_SIZE)
+               return PUD_SIZE - PMD_SIZE;
+#endif
+       return 0UL;
+}
+
 #endif /* CONFIG_ARCH_WANT_GENERAL_HUGETLB */
 
 /*
@@ -6893,12 +6946,13 @@ follow_huge_pd(struct vm_area_struct *vma,
 }
 
 struct page * __weak
-follow_huge_pmd(struct mm_struct *mm, unsigned long address,
-               pmd_t *pmd, int flags)
+follow_huge_pmd_pte(struct vm_area_struct *vma, unsigned long address, int flags)
 {
+       struct hstate *h = hstate_vma(vma);
+       struct mm_struct *mm = vma->vm_mm;
        struct page *page = NULL;
        spinlock_t *ptl;
-       pte_t pte;
+       pte_t *ptep, pte;
 
        /*
         * FOLL_PIN is not supported for follow_page(). Ordinary GUP goes via
@@ -6908,17 +6962,15 @@ follow_huge_pmd(struct mm_struct *mm, unsigned long address,
                return NULL;
 
 retry:
-       ptl = pmd_lockptr(mm, pmd);
-       spin_lock(ptl);
-       /*
-        * make sure that the address range covered by this pmd is not
-        * unmapped from other threads.
-        */
-       if (!pmd_huge(*pmd))
-               goto out;
-       pte = huge_ptep_get((pte_t *)pmd);
+       ptep = huge_pte_offset(mm, address, huge_page_size(h));
+       if (!ptep)
+               return NULL;
+
+       ptl = huge_pte_lock(h, mm, ptep);
+       pte = huge_ptep_get(ptep);
        if (pte_present(pte)) {
-               page = pmd_page(*pmd) + ((address & ~PMD_MASK) >> PAGE_SHIFT);
+               page = pte_page(pte) +
+                       ((address & ~huge_page_mask(h)) >> PAGE_SHIFT);
                /*
                 * try_grab_page() should always succeed here, because: a) we
                 * hold the pmd (ptl) lock, and b) we've just checked that the
@@ -6934,7 +6986,7 @@ retry:
        } else {
                if (is_hugetlb_entry_migration(pte)) {
                        spin_unlock(ptl);
-                       __migration_entry_wait(mm, (pte_t *)pmd, ptl);
+                       __migration_entry_wait_huge(ptep, ptl);
                        goto retry;
                }
                /*
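
This is the core of the race fix named in the subject: the old
follow_huge_pmd() always took pmd_lockptr(), but a CONT-PTE size
hugetlb page (e.g. 64 KiB on arm64 with 4 KiB base pages) is mapped by
a set of PTEs, not a PMD, so the lookup could race with concurrent
unmap or migration while holding the wrong lock. Going through
huge_pte_offset() + huge_pte_lock() picks the lock that actually
covers the entry for this hstate. For reference, from
include/linux/hugetlb.h:

	static inline spinlock_t *huge_pte_lockptr(struct hstate *h,
						   struct mm_struct *mm, pte_t *pte)
	{
		if (huge_page_size(h) == PMD_SIZE)
			return pmd_lockptr(mm, (pmd_t *) pte);
		VM_BUG_ON(huge_page_size(h) == PAGE_SIZE);
		return &mm->page_table_lock;
	}
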
@@ -6951,10 +7003,38 @@ struct page * __weak
 follow_huge_pud(struct mm_struct *mm, unsigned long address,
                pud_t *pud, int flags)
 {
-       if (flags & (FOLL_GET | FOLL_PIN))
+       struct page *page = NULL;
+       spinlock_t *ptl;
+       pte_t pte;
+
+       if (WARN_ON_ONCE(flags & FOLL_PIN))
                return NULL;
 
-       return pte_page(*(pte_t *)pud) + ((address & ~PUD_MASK) >> PAGE_SHIFT);
+retry:
+       ptl = huge_pte_lock(hstate_sizelog(PUD_SHIFT), mm, (pte_t *)pud);
+       if (!pud_huge(*pud))
+               goto out;
+       pte = huge_ptep_get((pte_t *)pud);
+       if (pte_present(pte)) {
+               page = pud_page(*pud) + ((address & ~PUD_MASK) >> PAGE_SHIFT);
+               if (WARN_ON_ONCE(!try_grab_page(page, flags))) {
+                       page = NULL;
+                       goto out;
+               }
+       } else {
+               if (is_hugetlb_entry_migration(pte)) {
+                       spin_unlock(ptl);
+                       __migration_entry_wait(mm, (pte_t *)pud, ptl);
+                       goto retry;
+               }
+               /*
+                * hwpoisoned entry is treated as no_page_table in
+                * follow_page_mask().
+                */
+       }
+out:
+       spin_unlock(ptl);
+       return page;
 }
 
 struct page * __weak
@@ -6966,15 +7046,15 @@ follow_huge_pgd(struct mm_struct *mm, unsigned long address, pgd_t *pgd, int fla
        return pte_page(*(pte_t *)pgd) + ((address & ~PGDIR_MASK) >> PAGE_SHIFT);
 }
 
-bool isolate_huge_page(struct page *page, struct list_head *list)
+int isolate_hugetlb(struct page *page, struct list_head *list)
 {
-       bool ret = true;
+       int ret = 0;
 
        spin_lock_irq(&hugetlb_lock);
        if (!PageHeadHuge(page) ||
            !HPageMigratable(page) ||
            !get_page_unless_zero(page)) {
-               ret = false;
+               ret = -EBUSY;
                goto unlock;
        }
        ClearHPageMigratable(page);
@@ -7094,21 +7174,18 @@ void hugetlb_unshare_all_pmds(struct vm_area_struct *vma)
        mmu_notifier_invalidate_range_start(&range);
        i_mmap_lock_write(vma->vm_file->f_mapping);
        for (address = start; address < end; address += PUD_SIZE) {
-               unsigned long tmp = address;
-
                ptep = huge_pte_offset(mm, address, sz);
                if (!ptep)
                        continue;
                ptl = huge_pte_lock(h, mm, ptep);
-               /* We don't want 'address' to be changed */
-               huge_pmd_unshare(mm, vma, &tmp, ptep);
+               huge_pmd_unshare(mm, vma, address, ptep);
                spin_unlock(ptl);
        }
        flush_hugetlb_tlb_range(vma, start, end);
        i_mmap_unlock_write(vma->vm_file->f_mapping);
        /*
         * No need to call mmu_notifier_invalidate_range(), see
-        * Documentation/vm/mmu_notifier.rst.
+        * Documentation/mm/mmu_notifier.rst.
         */
        mmu_notifier_invalidate_range_end(&range);
 }