mm/hugetlb: fix races when looking up a CONT-PTE/PMD size hugetlb page
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index a18c071c294e35c33823edf0f21dd3094afd031b..9564bf817e6a8d92999d2653304abf406a3f96d2 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -66,12 +66,6 @@ static bool hugetlb_cma_page(struct page *page, unsigned int order)
 #endif
 static unsigned long hugetlb_cma_size __initdata;
 
-/*
- * Minimum page order among possible hugepage sizes, set to a proper value
- * at boot time.
- */
-static unsigned int minimum_order __read_mostly = UINT_MAX;
-
 __initdata LIST_HEAD(huge_boot_pages);
 
 /* for command line parsing */
@@ -1135,7 +1129,7 @@ static struct page *dequeue_huge_page_node_exact(struct hstate *h, int nid)
 
        lockdep_assert_held(&hugetlb_lock);
        list_for_each_entry(page, &h->hugepage_freelists[nid], lru) {
-               if (pin && !is_pinnable_page(page))
+               if (pin && !is_longterm_pinnable_page(page))
                        continue;
 
                if (PageHWPoison(page))
@@ -1541,7 +1535,14 @@ static void __update_and_free_page(struct hstate *h, struct page *page)
        if (hstate_is_gigantic(h) && !gigantic_page_runtime_supported())
                return;
 
-       if (hugetlb_vmemmap_alloc(h, page)) {
+       /*
+        * If we don't know which subpages are hwpoisoned, we can't free
+        * the hugepage, so it's leaked intentionally.
+        */
+       if (HPageRawHwpUnreliable(page))
+               return;
+
+       if (hugetlb_vmemmap_restore(h, page)) {
                spin_lock_irq(&hugetlb_lock);
                /*
                 * If we cannot allocate vmemmap pages, just refuse to free the
@@ -1553,6 +1554,13 @@ static void __update_and_free_page(struct hstate *h, struct page *page)
                return;
        }
 
+       /*
+        * Move PageHWPoison flag from head page to the raw error pages,
+        * which makes any healthy subpages reusable.
+        */
+       if (unlikely(PageHWPoison(page)))
+               hugetlb_clear_page_hwpoison(page);
+
        for (i = 0; i < pages_per_huge_page(h);
             i++, subpage = mem_map_next(subpage, page, i)) {
                subpage->flags &= ~(1 << PG_locked | 1 << PG_error |
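
The HWPoison flag now moves from the head page to the actual faulty
subpages right before the hugepage is dismantled, so healthy subpages
return to the allocator while only the bad ones stay quarantined. A
simplified sketch of hugetlb_clear_page_hwpoison() as it looked in
mm/memory-failure.c at the time (raw error pages are tracked on a
per-hugepage llist of struct raw_hwp_page; details trimmed):

	void hugetlb_clear_page_hwpoison(struct page *hpage)
	{
		struct llist_node *t, *tnode, *head;

		if (HPageRawHwpUnreliable(hpage))
			return;
		ClearPageHWPoison(hpage);
		head = llist_del_all(raw_hwp_list_head(hpage));
		llist_for_each_safe(tnode, t, head) {
			struct raw_hwp_page *p =
				container_of(tnode, struct raw_hwp_page, node);

			/* move the flag to the raw error subpage */
			SetPageHWPoison(p->page);
			kfree(p);
		}
	}
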
@@ -1618,7 +1626,7 @@ static DECLARE_WORK(free_hpage_work, free_hpage_workfn);
 
 static inline void flush_free_hpage_work(struct hstate *h)
 {
-       if (hugetlb_optimize_vmemmap_pages(h))
+       if (hugetlb_vmemmap_optimizable(h))
                flush_work(&free_hpage_work);
 }
 
@@ -1740,7 +1748,7 @@ static void __prep_account_new_huge_page(struct hstate *h, int nid)
 
 static void __prep_new_huge_page(struct hstate *h, struct page *page)
 {
-       hugetlb_vmemmap_free(h, page);
+       hugetlb_vmemmap_optimize(h, page);
        INIT_LIST_HEAD(&page->lru);
        set_compound_page_dtor(page, HUGETLB_PAGE_DTOR);
        hugetlb_set_page_subpool(page, NULL);
@@ -2113,17 +2121,8 @@ retry:
                 * Attempt to allocate vmemmap here so that we can take
                 * appropriate action on failure.
                 */
-               rc = hugetlb_vmemmap_alloc(h, head);
+               rc = hugetlb_vmemmap_restore(h, head);
                if (!rc) {
-                       /*
-                        * Move PageHWPoison flag from head page to the raw
-                        * error page, which makes any subpages rather than
-                        * the error page reusable.
-                        */
-                       if (PageHWPoison(head) && page != head) {
-                               SetPageHWPoison(page);
-                               ClearPageHWPoison(head);
-                       }
                        update_and_free_page(h, head, false);
                } else {
                        spin_lock_irq(&hugetlb_lock);
@@ -2152,11 +2151,17 @@ int dissolve_free_huge_pages(unsigned long start_pfn, unsigned long end_pfn)
        unsigned long pfn;
        struct page *page;
        int rc = 0;
+       unsigned int order;
+       struct hstate *h;
 
        if (!hugepages_supported())
                return rc;
 
-       for (pfn = start_pfn; pfn < end_pfn; pfn += 1 << minimum_order) {
+       order = huge_page_order(&default_hstate);
+       for_each_hstate(h)
+               order = min(order, huge_page_order(h));
+
+       for (pfn = start_pfn; pfn < end_pfn; pfn += 1 << order) {
                page = pfn_to_page(pfn);
                rc = dissolve_free_huge_page(page);
                if (rc)
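
With the global minimum_order gone, the scan step is derived per call
instead. Worked example on x86-64 with both hstates configured: 2 MiB
pages are order 9 and 1 GiB pages are order 18, so order = min(9, 18)
= 9 and the loop advances 1 << 9 = 512 pfns (2 MiB) per iteration,
exactly what the cached minimum_order used to give.
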
@@ -2432,8 +2437,7 @@ static void return_unused_surplus_pages(struct hstate *h,
        /* Uncommit the reservation */
        h->resv_huge_pages -= unused_resv_pages;
 
-       /* Cannot return gigantic pages currently */
-       if (hstate_is_gigantic(h))
+       if (hstate_is_gigantic(h) && !gigantic_page_runtime_supported())
                goto out;
 
        /*
@@ -2766,8 +2770,7 @@ retry:
                 * Fail with -EBUSY if not possible.
                 */
                spin_unlock_irq(&hugetlb_lock);
-               if (!isolate_huge_page(old_page, list))
-                       ret = -EBUSY;
+               ret = isolate_hugetlb(old_page, list);
                spin_lock_irq(&hugetlb_lock);
                goto free_new;
        } else if (!HPageFreed(old_page)) {
@@ -2843,7 +2846,7 @@ int isolate_or_dissolve_huge_page(struct page *page, struct list_head *list)
        if (hstate_is_gigantic(h))
                return -ENOMEM;
 
-       if (page_count(head) && isolate_huge_page(head, list))
+       if (page_count(head) && !isolate_hugetlb(head, list))
                ret = 0;
        else if (!page_count(head))
                ret = alloc_and_dissolve_huge_page(h, head, list);
@@ -3149,9 +3152,6 @@ static void __init hugetlb_init_hstates(void)
        struct hstate *h, *h2;
 
        for_each_hstate(h) {
-               if (minimum_order > huge_page_order(h))
-                       minimum_order = huge_page_order(h);
-
                /* oversize hugepages were init'ed in early boot */
                if (!hstate_is_gigantic(h))
                        hugetlb_hstate_alloc_pages(h);
@@ -3176,7 +3176,6 @@ static void __init hugetlb_init_hstates(void)
                                h->demote_order = h2->order;
                }
        }
-       VM_BUG_ON(minimum_order == UINT_MAX);
 }
 
 static void __init report_hugepages(void)
@@ -3187,8 +3186,10 @@ static void __init report_hugepages(void)
                char buf[32];
 
                string_get_size(huge_page_size(h), 1, STRING_UNITS_2, buf, 32);
-               pr_info("HugeTLB registered %s page size, pre-allocated %ld pages\n",
+               pr_info("HugeTLB: registered %s page size, pre-allocated %ld pages\n",
                        buf, h->free_huge_pages);
+               pr_info("HugeTLB: %d KiB vmemmap can be freed for a %s page\n",
+                       hugetlb_vmemmap_optimizable_size(h) / SZ_1K, buf);
        }
 }
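
Boot output thus gains a per-size line about vmemmap savings. For
example, on x86-64 with hugepages=512 on the command line (values
illustrative, derived from the format strings above):

	HugeTLB: registered 2.00 MiB page size, pre-allocated 512 pages
	HugeTLB: 28 KiB vmemmap can be freed for a 2.00 MiB page
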
 
@@ -3419,6 +3420,7 @@ static int demote_free_huge_page(struct hstate *h, struct page *page)
 {
        int i, nid = page_to_nid(page);
        struct hstate *target_hstate;
+       struct page *subpage;
        int rc = 0;
 
        target_hstate = size_to_hstate(PAGE_SIZE << h->demote_order);
@@ -3426,7 +3428,7 @@ static int demote_free_huge_page(struct hstate *h, struct page *page)
        remove_hugetlb_page_for_demote(h, page, false);
        spin_unlock_irq(&hugetlb_lock);
 
-       rc = hugetlb_vmemmap_alloc(h, page);
+       rc = hugetlb_vmemmap_restore(h, page);
        if (rc) {
                /* Allocation of vmemmap failed, we cannot demote the page */
                spin_lock_irq(&hugetlb_lock);
@@ -3452,15 +3454,16 @@ static int demote_free_huge_page(struct hstate *h, struct page *page)
        mutex_lock(&target_hstate->resize_lock);
        for (i = 0; i < pages_per_huge_page(h);
                                i += pages_per_huge_page(target_hstate)) {
+               subpage = nth_page(page, i);
                if (hstate_is_gigantic(target_hstate))
-                       prep_compound_gigantic_page_for_demote(page + i,
+                       prep_compound_gigantic_page_for_demote(subpage,
                                                        target_hstate->order);
                else
-                       prep_compound_page(page + i, target_hstate->order);
-               set_page_private(page + i, 0);
-               set_page_refcounted(page + i);
-               prep_new_huge_page(target_hstate, page + i, nid);
-               put_page(page + i);
+                       prep_compound_page(subpage, target_hstate->order);
+               set_page_private(subpage, 0);
+               set_page_refcounted(subpage);
+               prep_new_huge_page(target_hstate, subpage, nid);
+               put_page(subpage);
        }
        mutex_unlock(&target_hstate->resize_lock);
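
Using nth_page() instead of `page + i` matters for gigantic pages:
with CONFIG_SPARSEMEM and without CONFIG_SPARSEMEM_VMEMMAP, struct
pages are only virtually contiguous within a memory section, and a
gigantic page can span sections. For reference, the definition in
include/linux/mm.h:

	#if defined(CONFIG_SPARSEMEM) && !defined(CONFIG_SPARSEMEM_VMEMMAP)
	#define nth_page(page,n) pfn_to_page(page_to_pfn((page)) + (n))
	#else
	#define nth_page(page,n) ((page) + (n))
	#endif
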
 
@@ -4116,7 +4119,6 @@ void __init hugetlb_add_hstate(unsigned int order)
        h->next_nid_to_free = first_memory_node;
        snprintf(h->name, HSTATE_NAME_LEN, "hugepages-%lukB",
                                        huge_page_size(h)/1024);
-       hugetlb_vmemmap_init(h);
 
        parsed_hstate = h;
 }
@@ -4482,22 +4484,20 @@ int hugetlb_report_node_meminfo(char *buf, int len, int nid)
                             nid, h->surplus_huge_pages_node[nid]);
 }
 
-void hugetlb_show_meminfo(void)
+void hugetlb_show_meminfo_node(int nid)
 {
        struct hstate *h;
-       int nid;
 
        if (!hugepages_supported())
                return;
 
-       for_each_node_state(nid, N_MEMORY)
-               for_each_hstate(h)
-                       pr_info("Node %d hugepages_total=%u hugepages_free=%u hugepages_surp=%u hugepages_size=%lukB\n",
-                               nid,
-                               h->nr_huge_pages_node[nid],
-                               h->free_huge_pages_node[nid],
-                               h->surplus_huge_pages_node[nid],
-                               huge_page_size(h) / SZ_1K);
+       for_each_hstate(h)
+               printk("Node %d hugepages_total=%u hugepages_free=%u hugepages_surp=%u hugepages_size=%lukB\n",
+                       nid,
+                       h->nr_huge_pages_node[nid],
+                       h->free_huge_pages_node[nid],
+                       h->surplus_huge_pages_node[nid],
+                       huge_page_size(h) / SZ_1K);
 }
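
The new nid parameter lets show_mem() print hugetlb counters only for
the nodes it is already reporting on, instead of dumping every node on
every report. A sketch of the intended call site in mm/page_alloc.c
(simplified; surrounding loop assumed from that file):

	for_each_online_node(nid) {
		if (show_mem_node_skip(filter, nid, nodemask))
			continue;

		hugetlb_show_meminfo_node(nid);
	}
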
 
 void hugetlb_report_usage(struct seq_file *m, struct mm_struct *mm)
@@ -4732,6 +4732,7 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
        unsigned long npages = pages_per_huge_page(h);
        struct address_space *mapping = src_vma->vm_file->f_mapping;
        struct mmu_notifier_range range;
+       unsigned long last_addr_mask;
        int ret = 0;
 
        if (cow) {
@@ -4751,11 +4752,14 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
                i_mmap_lock_read(mapping);
        }
 
+       last_addr_mask = hugetlb_mask_last_page(h);
        for (addr = src_vma->vm_start; addr < src_vma->vm_end; addr += sz) {
                spinlock_t *src_ptl, *dst_ptl;
                src_pte = huge_pte_offset(src, addr, sz);
-               if (!src_pte)
+               if (!src_pte) {
+                       addr |= last_addr_mask;
                        continue;
+               }
                dst_pte = huge_pte_alloc(dst, dst_vma, addr, sz);
                if (!dst_pte) {
                        ret = -ENOMEM;
@@ -4772,8 +4776,10 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
                 * after taking the lock below.
                 */
                dst_entry = huge_ptep_get(dst_pte);
-               if ((dst_pte == src_pte) || !huge_pte_none(dst_entry))
+               if ((dst_pte == src_pte) || !huge_pte_none(dst_entry)) {
+                       addr |= last_addr_mask;
                        continue;
+               }
 
                dst_ptl = huge_pte_lock(h, dst, dst_pte);
                src_ptl = huge_pte_lockptr(h, src, src_pte);
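
The `addr |= last_addr_mask` pattern replaces the in-place address
update that huge_pmd_unshare() used to do through its pointer argument
(see the huge_pmd_unshare() hunks below). Worked example for 2 MiB
pages on x86-64: hugetlb_mask_last_page() returns PUD_SIZE - PMD_SIZE
= 0x40000000 - 0x200000 = 0x3fe00000. When huge_pte_offset() returns
NULL at a PUD-aligned addr, `addr |= 0x3fe00000` moves addr to the
last 2 MiB slot of that 1 GiB region, and the loop's `addr += sz` then
crosses into the next PUD: one iteration instead of 512 for an empty
1 GiB stretch. The OR is safe because a NULL lookup means nothing is
mapped between addr and the end of that page-table page.
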
@@ -4808,12 +4814,11 @@ again:
                                entry = swp_entry_to_pte(swp_entry);
                                if (userfaultfd_wp(src_vma) && uffd_wp)
                                        entry = huge_pte_mkuffd_wp(entry);
-                               set_huge_swap_pte_at(src, addr, src_pte,
-                                                    entry, sz);
+                               set_huge_pte_at(src, addr, src_pte, entry);
                        }
                        if (!userfaultfd_wp(dst_vma) && uffd_wp)
                                entry = huge_pte_clear_uffd_wp(entry);
-                       set_huge_swap_pte_at(dst, addr, dst_pte, entry, sz);
+                       set_huge_pte_at(dst, addr, dst_pte, entry);
                } else if (unlikely(is_pte_marker(entry))) {
                        /*
                         * We copy the pte marker only if the dst vma has
@@ -4880,7 +4885,7 @@ again:
                                 * table protection not changing it to point
                                 * to a new page.
                                 *
-                                * See Documentation/vm/mmu_notifier.rst
+                                * See Documentation/mm/mmu_notifier.rst
                                 */
                                huge_ptep_set_wrprotect(src, addr, src_pte);
                                entry = huge_pte_wrprotect(entry);
@@ -4939,7 +4944,7 @@ int move_hugetlb_page_tables(struct vm_area_struct *vma,
        unsigned long sz = huge_page_size(h);
        struct mm_struct *mm = vma->vm_mm;
        unsigned long old_end = old_addr + len;
-       unsigned long old_addr_copy;
+       unsigned long last_addr_mask;
        pte_t *src_pte, *dst_pte;
        struct mmu_notifier_range range;
        bool shared_pmd = false;
@@ -4954,23 +4959,23 @@ int move_hugetlb_page_tables(struct vm_area_struct *vma,
        flush_cache_range(vma, range.start, range.end);
 
        mmu_notifier_invalidate_range_start(&range);
+       last_addr_mask = hugetlb_mask_last_page(h);
        /* Prevent race with file truncation */
        i_mmap_lock_write(mapping);
        for (; old_addr < old_end; old_addr += sz, new_addr += sz) {
                src_pte = huge_pte_offset(mm, old_addr, sz);
-               if (!src_pte)
+               if (!src_pte) {
+                       old_addr |= last_addr_mask;
+                       new_addr |= last_addr_mask;
                        continue;
+               }
                if (huge_pte_none(huge_ptep_get(src_pte)))
                        continue;
 
-               /* old_addr arg to huge_pmd_unshare() is a pointer and so the
-                * arg may be modified. Pass a copy instead to preserve the
-                * value in old_addr.
-                */
-               old_addr_copy = old_addr;
-
-               if (huge_pmd_unshare(mm, vma, &old_addr_copy, src_pte)) {
+               if (huge_pmd_unshare(mm, vma, old_addr, src_pte)) {
                        shared_pmd = true;
+                       old_addr |= last_addr_mask;
+                       new_addr |= last_addr_mask;
                        continue;
                }
 
@@ -5004,6 +5009,7 @@ static void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct
        struct hstate *h = hstate_vma(vma);
        unsigned long sz = huge_page_size(h);
        struct mmu_notifier_range range;
+       unsigned long last_addr_mask;
        bool force_flush = false;
 
        WARN_ON(!is_vm_hugetlb_page(vma));
@@ -5024,17 +5030,21 @@ static void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct
                                end);
        adjust_range_if_pmd_sharing_possible(vma, &range.start, &range.end);
        mmu_notifier_invalidate_range_start(&range);
+       last_addr_mask = hugetlb_mask_last_page(h);
        address = start;
        for (; address < end; address += sz) {
                ptep = huge_pte_offset(mm, address, sz);
-               if (!ptep)
+               if (!ptep) {
+                       address |= last_addr_mask;
                        continue;
+               }
 
                ptl = huge_pte_lock(h, mm, ptep);
-               if (huge_pmd_unshare(mm, vma, &address, ptep)) {
+               if (huge_pmd_unshare(mm, vma, address, ptep)) {
                        spin_unlock(ptl);
                        tlb_flush_pmd_range(tlb, address & PUD_MASK, PUD_SIZE);
                        force_flush = true;
+                       address |= last_addr_mask;
                        continue;
                }
 
@@ -5233,6 +5243,21 @@ static vm_fault_t hugetlb_wp(struct mm_struct *mm, struct vm_area_struct *vma,
        VM_BUG_ON(unshare && (flags & FOLL_WRITE));
        VM_BUG_ON(!unshare && !(flags & FOLL_WRITE));
 
+       /*
+        * hugetlb does not support FOLL_FORCE-style write faults that keep the
+        * PTE mapped R/O such as maybe_mkwrite() would do.
+        */
+       if (WARN_ON_ONCE(!unshare && !(vma->vm_flags & VM_WRITE)))
+               return VM_FAULT_SIGSEGV;
+
+       /* Let's take out MAP_SHARED mappings first. */
+       if (vma->vm_flags & VM_MAYSHARE) {
+               if (unlikely(unshare))
+                       return 0;
+               set_huge_ptep_writable(vma, haddr, ptep);
+               return 0;
+       }
+
        pte = huge_ptep_get(ptep);
        old_page = pte_page(pte);
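
For MAP_SHARED there is never a COW copy to make: a write fault (or an
unshare request, which is meaningless here) only needs to mark the PTE
writable. For reference, the existing helper called above looks like
this in mm/hugetlb.c:

	static void set_huge_ptep_writable(struct vm_area_struct *vma,
					   unsigned long address, pte_t *ptep)
	{
		pte_t entry;

		entry = huge_pte_mkwrite(huge_pte_mkdirty(huge_ptep_get(ptep)));
		if (huge_ptep_set_access_flags(vma, address, ptep, entry, 1))
			update_mmu_cache(vma, address, ptep);
	}
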
 
@@ -5419,19 +5444,25 @@ static bool hugetlbfs_pagecache_present(struct hstate *h,
 int huge_add_to_page_cache(struct page *page, struct address_space *mapping,
                           pgoff_t idx)
 {
+       struct folio *folio = page_folio(page);
        struct inode *inode = mapping->host;
        struct hstate *h = hstate_inode(inode);
-       int err = add_to_page_cache(page, mapping, idx, GFP_KERNEL);
+       int err;
 
-       if (err)
+       __folio_set_locked(folio);
+       err = __filemap_add_folio(mapping, folio, idx, GFP_KERNEL, NULL);
+
+       if (unlikely(err)) {
+               __folio_clear_locked(folio);
                return err;
+       }
        ClearHPageRestoreReserve(page);
 
        /*
-        * set page dirty so that it will not be removed from cache/file
+        * mark folio dirty so that it will not be removed from cache/file
         * by non-hugetlbfs specific code paths.
         */
-       set_page_dirty(page);
+       folio_mark_dirty(folio);
 
        spin_lock(&inode->i_lock);
        inode->i_blocks += blocks_per_huge_page(h);
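
Open-coding the page-cache add lets the error path clear the lock bit
non-atomically: the folio is freshly allocated and not yet visible to
anyone else, so __folio_set_locked()/__folio_clear_locked() suffice and
no wake-up is needed. For comparison, the replaced add_to_page_cache()
wrapper did roughly the following (sketch of the pre-folio API):

	__SetPageLocked(page);
	err = add_to_page_cache_locked(page, mapping, offset, gfp_mask);
	if (unlikely(err))
		__ClearPageLocked(page);
	return err;
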
@@ -5708,7 +5739,7 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
                 */
                entry = huge_ptep_get(ptep);
                if (unlikely(is_hugetlb_entry_migration(entry))) {
-                       migration_entry_wait_huge(vma, mm, ptep);
+                       migration_entry_wait_huge(vma, ptep);
                        return 0;
                } else if (unlikely(is_hugetlb_entry_hwpoisoned(entry)))
                        return VM_FAULT_HWPOISON_LARGE |
@@ -5767,12 +5798,11 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
         * If we are going to COW/unshare the mapping later, we examine the
         * pending reservations for this page now. This will ensure that any
         * allocations necessary to record that reservation occur outside the
-        * spinlock. For private mappings, we also lookup the pagecache
-        * page now as it is used to determine if a reservation has been
-        * consumed.
+        * spinlock. Also lookup the pagecache page now as it is used to
+        * determine if a reservation has been consumed.
         */
        if ((flags & (FAULT_FLAG_WRITE|FAULT_FLAG_UNSHARE)) &&
-           !huge_pte_write(entry)) {
+           !(vma->vm_flags & VM_MAYSHARE) && !huge_pte_write(entry)) {
                if (vma_needs_reservation(h, vma, haddr) < 0) {
                        ret = VM_FAULT_OOM;
                        goto out_mutex;
@@ -5780,9 +5810,7 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
                /* Just decrements count, does not deallocate */
                vma_end_reservation(h, vma, haddr);
 
-               if (!(vma->vm_flags & VM_MAYSHARE))
-                       pagecache_page = hugetlbfs_pagecache_page(h,
-                                                               vma, haddr);
+               pagecache_page = hugetlbfs_pagecache_page(h, vma, haddr);
        }
 
        ptl = huge_pte_lock(h, mm, ptep);
@@ -6015,7 +6043,7 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm,
        if (!huge_pte_none_mostly(huge_ptep_get(dst_pte)))
                goto out_release_unlock;
 
-       if (vm_shared) {
+       if (page_in_pagecache) {
                page_dup_file_rmap(page, true);
        } else {
                ClearHPageRestoreReserve(page);
@@ -6046,8 +6074,6 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm,
 
        set_huge_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte);
 
-       (void)huge_ptep_set_access_flags(dst_vma, dst_addr, dst_pte, _dst_pte,
-                                       dst_vma->vm_flags & VM_WRITE);
        hugetlb_count_add(pages_per_huge_page(h), dst_mm);
 
        /* No need to invalidate - it was non-present before */
@@ -6299,6 +6325,7 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
        unsigned long pages = 0, psize = huge_page_size(h);
        bool shared_pmd = false;
        struct mmu_notifier_range range;
+       unsigned long last_addr_mask;
        bool uffd_wp = cp_flags & MM_CP_UFFD_WP;
        bool uffd_wp_resolve = cp_flags & MM_CP_UFFD_WP_RESOLVE;
 
@@ -6315,14 +6342,17 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
        flush_cache_range(vma, range.start, range.end);
 
        mmu_notifier_invalidate_range_start(&range);
+       last_addr_mask = hugetlb_mask_last_page(h);
        i_mmap_lock_write(vma->vm_file->f_mapping);
        for (; address < end; address += psize) {
                spinlock_t *ptl;
                ptep = huge_pte_offset(mm, address, psize);
-               if (!ptep)
+               if (!ptep) {
+                       address |= last_addr_mask;
                        continue;
+               }
                ptl = huge_pte_lock(h, mm, ptep);
-               if (huge_pmd_unshare(mm, vma, &address, ptep)) {
+               if (huge_pmd_unshare(mm, vma, address, ptep)) {
                        /*
                         * When uffd-wp is enabled on the vma, unshare
                         * shouldn't happen at all.  Warn about it if it
@@ -6332,6 +6362,7 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
                        pages++;
                        spin_unlock(ptl);
                        shared_pmd = true;
+                       address |= last_addr_mask;
                        continue;
                }
                pte = huge_ptep_get(ptep);
@@ -6357,8 +6388,7 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
                                        newpte = pte_swp_mkuffd_wp(newpte);
                                else if (uffd_wp_resolve)
                                        newpte = pte_swp_clear_uffd_wp(newpte);
-                               set_huge_swap_pte_at(mm, address, ptep,
-                                                    newpte, psize);
+                               set_huge_pte_at(mm, address, ptep, newpte);
                                pages++;
                        }
                        spin_unlock(ptl);
@@ -6409,7 +6439,7 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
         * No need to call mmu_notifier_invalidate_range() we are downgrading
         * page table protection not changing it to point to a new page.
         *
-        * See Documentation/vm/mmu_notifier.rst
+        * See Documentation/mm/mmu_notifier.rst
         */
        i_mmap_unlock_write(vma->vm_file->f_mapping);
        mmu_notifier_invalidate_range_end(&range);
@@ -6755,11 +6785,11 @@ out:
  *         0 the underlying pte page is not shared, or it is the last user
  */
 int huge_pmd_unshare(struct mm_struct *mm, struct vm_area_struct *vma,
-                                       unsigned long *addr, pte_t *ptep)
+                                       unsigned long addr, pte_t *ptep)
 {
-       pgd_t *pgd = pgd_offset(mm, *addr);
-       p4d_t *p4d = p4d_offset(pgd, *addr);
-       pud_t *pud = pud_offset(p4d, *addr);
+       pgd_t *pgd = pgd_offset(mm, addr);
+       p4d_t *p4d = p4d_offset(pgd, addr);
+       pud_t *pud = pud_offset(p4d, addr);
 
        i_mmap_assert_write_locked(vma->vm_file->f_mapping);
        BUG_ON(page_count(virt_to_page(ptep)) == 0);
@@ -6769,14 +6799,6 @@ int huge_pmd_unshare(struct mm_struct *mm, struct vm_area_struct *vma,
        pud_clear(pud);
        put_page(virt_to_page(ptep));
        mm_dec_nr_pmds(mm);
-       /*
-        * This update of passed address optimizes loops sequentially
-        * processing addresses in increments of huge page size (PMD_SIZE
-        * in this case).  By clearing the pud, a PUD_SIZE area is unmapped.
-        * Update address to the 'last page' in the cleared area so that
-        * calling loop can move to first page past this area.
-        */
-       *addr |= PUD_SIZE - PMD_SIZE;
        return 1;
 }
 
@@ -6788,7 +6810,7 @@ pte_t *huge_pmd_share(struct mm_struct *mm, struct vm_area_struct *vma,
 }
 
 int huge_pmd_unshare(struct mm_struct *mm, struct vm_area_struct *vma,
-                               unsigned long *addr, pte_t *ptep)
+                               unsigned long addr, pte_t *ptep)
 {
        return 0;
 }
@@ -6871,6 +6893,37 @@ pte_t *huge_pte_offset(struct mm_struct *mm,
        return (pte_t *)pmd;
 }
 
+/*
+ * Return a mask that can be used to update an address to the last huge
+ * page in a page table page mapping size.  Used to skip non-present
+ * page table entries when linearly scanning address ranges.  Architectures
+ * with unique huge page to page table relationships can define their own
+ * version of this routine.
+ */
+unsigned long hugetlb_mask_last_page(struct hstate *h)
+{
+       unsigned long hp_size = huge_page_size(h);
+
+       if (hp_size == PUD_SIZE)
+               return P4D_SIZE - PUD_SIZE;
+       else if (hp_size == PMD_SIZE)
+               return PUD_SIZE - PMD_SIZE;
+       else
+               return 0UL;
+}
+
+#else
+
+/* See description above.  Architectures can provide their own version. */
+__weak unsigned long hugetlb_mask_last_page(struct hstate *h)
+{
+#ifdef CONFIG_ARCH_WANT_HUGE_PMD_SHARE
+       if (huge_page_size(h) == PMD_SIZE)
+               return PUD_SIZE - PMD_SIZE;
+#endif
+       return 0UL;
+}
+
 #endif /* CONFIG_ARCH_WANT_GENERAL_HUGETLB */
 
 /*
@@ -6893,12 +6946,13 @@ follow_huge_pd(struct vm_area_struct *vma,
 }
 
 struct page * __weak
-follow_huge_pmd(struct mm_struct *mm, unsigned long address,
-               pmd_t *pmd, int flags)
+follow_huge_pmd_pte(struct vm_area_struct *vma, unsigned long address, int flags)
 {
+       struct hstate *h = hstate_vma(vma);
+       struct mm_struct *mm = vma->vm_mm;
        struct page *page = NULL;
        spinlock_t *ptl;
-       pte_t pte;
+       pte_t *ptep, pte;
 
        /*
         * FOLL_PIN is not supported for follow_page(). Ordinary GUP goes via
@@ -6908,17 +6962,15 @@ follow_huge_pmd(struct mm_struct *mm, unsigned long address,
                return NULL;
 
 retry:
-       ptl = pmd_lockptr(mm, pmd);
-       spin_lock(ptl);
-       /*
-        * make sure that the address range covered by this pmd is not
-        * unmapped from other threads.
-        */
-       if (!pmd_huge(*pmd))
-               goto out;
-       pte = huge_ptep_get((pte_t *)pmd);
+       ptep = huge_pte_offset(mm, address, huge_page_size(h));
+       if (!ptep)
+               return NULL;
+
+       ptl = huge_pte_lock(h, mm, ptep);
+       pte = huge_ptep_get(ptep);
        if (pte_present(pte)) {
-               page = pmd_page(*pmd) + ((address & ~PMD_MASK) >> PAGE_SHIFT);
+               page = pte_page(pte) +
+                       ((address & ~huge_page_mask(h)) >> PAGE_SHIFT);
                /*
                 * try_grab_page() should always succeed here, because: a) we
                 * hold the pmd (ptl) lock, and b) we've just checked that the
@@ -6934,7 +6986,7 @@ retry:
        } else {
                if (is_hugetlb_entry_migration(pte)) {
                        spin_unlock(ptl);
-                       __migration_entry_wait(mm, (pte_t *)pmd, ptl);
+                       __migration_entry_wait_huge(ptep, ptl);
                        goto retry;
                }
                /*
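
This is the core of the race fix named in the subject: the old
follow_huge_pmd() always took pmd_lockptr(), but a CONT-PTE size
hugetlb page (e.g. 64 KiB on arm64 with 4 KiB base pages) is mapped by
a set of PTEs, not a PMD, so the lookup could race with concurrent
unmap or migration while holding the wrong lock. Going through
huge_pte_offset() + huge_pte_lock() picks the lock that actually
covers the entry for this hstate. For reference, from
include/linux/hugetlb.h:

	static inline spinlock_t *huge_pte_lockptr(struct hstate *h,
						   struct mm_struct *mm, pte_t *pte)
	{
		if (huge_page_size(h) == PMD_SIZE)
			return pmd_lockptr(mm, (pmd_t *) pte);
		VM_BUG_ON(huge_page_size(h) == PAGE_SIZE);
		return &mm->page_table_lock;
	}
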
@@ -6951,10 +7003,38 @@ struct page * __weak
 follow_huge_pud(struct mm_struct *mm, unsigned long address,
                pud_t *pud, int flags)
 {
-       if (flags & (FOLL_GET | FOLL_PIN))
+       struct page *page = NULL;
+       spinlock_t *ptl;
+       pte_t pte;
+
+       if (WARN_ON_ONCE(flags & FOLL_PIN))
                return NULL;
 
-       return pte_page(*(pte_t *)pud) + ((address & ~PUD_MASK) >> PAGE_SHIFT);
+retry:
+       ptl = huge_pte_lock(hstate_sizelog(PUD_SHIFT), mm, (pte_t *)pud);
+       if (!pud_huge(*pud))
+               goto out;
+       pte = huge_ptep_get((pte_t *)pud);
+       if (pte_present(pte)) {
+               page = pud_page(*pud) + ((address & ~PUD_MASK) >> PAGE_SHIFT);
+               if (WARN_ON_ONCE(!try_grab_page(page, flags))) {
+                       page = NULL;
+                       goto out;
+               }
+       } else {
+               if (is_hugetlb_entry_migration(pte)) {
+                       spin_unlock(ptl);
+                       __migration_entry_wait(mm, (pte_t *)pud, ptl);
+                       goto retry;
+               }
+               /*
+                * hwpoisoned entry is treated as no_page_table in
+                * follow_page_mask().
+                */
+       }
+out:
+       spin_unlock(ptl);
+       return page;
 }
 
 struct page * __weak
@@ -6966,15 +7046,15 @@ follow_huge_pgd(struct mm_struct *mm, unsigned long address, pgd_t *pgd, int fla
        return pte_page(*(pte_t *)pgd) + ((address & ~PGDIR_MASK) >> PAGE_SHIFT);
 }
 
-bool isolate_huge_page(struct page *page, struct list_head *list)
+int isolate_hugetlb(struct page *page, struct list_head *list)
 {
-       bool ret = true;
+       int ret = 0;
 
        spin_lock_irq(&hugetlb_lock);
        if (!PageHeadHuge(page) ||
            !HPageMigratable(page) ||
            !get_page_unless_zero(page)) {
-               ret = false;
+               ret = -EBUSY;
                goto unlock;
        }
        ClearHPageMigratable(page);
@@ -7094,21 +7174,18 @@ void hugetlb_unshare_all_pmds(struct vm_area_struct *vma)
        mmu_notifier_invalidate_range_start(&range);
        i_mmap_lock_write(vma->vm_file->f_mapping);
        for (address = start; address < end; address += PUD_SIZE) {
-               unsigned long tmp = address;
-
                ptep = huge_pte_offset(mm, address, sz);
                if (!ptep)
                        continue;
                ptl = huge_pte_lock(h, mm, ptep);
-               /* We don't want 'address' to be changed */
-               huge_pmd_unshare(mm, vma, &tmp, ptep);
+               huge_pmd_unshare(mm, vma, address, ptep);
                spin_unlock(ptl);
        }
        flush_hugetlb_tlb_range(vma, start, end);
        i_mmap_unlock_write(vma->vm_file->f_mapping);
        /*
         * No need to call mmu_notifier_invalidate_range(), see
-        * Documentation/vm/mmu_notifier.rst.
+        * Documentation/mm/mmu_notifier.rst.
         */
        mmu_notifier_invalidate_range_end(&range);
 }