mmu_notifiers: don't invalidate secondary TLBs as part of mmu_notifier_invalidate_range_end()
[linux-block.git] / mm / hugetlb.c
index bce28cca73a1054c07f52587abb17ce144280d84..4672752b0b17f4e17bcfd40a4af4f104054329b5 100644 (file)
@@ -34,6 +34,7 @@
 #include <linux/nospec.h>
 #include <linux/delayacct.h>
 #include <linux/memory.h>
+#include <linux/mm_inline.h>
 
 #include <asm/page.h>
 #include <asm/pgalloc.h>
@@ -1784,10 +1785,10 @@ static void free_hpage_workfn(struct work_struct *work)
                node = node->next;
                page->mapping = NULL;
                /*
-                * The VM_BUG_ON_PAGE(!PageHuge(page), page) in page_hstate()
-                * is going to trigger because a previous call to
+                * The VM_BUG_ON_FOLIO(!folio_test_hugetlb(folio), folio) in
+                * folio_hstate() is going to trigger because a previous call to
                 * remove_hugetlb_folio() will call folio_set_compound_dtor
-                * (folio, NULL_COMPOUND_DTOR), so do not use page_hstate()
+                * (folio, NULL_COMPOUND_DTOR), so do not use folio_hstate()
                 * directly.
                 */
                h = size_to_hstate(page_size(page));
@@ -5101,15 +5102,12 @@ again:
                                entry = huge_pte_clear_uffd_wp(entry);
                        set_huge_pte_at(dst, addr, dst_pte, entry);
                } else if (unlikely(is_pte_marker(entry))) {
-                       /* No swap on hugetlb */
-                       WARN_ON_ONCE(
-                           is_swapin_error_entry(pte_to_swp_entry(entry)));
-                       /*
-                        * We copy the pte marker only if the dst vma has
-                        * uffd-wp enabled.
-                        */
-                       if (userfaultfd_wp(dst_vma))
-                               set_huge_pte_at(dst, addr, dst_pte, entry);
+                       pte_marker marker = copy_pte_marker(
+                               pte_to_swp_entry(entry), dst_vma);
+
+                       if (marker)
+                               set_huge_pte_at(dst, addr, dst_pte,
+                                               make_pte_marker(marker));
                } else {
                        entry = huge_ptep_get(src_pte);
                        pte_folio = page_folio(pte_page(entry));
@@ -5690,7 +5688,6 @@ retry_avoidcopy:
 
                /* Break COW or unshare */
                huge_ptep_clear_flush(vma, haddr, ptep);
-               mmu_notifier_invalidate_range(mm, range.start, range.end);
                page_remove_rmap(&old_folio->page, vma, true);
                hugepage_add_new_anon_rmap(new_folio, vma, haddr);
                if (huge_pte_uffd_wp(pte))
@@ -5721,7 +5718,6 @@ out_release_old:
 
 /*
  * Return whether there is a pagecache page to back given address within VMA.
- * Caller follow_hugetlb_page() holds page_table_lock so we cannot lock_page.
  */
 static bool hugetlbfs_pagecache_present(struct hstate *h,
                        struct vm_area_struct *vma, unsigned long address)
@@ -6090,14 +6086,26 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
        }
 
        entry = huge_ptep_get(ptep);
-       /* PTE markers should be handled the same way as none pte */
-       if (huge_pte_none_mostly(entry))
+       if (huge_pte_none_mostly(entry)) {
+               if (is_pte_marker(entry)) {
+                       pte_marker marker =
+                               pte_marker_get(pte_to_swp_entry(entry));
+
+                       if (marker & PTE_MARKER_POISONED) {
+                               ret = VM_FAULT_HWPOISON_LARGE;
+                               goto out_mutex;
+                       }
+               }
+
                /*
+                * Other PTE markers should be handled the same way as none PTE.
+                *
                 * hugetlb_no_page will drop vma lock and hugetlb fault
                 * mutex internally, which make us return immediately.
                 */
                return hugetlb_no_page(mm, vma, mapping, idx, address, ptep,
                                      entry, flags);
+       }
 
        ret = 0;
 
@@ -6253,6 +6261,25 @@ int hugetlb_mfill_atomic_pte(pte_t *dst_pte,
        int writable;
        bool folio_in_pagecache = false;
 
+       if (uffd_flags_mode_is(flags, MFILL_ATOMIC_POISON)) {
+               ptl = huge_pte_lock(h, dst_mm, dst_pte);
+
+               /* Don't overwrite any existing PTEs (even markers) */
+               if (!huge_pte_none(huge_ptep_get(dst_pte))) {
+                       spin_unlock(ptl);
+                       return -EEXIST;
+               }
+
+               _dst_pte = make_pte_marker(PTE_MARKER_POISONED);
+               set_huge_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte);
+
+               /* No need to invalidate - it was non-present before */
+               update_mmu_cache(dst_vma, dst_addr, dst_pte);
+
+               spin_unlock(ptl);
+               return 0;
+       }
+
        if (is_continue) {
                ret = -EFAULT;
                folio = filemap_lock_folio(mapping, idx);
@@ -6422,39 +6449,9 @@ out_release_nounlock:
 }
 #endif /* CONFIG_USERFAULTFD */
 
-static void record_subpages(struct page *page, struct vm_area_struct *vma,
-                           int refs, struct page **pages)
-{
-       int nr;
-
-       for (nr = 0; nr < refs; nr++) {
-               if (likely(pages))
-                       pages[nr] = nth_page(page, nr);
-       }
-}
-
-static inline bool __follow_hugetlb_must_fault(struct vm_area_struct *vma,
-                                              unsigned int flags, pte_t *pte,
-                                              bool *unshare)
-{
-       pte_t pteval = huge_ptep_get(pte);
-
-       *unshare = false;
-       if (is_swap_pte(pteval))
-               return true;
-       if (huge_pte_write(pteval))
-               return false;
-       if (flags & FOLL_WRITE)
-               return true;
-       if (gup_must_unshare(vma, flags, pte_page(pteval))) {
-               *unshare = true;
-               return true;
-       }
-       return false;
-}
-
 struct page *hugetlb_follow_page_mask(struct vm_area_struct *vma,
-                               unsigned long address, unsigned int flags)
+                                     unsigned long address, unsigned int flags,
+                                     unsigned int *page_mask)
 {
        struct hstate *h = hstate_vma(vma);
        struct mm_struct *mm = vma->vm_mm;
@@ -6462,13 +6459,7 @@ struct page *hugetlb_follow_page_mask(struct vm_area_struct *vma,
        struct page *page = NULL;
        spinlock_t *ptl;
        pte_t *pte, entry;
-
-       /*
-        * FOLL_PIN is not supported for follow_page(). Ordinary GUP goes via
-        * follow_hugetlb_page().
-        */
-       if (WARN_ON_ONCE(flags & FOLL_PIN))
-               return NULL;
+       int ret;
 
        hugetlb_vma_lock_read(vma);
        pte = hugetlb_walk(vma, haddr, huge_page_size(h));
@@ -6478,8 +6469,23 @@ struct page *hugetlb_follow_page_mask(struct vm_area_struct *vma,
        ptl = huge_pte_lock(h, mm, pte);
        entry = huge_ptep_get(pte);
        if (pte_present(entry)) {
-               page = pte_page(entry) +
-                               ((address & ~huge_page_mask(h)) >> PAGE_SHIFT);
+               page = pte_page(entry);
+
+               if (!huge_pte_write(entry)) {
+                       if (flags & FOLL_WRITE) {
+                               page = NULL;
+                               goto out;
+                       }
+
+                       if (gup_must_unshare(vma, flags, page)) {
+                               /* Tell the caller to do unsharing */
+                               page = ERR_PTR(-EMLINK);
+                               goto out;
+                       }
+               }
+
+               page += ((address & ~huge_page_mask(h)) >> PAGE_SHIFT);
+
                /*
                 * Note that page may be a sub-page, and with vmemmap
                 * optimizations the page struct may be read only.
@@ -6489,208 +6495,29 @@ struct page *hugetlb_follow_page_mask(struct vm_area_struct *vma,
                 * try_grab_page() should always be able to get the page here,
                 * because we hold the ptl lock and have verified pte_present().
                 */
-               if (try_grab_page(page, flags)) {
-                       page = NULL;
+               ret = try_grab_page(page, flags);
+
+               if (WARN_ON_ONCE(ret)) {
+                       page = ERR_PTR(ret);
                        goto out;
                }
+
+               *page_mask = (1U << huge_page_order(h)) - 1;
        }
 out:
        spin_unlock(ptl);
 out_unlock:
        hugetlb_vma_unlock_read(vma);
-       return page;
-}
-
-long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
-                        struct page **pages, unsigned long *position,
-                        unsigned long *nr_pages, long i, unsigned int flags,
-                        int *locked)
-{
-       unsigned long pfn_offset;
-       unsigned long vaddr = *position;
-       unsigned long remainder = *nr_pages;
-       struct hstate *h = hstate_vma(vma);
-       int err = -EFAULT, refs;
-
-       while (vaddr < vma->vm_end && remainder) {
-               pte_t *pte;
-               spinlock_t *ptl = NULL;
-               bool unshare = false;
-               int absent;
-               struct page *page;
-
-               /*
-                * If we have a pending SIGKILL, don't keep faulting pages and
-                * potentially allocating memory.
-                */
-               if (fatal_signal_pending(current)) {
-                       remainder = 0;
-                       break;
-               }
-
-               hugetlb_vma_lock_read(vma);
-               /*
-                * Some archs (sparc64, sh*) have multiple pte_ts to
-                * each hugepage.  We have to make sure we get the
-                * first, for the page indexing below to work.
-                *
-                * Note that page table lock is not held when pte is null.
-                */
-               pte = hugetlb_walk(vma, vaddr & huge_page_mask(h),
-                                  huge_page_size(h));
-               if (pte)
-                       ptl = huge_pte_lock(h, mm, pte);
-               absent = !pte || huge_pte_none(huge_ptep_get(pte));
-
-               /*
-                * When coredumping, it suits get_dump_page if we just return
-                * an error where there's an empty slot with no huge pagecache
-                * to back it.  This way, we avoid allocating a hugepage, and
-                * the sparse dumpfile avoids allocating disk blocks, but its
-                * huge holes still show up with zeroes where they need to be.
-                */
-               if (absent && (flags & FOLL_DUMP) &&
-                   !hugetlbfs_pagecache_present(h, vma, vaddr)) {
-                       if (pte)
-                               spin_unlock(ptl);
-                       hugetlb_vma_unlock_read(vma);
-                       remainder = 0;
-                       break;
-               }
-
-               /*
-                * We need call hugetlb_fault for both hugepages under migration
-                * (in which case hugetlb_fault waits for the migration,) and
-                * hwpoisoned hugepages (in which case we need to prevent the
-                * caller from accessing to them.) In order to do this, we use
-                * here is_swap_pte instead of is_hugetlb_entry_migration and
-                * is_hugetlb_entry_hwpoisoned. This is because it simply covers
-                * both cases, and because we can't follow correct pages
-                * directly from any kind of swap entries.
-                */
-               if (absent ||
-                   __follow_hugetlb_must_fault(vma, flags, pte, &unshare)) {
-                       vm_fault_t ret;
-                       unsigned int fault_flags = 0;
-
-                       if (pte)
-                               spin_unlock(ptl);
-                       hugetlb_vma_unlock_read(vma);
 
-                       if (flags & FOLL_WRITE)
-                               fault_flags |= FAULT_FLAG_WRITE;
-                       else if (unshare)
-                               fault_flags |= FAULT_FLAG_UNSHARE;
-                       if (locked) {
-                               fault_flags |= FAULT_FLAG_ALLOW_RETRY |
-                                       FAULT_FLAG_KILLABLE;
-                               if (flags & FOLL_INTERRUPTIBLE)
-                                       fault_flags |= FAULT_FLAG_INTERRUPTIBLE;
-                       }
-                       if (flags & FOLL_NOWAIT)
-                               fault_flags |= FAULT_FLAG_ALLOW_RETRY |
-                                       FAULT_FLAG_RETRY_NOWAIT;
-                       if (flags & FOLL_TRIED) {
-                               /*
-                                * Note: FAULT_FLAG_ALLOW_RETRY and
-                                * FAULT_FLAG_TRIED can co-exist
-                                */
-                               fault_flags |= FAULT_FLAG_TRIED;
-                       }
-                       ret = hugetlb_fault(mm, vma, vaddr, fault_flags);
-                       if (ret & VM_FAULT_ERROR) {
-                               err = vm_fault_to_errno(ret, flags);
-                               remainder = 0;
-                               break;
-                       }
-                       if (ret & VM_FAULT_RETRY) {
-                               if (locked &&
-                                   !(fault_flags & FAULT_FLAG_RETRY_NOWAIT))
-                                       *locked = 0;
-                               *nr_pages = 0;
-                               /*
-                                * VM_FAULT_RETRY must not return an
-                                * error, it will return zero
-                                * instead.
-                                *
-                                * No need to update "position" as the
-                                * caller will not check it after
-                                * *nr_pages is set to 0.
-                                */
-                               return i;
-                       }
-                       continue;
-               }
-
-               pfn_offset = (vaddr & ~huge_page_mask(h)) >> PAGE_SHIFT;
-               page = pte_page(huge_ptep_get(pte));
-
-               VM_BUG_ON_PAGE((flags & FOLL_PIN) && PageAnon(page) &&
-                              !PageAnonExclusive(page), page);
-
-               /*
-                * If subpage information not requested, update counters
-                * and skip the same_page loop below.
-                */
-               if (!pages && !pfn_offset &&
-                   (vaddr + huge_page_size(h) < vma->vm_end) &&
-                   (remainder >= pages_per_huge_page(h))) {
-                       vaddr += huge_page_size(h);
-                       remainder -= pages_per_huge_page(h);
-                       i += pages_per_huge_page(h);
-                       spin_unlock(ptl);
-                       hugetlb_vma_unlock_read(vma);
-                       continue;
-               }
-
-               /* vaddr may not be aligned to PAGE_SIZE */
-               refs = min3(pages_per_huge_page(h) - pfn_offset, remainder,
-                   (vma->vm_end - ALIGN_DOWN(vaddr, PAGE_SIZE)) >> PAGE_SHIFT);
-
-               if (pages)
-                       record_subpages(nth_page(page, pfn_offset),
-                                       vma, refs,
-                                       likely(pages) ? pages + i : NULL);
-
-               if (pages) {
-                       /*
-                        * try_grab_folio() should always succeed here,
-                        * because: a) we hold the ptl lock, and b) we've just
-                        * checked that the huge page is present in the page
-                        * tables. If the huge page is present, then the tail
-                        * pages must also be present. The ptl prevents the
-                        * head page and tail pages from being rearranged in
-                        * any way. As this is hugetlb, the pages will never
-                        * be p2pdma or not longterm pinable. So this page
-                        * must be available at this point, unless the page
-                        * refcount overflowed:
-                        */
-                       if (WARN_ON_ONCE(!try_grab_folio(pages[i], refs,
-                                                        flags))) {
-                               spin_unlock(ptl);
-                               hugetlb_vma_unlock_read(vma);
-                               remainder = 0;
-                               err = -ENOMEM;
-                               break;
-                       }
-               }
-
-               vaddr += (refs << PAGE_SHIFT);
-               remainder -= refs;
-               i += refs;
-
-               spin_unlock(ptl);
-               hugetlb_vma_unlock_read(vma);
-       }
-       *nr_pages = remainder;
        /*
-        * setting position is actually required only if remainder is
-        * not zero but it's faster not to add a "if (remainder)"
-        * branch.
+        * Fixup retval for dump requests: if pagecache doesn't exist,
+        * don't try to allocate a new page but just skip it.
         */
-       *position = vaddr;
+       if (!page && (flags & FOLL_DUMP) &&
+           !hugetlbfs_pagecache_present(h, vma, address))
+               page = ERR_PTR(-EFAULT);
 
-       return i ? i : err;
+       return page;
 }
 
 long hugetlb_change_protection(struct vm_area_struct *vma,
@@ -7246,7 +7073,12 @@ pte_t *huge_pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma,
                                pte = (pte_t *)pmd_alloc(mm, pud, addr);
                }
        }
-       BUG_ON(pte && pte_present(ptep_get(pte)) && !pte_huge(ptep_get(pte)));
+
+       if (pte) {
+               pte_t pteval = ptep_get_lockless(pte);
+
+               BUG_ON(pte_present(pteval) && !pte_huge(pteval));
+       }
 
        return pte;
 }