Merge tag 'perf-urgent-2024-05-18' of git://git.kernel.org/pub/scm/linux/kernel/git...

[linux-block.git] / mm / memory.c
diff --git a/mm/memory.c b/mm/memory.c

index d2155ced45f8f84ef8eac74ba3eda42a67d37102..b5453b86ec4b7ecce51141284f1f94ecc321b0b2 100644 (file)
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -112,8 +112,10 @@ static bool vmf_pte_changed(struct vm_fault *vmf);
   * Return true if the original pte was a uffd-wp pte marker (so the pte was
   * wr-protected).
   */
-static bool vmf_orig_pte_uffd_wp(struct vm_fault *vmf)
+static __always_inline bool vmf_orig_pte_uffd_wp(struct vm_fault *vmf)
  {
+       if (!userfaultfd_wp(vmf->vma))
+               return false;
         if (!(vmf->flags & FAULT_FLAG_ORIG_PTE_VALID))
                 return false;
  
@@ -989,7 +991,7 @@ copy_present_ptes(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma
                         flags |= FPB_IGNORE_SOFT_DIRTY;
  
                 nr = folio_pte_batch(folio, addr, src_pte, pte, max_nr, flags,
-                                    &any_writable);
+                                    &any_writable, NULL, NULL);
                 folio_ref_add(folio, nr);
                 if (folio_test_anon(folio)) {
                         if (unlikely(folio_try_dup_anon_rmap_ptes(folio, page,
@@ -1502,10 +1504,15 @@ static __always_inline void zap_present_folio_ptes(struct mmu_gather *tlb,
         if (!delay_rmap) {
                 folio_remove_rmap_ptes(folio, page, nr, vma);
  
-               /* Only sanity-check the first page in a batch. */
-               if (unlikely(page_mapcount(page) < 0))
+               if (unlikely(folio_mapcount(folio) < 0))
                         print_bad_pte(vma, addr, ptent, page);
         }
+
+       if (want_init_mlocked_on_free() && folio_test_mlocked(folio) &&
+           !delay_rmap && folio_test_anon(folio)) {
+               kernel_init_pages(page, folio_nr_pages(folio));
+       }
+
         if (unlikely(__tlb_remove_folio_pages(tlb, page, nr, delay_rmap))) {
                 *force_flush = true;
                 *force_break = true;
@@ -1553,7 +1560,7 @@ static inline int zap_present_ptes(struct mmu_gather *tlb,
          */
         if (unlikely(folio_test_large(folio) && max_nr != 1)) {
                 nr = folio_pte_batch(folio, addr, pte, ptent, max_nr, fpb_flags,
-                                    NULL);
+                                    NULL, NULL, NULL);
  
                 zap_present_folio_ptes(tlb, vma, folio, page, pte, ptent, nr,
                                        addr, details, rss, force_flush,
@@ -1631,12 +1638,13 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
                                 folio_remove_rmap_pte(folio, page, vma);
                         folio_put(folio);
                 } else if (!non_swap_entry(entry)) {
-                       /* Genuine swap entry, hence a private anon page */
+                       max_nr = (end - addr) / PAGE_SIZE;
+                       nr = swap_pte_batch(pte, max_nr, ptent);
+                       /* Genuine swap entries, hence private anon pages */
                         if (!should_zap_cows(details))
                                 continue;
-                       rss[MM_SWAPENTS]--;
-                       if (unlikely(!free_swap_and_cache(entry)))
-                               print_bad_pte(vma, addr, ptent, NULL);
+                       rss[MM_SWAPENTS] -= nr;
+                       free_swap_and_cache_nr(entry, nr);
                 } else if (is_migration_entry(entry)) {
                         folio = pfn_swap_entry_folio(entry);
                         if (!should_zap_folio(details, folio))
@@ -1659,8 +1667,8 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
                         pr_alert("unrecognized swap entry 0x%lx\n", entry.val);
                         WARN_ON_ONCE(1);
                 }
-               pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
-               zap_install_uffd_wp_if_needed(vma, addr, pte, 1, details, ptent);
+               clear_not_present_full_ptes(mm, addr, pte, nr, tlb->fullmm);
+               zap_install_uffd_wp_if_needed(vma, addr, pte, nr, details, ptent);
         } while (pte += nr, addr += PAGE_SIZE * nr, addr != end);
  
         add_mm_rss_vec(mm, rss);
@@ -2765,7 +2773,7 @@ static int apply_to_pmd_range(struct mm_struct *mm, pud_t *pud,
         unsigned long next;
         int err = 0;
  
-       BUG_ON(pud_huge(*pud));
+       BUG_ON(pud_leaf(*pud));
  
         if (create) {
                 pmd = pmd_alloc_track(mm, pud, addr, mask);
@@ -3206,19 +3214,39 @@ static inline vm_fault_t vmf_can_call_fault(const struct vm_fault *vmf)
         return VM_FAULT_RETRY;
  }
  
+/**
+ * vmf_anon_prepare - Prepare to handle an anonymous fault.
+ * @vmf: The vm_fault descriptor passed from the fault handler.
+ *
+ * When preparing to insert an anonymous page into a VMA from a
+ * fault handler, call this function rather than anon_vma_prepare().
+ * If this vma does not already have an associated anon_vma and we are
+ * only protected by the per-VMA lock, the caller must retry with the
+ * mmap_lock held.  __anon_vma_prepare() will look at adjacent VMAs to
+ * determine if this VMA can share its anon_vma, and that's not safe to
+ * do with only the per-VMA lock held for this VMA.
+ *
+ * Return: 0 if fault handling can proceed.  Any other value should be
+ * returned to the caller.
+ */
  vm_fault_t vmf_anon_prepare(struct vm_fault *vmf)
  {
         struct vm_area_struct *vma = vmf->vma;
+       vm_fault_t ret = 0;
  
+       /* Nothing to do when the VMA already has an anon_vma attached. */
         if (likely(vma->anon_vma))
                 return 0;
         if (vmf->flags & FAULT_FLAG_VMA_LOCK) {
-               vma_end_read(vma);
-               return VM_FAULT_RETRY;
+               /*
+                * Only the per-VMA lock is held. Try to take the mmap_lock
+                * for reading without blocking; if that fails, release the
+                * per-VMA lock and have the caller retry the fault.
+                */
+               if (!mmap_read_trylock(vma->vm_mm)) {
+                       vma_end_read(vma);
+                       return VM_FAULT_RETRY;
+               }
         }
         if (__anon_vma_prepare(vma))
-               return VM_FAULT_OOM;
-       return 0;
+               ret = VM_FAULT_OOM;
+       /* Release the mmap_lock taken by the trylock above. */
+       if (vmf->flags & FAULT_FLAG_VMA_LOCK)
+               mmap_read_unlock(vma->vm_mm);
+       return ret;
  }
  
  /*
@@ -3329,13 +3357,8 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf)
                 ptep_clear_flush(vma, vmf->address, vmf->pte);
                 folio_add_new_anon_rmap(new_folio, vma, vmf->address);
                 folio_add_lru_vma(new_folio, vma);
-               /*
-                * We call the notify macro here because, when using secondary
-                * mmu page tables (such as kvm shadow page tables), we want the
-                * new page to be mapped directly into the secondary page table.
-                */
                 BUG_ON(unshare && pte_write(entry));
-               set_pte_at_notify(mm, vmf->address, vmf->pte, entry);
+               set_pte_at(mm, vmf->address, vmf->pte, entry);
                 update_mmu_cache_range(vmf, vma, vmf->address, vmf->pte, 1);
                 if (old_folio) {
                         /*
@@ -4190,7 +4213,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
          * when reading from swap. This metadata may be indexed by swap entry
          * so this must be called before swap_free().
          */
-       arch_swap_restore(entry, folio);
+       arch_swap_restore(folio_swap(entry, folio), folio);
  
         /*
          * Remove the swap entry and conditionally try to free up the swapcache.
@@ -4326,8 +4349,8 @@ static struct folio *alloc_anon_folio(struct vm_fault *vmf)
          * for this vma. Then filter out the orders that can't be allocated over
          * the faulting address and still be fully contained in the vma.
          */
-       orders = thp_vma_allowable_orders(vma, vma->vm_flags, false, true, true,
-                                         BIT(PMD_ORDER) - 1);
+       orders = thp_vma_allowable_orders(vma, vma->vm_flags,
+                       TVA_IN_PF | TVA_ENFORCE_SYSFS, BIT(PMD_ORDER) - 1);
         orders = thp_vma_suitable_orders(vma, vmf->address, orders);
  
         if (!orders)
@@ -4352,6 +4375,9 @@ static struct folio *alloc_anon_folio(struct vm_fault *vmf)
  
         pte_unmap(pte);
  
+       if (!orders)
+               goto fallback;
+
         /* Try allocating the highest of the remaining orders. */
         gfp = vma_thp_gfp_mask(vma);
         while (orders) {
@@ -4359,6 +4385,7 @@ static struct folio *alloc_anon_folio(struct vm_fault *vmf)
                 folio = vma_alloc_folio(gfp, order, vma, addr, true);
                 if (folio) {
                         if (mem_cgroup_charge(folio, vma->vm_mm, gfp)) {
+                               count_mthp_stat(order, MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE);
                                 folio_put(folio);
                                 goto next;
                         }
@@ -4367,6 +4394,7 @@ static struct folio *alloc_anon_folio(struct vm_fault *vmf)
                         return folio;
                 }
  next:
+               count_mthp_stat(order, MTHP_STAT_ANON_FAULT_FALLBACK);
                 order = next_order(&orders, order);
         }
  
@@ -4382,7 +4410,6 @@ fallback:
   */
  static vm_fault_t do_anonymous_page(struct vm_fault *vmf)
  {
-       bool uffd_wp = vmf_orig_pte_uffd_wp(vmf);
         struct vm_area_struct *vma = vmf->vma;
         unsigned long addr = vmf->address;
         struct folio *folio;
@@ -4427,8 +4454,9 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf)
         }
  
         /* Allocate our own private page. */
-       if (unlikely(anon_vma_prepare(vma)))
-               goto oom;
+       ret = vmf_anon_prepare(vmf);
+       if (ret)
+               return ret;
         /* Returns NULL on OOM or ERR_PTR(-EAGAIN) if we must retry the fault */
         folio = alloc_anon_folio(vmf);
         if (IS_ERR(folio))
@@ -4476,10 +4504,13 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf)
  
         folio_ref_add(folio, nr_pages - 1);
         add_mm_counter(vma->vm_mm, MM_ANONPAGES, nr_pages);
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+       count_mthp_stat(folio_order(folio), MTHP_STAT_ANON_FAULT_ALLOC);
+#endif
         folio_add_new_anon_rmap(folio, vma, addr);
         folio_add_lru_vma(folio, vma);
  setpte:
-       if (uffd_wp)
+       if (vmf_orig_pte_uffd_wp(vmf))
                 entry = pte_mkuffd_wp(entry);
         set_ptes(vma->vm_mm, addr, vmf->pte, entry, nr_pages);
  
@@ -4654,7 +4685,6 @@ void set_pte_range(struct vm_fault *vmf, struct folio *folio,
                 struct page *page, unsigned int nr, unsigned long addr)
  {
         struct vm_area_struct *vma = vmf->vma;
-       bool uffd_wp = vmf_orig_pte_uffd_wp(vmf);
         bool write = vmf->flags & FAULT_FLAG_WRITE;
         bool prefault = in_range(vmf->address, addr, nr * PAGE_SIZE);
         pte_t entry;
@@ -4669,16 +4699,14 @@ void set_pte_range(struct vm_fault *vmf, struct folio *folio,
  
         if (write)
                 entry = maybe_mkwrite(pte_mkdirty(entry), vma);
-       if (unlikely(uffd_wp))
+       if (unlikely(vmf_orig_pte_uffd_wp(vmf)))
                 entry = pte_mkuffd_wp(entry);
         /* copy-on-write page */
         if (write && !(vma->vm_flags & VM_SHARED)) {
-               add_mm_counter(vma->vm_mm, MM_ANONPAGES, nr);
                 VM_BUG_ON_FOLIO(nr != 1, folio);
                 folio_add_new_anon_rmap(folio, vma, addr);
                 folio_add_lru_vma(folio, vma);
         } else {
-               add_mm_counter(vma->vm_mm, mm_counter_file(folio), nr);
                 folio_add_file_rmap_ptes(folio, page, nr, vma);
         }
         set_ptes(vma->vm_mm, addr, vmf->pte, entry, nr);
@@ -4715,9 +4743,11 @@ vm_fault_t finish_fault(struct vm_fault *vmf)
         struct vm_area_struct *vma = vmf->vma;
         struct page *page;
         vm_fault_t ret;
+       bool is_cow = (vmf->flags & FAULT_FLAG_WRITE) &&
+                     !(vma->vm_flags & VM_SHARED);
  
         /* Did we COW the page? */
-       if ((vmf->flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED))
+       if (is_cow)
                 page = vmf->cow_page;
         else
                 page = vmf->page;
@@ -4753,8 +4783,10 @@ vm_fault_t finish_fault(struct vm_fault *vmf)
         /* Re-check under ptl */
         if (likely(!vmf_pte_changed(vmf))) {
                 struct folio *folio = page_folio(page);
+               int type = is_cow ? MM_ANONPAGES : mm_counter_file(folio);
  
                 set_pte_range(vmf, folio, page, 1, vmf->address);
+               add_mm_counter(vma->vm_mm, type, 1);
                 ret = 0;
         } else {
                 update_mmu_tlb(vma, vmf->address, vmf->pte);
@@ -5035,9 +5067,11 @@ static vm_fault_t do_fault(struct vm_fault *vmf)
         return ret;
  }
  
-int numa_migrate_prep(struct folio *folio, struct vm_area_struct *vma,
+int numa_migrate_prep(struct folio *folio, struct vm_fault *vmf,
                       unsigned long addr, int page_nid, int *flags)
  {
+       struct vm_area_struct *vma = vmf->vma;
+
         folio_get(folio);
  
         /* Record the current PID acceesing VMA */
@@ -5049,7 +5083,55 @@ int numa_migrate_prep(struct folio *folio, struct vm_area_struct *vma,
                 *flags |= TNF_FAULT_LOCAL;
         }
  
-       return mpol_misplaced(folio, vma, addr);
+       return mpol_misplaced(folio, vmf, addr);
+}
+
+/*
+ * Rebuild the single PTE at @fault_addr on behalf of the NUMA fault path.
+ *
+ * The PTE is rewritten inside a ptep_modify_prot_start()/
+ * ptep_modify_prot_commit() pair: its protection is replaced with
+ * vma->vm_page_prot, it is marked young and, when @writable is true, it is
+ * also made writable. The MMU cache is then updated for that one entry.
+ */
+static void numa_rebuild_single_mapping(struct vm_fault *vmf, struct vm_area_struct *vma,
+                                       unsigned long fault_addr, pte_t *fault_pte,
+                                       bool writable)
+{
+       pte_t orig_pte = ptep_modify_prot_start(vma, fault_addr, fault_pte);
+       pte_t new_pte = pte_mkyoung(pte_modify(orig_pte, vma->vm_page_prot));
+
+       if (writable)
+               new_pte = pte_mkwrite(new_pte, vma);
+
+       ptep_modify_prot_commit(vma, fault_addr, fault_pte, orig_pte, new_pte);
+       update_mmu_cache_range(vmf, vma, fault_addr, fault_pte, 1);
+}
+
+/*
+ * Rebuild, in one pass, the PTEs of a large folio found mapped around
+ * vmf->address by a NUMA fault.
+ *
+ * @fault_pte is the PTE value read at the faulting address; its PFN offset
+ * into @folio gives the virtual range the folio would occupy. Only PTEs in
+ * that range that are present, pte_protnone() and still point into @folio are
+ * rewritten, each via numa_rebuild_single_mapping(). With @ignore_writable set
+ * no PTE is made writable; otherwise a PTE is made writable when it is
+ * writable under vma->vm_page_prot, or when @pte_write_upgrade is set and
+ * can_change_pte_writable() allows it.
+ */
+static void numa_rebuild_large_mapping(struct vm_fault *vmf, struct vm_area_struct *vma,
+                                      struct folio *folio, pte_t fault_pte,
+                                      bool ignore_writable, bool pte_write_upgrade)
+{
+       int nr = pte_pfn(fault_pte) - folio_pfn(folio);
+       unsigned long fault_addr = vmf->address;
+       unsigned long folio_start = fault_addr - nr * PAGE_SIZE;
+       unsigned long folio_end = folio_start + folio_nr_pages(folio) * PAGE_SIZE;
+       unsigned long pt_start = fault_addr & PMD_MASK;
+       unsigned long start, end, addr;
+       pte_t *start_ptep;
+
+       /*
+        * vmf->pte points into the one PTE table that covers vmf->address.
+        * The folio's virtual range is not guaranteed to sit inside that
+        * table (for example once the mapping has been moved by mremap(),
+        * which only requires PAGE_SIZE alignment), so clamp the walk to the
+        * table's PMD_SIZE window as well as to the VMA. Without this,
+        * start_ptep could step outside the table.
+        */
+       start = max(max(folio_start, pt_start), vma->vm_start);
+       end = min(min(folio_end, pt_start + PMD_SIZE), vma->vm_end);
+       start_ptep = vmf->pte - (fault_addr - start) / PAGE_SIZE;
+
+       /* Restore all PTEs' mapping of the large folio */
+       for (addr = start; addr != end; start_ptep++, addr += PAGE_SIZE) {
+               pte_t ptent = ptep_get(start_ptep);
+               bool writable = false;
+
+               /* Skip entries that are not PROT_NONE mappings. */
+               if (!pte_present(ptent) || !pte_protnone(ptent))
+                       continue;
+
+               /* Skip entries that now map a different folio. */
+               if (pfn_folio(pte_pfn(ptent)) != folio)
+                       continue;
+
+               if (!ignore_writable) {
+                       ptent = pte_modify(ptent, vma->vm_page_prot);
+                       writable = pte_write(ptent);
+                       if (!writable && pte_write_upgrade &&
+                           can_change_pte_writable(vma, addr, ptent))
+                               writable = true;
+               }
+
+               numa_rebuild_single_mapping(vmf, vma, addr, start_ptep, writable);
+       }
+}
  
  static vm_fault_t do_numa_page(struct vm_fault *vmf)
@@ -5057,11 +5139,12 @@ static vm_fault_t do_numa_page(struct vm_fault *vmf)
         struct vm_area_struct *vma = vmf->vma;
         struct folio *folio = NULL;
         int nid = NUMA_NO_NODE;
-       bool writable = false;
+       bool writable = false, ignore_writable = false;
+       bool pte_write_upgrade = vma_wants_manual_pte_write_upgrade(vma);
         int last_cpupid;
         int target_nid;
         pte_t pte, old_pte;
-       int flags = 0;
+       int flags = 0, nr_pages;
  
         /*
          * The pte cannot be used safely until we verify, while holding the page
@@ -5083,7 +5166,7 @@ static vm_fault_t do_numa_page(struct vm_fault *vmf)
          * is only valid while holding the PT lock.
          */
         writable = pte_write(pte);
-       if (!writable && vma_wants_manual_pte_write_upgrade(vma) &&
+       if (!writable && pte_write_upgrade &&
             can_change_pte_writable(vma, vmf->address, pte))
                 writable = true;
  
@@ -5091,10 +5174,6 @@ static vm_fault_t do_numa_page(struct vm_fault *vmf)
         if (!folio || folio_is_zone_device(folio))
                 goto out_map;
  
-       /* TODO: handle PTE-mapped THP */
-       if (folio_test_large(folio))
-               goto out_map;
-
         /*
          * Avoid grouping on RO pages in general. RO pages shouldn't hurt as
          * much anyway since they can be in shared cache state. This misses
@@ -5110,10 +5189,11 @@ static vm_fault_t do_numa_page(struct vm_fault *vmf)
          * Flag if the folio is shared between multiple address spaces. This
          * is later used when determining whether to group tasks together
          */
-       if (folio_estimated_sharers(folio) > 1 && (vma->vm_flags & VM_SHARED))
+       if (folio_likely_mapped_shared(folio) && (vma->vm_flags & VM_SHARED))
                 flags |= TNF_SHARED;
  
         nid = folio_nid(folio);
+       nr_pages = folio_nr_pages(folio);
         /*
          * For memory tiering mode, cpupid of slow memory page is used
          * to record page access time.  So use default value.
@@ -5123,13 +5203,14 @@ static vm_fault_t do_numa_page(struct vm_fault *vmf)
                 last_cpupid = (-1 & LAST_CPUPID_MASK);
         else
                 last_cpupid = folio_last_cpupid(folio);
-       target_nid = numa_migrate_prep(folio, vma, vmf->address, nid, &flags);
+       target_nid = numa_migrate_prep(folio, vmf, vmf->address, nid, &flags);
         if (target_nid == NUMA_NO_NODE) {
                 folio_put(folio);
                 goto out_map;
         }
         pte_unmap_unlock(vmf->pte, vmf->ptl);
         writable = false;
+       ignore_writable = true;
  
         /* Migrate to the requested node */
         if (migrate_misplaced_folio(folio, vma, target_nid)) {
@@ -5150,20 +5231,19 @@ static vm_fault_t do_numa_page(struct vm_fault *vmf)
  
  out:
         if (nid != NUMA_NO_NODE)
-               task_numa_fault(last_cpupid, nid, 1, flags);
+               task_numa_fault(last_cpupid, nid, nr_pages, flags);
         return 0;
  out_map:
         /*
          * Make it present again, depending on how arch implements
          * non-accessible ptes, some can allow access by kernel mode.
          */
-       old_pte = ptep_modify_prot_start(vma, vmf->address, vmf->pte);
-       pte = pte_modify(old_pte, vma->vm_page_prot);
-       pte = pte_mkyoung(pte);
-       if (writable)
-               pte = pte_mkwrite(pte, vma);
-       ptep_modify_prot_commit(vma, vmf->address, vmf->pte, old_pte, pte);
-       update_mmu_cache_range(vmf, vma, vmf->address, vmf->pte, 1);
+       if (folio && folio_test_large(folio))
+               numa_rebuild_large_mapping(vmf, vma, folio, pte, ignore_writable,
+                                          pte_write_upgrade);
+       else
+               numa_rebuild_single_mapping(vmf, vma, vmf->address, vmf->pte,
+                                           writable);
         pte_unmap_unlock(vmf->pte, vmf->ptl);
         goto out;
  }
@@ -5374,7 +5454,8 @@ static vm_fault_t __handle_mm_fault(struct vm_area_struct *vma,
                 return VM_FAULT_OOM;
  retry_pud:
         if (pud_none(*vmf.pud) &&
-           thp_vma_allowable_order(vma, vm_flags, false, true, true, PUD_ORDER)) {
+           thp_vma_allowable_order(vma, vm_flags,
+                               TVA_IN_PF | TVA_ENFORCE_SYSFS, PUD_ORDER)) {
                 ret = create_huge_pud(&vmf);
                 if (!(ret & VM_FAULT_FALLBACK))
                         return ret;
@@ -5408,7 +5489,8 @@ retry_pud:
                 goto retry_pud;
  
         if (pmd_none(*vmf.pmd) &&
-           thp_vma_allowable_order(vma, vm_flags, false, true, true, PMD_ORDER)) {
+           thp_vma_allowable_order(vma, vm_flags,
+                               TVA_IN_PF | TVA_ENFORCE_SYSFS, PMD_ORDER)) {
                 ret = create_huge_pmd(&vmf);
                 if (!(ret & VM_FAULT_FALLBACK))
                         return ret;
@@ -5762,15 +5844,6 @@ retry:
         if (!vma_start_read(vma))
                 goto inval;
  
-       /*
-        * find_mergeable_anon_vma uses adjacent vmas which are not locked.
-        * This check must happen after vma_start_read(); otherwise, a
-        * concurrent mremap() with MREMAP_DONTUNMAP could dissociate the VMA
-        * from its anon_vma.
-        */
-       if (unlikely(vma_is_anonymous(vma) && !vma->anon_vma))
-               goto inval_end_read;
-
         /* Check since vm_start/vm_end might change before we lock the VMA */
         if (unlikely(address < vma->vm_start || address >= vma->vm_end))
                 goto inval_end_read;
@@ -5868,34 +5941,48 @@ int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address)
  
  /**
   * follow_pte - look up PTE at a user virtual address
- * @mm: the mm_struct of the target address space
+ * @vma: the memory mapping
   * @address: user virtual address
   * @ptepp: location to store found PTE
   * @ptlp: location to store the lock for the PTE
   *
   * On a successful return, the pointer to the PTE is stored in @ptepp;
   * the corresponding lock is taken and its location is stored in @ptlp.
- * The contents of the PTE are only stable until @ptlp is released;
- * any further use, if any, must be protected against invalidation
- * with MMU notifiers.
+ *
+ * The contents of the PTE are only stable until @ptlp is released using
+ * pte_unmap_unlock(). This function will fail if the PTE is non-present.
+ * Present PTEs may include PTEs that map refcounted pages, such as
+ * anonymous folios in COW mappings.
+ *
+ * Callers must be careful when relying on PTE content after
+ * pte_unmap_unlock(). Especially if the PTE maps a refcounted page,
+ * callers must protect against invalidation with MMU notifiers; otherwise
+ * access to the PFN at a later point in time can trigger use-after-free.
   *
   * Only IO mappings and raw PFN mappings are allowed.  The mmap semaphore
   * should be taken for read.
   *
- * KVM uses this function.  While it is arguably less bad than ``follow_pfn``,
- * it is not a good general-purpose API.
+ * This function must not be used to modify PTE content.
   *
   * Return: zero on success, -ve otherwise.
   */
-int follow_pte(struct mm_struct *mm, unsigned long address,
+int follow_pte(struct vm_area_struct *vma, unsigned long address,
                pte_t **ptepp, spinlock_t **ptlp)
  {
+       struct mm_struct *mm = vma->vm_mm;
         pgd_t *pgd;
         p4d_t *p4d;
         pud_t *pud;
         pmd_t *pmd;
         pte_t *ptep;
  
+       mmap_assert_locked(mm);
+       if (unlikely(address < vma->vm_start || address >= vma->vm_end))
+               goto out;
+
+       if (!(vma->vm_flags & (VM_IO | VM_PFNMAP)))
+               goto out;
+
         pgd = pgd_offset(mm, address);
         if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd)))
                 goto out;
@@ -5925,71 +6012,7 @@ out:
  }
  EXPORT_SYMBOL_GPL(follow_pte);
  
-/**
- * follow_pfn - look up PFN at a user virtual address
- * @vma: memory mapping
- * @address: user virtual address
- * @pfn: location to store found PFN
- *
- * Only IO mappings and raw PFN mappings are allowed.
- *
- * This function does not allow the caller to read the permissions
- * of the PTE.  Do not use it.
- *
- * Return: zero and the pfn at @pfn on success, -ve otherwise.
- */
-int follow_pfn(struct vm_area_struct *vma, unsigned long address,
-       unsigned long *pfn)
-{
-       int ret = -EINVAL;
-       spinlock_t *ptl;
-       pte_t *ptep;
-
-       if (!(vma->vm_flags & (VM_IO | VM_PFNMAP)))
-               return ret;
-
-       ret = follow_pte(vma->vm_mm, address, &ptep, &ptl);
-       if (ret)
-               return ret;
-       *pfn = pte_pfn(ptep_get(ptep));
-       pte_unmap_unlock(ptep, ptl);
-       return 0;
-}
-EXPORT_SYMBOL(follow_pfn);
-
  #ifdef CONFIG_HAVE_IOREMAP_PROT
-int follow_phys(struct vm_area_struct *vma,
-               unsigned long address, unsigned int flags,
-               unsigned long *prot, resource_size_t *phys)
-{
-       int ret = -EINVAL;
-       pte_t *ptep, pte;
-       spinlock_t *ptl;
-
-       if (!(vma->vm_flags & (VM_IO | VM_PFNMAP)))
-               goto out;
-
-       if (follow_pte(vma->vm_mm, address, &ptep, &ptl))
-               goto out;
-       pte = ptep_get(ptep);
-
-       /* Never return PFNs of anon folios in COW mappings. */
-       if (vm_normal_folio(vma, address, pte))
-               goto unlock;
-
-       if ((flags & FOLL_WRITE) && !pte_write(pte))
-               goto unlock;
-
-       *prot = pgprot_val(pte_pgprot(pte));
-       *phys = (resource_size_t)pte_pfn(pte) << PAGE_SHIFT;
-
-       ret = 0;
-unlock:
-       pte_unmap_unlock(ptep, ptl);
-out:
-       return ret;
-}
-
  /**
   * generic_access_phys - generic implementation for iomem mmap access
   * @vma: the vma to access
@@ -6013,11 +6036,8 @@ int generic_access_phys(struct vm_area_struct *vma, unsigned long addr,
         int offset = offset_in_page(addr);
         int ret = -EINVAL;
  
-       if (!(vma->vm_flags & (VM_IO | VM_PFNMAP)))
-               return -EINVAL;
-
  retry:
-       if (follow_pte(vma->vm_mm, addr, &ptep, &ptl))
+       if (follow_pte(vma, addr, &ptep, &ptl))
                 return -EINVAL;
         pte = ptep_get(ptep);
         pte_unmap_unlock(ptep, ptl);
@@ -6032,7 +6052,7 @@ retry:
         if (!maddr)
                 return -ENOMEM;
  
-       if (follow_pte(vma->vm_mm, addr, &ptep, &ptl))
+       if (follow_pte(vma, addr, &ptep, &ptl))
                 goto out_unmap;
  
         if (!pte_same(pte, ptep_get(ptep))) {
@@ -6440,3 +6460,15 @@ void ptlock_free(struct ptdesc *ptdesc)
         kmem_cache_free(page_ptl_cachep, ptdesc->ptl);
  }
  #endif
+
+/*
+ * Counterpart of vma_pgtable_walk_end(): if @vma is a hugetlb VMA, take its
+ * hugetlb VMA lock for reading; any other VMA is left untouched.
+ * NOTE(review): judging by the name this is meant to bracket a page-table
+ * walk of @vma - confirm against the callers.
+ */
+void vma_pgtable_walk_begin(struct vm_area_struct *vma)
+{
+       if (!is_vm_hugetlb_page(vma))
+               return;
+
+       hugetlb_vma_lock_read(vma);
+}
+
+/*
+ * Counterpart of vma_pgtable_walk_begin(): if @vma is a hugetlb VMA, release
+ * the read side of its hugetlb VMA lock; any other VMA is left untouched.
+ */
+void vma_pgtable_walk_end(struct vm_area_struct *vma)
+{
+       if (!is_vm_hugetlb_page(vma))
+               return;
+
+       hugetlb_vma_unlock_read(vma);
+}