diff --git a/mm/memory.c b/mm/memory.c
index f456f3b5049cf1545e875436cd9138b3c2b6ead4..f69fbc2511984e224ab31f38a6315404b5d902b1 100644
@@ -104,6 +104,20 @@ EXPORT_SYMBOL(mem_map);
 #endif
 
 static vm_fault_t do_fault(struct vm_fault *vmf);
+static vm_fault_t do_anonymous_page(struct vm_fault *vmf);
+static bool vmf_pte_changed(struct vm_fault *vmf);
+
+/*
+ * Return true if the original pte was a uffd-wp pte marker (so the pte was
+ * wr-protected).
+ */
+static bool vmf_orig_pte_uffd_wp(struct vm_fault *vmf)
+{
+       if (!(vmf->flags & FAULT_FLAG_ORIG_PTE_VALID))
+               return false;
+
+       return pte_marker_uffd_wp(vmf->orig_pte);
+}
 
 /*
  * A number of key systems in x86 including ioremap() rely on the assumption
@@ -348,7 +362,7 @@ void free_pgd_range(struct mmu_gather *tlb,
 
 void free_pgtables(struct mmu_gather *tlb, struct maple_tree *mt,
                   struct vm_area_struct *vma, unsigned long floor,
-                  unsigned long ceiling)
+                  unsigned long ceiling, bool mm_wr_locked)
 {
        MA_STATE(mas, mt, vma->vm_end, vma->vm_end);
 
@@ -366,6 +380,8 @@ void free_pgtables(struct mmu_gather *tlb, struct maple_tree *mt,
                 * Hide vma from rmap and truncate_pagecache before freeing
                 * pgtables
                 */
+               if (mm_wr_locked)
+                       vma_start_write(vma);
                unlink_anon_vmas(vma);
                unlink_file_vma(vma);
 
@@ -380,6 +396,8 @@ void free_pgtables(struct mmu_gather *tlb, struct maple_tree *mt,
                               && !is_vm_hugetlb_page(next)) {
                                vma = next;
                                next = mas_find(&mas, ceiling - 1);
+                               if (mm_wr_locked)
+                                       vma_start_write(vma);
                                unlink_anon_vmas(vma);
                                unlink_file_vma(vma);
                        }
@@ -970,7 +988,7 @@ static inline struct folio *page_copy_prealloc(struct mm_struct *src_mm,
                folio_put(new_folio);
                return NULL;
        }
-       cgroup_throttle_swaprate(&new_folio->page, GFP_KERNEL);
+       folio_throttle_swaprate(new_folio, GFP_KERNEL);
 
        return new_folio;
 }
@@ -1290,6 +1308,7 @@ copy_page_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma)
                        continue;
                if (unlikely(copy_p4d_range(dst_vma, src_vma, dst_pgd, src_pgd,
                                            addr, next))) {
+                       untrack_pfn_clear(dst_vma);
                        ret = -ENOMEM;
                        break;
                }
@@ -1345,6 +1364,10 @@ zap_install_uffd_wp_if_needed(struct vm_area_struct *vma,
                              unsigned long addr, pte_t *pte,
                              struct zap_details *details, pte_t pteval)
 {
+       /* Zap on anonymous always means dropping everything */
+       if (vma_is_anonymous(vma))
+               return;
+
        if (zap_drop_file_uffd_wp(details))
                return;
 
@@ -1451,8 +1474,12 @@ again:
                                continue;
                        rss[mm_counter(page)]--;
                } else if (pte_marker_entry_uffd_wp(entry)) {
-                       /* Only drop the uffd-wp marker if explicitly requested */
-                       if (!zap_drop_file_uffd_wp(details))
+                       /*
+                        * For anon: always drop the marker; for file: only
+                        * drop the marker if explicitly requested.
+                        */
+                       if (!vma_is_anonymous(vma) &&
+                           !zap_drop_file_uffd_wp(details))
                                continue;
                } else if (is_hwpoison_entry(entry) ||
                           is_swapin_error_entry(entry)) {
@@ -2142,8 +2169,20 @@ out_unlock:
  * vmf_insert_pfn_prot should only be used if using multiple VMAs is
  * impractical.
  *
- * See vmf_insert_mixed_prot() for a discussion of the implication of using
- * a value of @pgprot different from that of @vma->vm_page_prot.
+ * pgprot typically only differs from @vma->vm_page_prot when drivers set
+ * caching- and encryption bits different than those of @vma->vm_page_prot,
+ * because the caching- or encryption mode may not be known at mmap() time.
+ *
+ * This is ok as long as @vma->vm_page_prot is not used by the core vm
+ * to set caching and encryption bits for those vmas (except for COW pages).
+ * This is ensured by core vm only modifying these page table entries using
+ * functions that don't touch caching- or encryption bits, using pte_modify()
+ * if needed. (See for example mprotect()).
+ *
+ * Also when new page-table entries are created, this is only done using the
+ * fault() callback, and never using the value of vma->vm_page_prot,
+ * except for page-table entries that point to anonymous pages as the result
+ * of COW.
  *
  * Context: Process context.  May allocate using %GFP_KERNEL.
  * Return: vm_fault_t value.
@@ -2218,9 +2257,9 @@ static bool vm_mixed_ok(struct vm_area_struct *vma, pfn_t pfn)
 }
 
 static vm_fault_t __vm_insert_mixed(struct vm_area_struct *vma,
-               unsigned long addr, pfn_t pfn, pgprot_t pgprot,
-               bool mkwrite)
+               unsigned long addr, pfn_t pfn, bool mkwrite)
 {
+       pgprot_t pgprot = vma->vm_page_prot;
        int err;
 
        BUG_ON(!vm_mixed_ok(vma, pfn));
@@ -2263,43 +2302,10 @@ static vm_fault_t __vm_insert_mixed(struct vm_area_struct *vma,
        return VM_FAULT_NOPAGE;
 }
 
-/**
- * vmf_insert_mixed_prot - insert single pfn into user vma with specified pgprot
- * @vma: user vma to map to
- * @addr: target user address of this page
- * @pfn: source kernel pfn
- * @pgprot: pgprot flags for the inserted page
- *
- * This is exactly like vmf_insert_mixed(), except that it allows drivers
- * to override pgprot on a per-page basis.
- *
- * Typically this function should be used by drivers to set caching- and
- * encryption bits different than those of @vma->vm_page_prot, because
- * the caching- or encryption mode may not be known at mmap() time.
- * This is ok as long as @vma->vm_page_prot is not used by the core vm
- * to set caching and encryption bits for those vmas (except for COW pages).
- * This is ensured by core vm only modifying these page table entries using
- * functions that don't touch caching- or encryption bits, using pte_modify()
- * if needed. (See for example mprotect()).
- * Also when new page-table entries are created, this is only done using the
- * fault() callback, and never using the value of vma->vm_page_prot,
- * except for page-table entries that point to anonymous pages as the result
- * of COW.
- *
- * Context: Process context.  May allocate using %GFP_KERNEL.
- * Return: vm_fault_t value.
- */
-vm_fault_t vmf_insert_mixed_prot(struct vm_area_struct *vma, unsigned long addr,
-                                pfn_t pfn, pgprot_t pgprot)
-{
-       return __vm_insert_mixed(vma, addr, pfn, pgprot, false);
-}
-EXPORT_SYMBOL(vmf_insert_mixed_prot);
-
 vm_fault_t vmf_insert_mixed(struct vm_area_struct *vma, unsigned long addr,
                pfn_t pfn)
 {
-       return __vm_insert_mixed(vma, addr, pfn, vma->vm_page_prot, false);
+       return __vm_insert_mixed(vma, addr, pfn, false);
 }
 EXPORT_SYMBOL(vmf_insert_mixed);
 
@@ -2311,7 +2317,7 @@ EXPORT_SYMBOL(vmf_insert_mixed);
 vm_fault_t vmf_insert_mixed_mkwrite(struct vm_area_struct *vma,
                unsigned long addr, pfn_t pfn)
 {
-       return __vm_insert_mixed(vma, addr, pfn, vma->vm_page_prot, true);
+       return __vm_insert_mixed(vma, addr, pfn, true);
 }
 EXPORT_SYMBOL(vmf_insert_mixed_mkwrite);
 
@@ -3091,7 +3097,7 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf)
 
        if (mem_cgroup_charge(new_folio, mm, GFP_KERNEL))
                goto oom_free_new;
-       cgroup_throttle_swaprate(&new_folio->page, GFP_KERNEL);
+       folio_throttle_swaprate(new_folio, GFP_KERNEL);
 
        __folio_mark_uptodate(new_folio);
 
@@ -3563,8 +3569,21 @@ static vm_fault_t remove_device_exclusive_entry(struct vm_fault *vmf)
        struct vm_area_struct *vma = vmf->vma;
        struct mmu_notifier_range range;
 
-       if (!folio_lock_or_retry(folio, vma->vm_mm, vmf->flags))
+       /*
+        * We need a reference to lock the folio because we don't hold
+        * the PTL so a racing thread can remove the device-exclusive
+        * entry and unmap it. If the folio is free the entry must
+        * have been removed already. If it happens to have already
+        * been re-allocated after being freed all we do is lock and
+        * unlock it.
+        */
+       if (!folio_try_get(folio))
+               return 0;
+
+       if (!folio_lock_or_retry(folio, vma->vm_mm, vmf->flags)) {
+               folio_put(folio);
                return VM_FAULT_RETRY;
+       }
        mmu_notifier_range_init_owner(&range, MMU_NOTIFY_EXCLUSIVE, 0,
                                vma->vm_mm, vmf->address & PAGE_MASK,
                                (vmf->address & PAGE_MASK) + PAGE_SIZE, NULL);
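
The same take-a-reference-before-locking pattern, reduced to a minimal generic sketch (not a helper that exists in this file); the real path above uses folio_lock_or_retry() instead of folio_lock() so the fault can bail out with VM_FAULT_RETRY rather than block unconditionally:

static bool get_and_lock_folio(struct folio *folio)
{
	if (!folio_try_get(folio))
		return false;	/* folio was freed: the entry is already gone */
	folio_lock(folio);	/* safe now that we hold a reference */
	return true;
}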
@@ -3577,6 +3596,7 @@ static vm_fault_t remove_device_exclusive_entry(struct vm_fault *vmf)
 
        pte_unmap_unlock(vmf->pte, vmf->ptl);
        folio_unlock(folio);
+       folio_put(folio);
 
        mmu_notifier_invalidate_range_end(&range);
        return 0;
@@ -3619,6 +3639,14 @@ static vm_fault_t pte_marker_clear(struct vm_fault *vmf)
        return 0;
 }
 
+static vm_fault_t do_pte_missing(struct vm_fault *vmf)
+{
+       if (vma_is_anonymous(vmf->vma))
+               return do_anonymous_page(vmf);
+       else
+               return do_fault(vmf);
+}
+
 /*
  * This is actually a page-missing access, but with uffd-wp special pte
  * installed.  It means this pte was wr-protected before being unmapped.
@@ -3629,11 +3657,10 @@ static vm_fault_t pte_marker_handle_uffd_wp(struct vm_fault *vmf)
         * Just in case there're leftover special ptes even after the region
         * got unregistered - we can simply clear them.
         */
-       if (unlikely(!userfaultfd_wp(vmf->vma) || vma_is_anonymous(vmf->vma)))
+       if (unlikely(!userfaultfd_wp(vmf->vma)))
                return pte_marker_clear(vmf);
 
-       /* do_fault() can handle pte markers too like none pte */
-       return do_fault(vmf);
+       return do_pte_missing(vmf);
 }
 
 static vm_fault_t handle_pte_marker(struct vm_fault *vmf)
@@ -3684,6 +3711,11 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
        if (!pte_unmap_same(vmf))
                goto out;
 
+       if (vmf->flags & FAULT_FLAG_VMA_LOCK) {
+               ret = VM_FAULT_RETRY;
+               goto out;
+       }
+
        entry = pte_to_swp_entry(vmf->orig_pte);
        if (unlikely(non_swap_entry(entry))) {
                if (is_migration_entry(entry)) {
@@ -3838,7 +3870,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
                        lru_add_drain();
        }
 
-       cgroup_throttle_swaprate(page, GFP_KERNEL);
+       folio_throttle_swaprate(folio, GFP_KERNEL);
 
        /*
         * Back out if somebody else already faulted in this pte.
@@ -3998,6 +4030,7 @@ out_release:
  */
 static vm_fault_t do_anonymous_page(struct vm_fault *vmf)
 {
+       bool uffd_wp = vmf_orig_pte_uffd_wp(vmf);
        struct vm_area_struct *vma = vmf->vma;
        struct folio *folio;
        vm_fault_t ret = 0;
@@ -4031,7 +4064,7 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf)
                                                vma->vm_page_prot));
                vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd,
                                vmf->address, &vmf->ptl);
-               if (!pte_none(*vmf->pte)) {
+               if (vmf_pte_changed(vmf)) {
                        update_mmu_tlb(vma, vmf->address, vmf->pte);
                        goto unlock;
                }
@@ -4055,7 +4088,7 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf)
 
        if (mem_cgroup_charge(folio, vma->vm_mm, GFP_KERNEL))
                goto oom_free_page;
-       cgroup_throttle_swaprate(&folio->page, GFP_KERNEL);
+       folio_throttle_swaprate(folio, GFP_KERNEL);
 
        /*
         * The memory barrier inside __folio_mark_uptodate makes sure that
@@ -4071,7 +4104,7 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf)
 
        vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address,
                        &vmf->ptl);
-       if (!pte_none(*vmf->pte)) {
+       if (vmf_pte_changed(vmf)) {
                update_mmu_tlb(vma, vmf->address, vmf->pte);
                goto release;
        }
@@ -4091,6 +4124,8 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf)
        folio_add_new_anon_rmap(folio, vma, vmf->address);
        folio_add_lru_vma(folio, vma);
 setpte:
+       if (uffd_wp)
+               entry = pte_mkuffd_wp(entry);
        set_pte_at(vma->vm_mm, vmf->address, vmf->pte, entry);
 
        /* No need to invalidate - it was non-present before */
@@ -4258,7 +4293,7 @@ vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page)
 void do_set_pte(struct vm_fault *vmf, struct page *page, unsigned long addr)
 {
        struct vm_area_struct *vma = vmf->vma;
-       bool uffd_wp = pte_marker_uffd_wp(vmf->orig_pte);
+       bool uffd_wp = vmf_orig_pte_uffd_wp(vmf);
        bool write = vmf->flags & FAULT_FLAG_WRITE;
        bool prefault = vmf->address != addr;
        pte_t entry;
@@ -4372,13 +4407,13 @@ vm_fault_t finish_fault(struct vm_fault *vmf)
        return ret;
 }
 
-static unsigned long fault_around_bytes __read_mostly =
-       rounddown_pow_of_two(65536);
+static unsigned long fault_around_pages __read_mostly =
+       65536 >> PAGE_SHIFT;
 
 #ifdef CONFIG_DEBUG_FS
 static int fault_around_bytes_get(void *data, u64 *val)
 {
-       *val = fault_around_bytes;
+       *val = fault_around_pages << PAGE_SHIFT;
        return 0;
 }
 
@@ -4390,10 +4425,13 @@ static int fault_around_bytes_set(void *data, u64 val)
 {
        if (val / PAGE_SIZE > PTRS_PER_PTE)
                return -EINVAL;
-       if (val > PAGE_SIZE)
-               fault_around_bytes = rounddown_pow_of_two(val);
-       else
-               fault_around_bytes = PAGE_SIZE; /* rounddown_pow_of_two(0) is undefined */
+
+       /*
+        * The minimum value is 1 page; however, this results in no fault-around
+        * at all. See should_fault_around().
+        */
+       fault_around_pages = max(rounddown_pow_of_two(val) >> PAGE_SHIFT, 1UL);
+
        return 0;
 }
 DEFINE_DEBUGFS_ATTRIBUTE(fault_around_bytes_fops,
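
As a worked illustration (assuming 4 KiB pages, so PAGE_SHIFT == 12): the default of 65536 >> PAGE_SHIFT is 16 pages; writing 65536 back through debugfs stores rounddown_pow_of_two(65536) >> 12 = 16 again, while any write below 8192 rounds down to at most one page and is clamped to exactly 1, which should_fault_around() below treats as fault-around disabled.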
@@ -4416,41 +4454,34 @@ late_initcall(fault_around_debugfs);
  * It uses vm_ops->map_pages() to map the pages, which skips the page if it's
  * not ready to be mapped: not up-to-date, locked, etc.
  *
- * This function doesn't cross the VMA boundaries, in order to call map_pages()
- * only once.
+ * This function doesn't cross VMA or page table boundaries, in order to call
+ * map_pages() and acquire a PTE lock only once.
  *
- * fault_around_bytes defines how many bytes we'll try to map.
+ * fault_around_pages defines how many pages we'll try to map.
  * do_fault_around() expects it to be set to a power of two less than or equal
  * to PTRS_PER_PTE.
  *
  * The virtual address of the area that we map is naturally aligned to
- * fault_around_bytes rounded down to the machine page size
+ * fault_around_pages * PAGE_SIZE rounded down to the machine page size
  * (and therefore to page order).  This way it's easier to guarantee
  * that we don't cross page table boundaries.
  */
 static vm_fault_t do_fault_around(struct vm_fault *vmf)
 {
-       unsigned long address = vmf->address, nr_pages, mask;
-       pgoff_t start_pgoff = vmf->pgoff;
-       pgoff_t end_pgoff;
-       int off;
-
-       nr_pages = READ_ONCE(fault_around_bytes) >> PAGE_SHIFT;
-       mask = ~(nr_pages * PAGE_SIZE - 1) & PAGE_MASK;
+       pgoff_t nr_pages = READ_ONCE(fault_around_pages);
+       pgoff_t pte_off = pte_index(vmf->address);
+       /* The page offset of vmf->address within the VMA. */
+       pgoff_t vma_off = vmf->pgoff - vmf->vma->vm_pgoff;
+       pgoff_t from_pte, to_pte;
+       vm_fault_t ret;
 
-       address = max(address & mask, vmf->vma->vm_start);
-       off = ((vmf->address - address) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1);
-       start_pgoff -= off;
+       /* The PTE offset of the start address, clamped to the VMA. */
+       from_pte = max(ALIGN_DOWN(pte_off, nr_pages),
+                      pte_off - min(pte_off, vma_off));
 
-       /*
-        *  end_pgoff is either the end of the page table, the end of
-        *  the vma or nr_pages from start_pgoff, depending what is nearest.
-        */
-       end_pgoff = start_pgoff -
-               ((address >> PAGE_SHIFT) & (PTRS_PER_PTE - 1)) +
-               PTRS_PER_PTE - 1;
-       end_pgoff = min3(end_pgoff, vma_pages(vmf->vma) + vmf->vma->vm_pgoff - 1,
-                       start_pgoff + nr_pages - 1);
+       /* The PTE offset of the end address, clamped to the VMA and PTE. */
+       to_pte = min3(from_pte + nr_pages, (pgoff_t)PTRS_PER_PTE,
+                     pte_off + vma_pages(vmf->vma) - vma_off) - 1;
 
        if (pmd_none(*vmf->pmd)) {
                vmf->prealloc_pte = pte_alloc_one(vmf->vma->vm_mm);
@@ -4458,7 +4489,13 @@ static vm_fault_t do_fault_around(struct vm_fault *vmf)
                        return VM_FAULT_OOM;
        }
 
-       return vmf->vma->vm_ops->map_pages(vmf, start_pgoff, end_pgoff);
+       rcu_read_lock();
+       ret = vmf->vma->vm_ops->map_pages(vmf,
+                       vmf->pgoff + from_pte - pte_off,
+                       vmf->pgoff + to_pte - pte_off);
+       rcu_read_unlock();
+
+       return ret;
 }
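
To make the new index arithmetic concrete, a small worked example with illustrative numbers (4 KiB pages, PTRS_PER_PTE == 512): suppose nr_pages == 16, the faulting address sits at pte_off == 13 within its page table, and the VMA starts 5 pages earlier with vma_pages() == 20, so vma_off == 5. Then from_pte = max(ALIGN_DOWN(13, 16), 13 - 5) = 8 and to_pte = min3(8 + 16, 512, 13 + 20 - 5) - 1 = 23, so map_pages() is asked for the sixteen page offsets vmf->pgoff - 5 through vmf->pgoff + 10, clamped to stay within both the VMA and the current page table.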
 
 /* Return true if we should do read fault-around, false otherwise */
@@ -4471,7 +4508,8 @@ static inline bool should_fault_around(struct vm_fault *vmf)
        if (uffd_disable_fault_around(vmf->vma))
                return false;
 
-       return fault_around_bytes >> PAGE_SHIFT > 1;
+       /* A single page implies no faulting 'around' at all. */
+       return fault_around_pages > 1;
 }
 
 static vm_fault_t do_read_fault(struct vm_fault *vmf)
@@ -4517,7 +4555,7 @@ static vm_fault_t do_cow_fault(struct vm_fault *vmf)
                put_page(vmf->cow_page);
                return VM_FAULT_OOM;
        }
-       cgroup_throttle_swaprate(vmf->cow_page, GFP_KERNEL);
+       folio_throttle_swaprate(page_folio(vmf->cow_page), GFP_KERNEL);
 
        ret = __do_fault(vmf);
        if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
@@ -4637,6 +4675,9 @@ int numa_migrate_prep(struct page *page, struct vm_area_struct *vma,
 {
        get_page(page);
 
+       /* Record the current PID accessing the VMA */

+       vma_set_access_pid_bit(vma);
+
        count_vm_numa_event(NUMA_HINT_FAULTS);
        if (page_nid == numa_node_id()) {
                count_vm_numa_event(NUMA_HINT_FAULTS_LOCAL);
@@ -4902,12 +4943,8 @@ static vm_fault_t handle_pte_fault(struct vm_fault *vmf)
                }
        }
 
-       if (!vmf->pte) {
-               if (vma_is_anonymous(vmf->vma))
-                       return do_anonymous_page(vmf);
-               else
-                       return do_fault(vmf);
-       }
+       if (!vmf->pte)
+               return do_pte_missing(vmf);
 
        if (!pte_present(vmf->orig_pte))
                return do_swap_page(vmf);
@@ -4943,7 +4980,8 @@ static vm_fault_t handle_pte_fault(struct vm_fault *vmf)
                 * with threads.
                 */
                if (vmf->flags & FAULT_FLAG_WRITE)
-                       flush_tlb_fix_spurious_fault(vmf->vma, vmf->address);
+                       flush_tlb_fix_spurious_fault(vmf->vma, vmf->address,
+                                                    vmf->pte);
        }
 unlock:
        pte_unmap_unlock(vmf->pte, vmf->ptl);
@@ -5066,24 +5104,31 @@ retry_pud:
  * updates.  However, note that the handling of PERF_COUNT_SW_PAGE_FAULTS should
  * still be in per-arch page fault handlers at the entry of page fault.
  */
-static inline void mm_account_fault(struct pt_regs *regs,
+static inline void mm_account_fault(struct mm_struct *mm, struct pt_regs *regs,
                                    unsigned long address, unsigned int flags,
                                    vm_fault_t ret)
 {
        bool major;
 
+       /* Incomplete faults will be accounted upon completion. */
+       if (ret & VM_FAULT_RETRY)
+               return;
+
        /*
-        * We don't do accounting for some specific faults:
-        *
-        * - Unsuccessful faults (e.g. when the address wasn't valid).  That
-        *   includes arch_vma_access_permitted() failing before reaching here.
-        *   So this is not a "this many hardware page faults" counter.  We
-        *   should use the hw profiling for that.
-        *
-        * - Incomplete faults (VM_FAULT_RETRY).  They will only be counted
-        *   once they're completed.
+        * To preserve the behavior of older kernels, PGFAULT counters record
+        * both successful and failed faults, as opposed to perf counters,
+        * which ignore failed cases.
+        */
+       count_vm_event(PGFAULT);
+       count_memcg_event_mm(mm, PGFAULT);
+
+       /*
+        * Do not account for unsuccessful faults (e.g. when the address wasn't
+        * valid).  That includes arch_vma_access_permitted() failing before
+        * reaching here. So this is not a "this many hardware page faults"
+        * counter.  We should use the hw profiling for that.
         */
-       if (ret & (VM_FAULT_ERROR | VM_FAULT_RETRY))
+       if (ret & VM_FAULT_ERROR)
                return;
 
        /*
@@ -5166,21 +5211,22 @@ static vm_fault_t sanitize_fault_flags(struct vm_area_struct *vma,
 vm_fault_t handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
                           unsigned int flags, struct pt_regs *regs)
 {
+       /* If the fault handler drops the mmap_lock, vma may be freed */
+       struct mm_struct *mm = vma->vm_mm;
        vm_fault_t ret;
 
        __set_current_state(TASK_RUNNING);
 
-       count_vm_event(PGFAULT);
-       count_memcg_event_mm(vma->vm_mm, PGFAULT);
-
        ret = sanitize_fault_flags(vma, &flags);
        if (ret)
-               return ret;
+               goto out;
 
        if (!arch_vma_access_permitted(vma, flags & FAULT_FLAG_WRITE,
                                            flags & FAULT_FLAG_INSTRUCTION,
-                                           flags & FAULT_FLAG_REMOTE))
-               return VM_FAULT_SIGSEGV;
+                                           flags & FAULT_FLAG_REMOTE)) {
+               ret = VM_FAULT_SIGSEGV;
+               goto out;
+       }
 
        /*
         * Enable the memcg OOM handling for faults triggered in user
@@ -5209,13 +5255,74 @@ vm_fault_t handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
                if (task_in_memcg_oom(current) && !(ret & VM_FAULT_OOM))
                        mem_cgroup_oom_synchronize(false);
        }
-
-       mm_account_fault(regs, address, flags, ret);
+out:
+       mm_account_fault(mm, regs, address, flags, ret);
 
        return ret;
 }
 EXPORT_SYMBOL_GPL(handle_mm_fault);
 
+#ifdef CONFIG_PER_VMA_LOCK
+/*
+ * Lookup and lock a VMA under RCU protection. Returned VMA is guaranteed to be
+ * stable and not isolated. If the VMA is not found or is being modified the
+ * function returns NULL.
+ */
+struct vm_area_struct *lock_vma_under_rcu(struct mm_struct *mm,
+                                         unsigned long address)
+{
+       MA_STATE(mas, &mm->mm_mt, address, address);
+       struct vm_area_struct *vma;
+
+       rcu_read_lock();
+retry:
+       vma = mas_walk(&mas);
+       if (!vma)
+               goto inval;
+
+       /* Only anonymous vmas are supported for now */
+       if (!vma_is_anonymous(vma))
+               goto inval;
+
+       /* find_mergeable_anon_vma uses adjacent vmas which are not locked */
+       if (!vma->anon_vma)
+               goto inval;
+
+       if (!vma_start_read(vma))
+               goto inval;
+
+       /*
+        * Due to the possibility of userfault handler dropping mmap_lock, avoid
+        * it for now and fall back to page fault handling under mmap_lock.
+        */
+       if (userfaultfd_armed(vma)) {
+               vma_end_read(vma);
+               goto inval;
+       }
+
+       /* Check since vm_start/vm_end might change before we lock the VMA */
+       if (unlikely(address < vma->vm_start || address >= vma->vm_end)) {
+               vma_end_read(vma);
+               goto inval;
+       }
+
+       /* Check if the VMA got isolated after we found it */
+       if (vma->detached) {
+               vma_end_read(vma);
+               count_vm_vma_lock_event(VMA_LOCK_MISS);
+               /* The area was replaced with another one */
+               goto retry;
+       }
+
+       rcu_read_unlock();
+       return vma;
+inval:
+       rcu_read_unlock();
+       count_vm_vma_lock_event(VMA_LOCK_ABORT);
+       return NULL;
+}
+#endif /* CONFIG_PER_VMA_LOCK */
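
A hedged sketch of how an architecture's fault path might consume the helper above, loosely modelled on the rest of the per-VMA-lock series; the function name and the fallback policy are illustrative only:

static vm_fault_t do_vma_locked_fault(struct mm_struct *mm, unsigned long address,
				      unsigned int flags, struct pt_regs *regs)
{
	struct vm_area_struct *vma;
	vm_fault_t fault;

	vma = lock_vma_under_rcu(mm, address);
	if (!vma)
		return VM_FAULT_RETRY;	/* fall back to the mmap_lock path */

	fault = handle_mm_fault(vma, address, flags | FAULT_FLAG_VMA_LOCK, regs);
	vma_end_read(vma);
	return fault;
}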
+
 #ifndef __PAGETABLE_P4D_FOLDED
 /*
  * Allocate p4d page table.
@@ -5634,12 +5741,12 @@ EXPORT_SYMBOL(__might_fault);
  * operation.  The target subpage will be processed last to keep its
  * cache lines hot.
  */
-static inline void process_huge_page(
+static inline int process_huge_page(
        unsigned long addr_hint, unsigned int pages_per_huge_page,
-       void (*process_subpage)(unsigned long addr, int idx, void *arg),
+       int (*process_subpage)(unsigned long addr, int idx, void *arg),
        void *arg)
 {
-       int i, n, base, l;
+       int i, n, base, l, ret;
        unsigned long addr = addr_hint &
                ~(((unsigned long)pages_per_huge_page << PAGE_SHIFT) - 1);
 
@@ -5653,7 +5760,9 @@ static inline void process_huge_page(
                /* Process subpages at the end of huge page */
                for (i = pages_per_huge_page - 1; i >= 2 * n; i--) {
                        cond_resched();
-                       process_subpage(addr + i * PAGE_SIZE, i, arg);
+                       ret = process_subpage(addr + i * PAGE_SIZE, i, arg);
+                       if (ret)
+                               return ret;
                }
        } else {
                /* If target subpage in second half of huge page */
@@ -5662,7 +5771,9 @@ static inline void process_huge_page(
                /* Process subpages at the begin of huge page */
                for (i = 0; i < base; i++) {
                        cond_resched();
-                       process_subpage(addr + i * PAGE_SIZE, i, arg);
+                       ret = process_subpage(addr + i * PAGE_SIZE, i, arg);
+                       if (ret)
+                               return ret;
                }
        }
        /*
@@ -5674,10 +5785,15 @@ static inline void process_huge_page(
                int right_idx = base + 2 * l - 1 - i;
 
                cond_resched();
-               process_subpage(addr + left_idx * PAGE_SIZE, left_idx, arg);
+               ret = process_subpage(addr + left_idx * PAGE_SIZE, left_idx, arg);
+               if (ret)
+                       return ret;
                cond_resched();
-               process_subpage(addr + right_idx * PAGE_SIZE, right_idx, arg);
+               ret = process_subpage(addr + right_idx * PAGE_SIZE, right_idx, arg);
+               if (ret)
+                       return ret;
        }
+       return 0;
 }
 
 static void clear_gigantic_page(struct page *page,
@@ -5695,11 +5811,12 @@ static void clear_gigantic_page(struct page *page,
        }
 }
 
-static void clear_subpage(unsigned long addr, int idx, void *arg)
+static int clear_subpage(unsigned long addr, int idx, void *arg)
 {
        struct page *page = arg;
 
        clear_user_highpage(page + idx, addr);
+       return 0;
 }
 
 void clear_huge_page(struct page *page,
@@ -5716,22 +5833,27 @@ void clear_huge_page(struct page *page,
        process_huge_page(addr_hint, pages_per_huge_page, clear_subpage, page);
 }
 
-static void copy_user_gigantic_page(struct page *dst, struct page *src,
-                                   unsigned long addr,
-                                   struct vm_area_struct *vma,
-                                   unsigned int pages_per_huge_page)
+static int copy_user_gigantic_page(struct folio *dst, struct folio *src,
+                                    unsigned long addr,
+                                    struct vm_area_struct *vma,
+                                    unsigned int pages_per_huge_page)
 {
        int i;
-       struct page *dst_base = dst;
-       struct page *src_base = src;
+       struct page *dst_page;
+       struct page *src_page;
 
        for (i = 0; i < pages_per_huge_page; i++) {
-               dst = nth_page(dst_base, i);
-               src = nth_page(src_base, i);
+               dst_page = folio_page(dst, i);
+               src_page = folio_page(src, i);
 
                cond_resched();
-               copy_user_highpage(dst, src, addr + i*PAGE_SIZE, vma);
+               if (copy_mc_user_highpage(dst_page, src_page,
+                                         addr + i*PAGE_SIZE, vma)) {
+                       memory_failure_queue(page_to_pfn(src_page), 0);
+                       return -EHWPOISON;
+               }
        }
+       return 0;
 }
 
 struct copy_subpage_arg {
@@ -5740,57 +5862,56 @@ struct copy_subpage_arg {
        struct vm_area_struct *vma;
 };
 
-static void copy_subpage(unsigned long addr, int idx, void *arg)
+static int copy_subpage(unsigned long addr, int idx, void *arg)
 {
        struct copy_subpage_arg *copy_arg = arg;
 
-       copy_user_highpage(copy_arg->dst + idx, copy_arg->src + idx,
-                          addr, copy_arg->vma);
+       if (copy_mc_user_highpage(copy_arg->dst + idx, copy_arg->src + idx,
+                                 addr, copy_arg->vma)) {
+               memory_failure_queue(page_to_pfn(copy_arg->src + idx), 0);
+               return -EHWPOISON;
+       }
+       return 0;
 }
 
-void copy_user_huge_page(struct page *dst, struct page *src,
-                        unsigned long addr_hint, struct vm_area_struct *vma,
-                        unsigned int pages_per_huge_page)
+int copy_user_large_folio(struct folio *dst, struct folio *src,
+                         unsigned long addr_hint, struct vm_area_struct *vma)
 {
+       unsigned int pages_per_huge_page = folio_nr_pages(dst);
        unsigned long addr = addr_hint &
                ~(((unsigned long)pages_per_huge_page << PAGE_SHIFT) - 1);
        struct copy_subpage_arg arg = {
-               .dst = dst,
-               .src = src,
+               .dst = &dst->page,
+               .src = &src->page,
                .vma = vma,
        };
 
-       if (unlikely(pages_per_huge_page > MAX_ORDER_NR_PAGES)) {
-               copy_user_gigantic_page(dst, src, addr, vma,
-                                       pages_per_huge_page);
-               return;
-       }
+       if (unlikely(pages_per_huge_page > MAX_ORDER_NR_PAGES))
+               return copy_user_gigantic_page(dst, src, addr, vma,
+                                              pages_per_huge_page);
 
-       process_huge_page(addr_hint, pages_per_huge_page, copy_subpage, &arg);
+       return process_huge_page(addr_hint, pages_per_huge_page, copy_subpage, &arg);
 }
 
-long copy_huge_page_from_user(struct page *dst_page,
-                               const void __user *usr_src,
-                               unsigned int pages_per_huge_page,
-                               bool allow_pagefault)
+long copy_folio_from_user(struct folio *dst_folio,
+                          const void __user *usr_src,
+                          bool allow_pagefault)
 {
-       void *page_kaddr;
+       void *kaddr;
        unsigned long i, rc = 0;
-       unsigned long ret_val = pages_per_huge_page * PAGE_SIZE;
+       unsigned int nr_pages = folio_nr_pages(dst_folio);
+       unsigned long ret_val = nr_pages * PAGE_SIZE;
        struct page *subpage;
 
-       for (i = 0; i < pages_per_huge_page; i++) {
-               subpage = nth_page(dst_page, i);
-               if (allow_pagefault)
-                       page_kaddr = kmap(subpage);
-               else
-                       page_kaddr = kmap_atomic(subpage);
-               rc = copy_from_user(page_kaddr,
-                               usr_src + i * PAGE_SIZE, PAGE_SIZE);
-               if (allow_pagefault)
-                       kunmap(subpage);
-               else
-                       kunmap_atomic(page_kaddr);
+       for (i = 0; i < nr_pages; i++) {
+               subpage = folio_page(dst_folio, i);
+               kaddr = kmap_local_page(subpage);
+               if (!allow_pagefault)
+                       pagefault_disable();
+               rc = copy_from_user(kaddr, usr_src + i * PAGE_SIZE, PAGE_SIZE);
+               if (!allow_pagefault)
+                       pagefault_enable();
+               kunmap_local(kaddr);
 
                ret_val -= (PAGE_SIZE - rc);
                if (rc)