diff --git a/mm/memory.c b/mm/memory.c
index f456f3b5049cf1545e875436cd9138b3c2b6ead4..f69fbc2511984e224ab31f38a6315404b5d902b1 100644
@@ -104,6 +104,20 @@ EXPORT_SYMBOL(mem_map);
 #endif
 
 static vm_fault_t do_fault(struct vm_fault *vmf);
+static vm_fault_t do_anonymous_page(struct vm_fault *vmf);
+static bool vmf_pte_changed(struct vm_fault *vmf);
+
+/*
+ * Return true if the original pte was a uffd-wp pte marker (so the pte was
+ * wr-protected).
+ */
+static bool vmf_orig_pte_uffd_wp(struct vm_fault *vmf)
+{
+       if (!(vmf->flags & FAULT_FLAG_ORIG_PTE_VALID))
+               return false;
+
+       return pte_marker_uffd_wp(vmf->orig_pte);
+}
 
 /*
  * A number of key systems in x86 including ioremap() rely on the assumption
@@ -348,7 +362,7 @@ void free_pgd_range(struct mmu_gather *tlb,
 
 void free_pgtables(struct mmu_gather *tlb, struct maple_tree *mt,
                   struct vm_area_struct *vma, unsigned long floor,
-                  unsigned long ceiling)
+                  unsigned long ceiling, bool mm_wr_locked)
 {
        MA_STATE(mas, mt, vma->vm_end, vma->vm_end);
 
@@ -366,6 +380,8 @@ void free_pgtables(struct mmu_gather *tlb, struct maple_tree *mt,
                 * Hide vma from rmap and truncate_pagecache before freeing
                 * pgtables
                 */
+               if (mm_wr_locked)
+                       vma_start_write(vma);
                unlink_anon_vmas(vma);
                unlink_file_vma(vma);
 
@@ -380,6 +396,8 @@ void free_pgtables(struct mmu_gather *tlb, struct maple_tree *mt,
                               && !is_vm_hugetlb_page(next)) {
                                vma = next;
                                next = mas_find(&mas, ceiling - 1);
+                               if (mm_wr_locked)
+                                       vma_start_write(vma);
                                unlink_anon_vmas(vma);
                                unlink_file_vma(vma);
                        }
@@ -970,7 +988,7 @@ static inline struct folio *page_copy_prealloc(struct mm_struct *src_mm,
                folio_put(new_folio);
                return NULL;
        }
-       cgroup_throttle_swaprate(&new_folio->page, GFP_KERNEL);
+       folio_throttle_swaprate(new_folio, GFP_KERNEL);
 
        return new_folio;
 }
@@ -1290,6 +1308,7 @@ copy_page_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma)
                        continue;
                if (unlikely(copy_p4d_range(dst_vma, src_vma, dst_pgd, src_pgd,
                                            addr, next))) {
+                       untrack_pfn_clear(dst_vma);
                        ret = -ENOMEM;
                        break;
                }
@@ -1345,6 +1364,10 @@ zap_install_uffd_wp_if_needed(struct vm_area_struct *vma,
                              unsigned long addr, pte_t *pte,
                              struct zap_details *details, pte_t pteval)
 {
+       /* Zap on anonymous always means dropping everything */
+       if (vma_is_anonymous(vma))
+               return;
+
        if (zap_drop_file_uffd_wp(details))
                return;
 
@@ -1451,8 +1474,12 @@ again:
                                continue;
                        rss[mm_counter(page)]--;
                } else if (pte_marker_entry_uffd_wp(entry)) {
-                       /* Only drop the uffd-wp marker if explicitly requested */
-                       if (!zap_drop_file_uffd_wp(details))
+                       /*
+                        * For anon: always drop the marker; for file: only
+                        * drop the marker if explicitly requested.
+                        */
+                       if (!vma_is_anonymous(vma) &&
+                           !zap_drop_file_uffd_wp(details))
                                continue;
                } else if (is_hwpoison_entry(entry) ||
                           is_swapin_error_entry(entry)) {
@@ -2142,8 +2169,20 @@ out_unlock:
  * vmf_insert_pfn_prot should only be used if using multiple VMAs is
  * impractical.
  *
- * See vmf_insert_mixed_prot() for a discussion of the implication of using
- * a value of @pgprot different from that of @vma->vm_page_prot.
+ * pgprot typically only differs from @vma->vm_page_prot when drivers set
+ * caching- and encryption bits different than those of @vma->vm_page_prot,
+ * because the caching- or encryption mode may not be known at mmap() time.
+ *
+ * This is ok as long as @vma->vm_page_prot is not used by the core vm
+ * to set caching and encryption bits for those vmas (except for COW pages).
+ * This is ensured by core vm only modifying these page table entries using
+ * functions that don't touch caching- or encryption bits, using pte_modify()
+ * if needed. (See for example mprotect()).
+ *
+ * Also when new page-table entries are created, this is only done using the
+ * fault() callback, and never using the value of vma->vm_page_prot,
+ * except for page-table entries that point to anonymous pages as the result
+ * of COW.
  *
  * Context: Process context.  May allocate using %GFP_KERNEL.
  * Return: vm_fault_t value.
@@ -2218,9 +2257,9 @@ static bool vm_mixed_ok(struct vm_area_struct *vma, pfn_t pfn)
 }
 
 static vm_fault_t __vm_insert_mixed(struct vm_area_struct *vma,
-               unsigned long addr, pfn_t pfn, pgprot_t pgprot,
-               bool mkwrite)
+               unsigned long addr, pfn_t pfn, bool mkwrite)
 {
+       pgprot_t pgprot = vma->vm_page_prot;
        int err;
 
        BUG_ON(!vm_mixed_ok(vma, pfn));
@@ -2263,43 +2302,10 @@ static vm_fault_t __vm_insert_mixed(struct vm_area_struct *vma,
        return VM_FAULT_NOPAGE;
 }
 
-/**
- * vmf_insert_mixed_prot - insert single pfn into user vma with specified pgprot
- * @vma: user vma to map to
- * @addr: target user address of this page
- * @pfn: source kernel pfn
- * @pgprot: pgprot flags for the inserted page
- *
- * This is exactly like vmf_insert_mixed(), except that it allows drivers
- * to override pgprot on a per-page basis.
- *
- * Typically this function should be used by drivers to set caching- and
- * encryption bits different than those of @vma->vm_page_prot, because
- * the caching- or encryption mode may not be known at mmap() time.
- * This is ok as long as @vma->vm_page_prot is not used by the core vm
- * to set caching and encryption bits for those vmas (except for COW pages).
- * This is ensured by core vm only modifying these page table entries using
- * functions that don't touch caching- or encryption bits, using pte_modify()
- * if needed. (See for example mprotect()).
- * Also when new page-table entries are created, this is only done using the
- * fault() callback, and never using the value of vma->vm_page_prot,
- * except for page-table entries that point to anonymous pages as the result
- * of COW.
- *
- * Context: Process context.  May allocate using %GFP_KERNEL.
- * Return: vm_fault_t value.
- */
-vm_fault_t vmf_insert_mixed_prot(struct vm_area_struct *vma, unsigned long addr,
-                                pfn_t pfn, pgprot_t pgprot)
-{
-       return __vm_insert_mixed(vma, addr, pfn, pgprot, false);
-}
-EXPORT_SYMBOL(vmf_insert_mixed_prot);
-
 vm_fault_t vmf_insert_mixed(struct vm_area_struct *vma, unsigned long addr,
                pfn_t pfn)
 {
-       return __vm_insert_mixed(vma, addr, pfn, vma->vm_page_prot, false);
+       return __vm_insert_mixed(vma, addr, pfn, false);
 }
 EXPORT_SYMBOL(vmf_insert_mixed);
 
@@ -2311,7 +2317,7 @@ EXPORT_SYMBOL(vmf_insert_mixed);
 vm_fault_t vmf_insert_mixed_mkwrite(struct vm_area_struct *vma,
                unsigned long addr, pfn_t pfn)
 {
-       return __vm_insert_mixed(vma, addr, pfn, vma->vm_page_prot, true);
+       return __vm_insert_mixed(vma, addr, pfn, true);
 }
 EXPORT_SYMBOL(vmf_insert_mixed_mkwrite);
 
@@ -3091,7 +3097,7 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf)
 
        if (mem_cgroup_charge(new_folio, mm, GFP_KERNEL))
                goto oom_free_new;
-       cgroup_throttle_swaprate(&new_folio->page, GFP_KERNEL);
+       folio_throttle_swaprate(new_folio, GFP_KERNEL);
 
        __folio_mark_uptodate(new_folio);
 
@@ -3563,8 +3569,21 @@ static vm_fault_t remove_device_exclusive_entry(struct vm_fault *vmf)
        struct vm_area_struct *vma = vmf->vma;
        struct mmu_notifier_range range;
 
-       if (!folio_lock_or_retry(folio, vma->vm_mm, vmf->flags))
+       /*
+        * We need a reference to lock the folio because we don't hold
+        * the PTL so a racing thread can remove the device-exclusive
+        * entry and unmap it. If the folio is free the entry must
+        * have been removed already. If it happens to have already
+        * been re-allocated after being freed all we do is lock and
+        * unlock it.
+        */
+       if (!folio_try_get(folio))
+               return 0;
+
+       if (!folio_lock_or_retry(folio, vma->vm_mm, vmf->flags)) {
+               folio_put(folio);
                return VM_FAULT_RETRY;
+       }
        mmu_notifier_range_init_owner(&range, MMU_NOTIFY_EXCLUSIVE, 0,
                                vma->vm_mm, vmf->address & PAGE_MASK,
                                (vmf->address & PAGE_MASK) + PAGE_SIZE, NULL);
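
The same take-a-reference-before-locking pattern, reduced to a minimal generic sketch (not a helper that exists in this file); the real path above uses folio_lock_or_retry() instead of folio_lock() so the fault can bail out with VM_FAULT_RETRY rather than block unconditionally:

static bool get_and_lock_folio(struct folio *folio)
{
	if (!folio_try_get(folio))
		return false;	/* folio was freed: the entry is already gone */
	folio_lock(folio);	/* safe now that we hold a reference */
	return true;
}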
@@ -3577,6 +3596,7 @@ static vm_fault_t remove_device_exclusive_entry(struct vm_fault *vmf)
 
        pte_unmap_unlock(vmf->pte, vmf->ptl);
        folio_unlock(folio);
+       folio_put(folio);
 
        mmu_notifier_invalidate_range_end(&range);
        return 0;
@@ -3619,6 +3639,14 @@ static vm_fault_t pte_marker_clear(struct vm_fault *vmf)
        return 0;
 }
 
+static vm_fault_t do_pte_missing(struct vm_fault *vmf)
+{
+       if (vma_is_anonymous(vmf->vma))
+               return do_anonymous_page(vmf);
+       else
+               return do_fault(vmf);
+}
+
 /*
  * This is actually a page-missing access, but with uffd-wp special pte
  * installed.  It means this pte was wr-protected before being unmapped.
@@ -3629,11 +3657,10 @@ static vm_fault_t pte_marker_handle_uffd_wp(struct vm_fault *vmf)
         * Just in case there're leftover special ptes even after the region
         * got unregistered - we can simply clear them.
         */
-       if (unlikely(!userfaultfd_wp(vmf->vma) || vma_is_anonymous(vmf->vma)))
+       if (unlikely(!userfaultfd_wp(vmf->vma)))
                return pte_marker_clear(vmf);
 
-       /* do_fault() can handle pte markers too like none pte */
-       return do_fault(vmf);
+       return do_pte_missing(vmf);
 }
 
 static vm_fault_t handle_pte_marker(struct vm_fault *vmf)
@@ -3684,6 +3711,11 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
        if (!pte_unmap_same(vmf))
                goto out;
 
+       if (vmf->flags & FAULT_FLAG_VMA_LOCK) {
+               ret = VM_FAULT_RETRY;
+               goto out;
+       }
+
        entry = pte_to_swp_entry(vmf->orig_pte);
        if (unlikely(non_swap_entry(entry))) {
                if (is_migration_entry(entry)) {
@@ -3838,7 +3870,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
                        lru_add_drain();
        }
 
-       cgroup_throttle_swaprate(page, GFP_KERNEL);
+       folio_throttle_swaprate(folio, GFP_KERNEL);
 
        /*
         * Back out if somebody else already faulted in this pte.
@@ -3998,6 +4030,7 @@ out_release:
  */
 static vm_fault_t do_anonymous_page(struct vm_fault *vmf)
 {
+       bool uffd_wp = vmf_orig_pte_uffd_wp(vmf);
        struct vm_area_struct *vma = vmf->vma;
        struct folio *folio;
        vm_fault_t ret = 0;
@@ -4031,7 +4064,7 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf)
                                                vma->vm_page_prot));
                vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd,
                                vmf->address, &vmf->ptl);
-               if (!pte_none(*vmf->pte)) {
+               if (vmf_pte_changed(vmf)) {
                        update_mmu_tlb(vma, vmf->address, vmf->pte);
                        goto unlock;
                }
@@ -4055,7 +4088,7 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf)
 
        if (mem_cgroup_charge(folio, vma->vm_mm, GFP_KERNEL))
                goto oom_free_page;
-       cgroup_throttle_swaprate(&folio->page, GFP_KERNEL);
+       folio_throttle_swaprate(folio, GFP_KERNEL);
 
        /*
         * The memory barrier inside __folio_mark_uptodate makes sure that
@@ -4071,7 +4104,7 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf)
 
        vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address,
                        &vmf->ptl);
-       if (!pte_none(*vmf->pte)) {
+       if (vmf_pte_changed(vmf)) {
                update_mmu_tlb(vma, vmf->address, vmf->pte);
                goto release;
        }
@@ -4091,6 +4124,8 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf)
        folio_add_new_anon_rmap(folio, vma, vmf->address);
        folio_add_lru_vma(folio, vma);
 setpte:
+       if (uffd_wp)
+               entry = pte_mkuffd_wp(entry);
        set_pte_at(vma->vm_mm, vmf->address, vmf->pte, entry);
 
        /* No need to invalidate - it was non-present before */
@@ -4258,7 +4293,7 @@ vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page)
 void do_set_pte(struct vm_fault *vmf, struct page *page, unsigned long addr)
 {
        struct vm_area_struct *vma = vmf->vma;
-       bool uffd_wp = pte_marker_uffd_wp(vmf->orig_pte);
+       bool uffd_wp = vmf_orig_pte_uffd_wp(vmf);
        bool write = vmf->flags & FAULT_FLAG_WRITE;
        bool prefault = vmf->address != addr;
        pte_t entry;
@@ -4372,13 +4407,13 @@ vm_fault_t finish_fault(struct vm_fault *vmf)
        return ret;
 }
 
-static unsigned long fault_around_bytes __read_mostly =
-       rounddown_pow_of_two(65536);
+static unsigned long fault_around_pages __read_mostly =
+       65536 >> PAGE_SHIFT;
 
 #ifdef CONFIG_DEBUG_FS
 static int fault_around_bytes_get(void *data, u64 *val)
 {
-       *val = fault_around_bytes;
+       *val = fault_around_pages << PAGE_SHIFT;
        return 0;
 }
 
@@ -4390,10 +4425,13 @@ static int fault_around_bytes_set(void *data, u64 val)
 {
        if (val / PAGE_SIZE > PTRS_PER_PTE)
                return -EINVAL;
-       if (val > PAGE_SIZE)
-               fault_around_bytes = rounddown_pow_of_two(val);
-       else
-               fault_around_bytes = PAGE_SIZE; /* rounddown_pow_of_two(0) is undefined */
+
+       /*
+        * The minimum value is 1 page; however, this results in no fault-around
+        * at all. See should_fault_around().
+        */
+       fault_around_pages = max(rounddown_pow_of_two(val) >> PAGE_SHIFT, 1UL);
+
        return 0;
 }
 DEFINE_DEBUGFS_ATTRIBUTE(fault_around_bytes_fops,
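
As a worked illustration (assuming 4 KiB pages, so PAGE_SHIFT == 12): the default of 65536 >> PAGE_SHIFT is 16 pages; writing 65536 back through debugfs stores rounddown_pow_of_two(65536) >> 12 = 16 again, while any write below 8192 rounds down to at most one page and is clamped to exactly 1, which should_fault_around() below treats as fault-around disabled.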
@@ -4416,41 +4454,34 @@ late_initcall(fault_around_debugfs);
  * It uses vm_ops->map_pages() to map the pages, which skips the page if it's
  * not ready to be mapped: not up-to-date, locked, etc.
  *
- * This function doesn't cross the VMA boundaries, in order to call map_pages()
- * only once.
+ * This function doesn't cross VMA or page table boundaries, in order to call
+ * map_pages() and acquire a PTE lock only once.
  *
- * fault_around_bytes defines how many bytes we'll try to map.
+ * fault_around_pages defines how many pages we'll try to map.
  * do_fault_around() expects it to be set to a power of two less than or equal
  * to PTRS_PER_PTE.
  *
  * The virtual address of the area that we map is naturally aligned to
- * fault_around_bytes rounded down to the machine page size
+ * fault_around_pages * PAGE_SIZE rounded down to the machine page size
  * (and therefore to page order).  This way it's easier to guarantee
  * that we don't cross page table boundaries.
  */
 static vm_fault_t do_fault_around(struct vm_fault *vmf)
 {
-       unsigned long address = vmf->address, nr_pages, mask;
-       pgoff_t start_pgoff = vmf->pgoff;
-       pgoff_t end_pgoff;
-       int off;
-
-       nr_pages = READ_ONCE(fault_around_bytes) >> PAGE_SHIFT;
-       mask = ~(nr_pages * PAGE_SIZE - 1) & PAGE_MASK;
+       pgoff_t nr_pages = READ_ONCE(fault_around_pages);
+       pgoff_t pte_off = pte_index(vmf->address);
+       /* The page offset of vmf->address within the VMA. */
+       pgoff_t vma_off = vmf->pgoff - vmf->vma->vm_pgoff;
+       pgoff_t from_pte, to_pte;
+       vm_fault_t ret;
 
-       address = max(address & mask, vmf->vma->vm_start);
-       off = ((vmf->address - address) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1);
-       start_pgoff -= off;
+       /* The PTE offset of the start address, clamped to the VMA. */
+       from_pte = max(ALIGN_DOWN(pte_off, nr_pages),
+                      pte_off - min(pte_off, vma_off));
 
-       /*
-        *  end_pgoff is either the end of the page table, the end of
-        *  the vma or nr_pages from start_pgoff, depending what is nearest.
-        */
-       end_pgoff = start_pgoff -
-               ((address >> PAGE_SHIFT) & (PTRS_PER_PTE - 1)) +
-               PTRS_PER_PTE - 1;
-       end_pgoff = min3(end_pgoff, vma_pages(vmf->vma) + vmf->vma->vm_pgoff - 1,
-                       start_pgoff + nr_pages - 1);
+       /* The PTE offset of the end address, clamped to the VMA and PTE. */
+       to_pte = min3(from_pte + nr_pages, (pgoff_t)PTRS_PER_PTE,
+                     pte_off + vma_pages(vmf->vma) - vma_off) - 1;
 
        if (pmd_none(*vmf->pmd)) {
                vmf->prealloc_pte = pte_alloc_one(vmf->vma->vm_mm);
@@ -4458,7 +4489,13 @@ static vm_fault_t do_fault_around(struct vm_fault *vmf)
                        return VM_FAULT_OOM;
        }
 
-       return vmf->vma->vm_ops->map_pages(vmf, start_pgoff, end_pgoff);
+       rcu_read_lock();
+       ret = vmf->vma->vm_ops->map_pages(vmf,
+                       vmf->pgoff + from_pte - pte_off,
+                       vmf->pgoff + to_pte - pte_off);
+       rcu_read_unlock();
+
+       return ret;
 }
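
To make the new index arithmetic concrete, a small worked example with illustrative numbers (4 KiB pages, PTRS_PER_PTE == 512): suppose nr_pages == 16, the faulting address sits at pte_off == 13 within its page table, and the VMA starts 5 pages earlier with vma_pages() == 20, so vma_off == 5. Then from_pte = max(ALIGN_DOWN(13, 16), 13 - 5) = 8 and to_pte = min3(8 + 16, 512, 13 + 20 - 5) - 1 = 23, so map_pages() is asked for the sixteen page offsets vmf->pgoff - 5 through vmf->pgoff + 10, clamped to stay within both the VMA and the current page table.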
 
 /* Return true if we should do read fault-around, false otherwise */
@@ -4471,7 +4508,8 @@ static inline bool should_fault_around(struct vm_fault *vmf)
        if (uffd_disable_fault_around(vmf->vma))
                return false;
 
-       return fault_around_bytes >> PAGE_SHIFT > 1;
+       /* A single page implies no faulting 'around' at all. */
+       return fault_around_pages > 1;
 }
 
 static vm_fault_t do_read_fault(struct vm_fault *vmf)
@@ -4517,7 +4555,7 @@ static vm_fault_t do_cow_fault(struct vm_fault *vmf)
                put_page(vmf->cow_page);
                return VM_FAULT_OOM;
        }
-       cgroup_throttle_swaprate(vmf->cow_page, GFP_KERNEL);
+       folio_throttle_swaprate(page_folio(vmf->cow_page), GFP_KERNEL);
 
        ret = __do_fault(vmf);
        if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
@@ -4637,6 +4675,9 @@ int numa_migrate_prep(struct page *page, struct vm_area_struct *vma,
 {
        get_page(page);
 
+       /* Record the current PID accessing the VMA */

+       vma_set_access_pid_bit(vma);
+
        count_vm_numa_event(NUMA_HINT_FAULTS);
        if (page_nid == numa_node_id()) {
                count_vm_numa_event(NUMA_HINT_FAULTS_LOCAL);
@@ -4902,12 +4943,8 @@ static vm_fault_t handle_pte_fault(struct vm_fault *vmf)
                }
        }
 
-       if (!vmf->pte) {
-               if (vma_is_anonymous(vmf->vma))
-                       return do_anonymous_page(vmf);
-               else
-                       return do_fault(vmf);
-       }
+       if (!vmf->pte)
+               return do_pte_missing(vmf);
 
        if (!pte_present(vmf->orig_pte))
                return do_swap_page(vmf);
@@ -4943,7 +4980,8 @@ static vm_fault_t handle_pte_fault(struct vm_fault *vmf)
                 * with threads.
                 */
                if (vmf->flags & FAULT_FLAG_WRITE)
-                       flush_tlb_fix_spurious_fault(vmf->vma, vmf->address);
+                       flush_tlb_fix_spurious_fault(vmf->vma, vmf->address,
+                                                    vmf->pte);
        }
 unlock:
        pte_unmap_unlock(vmf->pte, vmf->ptl);
@@ -5066,24 +5104,31 @@ retry_pud:
  * updates.  However, note that the handling of PERF_COUNT_SW_PAGE_FAULTS should
  * still be in per-arch page fault handlers at the entry of page fault.
  */
-static inline void mm_account_fault(struct pt_regs *regs,
+static inline void mm_account_fault(struct mm_struct *mm, struct pt_regs *regs,
                                    unsigned long address, unsigned int flags,
                                    vm_fault_t ret)
 {
        bool major;
 
+       /* Incomplete faults will be accounted upon completion. */
+       if (ret & VM_FAULT_RETRY)
+               return;
+
        /*
-        * We don't do accounting for some specific faults:
-        *
-        * - Unsuccessful faults (e.g. when the address wasn't valid).  That
-        *   includes arch_vma_access_permitted() failing before reaching here.
-        *   So this is not a "this many hardware page faults" counter.  We
-        *   should use the hw profiling for that.
-        *
-        * - Incomplete faults (VM_FAULT_RETRY).  They will only be counted
-        *   once they're completed.
+        * To preserve the behavior of older kernels, PGFAULT counters record
+        * both successful and failed faults, as opposed to perf counters,
+        * which ignore failed cases.
+        */
+       count_vm_event(PGFAULT);
+       count_memcg_event_mm(mm, PGFAULT);
+
+       /*
+        * Do not account for unsuccessful faults (e.g. when the address wasn't
+        * valid).  That includes arch_vma_access_permitted() failing before
+        * reaching here. So this is not a "this many hardware page faults"
+        * counter.  We should use the hw profiling for that.
         */
-       if (ret & (VM_FAULT_ERROR | VM_FAULT_RETRY))
+       if (ret & VM_FAULT_ERROR)
                return;
 
        /*
@@ -5166,21 +5211,22 @@ static vm_fault_t sanitize_fault_flags(struct vm_area_struct *vma,
 vm_fault_t handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
                           unsigned int flags, struct pt_regs *regs)
 {
+       /* If the fault handler drops the mmap_lock, vma may be freed */
+       struct mm_struct *mm = vma->vm_mm;
        vm_fault_t ret;
 
        __set_current_state(TASK_RUNNING);
 
-       count_vm_event(PGFAULT);
-       count_memcg_event_mm(vma->vm_mm, PGFAULT);
-
        ret = sanitize_fault_flags(vma, &flags);
        if (ret)
-               return ret;
+               goto out;
 
        if (!arch_vma_access_permitted(vma, flags & FAULT_FLAG_WRITE,
                                            flags & FAULT_FLAG_INSTRUCTION,
-                                           flags & FAULT_FLAG_REMOTE))
-               return VM_FAULT_SIGSEGV;
+                                           flags & FAULT_FLAG_REMOTE)) {
+               ret = VM_FAULT_SIGSEGV;
+               goto out;
+       }
 
        /*
         * Enable the memcg OOM handling for faults triggered in user
@@ -5209,13 +5255,74 @@ vm_fault_t handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
                if (task_in_memcg_oom(current) && !(ret & VM_FAULT_OOM))
                        mem_cgroup_oom_synchronize(false);
        }
-
-       mm_account_fault(regs, address, flags, ret);
+out:
+       mm_account_fault(mm, regs, address, flags, ret);
 
        return ret;
 }
 EXPORT_SYMBOL_GPL(handle_mm_fault);
 
+#ifdef CONFIG_PER_VMA_LOCK
+/*
+ * Lookup and lock a VMA under RCU protection. Returned VMA is guaranteed to be
+ * stable and not isolated. If the VMA is not found or is being modified the
+ * function returns NULL.
+ */
+struct vm_area_struct *lock_vma_under_rcu(struct mm_struct *mm,
+                                         unsigned long address)
+{
+       MA_STATE(mas, &mm->mm_mt, address, address);
+       struct vm_area_struct *vma;
+
+       rcu_read_lock();
+retry:
+       vma = mas_walk(&mas);
+       if (!vma)
+               goto inval;
+
+       /* Only anonymous vmas are supported for now */
+       if (!vma_is_anonymous(vma))
+               goto inval;
+
+       /* find_mergeable_anon_vma uses adjacent vmas which are not locked */
+       if (!vma->anon_vma)
+               goto inval;
+
+       if (!vma_start_read(vma))
+               goto inval;
+
+       /*
+        * Due to the possibility of userfault handler dropping mmap_lock, avoid
+        * it for now and fall back to page fault handling under mmap_lock.
+        */
+       if (userfaultfd_armed(vma)) {
+               vma_end_read(vma);
+               goto inval;
+       }
+
+       /* Check since vm_start/vm_end might change before we lock the VMA */
+       if (unlikely(address < vma->vm_start || address >= vma->vm_end)) {
+               vma_end_read(vma);
+               goto inval;
+       }
+
+       /* Check if the VMA got isolated after we found it */
+       if (vma->detached) {
+               vma_end_read(vma);
+               count_vm_vma_lock_event(VMA_LOCK_MISS);
+               /* The area was replaced with another one */
+               goto retry;
+       }
+
+       rcu_read_unlock();
+       return vma;
+inval:
+       rcu_read_unlock();
+       count_vm_vma_lock_event(VMA_LOCK_ABORT);
+       return NULL;
+}
+#endif /* CONFIG_PER_VMA_LOCK */
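
A hedged sketch of how an architecture's fault path might consume the helper above, loosely modelled on the rest of the per-VMA-lock series; the function name and the fallback policy are illustrative only:

static vm_fault_t do_vma_locked_fault(struct mm_struct *mm, unsigned long address,
				      unsigned int flags, struct pt_regs *regs)
{
	struct vm_area_struct *vma;
	vm_fault_t fault;

	vma = lock_vma_under_rcu(mm, address);
	if (!vma)
		return VM_FAULT_RETRY;	/* fall back to the mmap_lock path */

	fault = handle_mm_fault(vma, address, flags | FAULT_FLAG_VMA_LOCK, regs);
	vma_end_read(vma);
	return fault;
}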
+
 #ifndef __PAGETABLE_P4D_FOLDED
 /*
  * Allocate p4d page table.
@@ -5634,12 +5741,12 @@ EXPORT_SYMBOL(__might_fault);
  * operation.  The target subpage will be processed last to keep its
  * cache lines hot.
  */
-static inline void process_huge_page(
+static inline int process_huge_page(
        unsigned long addr_hint, unsigned int pages_per_huge_page,
-       void (*process_subpage)(unsigned long addr, int idx, void *arg),
+       int (*process_subpage)(unsigned long addr, int idx, void *arg),
        void *arg)
 {
-       int i, n, base, l;
+       int i, n, base, l, ret;
        unsigned long addr = addr_hint &
                ~(((unsigned long)pages_per_huge_page << PAGE_SHIFT) - 1);
 
@@ -5653,7 +5760,9 @@ static inline void process_huge_page(
                /* Process subpages at the end of huge page */
                for (i = pages_per_huge_page - 1; i >= 2 * n; i--) {
                        cond_resched();
-                       process_subpage(addr + i * PAGE_SIZE, i, arg);
+                       ret = process_subpage(addr + i * PAGE_SIZE, i, arg);
+                       if (ret)
+                               return ret;
                }
        } else {
                /* If target subpage in second half of huge page */
@@ -5662,7 +5771,9 @@ static inline void process_huge_page(
                /* Process subpages at the begin of huge page */
                for (i = 0; i < base; i++) {
                        cond_resched();
-                       process_subpage(addr + i * PAGE_SIZE, i, arg);
+                       ret = process_subpage(addr + i * PAGE_SIZE, i, arg);
+                       if (ret)
+                               return ret;
                }
        }
        /*
@@ -5674,10 +5785,15 @@ static inline void process_huge_page(
                int right_idx = base + 2 * l - 1 - i;
 
                cond_resched();
-               process_subpage(addr + left_idx * PAGE_SIZE, left_idx, arg);
+               ret = process_subpage(addr + left_idx * PAGE_SIZE, left_idx, arg);
+               if (ret)
+                       return ret;
                cond_resched();
-               process_subpage(addr + right_idx * PAGE_SIZE, right_idx, arg);
+               ret = process_subpage(addr + right_idx * PAGE_SIZE, right_idx, arg);
+               if (ret)
+                       return ret;
        }
+       return 0;
 }
 
 static void clear_gigantic_page(struct page *page,
@@ -5695,11 +5811,12 @@ static void clear_gigantic_page(struct page *page,
        }
 }
 
-static void clear_subpage(unsigned long addr, int idx, void *arg)
+static int clear_subpage(unsigned long addr, int idx, void *arg)
 {
        struct page *page = arg;
 
        clear_user_highpage(page + idx, addr);
+       return 0;
 }
 
 void clear_huge_page(struct page *page,
@@ -5716,22 +5833,27 @@ void clear_huge_page(struct page *page,
        process_huge_page(addr_hint, pages_per_huge_page, clear_subpage, page);
 }
 
-static void copy_user_gigantic_page(struct page *dst, struct page *src,
-                                   unsigned long addr,
-                                   struct vm_area_struct *vma,
-                                   unsigned int pages_per_huge_page)
+static int copy_user_gigantic_page(struct folio *dst, struct folio *src,
+                                    unsigned long addr,
+                                    struct vm_area_struct *vma,
+                                    unsigned int pages_per_huge_page)
 {
        int i;
-       struct page *dst_base = dst;
-       struct page *src_base = src;
+       struct page *dst_page;
+       struct page *src_page;
 
        for (i = 0; i < pages_per_huge_page; i++) {
-               dst = nth_page(dst_base, i);
-               src = nth_page(src_base, i);
+               dst_page = folio_page(dst, i);
+               src_page = folio_page(src, i);
 
                cond_resched();
-               copy_user_highpage(dst, src, addr + i*PAGE_SIZE, vma);
+               if (copy_mc_user_highpage(dst_page, src_page,
+                                         addr + i*PAGE_SIZE, vma)) {
+                       memory_failure_queue(page_to_pfn(src_page), 0);
+                       return -EHWPOISON;
+               }
        }
+       return 0;
 }
 
 struct copy_subpage_arg {
@@ -5740,57 +5862,56 @@ struct copy_subpage_arg {
        struct vm_area_struct *vma;
 };
 
-static void copy_subpage(unsigned long addr, int idx, void *arg)
+static int copy_subpage(unsigned long addr, int idx, void *arg)
 {
        struct copy_subpage_arg *copy_arg = arg;
 
-       copy_user_highpage(copy_arg->dst + idx, copy_arg->src + idx,
-                          addr, copy_arg->vma);
+       if (copy_mc_user_highpage(copy_arg->dst + idx, copy_arg->src + idx,
+                                 addr, copy_arg->vma)) {
+               memory_failure_queue(page_to_pfn(copy_arg->src + idx), 0);
+               return -EHWPOISON;
+       }
+       return 0;
 }
 
-void copy_user_huge_page(struct page *dst, struct page *src,
-                        unsigned long addr_hint, struct vm_area_struct *vma,
-                        unsigned int pages_per_huge_page)
+int copy_user_large_folio(struct folio *dst, struct folio *src,
+                         unsigned long addr_hint, struct vm_area_struct *vma)
 {
+       unsigned int pages_per_huge_page = folio_nr_pages(dst);
        unsigned long addr = addr_hint &
                ~(((unsigned long)pages_per_huge_page << PAGE_SHIFT) - 1);
        struct copy_subpage_arg arg = {
-               .dst = dst,
-               .src = src,
+               .dst = &dst->page,
+               .src = &src->page,
                .vma = vma,
        };
 
-       if (unlikely(pages_per_huge_page > MAX_ORDER_NR_PAGES)) {
-               copy_user_gigantic_page(dst, src, addr, vma,
-                                       pages_per_huge_page);
-               return;
-       }
+       if (unlikely(pages_per_huge_page > MAX_ORDER_NR_PAGES))
+               return copy_user_gigantic_page(dst, src, addr, vma,
+                                              pages_per_huge_page);
 
-       process_huge_page(addr_hint, pages_per_huge_page, copy_subpage, &arg);
+       return process_huge_page(addr_hint, pages_per_huge_page, copy_subpage, &arg);
 }
 
-long copy_huge_page_from_user(struct page *dst_page,
-                               const void __user *usr_src,
-                               unsigned int pages_per_huge_page,
-                               bool allow_pagefault)
+long copy_folio_from_user(struct folio *dst_folio,
+                          const void __user *usr_src,
+                          bool allow_pagefault)
 {
-       void *page_kaddr;
+       void *kaddr;
        unsigned long i, rc = 0;
-       unsigned long ret_val = pages_per_huge_page * PAGE_SIZE;
+       unsigned int nr_pages = folio_nr_pages(dst_folio);
+       unsigned long ret_val = nr_pages * PAGE_SIZE;
        struct page *subpage;
 
-       for (i = 0; i < pages_per_huge_page; i++) {
-               subpage = nth_page(dst_page, i);
-               if (allow_pagefault)
-                       page_kaddr = kmap(subpage);
-               else
-                       page_kaddr = kmap_atomic(subpage);
-               rc = copy_from_user(page_kaddr,
-                               usr_src + i * PAGE_SIZE, PAGE_SIZE);
-               if (allow_pagefault)
-                       kunmap(subpage);
-               else
-                       kunmap_atomic(page_kaddr);
+       for (i = 0; i < nr_pages; i++) {
+               subpage = folio_page(dst_folio, i);
+               kaddr = kmap_local_page(subpage);
+               if (!allow_pagefault)
+                       pagefault_disable();
+               rc = copy_from_user(kaddr, usr_src + i * PAGE_SIZE, PAGE_SIZE);
+               if (!allow_pagefault)
+                       pagefault_enable();
+               kunmap_local(kaddr);
 
                ret_val -= (PAGE_SIZE - rc);
                if (rc)