mm/memory.c: fix race when faulting a device private page
diff --git a/mm/memory.c b/mm/memory.c
index 4ba73f5aa8bb7771c584ac7097c66fba8222ddaa..4ad6077164cd2f35f548e5a17ceccba69dd64948 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -52,6 +52,7 @@
 #include <linux/highmem.h>
 #include <linux/pagemap.h>
 #include <linux/memremap.h>
+#include <linux/kmsan.h>
 #include <linux/ksm.h>
 #include <linux/rmap.h>
 #include <linux/export.h>
@@ -66,6 +67,7 @@
 #include <linux/gfp.h>
 #include <linux/migrate.h>
 #include <linux/string.h>
+#include <linux/memory-tiers.h>
 #include <linux/debugfs.h>
 #include <linux/userfaultfd_k.h>
 #include <linux/dax.h>
@@ -74,6 +76,7 @@
 #include <linux/perf_event.h>
 #include <linux/ptrace.h>
 #include <linux/vmalloc.h>
+#include <linux/sched/sysctl.h>
 
 #include <trace/events/kmem.h>
 
@@ -125,18 +128,6 @@ int randomize_va_space __read_mostly =
                                        2;
 #endif
 
-#ifndef arch_faults_on_old_pte
-static inline bool arch_faults_on_old_pte(void)
-{
-       /*
-        * Those arches which don't have hw access flag feature need to
-        * implement their own helper. By default, "true" means pagefault
-        * will be hit on old pte.
-        */
-       return true;
-}
-#endif
-
 #ifndef arch_wants_old_prefaulted_pte
 static inline bool arch_wants_old_prefaulted_pte(void)
 {
@@ -402,12 +393,21 @@ void free_pgd_range(struct mmu_gather *tlb,
        } while (pgd++, addr = next, addr != end);
 }
 
-void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *vma,
-               unsigned long floor, unsigned long ceiling)
+void free_pgtables(struct mmu_gather *tlb, struct maple_tree *mt,
+                  struct vm_area_struct *vma, unsigned long floor,
+                  unsigned long ceiling)
 {
-       while (vma) {
-               struct vm_area_struct *next = vma->vm_next;
+       MA_STATE(mas, mt, vma->vm_end, vma->vm_end);
+
+       do {
                unsigned long addr = vma->vm_start;
+               struct vm_area_struct *next;
+
+               /*
+                * Note: USER_PGTABLES_CEILING may be passed as ceiling and may
+                * be 0.  This will underflow and is okay.
+                */
+               next = mas_find(&mas, ceiling - 1);
 
                /*
                 * Hide vma from rmap and truncate_pagecache before freeing
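[annotation] The `ceiling - 1` limit passed to mas_find() in this function depends on the comment above: unsigned wraparound is well defined in C, so when USER_PGTABLES_CEILING is 0 the limit becomes ULONG_MAX and the walk covers the rest of the address space. A standalone illustration of that property (plain userspace C, not kernel code):

	#include <assert.h>
	#include <limits.h>

	int main(void)
	{
		unsigned long ceiling = 0;	/* USER_PGTABLES_CEILING may be 0 */

		/* unsigned underflow is defined: 0UL - 1 wraps to ULONG_MAX */
		assert(ceiling - 1 == ULONG_MAX);
		return 0;
	}
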
@@ -426,7 +426,7 @@ void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *vma,
                        while (next && next->vm_start <= vma->vm_end + PMD_SIZE
                               && !is_vm_hugetlb_page(next)) {
                                vma = next;
-                               next = vma->vm_next;
+                               next = mas_find(&mas, ceiling - 1);
                                unlink_anon_vmas(vma);
                                unlink_file_vma(vma);
                        }
@@ -434,7 +434,7 @@ void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *vma,
                                floor, next ? next->vm_start : ceiling);
                }
                vma = next;
-       }
+       } while (vma);
 }
 
 void pmd_install(struct mm_struct *mm, pmd_t *pmd, pgtable_t *pte)
@@ -1393,10 +1393,12 @@ zap_install_uffd_wp_if_needed(struct vm_area_struct *vma,
                              unsigned long addr, pte_t *pte,
                              struct zap_details *details, pte_t pteval)
 {
+#ifdef CONFIG_PTE_MARKER_UFFD_WP
        if (zap_drop_file_uffd_wp(details))
                return;
 
        pte_install_uffd_wp_if_needed(vma, addr, pte, pteval);
+#endif
 }
 
 static unsigned long zap_pte_range(struct mmu_gather *tlb,
@@ -1685,10 +1687,8 @@ static void unmap_single_vma(struct mmu_gather *tlb,
                        if (vma->vm_file) {
                                zap_flags_t zap_flags = details ?
                                    details->zap_flags : 0;
-                               i_mmap_lock_write(vma->vm_file->f_mapping);
                                __unmap_hugepage_range_final(tlb, vma, start, end,
                                                             NULL, zap_flags);
-                               i_mmap_unlock_write(vma->vm_file->f_mapping);
                        }
                } else
                        unmap_page_range(tlb, vma, start, end, details);
@@ -1698,6 +1698,7 @@ static void unmap_single_vma(struct mmu_gather *tlb,
 /**
  * unmap_vmas - unmap a range of memory covered by a list of vma's
  * @tlb: address of the caller's struct mmu_gather
+ * @mt: the maple tree
  * @vma: the starting vma
  * @start_addr: virtual address at which to start unmapping
  * @end_addr: virtual address at which to end unmapping
@@ -1713,7 +1714,7 @@ static void unmap_single_vma(struct mmu_gather *tlb,
  * ensure that any thus-far unmapped pages are flushed before unmap_vmas()
  * drops the lock and schedules.
  */
-void unmap_vmas(struct mmu_gather *tlb,
+void unmap_vmas(struct mmu_gather *tlb, struct maple_tree *mt,
                struct vm_area_struct *vma, unsigned long start_addr,
                unsigned long end_addr)
 {
@@ -1723,12 +1724,14 @@ void unmap_vmas(struct mmu_gather *tlb,
                /* Careful - we need to zap private pages too! */
                .even_cows = true,
        };
+       MA_STATE(mas, mt, vma->vm_end, vma->vm_end);
 
        mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, vma, vma->vm_mm,
                                start_addr, end_addr);
        mmu_notifier_invalidate_range_start(&range);
-       for ( ; vma && vma->vm_start < end_addr; vma = vma->vm_next)
+       do {
                unmap_single_vma(tlb, vma, start_addr, end_addr, &details);
+       } while ((vma = mas_find(&mas, end_addr - 1)) != NULL);
        mmu_notifier_invalidate_range_end(&range);
 }
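[annotation] Every loop conversion in this patch, above and below, follows the same shape: the old `vma = vma->vm_next` list walk becomes a maple tree cursor primed just past the starting vma and advanced with mas_find(). A sketch of the pattern, assuming the in-kernel maple tree API of this series (an outline, not a standalone program; `mt`, `first_vma` and `limit` are placeholders):

	struct vm_area_struct *vma = first_vma;
	/* cursor starts at first_vma->vm_end, i.e. just past this vma */
	MA_STATE(mas, mt, vma->vm_end, vma->vm_end);

	do {
		/* ... operate on vma ... */

		/* next entry with an index <= limit - 1, or NULL when done */
	} while ((vma = mas_find(&mas, limit - 1)) != NULL);
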
 
@@ -1743,8 +1746,11 @@ void unmap_vmas(struct mmu_gather *tlb,
 void zap_page_range(struct vm_area_struct *vma, unsigned long start,
                unsigned long size)
 {
+       struct maple_tree *mt = &vma->vm_mm->mm_mt;
+       unsigned long end = start + size;
        struct mmu_notifier_range range;
        struct mmu_gather tlb;
+       MA_STATE(mas, mt, vma->vm_end, vma->vm_end);
 
        lru_add_drain();
        mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm,
@@ -1752,8 +1758,9 @@ void zap_page_range(struct vm_area_struct *vma, unsigned long start,
        tlb_gather_mmu(&tlb, vma->vm_mm);
        update_hiwater_rss(vma->vm_mm);
        mmu_notifier_invalidate_range_start(&range);
-       for ( ; vma && vma->vm_start < range.end; vma = vma->vm_next)
+       do {
                unmap_single_vma(&tlb, vma, start, range.end, NULL);
+       } while ((vma = mas_find(&mas, end - 1)) != NULL);
        mmu_notifier_invalidate_range_end(&range);
        tlb_finish_mmu(&tlb);
 }
@@ -2870,7 +2877,7 @@ static inline bool __wp_page_copy_user(struct page *dst, struct page *src,
         * On architectures with software "accessed" bits, we would
         * take a double page fault, so mark it accessed here.
         */
-       if (arch_faults_on_old_pte() && !pte_young(vmf->orig_pte)) {
+       if (!arch_has_hw_pte_young() && !pte_young(vmf->orig_pte)) {
                pte_t entry;
 
                vmf->pte = pte_offset_map_lock(mm, vmf->pmd, addr, &vmf->ptl);
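[annotation] This hunk pairs with the removal of arch_faults_on_old_pte() near the top of the file: the old default returned true for architectures without a hardware-managed accessed bit, so the two predicates are logical inverses and the substitution preserves behavior:

	/*
	 * arch_faults_on_old_pte() == !arch_has_hw_pte_young()
	 *
	 * With a hardware-managed young bit the CPU sets the accessed flag
	 * itself and never double-faults on an old PTE, so the software
	 * pre-marking here is only needed when the helper returns false.
	 */
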
@@ -3128,6 +3135,7 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf)
                        delayacct_wpcopy_end();
                        return 0;
                }
+               kmsan_copy_page_meta(new_page, old_page);
        }
 
        if (mem_cgroup_charge(page_folio(new_page), mm, GFP_KERNEL))
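[annotation] KMSAN tracks per-page shadow (which bytes are initialized) and origin metadata alongside the data itself; copying only the data during wp_page_copy() would leave the new page looking uninitialized and produce false reports, hence the metadata copy once the user copy has succeeded. Conceptually (a sketch of intent, not the real implementation):

	/*
	 * kmsan_copy_page_meta(new_page, old_page):
	 *	shadow(new_page) = shadow(old_page);
	 *	origin(new_page) = origin(old_page);
	 * so the CoW copy inherits the initializedness of the original.
	 */
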
@@ -3362,6 +3370,7 @@ static vm_fault_t do_wp_page(struct vm_fault *vmf)
 {
        const bool unshare = vmf->flags & FAULT_FLAG_UNSHARE;
        struct vm_area_struct *vma = vmf->vma;
+       struct folio *folio;
 
        VM_BUG_ON(unshare && (vmf->flags & FAULT_FLAG_WRITE));
        VM_BUG_ON(!unshare && !(vmf->flags & FAULT_FLAG_WRITE));
@@ -3408,48 +3417,47 @@ static vm_fault_t do_wp_page(struct vm_fault *vmf)
         * Take out anonymous pages first, anonymous shared vmas are
         * not dirty accountable.
         */
-       if (PageAnon(vmf->page)) {
-               struct page *page = vmf->page;
-
+       folio = page_folio(vmf->page);
+       if (folio_test_anon(folio)) {
                /*
                 * If the page is exclusive to this process we must reuse the
                 * page without further checks.
                 */
-               if (PageAnonExclusive(page))
+               if (PageAnonExclusive(vmf->page))
                        goto reuse;
 
                /*
-                * We have to verify under page lock: these early checks are
-                * just an optimization to avoid locking the page and freeing
+                * We have to verify under folio lock: these early checks are
+                * just an optimization to avoid locking the folio and freeing
                 * the swapcache if there is little hope that we can reuse.
                 *
-                * PageKsm() doesn't necessarily raise the page refcount.
+                * KSM doesn't necessarily raise the folio refcount.
                 */
-               if (PageKsm(page) || page_count(page) > 3)
+               if (folio_test_ksm(folio) || folio_ref_count(folio) > 3)
                        goto copy;
-               if (!PageLRU(page))
+               if (!folio_test_lru(folio))
                        /*
                         * Note: We cannot easily detect+handle references from
-                        * remote LRU pagevecs or references to PageLRU() pages.
+                        * remote LRU pagevecs or references to LRU folios.
                         */
                        lru_add_drain();
-               if (page_count(page) > 1 + PageSwapCache(page))
+               if (folio_ref_count(folio) > 1 + folio_test_swapcache(folio))
                        goto copy;
-               if (!trylock_page(page))
+               if (!folio_trylock(folio))
                        goto copy;
-               if (PageSwapCache(page))
-                       try_to_free_swap(page);
-               if (PageKsm(page) || page_count(page) != 1) {
-                       unlock_page(page);
+               if (folio_test_swapcache(folio))
+                       folio_free_swap(folio);
+               if (folio_test_ksm(folio) || folio_ref_count(folio) != 1) {
+                       folio_unlock(folio);
                        goto copy;
                }
                /*
-                * Ok, we've got the only page reference from our mapping
-                * and the page is locked, it's dark out, and we're wearing
+                * Ok, we've got the only folio reference from our mapping
+                * and the folio is locked, it's dark out, and we're wearing
                 * sunglasses. Hit it.
                 */
-               page_move_anon_rmap(page, vma);
-               unlock_page(page);
+               page_move_anon_rmap(vmf->page, vma);
+               folio_unlock(folio);
 reuse:
                if (unlikely(unshare)) {
                        pte_unmap_unlock(vmf->pte, vmf->ptl);
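[annotation] The reference budget behind those early bail-outs, as a reading of the checks above (not new rules): one reference for our page table mapping, one for the swapcache if the folio is still in it, and one for a transient LRU pagevec entry.

	/*
	 * > 3				before draining pagevecs
	 * > 1 + folio_test_swapcache()	after draining
	 * != 1				once locked and swap is freed
	 *
	 * Anything beyond the expected references means another user, so
	 * the fault must fall through to "copy" instead of reusing.
	 */
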
@@ -3612,11 +3620,11 @@ EXPORT_SYMBOL(unmap_mapping_range);
  */
 static vm_fault_t remove_device_exclusive_entry(struct vm_fault *vmf)
 {
-       struct page *page = vmf->page;
+       struct folio *folio = page_folio(vmf->page);
        struct vm_area_struct *vma = vmf->vma;
        struct mmu_notifier_range range;
 
-       if (!lock_page_or_retry(page, vma->vm_mm, vmf->flags))
+       if (!folio_lock_or_retry(folio, vma->vm_mm, vmf->flags))
                return VM_FAULT_RETRY;
        mmu_notifier_range_init_owner(&range, MMU_NOTIFY_EXCLUSIVE, 0, vma,
                                vma->vm_mm, vmf->address & PAGE_MASK,
@@ -3626,23 +3634,23 @@ static vm_fault_t remove_device_exclusive_entry(struct vm_fault *vmf)
        vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address,
                                &vmf->ptl);
        if (likely(pte_same(*vmf->pte, vmf->orig_pte)))
-               restore_exclusive_pte(vma, page, vmf->address, vmf->pte);
+               restore_exclusive_pte(vma, vmf->page, vmf->address, vmf->pte);
 
        pte_unmap_unlock(vmf->pte, vmf->ptl);
-       unlock_page(page);
+       folio_unlock(folio);
 
        mmu_notifier_invalidate_range_end(&range);
        return 0;
 }
 
-static inline bool should_try_to_free_swap(struct page *page,
+static inline bool should_try_to_free_swap(struct folio *folio,
                                           struct vm_area_struct *vma,
                                           unsigned int fault_flags)
 {
-       if (!PageSwapCache(page))
+       if (!folio_test_swapcache(folio))
                return false;
-       if (mem_cgroup_swap_full(page) || (vma->vm_flags & VM_LOCKED) ||
-           PageMlocked(page))
+       if (mem_cgroup_swap_full(folio) || (vma->vm_flags & VM_LOCKED) ||
+           folio_test_mlocked(folio))
                return true;
        /*
         * If we want to map a page that's in the swapcache writable, we
@@ -3650,8 +3658,8 @@ static inline bool should_try_to_free_swap(struct page *page,
         * user. Try freeing the swapcache to get rid of the swapcache
         * reference only in case it's likely that we'll be the exclusive user.
         */
-       return (fault_flags & FAULT_FLAG_WRITE) && !PageKsm(page) &&
-               page_count(page) == 2;
+       return (fault_flags & FAULT_FLAG_WRITE) && !folio_test_ksm(folio) &&
+               folio_ref_count(folio) == 2;
 }
 
 static vm_fault_t pte_marker_clear(struct vm_fault *vmf)
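[annotation] A note on the refcount arithmetic in the `== 2` test above (annotation, not part of the patch):

	/*
	 * Expected references at this point:
	 *	1 - taken by this fault path when it found the page
	 *	1 - held by the swapcache itself
	 * The PTE has not been installed yet; anything more means another
	 * user, and freeing the swapcache would not make us exclusive.
	 */
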
@@ -3718,7 +3726,8 @@ static vm_fault_t handle_pte_marker(struct vm_fault *vmf)
 vm_fault_t do_swap_page(struct vm_fault *vmf)
 {
        struct vm_area_struct *vma = vmf->vma;
-       struct page *page = NULL, *swapcache;
+       struct folio *swapcache, *folio = NULL;
+       struct page *page;
        struct swap_info_struct *si = NULL;
        rmap_t rmap_flags = RMAP_NONE;
        bool exclusive = false;
@@ -3741,7 +3750,21 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
                        ret = remove_device_exclusive_entry(vmf);
                } else if (is_device_private_entry(entry)) {
                        vmf->page = pfn_swap_entry_to_page(entry);
-                       ret = vmf->page->pgmap->ops->migrate_to_ram(vmf);
+                       vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd,
+                                       vmf->address, &vmf->ptl);
+                       if (unlikely(!pte_same(*vmf->pte, vmf->orig_pte))) {
+                               spin_unlock(vmf->ptl);
+                               goto out;
+                       }
+
+                       /*
+                        * Get a page reference while we know the page can't be
+                        * freed.
+                        */
+                       get_page(vmf->page);
+                       pte_unmap_unlock(vmf->pte, vmf->ptl);
+                       vmf->page->pgmap->ops->migrate_to_ram(vmf);
+                       put_page(vmf->page);
                } else if (is_hwpoison_entry(entry)) {
                        ret = VM_FAULT_HWPOISON;
                } else if (is_swapin_error_entry(entry)) {
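[annotation] This hunk is the race fix named in the title. Previously ->migrate_to_ram() was called on vmf->page without any reference held, so a concurrent unmap could free the device private page while the driver callback was still using it. The fix re-validates the PTE and pins the page under the page table lock; the same lines as above, annotated:

	vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd,
				       vmf->address, &vmf->ptl);
	if (unlikely(!pte_same(*vmf->pte, vmf->orig_pte))) {
		/* entry changed under us: someone else already handled it */
		spin_unlock(vmf->ptl);
		goto out;
	}

	/*
	 * While the PTL is held and the PTE still maps this entry, the
	 * page cannot be freed, so the reference is taken race-free.
	 */
	get_page(vmf->page);
	pte_unmap_unlock(vmf->pte, vmf->ptl);
	vmf->page->pgmap->ops->migrate_to_ram(vmf);
	put_page(vmf->page);	/* drop the pin taken above */
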
@@ -3760,21 +3783,25 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
        if (unlikely(!si))
                goto out;
 
-       page = lookup_swap_cache(entry, vma, vmf->address);
-       swapcache = page;
+       folio = swap_cache_get_folio(entry, vma, vmf->address);
+       if (folio)
+               page = folio_file_page(folio, swp_offset(entry));
+       swapcache = folio;
 
-       if (!page) {
+       if (!folio) {
                if (data_race(si->flags & SWP_SYNCHRONOUS_IO) &&
                    __swap_count(entry) == 1) {
                        /* skip swapcache */
-                       page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma,
-                                                       vmf->address);
-                       if (page) {
-                               __SetPageLocked(page);
-                               __SetPageSwapBacked(page);
-
-                               if (mem_cgroup_swapin_charge_page(page,
-                                       vma->vm_mm, GFP_KERNEL, entry)) {
+                       folio = vma_alloc_folio(GFP_HIGHUSER_MOVABLE, 0,
+                                               vma, vmf->address, false);
+                       page = &folio->page;
+                       if (folio) {
+                               __folio_set_locked(folio);
+                               __folio_set_swapbacked(folio);
+
+                               if (mem_cgroup_swapin_charge_folio(folio,
+                                                       vma->vm_mm, GFP_KERNEL,
+                                                       entry)) {
                                        ret = VM_FAULT_OOM;
                                        goto out_page;
                                }
@@ -3782,23 +3809,24 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
 
                                shadow = get_shadow_from_swap_cache(entry);
                                if (shadow)
-                                       workingset_refault(page_folio(page),
-                                                               shadow);
+                                       workingset_refault(folio, shadow);
 
-                               lru_cache_add(page);
+                               folio_add_lru(folio);
 
                                /* To provide entry to swap_readpage() */
-                               set_page_private(page, entry.val);
+                               folio_set_swap_entry(folio, entry);
                                swap_readpage(page, true, NULL);
-                               set_page_private(page, 0);
+                               folio->private = NULL;
                        }
                } else {
                        page = swapin_readahead(entry, GFP_HIGHUSER_MOVABLE,
                                                vmf);
-                       swapcache = page;
+                       if (page)
+                               folio = page_folio(page);
+                       swapcache = folio;
                }
 
-               if (!page) {
+               if (!folio) {
                        /*
                         * Back out if somebody else faulted in this pte
                         * while we released the pte lock.
@@ -3823,7 +3851,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
                goto out_release;
        }
 
-       locked = lock_page_or_retry(page, vma->vm_mm, vmf->flags);
+       locked = folio_lock_or_retry(folio, vma->vm_mm, vmf->flags);
 
        if (!locked) {
                ret |= VM_FAULT_RETRY;
@@ -3832,13 +3860,13 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
 
        if (swapcache) {
                /*
-                * Make sure try_to_free_swap or swapoff did not release the
+                * Make sure folio_free_swap() or swapoff did not release the
                 * swapcache from under us.  The page pin, and pte_same test
                 * below, are not enough to exclude that.  Even if it is still
                 * swapcache, we need to check that the page's swap has not
                 * changed.
                 */
-               if (unlikely(!PageSwapCache(page) ||
+               if (unlikely(!folio_test_swapcache(folio) ||
                             page_private(page) != entry.val))
                        goto out_page;
 
@@ -3850,9 +3878,9 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
                page = ksm_might_need_to_copy(page, vma, vmf->address);
                if (unlikely(!page)) {
                        ret = VM_FAULT_OOM;
-                       page = swapcache;
                        goto out_page;
                }
+               folio = page_folio(page);
 
                /*
                 * If we want to map a page that's in the swapcache writable, we
@@ -3860,8 +3888,8 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
                 * owner. Try removing the extra reference from the local LRU
                 * pagevecs if required.
                 */
-               if ((vmf->flags & FAULT_FLAG_WRITE) && page == swapcache &&
-                   !PageKsm(page) && !PageLRU(page))
+               if ((vmf->flags & FAULT_FLAG_WRITE) && folio == swapcache &&
+                   !folio_test_ksm(folio) && !folio_test_lru(folio))
                        lru_add_drain();
        }
 
@@ -3875,7 +3903,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
        if (unlikely(!pte_same(*vmf->pte, vmf->orig_pte)))
                goto out_nomap;
 
-       if (unlikely(!PageUptodate(page))) {
+       if (unlikely(!folio_test_uptodate(folio))) {
                ret = VM_FAULT_SIGBUS;
                goto out_nomap;
        }
@@ -3888,26 +3916,26 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
         * check after taking the PT lock and making sure that nobody
         * concurrently faulted in this page and set PG_anon_exclusive.
         */
-       BUG_ON(!PageAnon(page) && PageMappedToDisk(page));
-       BUG_ON(PageAnon(page) && PageAnonExclusive(page));
+       BUG_ON(!folio_test_anon(folio) && folio_test_mappedtodisk(folio));
+       BUG_ON(folio_test_anon(folio) && PageAnonExclusive(page));
 
        /*
         * Check under PT lock (to protect against concurrent fork() sharing
         * the swap entry concurrently) for certainly exclusive pages.
         */
-       if (!PageKsm(page)) {
+       if (!folio_test_ksm(folio)) {
                /*
                 * Note that pte_swp_exclusive() == false for architectures
                 * without __HAVE_ARCH_PTE_SWP_EXCLUSIVE.
                 */
                exclusive = pte_swp_exclusive(vmf->orig_pte);
-               if (page != swapcache) {
+               if (folio != swapcache) {
                        /*
                         * We have a fresh page that is not exposed to the
                         * swapcache -> certainly exclusive.
                         */
                        exclusive = true;
-               } else if (exclusive && PageWriteback(page) &&
+               } else if (exclusive && folio_test_writeback(folio) &&
                          data_race(si->flags & SWP_STABLE_WRITES)) {
                        /*
                         * This is tricky: not all swap backends support
@@ -3937,8 +3965,8 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
         * yet.
         */
        swap_free(entry);
-       if (should_try_to_free_swap(page, vma, vmf->flags))
-               try_to_free_swap(page);
+       if (should_try_to_free_swap(folio, vma, vmf->flags))
+               folio_free_swap(folio);
 
        inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES);
        dec_mm_counter_fast(vma->vm_mm, MM_SWAPENTS);
@@ -3950,7 +3978,8 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
         * exposing them to the swapcache or because the swap entry indicates
         * exclusivity.
         */
-       if (!PageKsm(page) && (exclusive || page_count(page) == 1)) {
+       if (!folio_test_ksm(folio) &&
+           (exclusive || folio_ref_count(folio) == 1)) {
                if (vmf->flags & FAULT_FLAG_WRITE) {
                        pte = maybe_mkwrite(pte_mkdirty(pte), vma);
                        vmf->flags &= ~FAULT_FLAG_WRITE;
@@ -3968,19 +3997,20 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
        vmf->orig_pte = pte;
 
        /* ksm created a completely new copy */
-       if (unlikely(page != swapcache && swapcache)) {
+       if (unlikely(folio != swapcache && swapcache)) {
                page_add_new_anon_rmap(page, vma, vmf->address);
-               lru_cache_add_inactive_or_unevictable(page, vma);
+               folio_add_lru_vma(folio, vma);
        } else {
                page_add_anon_rmap(page, vma, vmf->address, rmap_flags);
        }
 
-       VM_BUG_ON(!PageAnon(page) || (pte_write(pte) && !PageAnonExclusive(page)));
+       VM_BUG_ON(!folio_test_anon(folio) ||
+                       (pte_write(pte) && !PageAnonExclusive(page)));
        set_pte_at(vma->vm_mm, vmf->address, vmf->pte, pte);
        arch_do_swap_page(vma->vm_mm, vma, vmf->address, pte, vmf->orig_pte);
 
-       unlock_page(page);
-       if (page != swapcache && swapcache) {
+       folio_unlock(folio);
+       if (folio != swapcache && swapcache) {
                /*
                 * Hold the lock to prevent the swap entry from being reused
                 * until we take the PT lock for the pte_same() check
                 * (to avoid false positives from pte_same). For further
                 * safety, release the lock after the swap_free so that the
                 * swap count won't change under a parallel locked swapcache.
                 */
-               unlock_page(swapcache);
-               put_page(swapcache);
+               folio_unlock(swapcache);
+               folio_put(swapcache);
        }
 
        if (vmf->flags & FAULT_FLAG_WRITE) {
@@ -4011,12 +4041,12 @@ out:
 out_nomap:
        pte_unmap_unlock(vmf->pte, vmf->ptl);
 out_page:
-       unlock_page(page);
+       folio_unlock(folio);
 out_release:
-       put_page(page);
-       if (page != swapcache && swapcache) {
-               unlock_page(swapcache);
-               put_page(swapcache);
+       folio_put(folio);
+       if (folio != swapcache && swapcache) {
+               folio_unlock(swapcache);
+               folio_put(swapcache);
        }
        if (si)
                put_swap_device(si);
@@ -4386,14 +4416,20 @@ vm_fault_t finish_fault(struct vm_fault *vmf)
 
        vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd,
                                      vmf->address, &vmf->ptl);
-       ret = 0;
+
        /* Re-check under ptl */
-       if (likely(!vmf_pte_changed(vmf)))
+       if (likely(!vmf_pte_changed(vmf))) {
                do_set_pte(vmf, page, vmf->address);
-       else
+
+               /* no need to invalidate: a not-present page won't be cached */
+               update_mmu_cache(vma, vmf->address, vmf->pte);
+
+               ret = 0;
+       } else {
+               update_mmu_tlb(vma, vmf->address, vmf->pte);
                ret = VM_FAULT_NOPAGE;
+       }
 
-       update_mmu_tlb(vma, vmf->address, vmf->pte);
        pte_unmap_unlock(vmf->pte, vmf->ptl);
        return ret;
 }
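[annotation] The reshuffle above separates the two arch hooks by outcome (annotation on the existing calls, nothing new):

	/*
	 * update_mmu_cache(): called only after do_set_pte() installed the
	 * entry, letting the architecture preload the new translation; as
	 * the comment says, a previously not-present entry cannot already
	 * be cached, so no invalidation is needed first.
	 *
	 * update_mmu_tlb(): called only on the lost-race path, where a
	 * spurious fault may have left a stale not-present entry in the
	 * TLB that must be flushed.
	 */
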
@@ -4725,8 +4761,16 @@ static vm_fault_t do_numa_page(struct vm_fault *vmf)
        if (page_mapcount(page) > 1 && (vma->vm_flags & VM_SHARED))
                flags |= TNF_SHARED;
 
-       last_cpupid = page_cpupid_last(page);
        page_nid = page_to_nid(page);
+       /*
+        * For memory tiering mode, cpupid of slow memory page is used
+        * to record page access time.  So use default value.
+        */
+       if ((sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING) &&
+           !node_is_toptier(page_nid))
+               last_cpupid = (-1 & LAST_CPUPID_MASK);
+       else
+               last_cpupid = page_cpupid_last(page);
        target_nid = numa_migrate_prep(page, vma, vmf->address, page_nid,
                        &flags);
        if (target_nid == NUMA_NO_NODE) {
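[annotation] `-1 & LAST_CPUPID_MASK` is the all-ones, in-field "invalid cpupid" value: in tiering mode the cpupid bits of a slow-tier page hold an access timestamp, so they must not be interpreted as a cpupid. A standalone illustration (the shift width here is made up; the real one is config-dependent):

	#include <assert.h>

	#define LAST_CPUPID_SHIFT	8	/* illustrative value only */
	#define LAST_CPUPID_MASK	((1 << LAST_CPUPID_SHIFT) - 1)

	int main(void)
	{
		/* all ones within the field, whatever its width */
		assert((-1 & LAST_CPUPID_MASK) == LAST_CPUPID_MASK);
		return 0;
	}
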
@@ -4985,7 +5029,7 @@ static vm_fault_t __handle_mm_fault(struct vm_area_struct *vma,
                return VM_FAULT_OOM;
 retry_pud:
        if (pud_none(*vmf.pud) &&
-           hugepage_vma_check(vma, vm_flags, false, true)) {
+           hugepage_vma_check(vma, vm_flags, false, true, true)) {
                ret = create_huge_pud(&vmf);
                if (!(ret & VM_FAULT_FALLBACK))
                        return ret;
@@ -5019,7 +5063,7 @@ retry_pud:
                goto retry_pud;
 
        if (pmd_none(*vmf.pmd) &&
-           hugepage_vma_check(vma, vm_flags, false, true)) {
+           hugepage_vma_check(vma, vm_flags, false, true, true)) {
                ret = create_huge_pmd(&vmf);
                if (!(ret & VM_FAULT_FALLBACK))
                        return ret;
@@ -5114,6 +5158,27 @@ static inline void mm_account_fault(struct pt_regs *regs,
                perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, regs, address);
 }
 
+#ifdef CONFIG_LRU_GEN
+static void lru_gen_enter_fault(struct vm_area_struct *vma)
+{
+       /* the LRU algorithm doesn't apply to sequential or random reads */
+       current->in_lru_fault = !(vma->vm_flags & (VM_SEQ_READ | VM_RAND_READ));
+}
+
+static void lru_gen_exit_fault(void)
+{
+       current->in_lru_fault = false;
+}
+#else
+static void lru_gen_enter_fault(struct vm_area_struct *vma)
+{
+}
+
+static void lru_gen_exit_fault(void)
+{
+}
+#endif /* CONFIG_LRU_GEN */
+
 /*
  * By the time we get here, we already hold the mm semaphore
  *
@@ -5145,11 +5210,15 @@ vm_fault_t handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
        if (flags & FAULT_FLAG_USER)
                mem_cgroup_enter_user_fault();
 
+       lru_gen_enter_fault(vma);
+
        if (unlikely(is_vm_hugetlb_page(vma)))
                ret = hugetlb_fault(vma->vm_mm, vma, address, flags);
        else
                ret = __handle_mm_fault(vma, address, flags);
 
+       lru_gen_exit_fault();
+
        if (flags & FAULT_FLAG_USER) {
                mem_cgroup_exit_user_fault();
                /*
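[annotation] lru_gen_enter_fault() keys off VM_SEQ_READ and VM_RAND_READ, which userspace sets through madvise(). A minimal example of opting a mapping out of the multi-gen LRU fault accounting (standard Linux userspace API):

	#include <stddef.h>
	#include <sys/mman.h>

	int main(void)
	{
		size_t len = 1 << 20;
		void *buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
				 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

		if (buf == MAP_FAILED)
			return 1;

		/* MADV_SEQUENTIAL sets VM_SEQ_READ on the vma, so faults
		 * in this range run with current->in_lru_fault == false */
		madvise(buf, len, MADV_SEQUENTIAL);

		munmap(buf, len);
		return 0;
	}
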
@@ -5637,11 +5706,11 @@ static void clear_gigantic_page(struct page *page,
                                unsigned int pages_per_huge_page)
 {
        int i;
-       struct page *p = page;
+       struct page *p;
 
        might_sleep();
-       for (i = 0; i < pages_per_huge_page;
-            i++, p = mem_map_next(p, page, i)) {
+       for (i = 0; i < pages_per_huge_page; i++) {
+               p = nth_page(page, i);
                cond_resched();
                clear_user_highpage(p, addr + i * PAGE_SIZE);
        }
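[annotation] mem_map_next() existed because with CONFIG_SPARSEMEM and without CONFIG_SPARSEMEM_VMEMMAP the struct pages of a gigantic page need not be virtually contiguous across memory section boundaries, so plain `page + i` is unsafe. nth_page() encodes the same distinction and lets these loops recompute the subpage from the base each iteration (paraphrasing the include/linux/mm.h definition of this era):

	#if defined(CONFIG_SPARSEMEM) && !defined(CONFIG_SPARSEMEM_VMEMMAP)
	#define nth_page(page, n)	pfn_to_page(page_to_pfn((page)) + (n))
	#else
	#define nth_page(page, n)	((page) + (n))	/* mem_map contiguous */
	#endif
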
@@ -5677,13 +5746,12 @@ static void copy_user_gigantic_page(struct page *dst, struct page *src,
        struct page *dst_base = dst;
        struct page *src_base = src;
 
-       for (i = 0; i < pages_per_huge_page; ) {
+       for (i = 0; i < pages_per_huge_page; i++) {
+               dst = nth_page(dst_base, i);
+               src = nth_page(src_base, i);
+
                cond_resched();
                copy_user_highpage(dst, src, addr + i*PAGE_SIZE, vma);
-
-               i++;
-               dst = mem_map_next(dst, dst_base, i);
-               src = mem_map_next(src, src_base, i);
        }
 }
 
@@ -5730,10 +5798,10 @@ long copy_huge_page_from_user(struct page *dst_page,
        void *page_kaddr;
        unsigned long i, rc = 0;
        unsigned long ret_val = pages_per_huge_page * PAGE_SIZE;
-       struct page *subpage = dst_page;
+       struct page *subpage;
 
-       for (i = 0; i < pages_per_huge_page;
-            i++, subpage = mem_map_next(subpage, dst_page, i)) {
+       for (i = 0; i < pages_per_huge_page; i++) {
+               subpage = nth_page(dst_page, i);
                if (allow_pagefault)
                        page_kaddr = kmap(subpage);
                else