mm/memory.c: fix race when faulting a device private page
diff --git a/mm/memory.c b/mm/memory.c
index 4ba73f5aa8bb7771c584ac7097c66fba8222ddaa..4ad6077164cd2f35f548e5a17ceccba69dd64948 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -52,6 +52,7 @@
 #include <linux/highmem.h>
 #include <linux/pagemap.h>
 #include <linux/memremap.h>
+#include <linux/kmsan.h>
 #include <linux/ksm.h>
 #include <linux/rmap.h>
 #include <linux/export.h>
@@ -66,6 +67,7 @@
 #include <linux/gfp.h>
 #include <linux/migrate.h>
 #include <linux/string.h>
+#include <linux/memory-tiers.h>
 #include <linux/debugfs.h>
 #include <linux/userfaultfd_k.h>
 #include <linux/dax.h>
@@ -74,6 +76,7 @@
 #include <linux/perf_event.h>
 #include <linux/ptrace.h>
 #include <linux/vmalloc.h>
+#include <linux/sched/sysctl.h>
 
 #include <trace/events/kmem.h>
 
@@ -125,18 +128,6 @@ int randomize_va_space __read_mostly =
                                        2;
 #endif
 
-#ifndef arch_faults_on_old_pte
-static inline bool arch_faults_on_old_pte(void)
-{
-       /*
-        * Those arches which don't have hw access flag feature need to
-        * implement their own helper. By default, "true" means pagefault
-        * will be hit on old pte.
-        */
-       return true;
-}
-#endif
-
 #ifndef arch_wants_old_prefaulted_pte
 static inline bool arch_wants_old_prefaulted_pte(void)
 {
@@ -402,12 +393,21 @@ void free_pgd_range(struct mmu_gather *tlb,
        } while (pgd++, addr = next, addr != end);
 }
 
-void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *vma,
-               unsigned long floor, unsigned long ceiling)
+void free_pgtables(struct mmu_gather *tlb, struct maple_tree *mt,
+                  struct vm_area_struct *vma, unsigned long floor,
+                  unsigned long ceiling)
 {
-       while (vma) {
-               struct vm_area_struct *next = vma->vm_next;
+       MA_STATE(mas, mt, vma->vm_end, vma->vm_end);
+
+       do {
                unsigned long addr = vma->vm_start;
+               struct vm_area_struct *next;
+
+               /*
+                * Note: USER_PGTABLES_CEILING may be passed as ceiling and may
+                * be 0.  This will underflow and is okay.
+                */
+               next = mas_find(&mas, ceiling - 1);
 
                /*
                 * Hide vma from rmap and truncate_pagecache before freeing
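[annotation] The `ceiling - 1` limit passed to mas_find() in this function depends on the comment above: unsigned wraparound is well defined in C, so when USER_PGTABLES_CEILING is 0 the limit becomes ULONG_MAX and the walk covers the rest of the address space. A standalone illustration of that property (plain userspace C, not kernel code):

	#include <assert.h>
	#include <limits.h>

	int main(void)
	{
		unsigned long ceiling = 0;	/* USER_PGTABLES_CEILING may be 0 */

		/* unsigned underflow is defined: 0UL - 1 wraps to ULONG_MAX */
		assert(ceiling - 1 == ULONG_MAX);
		return 0;
	}
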
@@ -426,7 +426,7 @@ void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *vma,
                        while (next && next->vm_start <= vma->vm_end + PMD_SIZE
                               && !is_vm_hugetlb_page(next)) {
                                vma = next;
-                               next = vma->vm_next;
+                               next = mas_find(&mas, ceiling - 1);
                                unlink_anon_vmas(vma);
                                unlink_file_vma(vma);
                        }
@@ -434,7 +434,7 @@ void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *vma,
                                floor, next ? next->vm_start : ceiling);
                }
                vma = next;
-       }
+       } while (vma);
 }
 
 void pmd_install(struct mm_struct *mm, pmd_t *pmd, pgtable_t *pte)
@@ -1393,10 +1393,12 @@ zap_install_uffd_wp_if_needed(struct vm_area_struct *vma,
                              unsigned long addr, pte_t *pte,
                              struct zap_details *details, pte_t pteval)
 {
+#ifdef CONFIG_PTE_MARKER_UFFD_WP
        if (zap_drop_file_uffd_wp(details))
                return;
 
        pte_install_uffd_wp_if_needed(vma, addr, pte, pteval);
+#endif
 }
 
 static unsigned long zap_pte_range(struct mmu_gather *tlb,
@@ -1685,10 +1687,8 @@ static void unmap_single_vma(struct mmu_gather *tlb,
                        if (vma->vm_file) {
                                zap_flags_t zap_flags = details ?
                                    details->zap_flags : 0;
-                               i_mmap_lock_write(vma->vm_file->f_mapping);
                                __unmap_hugepage_range_final(tlb, vma, start, end,
                                                             NULL, zap_flags);
-                               i_mmap_unlock_write(vma->vm_file->f_mapping);
                        }
                } else
                        unmap_page_range(tlb, vma, start, end, details);
@@ -1698,6 +1698,7 @@ static void unmap_single_vma(struct mmu_gather *tlb,
 /**
  * unmap_vmas - unmap a range of memory covered by a list of vma's
  * @tlb: address of the caller's struct mmu_gather
+ * @mt: the maple tree
  * @vma: the starting vma
  * @start_addr: virtual address at which to start unmapping
  * @end_addr: virtual address at which to end unmapping
@@ -1713,7 +1714,7 @@ static void unmap_single_vma(struct mmu_gather *tlb,
  * ensure that any thus-far unmapped pages are flushed before unmap_vmas()
  * drops the lock and schedules.
  */
-void unmap_vmas(struct mmu_gather *tlb,
+void unmap_vmas(struct mmu_gather *tlb, struct maple_tree *mt,
                struct vm_area_struct *vma, unsigned long start_addr,
                unsigned long end_addr)
 {
@@ -1723,12 +1724,14 @@ void unmap_vmas(struct mmu_gather *tlb,
                /* Careful - we need to zap private pages too! */
                .even_cows = true,
        };
+       MA_STATE(mas, mt, vma->vm_end, vma->vm_end);
 
        mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, vma, vma->vm_mm,
                                start_addr, end_addr);
        mmu_notifier_invalidate_range_start(&range);
-       for ( ; vma && vma->vm_start < end_addr; vma = vma->vm_next)
+       do {
                unmap_single_vma(tlb, vma, start_addr, end_addr, &details);
+       } while ((vma = mas_find(&mas, end_addr - 1)) != NULL);
        mmu_notifier_invalidate_range_end(&range);
 }
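[annotation] Every loop conversion in this patch, above and below, follows the same shape: the old `vma = vma->vm_next` list walk becomes a maple tree cursor primed just past the starting vma and advanced with mas_find(). A sketch of the pattern, assuming the in-kernel maple tree API of this series (an outline, not a standalone program; `mt`, `first_vma` and `limit` are placeholders):

	struct vm_area_struct *vma = first_vma;
	/* cursor starts at first_vma->vm_end, i.e. just past this vma */
	MA_STATE(mas, mt, vma->vm_end, vma->vm_end);

	do {
		/* ... operate on vma ... */

		/* next entry with an index <= limit - 1, or NULL when done */
	} while ((vma = mas_find(&mas, limit - 1)) != NULL);
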
 
@@ -1743,8 +1746,11 @@ void unmap_vmas(struct mmu_gather *tlb,
 void zap_page_range(struct vm_area_struct *vma, unsigned long start,
                unsigned long size)
 {
+       struct maple_tree *mt = &vma->vm_mm->mm_mt;
+       unsigned long end = start + size;
        struct mmu_notifier_range range;
        struct mmu_gather tlb;
+       MA_STATE(mas, mt, vma->vm_end, vma->vm_end);
 
        lru_add_drain();
        mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm,
@@ -1752,8 +1758,9 @@ void zap_page_range(struct vm_area_struct *vma, unsigned long start,
        tlb_gather_mmu(&tlb, vma->vm_mm);
        update_hiwater_rss(vma->vm_mm);
        mmu_notifier_invalidate_range_start(&range);
-       for ( ; vma && vma->vm_start < range.end; vma = vma->vm_next)
+       do {
                unmap_single_vma(&tlb, vma, start, range.end, NULL);
+       } while ((vma = mas_find(&mas, end - 1)) != NULL);
        mmu_notifier_invalidate_range_end(&range);
        tlb_finish_mmu(&tlb);
 }
@@ -2870,7 +2877,7 @@ static inline bool __wp_page_copy_user(struct page *dst, struct page *src,
         * On architectures with software "accessed" bits, we would
         * take a double page fault, so mark it accessed here.
         */
-       if (arch_faults_on_old_pte() && !pte_young(vmf->orig_pte)) {
+       if (!arch_has_hw_pte_young() && !pte_young(vmf->orig_pte)) {
                pte_t entry;
 
                vmf->pte = pte_offset_map_lock(mm, vmf->pmd, addr, &vmf->ptl);
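[annotation] This hunk pairs with the removal of arch_faults_on_old_pte() near the top of the file: the old default returned true for architectures without a hardware-managed accessed bit, so the two predicates are logical inverses and the substitution preserves behavior:

	/*
	 * arch_faults_on_old_pte() == !arch_has_hw_pte_young()
	 *
	 * With a hardware-managed young bit the CPU sets the accessed flag
	 * itself and never double-faults on an old PTE, so the software
	 * pre-marking here is only needed when the helper returns false.
	 */
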
@@ -3128,6 +3135,7 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf)
                        delayacct_wpcopy_end();
                        return 0;
                }
+               kmsan_copy_page_meta(new_page, old_page);
        }
 
        if (mem_cgroup_charge(page_folio(new_page), mm, GFP_KERNEL))
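[annotation] KMSAN tracks per-page shadow (which bytes are initialized) and origin metadata alongside the data itself; copying only the data during wp_page_copy() would leave the new page looking uninitialized and produce false reports, hence the metadata copy once the user copy has succeeded. Conceptually (a sketch of intent, not the real implementation):

	/*
	 * kmsan_copy_page_meta(new_page, old_page):
	 *	shadow(new_page) = shadow(old_page);
	 *	origin(new_page) = origin(old_page);
	 * so the CoW copy inherits the initializedness of the original.
	 */
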
@@ -3362,6 +3370,7 @@ static vm_fault_t do_wp_page(struct vm_fault *vmf)
 {
        const bool unshare = vmf->flags & FAULT_FLAG_UNSHARE;
        struct vm_area_struct *vma = vmf->vma;
+       struct folio *folio;
 
        VM_BUG_ON(unshare && (vmf->flags & FAULT_FLAG_WRITE));
        VM_BUG_ON(!unshare && !(vmf->flags & FAULT_FLAG_WRITE));
@@ -3408,48 +3417,47 @@ static vm_fault_t do_wp_page(struct vm_fault *vmf)
         * Take out anonymous pages first, anonymous shared vmas are
         * not dirty accountable.
         */
-       if (PageAnon(vmf->page)) {
-               struct page *page = vmf->page;
-
+       folio = page_folio(vmf->page);
+       if (folio_test_anon(folio)) {
                /*
                 * If the page is exclusive to this process we must reuse the
                 * page without further checks.
                 */
-               if (PageAnonExclusive(page))
+               if (PageAnonExclusive(vmf->page))
                        goto reuse;
 
                /*
-                * We have to verify under page lock: these early checks are
-                * just an optimization to avoid locking the page and freeing
+                * We have to verify under folio lock: these early checks are
+                * just an optimization to avoid locking the folio and freeing
                 * the swapcache if there is little hope that we can reuse.
                 *
-                * PageKsm() doesn't necessarily raise the page refcount.
+                * KSM doesn't necessarily raise the folio refcount.
                 */
-               if (PageKsm(page) || page_count(page) > 3)
+               if (folio_test_ksm(folio) || folio_ref_count(folio) > 3)
                        goto copy;
-               if (!PageLRU(page))
+               if (!folio_test_lru(folio))
                        /*
                         * Note: We cannot easily detect+handle references from
-                        * remote LRU pagevecs or references to PageLRU() pages.
+                        * remote LRU pagevecs or references to LRU folios.
                         */
                        lru_add_drain();
-               if (page_count(page) > 1 + PageSwapCache(page))
+               if (folio_ref_count(folio) > 1 + folio_test_swapcache(folio))
                        goto copy;
-               if (!trylock_page(page))
+               if (!folio_trylock(folio))
                        goto copy;
-               if (PageSwapCache(page))
-                       try_to_free_swap(page);
-               if (PageKsm(page) || page_count(page) != 1) {
-                       unlock_page(page);
+               if (folio_test_swapcache(folio))
+                       folio_free_swap(folio);
+               if (folio_test_ksm(folio) || folio_ref_count(folio) != 1) {
+                       folio_unlock(folio);
                        goto copy;
                }
                /*
-                * Ok, we've got the only page reference from our mapping
-                * and the page is locked, it's dark out, and we're wearing
+                * Ok, we've got the only folio reference from our mapping
+                * and the folio is locked, it's dark out, and we're wearing
                 * sunglasses. Hit it.
                 */
-               page_move_anon_rmap(page, vma);
-               unlock_page(page);
+               page_move_anon_rmap(vmf->page, vma);
+               folio_unlock(folio);
 reuse:
                if (unlikely(unshare)) {
                        pte_unmap_unlock(vmf->pte, vmf->ptl);
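[annotation] The reference budget behind those early bail-outs, as a reading of the checks above (not new rules): one reference for our page table mapping, one for the swapcache if the folio is still in it, and one for a transient LRU pagevec entry.

	/*
	 * > 3				before draining pagevecs
	 * > 1 + folio_test_swapcache()	after draining
	 * != 1				once locked and swap is freed
	 *
	 * Anything beyond the expected references means another user, so
	 * the fault must fall through to "copy" instead of reusing.
	 */
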
@@ -3612,11 +3620,11 @@ EXPORT_SYMBOL(unmap_mapping_range);
  */
 static vm_fault_t remove_device_exclusive_entry(struct vm_fault *vmf)
 {
-       struct page *page = vmf->page;
+       struct folio *folio = page_folio(vmf->page);
        struct vm_area_struct *vma = vmf->vma;
        struct mmu_notifier_range range;
 
-       if (!lock_page_or_retry(page, vma->vm_mm, vmf->flags))
+       if (!folio_lock_or_retry(folio, vma->vm_mm, vmf->flags))
                return VM_FAULT_RETRY;
        mmu_notifier_range_init_owner(&range, MMU_NOTIFY_EXCLUSIVE, 0, vma,
                                vma->vm_mm, vmf->address & PAGE_MASK,
@@ -3626,23 +3634,23 @@ static vm_fault_t remove_device_exclusive_entry(struct vm_fault *vmf)
        vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address,
                                &vmf->ptl);
        if (likely(pte_same(*vmf->pte, vmf->orig_pte)))
-               restore_exclusive_pte(vma, page, vmf->address, vmf->pte);
+               restore_exclusive_pte(vma, vmf->page, vmf->address, vmf->pte);
 
        pte_unmap_unlock(vmf->pte, vmf->ptl);
-       unlock_page(page);
+       folio_unlock(folio);
 
        mmu_notifier_invalidate_range_end(&range);
        return 0;
 }
 
-static inline bool should_try_to_free_swap(struct page *page,
+static inline bool should_try_to_free_swap(struct folio *folio,
                                           struct vm_area_struct *vma,
                                           unsigned int fault_flags)
 {
-       if (!PageSwapCache(page))
+       if (!folio_test_swapcache(folio))
                return false;
-       if (mem_cgroup_swap_full(page) || (vma->vm_flags & VM_LOCKED) ||
-           PageMlocked(page))
+       if (mem_cgroup_swap_full(folio) || (vma->vm_flags & VM_LOCKED) ||
+           folio_test_mlocked(folio))
                return true;
        /*
         * If we want to map a page that's in the swapcache writable, we
@@ -3650,8 +3658,8 @@ static inline bool should_try_to_free_swap(struct page *page,
         * user. Try freeing the swapcache to get rid of the swapcache
         * reference only in case it's likely that we'll be the exclusive user.
         */
-       return (fault_flags & FAULT_FLAG_WRITE) && !PageKsm(page) &&
-               page_count(page) == 2;
+       return (fault_flags & FAULT_FLAG_WRITE) && !folio_test_ksm(folio) &&
+               folio_ref_count(folio) == 2;
 }
 
 static vm_fault_t pte_marker_clear(struct vm_fault *vmf)
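[annotation] A note on the refcount arithmetic in the `== 2` test above (annotation, not part of the patch):

	/*
	 * Expected references at this point:
	 *	1 - taken by this fault path when it found the page
	 *	1 - held by the swapcache itself
	 * The PTE has not been installed yet; anything more means another
	 * user, and freeing the swapcache would not make us exclusive.
	 */
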
@@ -3718,7 +3726,8 @@ static vm_fault_t handle_pte_marker(struct vm_fault *vmf)
 vm_fault_t do_swap_page(struct vm_fault *vmf)
 {
        struct vm_area_struct *vma = vmf->vma;
-       struct page *page = NULL, *swapcache;
+       struct folio *swapcache, *folio = NULL;
+       struct page *page;
        struct swap_info_struct *si = NULL;
        rmap_t rmap_flags = RMAP_NONE;
        bool exclusive = false;
@@ -3741,7 +3750,21 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
                        ret = remove_device_exclusive_entry(vmf);
                } else if (is_device_private_entry(entry)) {
                        vmf->page = pfn_swap_entry_to_page(entry);
-                       ret = vmf->page->pgmap->ops->migrate_to_ram(vmf);
+                       vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd,
+                                       vmf->address, &vmf->ptl);
+                       if (unlikely(!pte_same(*vmf->pte, vmf->orig_pte))) {
+                               spin_unlock(vmf->ptl);
+                               goto out;
+                       }
+
+                       /*
+                        * Get a page reference while we know the page can't be
+                        * freed.
+                        */
+                       get_page(vmf->page);
+                       pte_unmap_unlock(vmf->pte, vmf->ptl);
+                       vmf->page->pgmap->ops->migrate_to_ram(vmf);
+                       put_page(vmf->page);
                } else if (is_hwpoison_entry(entry)) {
                        ret = VM_FAULT_HWPOISON;
                } else if (is_swapin_error_entry(entry)) {
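[annotation] This hunk is the race fix named in the title. Previously ->migrate_to_ram() was called on vmf->page without any reference held, so a concurrent unmap could free the device private page while the driver callback was still using it. The fix re-validates the PTE and pins the page under the page table lock; the same lines as above, annotated:

	vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd,
				       vmf->address, &vmf->ptl);
	if (unlikely(!pte_same(*vmf->pte, vmf->orig_pte))) {
		/* entry changed under us: someone else already handled it */
		spin_unlock(vmf->ptl);
		goto out;
	}

	/*
	 * While the PTL is held and the PTE still maps this entry, the
	 * page cannot be freed, so the reference is taken race-free.
	 */
	get_page(vmf->page);
	pte_unmap_unlock(vmf->pte, vmf->ptl);
	vmf->page->pgmap->ops->migrate_to_ram(vmf);
	put_page(vmf->page);	/* drop the pin taken above */
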
@@ -3760,21 +3783,25 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
        if (unlikely(!si))
                goto out;
 
-       page = lookup_swap_cache(entry, vma, vmf->address);
-       swapcache = page;
+       folio = swap_cache_get_folio(entry, vma, vmf->address);
+       if (folio)
+               page = folio_file_page(folio, swp_offset(entry));
+       swapcache = folio;
 
-       if (!page) {
+       if (!folio) {
                if (data_race(si->flags & SWP_SYNCHRONOUS_IO) &&
                    __swap_count(entry) == 1) {
                        /* skip swapcache */
-                       page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma,
-                                                       vmf->address);
-                       if (page) {
-                               __SetPageLocked(page);
-                               __SetPageSwapBacked(page);
-
-                               if (mem_cgroup_swapin_charge_page(page,
-                                       vma->vm_mm, GFP_KERNEL, entry)) {
+                       folio = vma_alloc_folio(GFP_HIGHUSER_MOVABLE, 0,
+                                               vma, vmf->address, false);
+                       page = &folio->page;
+                       if (folio) {
+                               __folio_set_locked(folio);
+                               __folio_set_swapbacked(folio);
+
+                               if (mem_cgroup_swapin_charge_folio(folio,
+                                                       vma->vm_mm, GFP_KERNEL,
+                                                       entry)) {
                                        ret = VM_FAULT_OOM;
                                        goto out_page;
                                }
@@ -3782,23 +3809,24 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
 
                                shadow = get_shadow_from_swap_cache(entry);
                                if (shadow)
-                                       workingset_refault(page_folio(page),
-                                                               shadow);
+                                       workingset_refault(folio, shadow);
 
-                               lru_cache_add(page);
+                               folio_add_lru(folio);
 
                                /* To provide entry to swap_readpage() */
-                               set_page_private(page, entry.val);
+                               folio_set_swap_entry(folio, entry);
                                swap_readpage(page, true, NULL);
-                               set_page_private(page, 0);
+                               folio->private = NULL;
                        }
                } else {
                        page = swapin_readahead(entry, GFP_HIGHUSER_MOVABLE,
                                                vmf);
-                       swapcache = page;
+                       if (page)
+                               folio = page_folio(page);
+                       swapcache = folio;
                }
 
-               if (!page) {
+               if (!folio) {
                        /*
                         * Back out if somebody else faulted in this pte
                         * while we released the pte lock.
@@ -3823,7 +3851,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
                goto out_release;
        }
 
-       locked = lock_page_or_retry(page, vma->vm_mm, vmf->flags);
+       locked = folio_lock_or_retry(folio, vma->vm_mm, vmf->flags);
 
        if (!locked) {
                ret |= VM_FAULT_RETRY;
@@ -3832,13 +3860,13 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
 
        if (swapcache) {
                /*
-                * Make sure try_to_free_swap or swapoff did not release the
+                * Make sure folio_free_swap() or swapoff did not release the
                 * swapcache from under us.  The page pin, and pte_same test
                 * below, are not enough to exclude that.  Even if it is still
                 * swapcache, we need to check that the page's swap has not
                 * changed.
                 */
-               if (unlikely(!PageSwapCache(page) ||
+               if (unlikely(!folio_test_swapcache(folio) ||
                             page_private(page) != entry.val))
                        goto out_page;
 
@@ -3850,9 +3878,9 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
                page = ksm_might_need_to_copy(page, vma, vmf->address);
                if (unlikely(!page)) {
                        ret = VM_FAULT_OOM;
-                       page = swapcache;
                        goto out_page;
                }
+               folio = page_folio(page);
 
                /*
                 * If we want to map a page that's in the swapcache writable, we
@@ -3860,8 +3888,8 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
                 * owner. Try removing the extra reference from the local LRU
                 * pagevecs if required.
                 */
-               if ((vmf->flags & FAULT_FLAG_WRITE) && page == swapcache &&
-                   !PageKsm(page) && !PageLRU(page))
+               if ((vmf->flags & FAULT_FLAG_WRITE) && folio == swapcache &&
+                   !folio_test_ksm(folio) && !folio_test_lru(folio))
                        lru_add_drain();
        }
 
@@ -3875,7 +3903,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
        if (unlikely(!pte_same(*vmf->pte, vmf->orig_pte)))
                goto out_nomap;
 
-       if (unlikely(!PageUptodate(page))) {
+       if (unlikely(!folio_test_uptodate(folio))) {
                ret = VM_FAULT_SIGBUS;
                goto out_nomap;
        }
@@ -3888,26 +3916,26 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
         * check after taking the PT lock and making sure that nobody
         * concurrently faulted in this page and set PG_anon_exclusive.
         */
-       BUG_ON(!PageAnon(page) && PageMappedToDisk(page));
-       BUG_ON(PageAnon(page) && PageAnonExclusive(page));
+       BUG_ON(!folio_test_anon(folio) && folio_test_mappedtodisk(folio));
+       BUG_ON(folio_test_anon(folio) && PageAnonExclusive(page));
 
        /*
         * Check under PT lock (to protect against concurrent fork() sharing
         * the swap entry concurrently) for certainly exclusive pages.
         */
-       if (!PageKsm(page)) {
+       if (!folio_test_ksm(folio)) {
                /*
                 * Note that pte_swp_exclusive() == false for architectures
                 * without __HAVE_ARCH_PTE_SWP_EXCLUSIVE.
                 */
                exclusive = pte_swp_exclusive(vmf->orig_pte);
-               if (page != swapcache) {
+               if (folio != swapcache) {
                        /*
                         * We have a fresh page that is not exposed to the
                         * swapcache -> certainly exclusive.
                         */
                        exclusive = true;
-               } else if (exclusive && PageWriteback(page) &&
+               } else if (exclusive && folio_test_writeback(folio) &&
                          data_race(si->flags & SWP_STABLE_WRITES)) {
                        /*
                         * This is tricky: not all swap backends support
@@ -3937,8 +3965,8 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
         * yet.
         */
        swap_free(entry);
-       if (should_try_to_free_swap(page, vma, vmf->flags))
-               try_to_free_swap(page);
+       if (should_try_to_free_swap(folio, vma, vmf->flags))
+               folio_free_swap(folio);
 
        inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES);
        dec_mm_counter_fast(vma->vm_mm, MM_SWAPENTS);
@@ -3950,7 +3978,8 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
         * exposing them to the swapcache or because the swap entry indicates
         * exclusivity.
         */
-       if (!PageKsm(page) && (exclusive || page_count(page) == 1)) {
+       if (!folio_test_ksm(folio) &&
+           (exclusive || folio_ref_count(folio) == 1)) {
                if (vmf->flags & FAULT_FLAG_WRITE) {
                        pte = maybe_mkwrite(pte_mkdirty(pte), vma);
                        vmf->flags &= ~FAULT_FLAG_WRITE;
@@ -3968,19 +3997,20 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
        vmf->orig_pte = pte;
 
        /* ksm created a completely new copy */
-       if (unlikely(page != swapcache && swapcache)) {
+       if (unlikely(folio != swapcache && swapcache)) {
                page_add_new_anon_rmap(page, vma, vmf->address);
-               lru_cache_add_inactive_or_unevictable(page, vma);
+               folio_add_lru_vma(folio, vma);
        } else {
                page_add_anon_rmap(page, vma, vmf->address, rmap_flags);
        }
 
-       VM_BUG_ON(!PageAnon(page) || (pte_write(pte) && !PageAnonExclusive(page)));
+       VM_BUG_ON(!folio_test_anon(folio) ||
+                       (pte_write(pte) && !PageAnonExclusive(page)));
        set_pte_at(vma->vm_mm, vmf->address, vmf->pte, pte);
        arch_do_swap_page(vma->vm_mm, vma, vmf->address, pte, vmf->orig_pte);
 
-       unlock_page(page);
-       if (page != swapcache && swapcache) {
+       folio_unlock(folio);
+       if (folio != swapcache && swapcache) {
                /*
                 * Hold the lock to prevent the swap entry from being reused
                 * until we take the PT lock for the pte_same() check
                 * (to avoid false positives from pte_same). For further
                 * safety, release the lock after the swap_free so that the
                 * swap count won't change under a parallel locked swapcache.
                 */
-               unlock_page(swapcache);
-               put_page(swapcache);
+               folio_unlock(swapcache);
+               folio_put(swapcache);
        }
 
        if (vmf->flags & FAULT_FLAG_WRITE) {
@@ -4011,12 +4041,12 @@ out:
 out_nomap:
        pte_unmap_unlock(vmf->pte, vmf->ptl);
 out_page:
-       unlock_page(page);
+       folio_unlock(folio);
 out_release:
-       put_page(page);
-       if (page != swapcache && swapcache) {
-               unlock_page(swapcache);
-               put_page(swapcache);
+       folio_put(folio);
+       if (folio != swapcache && swapcache) {
+               folio_unlock(swapcache);
+               folio_put(swapcache);
        }
        if (si)
                put_swap_device(si);
@@ -4386,14 +4416,20 @@ vm_fault_t finish_fault(struct vm_fault *vmf)
 
        vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd,
                                      vmf->address, &vmf->ptl);
-       ret = 0;
+
        /* Re-check under ptl */
-       if (likely(!vmf_pte_changed(vmf)))
+       if (likely(!vmf_pte_changed(vmf))) {
                do_set_pte(vmf, page, vmf->address);
-       else
+
+               /* no need to invalidate: a not-present page won't be cached */
+               update_mmu_cache(vma, vmf->address, vmf->pte);
+
+               ret = 0;
+       } else {
+               update_mmu_tlb(vma, vmf->address, vmf->pte);
                ret = VM_FAULT_NOPAGE;
+       }
 
-       update_mmu_tlb(vma, vmf->address, vmf->pte);
        pte_unmap_unlock(vmf->pte, vmf->ptl);
        return ret;
 }
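[annotation] The reshuffle above separates the two arch hooks by outcome (annotation on the existing calls, nothing new):

	/*
	 * update_mmu_cache(): called only after do_set_pte() installed the
	 * entry, letting the architecture preload the new translation; as
	 * the comment says, a previously not-present entry cannot already
	 * be cached, so no invalidation is needed first.
	 *
	 * update_mmu_tlb(): called only on the lost-race path, where a
	 * spurious fault may have left a stale not-present entry in the
	 * TLB that must be flushed.
	 */
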
@@ -4725,8 +4761,16 @@ static vm_fault_t do_numa_page(struct vm_fault *vmf)
        if (page_mapcount(page) > 1 && (vma->vm_flags & VM_SHARED))
                flags |= TNF_SHARED;
 
-       last_cpupid = page_cpupid_last(page);
        page_nid = page_to_nid(page);
+       /*
+        * For memory tiering mode, cpupid of slow memory page is used
+        * to record page access time.  So use default value.
+        */
+       if ((sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING) &&
+           !node_is_toptier(page_nid))
+               last_cpupid = (-1 & LAST_CPUPID_MASK);
+       else
+               last_cpupid = page_cpupid_last(page);
        target_nid = numa_migrate_prep(page, vma, vmf->address, page_nid,
                        &flags);
        if (target_nid == NUMA_NO_NODE) {
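[annotation] `-1 & LAST_CPUPID_MASK` is the all-ones, in-field "invalid cpupid" value: in tiering mode the cpupid bits of a slow-tier page hold an access timestamp, so they must not be interpreted as a cpupid. A standalone illustration (the shift width here is made up; the real one is config-dependent):

	#include <assert.h>

	#define LAST_CPUPID_SHIFT	8	/* illustrative value only */
	#define LAST_CPUPID_MASK	((1 << LAST_CPUPID_SHIFT) - 1)

	int main(void)
	{
		/* all ones within the field, whatever its width */
		assert((-1 & LAST_CPUPID_MASK) == LAST_CPUPID_MASK);
		return 0;
	}
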
@@ -4985,7 +5029,7 @@ static vm_fault_t __handle_mm_fault(struct vm_area_struct *vma,
                return VM_FAULT_OOM;
 retry_pud:
        if (pud_none(*vmf.pud) &&
-           hugepage_vma_check(vma, vm_flags, false, true)) {
+           hugepage_vma_check(vma, vm_flags, false, true, true)) {
                ret = create_huge_pud(&vmf);
                if (!(ret & VM_FAULT_FALLBACK))
                        return ret;
@@ -5019,7 +5063,7 @@ retry_pud:
                goto retry_pud;
 
        if (pmd_none(*vmf.pmd) &&
-           hugepage_vma_check(vma, vm_flags, false, true)) {
+           hugepage_vma_check(vma, vm_flags, false, true, true)) {
                ret = create_huge_pmd(&vmf);
                if (!(ret & VM_FAULT_FALLBACK))
                        return ret;
@@ -5114,6 +5158,27 @@ static inline void mm_account_fault(struct pt_regs *regs,
                perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, regs, address);
 }
 
+#ifdef CONFIG_LRU_GEN
+static void lru_gen_enter_fault(struct vm_area_struct *vma)
+{
+       /* the LRU algorithm doesn't apply to sequential or random reads */
+       current->in_lru_fault = !(vma->vm_flags & (VM_SEQ_READ | VM_RAND_READ));
+}
+
+static void lru_gen_exit_fault(void)
+{
+       current->in_lru_fault = false;
+}
+#else
+static void lru_gen_enter_fault(struct vm_area_struct *vma)
+{
+}
+
+static void lru_gen_exit_fault(void)
+{
+}
+#endif /* CONFIG_LRU_GEN */
+
 /*
  * By the time we get here, we already hold the mm semaphore
  *
@@ -5145,11 +5210,15 @@ vm_fault_t handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
        if (flags & FAULT_FLAG_USER)
                mem_cgroup_enter_user_fault();
 
+       lru_gen_enter_fault(vma);
+
        if (unlikely(is_vm_hugetlb_page(vma)))
                ret = hugetlb_fault(vma->vm_mm, vma, address, flags);
        else
                ret = __handle_mm_fault(vma, address, flags);
 
+       lru_gen_exit_fault();
+
        if (flags & FAULT_FLAG_USER) {
                mem_cgroup_exit_user_fault();
                /*
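[annotation] lru_gen_enter_fault() keys off VM_SEQ_READ and VM_RAND_READ, which userspace sets through madvise(). A minimal example of opting a mapping out of the multi-gen LRU fault accounting (standard Linux userspace API):

	#include <stddef.h>
	#include <sys/mman.h>

	int main(void)
	{
		size_t len = 1 << 20;
		void *buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
				 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

		if (buf == MAP_FAILED)
			return 1;

		/* MADV_SEQUENTIAL sets VM_SEQ_READ on the vma, so faults
		 * in this range run with current->in_lru_fault == false */
		madvise(buf, len, MADV_SEQUENTIAL);

		munmap(buf, len);
		return 0;
	}
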
@@ -5637,11 +5706,11 @@ static void clear_gigantic_page(struct page *page,
                                unsigned int pages_per_huge_page)
 {
        int i;
-       struct page *p = page;
+       struct page *p;
 
        might_sleep();
-       for (i = 0; i < pages_per_huge_page;
-            i++, p = mem_map_next(p, page, i)) {
+       for (i = 0; i < pages_per_huge_page; i++) {
+               p = nth_page(page, i);
                cond_resched();
                clear_user_highpage(p, addr + i * PAGE_SIZE);
        }
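[annotation] mem_map_next() existed because with CONFIG_SPARSEMEM and without CONFIG_SPARSEMEM_VMEMMAP the struct pages of a gigantic page need not be virtually contiguous across memory section boundaries, so plain `page + i` is unsafe. nth_page() encodes the same distinction and lets these loops recompute the subpage from the base each iteration (paraphrasing the include/linux/mm.h definition of this era):

	#if defined(CONFIG_SPARSEMEM) && !defined(CONFIG_SPARSEMEM_VMEMMAP)
	#define nth_page(page, n)	pfn_to_page(page_to_pfn((page)) + (n))
	#else
	#define nth_page(page, n)	((page) + (n))	/* mem_map contiguous */
	#endif
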
@@ -5677,13 +5746,12 @@ static void copy_user_gigantic_page(struct page *dst, struct page *src,
        struct page *dst_base = dst;
        struct page *src_base = src;
 
-       for (i = 0; i < pages_per_huge_page; ) {
+       for (i = 0; i < pages_per_huge_page; i++) {
+               dst = nth_page(dst_base, i);
+               src = nth_page(src_base, i);
+
                cond_resched();
                copy_user_highpage(dst, src, addr + i*PAGE_SIZE, vma);
-
-               i++;
-               dst = mem_map_next(dst, dst_base, i);
-               src = mem_map_next(src, src_base, i);
        }
 }
 
@@ -5730,10 +5798,10 @@ long copy_huge_page_from_user(struct page *dst_page,
        void *page_kaddr;
        unsigned long i, rc = 0;
        unsigned long ret_val = pages_per_huge_page * PAGE_SIZE;
-       struct page *subpage = dst_page;
+       struct page *subpage;
 
-       for (i = 0; i < pages_per_huge_page;
-            i++, subpage = mem_map_next(subpage, dst_page, i)) {
+       for (i = 0; i < pages_per_huge_page; i++) {
+               subpage = nth_page(dst_page, i);
                if (allow_pagefault)
                        page_kaddr = kmap(subpage);
                else