diff --git a/mm/madvise.c b/mm/madvise.c
index 44a498c94158c882c624eac2e29a5f07d854e322..f59169888b8ee2edae374b473d626c46b4d498e7 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -336,6 +336,7 @@ static int madvise_cold_or_pageout_pte_range(pmd_t *pmd,
        LIST_HEAD(folio_list);
        bool pageout_anon_only_filter;
        unsigned int batch_count = 0;
+       int nr;
 
        if (fatal_signal_pending(current))
                return -EINTR;
@@ -363,10 +364,10 @@ static int madvise_cold_or_pageout_pte_range(pmd_t *pmd,
                        goto huge_unlock;
                }
 
-               folio = pfn_folio(pmd_pfn(orig_pmd));
+               folio = pmd_folio(orig_pmd);
 
                /* Do not interfere with other mappings of this folio */
-               if (folio_estimated_sharers(folio) != 1)
+               if (folio_likely_mapped_shared(folio))
                        goto huge_unlock;
 
                if (pageout_anon_only_filter && !folio_test_anon(folio))
@@ -423,7 +424,8 @@ restart:
                return 0;
        flush_tlb_batched_pending(mm);
        arch_enter_lazy_mmu_mode();
-       for (; addr < end; pte++, addr += PAGE_SIZE) {
+       for (; addr < end; pte += nr, addr += nr * PAGE_SIZE) {
+               nr = 1;
                ptent = ptep_get(pte);
 
                if (++batch_count == SWAP_CLUSTER_MAX) {
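
The loop above now advances by a variable stride: nr is reset to 1 at the top of every iteration, a successful batch bumps it to the batch size, and setting nr = 0 before continue re-scans the same address (used further down after a folio split). A stand-alone sketch of that control flow only, in plain user-space C with made-up indices rather than anything from the kernel:

#include <stdio.h>

#define NR_ENTRIES 12

int main(void)
{
	int retried = 0;
	int nr;

	for (int idx = 0; idx < NR_ENTRIES; idx += nr) {
		nr = 1;			/* default: advance one entry */

		if (idx == 4 && !retried) {
			retried = 1;	/* pretend a folio was just split */
			nr = 0;		/* re-process the same index */
			continue;
		}
		if (idx % 4 == 0)
			nr = 4;		/* a whole batch was handled at once */

		printf("handled %d entr%s at index %d\n",
		       nr, nr == 1 ? "y" : "ies", idx);
	}
	return 0;
}
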
@@ -447,55 +449,66 @@ restart:
                        continue;
 
                /*
-                * Creating a THP page is expensive so split it only if we
-                * are sure it's worth. Split it if we are only owner.
+                * If we encounter a large folio, only split it if it is not
+                * fully mapped within the range we are operating on. Otherwise
+                * leave it as is so that it can be swapped out whole. If we
+                * fail to split a folio, leave it in place and advance to the
+                * next pte in the range.
                 */
                if (folio_test_large(folio)) {
-                       int err;
-
-                       if (folio_estimated_sharers(folio) > 1)
-                               break;
-                       if (pageout_anon_only_filter && !folio_test_anon(folio))
-                               break;
-                       if (!folio_trylock(folio))
-                               break;
-                       folio_get(folio);
-                       arch_leave_lazy_mmu_mode();
-                       pte_unmap_unlock(start_pte, ptl);
-                       start_pte = NULL;
-                       err = split_folio(folio);
-                       folio_unlock(folio);
-                       folio_put(folio);
-                       if (err)
-                               break;
-                       start_pte = pte =
-                               pte_offset_map_lock(mm, pmd, addr, &ptl);
-                       if (!start_pte)
-                               break;
-                       arch_enter_lazy_mmu_mode();
-                       pte--;
-                       addr -= PAGE_SIZE;
-                       continue;
+                       const fpb_t fpb_flags = FPB_IGNORE_DIRTY |
+                                               FPB_IGNORE_SOFT_DIRTY;
+                       int max_nr = (end - addr) / PAGE_SIZE;
+                       bool any_young;
+
+                       nr = folio_pte_batch(folio, addr, pte, ptent, max_nr,
+                                            fpb_flags, NULL, &any_young);
+                       if (any_young)
+                               ptent = pte_mkyoung(ptent);
+
+                       if (nr < folio_nr_pages(folio)) {
+                               int err;
+
+                               if (folio_likely_mapped_shared(folio))
+                                       continue;
+                               if (pageout_anon_only_filter && !folio_test_anon(folio))
+                                       continue;
+                               if (!folio_trylock(folio))
+                                       continue;
+                               folio_get(folio);
+                               arch_leave_lazy_mmu_mode();
+                               pte_unmap_unlock(start_pte, ptl);
+                               start_pte = NULL;
+                               err = split_folio(folio);
+                               folio_unlock(folio);
+                               folio_put(folio);
+                               start_pte = pte =
+                                       pte_offset_map_lock(mm, pmd, addr, &ptl);
+                               if (!start_pte)
+                                       break;
+                               arch_enter_lazy_mmu_mode();
+                               if (!err)
+                                       nr = 0;
+                               continue;
+                       }
                }
 
                /*
                 * Do not interfere with other mappings of this folio and
-                * non-LRU folio.
+                * non-LRU folio. If we have a large folio at this point, we
+                * know it is fully mapped so if its mapcount is the same as its
+                * number of pages, it must be exclusive.
                 */
-               if (!folio_test_lru(folio) || folio_mapcount(folio) != 1)
+               if (!folio_test_lru(folio) ||
+                   folio_mapcount(folio) != folio_nr_pages(folio))
                        continue;
 
                if (pageout_anon_only_filter && !folio_test_anon(folio))
                        continue;
 
-               VM_BUG_ON_FOLIO(folio_test_large(folio), folio);
-
                if (!pageout && pte_young(ptent)) {
-                       ptent = ptep_get_and_clear_full(mm, addr, pte,
-                                                       tlb->fullmm);
-                       ptent = pte_mkold(ptent);
-                       set_pte_at(mm, addr, pte, ptent);
-                       tlb_remove_tlb_entry(tlb, pte, addr);
+                       mkold_ptes(vma, addr, pte, nr);
+                       tlb_remove_tlb_entries(tlb, pte, nr, addr);
                }
 
                /*
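
The key change in this hunk: instead of bailing out on any large folio, the loop asks how many consecutive PTEs map consecutive pages of the same folio (folio_pte_batch()) and only splits when the folio is not fully covered by the range (nr < folio_nr_pages()), so fully mapped large folios are aged or paged out as one unit. As a rough illustration of the batching idea only, here is a self-contained user-space sketch with toy types; it is not the kernel helper, which also compares pfns and accumulates the young/dirty state of the batch:

#include <stdio.h>

/* Toy "PTE": which folio it maps and which subpage within that folio. */
struct toy_pte {
	int folio_id;
	int subpage;
};

/*
 * Count how many consecutive entries, starting at start, map consecutive
 * subpages of the same folio.
 */
static int batch_len(const struct toy_pte *ptes, int start, int max_nr)
{
	int nr = 1;

	while (nr < max_nr &&
	       ptes[start + nr].folio_id == ptes[start].folio_id &&
	       ptes[start + nr].subpage == ptes[start].subpage + nr)
		nr++;
	return nr;
}

int main(void)
{
	/* Folio 7 is fully mapped (4 subpages); folio 9 only partially. */
	const struct toy_pte ptes[] = {
		{ 7, 0 }, { 7, 1 }, { 7, 2 }, { 7, 3 },
		{ 9, 2 }, { 9, 3 },
		{ 5, 0 },
	};
	const int total = sizeof(ptes) / sizeof(ptes[0]);
	int nr;

	for (int i = 0; i < total; i += nr) {
		nr = batch_len(ptes, i, total - i);
		printf("folio %d: batch of %d starting at subpage %d\n",
		       ptes[i].folio_id, nr, ptes[i].subpage);
	}
	return 0;
}
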
@@ -628,6 +641,7 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
        struct folio *folio;
        int nr_swap = 0;
        unsigned long next;
+       int nr, max_nr;
 
        next = pmd_addr_end(addr, end);
        if (pmd_trans_huge(*pmd))
@@ -640,7 +654,8 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
                return 0;
        flush_tlb_batched_pending(mm);
        arch_enter_lazy_mmu_mode();
-       for (; addr != end; pte++, addr += PAGE_SIZE) {
+       for (; addr != end; pte += nr, addr += PAGE_SIZE * nr) {
+               nr = 1;
                ptent = ptep_get(pte);
 
                if (pte_none(ptent))
@@ -655,9 +670,11 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
 
                        entry = pte_to_swp_entry(ptent);
                        if (!non_swap_entry(entry)) {
-                               nr_swap--;
-                               free_swap_and_cache(entry);
-                               pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
+                               max_nr = (end - addr) / PAGE_SIZE;
+                               nr = swap_pte_batch(pte, max_nr, ptent);
+                               nr_swap -= nr;
+                               free_swap_and_cache_nr(entry, nr);
+                               clear_not_present_full_ptes(mm, addr, pte, nr, tlb->fullmm);
                        } else if (is_hwpoison_entry(entry) ||
                                   is_poisoned_swp_entry(entry)) {
                                pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
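
MADV_FREE now releases swap entries in batches as well: swap_pte_batch() reports how many consecutive PTEs hold swap entries with consecutive offsets on the same swap device, and free_swap_and_cache_nr() plus clear_not_present_full_ptes() handle the whole run in one call. A toy, stand-alone sketch of the "count a contiguous run, then free it at once" idea, not the kernel implementation:

#include <stdio.h>

/* Toy swap entry: swap device ("type") plus offset within it. */
struct toy_swp {
	int type;
	long offset;
};

/*
 * Count consecutive slots that hold swap entries of the same type with
 * consecutive offsets, so they can be released with one call instead of
 * one call per slot.
 */
static int swap_batch_len(const struct toy_swp *e, int start, int max_nr)
{
	int nr = 1;

	while (nr < max_nr &&
	       e[start + nr].type == e[start].type &&
	       e[start + nr].offset == e[start].offset + nr)
		nr++;
	return nr;
}

int main(void)
{
	const struct toy_swp entries[] = {
		{ 0, 100 }, { 0, 101 }, { 0, 102 },	/* one batch of 3 */
		{ 0, 200 },				/* not contiguous */
		{ 1, 50 }, { 1, 51 },			/* different device */
	};
	const int total = sizeof(entries) / sizeof(entries[0]);
	int nr;

	for (int i = 0; i < total; i += nr) {
		nr = swap_batch_len(entries, i, total - i);
		printf("free type %d offsets %ld..%ld (%d entries)\n",
		       entries[i].type, entries[i].offset,
		       entries[i].offset + nr - 1, nr);
	}
	return 0;
}
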
@@ -677,7 +694,7 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
                if (folio_test_large(folio)) {
                        int err;
 
-                       if (folio_estimated_sharers(folio) != 1)
+                       if (folio_likely_mapped_shared(folio))
                                break;
                        if (!folio_trylock(folio))
                                break;
@@ -901,39 +918,19 @@ static long madvise_dontneed_free(struct vm_area_struct *vma,
                return -EINVAL;
 }
 
-static long madvise_populate(struct vm_area_struct *vma,
-                            struct vm_area_struct **prev,
-                            unsigned long start, unsigned long end,
-                            int behavior)
+static long madvise_populate(struct mm_struct *mm, unsigned long start,
+               unsigned long end, int behavior)
 {
        const bool write = behavior == MADV_POPULATE_WRITE;
-       struct mm_struct *mm = vma->vm_mm;
-       unsigned long tmp_end;
        int locked = 1;
        long pages;
 
-       *prev = vma;
-
        while (start < end) {
-               /*
-                * We might have temporarily dropped the lock. For example,
-                * our VMA might have been split.
-                */
-               if (!vma || start >= vma->vm_end) {
-                       vma = vma_lookup(mm, start);
-                       if (!vma)
-                               return -ENOMEM;
-               }
-
-               tmp_end = min_t(unsigned long, end, vma->vm_end);
                /* Populate (prefault) page tables readable/writable. */
-               pages = faultin_vma_page_range(vma, start, tmp_end, write,
-                                              &locked);
+               pages = faultin_page_range(mm, start, end, write, &locked);
                if (!locked) {
                        mmap_read_lock(mm);
                        locked = 1;
-                       *prev = NULL;
-                       vma = NULL;
                }
                if (pages < 0) {
                        switch (pages) {
@@ -949,7 +946,7 @@ static long madvise_populate(struct vm_area_struct *vma,
                                pr_warn_once("%s: unhandled return value: %ld\n",
                                             __func__, pages);
                                fallthrough;
-                       case -ENOMEM:
+                       case -ENOMEM: /* No VMA or out of memory. */
                                return -ENOMEM;
                        }
                }
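
madvise_populate() no longer tracks VMAs itself: it operates on the whole [start, end) range against the mm, and faultin_page_range() together with the locked flag tells it how far it got and whether the mmap lock was dropped, in which case the loop re-takes the lock and continues from where it stopped. A stand-alone sketch of that retry pattern with a hypothetical stand-in for the faulting helper (toy numbers, not kernel code):

#include <stdio.h>

/*
 * Toy stand-in for faultin_page_range(): "populates" up to the end of the
 * range, but once pretends it had to drop the lock (locked = 0) and
 * returns how many pages it managed before that.
 */
static long toy_faultin(unsigned long start, unsigned long end, int *locked)
{
	static int dropped;
	long pages = (end - start) / 4096;

	if (!dropped && pages > 8) {
		dropped = 1;
		*locked = 0;		/* caller must re-take the lock */
		return 8;		/* made partial progress */
	}
	return pages;
}

int main(void)
{
	unsigned long start = 0, end = 64 * 4096;
	int locked = 1;
	long pages;

	while (start < end) {
		pages = toy_faultin(start, end, &locked);
		if (!locked) {
			printf("lock dropped, re-taking it\n");
			locked = 1;	/* mmap_read_lock(mm) in the kernel */
		}
		if (pages < 0)
			return 1;	/* error handling elided */
		start += pages * 4096;
		printf("populated up to %lu\n", start);
	}
	return 0;
}
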
@@ -1034,9 +1031,6 @@ static int madvise_vma_behavior(struct vm_area_struct *vma,
        case MADV_DONTNEED:
        case MADV_DONTNEED_LOCKED:
                return madvise_dontneed_free(vma, prev, start, end, behavior);
-       case MADV_POPULATE_READ:
-       case MADV_POPULATE_WRITE:
-               return madvise_populate(vma, prev, start, end, behavior);
        case MADV_NORMAL:
                new_flags = new_flags & ~VM_RAND_READ & ~VM_SEQ_READ;
                break;
@@ -1438,8 +1432,16 @@ int do_madvise(struct mm_struct *mm, unsigned long start, size_t len_in, int beh
        end = start + len;
 
        blk_start_plug(&plug);
-       error = madvise_walk_vmas(mm, start, end, behavior,
-                       madvise_vma_behavior);
+       switch (behavior) {
+       case MADV_POPULATE_READ:
+       case MADV_POPULATE_WRITE:
+               error = madvise_populate(mm, start, end, behavior);
+               break;
+       default:
+               error = madvise_walk_vmas(mm, start, end, behavior,
+                                         madvise_vma_behavior);
+               break;
+       }
        blk_finish_plug(&plug);
        if (write)
                mmap_write_unlock(mm);
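
With the populate cases removed from madvise_vma_behavior(), do_madvise() dispatches them before the per-VMA walk, since they do not change per-VMA state and can handle the whole range themselves. A minimal sketch of that dispatch shape, with generic names that are not from the kernel:

#include <stdio.h>

enum behavior { BE_POPULATE, BE_COLD, BE_FREE };

/* Handles the whole range at once, no per-region policy needed. */
static int handle_whole_range(unsigned long start, unsigned long end)
{
	printf("whole-range handler: %lu-%lu\n", start, end);
	return 0;
}

static int per_region(unsigned long start, unsigned long end)
{
	printf("per-region handler: %lu-%lu\n", start, end);
	return 0;
}

/* Walk fixed-size "regions" and apply the callback to each, the way
 * madvise_walk_vmas() applies madvise_vma_behavior() per VMA. */
static int walk_regions(unsigned long start, unsigned long end,
			int (*fn)(unsigned long, unsigned long))
{
	for (; start < end; start += 0x1000)
		if (fn(start, start + 0x1000))
			return -1;
	return 0;
}

int main(void)
{
	enum behavior behavior = BE_POPULATE;
	unsigned long start = 0, end = 0x4000;
	int error;

	switch (behavior) {
	case BE_POPULATE:
		error = handle_whole_range(start, end);
		break;
	default:
		error = walk_regions(start, end, per_region);
		break;
	}
	return error;
}
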