mm: introduce MADV_PAGEOUT
author	Minchan Kim <minchan@kernel.org>
Wed, 25 Sep 2019 23:49:15 +0000 (16:49 -0700)
committer	Linus Torvalds <torvalds@linux-foundation.org>
Thu, 26 Sep 2019 00:51:41 +0000 (17:51 -0700)
When a process expects no accesses to a certain memory range for a long
time, it can hint the kernel that the pages can be reclaimed immediately
while their contents are preserved for future use.  This reduces
working-set eviction and so ends up improving performance.

This patch introduces the new MADV_PAGEOUT hint to the madvise(2)
syscall.  MADV_PAGEOUT can be used by a process to mark a memory range
as not expected to be used for a long time, so that the kernel
immediately reclaims the pages in that range from *any LRU* list.  The
hint helps the kernel decide which pages to evict proactively.
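
For illustration only, a userspace caller might look like the sketch
below (a hypothetical example, not part of the patch; the fallback
define mirrors the value 21 added by this patch, and paging out
anonymous memory still requires swap to be available):

#define _DEFAULT_SOURCE
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>

#ifndef MADV_PAGEOUT
#define MADV_PAGEOUT 21		/* value added by this patch */
#endif

int main(void)
{
	size_t len = 64UL << 20;	/* 64MB we expect not to touch for a while */
	char *buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	if (buf == MAP_FAILED)
		return 1;
	memset(buf, 0xaa, len);

	/*
	 * Hint that the range will not be used for a long time; the kernel
	 * may reclaim the pages right away (anonymous pages need swap), but
	 * the contents stay available and are pulled back in by major
	 * faults on later access.
	 */
	if (madvise(buf, len, MADV_PAGEOUT))
		perror("madvise(MADV_PAGEOUT)");

	printf("buf[0] = 0x%x\n", (unsigned char)buf[0]);
	munmap(buf, len);
	return 0;
}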

A note: the SWAP_CLUSTER_MAX limit on LRU page isolation is
intentionally not applied here because the batch size is already bounded
by the PMD size.  If the PMD size (e.g., 256 pages) turns out to cause
trouble, we can fix it later by limiting the batch to
SWAP_CLUSTER_MAX [1].

- man-page material

MADV_PAGEOUT (since Linux x.x)

Do not expect access in the near future, so pages in the specified
regions can be reclaimed immediately, regardless of memory pressure.
Accesses to the range after a successful operation may therefore incur
major page faults, but, unlike MADV_DONTNEED, the up-to-date contents
are never lost.  Pages belonging to a shared mapping are only processed
if write access is allowed for the calling process.

MADV_PAGEOUT cannot be applied to locked pages, Huge TLB pages, or
VM_PFNMAP pages.
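
As a rough illustration of these semantics (a hypothetical test, not
part of this patch), mincore(2) can be used to observe that pages in
the range may no longer be resident after a successful MADV_PAGEOUT
(on a swapless system anonymous pages may simply stay resident), while
rereading the range still returns the original data rather than the
zero-filled pages MADV_DONTNEED would give:

#define _DEFAULT_SOURCE
#include <assert.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/mman.h>

#ifndef MADV_PAGEOUT
#define MADV_PAGEOUT 21	/* value added by this patch */
#endif

int main(void)
{
	size_t page = sysconf(_SC_PAGESIZE);
	size_t npages = 256, len = npages * page, i, resident = 0;
	unsigned char vec[256];
	char *buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	if (buf == MAP_FAILED)
		return 1;
	memset(buf, 0x5a, len);

	if (madvise(buf, len, MADV_PAGEOUT))
		perror("madvise(MADV_PAGEOUT)");

	/* How many pages are still resident after the hint? */
	if (mincore(buf, len, vec) == 0) {
		for (i = 0; i < npages; i++)
			resident += vec[i] & 1;
		printf("%zu of %zu pages resident\n", resident, npages);
	}

	/* Unlike MADV_DONTNEED, the contents survive (via major faults). */
	assert(buf[0] == 0x5a && buf[len - 1] == 0x5a);

	munmap(buf, len);
	return 0;
}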

[1] https://lore.kernel.org/lkml/20190710194719.GS29695@dhcp22.suse.cz/

[minchan@kernel.org: clear PG_active on MADV_PAGEOUT]
Link: http://lkml.kernel.org/r/20190802200643.GA181880@google.com
[akpm@linux-foundation.org: resolve conflicts with hmm.git]
Link: http://lkml.kernel.org/r/20190726023435.214162-5-minchan@kernel.org
Signed-off-by: Minchan Kim <minchan@kernel.org>
Reported-by: kbuild test robot <lkp@intel.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Cc: James E.J. Bottomley <James.Bottomley@HansenPartnership.com>
Cc: Richard Henderson <rth@twiddle.net>
Cc: Ralf Baechle <ralf@linux-mips.org>
Cc: Chris Zankel <chris@zankel.net>
Cc: Daniel Colascione <dancol@google.com>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Hillf Danton <hdanton@sina.com>
Cc: Joel Fernandes (Google) <joel@joelfernandes.org>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Cc: Oleksandr Natalenko <oleksandr@redhat.com>
Cc: Shakeel Butt <shakeelb@google.com>
Cc: Sonny Rao <sonnyrao@google.com>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Tim Murray <timmurray@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
arch/alpha/include/uapi/asm/mman.h
arch/mips/include/uapi/asm/mman.h
arch/parisc/include/uapi/asm/mman.h
arch/xtensa/include/uapi/asm/mman.h
include/linux/swap.h
include/uapi/asm-generic/mman-common.h
mm/madvise.c
mm/vmscan.c

index f3258fbf03d0349d1431922d91b74aff5812f776..a18ec7f638880824247ebcf9bc3b587db9957e60 100644 (file)
@@ -69,6 +69,7 @@
 #define MADV_KEEPONFORK 19             /* Undo MADV_WIPEONFORK */
 
 #define MADV_COLD      20              /* deactivate these pages */
+#define MADV_PAGEOUT   21              /* reclaim these pages */
 
 /* compatibility flags */
 #define MAP_FILE       0
index 00ad09fc5eb16f0c478d5b7dfedda7fb6fc35cd1..57dc2ac4f8bda21700f71d4c80551cf8d31828c0 100644 (file)
@@ -96,6 +96,7 @@
 #define MADV_KEEPONFORK 19             /* Undo MADV_WIPEONFORK */
 
 #define MADV_COLD      20              /* deactivate these pages */
+#define MADV_PAGEOUT   21              /* reclaim these pages */
 
 /* compatibility flags */
 #define MAP_FILE       0
index eb14e3a7b8f378ed5b60377158d11c29338b1a8a..6fd8871e4081eff2ae302c7c00b18b51243ceb3c 100644 (file)
@@ -49,6 +49,7 @@
 #define MADV_DOFORK    11              /* do inherit across fork */
 
 #define MADV_COLD      20              /* deactivate these pages */
+#define MADV_PAGEOUT   21              /* reclaim these pages */
 
 #define MADV_MERGEABLE   65            /* KSM may merge identical pages */
 #define MADV_UNMERGEABLE 66            /* KSM may not merge identical pages */
index f926b00ff11f96a88b0d7b6dcab6d1217f1fbb63..e5e643752947591890d8b70552d9cc9aa3456023 100644 (file)
 #define MADV_KEEPONFORK 19             /* Undo MADV_WIPEONFORK */
 
 #define MADV_COLD      20              /* deactivate these pages */
+#define MADV_PAGEOUT   21              /* reclaim these pages */
 
 /* compatibility flags */
 #define MAP_FILE       0
index 0ce997edb8bbc44b5d330265e6253f05328b8018..063c0c1e112bdc9c9edb81ecbcbefd45bdc7ee57 100644 (file)
@@ -365,6 +365,7 @@ extern int vm_swappiness;
 extern int remove_mapping(struct address_space *mapping, struct page *page);
 extern unsigned long vm_total_pages;
 
+extern unsigned long reclaim_pages(struct list_head *page_list);
 #ifdef CONFIG_NUMA
 extern int node_reclaim_mode;
 extern int sysctl_min_unmapped_ratio;
index 23431faf0eb6e8509f3cf32a078ac698806b88df..c160a5354eb62b3b17de564be439451c812470ae 100644 (file)
@@ -68,6 +68,7 @@
 #define MADV_KEEPONFORK 19             /* Undo MADV_WIPEONFORK */
 
 #define MADV_COLD      20              /* deactivate these pages */
+#define MADV_PAGEOUT   21              /* reclaim these pages */
 
 /* compatibility flags */
 #define MAP_FILE       0
index e1aee62967c3a9015b22ebaff023d82ea838a27d..54c5639774b6630b40d171b9504288ec0d132808 100644 (file)
@@ -44,6 +44,7 @@ static int madvise_need_mmap_write(int behavior)
        case MADV_WILLNEED:
        case MADV_DONTNEED:
        case MADV_COLD:
+       case MADV_PAGEOUT:
        case MADV_FREE:
                return 0;
        default:
@@ -461,6 +462,191 @@ static long madvise_cold(struct vm_area_struct *vma,
        return 0;
 }
 
+static int madvise_pageout_pte_range(pmd_t *pmd, unsigned long addr,
+                               unsigned long end, struct mm_walk *walk)
+{
+       struct mmu_gather *tlb = walk->private;
+       struct mm_struct *mm = tlb->mm;
+       struct vm_area_struct *vma = walk->vma;
+       pte_t *orig_pte, *pte, ptent;
+       spinlock_t *ptl;
+       LIST_HEAD(page_list);
+       struct page *page;
+
+       if (fatal_signal_pending(current))
+               return -EINTR;
+
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+       if (pmd_trans_huge(*pmd)) {
+               pmd_t orig_pmd;
+               unsigned long next = pmd_addr_end(addr, end);
+
+               tlb_change_page_size(tlb, HPAGE_PMD_SIZE);
+               ptl = pmd_trans_huge_lock(pmd, vma);
+               if (!ptl)
+                       return 0;
+
+               orig_pmd = *pmd;
+               if (is_huge_zero_pmd(orig_pmd))
+                       goto huge_unlock;
+
+               if (unlikely(!pmd_present(orig_pmd))) {
+                       VM_BUG_ON(thp_migration_supported() &&
+                                       !is_pmd_migration_entry(orig_pmd));
+                       goto huge_unlock;
+               }
+
+               page = pmd_page(orig_pmd);
+               if (next - addr != HPAGE_PMD_SIZE) {
+                       int err;
+
+                       if (page_mapcount(page) != 1)
+                               goto huge_unlock;
+                       get_page(page);
+                       spin_unlock(ptl);
+                       lock_page(page);
+                       err = split_huge_page(page);
+                       unlock_page(page);
+                       put_page(page);
+                       if (!err)
+                               goto regular_page;
+                       return 0;
+               }
+
+               if (pmd_young(orig_pmd)) {
+                       pmdp_invalidate(vma, addr, pmd);
+                       orig_pmd = pmd_mkold(orig_pmd);
+
+                       set_pmd_at(mm, addr, pmd, orig_pmd);
+                       tlb_remove_tlb_entry(tlb, pmd, addr);
+               }
+
+               ClearPageReferenced(page);
+               test_and_clear_page_young(page);
+
+               if (!isolate_lru_page(page))
+                       list_add(&page->lru, &page_list);
+huge_unlock:
+               spin_unlock(ptl);
+               reclaim_pages(&page_list);
+               return 0;
+       }
+
+       if (pmd_trans_unstable(pmd))
+               return 0;
+regular_page:
+#endif
+       tlb_change_page_size(tlb, PAGE_SIZE);
+       orig_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
+       flush_tlb_batched_pending(mm);
+       arch_enter_lazy_mmu_mode();
+       for (; addr < end; pte++, addr += PAGE_SIZE) {
+               ptent = *pte;
+               if (!pte_present(ptent))
+                       continue;
+
+               page = vm_normal_page(vma, addr, ptent);
+               if (!page)
+                       continue;
+
+               /*
+                * Creating a THP page is expensive, so split one only if we
+                * are sure it is worth it, i.e. if we are the only owner.
+                */
+               if (PageTransCompound(page)) {
+                       if (page_mapcount(page) != 1)
+                               break;
+                       get_page(page);
+                       if (!trylock_page(page)) {
+                               put_page(page);
+                               break;
+                       }
+                       pte_unmap_unlock(orig_pte, ptl);
+                       if (split_huge_page(page)) {
+                               unlock_page(page);
+                               put_page(page);
+                               pte_offset_map_lock(mm, pmd, addr, &ptl);
+                               break;
+                       }
+                       unlock_page(page);
+                       put_page(page);
+                       pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
+                       pte--;
+                       addr -= PAGE_SIZE;
+                       continue;
+               }
+
+               VM_BUG_ON_PAGE(PageTransCompound(page), page);
+
+               if (pte_young(ptent)) {
+                       ptent = ptep_get_and_clear_full(mm, addr, pte,
+                                                       tlb->fullmm);
+                       ptent = pte_mkold(ptent);
+                       set_pte_at(mm, addr, pte, ptent);
+                       tlb_remove_tlb_entry(tlb, pte, addr);
+               }
+               ClearPageReferenced(page);
+               test_and_clear_page_young(page);
+
+               if (!isolate_lru_page(page))
+                       list_add(&page->lru, &page_list);
+       }
+
+       arch_leave_lazy_mmu_mode();
+       pte_unmap_unlock(orig_pte, ptl);
+       reclaim_pages(&page_list);
+       cond_resched();
+
+       return 0;
+}
+
+static void madvise_pageout_page_range(struct mmu_gather *tlb,
+                            struct vm_area_struct *vma,
+                            unsigned long addr, unsigned long end)
+{
+       tlb_start_vma(tlb, vma);
+       walk_page_range(vma->vm_mm, addr, end, &cold_walk_ops, NULL);
+       tlb_end_vma(tlb, vma);
+}
+
+static inline bool can_do_pageout(struct vm_area_struct *vma)
+{
+       if (vma_is_anonymous(vma))
+               return true;
+       if (!vma->vm_file)
+               return false;
+       /*
+        * Page out pagecache only for non-anonymous mappings that correspond
+        * to files the calling process could open for writing; otherwise we
+        * would be including shared non-exclusive mappings, which opens a
+        * side channel.
+        */
+       return inode_owner_or_capable(file_inode(vma->vm_file)) ||
+               inode_permission(file_inode(vma->vm_file), MAY_WRITE) == 0;
+}
+
+static long madvise_pageout(struct vm_area_struct *vma,
+                       struct vm_area_struct **prev,
+                       unsigned long start_addr, unsigned long end_addr)
+{
+       struct mm_struct *mm = vma->vm_mm;
+       struct mmu_gather tlb;
+
+       *prev = vma;
+       if (!can_madv_lru_vma(vma))
+               return -EINVAL;
+
+       if (!can_do_pageout(vma))
+               return 0;
+
+       lru_add_drain();
+       tlb_gather_mmu(&tlb, mm, start_addr, end_addr);
+       madvise_pageout_page_range(&tlb, vma, start_addr, end_addr);
+       tlb_finish_mmu(&tlb, start_addr, end_addr);
+
+       return 0;
+}
+
 static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
                                unsigned long end, struct mm_walk *walk)
 
@@ -843,6 +1029,8 @@ madvise_vma(struct vm_area_struct *vma, struct vm_area_struct **prev,
                return madvise_willneed(vma, prev, start, end);
        case MADV_COLD:
                return madvise_cold(vma, prev, start, end);
+       case MADV_PAGEOUT:
+               return madvise_pageout(vma, prev, start, end);
        case MADV_FREE:
        case MADV_DONTNEED:
                return madvise_dontneed_free(vma, prev, start, end, behavior);
@@ -865,6 +1053,7 @@ madvise_behavior_valid(int behavior)
        case MADV_DONTNEED:
        case MADV_FREE:
        case MADV_COLD:
+       case MADV_PAGEOUT:
 #ifdef CONFIG_KSM
        case MADV_MERGEABLE:
        case MADV_UNMERGEABLE:
index d8bbaf068c35007d7b1810ec2ef389f5d95e0d8d..e5d52d6a24aff1c7fccd292bb8a2455e1eec1d52 100644 (file)
@@ -2145,6 +2145,62 @@ static void shrink_active_list(unsigned long nr_to_scan,
                        nr_deactivate, nr_rotated, sc->priority, file);
 }
 
+unsigned long reclaim_pages(struct list_head *page_list)
+{
+       int nid = -1;
+       unsigned long nr_reclaimed = 0;
+       LIST_HEAD(node_page_list);
+       struct reclaim_stat dummy_stat;
+       struct page *page;
+       struct scan_control sc = {
+               .gfp_mask = GFP_KERNEL,
+               .priority = DEF_PRIORITY,
+               .may_writepage = 1,
+               .may_unmap = 1,
+               .may_swap = 1,
+       };
+
+       while (!list_empty(page_list)) {
+               page = lru_to_page(page_list);
+               if (nid == -1) {
+                       nid = page_to_nid(page);
+                       INIT_LIST_HEAD(&node_page_list);
+               }
+
+               if (nid == page_to_nid(page)) {
+                       ClearPageActive(page);
+                       list_move(&page->lru, &node_page_list);
+                       continue;
+               }
+
+               nr_reclaimed += shrink_page_list(&node_page_list,
+                                               NODE_DATA(nid),
+                                               &sc, 0,
+                                               &dummy_stat, false);
+               while (!list_empty(&node_page_list)) {
+                       page = lru_to_page(&node_page_list);
+                       list_del(&page->lru);
+                       putback_lru_page(page);
+               }
+
+               nid = -1;
+       }
+
+       if (!list_empty(&node_page_list)) {
+               nr_reclaimed += shrink_page_list(&node_page_list,
+                                               NODE_DATA(nid),
+                                               &sc, 0,
+                                               &dummy_stat, false);
+               while (!list_empty(&node_page_list)) {
+                       page = lru_to_page(&node_page_list);
+                       list_del(&page->lru);
+                       putback_lru_page(page);
+               }
+       }
+
+       return nr_reclaimed;
+}
+
 /*
  * The inactive anon list should be small enough that the VM never has
  * to do too much work.