Merge tag 'x86_mm_for_6.2_v2' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 4936a88bb26a71c2d70af8d81c1db797f04796a6..bd6637fcd8f9b19a69045e79e73d6b0c44d9f15b 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -54,6 +54,7 @@
 #include <linux/shmem_fs.h>
 #include <linux/ctype.h>
 #include <linux/debugfs.h>
+#include <linux/khugepaged.h>
 
 #include <asm/tlbflush.h>
 #include <asm/div64.h>
@@ -1020,31 +1021,52 @@ out:
        return freed;
 }
 
-static void drop_slab_node(int nid)
+static unsigned long drop_slab_node(int nid)
 {
-       unsigned long freed;
-       int shift = 0;
+       unsigned long freed = 0;
+       struct mem_cgroup *memcg = NULL;
 
+       memcg = mem_cgroup_iter(NULL, NULL, NULL);
        do {
-               struct mem_cgroup *memcg = NULL;
+               freed += shrink_slab(GFP_KERNEL, nid, memcg, 0);
+       } while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)) != NULL);
 
-               if (fatal_signal_pending(current))
-                       return;
+       return freed;
+}
 
+void drop_slab(void)
+{
+       int nid;
+       int shift = 0;
+       unsigned long freed;
+
+       do {
                freed = 0;
-               memcg = mem_cgroup_iter(NULL, NULL, NULL);
-               do {
-                       freed += shrink_slab(GFP_KERNEL, nid, memcg, 0);
-               } while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)) != NULL);
+               for_each_online_node(nid) {
+                       if (fatal_signal_pending(current))
+                               return;
+
+                       freed += drop_slab_node(nid);
+               }
        } while ((freed >> shift++) > 1);
 }
 
-void drop_slab(void)
+static int reclaimer_offset(void)
 {
-       int nid;
+       BUILD_BUG_ON(PGSTEAL_DIRECT - PGSTEAL_KSWAPD !=
+                       PGDEMOTE_DIRECT - PGDEMOTE_KSWAPD);
+       BUILD_BUG_ON(PGSTEAL_DIRECT - PGSTEAL_KSWAPD !=
+                       PGSCAN_DIRECT - PGSCAN_KSWAPD);
+       BUILD_BUG_ON(PGSTEAL_KHUGEPAGED - PGSTEAL_KSWAPD !=
+                       PGDEMOTE_KHUGEPAGED - PGDEMOTE_KSWAPD);
+       BUILD_BUG_ON(PGSTEAL_KHUGEPAGED - PGSTEAL_KSWAPD !=
+                       PGSCAN_KHUGEPAGED - PGSCAN_KSWAPD);
 
-       for_each_online_node(nid)
-               drop_slab_node(nid);
+       if (current_is_kswapd())
+               return 0;
+       if (current_is_khugepaged())
+               return PGSTEAL_KHUGEPAGED - PGSTEAL_KSWAPD;
+       return PGSTEAL_DIRECT - PGSTEAL_KSWAPD;
 }
 
 static inline int is_page_cache_freeable(struct folio *folio)
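
With the refactor above, the fatal-signal check now covers a whole pass over
all online nodes, and drop_slab() keeps looping only while each pass still
frees a meaningful amount: pass k continues only if it freed at least 2^k
objects ((freed >> shift++) > 1).

reclaimer_offset() relies on each per-reclaimer counter family keeping the
same kswapd/direct/khugepaged ordering in enum vm_event_item; the
BUILD_BUG_ON()s above assert that the offsets match across families. A
minimal sketch of the usage pattern the later hunks adopt:

	/* one offset picks the right variant in every counter family */
	item = PGSCAN_KSWAPD + reclaimer_offset();
	__count_vm_events(item, nr_scanned);
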
@@ -1346,11 +1368,10 @@ static int __remove_mapping(struct address_space *mapping, struct folio *folio,
        if (folio_test_swapcache(folio)) {
                swp_entry_t swap = folio_swap_entry(folio);
 
-               /* get a shadow entry before mem_cgroup_swapout() clears folio_memcg() */
                if (reclaimed && !mapping_exiting(mapping))
                        shadow = workingset_eviction(folio, target_memcg);
-               mem_cgroup_swapout(folio, swap);
                __delete_from_swap_cache(folio, swap, shadow);
+               mem_cgroup_swapout(folio, swap);
                xa_unlock_irq(&mapping->i_pages);
                put_swap_folio(folio, swap);
        } else {
@@ -1599,10 +1620,7 @@ static unsigned int demote_folio_list(struct list_head *demote_folios,
                      (unsigned long)&mtc, MIGRATE_ASYNC, MR_DEMOTION,
                      &nr_succeeded);
 
-       if (current_is_kswapd())
-               __count_vm_events(PGDEMOTE_KSWAPD, nr_succeeded);
-       else
-               __count_vm_events(PGDEMOTE_DIRECT, nr_succeeded);
+       __count_vm_events(PGDEMOTE_KSWAPD + reclaimer_offset(), nr_succeeded);
 
        return nr_succeeded;
 }
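
With reclaimer_offset() applied here and at the scan/steal sites below, each
event family is split three ways. Assuming the naming convention of the
existing counters carries over, the user-visible effect in /proc/vmstat is:

	pgsteal_kswapd  / pgsteal_direct  / pgsteal_khugepaged
	pgscan_kswapd   / pgscan_direct   / pgscan_khugepaged
	pgdemote_kswapd / pgdemote_direct / pgdemote_khugepaged
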
@@ -2069,10 +2087,29 @@ keep:
        nr_reclaimed += demote_folio_list(&demote_folios, pgdat);
        /* Folios that could not be demoted are still in @demote_folios */
        if (!list_empty(&demote_folios)) {
-               /* Folios which weren't demoted go back on @folio_list for retry: */
+               /* Folios which weren't demoted go back on @folio_list */
                list_splice_init(&demote_folios, folio_list);
-               do_demote_pass = false;
-               goto retry;
+
+               /*
+                * goto retry to reclaim the undemoted folios in folio_list if
+                * desired.
+                *
+                * Reclaiming directly from top tier nodes is not often desired
+                * because it breaks the LRU ordering: in general, memory
+                * should be reclaimed from lower tier nodes and demoted from
+                * top tier nodes.
+                *
+                * However, disabling reclaim from top tier nodes entirely
+                * would cause OOMs in edge scenarios where lower tier memory
+                * is unreclaimable for whatever reason, e.g. memory being
+                * mlocked or too hot to reclaim. We can, however, disable
+                * reclaim from top tier nodes for proactive reclaim, as that
+                * is not real memory pressure.
+                */
+               if (!sc->proactive) {
+                       do_demote_pass = false;
+                       goto retry;
+               }
        }
 
        pgactivate = stat->nr_activate[0] + stat->nr_activate[1];
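
sc->proactive is set only for userspace-initiated reclaim. A condensed,
approximate sketch of how a memory.reclaim write reaches this code (based on
memory_reclaim() in mm/memcontrol.c, error handling elided):

	/* memory.reclaim is not treated as real memory pressure */
	reclaim_options = MEMCG_RECLAIM_MAY_SWAP | MEMCG_RECLAIM_PROACTIVE;
	reclaimed = try_to_free_mem_cgroup_pages(memcg,
				nr_to_reclaim - nr_reclaimed,
				GFP_KERNEL, reclaim_options, NULL);
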
@@ -2475,7 +2512,7 @@ static unsigned long shrink_inactive_list(unsigned long nr_to_scan,
                                     &nr_scanned, sc, lru);
 
        __mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, nr_taken);
-       item = current_is_kswapd() ? PGSCAN_KSWAPD : PGSCAN_DIRECT;
+       item = PGSCAN_KSWAPD + reclaimer_offset();
        if (!cgroup_reclaim(sc))
                __count_vm_events(item, nr_scanned);
        __count_memcg_events(lruvec_memcg(lruvec), item, nr_scanned);
@@ -2492,14 +2529,14 @@ static unsigned long shrink_inactive_list(unsigned long nr_to_scan,
        move_folios_to_lru(lruvec, &folio_list);
 
        __mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, -nr_taken);
-       item = current_is_kswapd() ? PGSTEAL_KSWAPD : PGSTEAL_DIRECT;
+       item = PGSTEAL_KSWAPD + reclaimer_offset();
        if (!cgroup_reclaim(sc))
                __count_vm_events(item, nr_reclaimed);
        __count_memcg_events(lruvec_memcg(lruvec), item, nr_reclaimed);
        __count_vm_events(PGSTEAL_ANON + file, nr_reclaimed);
        spin_unlock_irq(&lruvec->lru_lock);
 
-       lru_note_cost(lruvec, file, stat.nr_pageout);
+       lru_note_cost(lruvec, file, stat.nr_pageout, nr_scanned - nr_reclaimed);
        mem_cgroup_uncharge_list(&folio_list);
        free_unref_page_list(&folio_list);
 
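lru_note_cost() gains a fourth argument in this series so that rotations,
not just actual pageout I/O, feed the anon/file balancing cost model;
nr_scanned - nr_reclaimed approximates the folios that were rotated rather
than reclaimed. The new prototype, for reference (as this series defines it):

	void lru_note_cost(struct lruvec *lruvec, bool file,
			   unsigned int nr_io, unsigned int nr_rotated);
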
@@ -2514,8 +2551,20 @@ static unsigned long shrink_inactive_list(unsigned long nr_to_scan,
         * the flushers simply cannot keep up with the allocation
         * rate. Nudge the flusher threads in case they are asleep.
         */
-       if (stat.nr_unqueued_dirty == nr_taken)
+       if (stat.nr_unqueued_dirty == nr_taken) {
                wakeup_flusher_threads(WB_REASON_VMSCAN);
+               /*
+                * For cgroupv1 dirty throttling is achieved by waking up
+                * the kernel flusher here and later waiting on folios
+                * which are in writeback to finish (see shrink_folio_list()).
+                *
+                * The flusher may not be able to issue writeback quickly
+                * enough for cgroupv1 writeback throttling to work
+                * on a large system.
+                */
+               if (!writeback_throttling_sane(sc))
+                       reclaim_throttle(pgdat, VMSCAN_THROTTLE_WRITEBACK);
+       }
 
        sc->nr.dirty += stat.nr_dirty;
        sc->nr.congested += stat.nr_congested;
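
For context, writeback_throttling_sane() reports whether reclaim can rely on
the writeback subsystem to throttle dirty pages by itself; it reads roughly
as follows (same file, unchanged by this patch):

	static bool writeback_throttling_sane(struct scan_control *sc)
	{
		if (!cgroup_reclaim(sc))
			return true;
	#ifdef CONFIG_CGROUP_WRITEBACK
		if (cgroup_subsys_on_dfl(memory_cgrp_subsys))
			return true;
	#endif
		return false;
	}
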
@@ -2639,6 +2688,8 @@ static void shrink_active_list(unsigned long nr_to_scan,
        __mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, -nr_taken);
        spin_unlock_irq(&lruvec->lru_lock);
 
+       if (nr_rotated)
+               lru_note_cost(lruvec, file, 0, nr_rotated);
        mem_cgroup_uncharge_list(&l_active);
        free_unref_page_list(&l_active);
        trace_mm_vmscan_lru_shrink_active(pgdat->node_id, nr_taken, nr_activate,
@@ -3133,7 +3184,7 @@ static struct lruvec *get_lruvec(struct mem_cgroup *memcg, int nid)
        if (memcg) {
                struct lruvec *lruvec = &memcg->nodeinfo[nid]->lruvec;
 
-               /* for hotadd_new_pgdat() */
+               /* see the comment in mem_cgroup_lruvec() */
                if (!lruvec->pgdat)
                        lruvec->pgdat = pgdat;
 
@@ -3142,7 +3193,7 @@ static struct lruvec *get_lruvec(struct mem_cgroup *memcg, int nid)
 #endif
        VM_WARN_ON_ONCE(!mem_cgroup_disabled());
 
-       return pgdat ? &pgdat->__lruvec : NULL;
+       return &pgdat->__lruvec;
 }
 
 static int get_swappiness(struct lruvec *lruvec, struct scan_control *sc)
@@ -3206,9 +3257,6 @@ void lru_gen_add_mm(struct mm_struct *mm)
        for_each_node_state(nid, N_MEMORY) {
                struct lruvec *lruvec = get_lruvec(memcg, nid);
 
-               if (!lruvec)
-                       continue;
-
                /* the first addition since the last iteration */
                if (lruvec->mm_state.tail == &mm_list->fifo)
                        lruvec->mm_state.tail = &mm->lru_gen.list;
@@ -3238,9 +3286,6 @@ void lru_gen_del_mm(struct mm_struct *mm)
        for_each_node(nid) {
                struct lruvec *lruvec = get_lruvec(memcg, nid);
 
-               if (!lruvec)
-                       continue;
-
                /* where the last iteration ended (exclusive) */
                if (lruvec->mm_state.tail == &mm->lru_gen.list)
                        lruvec->mm_state.tail = lruvec->mm_state.tail->next;
@@ -3975,7 +4020,7 @@ static void walk_pmd_range_locked(pud_t *pud, unsigned long next, struct vm_area
                        goto next;
 
                if (!pmd_trans_huge(pmd[i])) {
-                       if (IS_ENABLED(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG) &&
+                       if (arch_has_hw_nonleaf_pmd_young() &&
                            get_cap(LRU_GEN_NONLEAF_YOUNG))
                                pmdp_test_and_clear_young(vma, addr, pmd + i);
                        goto next;
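
arch_has_hw_nonleaf_pmd_young() turns the compile-time IS_ENABLED() test
into a runtime query, so an architecture can opt out dynamically (x86, for
instance, disables it when running under a hypervisor that does not set the
accessed bit in non-leaf entries). The generic fallback, paraphrased from
include/linux/pgtable.h, keeps the old behavior:

	#ifndef arch_has_hw_nonleaf_pmd_young
	static inline bool arch_has_hw_nonleaf_pmd_young(void)
	{
		return IS_ENABLED(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG);
	}
	#endif
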
@@ -4070,14 +4115,14 @@ restart:
 #endif
                walk->mm_stats[MM_NONLEAF_TOTAL]++;
 
-#ifdef CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG
-               if (get_cap(LRU_GEN_NONLEAF_YOUNG)) {
+               if (arch_has_hw_nonleaf_pmd_young() &&
+                   get_cap(LRU_GEN_NONLEAF_YOUNG)) {
                        if (!pmd_young(val))
                                continue;
 
                        walk_pmd_range_locked(pud, addr, vma, args, bitmap, &pos);
                }
-#endif
+
                if (!walk->force_scan && !test_bloom_filter(walk->lruvec, walk->max_seq, pmd + i))
                        continue;
 
@@ -4483,7 +4528,7 @@ static bool age_lruvec(struct lruvec *lruvec, struct scan_control *sc, unsigned
 
        mem_cgroup_calculate_protection(NULL, memcg);
 
-       if (mem_cgroup_below_min(memcg))
+       if (mem_cgroup_below_min(NULL, memcg))
                return false;
 
        need_aging = should_run_aging(lruvec, max_seq, min_seq, sc, swappiness, &nr_to_scan);
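
mem_cgroup_below_{min,low}() now take the reclaim target as a first argument
so that a memcg's own min/low protection is ignored while it is itself the
target of reclaim. A sketch of the intended semantics (helper and field
names assumed, not verified against this tree):

	/* protection never applies to the memcg being reclaimed */
	static inline bool mem_cgroup_unprotected(struct mem_cgroup *target,
						  struct mem_cgroup *memcg)
	{
		return mem_cgroup_disabled() || mem_cgroup_is_root(memcg) ||
		       memcg == target;
	}

	static inline bool mem_cgroup_below_min(struct mem_cgroup *target,
						struct mem_cgroup *memcg)
	{
		if (mem_cgroup_unprotected(target, memcg))
			return false;
		return READ_ONCE(memcg->memory.emin) >=
		       page_counter_read(&memcg->memory);
	}
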
@@ -4854,7 +4899,7 @@ static int scan_folios(struct lruvec *lruvec, struct scan_control *sc,
                        break;
        }
 
-       item = current_is_kswapd() ? PGSCAN_KSWAPD : PGSCAN_DIRECT;
+       item = PGSCAN_KSWAPD + reclaimer_offset();
        if (!cgroup_reclaim(sc)) {
                __count_vm_events(item, isolated);
                __count_vm_events(PGREFILL, sorted);
@@ -4968,10 +5013,13 @@ static int evict_folios(struct lruvec *lruvec, struct scan_control *sc, int swap
        int scanned;
        int reclaimed;
        LIST_HEAD(list);
+       LIST_HEAD(clean);
        struct folio *folio;
+       struct folio *next;
        enum vm_event_item item;
        struct reclaim_stat stat;
        struct lru_gen_mm_walk *walk;
+       bool skip_retry = false;
        struct mem_cgroup *memcg = lruvec_memcg(lruvec);
        struct pglist_data *pgdat = lruvec_pgdat(lruvec);
 
@@ -4988,20 +5036,37 @@ static int evict_folios(struct lruvec *lruvec, struct scan_control *sc, int swap
 
        if (list_empty(&list))
                return scanned;
-
+retry:
        reclaimed = shrink_folio_list(&list, pgdat, sc, &stat, false);
+       sc->nr_reclaimed += reclaimed;
 
-       list_for_each_entry(folio, &list, lru) {
-               /* restore LRU_REFS_FLAGS cleared by isolate_folio() */
-               if (folio_test_workingset(folio))
-                       folio_set_referenced(folio);
+       list_for_each_entry_safe_reverse(folio, next, &list, lru) {
+               if (!folio_evictable(folio)) {
+                       list_del(&folio->lru);
+                       folio_putback_lru(folio);
+                       continue;
+               }
 
-               /* don't add rejected pages to the oldest generation */
                if (folio_test_reclaim(folio) &&
-                   (folio_test_dirty(folio) || folio_test_writeback(folio)))
-                       folio_clear_active(folio);
-               else
-                       folio_set_active(folio);
+                   (folio_test_dirty(folio) || folio_test_writeback(folio))) {
+                       /* restore LRU_REFS_FLAGS cleared by isolate_folio() */
+                       if (folio_test_workingset(folio))
+                               folio_set_referenced(folio);
+                       continue;
+               }
+
+               if (skip_retry || folio_test_active(folio) || folio_test_referenced(folio) ||
+                   folio_mapped(folio) || folio_test_locked(folio) ||
+                   folio_test_dirty(folio) || folio_test_writeback(folio)) {
+                       /* don't add rejected folios to the oldest generation */
+                       set_mask_bits(&folio->flags, LRU_REFS_MASK | LRU_REFS_FLAGS,
+                                     BIT(PG_active));
+                       continue;
+               }
+
+               /* retry folios that may have missed folio_rotate_reclaimable() */
+               list_move(&folio->lru, &clean);
+               sc->nr_scanned -= folio_nr_pages(folio);
        }
 
        spin_lock_irq(&lruvec->lru_lock);
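
set_mask_bits() performs the flag update atomically; the call above is
equivalent to this non-atomic form:

	/* drop all reference/tier history and park the folio as active */
	folio->flags &= ~(LRU_REFS_MASK | LRU_REFS_FLAGS);
	folio->flags |= BIT(PG_active);
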
@@ -5012,7 +5077,7 @@ static int evict_folios(struct lruvec *lruvec, struct scan_control *sc, int swap
        if (walk && walk->batched)
                reset_batch_size(lruvec, walk);
 
-       item = current_is_kswapd() ? PGSTEAL_KSWAPD : PGSTEAL_DIRECT;
+       item = PGSTEAL_KSWAPD + reclaimer_offset();
        if (!cgroup_reclaim(sc))
                __count_vm_events(item, reclaimed);
        __count_memcg_events(memcg, item, reclaimed);
@@ -5023,7 +5088,13 @@ static int evict_folios(struct lruvec *lruvec, struct scan_control *sc, int swap
        mem_cgroup_uncharge_list(&list);
        free_unref_page_list(&list);
 
-       sc->nr_reclaimed += reclaimed;
+       INIT_LIST_HEAD(&list);
+       list_splice_init(&clean, &list);
+
+       if (!list_empty(&list)) {
+               skip_retry = true;
+               goto retry;
+       }
 
        if (need_swapping && type == LRU_GEN_ANON)
                *need_swapping = true;
@@ -5044,8 +5115,9 @@ static unsigned long get_nr_to_scan(struct lruvec *lruvec, struct scan_control *
        DEFINE_MAX_SEQ(lruvec);
        DEFINE_MIN_SEQ(lruvec);
 
-       if (mem_cgroup_below_min(memcg) ||
-           (mem_cgroup_below_low(memcg) && !sc->memcg_low_reclaim))
+       if (mem_cgroup_below_min(sc->target_mem_cgroup, memcg) ||
+           (mem_cgroup_below_low(sc->target_mem_cgroup, memcg) &&
+            !sc->memcg_low_reclaim))
                return 0;
 
        *need_aging = should_run_aging(lruvec, max_seq, min_seq, sc, can_swap, &nr_to_scan);
@@ -5286,9 +5358,6 @@ static void lru_gen_change_state(bool enabled)
                for_each_node(nid) {
                        struct lruvec *lruvec = get_lruvec(memcg, nid);
 
-                       if (!lruvec)
-                               continue;
-
                        spin_lock_irq(&lruvec->lru_lock);
 
                        VM_WARN_ON_ONCE(!seq_is_valid(lruvec));
@@ -5351,10 +5420,10 @@ static ssize_t show_enabled(struct kobject *kobj, struct kobj_attribute *attr, c
        if (arch_has_hw_pte_young() && get_cap(LRU_GEN_MM_WALK))
                caps |= BIT(LRU_GEN_MM_WALK);
 
-       if (IS_ENABLED(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG) && get_cap(LRU_GEN_NONLEAF_YOUNG))
+       if (arch_has_hw_nonleaf_pmd_young() && get_cap(LRU_GEN_NONLEAF_YOUNG))
                caps |= BIT(LRU_GEN_NONLEAF_YOUNG);
 
-       return snprintf(buf, PAGE_SIZE, "0x%04x\n", caps);
+       return sysfs_emit(buf, "0x%04x\n", caps);
 }
 
 /* see Documentation/admin-guide/mm/multigen_lru.rst for details */
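
sysfs_emit() is the preferred replacement for snprintf() in sysfs show()
methods: it knows the buffer is PAGE_SIZE bytes and warns when the buffer is
not page-aligned. Its declaration, for reference:

	__printf(2, 3)
	int sysfs_emit(char *buf, const char *fmt, ...);
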
@@ -5841,8 +5910,8 @@ static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
        enum lru_list lru;
        unsigned long nr_reclaimed = 0;
        unsigned long nr_to_reclaim = sc->nr_to_reclaim;
+       bool proportional_reclaim;
        struct blk_plug plug;
-       bool scan_adjusted;
 
        if (lru_gen_enabled()) {
                lru_gen_shrink_lruvec(lruvec, sc);
@@ -5865,8 +5934,8 @@ static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
         * abort proportional reclaim if either the file or anon lru has already
         * dropped to zero at the first pass.
         */
-       scan_adjusted = (!cgroup_reclaim(sc) && !current_is_kswapd() &&
-                        sc->priority == DEF_PRIORITY);
+       proportional_reclaim = (!cgroup_reclaim(sc) && !current_is_kswapd() &&
+                               sc->priority == DEF_PRIORITY);
 
        blk_start_plug(&plug);
        while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] ||
@@ -5886,7 +5955,7 @@ static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
 
                cond_resched();
 
-               if (nr_reclaimed < nr_to_reclaim || scan_adjusted)
+               if (nr_reclaimed < nr_to_reclaim || proportional_reclaim)
                        continue;
 
                /*
@@ -5937,8 +6006,6 @@ static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
                nr_scanned = targets[lru] - nr[lru];
                nr[lru] = targets[lru] * (100 - percentage) / 100;
                nr[lru] -= min(nr[lru], nr_scanned);
-
-               scan_adjusted = true;
        }
        blk_finish_plug(&plug);
        sc->nr_reclaimed += nr_reclaimed;
@@ -6045,13 +6112,13 @@ static void shrink_node_memcgs(pg_data_t *pgdat, struct scan_control *sc)
 
                mem_cgroup_calculate_protection(target_memcg, memcg);
 
-               if (mem_cgroup_below_min(memcg)) {
+               if (mem_cgroup_below_min(target_memcg, memcg)) {
                        /*
                         * Hard protection.
                         * If there is no reclaimable memory, OOM.
                         */
                        continue;
-               } else if (mem_cgroup_below_low(memcg)) {
+               } else if (mem_cgroup_below_low(target_memcg, memcg)) {
                        /*
                         * Soft protection.
                         * Respect the protection only as long as
@@ -6687,7 +6754,8 @@ unsigned long mem_cgroup_shrink_node(struct mem_cgroup *memcg,
 unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
                                           unsigned long nr_pages,
                                           gfp_t gfp_mask,
-                                          unsigned int reclaim_options)
+                                          unsigned int reclaim_options,
+                                          nodemask_t *nodemask)
 {
        unsigned long nr_reclaimed;
        unsigned int noreclaim_flag;
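
The new nodemask parameter lets a caller restrict reclaim to a set of nodes
(used by the nodes= argument this series adds to memory.reclaim); existing
callers keep today's behavior by passing NULL. A sketch of a restricted
caller, assuming the node list was parsed elsewhere:

	nodemask_t mask = NODE_MASK_NONE;

	node_set(2, mask);	/* hypothetical: user asked for nodes 2-3 */
	node_set(3, mask);
	nr_reclaimed = try_to_free_mem_cgroup_pages(memcg, nr_pages,
					GFP_KERNEL, reclaim_options, &mask);
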
@@ -6702,6 +6770,7 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
                .may_unmap = 1,
                .may_swap = !!(reclaim_options & MEMCG_RECLAIM_MAY_SWAP),
                .proactive = !!(reclaim_options & MEMCG_RECLAIM_PROACTIVE),
+               .nodemask = nodemask,
        };
        /*
         * Traverse the ZONELIST_FALLBACK zonelist of the current node to put