mm/vmscan.c

   1 // SPDX-License-Identifier: GPL-2.0
   2 /*
   3  *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
   4  *
   5  *  Swap reorganised 29.12.95, Stephen Tweedie.
   6  *  kswapd added: 7.1.96  sct
   7  *  Removed kswapd_ctl limits, and swap out as many pages as needed
   8  *  to bring the system back to freepages.high: 2.4.97, Rik van Riel.
   9  *  Zone aware kswapd started 02/00, Kanoj Sarcar (kanoj@sgi.com).
  10  *  Multiqueue VM started 5.8.00, Rik van Riel.
  11  */
  12
  13 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
  14
  15 #include <linux/mm.h>
  16 #include <linux/sched/mm.h>
  17 #include <linux/module.h>
  18 #include <linux/gfp.h>
  19 #include <linux/kernel_stat.h>
  20 #include <linux/swap.h>
  21 #include <linux/pagemap.h>
  22 #include <linux/init.h>
  23 #include <linux/highmem.h>
  24 #include <linux/vmpressure.h>
  25 #include <linux/vmstat.h>
  26 #include <linux/file.h>
  27 #include <linux/writeback.h>
  28 #include <linux/blkdev.h>
  29 #include <linux/buffer_head.h>  /* for buffer_heads_over_limit */
  30 #include <linux/mm_inline.h>
  31 #include <linux/backing-dev.h>
  32 #include <linux/rmap.h>
  33 #include <linux/topology.h>
  34 #include <linux/cpu.h>
  35 #include <linux/cpuset.h>
  36 #include <linux/compaction.h>
  37 #include <linux/notifier.h>
  38 #include <linux/rwsem.h>
  39 #include <linux/delay.h>
  40 #include <linux/kthread.h>
  41 #include <linux/freezer.h>
  42 #include <linux/memcontrol.h>
  43 #include <linux/migrate.h>
  44 #include <linux/delayacct.h>
  45 #include <linux/sysctl.h>
  46 #include <linux/oom.h>
  47 #include <linux/pagevec.h>
  48 #include <linux/prefetch.h>
  49 #include <linux/printk.h>
  50 #include <linux/dax.h>
  51 #include <linux/psi.h>
  52
  53 #include <asm/tlbflush.h>
  54 #include <asm/div64.h>
  55
  56 #include <linux/swapops.h>
  57 #include <linux/balloon_compaction.h>
  58 #include <linux/sched/sysctl.h>
  59
  60 #include "internal.h"
  61 #include "swap.h"
  62
  63 #define CREATE_TRACE_POINTS
  64 #include <trace/events/vmscan.h>
  65
  66 struct scan_control {
  67         /* How many pages shrink_list() should reclaim */
  68         unsigned long nr_to_reclaim;
  69
  70         /*
  71          * Nodemask of nodes allowed by the caller. If NULL, all nodes
  72          * are scanned.
  73          */
  74         nodemask_t      *nodemask;
  75
  76         /*
  77          * The memory cgroup that hit its limit and as a result is the
  78          * primary target of this reclaim invocation.
  79          */
  80         struct mem_cgroup *target_mem_cgroup;
  81
  82         /*
  83          * Scan pressure balancing between anon and file LRUs
  84          */
  85         unsigned long   anon_cost;
  86         unsigned long   file_cost;
  87
  88         /* Can active pages be deactivated as part of reclaim? */
  89 #define DEACTIVATE_ANON 1
  90 #define DEACTIVATE_FILE 2
  91         unsigned int may_deactivate:2;
  92         unsigned int force_deactivate:1;
  93         unsigned int skipped_deactivate:1;
  94
  95         /* Writepage batching in laptop mode; RECLAIM_WRITE */
  96         unsigned int may_writepage:1;
  97
  98         /* Can mapped pages be reclaimed? */
  99         unsigned int may_unmap:1;
 100
 101         /* Can pages be swapped as part of reclaim? */
 102         unsigned int may_swap:1;
 103
 104         /* Proactive reclaim invoked by userspace through memory.reclaim */
 105         unsigned int proactive:1;
 106
 107         /*
 108          * Cgroup memory below memory.low is protected as long as we
 109          * don't threaten to OOM. If any cgroup is reclaimed at
 110          * reduced force or passed over entirely due to its memory.low
 111          * setting (memcg_low_skipped), and nothing is reclaimed as a
 112          * result, then go back for one more cycle that reclaims the protected
 113          * memory (memcg_low_reclaim) to avert OOM.
 114          */
 115         unsigned int memcg_low_reclaim:1;
 116         unsigned int memcg_low_skipped:1;
 117
 118         unsigned int hibernation_mode:1;
 119
 120         /* One of the zones is ready for compaction */
 121         unsigned int compaction_ready:1;
 122
 123         /* There is easily reclaimable cold cache in the current node */
 124         unsigned int cache_trim_mode:1;
 125
 126         /* The file pages on the current node are dangerously low */
 127         unsigned int file_is_tiny:1;
 128
 129         /* Always discard instead of demoting to lower tier memory */
 130         unsigned int no_demotion:1;
 131
 132         /* Allocation order */
 133         s8 order;
 134
 135         /* Scan (total_size >> priority) pages at once */
 136         s8 priority;
 137
 138         /* The highest zone to isolate pages for reclaim from */
 139         s8 reclaim_idx;
 140
 141         /* This context's GFP mask */
 142         gfp_t gfp_mask;
 143
 144         /* Incremented by the number of inactive pages that were scanned */
 145         unsigned long nr_scanned;
 146
 147         /* Number of pages freed so far during a call to shrink_zones() */
 148         unsigned long nr_reclaimed;
 149
 150         struct {
 151                 unsigned int dirty;
 152                 unsigned int unqueued_dirty;
 153                 unsigned int congested;
 154                 unsigned int writeback;
 155                 unsigned int immediate;
 156                 unsigned int file_taken;
 157                 unsigned int taken;
 158         } nr;
 159
 160         /* for recording the reclaimed slab by now */
 161         struct reclaim_state reclaim_state;
 162 };
 163
 164 #ifdef ARCH_HAS_PREFETCHW
 165 #define prefetchw_prev_lru_folio(_folio, _base, _field)                 \
 166         do {                                                            \
 167                 if ((_folio)->lru.prev != _base) {                      \
 168                         struct folio *prev;                             \
 169                                                                         \
 170                         prev = lru_to_folio(&(_folio->lru));            \
 171                         prefetchw(&prev->_field);                       \
 172                 }                                                       \
 173         } while (0)
 174 #else
 175 #define prefetchw_prev_lru_folio(_folio, _base, _field) do { } while (0)
 176 #endif
 177
 178 /*
 179  * From 0 .. 200.  Higher means more swappy.
 180  */
 181 int vm_swappiness = 60;
 182
 183 static void set_task_reclaim_state(struct task_struct *task,
 184                                    struct reclaim_state *rs)
 185 {
 186         /* Check for an overwrite */
 187         WARN_ON_ONCE(rs && task->reclaim_state);
 188
 189         /* Check for the nulling of an already-nulled member */
 190         WARN_ON_ONCE(!rs && !task->reclaim_state);
 191
 192         task->reclaim_state = rs;
 193 }
 194
 195 LIST_HEAD(shrinker_list);
 196 DECLARE_RWSEM(shrinker_rwsem);
 197
 198 #ifdef CONFIG_MEMCG
 199 static int shrinker_nr_max;
 200
 201 /* The shrinker_info is expanded in a batch of BITS_PER_LONG */
 202 static inline int shrinker_map_size(int nr_items)
 203 {
 204         return (DIV_ROUND_UP(nr_items, BITS_PER_LONG) * sizeof(unsigned long));
 205 }
 206
 207 static inline int shrinker_defer_size(int nr_items)
 208 {
 209         return (round_up(nr_items, BITS_PER_LONG) * sizeof(atomic_long_t));
 210 }
 211
 212 static struct shrinker_info *shrinker_info_protected(struct mem_cgroup *memcg,
 213                                                      int nid)
 214 {
 215         return rcu_dereference_protected(memcg->nodeinfo[nid]->shrinker_info,
 216                                          lockdep_is_held(&shrinker_rwsem));
 217 }
 218
 219 static int expand_one_shrinker_info(struct mem_cgroup *memcg,
 220                                     int map_size, int defer_size,
 221                                     int old_map_size, int old_defer_size)
 222 {
 223         struct shrinker_info *new, *old;
 224         struct mem_cgroup_per_node *pn;
 225         int nid;
 226         int size = map_size + defer_size;
 227
 228         for_each_node(nid) {
 229                 pn = memcg->nodeinfo[nid];
 230                 old = shrinker_info_protected(memcg, nid);
 231                 /* Not yet online memcg */
 232                 if (!old)
 233                         return 0;
 234
 235                 new = kvmalloc_node(sizeof(*new) + size, GFP_KERNEL, nid);
 236                 if (!new)
 237                         return -ENOMEM;
 238
 239                 new->nr_deferred = (atomic_long_t *)(new + 1);
 240                 new->map = (void *)new->nr_deferred + defer_size;
 241
 242                 /* map: set all old bits, clear all new bits */
 243                 memset(new->map, (int)0xff, old_map_size);
 244                 memset((void *)new->map + old_map_size, 0, map_size - old_map_size);
 245                 /* nr_deferred: copy old values, clear all new values */
 246                 memcpy(new->nr_deferred, old->nr_deferred, old_defer_size);
 247                 memset((void *)new->nr_deferred + old_defer_size, 0,
 248                        defer_size - old_defer_size);
 249
 250                 rcu_assign_pointer(pn->shrinker_info, new);
 251                 kvfree_rcu(old, rcu);
 252         }
 253
 254         return 0;
 255 }
 256
 257 void free_shrinker_info(struct mem_cgroup *memcg)
 258 {
 259         struct mem_cgroup_per_node *pn;
 260         struct shrinker_info *info;
 261         int nid;
 262
 263         for_each_node(nid) {
 264                 pn = memcg->nodeinfo[nid];
 265                 info = rcu_dereference_protected(pn->shrinker_info, true);
 266                 kvfree(info);
 267                 rcu_assign_pointer(pn->shrinker_info, NULL);
 268         }
 269 }
 270
 271 int alloc_shrinker_info(struct mem_cgroup *memcg)
 272 {
 273         struct shrinker_info *info;
 274         int nid, size, ret = 0;
 275         int map_size, defer_size = 0;
 276
 277         down_write(&shrinker_rwsem);
 278         map_size = shrinker_map_size(shrinker_nr_max);
 279         defer_size = shrinker_defer_size(shrinker_nr_max);
 280         size = map_size + defer_size;
 281         for_each_node(nid) {
 282                 info = kvzalloc_node(sizeof(*info) + size, GFP_KERNEL, nid);
 283                 if (!info) {
 284                         free_shrinker_info(memcg);
 285                         ret = -ENOMEM;
 286                         break;
 287                 }
 288                 info->nr_deferred = (atomic_long_t *)(info + 1);
 289                 info->map = (void *)info->nr_deferred + defer_size;
 290                 rcu_assign_pointer(memcg->nodeinfo[nid]->shrinker_info, info);
 291         }
 292         up_write(&shrinker_rwsem);
 293
 294         return ret;
 295 }
 296
 297 static inline bool need_expand(int nr_max)
 298 {
 299         return round_up(nr_max, BITS_PER_LONG) >
 300                round_up(shrinker_nr_max, BITS_PER_LONG);
 301 }
 302
 303 static int expand_shrinker_info(int new_id)
 304 {
 305         int ret = 0;
 306         int new_nr_max = new_id + 1;
 307         int map_size, defer_size = 0;
 308         int old_map_size, old_defer_size = 0;
 309         struct mem_cgroup *memcg;
 310
 311         if (!need_expand(new_nr_max))
 312                 goto out;
 313
 314         if (!root_mem_cgroup)
 315                 goto out;
 316
 317         lockdep_assert_held(&shrinker_rwsem);
 318
 319         map_size = shrinker_map_size(new_nr_max);
 320         defer_size = shrinker_defer_size(new_nr_max);
 321         old_map_size = shrinker_map_size(shrinker_nr_max);
 322         old_defer_size = shrinker_defer_size(shrinker_nr_max);
 323
 324         memcg = mem_cgroup_iter(NULL, NULL, NULL);
 325         do {
 326                 ret = expand_one_shrinker_info(memcg, map_size, defer_size,
 327                                                old_map_size, old_defer_size);
 328                 if (ret) {
 329                         mem_cgroup_iter_break(NULL, memcg);
 330                         goto out;
 331                 }
 332         } while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)) != NULL);
 333 out:
 334         if (!ret)
 335                 shrinker_nr_max = new_nr_max;
 336
 337         return ret;
 338 }
 339
 340 void set_shrinker_bit(struct mem_cgroup *memcg, int nid, int shrinker_id)
 341 {
 342         if (shrinker_id >= 0 && memcg && !mem_cgroup_is_root(memcg)) {
 343                 struct shrinker_info *info;
 344
 345                 rcu_read_lock();
 346                 info = rcu_dereference(memcg->nodeinfo[nid]->shrinker_info);
 347                 /* Pairs with smp mb in shrink_slab() */
 348                 smp_mb__before_atomic();
 349                 set_bit(shrinker_id, info->map);
 350                 rcu_read_unlock();
 351         }
 352 }
 353
 354 static DEFINE_IDR(shrinker_idr);
 355
 356 static int prealloc_memcg_shrinker(struct shrinker *shrinker)
 357 {
 358         int id, ret = -ENOMEM;
 359
 360         if (mem_cgroup_disabled())
 361                 return -ENOSYS;
 362
 363         down_write(&shrinker_rwsem);
 364         /* This may call shrinker, so it must use down_read_trylock() */
 365         id = idr_alloc(&shrinker_idr, shrinker, 0, 0, GFP_KERNEL);
 366         if (id < 0)
 367                 goto unlock;
 368
 369         if (id >= shrinker_nr_max) {
 370                 if (expand_shrinker_info(id)) {
 371                         idr_remove(&shrinker_idr, id);
 372                         goto unlock;
 373                 }
 374         }
 375         shrinker->id = id;
 376         ret = 0;
 377 unlock:
 378         up_write(&shrinker_rwsem);
 379         return ret;
 380 }
 381
 382 static void unregister_memcg_shrinker(struct shrinker *shrinker)
 383 {
 384         int id = shrinker->id;
 385
 386         BUG_ON(id < 0);
 387
 388         lockdep_assert_held(&shrinker_rwsem);
 389
 390         idr_remove(&shrinker_idr, id);
 391 }
 392
 393 static long xchg_nr_deferred_memcg(int nid, struct shrinker *shrinker,
 394                                    struct mem_cgroup *memcg)
 395 {
 396         struct shrinker_info *info;
 397
 398         info = shrinker_info_protected(memcg, nid);
 399         return atomic_long_xchg(&info->nr_deferred[shrinker->id], 0);
 400 }
 401
 402 static long add_nr_deferred_memcg(long nr, int nid, struct shrinker *shrinker,
 403                                   struct mem_cgroup *memcg)
 404 {
 405         struct shrinker_info *info;
 406
 407         info = shrinker_info_protected(memcg, nid);
 408         return atomic_long_add_return(nr, &info->nr_deferred[shrinker->id]);
 409 }
 410
 411 void reparent_shrinker_deferred(struct mem_cgroup *memcg)
 412 {
 413         int i, nid;
 414         long nr;
 415         struct mem_cgroup *parent;
 416         struct shrinker_info *child_info, *parent_info;
 417
 418         parent = parent_mem_cgroup(memcg);
 419         if (!parent)
 420                 parent = root_mem_cgroup;
 421
 422         /* Prevent from concurrent shrinker_info expand */
 423         down_read(&shrinker_rwsem);
 424         for_each_node(nid) {
 425                 child_info = shrinker_info_protected(memcg, nid);
 426                 parent_info = shrinker_info_protected(parent, nid);
 427                 for (i = 0; i < shrinker_nr_max; i++) {
 428                         nr = atomic_long_read(&child_info->nr_deferred[i]);
 429                         atomic_long_add(nr, &parent_info->nr_deferred[i]);
 430                 }
 431         }
 432         up_read(&shrinker_rwsem);
 433 }
 434
 435 static bool cgroup_reclaim(struct scan_control *sc)
 436 {
 437         return sc->target_mem_cgroup;
 438 }
 439
 440 /**
 441  * writeback_throttling_sane - is the usual dirty throttling mechanism available?
 442  * @sc: scan_control in question
 443  *
 444  * The normal page dirty throttling mechanism in balance_dirty_pages() is
 445  * completely broken with the legacy memcg and direct stalling in
 446  * shrink_page_list() is used for throttling instead, which lacks all the
 447  * niceties such as fairness, adaptive pausing, bandwidth proportional
 448  * allocation and configurability.
 449  *
 450  * This function tests whether the vmscan currently in progress can assume
 451  * that the normal dirty throttling mechanism is operational.
 452  */
 453 static bool writeback_throttling_sane(struct scan_control *sc)
 454 {
 455         if (!cgroup_reclaim(sc))
 456                 return true;
 457 #ifdef CONFIG_CGROUP_WRITEBACK
 458         if (cgroup_subsys_on_dfl(memory_cgrp_subsys))
 459                 return true;
 460 #endif
 461         return false;
 462 }
 463 #else
 464 static int prealloc_memcg_shrinker(struct shrinker *shrinker)
 465 {
 466         return -ENOSYS;
 467 }
 468
 469 static void unregister_memcg_shrinker(struct shrinker *shrinker)
 470 {
 471 }
 472
 473 static long xchg_nr_deferred_memcg(int nid, struct shrinker *shrinker,
 474                                    struct mem_cgroup *memcg)
 475 {
 476         return 0;
 477 }
 478
 479 static long add_nr_deferred_memcg(long nr, int nid, struct shrinker *shrinker,
 480                                   struct mem_cgroup *memcg)
 481 {
 482         return 0;
 483 }
 484
 485 static bool cgroup_reclaim(struct scan_control *sc)
 486 {
 487         return false;
 488 }
 489
 490 static bool writeback_throttling_sane(struct scan_control *sc)
 491 {
 492         return true;
 493 }
 494 #endif
 495
 496 static long xchg_nr_deferred(struct shrinker *shrinker,
 497                              struct shrink_control *sc)
 498 {
 499         int nid = sc->nid;
 500
 501         if (!(shrinker->flags & SHRINKER_NUMA_AWARE))
 502                 nid = 0;
 503
 504         if (sc->memcg &&
 505             (shrinker->flags & SHRINKER_MEMCG_AWARE))
 506                 return xchg_nr_deferred_memcg(nid, shrinker,
 507                                               sc->memcg);
 508
 509         return atomic_long_xchg(&shrinker->nr_deferred[nid], 0);
 510 }
 511
 512
 513 static long add_nr_deferred(long nr, struct shrinker *shrinker,
 514                             struct shrink_control *sc)
 515 {
 516         int nid = sc->nid;
 517
 518         if (!(shrinker->flags & SHRINKER_NUMA_AWARE))
 519                 nid = 0;
 520
 521         if (sc->memcg &&
 522             (shrinker->flags & SHRINKER_MEMCG_AWARE))
 523                 return add_nr_deferred_memcg(nr, nid, shrinker,
 524                                              sc->memcg);
 525
 526         return atomic_long_add_return(nr, &shrinker->nr_deferred[nid]);
 527 }
 528
 529 static bool can_demote(int nid, struct scan_control *sc)
 530 {
 531         if (!numa_demotion_enabled)
 532                 return false;
 533         if (sc && sc->no_demotion)
 534                 return false;
 535         if (next_demotion_node(nid) == NUMA_NO_NODE)
 536                 return false;
 537
 538         return true;
 539 }
 540
 541 static inline bool can_reclaim_anon_pages(struct mem_cgroup *memcg,
 542                                           int nid,
 543                                           struct scan_control *sc)
 544 {
 545         if (memcg == NULL) {
 546                 /*
 547                  * For non-memcg reclaim, is there
 548                  * space in any swap device?
 549                  */
 550                 if (get_nr_swap_pages() > 0)
 551                         return true;
 552         } else {
 553                 /* Is the memcg below its swap limit? */
 554                 if (mem_cgroup_get_nr_swap_pages(memcg) > 0)
 555                         return true;
 556         }
 557
 558         /*
 559          * The page can not be swapped.
 560          *
 561          * Can it be reclaimed from this node via demotion?
 562          */
 563         return can_demote(nid, sc);
 564 }
 565
 566 /*
 567  * This misses isolated pages which are not accounted for to save counters.
 568  * As the data only determines if reclaim or compaction continues, it is
 569  * not expected that isolated pages will be a dominating factor.
 570  */
 571 unsigned long zone_reclaimable_pages(struct zone *zone)
 572 {
 573         unsigned long nr;
 574
 575         nr = zone_page_state_snapshot(zone, NR_ZONE_INACTIVE_FILE) +
 576                 zone_page_state_snapshot(zone, NR_ZONE_ACTIVE_FILE);
 577         if (can_reclaim_anon_pages(NULL, zone_to_nid(zone), NULL))
 578                 nr += zone_page_state_snapshot(zone, NR_ZONE_INACTIVE_ANON) +
 579                         zone_page_state_snapshot(zone, NR_ZONE_ACTIVE_ANON);
 580
 581         return nr;
 582 }
 583
 584 /**
 585  * lruvec_lru_size -  Returns the number of pages on the given LRU list.
 586  * @lruvec: lru vector
 587  * @lru: lru to use
 588  * @zone_idx: zones to consider (use MAX_NR_ZONES - 1 for the whole LRU list)
 589  */
 590 static unsigned long lruvec_lru_size(struct lruvec *lruvec, enum lru_list lru,
 591                                      int zone_idx)
 592 {
 593         unsigned long size = 0;
 594         int zid;
 595
 596         for (zid = 0; zid <= zone_idx; zid++) {
 597                 struct zone *zone = &lruvec_pgdat(lruvec)->node_zones[zid];
 598
 599                 if (!managed_zone(zone))
 600                         continue;
 601
 602                 if (!mem_cgroup_disabled())
 603                         size += mem_cgroup_get_zone_lru_size(lruvec, lru, zid);
 604                 else
 605                         size += zone_page_state(zone, NR_ZONE_LRU_BASE + lru);
 606         }
 607         return size;
 608 }
 609
 610 /*
 611  * Add a shrinker callback to be called from the vm.
 612  */
 613 static int __prealloc_shrinker(struct shrinker *shrinker)
 614 {
 615         unsigned int size;
 616         int err;
 617
 618         if (shrinker->flags & SHRINKER_MEMCG_AWARE) {
 619                 err = prealloc_memcg_shrinker(shrinker);
 620                 if (err != -ENOSYS)
 621                         return err;
 622
 623                 shrinker->flags &= ~SHRINKER_MEMCG_AWARE;
 624         }
 625
 626         size = sizeof(*shrinker->nr_deferred);
 627         if (shrinker->flags & SHRINKER_NUMA_AWARE)
 628                 size *= nr_node_ids;
 629
 630         shrinker->nr_deferred = kzalloc(size, GFP_KERNEL);
 631         if (!shrinker->nr_deferred)
 632                 return -ENOMEM;
 633
 634         return 0;
 635 }
 636
 637 #ifdef CONFIG_SHRINKER_DEBUG
 638 int prealloc_shrinker(struct shrinker *shrinker, const char *fmt, ...)
 639 {
 640         va_list ap;
 641         int err;
 642
 643         va_start(ap, fmt);
 644         shrinker->name = kvasprintf_const(GFP_KERNEL, fmt, ap);
 645         va_end(ap);
 646         if (!shrinker->name)
 647                 return -ENOMEM;
 648
 649         err = __prealloc_shrinker(shrinker);
 650         if (err) {
 651                 kfree_const(shrinker->name);
 652                 shrinker->name = NULL;
 653         }
 654
 655         return err;
 656 }
 657 #else
 658 int prealloc_shrinker(struct shrinker *shrinker, const char *fmt, ...)
 659 {
 660         return __prealloc_shrinker(shrinker);
 661 }
 662 #endif
 663
 664 void free_prealloced_shrinker(struct shrinker *shrinker)
 665 {
 666 #ifdef CONFIG_SHRINKER_DEBUG
 667         kfree_const(shrinker->name);
 668         shrinker->name = NULL;
 669 #endif
 670         if (shrinker->flags & SHRINKER_MEMCG_AWARE) {
 671                 down_write(&shrinker_rwsem);
 672                 unregister_memcg_shrinker(shrinker);
 673                 up_write(&shrinker_rwsem);
 674                 return;
 675         }
 676
 677         kfree(shrinker->nr_deferred);
 678         shrinker->nr_deferred = NULL;
 679 }
 680
 681 void register_shrinker_prepared(struct shrinker *shrinker)
 682 {
 683         down_write(&shrinker_rwsem);
 684         list_add_tail(&shrinker->list, &shrinker_list);
 685         shrinker->flags |= SHRINKER_REGISTERED;
 686         shrinker_debugfs_add(shrinker);
 687         up_write(&shrinker_rwsem);
 688 }
 689
 690 static int __register_shrinker(struct shrinker *shrinker)
 691 {
 692         int err = __prealloc_shrinker(shrinker);
 693
 694         if (err)
 695                 return err;
 696         register_shrinker_prepared(shrinker);
 697         return 0;
 698 }
 699
 700 #ifdef CONFIG_SHRINKER_DEBUG
 701 int register_shrinker(struct shrinker *shrinker, const char *fmt, ...)
 702 {
 703         va_list ap;
 704         int err;
 705
 706         va_start(ap, fmt);
 707         shrinker->name = kvasprintf_const(GFP_KERNEL, fmt, ap);
 708         va_end(ap);
 709         if (!shrinker->name)
 710                 return -ENOMEM;
 711
 712         err = __register_shrinker(shrinker);
 713         if (err) {
 714                 kfree_const(shrinker->name);
 715                 shrinker->name = NULL;
 716         }
 717         return err;
 718 }
 719 #else
 720 int register_shrinker(struct shrinker *shrinker, const char *fmt, ...)
 721 {
 722         return __register_shrinker(shrinker);
 723 }
 724 #endif
 725 EXPORT_SYMBOL(register_shrinker);
 726
 727 /*
 728  * Remove one
 729  */
 730 void unregister_shrinker(struct shrinker *shrinker)
 731 {
 732         if (!(shrinker->flags & SHRINKER_REGISTERED))
 733                 return;
 734
 735         down_write(&shrinker_rwsem);
 736         list_del(&shrinker->list);
 737         shrinker->flags &= ~SHRINKER_REGISTERED;
 738         if (shrinker->flags & SHRINKER_MEMCG_AWARE)
 739                 unregister_memcg_shrinker(shrinker);
 740         shrinker_debugfs_remove(shrinker);
 741         up_write(&shrinker_rwsem);
 742
 743         kfree(shrinker->nr_deferred);
 744         shrinker->nr_deferred = NULL;
 745 }
 746 EXPORT_SYMBOL(unregister_shrinker);
 747
 748 /**
 749  * synchronize_shrinkers - Wait for all running shrinkers to complete.
 750  *
 751  * This is equivalent to calling unregister_shrink() and register_shrinker(),
 752  * but atomically and with less overhead. This is useful to guarantee that all
 753  * shrinker invocations have seen an update, before freeing memory, similar to
 754  * rcu.
 755  */
 756 void synchronize_shrinkers(void)
 757 {
 758         down_write(&shrinker_rwsem);
 759         up_write(&shrinker_rwsem);
 760 }
 761 EXPORT_SYMBOL(synchronize_shrinkers);
 762
 763 #define SHRINK_BATCH 128
 764
 765 static unsigned long do_shrink_slab(struct shrink_control *shrinkctl,
 766                                     struct shrinker *shrinker, int priority)
 767 {
 768         unsigned long freed = 0;
 769         unsigned long long delta;
 770         long total_scan;
 771         long freeable;
 772         long nr;
 773         long new_nr;
 774         long batch_size = shrinker->batch ? shrinker->batch
 775                                           : SHRINK_BATCH;
 776         long scanned = 0, next_deferred;
 777
 778         freeable = shrinker->count_objects(shrinker, shrinkctl);
 779         if (freeable == 0 || freeable == SHRINK_EMPTY)
 780                 return freeable;
 781
 782         /*
 783          * copy the current shrinker scan count into a local variable
 784          * and zero it so that other concurrent shrinker invocations
 785          * don't also do this scanning work.
 786          */
 787         nr = xchg_nr_deferred(shrinker, shrinkctl);
 788
 789         if (shrinker->seeks) {
 790                 delta = freeable >> priority;
 791                 delta *= 4;
 792                 do_div(delta, shrinker->seeks);
 793         } else {
 794                 /*
 795                  * These objects don't require any IO to create. Trim
 796                  * them aggressively under memory pressure to keep
 797                  * them from causing refetches in the IO caches.
 798                  */
 799                 delta = freeable / 2;
 800         }
 801
 802         total_scan = nr >> priority;
 803         total_scan += delta;
 804         total_scan = min(total_scan, (2 * freeable));
 805
 806         trace_mm_shrink_slab_start(shrinker, shrinkctl, nr,
 807                                    freeable, delta, total_scan, priority);
 808
 809         /*
 810          * Normally, we should not scan less than batch_size objects in one
 811          * pass to avoid too frequent shrinker calls, but if the slab has less
 812          * than batch_size objects in total and we are really tight on memory,
 813          * we will try to reclaim all available objects, otherwise we can end
 814          * up failing allocations although there are plenty of reclaimable
 815          * objects spread over several slabs with usage less than the
 816          * batch_size.
 817          *
 818          * We detect the "tight on memory" situations by looking at the total
 819          * number of objects we want to scan (total_scan). If it is greater
 820          * than the total number of objects on slab (freeable), we must be
 821          * scanning at high prio and therefore should try to reclaim as much as
 822          * possible.
 823          */
 824         while (total_scan >= batch_size ||
 825                total_scan >= freeable) {
 826                 unsigned long ret;
 827                 unsigned long nr_to_scan = min(batch_size, total_scan);
 828
 829                 shrinkctl->nr_to_scan = nr_to_scan;
 830                 shrinkctl->nr_scanned = nr_to_scan;
 831                 ret = shrinker->scan_objects(shrinker, shrinkctl);
 832                 if (ret == SHRINK_STOP)
 833                         break;
 834                 freed += ret;
 835
 836                 count_vm_events(SLABS_SCANNED, shrinkctl->nr_scanned);
 837                 total_scan -= shrinkctl->nr_scanned;
 838                 scanned += shrinkctl->nr_scanned;
 839
 840                 cond_resched();
 841         }
 842
 843         /*
 844          * The deferred work is increased by any new work (delta) that wasn't
 845          * done, decreased by old deferred work that was done now.
 846          *
 847          * And it is capped to two times of the freeable items.
 848          */
 849         next_deferred = max_t(long, (nr + delta - scanned), 0);
 850         next_deferred = min(next_deferred, (2 * freeable));
 851
 852         /*
 853          * move the unused scan count back into the shrinker in a
 854          * manner that handles concurrent updates.
 855          */
 856         new_nr = add_nr_deferred(next_deferred, shrinker, shrinkctl);
 857
 858         trace_mm_shrink_slab_end(shrinker, shrinkctl->nid, freed, nr, new_nr, total_scan);
 859         return freed;
 860 }
 861
 862 #ifdef CONFIG_MEMCG
 863 static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid,
 864                         struct mem_cgroup *memcg, int priority)
 865 {
 866         struct shrinker_info *info;
 867         unsigned long ret, freed = 0;
 868         int i;
 869
 870         if (!mem_cgroup_online(memcg))
 871                 return 0;
 872
 873         if (!down_read_trylock(&shrinker_rwsem))
 874                 return 0;
 875
 876         info = shrinker_info_protected(memcg, nid);
 877         if (unlikely(!info))
 878                 goto unlock;
 879
 880         for_each_set_bit(i, info->map, shrinker_nr_max) {
 881                 struct shrink_control sc = {
 882                         .gfp_mask = gfp_mask,
 883                         .nid = nid,
 884                         .memcg = memcg,
 885                 };
 886                 struct shrinker *shrinker;
 887
 888                 shrinker = idr_find(&shrinker_idr, i);
 889                 if (unlikely(!shrinker || !(shrinker->flags & SHRINKER_REGISTERED))) {
 890                         if (!shrinker)
 891                                 clear_bit(i, info->map);
 892                         continue;
 893                 }
 894
 895                 /* Call non-slab shrinkers even though kmem is disabled */
 896                 if (!memcg_kmem_enabled() &&
 897                     !(shrinker->flags & SHRINKER_NONSLAB))
 898                         continue;
 899
 900                 ret = do_shrink_slab(&sc, shrinker, priority);
 901                 if (ret == SHRINK_EMPTY) {
 902                         clear_bit(i, info->map);
 903                         /*
 904                          * After the shrinker reported that it had no objects to
 905                          * free, but before we cleared the corresponding bit in
 906                          * the memcg shrinker map, a new object might have been
 907                          * added. To make sure, we have the bit set in this
 908                          * case, we invoke the shrinker one more time and reset
 909                          * the bit if it reports that it is not empty anymore.
 910                          * The memory barrier here pairs with the barrier in
 911                          * set_shrinker_bit():
 912                          *
 913                          * list_lru_add()     shrink_slab_memcg()
 914                          *   list_add_tail()    clear_bit()
 915                          *   <MB>               <MB>
 916                          *   set_bit()          do_shrink_slab()
 917                          */
 918                         smp_mb__after_atomic();
 919                         ret = do_shrink_slab(&sc, shrinker, priority);
 920                         if (ret == SHRINK_EMPTY)
 921                                 ret = 0;
 922                         else
 923                                 set_shrinker_bit(memcg, nid, i);
 924                 }
 925                 freed += ret;
 926
 927                 if (rwsem_is_contended(&shrinker_rwsem)) {
 928                         freed = freed ? : 1;
 929                         break;
 930                 }
 931         }
 932 unlock:
 933         up_read(&shrinker_rwsem);
 934         return freed;
 935 }
 936 #else /* CONFIG_MEMCG */
 937 static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid,
 938                         struct mem_cgroup *memcg, int priority)
 939 {
 940         return 0;
 941 }
 942 #endif /* CONFIG_MEMCG */
 943
 944 /**
 945  * shrink_slab - shrink slab caches
 946  * @gfp_mask: allocation context
 947  * @nid: node whose slab caches to target
 948  * @memcg: memory cgroup whose slab caches to target
 949  * @priority: the reclaim priority
 950  *
 951  * Call the shrink functions to age shrinkable caches.
 952  *
 953  * @nid is passed along to shrinkers with SHRINKER_NUMA_AWARE set,
 954  * unaware shrinkers will receive a node id of 0 instead.
 955  *
 956  * @memcg specifies the memory cgroup to target. Unaware shrinkers
 957  * are called only if it is the root cgroup.
 958  *
 959  * @priority is sc->priority, we take the number of objects and >> by priority
 960  * in order to get the scan target.
 961  *
 962  * Returns the number of reclaimed slab objects.
 963  */
 964 static unsigned long shrink_slab(gfp_t gfp_mask, int nid,
 965                                  struct mem_cgroup *memcg,
 966                                  int priority)
 967 {
 968         unsigned long ret, freed = 0;
 969         struct shrinker *shrinker;
 970
 971         /*
 972          * The root memcg might be allocated even though memcg is disabled
 973          * via "cgroup_disable=memory" boot parameter.  This could make
 974          * mem_cgroup_is_root() return false, then just run memcg slab
 975          * shrink, but skip global shrink.  This may result in premature
 976          * oom.
 977          */
 978         if (!mem_cgroup_disabled() && !mem_cgroup_is_root(memcg))
 979                 return shrink_slab_memcg(gfp_mask, nid, memcg, priority);
 980
 981         if (!down_read_trylock(&shrinker_rwsem))
 982                 goto out;
 983
 984         list_for_each_entry(shrinker, &shrinker_list, list) {
 985                 struct shrink_control sc = {
 986                         .gfp_mask = gfp_mask,
 987                         .nid = nid,
 988                         .memcg = memcg,
 989                 };
 990
 991                 ret = do_shrink_slab(&sc, shrinker, priority);
 992                 if (ret == SHRINK_EMPTY)
 993                         ret = 0;
 994                 freed += ret;
 995                 /*
 996                  * Bail out if someone want to register a new shrinker to
 997                  * prevent the registration from being stalled for long periods
 998                  * by parallel ongoing shrinking.
 999                  */
1000                 if (rwsem_is_contended(&shrinker_rwsem)) {
1001                         freed = freed ? : 1;
1002                         break;
1003                 }
1004         }
1005
1006         up_read(&shrinker_rwsem);
1007 out:
1008         cond_resched();
1009         return freed;
1010 }
1011
1012 static void drop_slab_node(int nid)
1013 {
1014         unsigned long freed;
1015         int shift = 0;
1016
1017         do {
1018                 struct mem_cgroup *memcg = NULL;
1019
1020                 if (fatal_signal_pending(current))
1021                         return;
1022
1023                 freed = 0;
1024                 memcg = mem_cgroup_iter(NULL, NULL, NULL);
1025                 do {
1026                         freed += shrink_slab(GFP_KERNEL, nid, memcg, 0);
1027                 } while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)) != NULL);
1028         } while ((freed >> shift++) > 1);
1029 }
1030
1031 void drop_slab(void)
1032 {
1033         int nid;
1034
1035         for_each_online_node(nid)
1036                 drop_slab_node(nid);
1037 }
1038
/*
 * Tell whether @folio is held only by the references expected of a freeable
 * page cache folio: the caller that isolated it, the page cache, and
 * optionally buffer heads at page->private.
 *
 * Concretely, the reference count, less one when the private flag is set,
 * must equal 1 + folio_nr_pages().  Returns 1 if so, 0 otherwise.
 */
static inline int is_page_cache_freeable(struct folio *folio)
{
        if (folio_ref_count(folio) - folio_test_private(folio) !=
            1 + folio_nr_pages(folio))
                return 0;

        return 1;
}
1049
1050 /*
1051  * We detected a synchronous write error writing a folio out.  Probably
1052  * -ENOSPC.  We need to propagate that into the address_space for a subsequent
1053  * fsync(), msync() or close().
1054  *
1055  * The tricky part is that after writepage we cannot touch the mapping: nothing
1056  * prevents it from being freed up.  But we have a ref on the folio and once
1057  * that folio is locked, the mapping is pinned.
1058  *
1059  * We're allowed to run sleeping folio_lock() here because we know the caller has
1060  * __GFP_FS.
1061  */
1062 static void handle_write_error(struct address_space *mapping,
1063                                 struct folio *folio, int error)
1064 {
1065         folio_lock(folio);
1066         if (folio_mapping(folio) == mapping)
1067                 mapping_set_error(mapping, error);
1068         folio_unlock(folio);
1069 }
1070
1071 static bool skip_throttle_noprogress(pg_data_t *pgdat)
1072 {
1073         int reclaimable = 0, write_pending = 0;
1074         int i;
1075
1076         /*
1077          * If kswapd is disabled, reschedule if necessary but do not
1078          * throttle as the system is likely near OOM.
1079          */
1080         if (pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES)
1081                 return true;
1082
1083         /*
1084          * If there are a lot of dirty/writeback pages then do not
1085          * throttle as throttling will occur when the pages cycle
1086          * towards the end of the LRU if still under writeback.
1087          */
1088         for (i = 0; i < MAX_NR_ZONES; i++) {
1089                 struct zone *zone = pgdat->node_zones + i;
1090
1091                 if (!managed_zone(zone))
1092                         continue;
1093
1094                 reclaimable += zone_reclaimable_pages(zone);
1095                 write_pending += zone_page_state_snapshot(zone,
1096                                                   NR_ZONE_WRITE_PENDING);
1097         }
1098         if (2 * write_pending <= reclaimable)
1099                 return true;
1100
1101         return false;
1102 }
1103
1104 void reclaim_throttle(pg_data_t *pgdat, enum vmscan_throttle_state reason)
1105 {
1106         wait_queue_head_t *wqh = &pgdat->reclaim_wait[reason];
1107         long timeout, ret;
1108         DEFINE_WAIT(wait);
1109
1110         /*
1111          * Do not throttle IO workers, kthreads other than kswapd or
1112          * workqueues. They may be required for reclaim to make
1113          * forward progress (e.g. journalling workqueues or kthreads).
1114          */
1115         if (!current_is_kswapd() &&
1116             current->flags & (PF_IO_WORKER|PF_KTHREAD)) {
1117                 cond_resched();
1118                 return;
1119         }
1120
1121         /*
1122          * These figures are pulled out of thin air.
1123          * VMSCAN_THROTTLE_ISOLATED is a transient condition based on too many
1124          * parallel reclaimers which is a short-lived event so the timeout is
1125          * short. Failing to make progress or waiting on writeback are
1126          * potentially long-lived events so use a longer timeout. This is shaky
1127          * logic as a failure to make progress could be due to anything from
1128          * writeback to a slow device to excessive references pages at the tail
1129          * of the inactive LRU.
1130          */
1131         switch(reason) {
1132         case VMSCAN_THROTTLE_WRITEBACK:
1133                 timeout = HZ/10;
1134
1135                 if (atomic_inc_return(&pgdat->nr_writeback_throttled) == 1) {
1136                         WRITE_ONCE(pgdat->nr_reclaim_start,
1137                                 node_page_state(pgdat, NR_THROTTLED_WRITTEN));
1138                 }
1139
1140                 break;
1141         case VMSCAN_THROTTLE_CONGESTED:
1142                 fallthrough;
1143         case VMSCAN_THROTTLE_NOPROGRESS:
1144                 if (skip_throttle_noprogress(pgdat)) {
1145                         cond_resched();
1146                         return;
1147                 }
1148
1149                 timeout = 1;
1150
1151                 break;
1152         case VMSCAN_THROTTLE_ISOLATED:
1153                 timeout = HZ/50;
1154                 break;
1155         default:
1156                 WARN_ON_ONCE(1);
1157                 timeout = HZ;
1158                 break;
1159         }
1160
1161         prepare_to_wait(wqh, &wait, TASK_UNINTERRUPTIBLE);
1162         ret = schedule_timeout(timeout);
1163         finish_wait(wqh, &wait);
1164
1165         if (reason == VMSCAN_THROTTLE_WRITEBACK)
1166                 atomic_dec(&pgdat->nr_writeback_throttled);
1167
1168         trace_mm_vmscan_throttled(pgdat->node_id, jiffies_to_usecs(timeout),
1169                                 jiffies_to_usecs(timeout - ret),
1170                                 reason);
1171 }
1172
1173 /*
1174  * Account for pages written if tasks are throttled waiting on dirty
1175  * pages to clean. If enough pages have been cleaned since throttling
1176  * started then wakeup the throttled tasks.
1177  */
1178 void __acct_reclaim_writeback(pg_data_t *pgdat, struct folio *folio,
1179                                                         int nr_throttled)
1180 {
1181         unsigned long nr_written;
1182
1183         node_stat_add_folio(folio, NR_THROTTLED_WRITTEN);
1184
1185         /*
1186          * This is an inaccurate read as the per-cpu deltas may not
1187          * be synchronised. However, given that the system is
1188          * writeback throttled, it is not worth taking the penalty
1189          * of getting an accurate count. At worst, the throttle
1190          * timeout guarantees forward progress.
1191          */
1192         nr_written = node_page_state(pgdat, NR_THROTTLED_WRITTEN) -
1193                 READ_ONCE(pgdat->nr_reclaim_start);
1194
1195         if (nr_written > SWAP_CLUSTER_MAX * nr_throttled)
1196                 wake_up(&pgdat->reclaim_wait[VMSCAN_THROTTLE_WRITEBACK]);
1197 }
1198
/* Results that pageout() can report back to its caller. */
typedef enum {
        PAGE_KEEP = 0,          /* write-out failed; page is locked */
        PAGE_ACTIVATE = 1,      /* move page to the active list; page is locked */
        PAGE_SUCCESS = 2,       /* page was sent to disk; page is unlocked */
        PAGE_CLEAN = 3,         /* page is clean and locked */
} pageout_t;
1210
1211 /*
1212  * pageout is called by shrink_page_list() for each dirty page.
1213  * Calls ->writepage().
1214  */
1215 static pageout_t pageout(struct folio *folio, struct address_space *mapping,
1216                          struct swap_iocb **plug)
1217 {
1218         /*
1219          * If the folio is dirty, only perform writeback if that write
1220          * will be non-blocking.  To prevent this allocation from being
1221          * stalled by pagecache activity.  But note that there may be
1222          * stalls if we need to run get_block().  We could test
1223          * PagePrivate for that.
1224          *
1225          * If this process is currently in __generic_file_write_iter() against
1226          * this folio's queue, we can perform writeback even if that
1227          * will block.
1228          *
1229          * If the folio is swapcache, write it back even if that would
1230          * block, for some throttling. This happens by accident, because
1231          * swap_backing_dev_info is bust: it doesn't reflect the
1232          * congestion state of the swapdevs.  Easy to fix, if needed.
1233          */
1234         if (!is_page_cache_freeable(folio))
1235                 return PAGE_KEEP;
1236         if (!mapping) {
1237                 /*
1238                  * Some data journaling orphaned folios can have
1239                  * folio->mapping == NULL while being dirty with clean buffers.
1240                  */
1241                 if (folio_test_private(folio)) {
1242                         if (try_to_free_buffers(folio)) {
1243                                 folio_clear_dirty(folio);
1244                                 pr_info("%s: orphaned folio\n", __func__);
1245                                 return PAGE_CLEAN;
1246                         }
1247                 }
1248                 return PAGE_KEEP;
1249         }
1250         if (mapping->a_ops->writepage == NULL)
1251                 return PAGE_ACTIVATE;
1252
1253         if (folio_clear_dirty_for_io(folio)) {
1254                 int res;
1255                 struct writeback_control wbc = {
1256                         .sync_mode = WB_SYNC_NONE,
1257                         .nr_to_write = SWAP_CLUSTER_MAX,
1258                         .range_start = 0,
1259                         .range_end = LLONG_MAX,
1260                         .for_reclaim = 1,
1261                         .swap_plug = plug,
1262                 };
1263
1264                 folio_set_reclaim(folio);
1265                 res = mapping->a_ops->writepage(&folio->page, &wbc);
1266                 if (res < 0)
1267                         handle_write_error(mapping, folio, res);
1268                 if (res == AOP_WRITEPAGE_ACTIVATE) {
1269                         folio_clear_reclaim(folio);
1270                         return PAGE_ACTIVATE;
1271                 }
1272
1273                 if (!folio_test_writeback(folio)) {
1274                         /* synchronous write or broken a_ops? */
1275                         folio_clear_reclaim(folio);
1276                 }
1277                 trace_mm_vmscan_write_folio(folio);
1278                 node_stat_add_folio(folio, NR_VMSCAN_WRITE);
1279                 return PAGE_SUCCESS;
1280         }
1281
1282         return PAGE_CLEAN;
1283 }
1284
1285 /*
1286  * Same as remove_mapping, but if the page is removed from the mapping, it
1287  * gets returned with a refcount of 0.
1288  */
1289 static int __remove_mapping(struct address_space *mapping, struct folio *folio,
1290                             bool reclaimed, struct mem_cgroup *target_memcg)
1291 {
1292         int refcount;
1293         void *shadow = NULL;
1294
1295         BUG_ON(!folio_test_locked(folio));
1296         BUG_ON(mapping != folio_mapping(folio));
1297
1298         if (!folio_test_swapcache(folio))
1299                 spin_lock(&mapping->host->i_lock);
1300         xa_lock_irq(&mapping->i_pages);
1301         /*
1302          * The non racy check for a busy page.
1303          *
1304          * Must be careful with the order of the tests. When someone has
1305          * a ref to the page, it may be possible that they dirty it then
1306          * drop the reference. So if PageDirty is tested before page_count
1307          * here, then the following race may occur:
1308          *
1309          * get_user_pages(&page);
1310          * [user mapping goes away]
1311          * write_to(page);
1312          *                              !PageDirty(page)    [good]
1313          * SetPageDirty(page);
1314          * put_page(page);
1315          *                              !page_count(page)   [good, discard it]
1316          *
1317          * [oops, our write_to data is lost]
1318          *
1319          * Reversing the order of the tests ensures such a situation cannot
1320          * escape unnoticed. The smp_rmb is needed to ensure the page->flags
1321          * load is not satisfied before that of page->_refcount.
1322          *
1323          * Note that if SetPageDirty is always performed via set_page_dirty,
1324          * and thus under the i_pages lock, then this ordering is not required.
1325          */
1326         refcount = 1 + folio_nr_pages(folio);
1327         if (!folio_ref_freeze(folio, refcount))
1328                 goto cannot_free;
1329         /* note: atomic_cmpxchg in page_ref_freeze provides the smp_rmb */
1330         if (unlikely(folio_test_dirty(folio))) {
1331                 folio_ref_unfreeze(folio, refcount);
1332                 goto cannot_free;
1333         }
1334
1335         if (folio_test_swapcache(folio)) {
1336                 swp_entry_t swap = folio_swap_entry(folio);
1337
1338                 /* get a shadow entry before mem_cgroup_swapout() clears folio_memcg() */
1339                 if (reclaimed && !mapping_exiting(mapping))
1340                         shadow = workingset_eviction(folio, target_memcg);
1341                 mem_cgroup_swapout(folio, swap);
1342                 __delete_from_swap_cache(folio, swap, shadow);
1343                 xa_unlock_irq(&mapping->i_pages);
1344                 put_swap_page(&folio->page, swap);
1345         } else {
1346                 void (*free_folio)(struct folio *);
1347
1348                 free_folio = mapping->a_ops->free_folio;
1349                 /*
1350                  * Remember a shadow entry for reclaimed file cache in
1351                  * order to detect refaults, thus thrashing, later on.
1352                  *
1353                  * But don't store shadows in an address space that is
1354                  * already exiting.  This is not just an optimization,
1355                  * inode reclaim needs to empty out the radix tree or
1356                  * the nodes are lost.  Don't plant shadows behind its
1357                  * back.
1358                  *
1359                  * We also don't store shadows for DAX mappings because the
1360                  * only page cache pages found in these are zero pages
1361                  * covering holes, and because we don't want to mix DAX
1362                  * exceptional entries and shadow exceptional entries in the
1363                  * same address_space.
1364                  */
1365                 if (reclaimed && folio_is_file_lru(folio) &&
1366                     !mapping_exiting(mapping) && !dax_mapping(mapping))
1367                         shadow = workingset_eviction(folio, target_memcg);
1368                 __filemap_remove_folio(folio, shadow);
1369                 xa_unlock_irq(&mapping->i_pages);
1370                 if (mapping_shrinkable(mapping))
1371                         inode_add_lru(mapping->host);
1372                 spin_unlock(&mapping->host->i_lock);
1373
1374                 if (free_folio)
1375                         free_folio(folio);
1376         }
1377
1378         return 1;
1379
1380 cannot_free:
1381         xa_unlock_irq(&mapping->i_pages);
1382         if (!folio_test_swapcache(folio))
1383                 spin_unlock(&mapping->host->i_lock);
1384         return 0;
1385 }
1386
1387 /**
1388  * remove_mapping() - Attempt to remove a folio from its mapping.
1389  * @mapping: The address space.
1390  * @folio: The folio to remove.
1391  *
1392  * If the folio is dirty, under writeback or if someone else has a ref
1393  * on it, removal will fail.
1394  * Return: The number of pages removed from the mapping.  0 if the folio
1395  * could not be removed.
1396  * Context: The caller should have a single refcount on the folio and
1397  * hold its lock.
1398  */
1399 long remove_mapping(struct address_space *mapping, struct folio *folio)
1400 {
1401         if (__remove_mapping(mapping, folio, false, NULL)) {
1402                 /*
1403                  * Unfreezing the refcount with 1 effectively
1404                  * drops the pagecache ref for us without requiring another
1405                  * atomic operation.
1406                  */
1407                 folio_ref_unfreeze(folio, 1);
1408                 return folio_nr_pages(folio);
1409         }
1410         return 0;
1411 }
1412
/**
 * folio_putback_lru - Put previously isolated folio onto appropriate LRU list.
 * @folio: Folio to be returned to an LRU list.
 *
 * Adds the previously isolated @folio back to the appropriate LRU list and
 * drops the reference taken when it was isolated.  The folio may still be
 * unevictable for other reasons.
 *
 * Context: lru_lock must not be held, interrupts must be enabled.
 */
void folio_putback_lru(struct folio *folio)
{
        folio_add_lru(folio);

        /* Release the reference held since isolation. */
        folio_put(folio);
}
1427
/* Verdicts returned by folio_check_references(). */
enum page_references {
        PAGEREF_RECLAIM = 0,
        PAGEREF_RECLAIM_CLEAN = 1,
        PAGEREF_KEEP = 2,
        PAGEREF_ACTIVATE = 3,
};
1434
1435 static enum page_references folio_check_references(struct folio *folio,
1436                                                   struct scan_control *sc)
1437 {
1438         int referenced_ptes, referenced_folio;
1439         unsigned long vm_flags;
1440
1441         referenced_ptes = folio_referenced(folio, 1, sc->target_mem_cgroup,
1442                                            &vm_flags);
1443         referenced_folio = folio_test_clear_referenced(folio);
1444
1445         /*
1446          * The supposedly reclaimable folio was found to be in a VM_LOCKED vma.
1447          * Let the folio, now marked Mlocked, be moved to the unevictable list.
1448          */
1449         if (vm_flags & VM_LOCKED)
1450                 return PAGEREF_ACTIVATE;
1451
1452         /* rmap lock contention: rotate */
1453         if (referenced_ptes == -1)
1454                 return PAGEREF_KEEP;
1455
1456         if (referenced_ptes) {
1457                 /*
1458                  * All mapped folios start out with page table
1459                  * references from the instantiating fault, so we need
1460                  * to look twice if a mapped file/anon folio is used more
1461                  * than once.
1462                  *
1463                  * Mark it and spare it for another trip around the
1464                  * inactive list.  Another page table reference will
1465                  * lead to its activation.
1466                  *
1467                  * Note: the mark is set for activated folios as well
1468                  * so that recently deactivated but used folios are
1469                  * quickly recovered.
1470                  */
1471                 folio_set_referenced(folio);
1472
1473                 if (referenced_folio || referenced_ptes > 1)
1474                         return PAGEREF_ACTIVATE;
1475
1476                 /*
1477                  * Activate file-backed executable folios after first usage.
1478                  */
1479                 if ((vm_flags & VM_EXEC) && folio_is_file_lru(folio))
1480                         return PAGEREF_ACTIVATE;
1481
1482                 return PAGEREF_KEEP;
1483         }
1484
1485         /* Reclaim if clean, defer dirty folios to writeback */
1486         if (referenced_folio && folio_is_file_lru(folio))
1487                 return PAGEREF_RECLAIM_CLEAN;
1488
1489         return PAGEREF_RECLAIM;
1490 }
1491
1492 /* Check if a page is dirty or under writeback */
1493 static void folio_check_dirty_writeback(struct folio *folio,
1494                                        bool *dirty, bool *writeback)
1495 {
1496         struct address_space *mapping;
1497
1498         /*
1499          * Anonymous pages are not handled by flushers and must be written
1500          * from reclaim context. Do not stall reclaim based on them.
1501          * MADV_FREE anonymous pages are put into inactive file list too.
1502          * They could be mistakenly treated as file lru. So further anon
1503          * test is needed.
1504          */
1505         if (!folio_is_file_lru(folio) ||
1506             (folio_test_anon(folio) && !folio_test_swapbacked(folio))) {
1507                 *dirty = false;
1508                 *writeback = false;
1509                 return;
1510         }
1511
1512         /* By default assume that the folio flags are accurate */
1513         *dirty = folio_test_dirty(folio);
1514         *writeback = folio_test_writeback(folio);
1515
1516         /* Verify dirty/writeback state if the filesystem supports it */
1517         if (!folio_test_private(folio))
1518                 return;
1519
1520         mapping = folio_mapping(folio);
1521         if (mapping && mapping->a_ops->is_dirty_writeback)
1522                 mapping->a_ops->is_dirty_writeback(folio, dirty, writeback);
1523 }
1524
1525 static struct page *alloc_demote_page(struct page *page, unsigned long node)
1526 {
1527         struct migration_target_control mtc = {
1528                 /*
1529                  * Allocate from 'node', or fail quickly and quietly.
1530                  * When this happens, 'page' will likely just be discarded
1531                  * instead of migrated.
1532                  */
1533                 .gfp_mask = (GFP_HIGHUSER_MOVABLE & ~__GFP_RECLAIM) |
1534                             __GFP_THISNODE  | __GFP_NOWARN |
1535                             __GFP_NOMEMALLOC | GFP_NOWAIT,
1536                 .nid = node
1537         };
1538
1539         return alloc_migration_target(page, (unsigned long)&mtc);
1540 }
1541
1542 /*
1543  * Take pages on @demote_list and attempt to demote them to
1544  * another node.  Pages which are not demoted are left on
1545  * @demote_pages.
1546  */
1547 static unsigned int demote_page_list(struct list_head *demote_pages,
1548                                      struct pglist_data *pgdat)
1549 {
1550         int target_nid = next_demotion_node(pgdat->node_id);
1551         unsigned int nr_succeeded;
1552
1553         if (list_empty(demote_pages))
1554                 return 0;
1555
1556         if (target_nid == NUMA_NO_NODE)
1557                 return 0;
1558
1559         /* Demotion ignores all cpuset and mempolicy settings */
1560         migrate_pages(demote_pages, alloc_demote_page, NULL,
1561                             target_nid, MIGRATE_ASYNC, MR_DEMOTION,
1562                             &nr_succeeded);
1563
1564         if (current_is_kswapd())
1565                 __count_vm_events(PGDEMOTE_KSWAPD, nr_succeeded);
1566         else
1567                 __count_vm_events(PGDEMOTE_DIRECT, nr_succeeded);
1568
1569         return nr_succeeded;
1570 }
1571
1572 static bool may_enter_fs(struct folio *folio, gfp_t gfp_mask)
1573 {
1574         if (gfp_mask & __GFP_FS)
1575                 return true;
1576         if (!folio_test_swapcache(folio) || !(gfp_mask & __GFP_IO))
1577                 return false;
1578         /*
1579          * We can "enter_fs" for swap-cache with only __GFP_IO
1580          * providing this isn't SWP_FS_OPS.
1581          * ->flags can be updated non-atomicially (scan_swap_map_slots),
1582          * but that will never affect SWP_FS_OPS, so the data_race
1583          * is safe.
1584          */
1585         return !data_race(folio_swap_flags(folio) & SWP_FS_OPS);
1586 }
1587
1588 /*
1589  * shrink_page_list() returns the number of reclaimed pages
1590  */
1591 static unsigned int shrink_page_list(struct list_head *page_list,
1592                                      struct pglist_data *pgdat,
1593                                      struct scan_control *sc,
1594                                      struct reclaim_stat *stat,
1595                                      bool ignore_references)
1596 {
1597         LIST_HEAD(ret_pages);
1598         LIST_HEAD(free_pages);
1599         LIST_HEAD(demote_pages);
1600         unsigned int nr_reclaimed = 0;
1601         unsigned int pgactivate = 0;
1602         bool do_demote_pass;
1603         struct swap_iocb *plug = NULL;
1604
1605         memset(stat, 0, sizeof(*stat));
1606         cond_resched();
1607         do_demote_pass = can_demote(pgdat->node_id, sc);
1608
1609 retry:
1610         while (!list_empty(page_list)) {
1611                 struct address_space *mapping;
1612                 struct folio *folio;
1613                 enum page_references references = PAGEREF_RECLAIM;
1614                 bool dirty, writeback;
1615                 unsigned int nr_pages;
1616
1617                 cond_resched();
1618
1619                 folio = lru_to_folio(page_list);
1620                 list_del(&folio->lru);
1621
1622                 if (!folio_trylock(folio))
1623                         goto keep;
1624
1625                 VM_BUG_ON_FOLIO(folio_test_active(folio), folio);
1626
1627                 nr_pages = folio_nr_pages(folio);
1628
1629                 /* Account the number of base pages */
1630                 sc->nr_scanned += nr_pages;
1631
1632                 if (unlikely(!folio_evictable(folio)))
1633                         goto activate_locked;
1634
1635                 if (!sc->may_unmap && folio_mapped(folio))
1636                         goto keep_locked;
1637
1638                 /* folio_update_gen() tried to promote this page? */
1639                 if (lru_gen_enabled() && !ignore_references &&
1640                     folio_mapped(folio) && folio_test_referenced(folio))
1641                         goto keep_locked;
1642
1643                 /*
1644                  * The number of dirty pages determines if a node is marked
1645                  * reclaim_congested. kswapd will stall and start writing
1646                  * folios if the tail of the LRU is all dirty unqueued folios.
1647                  */
1648                 folio_check_dirty_writeback(folio, &dirty, &writeback);
1649                 if (dirty || writeback)
1650                         stat->nr_dirty += nr_pages;
1651
1652                 if (dirty && !writeback)
1653                         stat->nr_unqueued_dirty += nr_pages;
1654
1655                 /*
1656                  * Treat this folio as congested if folios are cycling
1657                  * through the LRU so quickly that the folios marked
1658                  * for immediate reclaim are making it to the end of
1659                  * the LRU a second time.
1660                  */
1661                 if (writeback && folio_test_reclaim(folio))
1662                         stat->nr_congested += nr_pages;
1663
1664                 /*
1665                  * If a folio at the tail of the LRU is under writeback, there
1666                  * are three cases to consider.
1667                  *
1668                  * 1) If reclaim is encountering an excessive number
1669                  *    of folios under writeback and this folio has both
1670                  *    the writeback and reclaim flags set, then it
1671                  *    indicates that folios are being queued for I/O but
1672                  *    are being recycled through the LRU before the I/O
1673                  *    can complete. Waiting on the folio itself risks an
1674                  *    indefinite stall if it is impossible to writeback
1675                  *    the folio due to I/O error or disconnected storage
1676                  *    so instead note that the LRU is being scanned too
1677                  *    quickly and the caller can stall after the folio
1678                  *    list has been processed.
1679                  *
1680                  * 2) Global or new memcg reclaim encounters a folio that is
1681                  *    not marked for immediate reclaim, or the caller does not
1682                  *    have __GFP_FS (or __GFP_IO if it's simply going to swap,
1683                  *    not to fs). In this case mark the folio for immediate
1684                  *    reclaim and continue scanning.
1685                  *
1686                  *    Require may_enter_fs() because we would wait on fs, which
1687                  *    may not have submitted I/O yet. And the loop driver might
1688                  *    enter reclaim, and deadlock if it waits on a folio for
1689                  *    which it is needed to do the write (loop masks off
1690                  *    __GFP_IO|__GFP_FS for this reason); but more thought
1691                  *    would probably show more reasons.
1692                  *
1693                  * 3) Legacy memcg encounters a folio that already has the
1694                  *    reclaim flag set. memcg does not have any dirty folio
1695                  *    throttling so we could easily OOM just because too many
1696                  *    folios are in writeback and there is nothing else to
1697                  *    reclaim. Wait for the writeback to complete.
1698                  *
1699                  * In cases 1) and 2) we activate the folios to get them out of
1700                  * the way while we continue scanning for clean folios on the
1701                  * inactive list and refilling from the active list. The
1702                  * observation here is that waiting for disk writes is more
1703                  * expensive than potentially causing reloads down the line.
1704                  * Since they're marked for immediate reclaim, they won't put
1705                  * memory pressure on the cache working set any longer than it
1706                  * takes to write them to disk.
1707                  */
1708                 if (folio_test_writeback(folio)) {
1709                         /* Case 1 above */
1710                         if (current_is_kswapd() &&
1711                             folio_test_reclaim(folio) &&
1712                             test_bit(PGDAT_WRITEBACK, &pgdat->flags)) {
1713                                 stat->nr_immediate += nr_pages;
1714                                 goto activate_locked;
1715
1716                         /* Case 2 above */
1717                         } else if (writeback_throttling_sane(sc) ||
1718                             !folio_test_reclaim(folio) ||
1719                             !may_enter_fs(folio, sc->gfp_mask)) {
1720                                 /*
1721                                  * This is slightly racy -
1722                                  * folio_end_writeback() might have
1723                                  * just cleared the reclaim flag, then
1724                                  * setting the reclaim flag here ends up
1725                                  * interpreted as the readahead flag - but
1726                                  * that does not matter enough to care.
1727                                  * What we do want is for this folio to
1728                                  * have the reclaim flag set next time
1729                                  * memcg reclaim reaches the tests above,
1730                                  * so it will then wait for writeback to
1731                                  * avoid OOM; and it's also appropriate
1732                                  * in global reclaim.
1733                                  */
1734                                 folio_set_reclaim(folio);
1735                                 stat->nr_writeback += nr_pages;
1736                                 goto activate_locked;
1737
1738                         /* Case 3 above */
1739                         } else {
1740                                 folio_unlock(folio);
1741                                 folio_wait_writeback(folio);
1742                                 /* then go back and try same folio again */
1743                                 list_add_tail(&folio->lru, page_list);
1744                                 continue;
1745                         }
1746                 }
1747
1748                 if (!ignore_references)
1749                         references = folio_check_references(folio, sc);
1750
1751                 switch (references) {
1752                 case PAGEREF_ACTIVATE:
1753                         goto activate_locked;
1754                 case PAGEREF_KEEP:
1755                         stat->nr_ref_keep += nr_pages;
1756                         goto keep_locked;
1757                 case PAGEREF_RECLAIM:
1758                 case PAGEREF_RECLAIM_CLEAN:
1759                         ; /* try to reclaim the folio below */
1760                 }
1761
1762                 /*
1763                  * Before reclaiming the folio, try to relocate
1764                  * its contents to another node.
1765                  */
1766                 if (do_demote_pass &&
1767                     (thp_migration_supported() || !folio_test_large(folio))) {
1768                         list_add(&folio->lru, &demote_pages);
1769                         folio_unlock(folio);
1770                         continue;
1771                 }
1772
1773                 /*
1774                  * Anonymous process memory has backing store?
1775                  * Try to allocate it some swap space here.
1776                  * Lazyfree folio could be freed directly
1777                  */
1778                 if (folio_test_anon(folio) && folio_test_swapbacked(folio)) {
1779                         if (!folio_test_swapcache(folio)) {
1780                                 if (!(sc->gfp_mask & __GFP_IO))
1781                                         goto keep_locked;
1782                                 if (folio_maybe_dma_pinned(folio))
1783                                         goto keep_locked;
1784                                 if (folio_test_large(folio)) {
1785                                         /* cannot split folio, skip it */
1786                                         if (!can_split_folio(folio, NULL))
1787                                                 goto activate_locked;
1788                                         /*
1789                                          * Split folios without a PMD map right
1790                                          * away. Chances are some or all of the
1791                                          * tail pages can be freed without IO.
1792                                          */
1793                                         if (!folio_entire_mapcount(folio) &&
1794                                             split_folio_to_list(folio,
1795                                                                 page_list))
1796                                                 goto activate_locked;
1797                                 }
1798                                 if (!add_to_swap(folio)) {
1799                                         if (!folio_test_large(folio))
1800                                                 goto activate_locked_split;
1801                                         /* Fallback to swap normal pages */
1802                                         if (split_folio_to_list(folio,
1803                                                                 page_list))
1804                                                 goto activate_locked;
1805 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
1806                                         count_vm_event(THP_SWPOUT_FALLBACK);
1807 #endif
1808                                         if (!add_to_swap(folio))
1809                                                 goto activate_locked_split;
1810                                 }
1811                         }
1812                 } else if (folio_test_swapbacked(folio) &&
1813                            folio_test_large(folio)) {
1814                         /* Split shmem folio */
1815                         if (split_folio_to_list(folio, page_list))
1816                                 goto keep_locked;
1817                 }
1818
1819                 /*
1820                  * If the folio was split above, the tail pages will make
1821                  * their own pass through this function and be accounted
1822                  * then.
1823                  */
1824                 if ((nr_pages > 1) && !folio_test_large(folio)) {
1825                         sc->nr_scanned -= (nr_pages - 1);
1826                         nr_pages = 1;
1827                 }
1828
1829                 /*
1830                  * The folio is mapped into the page tables of one or more
1831                  * processes. Try to unmap it here.
1832                  */
1833                 if (folio_mapped(folio)) {
1834                         enum ttu_flags flags = TTU_BATCH_FLUSH;
1835                         bool was_swapbacked = folio_test_swapbacked(folio);
1836
1837                         if (folio_test_pmd_mappable(folio))
1838                                 flags |= TTU_SPLIT_HUGE_PMD;
1839
1840                         try_to_unmap(folio, flags);
1841                         if (folio_mapped(folio)) {
1842                                 stat->nr_unmap_fail += nr_pages;
1843                                 if (!was_swapbacked &&
1844                                     folio_test_swapbacked(folio))
1845                                         stat->nr_lazyfree_fail += nr_pages;
1846                                 goto activate_locked;
1847                         }
1848                 }
1849
1850                 mapping = folio_mapping(folio);
1851                 if (folio_test_dirty(folio)) {
1852                         /*
1853                          * Only kswapd can writeback filesystem folios
1854                          * to avoid risk of stack overflow. But avoid
1855                          * injecting inefficient single-folio I/O into
1856                          * flusher writeback as much as possible: only
1857                          * write folios when we've encountered many
1858                          * dirty folios, and when we've already scanned
1859                          * the rest of the LRU for clean folios and see
1860                          * the same dirty folios again (with the reclaim
1861                          * flag set).
1862                          */
1863                         if (folio_is_file_lru(folio) &&
1864                             (!current_is_kswapd() ||
1865                              !folio_test_reclaim(folio) ||
1866                              !test_bit(PGDAT_DIRTY, &pgdat->flags))) {
1867                                 /*
1868                                  * Immediately reclaim when written back.
1869                                  * Similar in principle to deactivate_page()
1870                                  * except we already have the folio isolated
1871                                  * and know it's dirty
1872                                  */
1873                                 node_stat_mod_folio(folio, NR_VMSCAN_IMMEDIATE,
1874                                                 nr_pages);
1875                                 folio_set_reclaim(folio);
1876
1877                                 goto activate_locked;
1878                         }
1879
1880                         if (references == PAGEREF_RECLAIM_CLEAN)
1881                                 goto keep_locked;
1882                         if (!may_enter_fs(folio, sc->gfp_mask))
1883                                 goto keep_locked;
1884                         if (!sc->may_writepage)
1885                                 goto keep_locked;
1886
1887                         /*
1888                          * Folio is dirty. Flush the TLB if a writable entry
1889                          * potentially exists to avoid CPU writes after I/O
1890                          * starts and then write it out here.
1891                          */
1892                         try_to_unmap_flush_dirty();
1893                         switch (pageout(folio, mapping, &plug)) {
1894                         case PAGE_KEEP:
1895                                 goto keep_locked;
1896                         case PAGE_ACTIVATE:
1897                                 goto activate_locked;
1898                         case PAGE_SUCCESS:
1899                                 stat->nr_pageout += nr_pages;
1900
1901                                 if (folio_test_writeback(folio))
1902                                         goto keep;
1903                                 if (folio_test_dirty(folio))
1904                                         goto keep;
1905
1906                                 /*
1907                                  * A synchronous write - probably a ramdisk.  Go
1908                                  * ahead and try to reclaim the folio.
1909                                  */
1910                                 if (!folio_trylock(folio))
1911                                         goto keep;
1912                                 if (folio_test_dirty(folio) ||
1913                                     folio_test_writeback(folio))
1914                                         goto keep_locked;
1915                                 mapping = folio_mapping(folio);
1916                                 fallthrough;
1917                         case PAGE_CLEAN:
1918                                 ; /* try to free the folio below */
1919                         }
1920                 }
1921
1922                 /*
1923                  * If the folio has buffers, try to free the buffer
1924                  * mappings associated with this folio. If we succeed
1925                  * we try to free the folio as well.
1926                  *
1927                  * We do this even if the folio is dirty.
1928                  * filemap_release_folio() does not perform I/O, but it
1929                  * is possible for a folio to have the dirty flag set,
1930                  * but it is actually clean (all its buffers are clean).
1931                  * This happens if the buffers were written out directly,
1932                  * with submit_bh(). ext3 will do this, as well as
1933                  * the blockdev mapping.  filemap_release_folio() will
1934                  * discover that cleanness and will drop the buffers
1935                  * and mark the folio clean - it can be freed.
1936                  *
1937                  * Rarely, folios can have buffers and no ->mapping.
1938                  * These are the folios which were not successfully
1939                  * invalidated in truncate_cleanup_folio().  We try to
1940                  * drop those buffers here and if that worked, and the
1941                  * folio is no longer mapped into process address space
1942                  * (refcount == 1) it can be freed.  Otherwise, leave
1943                  * the folio on the LRU so it is swappable.
1944                  */
1945                 if (folio_has_private(folio)) {
1946                         if (!filemap_release_folio(folio, sc->gfp_mask))
1947                                 goto activate_locked;
1948                         if (!mapping && folio_ref_count(folio) == 1) {
1949                                 folio_unlock(folio);
1950                                 if (folio_put_testzero(folio))
1951                                         goto free_it;
1952                                 else {
1953                                         /*
1954                                          * rare race with speculative reference.
1955                                          * the speculative reference will free
1956                                          * this folio shortly, so we may
1957                                          * increment nr_reclaimed here (and
1958                                          * leave it off the LRU).
1959                                          */
1960                                         nr_reclaimed += nr_pages;
1961                                         continue;
1962                                 }
1963                         }
1964                 }
1965
1966                 if (folio_test_anon(folio) && !folio_test_swapbacked(folio)) {
1967                         /* follow __remove_mapping for reference */
1968                         if (!folio_ref_freeze(folio, 1))
1969                                 goto keep_locked;
1970                         /*
1971                          * The folio has only one reference left, which is
1972                          * from the isolation. After the caller puts the
1973                          * folio back on the lru and drops the reference, the
1974                          * folio will be freed anyway. It doesn't matter
1975                          * which lru it goes on. So we don't bother checking
1976                          * the dirty flag here.
1977                          */
1978                         count_vm_events(PGLAZYFREED, nr_pages);
1979                         count_memcg_folio_events(folio, PGLAZYFREED, nr_pages);
1980                 } else if (!mapping || !__remove_mapping(mapping, folio, true,
1981                                                          sc->target_mem_cgroup))
1982                         goto keep_locked;
1983
1984                 folio_unlock(folio);
1985 free_it:
1986                 /*
1987                  * Folio may get swapped out as a whole, need to account
1988                  * all pages in it.
1989                  */
1990                 nr_reclaimed += nr_pages;
1991
1992                 /*
1993                  * Is there need to periodically free_page_list? It would
1994                  * appear not as the counts should be low
1995                  */
1996                 if (unlikely(folio_test_large(folio)))
1997                         destroy_large_folio(folio);
1998                 else
1999                         list_add(&folio->lru, &free_pages);
2000                 continue;
2001
2002 activate_locked_split:
2003                 /*
2004                  * The tail pages that are failed to add into swap cache
2005                  * reach here.  Fixup nr_scanned and nr_pages.
2006                  */
2007                 if (nr_pages > 1) {
2008                         sc->nr_scanned -= (nr_pages - 1);
2009                         nr_pages = 1;
2010                 }
2011 activate_locked:
2012                 /* Not a candidate for swapping, so reclaim swap space. */
2013                 if (folio_test_swapcache(folio) &&
2014                     (mem_cgroup_swap_full(&folio->page) ||
2015                      folio_test_mlocked(folio)))
2016                         try_to_free_swap(&folio->page);
2017                 VM_BUG_ON_FOLIO(folio_test_active(folio), folio);
2018                 if (!folio_test_mlocked(folio)) {
2019                         int type = folio_is_file_lru(folio);
2020                         folio_set_active(folio);
2021                         stat->nr_activate[type] += nr_pages;
2022                         count_memcg_folio_events(folio, PGACTIVATE, nr_pages);
2023                 }
2024 keep_locked:
2025                 folio_unlock(folio);
2026 keep:
2027                 list_add(&folio->lru, &ret_pages);
2028                 VM_BUG_ON_FOLIO(folio_test_lru(folio) ||
2029                                 folio_test_unevictable(folio), folio);
2030         }
2031         /* 'page_list' is always empty here */
2032
2033         /* Migrate folios selected for demotion */
2034         nr_reclaimed += demote_page_list(&demote_pages, pgdat);
2035         /* Folios that could not be demoted are still in @demote_pages */
2036         if (!list_empty(&demote_pages)) {
2037                 /* Folios which weren't demoted go back on @page_list for retry: */
2038                 list_splice_init(&demote_pages, page_list);
2039                 do_demote_pass = false;
2040                 goto retry;
2041         }
2042
2043         pgactivate = stat->nr_activate[0] + stat->nr_activate[1];
2044
2045         mem_cgroup_uncharge_list(&free_pages);
2046         try_to_unmap_flush();
2047         free_unref_page_list(&free_pages);
2048
2049         list_splice(&ret_pages, page_list);
2050         count_vm_events(PGACTIVATE, pgactivate);
2051
2052         if (plug)
2053                 swap_write_unplug(plug);
2054         return nr_reclaimed;
2055 }
2056
2057 unsigned int reclaim_clean_pages_from_list(struct zone *zone,
2058                                             struct list_head *folio_list)
2059 {
2060         struct scan_control sc = {
2061                 .gfp_mask = GFP_KERNEL,
2062                 .may_unmap = 1,
2063         };
2064         struct reclaim_stat stat;
2065         unsigned int nr_reclaimed;
2066         struct folio *folio, *next;
2067         LIST_HEAD(clean_folios);
2068         unsigned int noreclaim_flag;
2069
2070         list_for_each_entry_safe(folio, next, folio_list, lru) {
2071                 if (!folio_test_hugetlb(folio) && folio_is_file_lru(folio) &&
2072                     !folio_test_dirty(folio) && !__folio_test_movable(folio) &&
2073                     !folio_test_unevictable(folio)) {
2074                         folio_clear_active(folio);
2075                         list_move(&folio->lru, &clean_folios);
2076                 }
2077         }
2078
2079         /*
2080          * We should be safe here since we are only dealing with file pages and
2081          * we are not kswapd and therefore cannot write dirty file pages. But
2082          * call memalloc_noreclaim_save() anyway, just in case these conditions
2083          * change in the future.
2084          */
2085         noreclaim_flag = memalloc_noreclaim_save();
2086         nr_reclaimed = shrink_page_list(&clean_folios, zone->zone_pgdat, &sc,
2087                                         &stat, true);
2088         memalloc_noreclaim_restore(noreclaim_flag);
2089
2090         list_splice(&clean_folios, folio_list);
2091         mod_node_page_state(zone->zone_pgdat, NR_ISOLATED_FILE,
2092                             -(long)nr_reclaimed);
2093         /*
2094          * Since lazyfree pages are isolated from file LRU from the beginning,
2095          * they will rotate back to anonymous LRU in the end if it failed to
2096          * discard so isolated count will be mismatched.
2097          * Compensate the isolated count for both LRU lists.
2098          */
2099         mod_node_page_state(zone->zone_pgdat, NR_ISOLATED_ANON,
2100                             stat.nr_lazyfree_fail);
2101         mod_node_page_state(zone->zone_pgdat, NR_ISOLATED_FILE,
2102                             -(long)stat.nr_lazyfree_fail);
2103         return nr_reclaimed;
2104 }
2105
2106 /*
2107  * Update LRU sizes after isolating pages. The LRU size updates must
2108  * be complete before mem_cgroup_update_lru_size due to a sanity check.
2109  */
2110 static __always_inline void update_lru_sizes(struct lruvec *lruvec,
2111                         enum lru_list lru, unsigned long *nr_zone_taken)
2112 {
2113         int zid;
2114
2115         for (zid = 0; zid < MAX_NR_ZONES; zid++) {
2116                 if (!nr_zone_taken[zid])
2117                         continue;
2118
2119                 update_lru_size(lruvec, lru, zid, -nr_zone_taken[zid]);
2120         }
2121
2122 }
2123
2124 /*
2125  * Isolating page from the lruvec to fill in @dst list by nr_to_scan times.
2126  *
2127  * lruvec->lru_lock is heavily contended.  Some of the functions that
2128  * shrink the lists perform better by taking out a batch of pages
2129  * and working on them outside the LRU lock.
2130  *
2131  * For pagecache intensive workloads, this function is the hottest
2132  * spot in the kernel (apart from copy_*_user functions).
2133  *
2134  * Lru_lock must be held before calling this function.
2135  *
2136  * @nr_to_scan: The number of eligible pages to look through on the list.
2137  * @lruvec:     The LRU vector to pull pages from.
2138  * @dst:        The temp list to put pages on to.
2139  * @nr_scanned: The number of pages that were scanned.
2140  * @sc:         The scan_control struct for this reclaim session
2141  * @lru:        LRU list id for isolating
2142  *
2143  * returns how many pages were moved onto *@dst.
2144  */
2145 static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
2146                 struct lruvec *lruvec, struct list_head *dst,
2147                 unsigned long *nr_scanned, struct scan_control *sc,
2148                 enum lru_list lru)
2149 {
2150         struct list_head *src = &lruvec->lists[lru];
2151         unsigned long nr_taken = 0;
2152         unsigned long nr_zone_taken[MAX_NR_ZONES] = { 0 };
2153         unsigned long nr_skipped[MAX_NR_ZONES] = { 0, };
2154         unsigned long skipped = 0;
2155         unsigned long scan, total_scan, nr_pages;
2156         LIST_HEAD(folios_skipped);
2157
2158         total_scan = 0;
2159         scan = 0;
2160         while (scan < nr_to_scan && !list_empty(src)) {
2161                 struct list_head *move_to = src;
2162                 struct folio *folio;
2163
2164                 folio = lru_to_folio(src);
2165                 prefetchw_prev_lru_folio(folio, src, flags);
2166
2167                 nr_pages = folio_nr_pages(folio);
2168                 total_scan += nr_pages;
2169
2170                 if (folio_zonenum(folio) > sc->reclaim_idx) {
2171                         nr_skipped[folio_zonenum(folio)] += nr_pages;
2172                         move_to = &folios_skipped;
2173                         goto move;
2174                 }
2175
2176                 /*
2177                  * Do not count skipped folios because that makes the function
2178                  * return with no isolated folios if the LRU mostly contains
2179                  * ineligible folios.  This causes the VM to not reclaim any
2180                  * folios, triggering a premature OOM.
2181                  * Account all pages in a folio.
2182                  */
2183                 scan += nr_pages;
2184
2185                 if (!folio_test_lru(folio))
2186                         goto move;
2187                 if (!sc->may_unmap && folio_mapped(folio))
2188                         goto move;
2189
2190                 /*
2191                  * Be careful not to clear the lru flag until after we're
2192                  * sure the folio is not being freed elsewhere -- the
2193                  * folio release code relies on it.
2194                  */
2195                 if (unlikely(!folio_try_get(folio)))
2196                         goto move;
2197
2198                 if (!folio_test_clear_lru(folio)) {
2199                         /* Another thread is already isolating this folio */
2200                         folio_put(folio);
2201                         goto move;
2202                 }
2203
2204                 nr_taken += nr_pages;
2205                 nr_zone_taken[folio_zonenum(folio)] += nr_pages;
2206                 move_to = dst;
2207 move:
2208                 list_move(&folio->lru, move_to);
2209         }
2210
2211         /*
2212          * Splice any skipped folios to the start of the LRU list. Note that
2213          * this disrupts the LRU order when reclaiming for lower zones but
2214          * we cannot splice to the tail. If we did then the SWAP_CLUSTER_MAX
2215          * scanning would soon rescan the same folios to skip and waste lots
2216          * of cpu cycles.
2217          */
2218         if (!list_empty(&folios_skipped)) {
2219                 int zid;
2220
2221                 list_splice(&folios_skipped, src);
2222                 for (zid = 0; zid < MAX_NR_ZONES; zid++) {
2223                         if (!nr_skipped[zid])
2224                                 continue;
2225
2226                         __count_zid_vm_events(PGSCAN_SKIP, zid, nr_skipped[zid]);
2227                         skipped += nr_skipped[zid];
2228                 }
2229         }
2230         *nr_scanned = total_scan;
2231         trace_mm_vmscan_lru_isolate(sc->reclaim_idx, sc->order, nr_to_scan,
2232                                     total_scan, skipped, nr_taken,
2233                                     sc->may_unmap ? 0 : ISOLATE_UNMAPPED, lru);
2234         update_lru_sizes(lruvec, lru, nr_zone_taken);
2235         return nr_taken;
2236 }
2237
2238 /**
2239  * folio_isolate_lru() - Try to isolate a folio from its LRU list.
2240  * @folio: Folio to isolate from its LRU list.
2241  *
2242  * Isolate a @folio from an LRU list and adjust the vmstat statistic
2243  * corresponding to whatever LRU list the folio was on.
2244  *
2245  * The folio will have its LRU flag cleared.  If it was found on the
2246  * active list, it will have the Active flag set.  If it was found on the
2247  * unevictable list, it will have the Unevictable flag set.  These flags
2248  * may need to be cleared by the caller before letting the page go.
2249  *
2250  * Context:
2251  *
2252  * (1) Must be called with an elevated refcount on the page. This is a
2253  *     fundamental difference from isolate_lru_pages() (which is called
2254  *     without a stable reference).
2255  * (2) The lru_lock must not be held.
2256  * (3) Interrupts must be enabled.
2257  *
2258  * Return: 0 if the folio was removed from an LRU list.
2259  * -EBUSY if the folio was not on an LRU list.
2260  */
2261 int folio_isolate_lru(struct folio *folio)
2262 {
2263         int ret = -EBUSY;
2264
2265         VM_BUG_ON_FOLIO(!folio_ref_count(folio), folio);
2266
2267         if (folio_test_clear_lru(folio)) {
2268                 struct lruvec *lruvec;
2269
2270                 folio_get(folio);
2271                 lruvec = folio_lruvec_lock_irq(folio);
2272                 lruvec_del_folio(lruvec, folio);
2273                 unlock_page_lruvec_irq(lruvec);
2274                 ret = 0;
2275         }
2276
2277         return ret;
2278 }
2279
2280 /*
2281  * A direct reclaimer may isolate SWAP_CLUSTER_MAX pages from the LRU list and
2282  * then get rescheduled. When there are massive number of tasks doing page
2283  * allocation, such sleeping direct reclaimers may keep piling up on each CPU,
2284  * the LRU list will go small and be scanned faster than necessary, leading to
2285  * unnecessary swapping, thrashing and OOM.
2286  */
2287 static int too_many_isolated(struct pglist_data *pgdat, int file,
2288                 struct scan_control *sc)
2289 {
2290         unsigned long inactive, isolated;
2291         bool too_many;
2292
2293         if (current_is_kswapd())
2294                 return 0;
2295
2296         if (!writeback_throttling_sane(sc))
2297                 return 0;
2298
2299         if (file) {
2300                 inactive = node_page_state(pgdat, NR_INACTIVE_FILE);
2301                 isolated = node_page_state(pgdat, NR_ISOLATED_FILE);
2302         } else {
2303                 inactive = node_page_state(pgdat, NR_INACTIVE_ANON);
2304                 isolated = node_page_state(pgdat, NR_ISOLATED_ANON);
2305         }
2306
2307         /*
2308          * GFP_NOIO/GFP_NOFS callers are allowed to isolate more pages, so they
2309          * won't get blocked by normal direct-reclaimers, forming a circular
2310          * deadlock.
2311          */
2312         if ((sc->gfp_mask & (__GFP_IO | __GFP_FS)) == (__GFP_IO | __GFP_FS))
2313                 inactive >>= 3;
2314
2315         too_many = isolated > inactive;
2316
2317         /* Wake up tasks throttled due to too_many_isolated. */
2318         if (!too_many)
2319                 wake_throttle_isolated(pgdat);
2320
2321         return too_many;
2322 }
2323
2324 /*
2325  * move_pages_to_lru() moves folios from private @list to appropriate LRU list.
2326  * On return, @list is reused as a list of folios to be freed by the caller.
2327  *
2328  * Returns the number of pages moved to the given lruvec.
2329  */
2330 static unsigned int move_pages_to_lru(struct lruvec *lruvec,
2331                                       struct list_head *list)
2332 {
2333         int nr_pages, nr_moved = 0;
2334         LIST_HEAD(folios_to_free);
2335
2336         while (!list_empty(list)) {
2337                 struct folio *folio = lru_to_folio(list);
2338
2339                 VM_BUG_ON_FOLIO(folio_test_lru(folio), folio);
2340                 list_del(&folio->lru);
2341                 if (unlikely(!folio_evictable(folio))) {
2342                         spin_unlock_irq(&lruvec->lru_lock);
2343                         folio_putback_lru(folio);
2344                         spin_lock_irq(&lruvec->lru_lock);
2345                         continue;
2346                 }
2347
2348                 /*
2349                  * The folio_set_lru needs to be kept here for list integrity.
2350                  * Otherwise:
2351                  *   #0 move_pages_to_lru             #1 release_pages
2352                  *   if (!folio_put_testzero())
2353                  *                                    if (folio_put_testzero())
2354                  *                                      !lru //skip lru_lock
2355                  *     folio_set_lru()
2356                  *     list_add(&folio->lru,)
2357                  *                                        list_add(&folio->lru,)
2358                  */
2359                 folio_set_lru(folio);
2360
2361                 if (unlikely(folio_put_testzero(folio))) {
2362                         __folio_clear_lru_flags(folio);
2363
2364                         if (unlikely(folio_test_large(folio))) {
2365                                 spin_unlock_irq(&lruvec->lru_lock);
2366                                 destroy_large_folio(folio);
2367                                 spin_lock_irq(&lruvec->lru_lock);
2368                         } else
2369                                 list_add(&folio->lru, &folios_to_free);
2370
2371                         continue;
2372                 }
2373
2374                 /*
2375                  * All pages were isolated from the same lruvec (and isolation
2376                  * inhibits memcg migration).
2377                  */
2378                 VM_BUG_ON_FOLIO(!folio_matches_lruvec(folio, lruvec), folio);
2379                 lruvec_add_folio(lruvec, folio);
2380                 nr_pages = folio_nr_pages(folio);
2381                 nr_moved += nr_pages;
2382                 if (folio_test_active(folio))
2383                         workingset_age_nonresident(lruvec, nr_pages);
2384         }
2385
2386         /*
2387          * To save our caller's stack, now use input list for pages to free.
2388          */
2389         list_splice(&folios_to_free, list);
2390
2391         return nr_moved;
2392 }
2393
2394 /*
2395  * If a kernel thread (such as nfsd for loop-back mounts) services a backing
2396  * device by writing to the page cache it sets PF_LOCAL_THROTTLE. In this case
2397  * we should not throttle.  Otherwise it is safe to do so.
2398  */
2399 static int current_may_throttle(void)
2400 {
2401         return !(current->flags & PF_LOCAL_THROTTLE);
2402 }
2403
2404 /*
2405  * shrink_inactive_list() is a helper for shrink_node().  It returns the number
2406  * of reclaimed pages
2407  */
2408 static unsigned long
2409 shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
2410                      struct scan_control *sc, enum lru_list lru)
2411 {
2412         LIST_HEAD(page_list);
2413         unsigned long nr_scanned;
2414         unsigned int nr_reclaimed = 0;
2415         unsigned long nr_taken;
2416         struct reclaim_stat stat;
2417         bool file = is_file_lru(lru);
2418         enum vm_event_item item;
2419         struct pglist_data *pgdat = lruvec_pgdat(lruvec);
2420         bool stalled = false;
2421
2422         while (unlikely(too_many_isolated(pgdat, file, sc))) {
2423                 if (stalled)
2424                         return 0;
2425
2426                 /* wait a bit for the reclaimer. */
2427                 stalled = true;
2428                 reclaim_throttle(pgdat, VMSCAN_THROTTLE_ISOLATED);
2429
2430                 /* We are about to die and free our memory. Return now. */
2431                 if (fatal_signal_pending(current))
2432                         return SWAP_CLUSTER_MAX;
2433         }
2434
2435         lru_add_drain();
2436
2437         spin_lock_irq(&lruvec->lru_lock);
2438
2439         nr_taken = isolate_lru_pages(nr_to_scan, lruvec, &page_list,
2440                                      &nr_scanned, sc, lru);
2441
2442         __mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, nr_taken);
2443         item = current_is_kswapd() ? PGSCAN_KSWAPD : PGSCAN_DIRECT;
2444         if (!cgroup_reclaim(sc))
2445                 __count_vm_events(item, nr_scanned);
2446         __count_memcg_events(lruvec_memcg(lruvec), item, nr_scanned);
2447         __count_vm_events(PGSCAN_ANON + file, nr_scanned);
2448
2449         spin_unlock_irq(&lruvec->lru_lock);
2450
2451         if (nr_taken == 0)
2452                 return 0;
2453
2454         nr_reclaimed = shrink_page_list(&page_list, pgdat, sc, &stat, false);
2455
2456         spin_lock_irq(&lruvec->lru_lock);
2457         move_pages_to_lru(lruvec, &page_list);
2458
2459         __mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, -nr_taken);
2460         item = current_is_kswapd() ? PGSTEAL_KSWAPD : PGSTEAL_DIRECT;
2461         if (!cgroup_reclaim(sc))
2462                 __count_vm_events(item, nr_reclaimed);
2463         __count_memcg_events(lruvec_memcg(lruvec), item, nr_reclaimed);
2464         __count_vm_events(PGSTEAL_ANON + file, nr_reclaimed);
2465         spin_unlock_irq(&lruvec->lru_lock);
2466
2467         lru_note_cost(lruvec, file, stat.nr_pageout);
2468         mem_cgroup_uncharge_list(&page_list);
2469         free_unref_page_list(&page_list);
2470
2471         /*
2472          * If dirty pages are scanned that are not queued for IO, it
2473          * implies that flushers are not doing their job. This can
2474          * happen when memory pressure pushes dirty pages to the end of
2475          * the LRU before the dirty limits are breached and the dirty
2476          * data has expired. It can also happen when the proportion of
2477          * dirty pages grows not through writes but through memory
2478          * pressure reclaiming all the clean cache. And in some cases,
2479          * the flushers simply cannot keep up with the allocation
2480          * rate. Nudge the flusher threads in case they are asleep.
2481          */
2482         if (stat.nr_unqueued_dirty == nr_taken)
2483                 wakeup_flusher_threads(WB_REASON_VMSCAN);
2484
2485         sc->nr.dirty += stat.nr_dirty;
2486         sc->nr.congested += stat.nr_congested;
2487         sc->nr.unqueued_dirty += stat.nr_unqueued_dirty;
2488         sc->nr.writeback += stat.nr_writeback;
2489         sc->nr.immediate += stat.nr_immediate;
2490         sc->nr.taken += nr_taken;
2491         if (file)
2492                 sc->nr.file_taken += nr_taken;
2493
2494         trace_mm_vmscan_lru_shrink_inactive(pgdat->node_id,
2495                         nr_scanned, nr_reclaimed, &stat, sc->priority, file);
2496         return nr_reclaimed;
2497 }
2498
2499 /*
2500  * shrink_active_list() moves folios from the active LRU to the inactive LRU.
2501  *
2502  * We move them the other way if the folio is referenced by one or more
2503  * processes.
2504  *
2505  * If the folios are mostly unmapped, the processing is fast and it is
2506  * appropriate to hold lru_lock across the whole operation.  But if
2507  * the folios are mapped, the processing is slow (folio_referenced()), so
2508  * we should drop lru_lock around each folio.  It's impossible to balance
2509  * this, so instead we remove the folios from the LRU while processing them.
2510  * It is safe to rely on the active flag against the non-LRU folios in here
2511  * because nobody will play with that bit on a non-LRU folio.
2512  *
2513  * The downside is that we have to touch folio->_refcount against each folio.
2514  * But we had to alter folio->flags anyway.
2515  */
2516 static void shrink_active_list(unsigned long nr_to_scan,
2517                                struct lruvec *lruvec,
2518                                struct scan_control *sc,
2519                                enum lru_list lru)
2520 {
2521         unsigned long nr_taken;
2522         unsigned long nr_scanned;
2523         unsigned long vm_flags;
2524         LIST_HEAD(l_hold);      /* The folios which were snipped off */
2525         LIST_HEAD(l_active);
2526         LIST_HEAD(l_inactive);
2527         unsigned nr_deactivate, nr_activate;
2528         unsigned nr_rotated = 0;
2529         int file = is_file_lru(lru);
2530         struct pglist_data *pgdat = lruvec_pgdat(lruvec);
2531
2532         lru_add_drain();
2533
2534         spin_lock_irq(&lruvec->lru_lock);
2535
2536         nr_taken = isolate_lru_pages(nr_to_scan, lruvec, &l_hold,
2537                                      &nr_scanned, sc, lru);
2538
2539         __mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, nr_taken);
2540
2541         if (!cgroup_reclaim(sc))
2542                 __count_vm_events(PGREFILL, nr_scanned);
2543         __count_memcg_events(lruvec_memcg(lruvec), PGREFILL, nr_scanned);
2544
2545         spin_unlock_irq(&lruvec->lru_lock);
2546
2547         while (!list_empty(&l_hold)) {
2548                 struct folio *folio;
2549
2550                 cond_resched();
2551                 folio = lru_to_folio(&l_hold);
2552                 list_del(&folio->lru);
2553
2554                 if (unlikely(!folio_evictable(folio))) {
2555                         folio_putback_lru(folio);
2556                         continue;
2557                 }
2558
2559                 if (unlikely(buffer_heads_over_limit)) {
2560                         if (folio_test_private(folio) && folio_trylock(folio)) {
2561                                 if (folio_test_private(folio))
2562                                         filemap_release_folio(folio, 0);
2563                                 folio_unlock(folio);
2564                         }
2565                 }
2566
2567                 /* Referenced or rmap lock contention: rotate */
2568                 if (folio_referenced(folio, 0, sc->target_mem_cgroup,
2569                                      &vm_flags) != 0) {
2570                         /*
2571                          * Identify referenced, file-backed active folios and
2572                          * give them one more trip around the active list. So
2573                          * that executable code get better chances to stay in
2574                          * memory under moderate memory pressure.  Anon folios
2575                          * are not likely to be evicted by use-once streaming
2576                          * IO, plus JVM can create lots of anon VM_EXEC folios,
2577                          * so we ignore them here.
2578                          */
2579                         if ((vm_flags & VM_EXEC) && folio_is_file_lru(folio)) {
2580                                 nr_rotated += folio_nr_pages(folio);
2581                                 list_add(&folio->lru, &l_active);
2582                                 continue;
2583                         }
2584                 }
2585
2586                 folio_clear_active(folio);      /* we are de-activating */
2587                 folio_set_workingset(folio);
2588                 list_add(&folio->lru, &l_inactive);
2589         }
2590
2591         /*
2592          * Move folios back to the lru list.
2593          */
2594         spin_lock_irq(&lruvec->lru_lock);
2595
2596         nr_activate = move_pages_to_lru(lruvec, &l_active);
2597         nr_deactivate = move_pages_to_lru(lruvec, &l_inactive);
2598         /* Keep all free folios in l_active list */
2599         list_splice(&l_inactive, &l_active);
2600
2601         __count_vm_events(PGDEACTIVATE, nr_deactivate);
2602         __count_memcg_events(lruvec_memcg(lruvec), PGDEACTIVATE, nr_deactivate);
2603
2604         __mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, -nr_taken);
2605         spin_unlock_irq(&lruvec->lru_lock);
2606
2607         mem_cgroup_uncharge_list(&l_active);
2608         free_unref_page_list(&l_active);
2609         trace_mm_vmscan_lru_shrink_active(pgdat->node_id, nr_taken, nr_activate,
2610                         nr_deactivate, nr_rotated, sc->priority, file);
2611 }
2612
2613 static unsigned int reclaim_page_list(struct list_head *page_list,
2614                                       struct pglist_data *pgdat)
2615 {
2616         struct reclaim_stat dummy_stat;
2617         unsigned int nr_reclaimed;
2618         struct folio *folio;
2619         struct scan_control sc = {
2620                 .gfp_mask = GFP_KERNEL,
2621                 .may_writepage = 1,
2622                 .may_unmap = 1,
2623                 .may_swap = 1,
2624                 .no_demotion = 1,
2625         };
2626
2627         nr_reclaimed = shrink_page_list(page_list, pgdat, &sc, &dummy_stat, false);
2628         while (!list_empty(page_list)) {
2629                 folio = lru_to_folio(page_list);
2630                 list_del(&folio->lru);
2631                 folio_putback_lru(folio);
2632         }
2633
2634         return nr_reclaimed;
2635 }
2636
2637 unsigned long reclaim_pages(struct list_head *folio_list)
2638 {
2639         int nid;
2640         unsigned int nr_reclaimed = 0;
2641         LIST_HEAD(node_folio_list);
2642         unsigned int noreclaim_flag;
2643
2644         if (list_empty(folio_list))
2645                 return nr_reclaimed;
2646
2647         noreclaim_flag = memalloc_noreclaim_save();
2648
2649         nid = folio_nid(lru_to_folio(folio_list));
2650         do {
2651                 struct folio *folio = lru_to_folio(folio_list);
2652
2653                 if (nid == folio_nid(folio)) {
2654                         folio_clear_active(folio);
2655                         list_move(&folio->lru, &node_folio_list);
2656                         continue;
2657                 }
2658
2659                 nr_reclaimed += reclaim_page_list(&node_folio_list, NODE_DATA(nid));
2660                 nid = folio_nid(lru_to_folio(folio_list));
2661         } while (!list_empty(folio_list));
2662
2663         nr_reclaimed += reclaim_page_list(&node_folio_list, NODE_DATA(nid));
2664
2665         memalloc_noreclaim_restore(noreclaim_flag);
2666
2667         return nr_reclaimed;
2668 }
2669
2670 static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan,
2671                                  struct lruvec *lruvec, struct scan_control *sc)
2672 {
2673         if (is_active_lru(lru)) {
2674                 if (sc->may_deactivate & (1 << is_file_lru(lru)))
2675                         shrink_active_list(nr_to_scan, lruvec, sc, lru);
2676                 else
2677                         sc->skipped_deactivate = 1;
2678                 return 0;
2679         }
2680
2681         return shrink_inactive_list(nr_to_scan, lruvec, sc, lru);
2682 }
2683
2684 /*
2685  * The inactive anon list should be small enough that the VM never has
2686  * to do too much work.
2687  *
2688  * The inactive file list should be small enough to leave most memory
2689  * to the established workingset on the scan-resistant active list,
2690  * but large enough to avoid thrashing the aggregate readahead window.
2691  *
2692  * Both inactive lists should also be large enough that each inactive
2693  * page has a chance to be referenced again before it is reclaimed.
2694  *
2695  * If that fails and refaulting is observed, the inactive list grows.
2696  *
2697  * The inactive_ratio is the target ratio of ACTIVE to INACTIVE pages
2698  * on this LRU, maintained by the pageout code. An inactive_ratio
2699  * of 3 means 3:1 or 25% of the pages are kept on the inactive list.
2700  *
2701  * total     target    max
2702  * memory    ratio     inactive
2703  * -------------------------------------
2704  *   10MB       1         5MB
2705  *  100MB       1        50MB
2706  *    1GB       3       250MB
2707  *   10GB      10       0.9GB
2708  *  100GB      31         3GB
2709  *    1TB     101        10GB
2710  *   10TB     320        32GB
2711  */
2712 static bool inactive_is_low(struct lruvec *lruvec, enum lru_list inactive_lru)
2713 {
2714         enum lru_list active_lru = inactive_lru + LRU_ACTIVE;
2715         unsigned long inactive, active;
2716         unsigned long inactive_ratio;
2717         unsigned long gb;
2718
2719         inactive = lruvec_page_state(lruvec, NR_LRU_BASE + inactive_lru);
2720         active = lruvec_page_state(lruvec, NR_LRU_BASE + active_lru);
2721
2722         gb = (inactive + active) >> (30 - PAGE_SHIFT);
2723         if (gb)
2724                 inactive_ratio = int_sqrt(10 * gb);
2725         else
2726                 inactive_ratio = 1;
2727
2728         return inactive * inactive_ratio < active;
2729 }
2730
/* How get_scan_count() distributes scan pressure between anon and file. */
enum scan_balance {
	SCAN_EQUAL,	/* scan both types relative to their list size */
	SCAN_FRACT,	/* scan each type by its computed fraction */
	SCAN_ANON,	/* scan the anon lists exclusively */
	SCAN_FILE,	/* scan the file lists exclusively */
};
2737
2738 static void prepare_scan_count(pg_data_t *pgdat, struct scan_control *sc)
2739 {
2740         unsigned long file;
2741         struct lruvec *target_lruvec;
2742
2743         if (lru_gen_enabled())
2744                 return;
2745
2746         target_lruvec = mem_cgroup_lruvec(sc->target_mem_cgroup, pgdat);
2747
2748         /*
2749          * Flush the memory cgroup stats, so that we read accurate per-memcg
2750          * lruvec stats for heuristics.
2751          */
2752         mem_cgroup_flush_stats();
2753
2754         /*
2755          * Determine the scan balance between anon and file LRUs.
2756          */
2757         spin_lock_irq(&target_lruvec->lru_lock);
2758         sc->anon_cost = target_lruvec->anon_cost;
2759         sc->file_cost = target_lruvec->file_cost;
2760         spin_unlock_irq(&target_lruvec->lru_lock);
2761
2762         /*
2763          * Target desirable inactive:active list ratios for the anon
2764          * and file LRU lists.
2765          */
2766         if (!sc->force_deactivate) {
2767                 unsigned long refaults;
2768
2769                 /*
2770                  * When refaults are being observed, it means a new
2771                  * workingset is being established. Deactivate to get
2772                  * rid of any stale active pages quickly.
2773                  */
2774                 refaults = lruvec_page_state(target_lruvec,
2775                                 WORKINGSET_ACTIVATE_ANON);
2776                 if (refaults != target_lruvec->refaults[WORKINGSET_ANON] ||
2777                         inactive_is_low(target_lruvec, LRU_INACTIVE_ANON))
2778                         sc->may_deactivate |= DEACTIVATE_ANON;
2779                 else
2780                         sc->may_deactivate &= ~DEACTIVATE_ANON;
2781
2782                 refaults = lruvec_page_state(target_lruvec,
2783                                 WORKINGSET_ACTIVATE_FILE);
2784                 if (refaults != target_lruvec->refaults[WORKINGSET_FILE] ||
2785                     inactive_is_low(target_lruvec, LRU_INACTIVE_FILE))
2786                         sc->may_deactivate |= DEACTIVATE_FILE;
2787                 else
2788                         sc->may_deactivate &= ~DEACTIVATE_FILE;
2789         } else
2790                 sc->may_deactivate = DEACTIVATE_ANON | DEACTIVATE_FILE;
2791
2792         /*
2793          * If we have plenty of inactive file pages that aren't
2794          * thrashing, try to reclaim those first before touching
2795          * anonymous pages.
2796          */
2797         file = lruvec_page_state(target_lruvec, NR_INACTIVE_FILE);
2798         if (file >> sc->priority && !(sc->may_deactivate & DEACTIVATE_FILE))
2799                 sc->cache_trim_mode = 1;
2800         else
2801                 sc->cache_trim_mode = 0;
2802
2803         /*
2804          * Prevent the reclaimer from falling into the cache trap: as
2805          * cache pages start out inactive, every cache fault will tip
2806          * the scan balance towards the file LRU.  And as the file LRU
2807          * shrinks, so does the window for rotation from references.
2808          * This means we have a runaway feedback loop where a tiny
2809          * thrashing file LRU becomes infinitely more attractive than
2810          * anon pages.  Try to detect this based on file LRU size.
2811          */
2812         if (!cgroup_reclaim(sc)) {
2813                 unsigned long total_high_wmark = 0;
2814                 unsigned long free, anon;
2815                 int z;
2816
2817                 free = sum_zone_node_page_state(pgdat->node_id, NR_FREE_PAGES);
2818                 file = node_page_state(pgdat, NR_ACTIVE_FILE) +
2819                            node_page_state(pgdat, NR_INACTIVE_FILE);
2820
2821                 for (z = 0; z < MAX_NR_ZONES; z++) {
2822                         struct zone *zone = &pgdat->node_zones[z];
2823
2824                         if (!managed_zone(zone))
2825                                 continue;
2826
2827                         total_high_wmark += high_wmark_pages(zone);
2828                 }
2829
2830                 /*
2831                  * Consider anon: if that's low too, this isn't a
2832                  * runaway file reclaim problem, but rather just
2833                  * extreme pressure. Reclaim as per usual then.
2834                  */
2835                 anon = node_page_state(pgdat, NR_INACTIVE_ANON);
2836
2837                 sc->file_is_tiny =
2838                         file + free <= total_high_wmark &&
2839                         !(sc->may_deactivate & DEACTIVATE_ANON) &&
2840                         anon >> sc->priority;
2841         }
2842 }
2843
2844 /*
2845  * Determine how aggressively the anon and file LRU lists should be
2846  * scanned.
2847  *
2848  * nr[0] = anon inactive pages to scan; nr[1] = anon active pages to scan
2849  * nr[2] = file inactive pages to scan; nr[3] = file active pages to scan
2850  */
2851 static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,
2852                            unsigned long *nr)
2853 {
2854         struct pglist_data *pgdat = lruvec_pgdat(lruvec);
2855         struct mem_cgroup *memcg = lruvec_memcg(lruvec);
2856         unsigned long anon_cost, file_cost, total_cost;
2857         int swappiness = mem_cgroup_swappiness(memcg);
2858         u64 fraction[ANON_AND_FILE];
2859         u64 denominator = 0;    /* gcc */
2860         enum scan_balance scan_balance;
2861         unsigned long ap, fp;
2862         enum lru_list lru;
2863
2864         /* If we have no swap space, do not bother scanning anon pages. */
2865         if (!sc->may_swap || !can_reclaim_anon_pages(memcg, pgdat->node_id, sc)) {
2866                 scan_balance = SCAN_FILE;
2867                 goto out;
2868         }
2869
2870         /*
2871          * Global reclaim will swap to prevent OOM even with no
2872          * swappiness, but memcg users want to use this knob to
2873          * disable swapping for individual groups completely when
2874          * using the memory controller's swap limit feature would be
2875          * too expensive.
2876          */
2877         if (cgroup_reclaim(sc) && !swappiness) {
2878                 scan_balance = SCAN_FILE;
2879                 goto out;
2880         }
2881
2882         /*
2883          * Do not apply any pressure balancing cleverness when the
2884          * system is close to OOM, scan both anon and file equally
2885          * (unless the swappiness setting disagrees with swapping).
2886          */
2887         if (!sc->priority && swappiness) {
2888                 scan_balance = SCAN_EQUAL;
2889                 goto out;
2890         }
2891
2892         /*
2893          * If the system is almost out of file pages, force-scan anon.
2894          */
2895         if (sc->file_is_tiny) {
2896                 scan_balance = SCAN_ANON;
2897                 goto out;
2898         }
2899
2900         /*
2901          * If there is enough inactive page cache, we do not reclaim
2902          * anything from the anonymous working right now.
2903          */
2904         if (sc->cache_trim_mode) {
2905                 scan_balance = SCAN_FILE;
2906                 goto out;
2907         }
2908
2909         scan_balance = SCAN_FRACT;
2910         /*
2911          * Calculate the pressure balance between anon and file pages.
2912          *
2913          * The amount of pressure we put on each LRU is inversely
2914          * proportional to the cost of reclaiming each list, as
2915          * determined by the share of pages that are refaulting, times
2916          * the relative IO cost of bringing back a swapped out
2917          * anonymous page vs reloading a filesystem page (swappiness).
2918          *
2919          * Although we limit that influence to ensure no list gets
2920          * left behind completely: at least a third of the pressure is
2921          * applied, before swappiness.
2922          *
2923          * With swappiness at 100, anon and file have equal IO cost.
2924          */
2925         total_cost = sc->anon_cost + sc->file_cost;
2926         anon_cost = total_cost + sc->anon_cost;
2927         file_cost = total_cost + sc->file_cost;
2928         total_cost = anon_cost + file_cost;
2929
2930         ap = swappiness * (total_cost + 1);
2931         ap /= anon_cost + 1;
2932
2933         fp = (200 - swappiness) * (total_cost + 1);
2934         fp /= file_cost + 1;
2935
2936         fraction[0] = ap;
2937         fraction[1] = fp;
2938         denominator = ap + fp;
2939 out:
2940         for_each_evictable_lru(lru) {
2941                 int file = is_file_lru(lru);
2942                 unsigned long lruvec_size;
2943                 unsigned long low, min;
2944                 unsigned long scan;
2945
2946                 lruvec_size = lruvec_lru_size(lruvec, lru, sc->reclaim_idx);
2947                 mem_cgroup_protection(sc->target_mem_cgroup, memcg,
2948                                       &min, &low);
2949
2950                 if (min || low) {
2951                         /*
2952                          * Scale a cgroup's reclaim pressure by proportioning
2953                          * its current usage to its memory.low or memory.min
2954                          * setting.
2955                          *
2956                          * This is important, as otherwise scanning aggression
2957                          * becomes extremely binary -- from nothing as we
2958                          * approach the memory protection threshold, to totally
2959                          * nominal as we exceed it.  This results in requiring
2960                          * setting extremely liberal protection thresholds. It
2961                          * also means we simply get no protection at all if we
2962                          * set it too low, which is not ideal.
2963                          *
2964                          * If there is any protection in place, we reduce scan
2965                          * pressure by how much of the total memory used is
2966                          * within protection thresholds.
2967                          *
2968                          * There is one special case: in the first reclaim pass,
2969                          * we skip over all groups that are within their low
2970                          * protection. If that fails to reclaim enough pages to
2971                          * satisfy the reclaim goal, we come back and override
2972                          * the best-effort low protection. However, we still
2973                          * ideally want to honor how well-behaved groups are in
2974                          * that case instead of simply punishing them all
2975                          * equally. As such, we reclaim them based on how much
2976                          * memory they are using, reducing the scan pressure
2977                          * again by how much of the total memory used is under
2978                          * hard protection.
2979                          */
2980                         unsigned long cgroup_size = mem_cgroup_size(memcg);
2981                         unsigned long protection;
2982
2983                         /* memory.low scaling, make sure we retry before OOM */
2984                         if (!sc->memcg_low_reclaim && low > min) {
2985                                 protection = low;
2986                                 sc->memcg_low_skipped = 1;
2987                         } else {
2988                                 protection = min;
2989                         }
2990
2991                         /* Avoid TOCTOU with earlier protection check */
2992                         cgroup_size = max(cgroup_size, protection);
2993
2994                         scan = lruvec_size - lruvec_size * protection /
2995                                 (cgroup_size + 1);
2996
2997                         /*
2998                          * Minimally target SWAP_CLUSTER_MAX pages to keep
2999                          * reclaim moving forwards, avoiding decrementing
3000                          * sc->priority further than desirable.
3001                          */
3002                         scan = max(scan, SWAP_CLUSTER_MAX);
3003                 } else {
3004                         scan = lruvec_size;
3005                 }
3006
3007                 scan >>= sc->priority;
3008
3009                 /*
3010                  * If the cgroup's already been deleted, make sure to
3011                  * scrape out the remaining cache.
3012                  */
3013                 if (!scan && !mem_cgroup_online(memcg))
3014                         scan = min(lruvec_size, SWAP_CLUSTER_MAX);
3015
3016                 switch (scan_balance) {
3017                 case SCAN_EQUAL:
3018                         /* Scan lists relative to size */
3019                         break;
3020                 case SCAN_FRACT:
3021                         /*
3022                          * Scan types proportional to swappiness and
3023                          * their relative recent reclaim efficiency.
3024                          * Make sure we don't miss the last page on
3025                          * the offlined memory cgroups because of a
3026                          * round-off error.
3027                          */
3028                         scan = mem_cgroup_online(memcg) ?
3029                                div64_u64(scan * fraction[file], denominator) :
3030                                DIV64_U64_ROUND_UP(scan * fraction[file],
3031                                                   denominator);
3032                         break;
3033                 case SCAN_FILE:
3034                 case SCAN_ANON:
3035                         /* Scan one type exclusively */
3036                         if ((scan_balance == SCAN_FILE) != file)
3037                                 scan = 0;
3038                         break;
3039                 default:
3040                         /* Look ma, no brain */
3041                         BUG();
3042                 }
3043
3044                 nr[lru] = scan;
3045         }
3046 }
3047
3048 /*
3049  * Anonymous LRU management is a waste if there is
3050  * ultimately no way to reclaim the memory.
3051  */
3052 static bool can_age_anon_pages(struct pglist_data *pgdat,
3053                                struct scan_control *sc)
3054 {
3055         /* Aging the anon LRU is valuable if swap is present: */
3056         if (total_swap_pages > 0)
3057                 return true;
3058
3059         /* Also valuable if anon pages can be demoted: */
3060         return can_demote(pgdat->node_id, sc);
3061 }
3062
3063 #ifdef CONFIG_LRU_GEN
3064
3065 /******************************************************************************
3066  *                          shorthand helpers
3067  ******************************************************************************/
3068
/*
 * The PG_referenced and PG_workingset bits as one mask. In this file the mask
 * is always cleared together with LRU_REFS_MASK.
 */
#define LRU_REFS_FLAGS  (BIT(PG_referenced) | BIT(PG_workingset))

/* Declare a local max_seq holding a READ_ONCE() snapshot of the lruvec's max_seq. */
#define DEFINE_MAX_SEQ(lruvec)                                          \
        unsigned long max_seq = READ_ONCE((lruvec)->lrugen.max_seq)

/*
 * Declare a local min_seq[] array holding READ_ONCE() snapshots of the
 * lruvec's anon and file min_seq, in that order.
 */
#define DEFINE_MIN_SEQ(lruvec)                                          \
        unsigned long min_seq[ANON_AND_FILE] = {                        \
                READ_ONCE((lruvec)->lrugen.min_seq[LRU_GEN_ANON]),      \
                READ_ONCE((lruvec)->lrugen.min_seq[LRU_GEN_FILE]),      \
        }

/*
 * Iterate over every (generation, type, zone) combination. The caller
 * provides the three loop variables; zone is the innermost loop.
 */
#define for_each_gen_type_zone(gen, type, zone)                         \
        for ((gen) = 0; (gen) < MAX_NR_GENS; (gen)++)                   \
                for ((type) = 0; (type) < ANON_AND_FILE; (type)++)      \
                        for ((zone) = 0; (zone) < MAX_NR_ZONES; (zone)++)
3084
3085 static struct lruvec __maybe_unused *get_lruvec(struct mem_cgroup *memcg, int nid)
3086 {
3087         struct pglist_data *pgdat = NODE_DATA(nid);
3088
3089 #ifdef CONFIG_MEMCG
3090         if (memcg) {
3091                 struct lruvec *lruvec = &memcg->nodeinfo[nid]->lruvec;
3092
3093                 /* for hotadd_new_pgdat() */
3094                 if (!lruvec->pgdat)
3095                         lruvec->pgdat = pgdat;
3096
3097                 return lruvec;
3098         }
3099 #endif
3100         VM_WARN_ON_ONCE(!mem_cgroup_disabled());
3101
3102         return pgdat ? &pgdat->__lruvec : NULL;
3103 }
3104
3105 static int get_swappiness(struct lruvec *lruvec, struct scan_control *sc)
3106 {
3107         struct mem_cgroup *memcg = lruvec_memcg(lruvec);
3108         struct pglist_data *pgdat = lruvec_pgdat(lruvec);
3109
3110         if (!can_demote(pgdat->node_id, sc) &&
3111             mem_cgroup_get_nr_swap_pages(memcg) < MIN_LRU_BATCH)
3112                 return 0;
3113
3114         return mem_cgroup_swappiness(memcg);
3115 }
3116
3117 static int get_nr_gens(struct lruvec *lruvec, int type)
3118 {
3119         return lruvec->lrugen.max_seq - lruvec->lrugen.min_seq[type] + 1;
3120 }
3121
3122 static bool __maybe_unused seq_is_valid(struct lruvec *lruvec)
3123 {
3124         /* see the comment on lru_gen_struct */
3125         return get_nr_gens(lruvec, LRU_GEN_FILE) >= MIN_NR_GENS &&
3126                get_nr_gens(lruvec, LRU_GEN_FILE) <= get_nr_gens(lruvec, LRU_GEN_ANON) &&
3127                get_nr_gens(lruvec, LRU_GEN_ANON) <= MAX_NR_GENS;
3128 }
3129
3130 /******************************************************************************
3131  *                          refault feedback loop
3132  ******************************************************************************/
3133
/*
 * A feedback loop based on Proportional-Integral-Derivative (PID) controller.
 *
 * The P term is refaulted/(evicted+protected) from a tier in the generation
 * currently being evicted; the I term is the exponential moving average of the
 * P term over the generations previously evicted, using the smoothing factor
 * 1/2; the D term isn't supported.
 *
 * The setpoint (SP) is always the first tier of one type; the process variable
 * (PV) is either any tier of the other type or any other tier of the same
 * type.
 *
 * The error is the difference between the SP and the PV; the correction is to
 * turn off protection when SP>PV or turn on protection when SP<PV.
 *
 * For future optimizations:
 * 1. The D term may discount the other two terms over time so that long-lived
 *    generations can resist stale information.
 */
struct ctrl_pos {
        /* refaults: the running average plus the count for the generation being evicted */
        unsigned long refaulted;
        /* evictions, plus protections for tiers above the first, accumulated the same way */
        unsigned long total;
        /* weight given to this position when it is compared against another one */
        int gain;
};
3158
3159 static void read_ctrl_pos(struct lruvec *lruvec, int type, int tier, int gain,
3160                           struct ctrl_pos *pos)
3161 {
3162         struct lru_gen_struct *lrugen = &lruvec->lrugen;
3163         int hist = lru_hist_from_seq(lrugen->min_seq[type]);
3164
3165         pos->refaulted = lrugen->avg_refaulted[type][tier] +
3166                          atomic_long_read(&lrugen->refaulted[hist][type][tier]);
3167         pos->total = lrugen->avg_total[type][tier] +
3168                      atomic_long_read(&lrugen->evicted[hist][type][tier]);
3169         if (tier)
3170                 pos->total += lrugen->protected[hist][type][tier - 1];
3171         pos->gain = gain;
3172 }
3173
3174 static void reset_ctrl_pos(struct lruvec *lruvec, int type, bool carryover)
3175 {
3176         int hist, tier;
3177         struct lru_gen_struct *lrugen = &lruvec->lrugen;
3178         bool clear = carryover ? NR_HIST_GENS == 1 : NR_HIST_GENS > 1;
3179         unsigned long seq = carryover ? lrugen->min_seq[type] : lrugen->max_seq + 1;
3180
3181         lockdep_assert_held(&lruvec->lru_lock);
3182
3183         if (!carryover && !clear)
3184                 return;
3185
3186         hist = lru_hist_from_seq(seq);
3187
3188         for (tier = 0; tier < MAX_NR_TIERS; tier++) {
3189                 if (carryover) {
3190                         unsigned long sum;
3191
3192                         sum = lrugen->avg_refaulted[type][tier] +
3193                               atomic_long_read(&lrugen->refaulted[hist][type][tier]);
3194                         WRITE_ONCE(lrugen->avg_refaulted[type][tier], sum / 2);
3195
3196                         sum = lrugen->avg_total[type][tier] +
3197                               atomic_long_read(&lrugen->evicted[hist][type][tier]);
3198                         if (tier)
3199                                 sum += lrugen->protected[hist][type][tier - 1];
3200                         WRITE_ONCE(lrugen->avg_total[type][tier], sum / 2);
3201                 }
3202
3203                 if (clear) {
3204                         atomic_long_set(&lrugen->refaulted[hist][type][tier], 0);
3205                         atomic_long_set(&lrugen->evicted[hist][type][tier], 0);
3206                         if (tier)
3207                                 WRITE_ONCE(lrugen->protected[hist][type][tier - 1], 0);
3208                 }
3209         }
3210 }
3211
3212 static bool positive_ctrl_err(struct ctrl_pos *sp, struct ctrl_pos *pv)
3213 {
3214         /*
3215          * Return true if the PV has a limited number of refaults or a lower
3216          * refaulted/total than the SP.
3217          */
3218         return pv->refaulted < MIN_LRU_BATCH ||
3219                pv->refaulted * (sp->total + MIN_LRU_BATCH) * sp->gain <=
3220                (sp->refaulted + 1) * pv->total * pv->gain;
3221 }
3222
3223 /******************************************************************************
3224  *                          the aging
3225  ******************************************************************************/
3226
3227 /* promote pages accessed through page tables */
3228 static int folio_update_gen(struct folio *folio, int gen)
3229 {
3230         unsigned long new_flags, old_flags = READ_ONCE(folio->flags);
3231
3232         VM_WARN_ON_ONCE(gen >= MAX_NR_GENS);
3233         VM_WARN_ON_ONCE(!rcu_read_lock_held());
3234
3235         do {
3236                 /* lru_gen_del_folio() has isolated this page? */
3237                 if (!(old_flags & LRU_GEN_MASK)) {
3238                         /* for shrink_page_list() */
3239                         new_flags = old_flags | BIT(PG_referenced);
3240                         continue;
3241                 }
3242
3243                 new_flags = old_flags & ~(LRU_GEN_MASK | LRU_REFS_MASK | LRU_REFS_FLAGS);
3244                 new_flags |= (gen + 1UL) << LRU_GEN_PGOFF;
3245         } while (!try_cmpxchg(&folio->flags, &old_flags, new_flags));
3246
3247         return ((old_flags & LRU_GEN_MASK) >> LRU_GEN_PGOFF) - 1;
3248 }
3249
3250 /* protect pages accessed multiple times through file descriptors */
3251 static int folio_inc_gen(struct lruvec *lruvec, struct folio *folio, bool reclaiming)
3252 {
3253         int type = folio_is_file_lru(folio);
3254         struct lru_gen_struct *lrugen = &lruvec->lrugen;
3255         int new_gen, old_gen = lru_gen_from_seq(lrugen->min_seq[type]);
3256         unsigned long new_flags, old_flags = READ_ONCE(folio->flags);
3257
3258         VM_WARN_ON_ONCE_FOLIO(!(old_flags & LRU_GEN_MASK), folio);
3259
3260         do {
3261                 new_gen = ((old_flags & LRU_GEN_MASK) >> LRU_GEN_PGOFF) - 1;
3262                 /* folio_update_gen() has promoted this page? */
3263                 if (new_gen >= 0 && new_gen != old_gen)
3264                         return new_gen;
3265
3266                 new_gen = (old_gen + 1) % MAX_NR_GENS;
3267
3268                 new_flags = old_flags & ~(LRU_GEN_MASK | LRU_REFS_MASK | LRU_REFS_FLAGS);
3269                 new_flags |= (new_gen + 1UL) << LRU_GEN_PGOFF;
3270                 /* for folio_end_writeback() */
3271                 if (reclaiming)
3272                         new_flags |= BIT(PG_reclaim);
3273         } while (!try_cmpxchg(&folio->flags, &old_flags, new_flags));
3274
3275         lru_gen_update_size(lruvec, folio, old_gen, new_gen);
3276
3277         return new_gen;
3278 }
3279
3280 static unsigned long get_pte_pfn(pte_t pte, struct vm_area_struct *vma, unsigned long addr)
3281 {
3282         unsigned long pfn = pte_pfn(pte);
3283
3284         VM_WARN_ON_ONCE(addr < vma->vm_start || addr >= vma->vm_end);
3285
3286         if (!pte_present(pte) || is_zero_pfn(pfn))
3287                 return -1;
3288
3289         if (WARN_ON_ONCE(pte_devmap(pte) || pte_special(pte)))
3290                 return -1;
3291
3292         if (WARN_ON_ONCE(!pfn_valid(pfn)))
3293                 return -1;
3294
3295         return pfn;
3296 }
3297
3298 static struct folio *get_pfn_folio(unsigned long pfn, struct mem_cgroup *memcg,
3299                                    struct pglist_data *pgdat)
3300 {
3301         struct folio *folio;
3302
3303         /* try to avoid unnecessary memory loads */
3304         if (pfn < pgdat->node_start_pfn || pfn >= pgdat_end_pfn(pgdat))
3305                 return NULL;
3306
3307         folio = pfn_folio(pfn);
3308         if (folio_nid(folio) != pgdat->node_id)
3309                 return NULL;
3310
3311         if (folio_memcg_rcu(folio) != memcg)
3312                 return NULL;
3313
3314         return folio;
3315 }
3316
3317 static void inc_min_seq(struct lruvec *lruvec, int type)
3318 {
3319         struct lru_gen_struct *lrugen = &lruvec->lrugen;
3320
3321         reset_ctrl_pos(lruvec, type, true);
3322         WRITE_ONCE(lrugen->min_seq[type], lrugen->min_seq[type] + 1);
3323 }
3324
3325 static bool try_to_inc_min_seq(struct lruvec *lruvec, bool can_swap)
3326 {
3327         int gen, type, zone;
3328         bool success = false;
3329         struct lru_gen_struct *lrugen = &lruvec->lrugen;
3330         DEFINE_MIN_SEQ(lruvec);
3331
3332         VM_WARN_ON_ONCE(!seq_is_valid(lruvec));
3333
3334         /* find the oldest populated generation */
3335         for (type = !can_swap; type < ANON_AND_FILE; type++) {
3336                 while (min_seq[type] + MIN_NR_GENS <= lrugen->max_seq) {
3337                         gen = lru_gen_from_seq(min_seq[type]);
3338
3339                         for (zone = 0; zone < MAX_NR_ZONES; zone++) {
3340                                 if (!list_empty(&lrugen->lists[gen][type][zone]))
3341                                         goto next;
3342                         }
3343
3344                         min_seq[type]++;
3345                 }
3346 next:
3347                 ;
3348         }
3349
3350         /* see the comment on lru_gen_struct */
3351         if (can_swap) {
3352                 min_seq[LRU_GEN_ANON] = min(min_seq[LRU_GEN_ANON], min_seq[LRU_GEN_FILE]);
3353                 min_seq[LRU_GEN_FILE] = max(min_seq[LRU_GEN_ANON], lrugen->min_seq[LRU_GEN_FILE]);
3354         }
3355
3356         for (type = !can_swap; type < ANON_AND_FILE; type++) {
3357                 if (min_seq[type] == lrugen->min_seq[type])
3358                         continue;
3359
3360                 reset_ctrl_pos(lruvec, type, true);
3361                 WRITE_ONCE(lrugen->min_seq[type], min_seq[type]);
3362                 success = true;
3363         }
3364
3365         return success;
3366 }
3367
/*
 * Create a new youngest generation by incrementing lrugen->max_seq.
 *
 * @max_seq is the value the caller observed earlier; if lrugen->max_seq no
 * longer matches it once lru_lock is held, nothing is done. Takes and
 * releases lruvec->lru_lock with interrupts disabled.
 */
static void inc_max_seq(struct lruvec *lruvec, unsigned long max_seq, bool can_swap)
{
        int prev, next;
        int type, zone;
        struct lru_gen_struct *lrugen = &lruvec->lrugen;

        spin_lock_irq(&lruvec->lru_lock);

        VM_WARN_ON_ONCE(!seq_is_valid(lruvec));

        /* the caller's snapshot is stale; presumably someone else got here first */
        if (max_seq != lrugen->max_seq)
                goto unlock;

        /*
         * A type already holding MAX_NR_GENS generations has to give up its
         * oldest one first. File is checked before anon.
         */
        for (type = ANON_AND_FILE - 1; type >= 0; type--) {
                if (get_nr_gens(lruvec, type) != MAX_NR_GENS)
                        continue;

                /* only anon with swapping constrained is expected to get here */
                VM_WARN_ON_ONCE(type == LRU_GEN_FILE || can_swap);

                inc_min_seq(lruvec, type);
        }

        /*
         * Update the active/inactive LRU sizes for compatibility. Both sides of
         * the current max_seq need to be covered, since max_seq+1 can overlap
         * with min_seq[LRU_GEN_ANON] if swapping is constrained. And if they do
         * overlap, cold/hot inversion happens.
         */
        prev = lru_gen_from_seq(lrugen->max_seq - 1);
        next = lru_gen_from_seq(lrugen->max_seq + 1);

        for (type = 0; type < ANON_AND_FILE; type++) {
                for (zone = 0; zone < MAX_NR_ZONES; zone++) {
                        /* the inactive list of this type; the active one is lru + LRU_ACTIVE */
                        enum lru_list lru = type * LRU_INACTIVE_FILE;
                        long delta = lrugen->nr_pages[prev][type][zone] -
                                     lrugen->nr_pages[next][type][zone];

                        if (!delta)
                                continue;

                        /* shift delta pages from the active count to the inactive count */
                        __update_lru_size(lruvec, lru, zone, delta);
                        __update_lru_size(lruvec, lru + LRU_ACTIVE, zone, -delta);
                }
        }

        /* prepare the refault counters for the generation about to be created */
        for (type = 0; type < ANON_AND_FILE; type++)
                reset_ctrl_pos(lruvec, type, false);

        /* make sure preceding modifications appear */
        smp_store_release(&lrugen->max_seq, lrugen->max_seq + 1);
unlock:
        spin_unlock_irq(&lruvec->lru_lock);
}
3421
3422 static bool should_run_aging(struct lruvec *lruvec, unsigned long max_seq, unsigned long *min_seq,
3423                              struct scan_control *sc, bool can_swap, unsigned long *nr_to_scan)
3424 {
3425         int gen, type, zone;
3426         unsigned long old = 0;
3427         unsigned long young = 0;
3428         unsigned long total = 0;
3429         struct lru_gen_struct *lrugen = &lruvec->lrugen;
3430         struct mem_cgroup *memcg = lruvec_memcg(lruvec);
3431
3432         for (type = !can_swap; type < ANON_AND_FILE; type++) {
3433                 unsigned long seq;
3434
3435                 for (seq = min_seq[type]; seq <= max_seq; seq++) {
3436                         unsigned long size = 0;
3437
3438                         gen = lru_gen_from_seq(seq);
3439
3440                         for (zone = 0; zone < MAX_NR_ZONES; zone++)
3441                                 size += max(READ_ONCE(lrugen->nr_pages[gen][type][zone]), 0L);
3442
3443                         total += size;
3444                         if (seq == max_seq)
3445                                 young += size;
3446                         else if (seq + MIN_NR_GENS == max_seq)
3447                                 old += size;
3448                 }
3449         }
3450
3451         /* try to scrape all its memory if this memcg was deleted */
3452         *nr_to_scan = mem_cgroup_online(memcg) ? (total >> sc->priority) : total;
3453
3454         /*
3455          * The aging tries to be lazy to reduce the overhead, while the eviction
3456          * stalls when the number of generations reaches MIN_NR_GENS. Hence, the
3457          * ideal number of generations is MIN_NR_GENS+1.
3458          */
3459         if (min_seq[!can_swap] + MIN_NR_GENS > max_seq)
3460                 return true;
3461         if (min_seq[!can_swap] + MIN_NR_GENS < max_seq)
3462                 return false;
3463
3464         /*
3465          * It's also ideal to spread pages out evenly, i.e., 1/(MIN_NR_GENS+1)
3466          * of the total number of pages for each generation. A reasonable range
3467          * for this average portion is [1/MIN_NR_GENS, 1/(MIN_NR_GENS+2)]. The
3468          * aging cares about the upper bound of hot pages, while the eviction
3469          * cares about the lower bound of cold pages.
3470          */
3471         if (young * MIN_NR_GENS > total)
3472                 return true;
3473         if (old * (MIN_NR_GENS + 2) < total)
3474                 return true;
3475
3476         return false;
3477 }
3478
3479 static void age_lruvec(struct lruvec *lruvec, struct scan_control *sc)
3480 {
3481         bool need_aging;
3482         unsigned long nr_to_scan;
3483         int swappiness = get_swappiness(lruvec, sc);
3484         struct mem_cgroup *memcg = lruvec_memcg(lruvec);
3485         DEFINE_MAX_SEQ(lruvec);
3486         DEFINE_MIN_SEQ(lruvec);
3487
3488         VM_WARN_ON_ONCE(sc->memcg_low_reclaim);
3489
3490         mem_cgroup_calculate_protection(NULL, memcg);
3491
3492         if (mem_cgroup_below_min(memcg))
3493                 return;
3494
3495         need_aging = should_run_aging(lruvec, max_seq, min_seq, sc, swappiness, &nr_to_scan);
3496         if (need_aging)
3497                 inc_max_seq(lruvec, max_seq, swappiness);
3498 }
3499
3500 static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc)
3501 {
3502         struct mem_cgroup *memcg;
3503
3504         VM_WARN_ON_ONCE(!current_is_kswapd());
3505
3506         memcg = mem_cgroup_iter(NULL, NULL, NULL);
3507         do {
3508                 struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat);
3509
3510                 age_lruvec(lruvec, sc);
3511
3512                 cond_resched();
3513         } while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)));
3514 }
3515
/*
 * This function exploits spatial locality when shrink_page_list() walks the
 * rmap. It scans the adjacent PTEs of a young PTE and promotes hot pages.
 *
 * The caller must hold pvmw->ptl. The scan window is at most MIN_LRU_BATCH
 * pages, limited to the VMA and to the PMD range containing pvmw->address.
 * Only folios on the same node and charged to the same memcg as the folio
 * being walked are considered. Nothing is done if the page table lock is
 * contended.
 */
void lru_gen_look_around(struct page_vma_mapped_walk *pvmw)
{
        int i;
        pte_t *pte;
        unsigned long start;
        unsigned long end;
        unsigned long addr;
        /* one bit per PTE in the window: set for folios that need a new generation */
        unsigned long bitmap[BITS_TO_LONGS(MIN_LRU_BATCH)] = {};
        struct folio *folio = pfn_folio(pvmw->pfn);
        struct mem_cgroup *memcg = folio_memcg(folio);
        struct pglist_data *pgdat = folio_pgdat(folio);
        struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat);
        DEFINE_MAX_SEQ(lruvec);
        int old_gen, new_gen = lru_gen_from_seq(max_seq);

        lockdep_assert_held(pvmw->ptl);
        VM_WARN_ON_ONCE_FOLIO(folio_test_lru(folio), folio);

        /* this is opportunistic work; don't make other lock waiters wait longer */
        if (spin_is_contended(pvmw->ptl))
                return;

        /* clamp the window to the PMD range around the address and to the VMA */
        start = max(pvmw->address & PMD_MASK, pvmw->vma->vm_start);
        end = min(pvmw->address | ~PMD_MASK, pvmw->vma->vm_end - 1) + 1;

        /*
         * Shrink the window to MIN_LRU_BATCH pages: centered on the address
         * when there is room on both sides, otherwise anchored at the nearer
         * boundary.
         */
        if (end - start > MIN_LRU_BATCH * PAGE_SIZE) {
                if (pvmw->address - start < MIN_LRU_BATCH * PAGE_SIZE / 2)
                        end = start + MIN_LRU_BATCH * PAGE_SIZE;
                else if (end - pvmw->address < MIN_LRU_BATCH * PAGE_SIZE / 2)
                        start = end - MIN_LRU_BATCH * PAGE_SIZE;
                else {
                        start = pvmw->address - MIN_LRU_BATCH * PAGE_SIZE / 2;
                        end = pvmw->address + MIN_LRU_BATCH * PAGE_SIZE / 2;
                }
        }

        /* step back from the walked PTE to the PTE that maps start */
        pte = pvmw->pte - (pvmw->address - start) / PAGE_SIZE;

        rcu_read_lock();
        arch_enter_lazy_mmu_mode();

        for (i = 0, addr = start; addr != end; i++, addr += PAGE_SIZE) {
                unsigned long pfn;

                pfn = get_pte_pfn(pte[i], pvmw->vma, addr);
                if (pfn == -1)
                        continue;

                if (!pte_young(pte[i]))
                        continue;

                folio = get_pfn_folio(pfn, memcg, pgdat);
                if (!folio)
                        continue;

                /* the PTE was just seen young, so clearing is expected to succeed */
                if (!ptep_test_and_clear_young(pvmw->vma, addr, pte + i))
                        VM_WARN_ON_ONCE(true);

                /*
                 * Transfer the dirty bit from the PTE to the folio, except for
                 * swap-backed anon folios that are not in the swap cache.
                 */
                if (pte_dirty(pte[i]) && !folio_test_dirty(folio) &&
                    !(folio_test_anon(folio) && folio_test_swapbacked(folio) &&
                      !folio_test_swapcache(folio)))
                        folio_mark_dirty(folio);

                old_gen = folio_lru_gen(folio);
                if (old_gen < 0)
                        /* not on a multi-gen LRU list; just mark it referenced */
                        folio_set_referenced(folio);
                else if (old_gen != new_gen)
                        __set_bit(i, bitmap);
        }

        arch_leave_lazy_mmu_mode();
        rcu_read_unlock();

        /* for only a few folios, activate them one by one and skip the lru_lock below */
        if (bitmap_weight(bitmap, MIN_LRU_BATCH) < PAGEVEC_SIZE) {
                for_each_set_bit(i, bitmap, MIN_LRU_BATCH) {
                        folio = pfn_folio(pte_pfn(pte[i]));
                        folio_activate(folio);
                }
                return;
        }

        /* folio_update_gen() requires stable folio_memcg() */
        if (!mem_cgroup_trylock_pages(memcg))
                return;

        spin_lock_irq(&lruvec->lru_lock);
        /* max_seq may have moved since the snapshot; re-read it under the lock */
        new_gen = lru_gen_from_seq(lruvec->lrugen.max_seq);

        for_each_set_bit(i, bitmap, MIN_LRU_BATCH) {
                folio = pfn_folio(pte_pfn(pte[i]));
                /* re-check the memcg; it was last checked before the locks were taken */
                if (folio_memcg_rcu(folio) != memcg)
                        continue;

                old_gen = folio_update_gen(folio, new_gen);
                if (old_gen < 0 || old_gen == new_gen)
                        continue;

                lru_gen_update_size(lruvec, folio, old_gen, new_gen);
        }

        spin_unlock_irq(&lruvec->lru_lock);

        mem_cgroup_unlock_pages();
}
3623
3624 /******************************************************************************
3625  *                          the eviction
3626  ******************************************************************************/
3627
/*
 * Try to dispose of @folio without isolating it for reclaim.
 *
 * Returns true if the folio was moved to another list (unevictable, dirty
 * lazyfree, already promoted, protected by its tier, or waiting for
 * writeback), in which case the caller counts it as sorted. Returns false
 * if none of these apply and the caller should try to isolate it.
 *
 * @tier_idx is the highest tier that may be evicted; folios in higher tiers
 * are moved to the next generation.
 */
static bool sort_folio(struct lruvec *lruvec, struct folio *folio, int tier_idx)
{
        bool success;
        int gen = folio_lru_gen(folio);
        int type = folio_is_file_lru(folio);
        int zone = folio_zonenum(folio);
        int delta = folio_nr_pages(folio);
        int refs = folio_lru_refs(folio);
        int tier = lru_tier_from_refs(refs);
        struct lru_gen_struct *lrugen = &lruvec->lrugen;

        VM_WARN_ON_ONCE_FOLIO(gen >= MAX_NR_GENS, folio);

        /* unevictable */
        if (!folio_evictable(folio)) {
                success = lru_gen_del_folio(lruvec, folio, true);
                VM_WARN_ON_ONCE_FOLIO(!success, folio);
                folio_set_unevictable(folio);
                lruvec_add_folio(lruvec, folio);
                __count_vm_events(UNEVICTABLE_PGCULLED, delta);
                return true;
        }

        /* dirty lazyfree */
        if (type == LRU_GEN_FILE && folio_test_anon(folio) && folio_test_dirty(folio)) {
                success = lru_gen_del_folio(lruvec, folio, true);
                VM_WARN_ON_ONCE_FOLIO(!success, folio);
                /* mark it swap-backed again before re-adding it */
                folio_set_swapbacked(folio);
                lruvec_add_folio_tail(lruvec, folio);
                return true;
        }

        /* promoted */
        if (gen != lru_gen_from_seq(lrugen->min_seq[type])) {
                /* its flags already name a newer generation; move it to that list */
                list_move(&folio->lru, &lrugen->lists[gen][type][zone]);
                return true;
        }

        /* protected */
        if (tier > tier_idx) {
                int hist = lru_hist_from_seq(lrugen->min_seq[type]);

                gen = folio_inc_gen(lruvec, folio, false);
                list_move_tail(&folio->lru, &lrugen->lists[gen][type][zone]);

                /* account the protection for the refault feedback loop */
                WRITE_ONCE(lrugen->protected[hist][type][tier - 1],
                           lrugen->protected[hist][type][tier - 1] + delta);
                __mod_lruvec_state(lruvec, WORKINGSET_ACTIVATE_BASE + type, delta);
                return true;
        }

        /* waiting for writeback */
        if (folio_test_locked(folio) || folio_test_writeback(folio) ||
            (type == LRU_GEN_FILE && folio_test_dirty(folio))) {
                /* reclaiming=true makes folio_inc_gen() also set PG_reclaim */
                gen = folio_inc_gen(lruvec, folio, true);
                list_move(&folio->lru, &lrugen->lists[gen][type][zone]);
                return true;
        }

        return false;
}
3689
3690 static bool isolate_folio(struct lruvec *lruvec, struct folio *folio, struct scan_control *sc)
3691 {
3692         bool success;
3693
3694         /* unmapping inhibited */
3695         if (!sc->may_unmap && folio_mapped(folio))
3696                 return false;
3697
3698         /* swapping inhibited */
3699         if (!(sc->may_writepage && (sc->gfp_mask & __GFP_IO)) &&
3700             (folio_test_dirty(folio) ||
3701              (folio_test_anon(folio) && !folio_test_swapcache(folio))))
3702                 return false;
3703
3704         /* raced with release_pages() */
3705         if (!folio_try_get(folio))
3706                 return false;
3707
3708         /* raced with another isolation */
3709         if (!folio_test_clear_lru(folio)) {
3710                 folio_put(folio);
3711                 return false;
3712         }
3713
3714         /* see the comment on MAX_NR_TIERS */
3715         if (!folio_test_referenced(folio))
3716                 set_mask_bits(&folio->flags, LRU_REFS_MASK | LRU_REFS_FLAGS, 0);
3717
3718         /* for shrink_page_list() */
3719         folio_clear_reclaim(folio);
3720         folio_clear_referenced(folio);
3721
3722         success = lru_gen_del_folio(lruvec, folio, true);
3723         VM_WARN_ON_ONCE_FOLIO(!success, folio);
3724
3725         return true;
3726 }
3727
/*
 * Scan the oldest generation of @type and collect folios to reclaim on @list.
 *
 * Zones are walked from sc->reclaim_idx down to 0. Each folio is either
 * sorted to another list by sort_folio(), isolated onto @list, or skipped.
 * At most MAX_LRU_BATCH folios are examined in total, and scanning stops
 * once MIN_LRU_BATCH pages have been isolated.
 *
 * Returns the number of pages scanned if anything was isolated or the
 * MAX_LRU_BATCH budget was used up, and 0 otherwise. Also returns 0 without
 * scanning if @type has only MIN_NR_GENS generations.
 */
static int scan_folios(struct lruvec *lruvec, struct scan_control *sc,
                       int type, int tier, struct list_head *list)
{
        int gen, zone;
        enum vm_event_item item;
        int sorted = 0;
        int scanned = 0;
        int isolated = 0;
        /* counts folios, whereas sorted/scanned/isolated/skipped count pages */
        int remaining = MAX_LRU_BATCH;
        struct lru_gen_struct *lrugen = &lruvec->lrugen;
        struct mem_cgroup *memcg = lruvec_memcg(lruvec);

        VM_WARN_ON_ONCE(!list_empty(list));

        /* the oldest generation can't be evicted until the aging adds a new one */
        if (get_nr_gens(lruvec, type) == MIN_NR_GENS)
                return 0;

        gen = lru_gen_from_seq(lrugen->min_seq[type]);

        for (zone = sc->reclaim_idx; zone >= 0; zone--) {
                /* folios that were neither sorted nor isolated are parked here */
                LIST_HEAD(moved);
                int skipped = 0;
                struct list_head *head = &lrugen->lists[gen][type][zone];

                while (!list_empty(head)) {
                        struct folio *folio = lru_to_folio(head);
                        int delta = folio_nr_pages(folio);

                        VM_WARN_ON_ONCE_FOLIO(folio_test_unevictable(folio), folio);
                        VM_WARN_ON_ONCE_FOLIO(folio_test_active(folio), folio);
                        VM_WARN_ON_ONCE_FOLIO(folio_is_file_lru(folio) != type, folio);
                        VM_WARN_ON_ONCE_FOLIO(folio_zonenum(folio) != zone, folio);

                        scanned += delta;

                        if (sort_folio(lruvec, folio, tier))
                                sorted += delta;
                        else if (isolate_folio(lruvec, folio, sc)) {
                                list_add(&folio->lru, list);
                                isolated += delta;
                        } else {
                                /* take it off head so the loop doesn't pick it again */
                                list_move(&folio->lru, &moved);
                                skipped += delta;
                        }

                        /* stop on an exhausted budget or a full batch of isolated/skipped pages */
                        if (!--remaining || max(isolated, skipped) >= MIN_LRU_BATCH)
                                break;
                }

                /* put the skipped folios back on the list they came from */
                if (skipped) {
                        list_splice(&moved, head);
                        __count_zid_vm_events(PGSCAN_SKIP, zone, skipped);
                }

                if (!remaining || isolated >= MIN_LRU_BATCH)
                        break;
        }

        item = current_is_kswapd() ? PGSCAN_KSWAPD : PGSCAN_DIRECT;
        /* these two system-wide counters are skipped for cgroup reclaim */
        if (!cgroup_reclaim(sc)) {
                __count_vm_events(item, isolated);
                __count_vm_events(PGREFILL, sorted);
        }
        __count_memcg_events(memcg, item, isolated);
        __count_memcg_events(memcg, PGREFILL, sorted);
        __count_vm_events(PGSCAN_ANON + type, isolated);

        /*
         * There might not be eligible pages due to reclaim_idx, may_unmap and
         * may_writepage. Check the remaining to prevent livelock if it's not
         * making progress.
         */
        return isolated || !remaining ? scanned : 0;
}
3802
3803 static int get_tier_idx(struct lruvec *lruvec, int type)
3804 {
3805         int tier;
3806         struct ctrl_pos sp, pv;
3807
3808         /*
3809          * To leave a margin for fluctuations, use a larger gain factor (1:2).
3810          * This value is chosen because any other tier would have at least twice
3811          * as many refaults as the first tier.
3812          */
3813         read_ctrl_pos(lruvec, type, 0, 1, &sp);
3814         for (tier = 1; tier < MAX_NR_TIERS; tier++) {
3815                 read_ctrl_pos(lruvec, type, tier, 2, &pv);
3816                 if (!positive_ctrl_err(&sp, &pv))
3817                         break;
3818         }
3819
3820         return tier - 1;
3821 }
3822
3823 static int get_type_to_scan(struct lruvec *lruvec, int swappiness, int *tier_idx)
3824 {
3825         int type, tier;
3826         struct ctrl_pos sp, pv;
3827         int gain[ANON_AND_FILE] = { swappiness, 200 - swappiness };
3828
3829         /*
3830          * Compare the first tier of anon with that of file to determine which
3831          * type to scan. Also need to compare other tiers of the selected type
3832          * with the first tier of the other type to determine the last tier (of
3833          * the selected type) to evict.
3834          */
3835         read_ctrl_pos(lruvec, LRU_GEN_ANON, 0, gain[LRU_GEN_ANON], &sp);
3836         read_ctrl_pos(lruvec, LRU_GEN_FILE, 0, gain[LRU_GEN_FILE], &pv);
3837         type = positive_ctrl_err(&sp, &pv);
3838
3839         read_ctrl_pos(lruvec, !type, 0, gain[!type], &sp);
3840         for (tier = 1; tier < MAX_NR_TIERS; tier++) {
3841                 read_ctrl_pos(lruvec, type, tier, gain[type], &pv);
3842                 if (!positive_ctrl_err(&sp, &pv))
3843                         break;
3844         }
3845
3846         *tier_idx = tier - 1;
3847
3848         return type;
3849 }
3850
3851 static int isolate_folios(struct lruvec *lruvec, struct scan_control *sc, int swappiness,
3852                           int *type_scanned, struct list_head *list)
3853 {
3854         int i;
3855         int type;
3856         int scanned;
3857         int tier = -1;
3858         DEFINE_MIN_SEQ(lruvec);
3859
3860         /*
3861          * Try to make the obvious choice first. When anon and file are both
3862          * available from the same generation, interpret swappiness 1 as file
3863          * first and 200 as anon first.
3864          */
3865         if (!swappiness)
3866                 type = LRU_GEN_FILE;
3867         else if (min_seq[LRU_GEN_ANON] < min_seq[LRU_GEN_FILE])
3868                 type = LRU_GEN_ANON;
3869         else if (swappiness == 1)
3870                 type = LRU_GEN_FILE;
3871         else if (swappiness == 200)
3872                 type = LRU_GEN_ANON;
3873         else
3874                 type = get_type_to_scan(lruvec, swappiness, &tier);
3875
3876         for (i = !swappiness; i < ANON_AND_FILE; i++) {
3877                 if (tier < 0)
3878                         tier = get_tier_idx(lruvec, type);
3879
3880                 scanned = scan_folios(lruvec, sc, type, tier, list);
3881                 if (scanned)
3882                         break;
3883
3884                 type = !type;
3885                 tier = -1;
3886         }
3887
3888         *type_scanned = type;
3889
3890         return scanned;
3891 }
3892
/*
 * Isolate one batch of folios from @lruvec, run them through
 * shrink_page_list() and hand whatever is left on the list back to
 * move_pages_to_lru().
 *
 * Returns the sum of what isolate_folios() and try_to_inc_min_seq() reported,
 * or 0 when get_nr_gens() equals MIN_NR_GENS afterwards. The number of pages
 * reclaimed is added to sc->nr_reclaimed and to the PGSTEAL_* counters.
 */
static int evict_folios(struct lruvec *lruvec, struct scan_control *sc, int swappiness)
{
        int type;
        int scanned;
        int reclaimed;
        LIST_HEAD(list);
        struct folio *folio;
        enum vm_event_item item;
        struct reclaim_stat stat;
        struct mem_cgroup *memcg = lruvec_memcg(lruvec);
        struct pglist_data *pgdat = lruvec_pgdat(lruvec);

        /* isolation and min_seq advancement both happen under lru_lock */
        spin_lock_irq(&lruvec->lru_lock);

        scanned = isolate_folios(lruvec, sc, swappiness, &type, &list);

        scanned += try_to_inc_min_seq(lruvec, swappiness);

        /* report no progress once only MIN_NR_GENS generations remain */
        if (get_nr_gens(lruvec, !swappiness) == MIN_NR_GENS)
                scanned = 0;

        spin_unlock_irq(&lruvec->lru_lock);

        if (list_empty(&list))
                return scanned;

        /* called with lru_lock dropped */
        reclaimed = shrink_page_list(&list, pgdat, sc, &stat, false);

        /* fix up the flags of the folios still on the list */
        list_for_each_entry(folio, &list, lru) {
                /* restore LRU_REFS_FLAGS cleared by isolate_folio() */
                if (folio_test_workingset(folio))
                        folio_set_referenced(folio);

                /* don't add rejected pages to the oldest generation */
                if (folio_test_reclaim(folio) &&
                    (folio_test_dirty(folio) || folio_test_writeback(folio)))
                        folio_clear_active(folio);
                else
                        folio_set_active(folio);
        }

        spin_lock_irq(&lruvec->lru_lock);

        move_pages_to_lru(lruvec, &list);

        /* global counters are skipped for cgroup reclaim; memcg ones are not */
        item = current_is_kswapd() ? PGSTEAL_KSWAPD : PGSTEAL_DIRECT;
        if (!cgroup_reclaim(sc))
                __count_vm_events(item, reclaimed);
        __count_memcg_events(memcg, item, reclaimed);
        __count_vm_events(PGSTEAL_ANON + type, reclaimed);

        spin_unlock_irq(&lruvec->lru_lock);

        /* whatever move_pages_to_lru() left on the list is uncharged and freed */
        mem_cgroup_uncharge_list(&list);
        free_unref_page_list(&list);

        sc->nr_reclaimed += reclaimed;

        return scanned;
}
3953
3954 static unsigned long get_nr_to_scan(struct lruvec *lruvec, struct scan_control *sc,
3955                                     bool can_swap)
3956 {
3957         bool need_aging;
3958         unsigned long nr_to_scan;
3959         struct mem_cgroup *memcg = lruvec_memcg(lruvec);
3960         DEFINE_MAX_SEQ(lruvec);
3961         DEFINE_MIN_SEQ(lruvec);
3962
3963         if (mem_cgroup_below_min(memcg) ||
3964             (mem_cgroup_below_low(memcg) && !sc->memcg_low_reclaim))
3965                 return 0;
3966
3967         need_aging = should_run_aging(lruvec, max_seq, min_seq, sc, can_swap, &nr_to_scan);
3968         if (!need_aging)
3969                 return nr_to_scan;
3970
3971         /* skip the aging path at the default priority */
3972         if (sc->priority == DEF_PRIORITY)
3973                 goto done;
3974
3975         /* leave the work to lru_gen_age_node() */
3976         if (current_is_kswapd())
3977                 return 0;
3978
3979         inc_max_seq(lruvec, max_seq, can_swap);
3980 done:
3981         return min_seq[!can_swap] + MIN_NR_GENS <= max_seq ? nr_to_scan : 0;
3982 }
3983
3984 static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
3985 {
3986         struct blk_plug plug;
3987         unsigned long scanned = 0;
3988
3989         lru_add_drain();
3990
3991         blk_start_plug(&plug);
3992
3993         while (true) {
3994                 int delta;
3995                 int swappiness;
3996                 unsigned long nr_to_scan;
3997
3998                 if (sc->may_swap)
3999                         swappiness = get_swappiness(lruvec, sc);
4000                 else if (!cgroup_reclaim(sc) && get_swappiness(lruvec, sc))
4001                         swappiness = 1;
4002                 else
4003                         swappiness = 0;
4004
4005                 nr_to_scan = get_nr_to_scan(lruvec, sc, swappiness);
4006                 if (!nr_to_scan)
4007                         break;
4008
4009                 delta = evict_folios(lruvec, sc, swappiness);
4010                 if (!delta)
4011                         break;
4012
4013                 scanned += delta;
4014                 if (scanned >= nr_to_scan)
4015                         break;
4016
4017                 cond_resched();
4018         }
4019
4020         blk_finish_plug(&plug);
4021 }
4022
4023 /******************************************************************************
4024  *                          initialization
4025  ******************************************************************************/
4026
4027 void lru_gen_init_lruvec(struct lruvec *lruvec)
4028 {
4029         int gen, type, zone;
4030         struct lru_gen_struct *lrugen = &lruvec->lrugen;
4031
4032         lrugen->max_seq = MIN_NR_GENS + 1;
4033
4034         for_each_gen_type_zone(gen, type, zone)
4035                 INIT_LIST_HEAD(&lrugen->lists[gen][type][zone]);
4036 }
4037
4038 #ifdef CONFIG_MEMCG
/* Per-memcg multi-gen LRU setup hook; intentionally does nothing here. */
void lru_gen_init_memcg(struct mem_cgroup *memcg)
{
}
4042
4043 void lru_gen_exit_memcg(struct mem_cgroup *memcg)
4044 {
4045         int nid;
4046
4047         for_each_node(nid) {
4048                 struct lruvec *lruvec = get_lruvec(memcg, nid);
4049
4050                 VM_WARN_ON_ONCE(memchr_inv(lruvec->lrugen.nr_pages, 0,
4051                                            sizeof(lruvec->lrugen.nr_pages)));
4052         }
4053 }
4054 #endif
4055
4056 static int __init init_lru_gen(void)
4057 {
4058         BUILD_BUG_ON(MIN_NR_GENS + 1 >= MAX_NR_GENS);
4059         BUILD_BUG_ON(BIT(LRU_GEN_WIDTH) <= MAX_NR_GENS);
4060
4061         return 0;
4062 };
4063 late_initcall(init_lru_gen);
4064
4065 #else /* !CONFIG_LRU_GEN */
4066
/* No-op stand-in for lru_gen_age_node() when CONFIG_LRU_GEN is not set. */
static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc)
{
}
4070
/*
 * No-op stand-in for lru_gen_shrink_lruvec() when CONFIG_LRU_GEN is not set.
 * It exists so that the call in shrink_lruvec() still builds.
 */
static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
{
}
4074
4075 #endif /* CONFIG_LRU_GEN */
4076
/*
 * Reclaim pages from a single lruvec.
 *
 * When lru_gen_enabled() is true the work is handed to
 * lru_gen_shrink_lruvec(). Otherwise the per-list scan targets come from
 * get_scan_count() and the evictable lists are shrunk in batches of at most
 * SWAP_CLUSTER_MAX pages. Once nr_to_reclaim pages have been reclaimed, the
 * remaining targets are adjusted once so that anon and file end up scanned
 * in proportion to the original targets. The number of pages reclaimed here
 * is added to sc->nr_reclaimed.
 */
static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
{
        unsigned long nr[NR_LRU_LISTS];
        unsigned long targets[NR_LRU_LISTS];
        unsigned long nr_to_scan;
        enum lru_list lru;
        unsigned long nr_reclaimed = 0;
        unsigned long nr_to_reclaim = sc->nr_to_reclaim;
        struct blk_plug plug;
        bool scan_adjusted;

        if (lru_gen_enabled()) {
                lru_gen_shrink_lruvec(lruvec, sc);
                return;
        }

        get_scan_count(lruvec, sc, nr);

        /* Record the original scan target for proportional adjustments later */
        memcpy(targets, nr, sizeof(nr));

        /*
         * Global reclaiming within direct reclaim at DEF_PRIORITY is a normal
         * event that can occur when there is little memory pressure e.g.
         * multiple streaming readers/writers. Hence, we do not abort scanning
         * when the requested number of pages are reclaimed when scanning at
         * DEF_PRIORITY on the assumption that the fact we are direct
         * reclaiming implies that kswapd is not keeping up and it is best to
         * do a batch of work at once. For memcg reclaim one check is made to
         * abort proportional reclaim if either the file or anon lru has already
         * dropped to zero at the first pass.
         */
        scan_adjusted = (!cgroup_reclaim(sc) && !current_is_kswapd() &&
                         sc->priority == DEF_PRIORITY);

        blk_start_plug(&plug);
        /*
         * Note that nr[LRU_ACTIVE_ANON] is not part of the loop condition:
         * a remaining active-anon target alone does not keep the loop going.
         */
        while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] ||
                                        nr[LRU_INACTIVE_FILE]) {
                unsigned long nr_anon, nr_file, percentage;
                unsigned long nr_scanned;

                /* take one batch of up to SWAP_CLUSTER_MAX from each list */
                for_each_evictable_lru(lru) {
                        if (nr[lru]) {
                                nr_to_scan = min(nr[lru], SWAP_CLUSTER_MAX);
                                nr[lru] -= nr_to_scan;

                                nr_reclaimed += shrink_list(lru, nr_to_scan,
                                                            lruvec, sc);
                        }
                }

                cond_resched();

                /* keep going until the goal is met; adjust at most once */
                if (nr_reclaimed < nr_to_reclaim || scan_adjusted)
                        continue;

                /*
                 * For kswapd and memcg, reclaim at least the number of pages
                 * requested. Ensure that the anon and file LRUs are scanned
                 * proportionally what was requested by get_scan_count(). We
                 * stop reclaiming one LRU and reduce the amount scanning
                 * proportional to the original scan target.
                 */
                nr_file = nr[LRU_INACTIVE_FILE] + nr[LRU_ACTIVE_FILE];
                nr_anon = nr[LRU_INACTIVE_ANON] + nr[LRU_ACTIVE_ANON];

                /*
                 * It's just vindictive to attack the larger once the smaller
                 * has gone to zero.  And given the way we stop scanning the
                 * smaller below, this makes sure that we only make one nudge
                 * towards proportionality once we've got nr_to_reclaim.
                 */
                if (!nr_file || !nr_anon)
                        break;

                /*
                 * percentage is the share of the smaller type's original
                 * target that is still unscanned; the "+ 1" keeps the divisor
                 * non-zero.
                 */
                if (nr_file > nr_anon) {
                        unsigned long scan_target = targets[LRU_INACTIVE_ANON] +
                                                targets[LRU_ACTIVE_ANON] + 1;
                        lru = LRU_BASE;
                        percentage = nr_anon * 100 / scan_target;
                } else {
                        unsigned long scan_target = targets[LRU_INACTIVE_FILE] +
                                                targets[LRU_ACTIVE_FILE] + 1;
                        lru = LRU_FILE;
                        percentage = nr_file * 100 / scan_target;
                }

                /* Stop scanning the smaller of the LRU */
                nr[lru] = 0;
                nr[lru + LRU_ACTIVE] = 0;

                /*
                 * Recalculate the other LRU scan count based on its original
                 * scan target and the percentage scanning already complete
                 */
                lru = (lru == LRU_FILE) ? LRU_BASE : LRU_FILE;
                nr_scanned = targets[lru] - nr[lru];
                nr[lru] = targets[lru] * (100 - percentage) / 100;
                /* min() clamps the subtraction so nr[lru] cannot underflow */
                nr[lru] -= min(nr[lru], nr_scanned);

                /* same recalculation for the active list of that type */
                lru += LRU_ACTIVE;
                nr_scanned = targets[lru] - nr[lru];
                nr[lru] = targets[lru] * (100 - percentage) / 100;
                nr[lru] -= min(nr[lru], nr_scanned);

                scan_adjusted = true;
        }
        blk_finish_plug(&plug);
        sc->nr_reclaimed += nr_reclaimed;

        /*
         * Even if we did not try to evict anon pages at all, we want to
         * rebalance the anon lru active/inactive ratio.
         */
        if (can_age_anon_pages(lruvec_pgdat(lruvec), sc) &&
            inactive_is_low(lruvec, LRU_INACTIVE_ANON))
                shrink_active_list(SWAP_CLUSTER_MAX, lruvec,
                                   sc, LRU_ACTIVE_ANON);
}
4196
4197 /* Use reclaim/compaction for costly allocs or under memory pressure */
4198 static bool in_reclaim_compaction(struct scan_control *sc)
4199 {
4200         if (IS_ENABLED(CONFIG_COMPACTION) && sc->order &&
4201                         (sc->order > PAGE_ALLOC_COSTLY_ORDER ||
4202                          sc->priority < DEF_PRIORITY - 2))
4203                 return true;
4204
4205         return false;
4206 }
4207
4208 /*
4209  * Reclaim/compaction is used for high-order allocation requests. It reclaims
4210  * order-0 pages before compacting the zone. should_continue_reclaim() returns
4211  * true if more pages should be reclaimed such that when the page allocator
4212  * calls try_to_compact_pages() that it will have enough free pages to succeed.
4213  * It will give up earlier than that if there is difficulty reclaiming pages.
4214  */
4215 static inline bool should_continue_reclaim(struct pglist_data *pgdat,
4216                                         unsigned long nr_reclaimed,
4217                                         struct scan_control *sc)
4218 {
4219         unsigned long pages_for_compaction;
4220         unsigned long inactive_lru_pages;
4221         int z;
4222
4223         /* If not in reclaim/compaction mode, stop */
4224         if (!in_reclaim_compaction(sc))
4225                 return false;
4226
4227         /*
4228          * Stop if we failed to reclaim any pages from the last SWAP_CLUSTER_MAX
4229          * number of pages that were scanned. This will return to the caller
4230          * with the risk reclaim/compaction and the resulting allocation attempt
4231          * fails. In the past we have tried harder for __GFP_RETRY_MAYFAIL
4232          * allocations through requiring that the full LRU list has been scanned
4233          * first, by assuming that zero delta of sc->nr_scanned means full LRU
4234          * scan, but that approximation was wrong, and there were corner cases
4235          * where always a non-zero amount of pages were scanned.
4236          */
4237         if (!nr_reclaimed)
4238                 return false;
4239
4240         /* If compaction would go ahead or the allocation would succeed, stop */
4241         for (z = 0; z <= sc->reclaim_idx; z++) {
4242                 struct zone *zone = &pgdat->node_zones[z];
4243                 if (!managed_zone(zone))
4244                         continue;
4245
4246                 switch (compaction_suitable(zone, sc->order, 0, sc->reclaim_idx)) {
4247                 case COMPACT_SUCCESS:
4248                 case COMPACT_CONTINUE:
4249                         return false;
4250                 default:
4251                         /* check next zone */
4252                         ;
4253                 }
4254         }
4255
4256         /*
4257          * If we have not reclaimed enough pages for compaction and the
4258          * inactive lists are large enough, continue reclaiming
4259          */
4260         pages_for_compaction = compact_gap(sc->order);
4261         inactive_lru_pages = node_page_state(pgdat, NR_INACTIVE_FILE);
4262         if (can_reclaim_anon_pages(NULL, pgdat->node_id, sc))
4263                 inactive_lru_pages += node_page_state(pgdat, NR_INACTIVE_ANON);
4264
4265         return inactive_lru_pages > pages_for_compaction;
4266 }
4267
4268 static void shrink_node_memcgs(pg_data_t *pgdat, struct scan_control *sc)
4269 {
4270         struct mem_cgroup *target_memcg = sc->target_mem_cgroup;
4271         struct mem_cgroup *memcg;
4272
4273         memcg = mem_cgroup_iter(target_memcg, NULL, NULL);
4274         do {
4275                 struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat);
4276                 unsigned long reclaimed;
4277                 unsigned long scanned;
4278
4279                 /*
4280                  * This loop can become CPU-bound when target memcgs
4281                  * aren't eligible for reclaim - either because they
4282                  * don't have any reclaimable pages, or because their
4283                  * memory is explicitly protected. Avoid soft lockups.
4284                  */
4285                 cond_resched();
4286
4287                 mem_cgroup_calculate_protection(target_memcg, memcg);
4288
4289                 if (mem_cgroup_below_min(memcg)) {
4290                         /*
4291                          * Hard protection.
4292                          * If there is no reclaimable memory, OOM.
4293                          */
4294                         continue;
4295                 } else if (mem_cgroup_below_low(memcg)) {
4296                         /*
4297                          * Soft protection.
4298                          * Respect the protection only as long as
4299                          * there is an unprotected supply
4300                          * of reclaimable memory from other cgroups.
4301                          */
4302                         if (!sc->memcg_low_reclaim) {
4303                                 sc->memcg_low_skipped = 1;
4304                                 continue;
4305                         }
4306                         memcg_memory_event(memcg, MEMCG_LOW);
4307                 }
4308
4309                 reclaimed = sc->nr_reclaimed;
4310                 scanned = sc->nr_scanned;
4311
4312                 shrink_lruvec(lruvec, sc);
4313
4314                 shrink_slab(sc->gfp_mask, pgdat->node_id, memcg,
4315                             sc->priority);
4316
4317                 /* Record the group's reclaim efficiency */
4318                 if (!sc->proactive)
4319                         vmpressure(sc->gfp_mask, memcg, false,
4320                                    sc->nr_scanned - scanned,
4321                                    sc->nr_reclaimed - reclaimed);
4322
4323         } while ((memcg = mem_cgroup_iter(target_memcg, memcg, NULL)));
4324 }
4325
/*
 * Reclaim from one node on behalf of sc->target_mem_cgroup.
 *
 * Each pass shrinks all memcgs on the node, folds in pages freed by slab
 * reclaim, reports vmpressure, updates the node/lruvec writeback, dirty and
 * congestion flags, and throttles where needed. Passes repeat for as long as
 * should_continue_reclaim() asks for more. If any pass reclaimed something,
 * pgdat->kswapd_failures is reset.
 */
static void shrink_node(pg_data_t *pgdat, struct scan_control *sc)
{
        struct reclaim_state *reclaim_state = current->reclaim_state;
        unsigned long nr_reclaimed, nr_scanned;
        struct lruvec *target_lruvec;
        bool reclaimable = false;

        target_lruvec = mem_cgroup_lruvec(sc->target_mem_cgroup, pgdat);

again:
        /* per-pass statistics start from zero */
        memset(&sc->nr, 0, sizeof(sc->nr));

        /* snapshots used to compute this pass's deltas */
        nr_reclaimed = sc->nr_reclaimed;
        nr_scanned = sc->nr_scanned;

        prepare_scan_count(pgdat, sc);

        shrink_node_memcgs(pgdat, sc);

        /* account slab pages freed in this pass, then reset the counter */
        if (reclaim_state) {
                sc->nr_reclaimed += reclaim_state->reclaimed_slab;
                reclaim_state->reclaimed_slab = 0;
        }

        /* Record the subtree's reclaim efficiency */
        if (!sc->proactive)
                vmpressure(sc->gfp_mask, sc->target_mem_cgroup, true,
                           sc->nr_scanned - nr_scanned,
                           sc->nr_reclaimed - nr_reclaimed);

        /* sticky across passes: set once any pass reclaims something */
        if (sc->nr_reclaimed - nr_reclaimed)
                reclaimable = true;

        if (current_is_kswapd()) {
                /*
                 * If reclaim is isolating dirty pages under writeback,
                 * it implies that the long-lived page allocation rate
                 * is exceeding the page laundering rate. Either the
                 * global limits are not being effective at throttling
                 * processes due to the page distribution throughout
                 * zones or there is heavy usage of a slow backing
                 * device. The only option is to throttle from reclaim
                 * context which is not ideal as there is no guarantee
                 * the dirtying process is throttled in the same way
                 * balance_dirty_pages() manages.
                 *
                 * Once a node is flagged PGDAT_WRITEBACK, kswapd will
                 * count the number of pages under writeback that are
                 * flagged for immediate reclaim and stall if any are
                 * encountered in the nr_immediate check below.
                 */
                if (sc->nr.writeback && sc->nr.writeback == sc->nr.taken)
                        set_bit(PGDAT_WRITEBACK, &pgdat->flags);

                /* Allow kswapd to start writing pages during reclaim. */
                if (sc->nr.unqueued_dirty == sc->nr.file_taken)
                        set_bit(PGDAT_DIRTY, &pgdat->flags);

                /*
                 * If kswapd scans pages marked for immediate
                 * reclaim and under writeback (nr_immediate), it
                 * implies that pages are cycling through the LRU
                 * faster than they are written so forcibly stall
                 * until some pages complete writeback.
                 */
                if (sc->nr.immediate)
                        reclaim_throttle(pgdat, VMSCAN_THROTTLE_WRITEBACK);
        }

        /*
         * Tag a node/memcg as congested if all the dirty pages were marked
         * for writeback and immediate reclaim (counted in nr.congested).
         *
         * Legacy memcg will stall in page writeback so avoid forcibly
         * stalling in reclaim_throttle().
         */
        if ((current_is_kswapd() ||
             (cgroup_reclaim(sc) && writeback_throttling_sane(sc))) &&
            sc->nr.dirty && sc->nr.dirty == sc->nr.congested)
                set_bit(LRUVEC_CONGESTED, &target_lruvec->flags);

        /*
         * Stall direct reclaim for IO completions if the lruvec is
         * congested. Allow kswapd to continue until it
         * starts encountering unqueued dirty pages or cycling through
         * the LRU too quickly.
         */
        if (!current_is_kswapd() && current_may_throttle() &&
            !sc->hibernation_mode &&
            test_bit(LRUVEC_CONGESTED, &target_lruvec->flags))
                reclaim_throttle(pgdat, VMSCAN_THROTTLE_CONGESTED);

        /* another pass is decided on this pass's reclaim delta only */
        if (should_continue_reclaim(pgdat, sc->nr_reclaimed - nr_reclaimed,
                                    sc))
                goto again;

        /*
         * Kswapd gives up on balancing particular nodes after too
         * many failures to reclaim anything from them and goes to
         * sleep. On reclaim progress, reset the failure counter. A
         * successful direct reclaim run will revive a dormant kswapd.
         */
        if (reclaimable)
                pgdat->kswapd_failures = 0;
}
4431
4432 /*
4433  * Returns true if compaction should go ahead for a costly-order request, or
4434  * the allocation would already succeed without compaction. Return false if we
4435  * should reclaim first.
4436  */
4437 static inline bool compaction_ready(struct zone *zone, struct scan_control *sc)
4438 {
4439         unsigned long watermark;
4440         enum compact_result suitable;
4441
4442         suitable = compaction_suitable(zone, sc->order, 0, sc->reclaim_idx);
4443         if (suitable == COMPACT_SUCCESS)
4444                 /* Allocation should succeed already. Don't reclaim. */
4445                 return true;
4446         if (suitable == COMPACT_SKIPPED)
4447                 /* Compaction cannot yet proceed. Do reclaim. */
4448                 return false;
4449
4450         /*
4451          * Compaction is already possible, but it takes time to run and there
4452          * are potentially other callers using the pages just freed. So proceed
4453          * with reclaim to make a buffer of free pages available to give
4454          * compaction a reasonable chance of completing and allocating the page.
4455          * Note that we won't actually reclaim the whole buffer in one attempt
4456          * as the target watermark in should_continue_reclaim() is lower. But if
4457          * we are already above the high+gap watermark, don't reclaim at all.
4458          */
4459         watermark = high_wmark_pages(zone) + compact_gap(sc->order);
4460
4461         return zone_watermark_ok_safe(zone, 0, watermark, sc->reclaim_idx);
4462 }
4463
4464 static void consider_reclaim_throttle(pg_data_t *pgdat, struct scan_control *sc)
4465 {
4466         /*
4467          * If reclaim is making progress greater than 12% efficiency then
4468          * wake all the NOPROGRESS throttled tasks.
4469          */
4470         if (sc->nr_reclaimed > (sc->nr_scanned >> 3)) {
4471                 wait_queue_head_t *wqh;
4472
4473                 wqh = &pgdat->reclaim_wait[VMSCAN_THROTTLE_NOPROGRESS];
4474                 if (waitqueue_active(wqh))
4475                         wake_up(wqh);
4476
4477                 return;
4478         }
4479
4480         /*
4481          * Do not throttle kswapd or cgroup reclaim on NOPROGRESS as it will
4482          * throttle on VMSCAN_THROTTLE_WRITEBACK if there are too many pages
4483          * under writeback and marked for immediate reclaim at the tail of the
4484          * LRU.
4485          */
4486         if (current_is_kswapd() || cgroup_reclaim(sc))
4487                 return;
4488
4489         /* Throttle if making no progress at high prioities. */
4490         if (sc->priority == 1 && !sc->nr_reclaimed)
4491                 reclaim_throttle(pgdat, VMSCAN_THROTTLE_NOPROGRESS);
4492 }
4493
/*
 * This is the direct reclaim path, for page-allocating processes.  We only
 * try to reclaim pages from zones which will satisfy the caller's allocation
 * request.
 *
 * If a zone is deemed to be full of pinned pages then just give it a light
 * scan then give up on it.
 *
 * Walks @zonelist up to sc->reclaim_idx, restricted to sc->nodemask, and
 * calls shrink_node() for each node whose pgdat differs from the previously
 * shrunk one. sc->gfp_mask may be widened temporarily and is restored before
 * returning; sc->reclaim_idx is not restored here.
 */
static void shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
{
        struct zoneref *z;
        struct zone *zone;
        unsigned long nr_soft_reclaimed;
        unsigned long nr_soft_scanned;
        gfp_t orig_mask;
        pg_data_t *last_pgdat = NULL;
        pg_data_t *first_pgdat = NULL;

        /*
         * If the number of buffer_heads in the machine exceeds the maximum
         * allowed level, force direct reclaim to scan the highmem zone as
         * highmem pages could be pinning lowmem pages storing buffer_heads
         */
        orig_mask = sc->gfp_mask;
        if (buffer_heads_over_limit) {
                sc->gfp_mask |= __GFP_HIGHMEM;
                sc->reclaim_idx = gfp_zone(sc->gfp_mask);
        }

        for_each_zone_zonelist_nodemask(zone, z, zonelist,
                                        sc->reclaim_idx, sc->nodemask) {
                /*
                 * Take care memory controller reclaiming has small influence
                 * to global LRU.
                 */
                if (!cgroup_reclaim(sc)) {
                        if (!cpuset_zone_allowed(zone,
                                                 GFP_KERNEL | __GFP_HARDWALL))
                                continue;

                        /*
                         * If we already have plenty of memory free for
                         * compaction in this zone, don't free any more.
                         * Even though compaction is invoked for any
                         * non-zero order, only frequent costly order
                         * reclamation is disruptive enough to become a
                         * noticeable problem, like transparent huge
                         * page allocations.
                         */
                        if (IS_ENABLED(CONFIG_COMPACTION) &&
                            sc->order > PAGE_ALLOC_COSTLY_ORDER &&
                            compaction_ready(zone, sc)) {
                                sc->compaction_ready = true;
                                continue;
                        }

                        /*
                         * Shrink each node in the zonelist once. If the
                         * zonelist is ordered by zone (not the default) then a
                         * node may be shrunk multiple times but in that case
                         * the user prefers lower zones being preserved.
                         */
                        if (zone->zone_pgdat == last_pgdat)
                                continue;

                        /*
                         * This steals pages from memory cgroups over softlimit
                         * and returns the number of reclaimed pages and
                         * scanned pages. This works for global memory pressure
                         * and balancing, not for a memcg's limit.
                         */
                        nr_soft_scanned = 0;
                        nr_soft_reclaimed = mem_cgroup_soft_limit_reclaim(zone->zone_pgdat,
                                                sc->order, sc->gfp_mask,
                                                &nr_soft_scanned);
                        sc->nr_reclaimed += nr_soft_reclaimed;
                        sc->nr_scanned += nr_soft_scanned;
                        /* need some check for avoid more shrink_zone() */
                }

                /* remember the first node reached, for the throttling below */
                if (!first_pgdat)
                        first_pgdat = zone->zone_pgdat;

                /* See comment about same check for global reclaim above */
                if (zone->zone_pgdat == last_pgdat)
                        continue;
                last_pgdat = zone->zone_pgdat;
                shrink_node(zone->zone_pgdat, sc);
        }

        /* throttling is considered once, against the first node reached */
        if (first_pgdat)
                consider_reclaim_throttle(first_pgdat, sc);

        /*
         * Restore to original mask to avoid the impact on the caller if we
         * promoted it to __GFP_HIGHMEM.
         */
        sc->gfp_mask = orig_mask;
}
4593
4594 static void snapshot_refaults(struct mem_cgroup *target_memcg, pg_data_t *pgdat)
4595 {
4596         struct lruvec *target_lruvec;
4597         unsigned long refaults;
4598
4599         if (lru_gen_enabled())
4600                 return;
4601
4602         target_lruvec = mem_cgroup_lruvec(target_memcg, pgdat);
4603         refaults = lruvec_page_state(target_lruvec, WORKINGSET_ACTIVATE_ANON);
4604         target_lruvec->refaults[WORKINGSET_ANON] = refaults;
4605         refaults = lruvec_page_state(target_lruvec, WORKINGSET_ACTIVATE_FILE);
4606         target_lruvec->refaults[WORKINGSET_FILE] = refaults;
4607 }
4608
4609 /*
4610  * This is the main entry point to direct page reclaim.
4611  *
4612  * If a full scan of the inactive list fails to free enough memory then we
4613  * are "out of memory" and something needs to be killed.
4614  *
4615  * If the caller is !__GFP_FS then the probability of a failure is reasonably
4616  * high - the zone may be full of dirty or under-writeback pages, which this
4617  * caller can't do much about.  We kick the writeback threads and take explicit
4618  * naps in the hope that some of these pages can be written.  But if the
4619  * allocating task holds filesystem locks which prevent writeout this might not
4620  * work, and the allocation attempt will fail.
4621  *
4622  * returns:     0, if no pages reclaimed
4623  *              else, the number of pages reclaimed
4624  */
4625 static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
4626                                           struct scan_control *sc)
4627 {
4628         int initial_priority = sc->priority;
4629         pg_data_t *last_pgdat;
4630         struct zoneref *z;
4631         struct zone *zone;
4632 retry:
4633         delayacct_freepages_start();
4634
4635         if (!cgroup_reclaim(sc))
4636                 __count_zid_vm_events(ALLOCSTALL, sc->reclaim_idx, 1);
4637
4638         do {
4639                 if (!sc->proactive)
4640                         vmpressure_prio(sc->gfp_mask, sc->target_mem_cgroup,
4641                                         sc->priority);
4642                 sc->nr_scanned = 0;
4643                 shrink_zones(zonelist, sc);
4644
4645                 if (sc->nr_reclaimed >= sc->nr_to_reclaim)
4646                         break;
4647
4648                 if (sc->compaction_ready)
4649                         break;
4650
4651                 /*
4652                  * If we're getting trouble reclaiming, start doing
4653                  * writepage even in laptop mode.
4654                  */
4655                 if (sc->priority < DEF_PRIORITY - 2)
4656                         sc->may_writepage = 1;
4657         } while (--sc->priority >= 0);
4658
4659         last_pgdat = NULL;
4660         for_each_zone_zonelist_nodemask(zone, z, zonelist, sc->reclaim_idx,
4661                                         sc->nodemask) {
4662                 if (zone->zone_pgdat == last_pgdat)
4663                         continue;
4664                 last_pgdat = zone->zone_pgdat;
4665
4666                 snapshot_refaults(sc->target_mem_cgroup, zone->zone_pgdat);
4667
4668                 if (cgroup_reclaim(sc)) {
4669                         struct lruvec *lruvec;
4670
4671                         lruvec = mem_cgroup_lruvec(sc->target_mem_cgroup,
4672                                                    zone->zone_pgdat);
4673                         clear_bit(LRUVEC_CONGESTED, &lruvec->flags);
4674                 }
4675         }
4676
4677         delayacct_freepages_end();
4678
4679         if (sc->nr_reclaimed)
4680                 return sc->nr_reclaimed;
4681
4682         /* Aborted reclaim to try compaction? don't OOM, then */
4683         if (sc->compaction_ready)
4684                 return 1;
4685
4686         /*
4687          * We make inactive:active ratio decisions based on the node's
4688          * composition of memory, but a restrictive reclaim_idx or a
4689          * memory.low cgroup setting can exempt large amounts of
4690          * memory from reclaim. Neither of which are very common, so
4691          * instead of doing costly eligibility calculations of the
4692          * entire cgroup subtree up front, we assume the estimates are
4693          * good, and retry with forcible deactivation if that fails.
4694          */
4695         if (sc->skipped_deactivate) {
4696                 sc->priority = initial_priority;
4697                 sc->force_deactivate = 1;
4698                 sc->skipped_deactivate = 0;
4699                 goto retry;
4700         }
4701
4702         /* Untapped cgroup reserves?  Don't OOM, retry. */
4703         if (sc->memcg_low_skipped) {
4704                 sc->priority = initial_priority;
4705                 sc->force_deactivate = 0;
4706                 sc->memcg_low_reclaim = 1;
4707                 sc->memcg_low_skipped = 0;
4708                 goto retry;
4709         }
4710
4711         return 0;
4712 }
4713
4714 static bool allow_direct_reclaim(pg_data_t *pgdat)
4715 {
4716         struct zone *zone;
4717         unsigned long pfmemalloc_reserve = 0;
4718         unsigned long free_pages = 0;
4719         int i;
4720         bool wmark_ok;
4721
4722         if (pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES)
4723                 return true;
4724
4725         for (i = 0; i <= ZONE_NORMAL; i++) {
4726                 zone = &pgdat->node_zones[i];
4727                 if (!managed_zone(zone))
4728                         continue;
4729
4730                 if (!zone_reclaimable_pages(zone))
4731                         continue;
4732
4733                 pfmemalloc_reserve += min_wmark_pages(zone);
4734                 free_pages += zone_page_state(zone, NR_FREE_PAGES);
4735         }
4736
4737         /* If there are no reserves (unexpected config) then do not throttle */
4738         if (!pfmemalloc_reserve)
4739                 return true;
4740
4741         wmark_ok = free_pages > pfmemalloc_reserve / 2;
4742
4743         /* kswapd must be awake if processes are being throttled */
4744         if (!wmark_ok && waitqueue_active(&pgdat->kswapd_wait)) {
4745                 if (READ_ONCE(pgdat->kswapd_highest_zoneidx) > ZONE_NORMAL)
4746                         WRITE_ONCE(pgdat->kswapd_highest_zoneidx, ZONE_NORMAL);
4747
4748                 wake_up_interruptible(&pgdat->kswapd_wait);
4749         }
4750
4751         return wmark_ok;
4752 }
4753
4754 /*
4755  * Throttle direct reclaimers if backing storage is backed by the network
4756  * and the PFMEMALLOC reserve for the preferred node is getting dangerously
4757  * depleted. kswapd will continue to make progress and wake the processes
4758  * when the low watermark is reached.
4759  *
4760  * Returns true if a fatal signal was delivered during throttling. If this
4761  * happens, the page allocator should not consider triggering the OOM killer.
4762  */
4763 static bool throttle_direct_reclaim(gfp_t gfp_mask, struct zonelist *zonelist,
4764                                         nodemask_t *nodemask)
4765 {
4766         struct zoneref *z;
4767         struct zone *zone;
4768         pg_data_t *pgdat = NULL;
4769
4770         /*
4771          * Kernel threads should not be throttled as they may be indirectly
4772          * responsible for cleaning pages necessary for reclaim to make forward
4773          * progress. kjournald for example may enter direct reclaim while
4774          * committing a transaction where throttling it could forcing other
4775          * processes to block on log_wait_commit().
4776          */
4777         if (current->flags & PF_KTHREAD)
4778                 goto out;
4779
4780         /*
4781          * If a fatal signal is pending, this process should not throttle.
4782          * It should return quickly so it can exit and free its memory
4783          */
4784         if (fatal_signal_pending(current))
4785                 goto out;
4786
4787         /*
4788          * Check if the pfmemalloc reserves are ok by finding the first node
4789          * with a usable ZONE_NORMAL or lower zone. The expectation is that
4790          * GFP_KERNEL will be required for allocating network buffers when
4791          * swapping over the network so ZONE_HIGHMEM is unusable.
4792          *
4793          * Throttling is based on the first usable node and throttled processes
4794          * wait on a queue until kswapd makes progress and wakes them. There
4795          * is an affinity then between processes waking up and where reclaim
4796          * progress has been made assuming the process wakes on the same node.
4797          * More importantly, processes running on remote nodes will not compete
4798          * for remote pfmemalloc reserves and processes on different nodes
4799          * should make reasonable progress.
4800          */
4801         for_each_zone_zonelist_nodemask(zone, z, zonelist,
4802                                         gfp_zone(gfp_mask), nodemask) {
4803                 if (zone_idx(zone) > ZONE_NORMAL)
4804                         continue;
4805
4806                 /* Throttle based on the first usable node */
4807                 pgdat = zone->zone_pgdat;
4808                 if (allow_direct_reclaim(pgdat))
4809                         goto out;
4810                 break;
4811         }
4812
4813         /* If no zone was usable by the allocation flags then do not throttle */
4814         if (!pgdat)
4815                 goto out;
4816
4817         /* Account for the throttling */
4818         count_vm_event(PGSCAN_DIRECT_THROTTLE);
4819
4820         /*
4821          * If the caller cannot enter the filesystem, it's possible that it
4822          * is due to the caller holding an FS lock or performing a journal
4823          * transaction in the case of a filesystem like ext[3|4]. In this case,
4824          * it is not safe to block on pfmemalloc_wait as kswapd could be
4825          * blocked waiting on the same lock. Instead, throttle for up to a
4826          * second before continuing.
4827          */
4828         if (!(gfp_mask & __GFP_FS))
4829                 wait_event_interruptible_timeout(pgdat->pfmemalloc_wait,
4830                         allow_direct_reclaim(pgdat), HZ);
4831         else
4832                 /* Throttle until kswapd wakes the process */
4833                 wait_event_killable(zone->zone_pgdat->pfmemalloc_wait,
4834                         allow_direct_reclaim(pgdat));
4835
4836         if (fatal_signal_pending(current))
4837                 return true;
4838
4839 out:
4840         return false;
4841 }
4842
4843 unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
4844                                 gfp_t gfp_mask, nodemask_t *nodemask)
4845 {
4846         unsigned long nr_reclaimed;
4847         struct scan_control sc = {
4848                 .nr_to_reclaim = SWAP_CLUSTER_MAX,
4849                 .gfp_mask = current_gfp_context(gfp_mask),
4850                 .reclaim_idx = gfp_zone(gfp_mask),
4851                 .order = order,
4852                 .nodemask = nodemask,
4853                 .priority = DEF_PRIORITY,
4854                 .may_writepage = !laptop_mode,
4855                 .may_unmap = 1,
4856                 .may_swap = 1,
4857         };
4858
4859         /*
4860          * scan_control uses s8 fields for order, priority, and reclaim_idx.
4861          * Confirm they are large enough for max values.
4862          */
4863         BUILD_BUG_ON(MAX_ORDER > S8_MAX);
4864         BUILD_BUG_ON(DEF_PRIORITY > S8_MAX);
4865         BUILD_BUG_ON(MAX_NR_ZONES > S8_MAX);
4866
4867         /*
4868          * Do not enter reclaim if fatal signal was delivered while throttled.
4869          * 1 is returned so that the page allocator does not OOM kill at this
4870          * point.
4871          */
4872         if (throttle_direct_reclaim(sc.gfp_mask, zonelist, nodemask))
4873                 return 1;
4874
4875         set_task_reclaim_state(current, &sc.reclaim_state);
4876         trace_mm_vmscan_direct_reclaim_begin(order, sc.gfp_mask);
4877
4878         nr_reclaimed = do_try_to_free_pages(zonelist, &sc);
4879
4880         trace_mm_vmscan_direct_reclaim_end(nr_reclaimed);
4881         set_task_reclaim_state(current, NULL);
4882
4883         return nr_reclaimed;
4884 }
4885
4886 #ifdef CONFIG_MEMCG
4887
4888 /* Only used by soft limit reclaim. Do not reuse for anything else. */
4889 unsigned long mem_cgroup_shrink_node(struct mem_cgroup *memcg,
4890                                                 gfp_t gfp_mask, bool noswap,
4891                                                 pg_data_t *pgdat,
4892                                                 unsigned long *nr_scanned)
4893 {
4894         struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat);
4895         struct scan_control sc = {
4896                 .nr_to_reclaim = SWAP_CLUSTER_MAX,
4897                 .target_mem_cgroup = memcg,
4898                 .may_writepage = !laptop_mode,
4899                 .may_unmap = 1,
4900                 .reclaim_idx = MAX_NR_ZONES - 1,
4901                 .may_swap = !noswap,
4902         };
4903
4904         WARN_ON_ONCE(!current->reclaim_state);
4905
4906         sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) |
4907                         (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK);
4908
4909         trace_mm_vmscan_memcg_softlimit_reclaim_begin(sc.order,
4910                                                       sc.gfp_mask);
4911
4912         /*
4913          * NOTE: Although we can get the priority field, using it
4914          * here is not a good idea, since it limits the pages we can scan.
4915          * if we don't reclaim here, the shrink_node from balance_pgdat
4916          * will pick up pages from other mem cgroup's as well. We hack
4917          * the priority and make it zero.
4918          */
4919         shrink_lruvec(lruvec, &sc);
4920
4921         trace_mm_vmscan_memcg_softlimit_reclaim_end(sc.nr_reclaimed);
4922
4923         *nr_scanned = sc.nr_scanned;
4924
4925         return sc.nr_reclaimed;
4926 }
4927
4928 unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
4929                                            unsigned long nr_pages,
4930                                            gfp_t gfp_mask,
4931                                            unsigned int reclaim_options)
4932 {
4933         unsigned long nr_reclaimed;
4934         unsigned int noreclaim_flag;
4935         struct scan_control sc = {
4936                 .nr_to_reclaim = max(nr_pages, SWAP_CLUSTER_MAX),
4937                 .gfp_mask = (current_gfp_context(gfp_mask) & GFP_RECLAIM_MASK) |
4938                                 (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK),
4939                 .reclaim_idx = MAX_NR_ZONES - 1,
4940                 .target_mem_cgroup = memcg,
4941                 .priority = DEF_PRIORITY,
4942                 .may_writepage = !laptop_mode,
4943                 .may_unmap = 1,
4944                 .may_swap = !!(reclaim_options & MEMCG_RECLAIM_MAY_SWAP),
4945                 .proactive = !!(reclaim_options & MEMCG_RECLAIM_PROACTIVE),
4946         };
4947         /*
4948          * Traverse the ZONELIST_FALLBACK zonelist of the current node to put
4949          * equal pressure on all the nodes. This is based on the assumption that
4950          * the reclaim does not bail out early.
4951          */
4952         struct zonelist *zonelist = node_zonelist(numa_node_id(), sc.gfp_mask);
4953
4954         set_task_reclaim_state(current, &sc.reclaim_state);
4955         trace_mm_vmscan_memcg_reclaim_begin(0, sc.gfp_mask);
4956         noreclaim_flag = memalloc_noreclaim_save();
4957
4958         nr_reclaimed = do_try_to_free_pages(zonelist, &sc);
4959
4960         memalloc_noreclaim_restore(noreclaim_flag);
4961         trace_mm_vmscan_memcg_reclaim_end(nr_reclaimed);
4962         set_task_reclaim_state(current, NULL);
4963
4964         return nr_reclaimed;
4965 }
4966 #endif
4967
4968 static void kswapd_age_node(struct pglist_data *pgdat, struct scan_control *sc)
4969 {
4970         struct mem_cgroup *memcg;
4971         struct lruvec *lruvec;
4972
4973         if (lru_gen_enabled()) {
4974                 lru_gen_age_node(pgdat, sc);
4975                 return;
4976         }
4977
4978         if (!can_age_anon_pages(pgdat, sc))
4979                 return;
4980
4981         lruvec = mem_cgroup_lruvec(NULL, pgdat);
4982         if (!inactive_is_low(lruvec, LRU_INACTIVE_ANON))
4983                 return;
4984
4985         memcg = mem_cgroup_iter(NULL, NULL, NULL);
4986         do {
4987                 lruvec = mem_cgroup_lruvec(memcg, pgdat);
4988                 shrink_active_list(SWAP_CLUSTER_MAX, lruvec,
4989                                    sc, LRU_ACTIVE_ANON);
4990                 memcg = mem_cgroup_iter(NULL, memcg, NULL);
4991         } while (memcg);
4992 }
4993
4994 static bool pgdat_watermark_boosted(pg_data_t *pgdat, int highest_zoneidx)
4995 {
4996         int i;
4997         struct zone *zone;
4998
4999         /*
5000          * Check for watermark boosts top-down as the higher zones
5001          * are more likely to be boosted. Both watermarks and boosts
5002          * should not be checked at the same time as reclaim would
5003          * start prematurely when there is no boosting and a lower
5004          * zone is balanced.
5005          */
5006         for (i = highest_zoneidx; i >= 0; i--) {
5007                 zone = pgdat->node_zones + i;
5008                 if (!managed_zone(zone))
5009                         continue;
5010
5011                 if (zone->watermark_boost)
5012                         return true;
5013         }
5014
5015         return false;
5016 }
5017
5018 /*
5019  * Returns true if there is an eligible zone balanced for the request order
5020  * and highest_zoneidx
5021  */
5022 static bool pgdat_balanced(pg_data_t *pgdat, int order, int highest_zoneidx)
5023 {
5024         int i;
5025         unsigned long mark = -1;
5026         struct zone *zone;
5027
5028         /*
5029          * Check watermarks bottom-up as lower zones are more likely to
5030          * meet watermarks.
5031          */
5032         for (i = 0; i <= highest_zoneidx; i++) {
5033                 zone = pgdat->node_zones + i;
5034
5035                 if (!managed_zone(zone))
5036                         continue;
5037
5038                 if (sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING)
5039                         mark = wmark_pages(zone, WMARK_PROMO);
5040                 else
5041                         mark = high_wmark_pages(zone);
5042                 if (zone_watermark_ok_safe(zone, order, mark, highest_zoneidx))
5043                         return true;
5044         }
5045
5046         /*
5047          * If a node has no managed zone within highest_zoneidx, it does not
5048          * need balancing by definition. This can happen if a zone-restricted
5049          * allocation tries to wake a remote kswapd.
5050          */
5051         if (mark == -1)
5052                 return true;
5053
5054         return false;
5055 }
5056
5057 /* Clear pgdat state for congested, dirty or under writeback. */
5058 static void clear_pgdat_congested(pg_data_t *pgdat)
5059 {
5060         struct lruvec *lruvec = mem_cgroup_lruvec(NULL, pgdat);
5061
5062         clear_bit(LRUVEC_CONGESTED, &lruvec->flags);
5063         clear_bit(PGDAT_DIRTY, &pgdat->flags);
5064         clear_bit(PGDAT_WRITEBACK, &pgdat->flags);
5065 }
5066
5067 /*
5068  * Prepare kswapd for sleeping. This verifies that there are no processes
5069  * waiting in throttle_direct_reclaim() and that watermarks have been met.
5070  *
5071  * Returns true if kswapd is ready to sleep
5072  */
5073 static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order,
5074                                 int highest_zoneidx)
5075 {
5076         /*
5077          * The throttled processes are normally woken up in balance_pgdat() as
5078          * soon as allow_direct_reclaim() is true. But there is a potential
5079          * race between when kswapd checks the watermarks and a process gets
5080          * throttled. There is also a potential race if processes get
5081          * throttled, kswapd wakes, a large process exits thereby balancing the
5082          * zones, which causes kswapd to exit balance_pgdat() before reaching
5083          * the wake up checks. If kswapd is going to sleep, no process should
5084          * be sleeping on pfmemalloc_wait, so wake them now if necessary. If
5085          * the wake up is premature, processes will wake kswapd and get
5086          * throttled again. The difference from wake ups in balance_pgdat() is
5087          * that here we are under prepare_to_wait().
5088          */
5089         if (waitqueue_active(&pgdat->pfmemalloc_wait))
5090                 wake_up_all(&pgdat->pfmemalloc_wait);
5091
5092         /* Hopeless node, leave it to direct reclaim */
5093         if (pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES)
5094                 return true;
5095
5096         if (pgdat_balanced(pgdat, order, highest_zoneidx)) {
5097                 clear_pgdat_congested(pgdat);
5098                 return true;
5099         }
5100
5101         return false;
5102 }
5103
5104 /*
5105  * kswapd shrinks a node of pages that are at or below the highest usable
5106  * zone that is currently unbalanced.
5107  *
5108  * Returns true if kswapd scanned at least the requested number of pages to
5109  * reclaim or if the lack of progress was due to pages under writeback.
5110  * This is used to determine if the scanning priority needs to be raised.
5111  */
5112 static bool kswapd_shrink_node(pg_data_t *pgdat,
5113                                struct scan_control *sc)
5114 {
5115         struct zone *zone;
5116         int z;
5117
5118         /* Reclaim a number of pages proportional to the number of zones */
5119         sc->nr_to_reclaim = 0;
5120         for (z = 0; z <= sc->reclaim_idx; z++) {
5121                 zone = pgdat->node_zones + z;
5122                 if (!managed_zone(zone))
5123                         continue;
5124
5125                 sc->nr_to_reclaim += max(high_wmark_pages(zone), SWAP_CLUSTER_MAX);
5126         }
5127
5128         /*
5129          * Historically care was taken to put equal pressure on all zones but
5130          * now pressure is applied based on node LRU order.
5131          */
5132         shrink_node(pgdat, sc);
5133
5134         /*
5135          * Fragmentation may mean that the system cannot be rebalanced for
5136          * high-order allocations. If twice the allocation size has been
5137          * reclaimed then recheck watermarks only at order-0 to prevent
5138          * excessive reclaim. Assume that a process requested a high-order
5139          * can direct reclaim/compact.
5140          */
5141         if (sc->order && sc->nr_reclaimed >= compact_gap(sc->order))
5142                 sc->order = 0;
5143
5144         return sc->nr_scanned >= sc->nr_to_reclaim;
5145 }
5146
5147 /* Page allocator PCP high watermark is lowered if reclaim is active. */
5148 static inline void
5149 update_reclaim_active(pg_data_t *pgdat, int highest_zoneidx, bool active)
5150 {
5151         int i;
5152         struct zone *zone;
5153
5154         for (i = 0; i <= highest_zoneidx; i++) {
5155                 zone = pgdat->node_zones + i;
5156
5157                 if (!managed_zone(zone))
5158                         continue;
5159
5160                 if (active)
5161                         set_bit(ZONE_RECLAIM_ACTIVE, &zone->flags);
5162                 else
5163                         clear_bit(ZONE_RECLAIM_ACTIVE, &zone->flags);
5164         }
5165 }
5166
5167 static inline void
5168 set_reclaim_active(pg_data_t *pgdat, int highest_zoneidx)
5169 {
5170         update_reclaim_active(pgdat, highest_zoneidx, true);
5171 }
5172
5173 static inline void
5174 clear_reclaim_active(pg_data_t *pgdat, int highest_zoneidx)
5175 {
5176         update_reclaim_active(pgdat, highest_zoneidx, false);
5177 }
5178
5179 /*
5180  * For kswapd, balance_pgdat() will reclaim pages across a node from zones
5181  * that are eligible for use by the caller until at least one zone is
5182  * balanced.
5183  *
5184  * Returns the order kswapd finished reclaiming at.
5185  *
5186  * kswapd scans the zones in the highmem->normal->dma direction.  It skips
5187  * zones which have free_pages > high_wmark_pages(zone), but once a zone is
5188  * found to have free_pages <= high_wmark_pages(zone), any page in that zone
5189  * or lower is eligible for reclaim until at least one usable zone is
5190  * balanced.
5191  */
5192 static int balance_pgdat(pg_data_t *pgdat, int order, int highest_zoneidx)
5193 {
5194         int i;
5195         unsigned long nr_soft_reclaimed;
5196         unsigned long nr_soft_scanned;
5197         unsigned long pflags;
5198         unsigned long nr_boost_reclaim;
5199         unsigned long zone_boosts[MAX_NR_ZONES] = { 0, };
5200         bool boosted;
5201         struct zone *zone;
5202         struct scan_control sc = {
5203                 .gfp_mask = GFP_KERNEL,
5204                 .order = order,
5205                 .may_unmap = 1,
5206         };
5207
5208         set_task_reclaim_state(current, &sc.reclaim_state);
5209         psi_memstall_enter(&pflags);
5210         __fs_reclaim_acquire(_THIS_IP_);
5211
5212         count_vm_event(PAGEOUTRUN);
5213
5214         /*
5215          * Account for the reclaim boost. Note that the zone boost is left in
5216          * place so that parallel allocations that are near the watermark will
5217          * stall or direct reclaim until kswapd is finished.
5218          */
5219         nr_boost_reclaim = 0;
5220         for (i = 0; i <= highest_zoneidx; i++) {
5221                 zone = pgdat->node_zones + i;
5222                 if (!managed_zone(zone))
5223                         continue;
5224
5225                 nr_boost_reclaim += zone->watermark_boost;
5226                 zone_boosts[i] = zone->watermark_boost;
5227         }
5228         boosted = nr_boost_reclaim;
5229
5230 restart:
5231         set_reclaim_active(pgdat, highest_zoneidx);
5232         sc.priority = DEF_PRIORITY;
5233         do {
5234                 unsigned long nr_reclaimed = sc.nr_reclaimed;
5235                 bool raise_priority = true;
5236                 bool balanced;
5237                 bool ret;
5238
5239                 sc.reclaim_idx = highest_zoneidx;
5240
5241                 /*
5242                  * If the number of buffer_heads exceeds the maximum allowed
5243                  * then consider reclaiming from all zones. This has a dual
5244                  * purpose -- on 64-bit systems it is expected that
5245                  * buffer_heads are stripped during active rotation. On 32-bit
5246                  * systems, highmem pages can pin lowmem memory and shrinking
5247                  * buffers can relieve lowmem pressure. Reclaim may still not
5248                  * go ahead if all eligible zones for the original allocation
5249                  * request are balanced to avoid excessive reclaim from kswapd.
5250                  */
5251                 if (buffer_heads_over_limit) {
5252                         for (i = MAX_NR_ZONES - 1; i >= 0; i--) {
5253                                 zone = pgdat->node_zones + i;
5254                                 if (!managed_zone(zone))
5255                                         continue;
5256
5257                                 sc.reclaim_idx = i;
5258                                 break;
5259                         }
5260                 }
5261
5262                 /*
5263                  * If the pgdat is imbalanced then ignore boosting and preserve
5264                  * the watermarks for a later time and restart. Note that the
5265                  * zone watermarks will be still reset at the end of balancing
5266                  * on the grounds that the normal reclaim should be enough to
5267                  * re-evaluate if boosting is required when kswapd next wakes.
5268                  */
5269                 balanced = pgdat_balanced(pgdat, sc.order, highest_zoneidx);
5270                 if (!balanced && nr_boost_reclaim) {
5271                         nr_boost_reclaim = 0;
5272                         goto restart;
5273                 }
5274
5275                 /*
5276                  * If boosting is not active then only reclaim if there are no
5277                  * eligible zones. Note that sc.reclaim_idx is not used as
5278                  * buffer_heads_over_limit may have adjusted it.
5279                  */
5280                 if (!nr_boost_reclaim && balanced)
5281                         goto out;
5282
5283                 /* Limit the priority of boosting to avoid reclaim writeback */
5284                 if (nr_boost_reclaim && sc.priority == DEF_PRIORITY - 2)
5285                         raise_priority = false;
5286
5287                 /*
5288                  * Do not writeback or swap pages for boosted reclaim. The
5289                  * intent is to relieve pressure not issue sub-optimal IO
5290                  * from reclaim context. If no pages are reclaimed, the
5291                  * reclaim will be aborted.
5292                  */
5293                 sc.may_writepage = !laptop_mode && !nr_boost_reclaim;
5294                 sc.may_swap = !nr_boost_reclaim;
5295
5296                 /*
5297                  * Do some background aging, to give pages a chance to be
5298                  * referenced before reclaiming. All pages are rotated
5299                  * regardless of classzone as this is about consistent aging.
5300                  */
5301                 kswapd_age_node(pgdat, &sc);
5302
5303                 /*
5304                  * If we're getting trouble reclaiming, start doing writepage
5305                  * even in laptop mode.
5306                  */
5307                 if (sc.priority < DEF_PRIORITY - 2)
5308                         sc.may_writepage = 1;
5309
5310                 /* Call soft limit reclaim before calling shrink_node. */
5311                 sc.nr_scanned = 0;
5312                 nr_soft_scanned = 0;
5313                 nr_soft_reclaimed = mem_cgroup_soft_limit_reclaim(pgdat, sc.order,
5314                                                 sc.gfp_mask, &nr_soft_scanned);
5315                 sc.nr_reclaimed += nr_soft_reclaimed;
5316
5317                 /*
5318                  * There should be no need to raise the scanning priority if
5319                  * enough pages are already being scanned that that high
5320                  * watermark would be met at 100% efficiency.
5321                  */
5322                 if (kswapd_shrink_node(pgdat, &sc))
5323                         raise_priority = false;
5324
5325                 /*
5326                  * If the low watermark is met there is no need for processes
5327                  * to be throttled on pfmemalloc_wait as they should now be
5328                  * able to safely make forward progress. Wake them
5329                  */
5330                 if (waitqueue_active(&pgdat->pfmemalloc_wait) &&
5331                                 allow_direct_reclaim(pgdat))
5332                         wake_up_all(&pgdat->pfmemalloc_wait);
5333
5334                 /* Check if kswapd should be suspending */
5335                 __fs_reclaim_release(_THIS_IP_);
5336                 ret = try_to_freeze();
5337                 __fs_reclaim_acquire(_THIS_IP_);
5338                 if (ret || kthread_should_stop())
5339                         break;
5340
5341                 /*
5342                  * Raise priority if scanning rate is too low or there was no
5343                  * progress in reclaiming pages
5344                  */
5345                 nr_reclaimed = sc.nr_reclaimed - nr_reclaimed;
5346                 nr_boost_reclaim -= min(nr_boost_reclaim, nr_reclaimed);
5347
5348                 /*
5349                  * If reclaim made no progress for a boost, stop reclaim as
5350                  * IO cannot be queued and it could be an infinite loop in
5351                  * extreme circumstances.
5352                  */
5353                 if (nr_boost_reclaim && !nr_reclaimed)
5354                         break;
5355
5356                 if (raise_priority || !nr_reclaimed)
5357                         sc.priority--;
5358         } while (sc.priority >= 1);
5359
5360         if (!sc.nr_reclaimed)
5361                 pgdat->kswapd_failures++;
5362
5363 out:
5364         clear_reclaim_active(pgdat, highest_zoneidx);
5365
5366         /* If reclaim was boosted, account for the reclaim done in this pass */
5367         if (boosted) {
5368                 unsigned long flags;
5369
5370                 for (i = 0; i <= highest_zoneidx; i++) {
5371                         if (!zone_boosts[i])
5372                                 continue;
5373
5374                         /* Increments are under the zone lock */
5375                         zone = pgdat->node_zones + i;
5376                         spin_lock_irqsave(&zone->lock, flags);
5377                         zone->watermark_boost -= min(zone->watermark_boost, zone_boosts[i]);
5378                         spin_unlock_irqrestore(&zone->lock, flags);
5379                 }
5380
5381                 /*
5382                  * As there is now likely space, wakeup kcompactd to defragment
5383                  * pageblocks.
5384                  */
5385                 wakeup_kcompactd(pgdat, pageblock_order, highest_zoneidx);
5386         }
5387
5388         snapshot_refaults(NULL, pgdat);
5389         __fs_reclaim_release(_THIS_IP_);
5390         psi_memstall_leave(&pflags);
5391         set_task_reclaim_state(current, NULL);
5392
5393         /*
5394          * Return the order kswapd stopped reclaiming at as
5395          * prepare_kswapd_sleep() takes it into account. If another caller
5396          * entered the allocator slow path while kswapd was awake, order will
5397          * remain at the higher level.
5398          */
5399         return sc.order;
5400 }
5401
5402 /*
5403  * The pgdat->kswapd_highest_zoneidx is used to pass the highest zone index to
5404  * be reclaimed by kswapd from the waker. If the value is MAX_NR_ZONES which is
5405  * not a valid index then either kswapd runs for first time or kswapd couldn't
5406  * sleep after previous reclaim attempt (node is still unbalanced). In that
5407  * case return the zone index of the previous kswapd reclaim cycle.
5408  */
5409 static enum zone_type kswapd_highest_zoneidx(pg_data_t *pgdat,
5410                                            enum zone_type prev_highest_zoneidx)
5411 {
5412         enum zone_type curr_idx = READ_ONCE(pgdat->kswapd_highest_zoneidx);
5413
5414         return curr_idx == MAX_NR_ZONES ? prev_highest_zoneidx : curr_idx;
5415 }
5416
5417 static void kswapd_try_to_sleep(pg_data_t *pgdat, int alloc_order, int reclaim_order,
5418                                 unsigned int highest_zoneidx)
5419 {
5420         long remaining = 0;
5421         DEFINE_WAIT(wait);
5422
5423         if (freezing(current) || kthread_should_stop())
5424                 return;
5425
5426         prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE);
5427
5428         /*
5429          * Try to sleep for a short interval. Note that kcompactd will only be
5430          * woken if it is possible to sleep for a short interval. This is
5431          * deliberate on the assumption that if reclaim cannot keep an
5432          * eligible zone balanced that it's also unlikely that compaction will
5433          * succeed.
5434          */
5435         if (prepare_kswapd_sleep(pgdat, reclaim_order, highest_zoneidx)) {
5436                 /*
5437                  * Compaction records what page blocks it recently failed to
5438                  * isolate pages from and skips them in the future scanning.
5439                  * When kswapd is going to sleep, it is reasonable to assume
5440                  * that pages and compaction may succeed so reset the cache.
5441                  */
5442                 reset_isolation_suitable(pgdat);
5443
5444                 /*
5445                  * We have freed the memory, now we should compact it to make
5446                  * allocation of the requested order possible.
5447                  */
5448                 wakeup_kcompactd(pgdat, alloc_order, highest_zoneidx);
5449
5450                 remaining = schedule_timeout(HZ/10);
5451
5452                 /*
5453                  * If woken prematurely then reset kswapd_highest_zoneidx and
5454                  * order. The values will either be from a wakeup request or
5455                  * the previous request that slept prematurely.
5456                  */
5457                 if (remaining) {
5458                         WRITE_ONCE(pgdat->kswapd_highest_zoneidx,
5459                                         kswapd_highest_zoneidx(pgdat,
5460                                                         highest_zoneidx));
5461
5462                         if (READ_ONCE(pgdat->kswapd_order) < reclaim_order)
5463                                 WRITE_ONCE(pgdat->kswapd_order, reclaim_order);
5464                 }
5465
5466                 finish_wait(&pgdat->kswapd_wait, &wait);
5467                 prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE);
5468         }
5469
5470         /*
5471          * After a short sleep, check if it was a premature sleep. If not, then
5472          * go fully to sleep until explicitly woken up.
5473          */
5474         if (!remaining &&
5475             prepare_kswapd_sleep(pgdat, reclaim_order, highest_zoneidx)) {
5476                 trace_mm_vmscan_kswapd_sleep(pgdat->node_id);
5477
5478                 /*
5479                  * vmstat counters are not perfectly accurate and the estimated
5480                  * value for counters such as NR_FREE_PAGES can deviate from the
5481                  * true value by nr_online_cpus * threshold. To avoid the zone
5482                  * watermarks being breached while under pressure, we reduce the
5483                  * per-cpu vmstat threshold while kswapd is awake and restore
5484                  * them before going back to sleep.
5485                  */
5486                 set_pgdat_percpu_threshold(pgdat, calculate_normal_threshold);
5487
5488                 if (!kthread_should_stop())
5489                         schedule();
5490
5491                 set_pgdat_percpu_threshold(pgdat, calculate_pressure_threshold);
5492         } else {
5493                 if (remaining)
5494                         count_vm_event(KSWAPD_LOW_WMARK_HIT_QUICKLY);
5495                 else
5496                         count_vm_event(KSWAPD_HIGH_WMARK_HIT_QUICKLY);
5497         }
5498         finish_wait(&pgdat->kswapd_wait, &wait);
5499 }
5500
5501 /*
5502  * The background pageout daemon, started as a kernel thread
5503  * from the init process.
5504  *
5505  * This basically trickles out pages so that we have _some_
5506  * free memory available even if there is no other activity
5507  * that frees anything up. This is needed for things like routing
5508  * etc, where we otherwise might have all activity going on in
5509  * asynchronous contexts that cannot page things out.
5510  *
5511  * If there are applications that are active memory-allocators
5512  * (most normal use), this basically shouldn't matter.
5513  */
5514 static int kswapd(void *p)
5515 {
5516         unsigned int alloc_order, reclaim_order;
5517         unsigned int highest_zoneidx = MAX_NR_ZONES - 1;
5518         pg_data_t *pgdat = (pg_data_t *)p;
5519         struct task_struct *tsk = current;
5520         const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id);
5521
5522         if (!cpumask_empty(cpumask))
5523                 set_cpus_allowed_ptr(tsk, cpumask);
5524
5525         /*
5526          * Tell the memory management that we're a "memory allocator",
5527          * and that if we need more memory we should get access to it
5528          * regardless (see "__alloc_pages()"). "kswapd" should
5529          * never get caught in the normal page freeing logic.
5530          *
5531          * (Kswapd normally doesn't need memory anyway, but sometimes
5532          * you need a small amount of memory in order to be able to
5533          * page out something else, and this flag essentially protects
5534          * us from recursively trying to free more memory as we're
5535          * trying to free the first piece of memory in the first place).
5536          */
5537         tsk->flags |= PF_MEMALLOC | PF_KSWAPD;
5538         set_freezable();
5539
5540         WRITE_ONCE(pgdat->kswapd_order, 0);
5541         WRITE_ONCE(pgdat->kswapd_highest_zoneidx, MAX_NR_ZONES);
5542         atomic_set(&pgdat->nr_writeback_throttled, 0);
5543         for ( ; ; ) {
5544                 bool ret;
5545
5546                 alloc_order = reclaim_order = READ_ONCE(pgdat->kswapd_order);
5547                 highest_zoneidx = kswapd_highest_zoneidx(pgdat,
5548                                                         highest_zoneidx);
5549
5550 kswapd_try_sleep:
5551                 kswapd_try_to_sleep(pgdat, alloc_order, reclaim_order,
5552                                         highest_zoneidx);
5553
5554                 /* Read the new order and highest_zoneidx */
5555                 alloc_order = READ_ONCE(pgdat->kswapd_order);
5556                 highest_zoneidx = kswapd_highest_zoneidx(pgdat,
5557                                                         highest_zoneidx);
5558                 WRITE_ONCE(pgdat->kswapd_order, 0);
5559                 WRITE_ONCE(pgdat->kswapd_highest_zoneidx, MAX_NR_ZONES);
5560
5561                 ret = try_to_freeze();
5562                 if (kthread_should_stop())
5563                         break;
5564
5565                 /*
5566                  * We can speed up thawing tasks if we don't call balance_pgdat
5567                  * after returning from the refrigerator
5568                  */
5569                 if (ret)
5570                         continue;
5571
5572                 /*
5573                  * Reclaim begins at the requested order but if a high-order
5574                  * reclaim fails then kswapd falls back to reclaiming for
5575                  * order-0. If that happens, kswapd will consider sleeping
5576                  * for the order it finished reclaiming at (reclaim_order)
5577                  * but kcompactd is woken to compact for the original
5578                  * request (alloc_order).
5579                  */
5580                 trace_mm_vmscan_kswapd_wake(pgdat->node_id, highest_zoneidx,
5581                                                 alloc_order);
5582                 reclaim_order = balance_pgdat(pgdat, alloc_order,
5583                                                 highest_zoneidx);
5584                 if (reclaim_order < alloc_order)
5585                         goto kswapd_try_sleep;
5586         }
5587
5588         tsk->flags &= ~(PF_MEMALLOC | PF_KSWAPD);
5589
5590         return 0;
5591 }
5592
5593 /*
5594  * A zone is low on free memory or too fragmented for high-order memory.  If
5595  * kswapd should reclaim (direct reclaim is deferred), wake it up for the zone's
5596  * pgdat.  It will wake up kcompactd after reclaiming memory.  If kswapd reclaim
5597  * has failed or is not needed, still wake up kcompactd if only compaction is
5598  * needed.
5599  */
5600 void wakeup_kswapd(struct zone *zone, gfp_t gfp_flags, int order,
5601                    enum zone_type highest_zoneidx)
5602 {
5603         pg_data_t *pgdat;
5604         enum zone_type curr_idx;
5605
5606         if (!managed_zone(zone))
5607                 return;
5608
5609         if (!cpuset_zone_allowed(zone, gfp_flags))
5610                 return;
5611
5612         pgdat = zone->zone_pgdat;
5613         curr_idx = READ_ONCE(pgdat->kswapd_highest_zoneidx);
5614
5615         if (curr_idx == MAX_NR_ZONES || curr_idx < highest_zoneidx)
5616                 WRITE_ONCE(pgdat->kswapd_highest_zoneidx, highest_zoneidx);
5617
5618         if (READ_ONCE(pgdat->kswapd_order) < order)
5619                 WRITE_ONCE(pgdat->kswapd_order, order);
5620
5621         if (!waitqueue_active(&pgdat->kswapd_wait))
5622                 return;
5623
5624         /* Hopeless node, leave it to direct reclaim if possible */
5625         if (pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES ||
5626             (pgdat_balanced(pgdat, order, highest_zoneidx) &&
5627              !pgdat_watermark_boosted(pgdat, highest_zoneidx))) {
5628                 /*
5629                  * There may be plenty of free memory available, but it's too
5630                  * fragmented for high-order allocations.  Wake up kcompactd
5631                  * and rely on compaction_suitable() to determine if it's
5632                  * needed.  If it fails, it will defer subsequent attempts to
5633                  * ratelimit its work.
5634                  */
5635                 if (!(gfp_flags & __GFP_DIRECT_RECLAIM))
5636                         wakeup_kcompactd(pgdat, order, highest_zoneidx);
5637                 return;
5638         }
5639
5640         trace_mm_vmscan_wakeup_kswapd(pgdat->node_id, highest_zoneidx, order,
5641                                       gfp_flags);
5642         wake_up_interruptible(&pgdat->kswapd_wait);
5643 }
5644
5645 #ifdef CONFIG_HIBERNATION
5646 /*
5647  * Try to free `nr_to_reclaim' of memory, system-wide, and return the number of
5648  * freed pages.
5649  *
5650  * Rather than trying to age LRUs the aim is to preserve the overall
5651  * LRU order by reclaiming preferentially
5652  * inactive > active > active referenced > active mapped
5653  */
5654 unsigned long shrink_all_memory(unsigned long nr_to_reclaim)
5655 {
5656         struct scan_control sc = {
5657                 .nr_to_reclaim = nr_to_reclaim,
5658                 .gfp_mask = GFP_HIGHUSER_MOVABLE,
5659                 .reclaim_idx = MAX_NR_ZONES - 1,
5660                 .priority = DEF_PRIORITY,
5661                 .may_writepage = 1,
5662                 .may_unmap = 1,
5663                 .may_swap = 1,
5664                 .hibernation_mode = 1,
5665         };
5666         struct zonelist *zonelist = node_zonelist(numa_node_id(), sc.gfp_mask);
5667         unsigned long nr_reclaimed;
5668         unsigned int noreclaim_flag;
5669
5670         fs_reclaim_acquire(sc.gfp_mask);
5671         noreclaim_flag = memalloc_noreclaim_save();
5672         set_task_reclaim_state(current, &sc.reclaim_state);
5673
5674         nr_reclaimed = do_try_to_free_pages(zonelist, &sc);
5675
5676         set_task_reclaim_state(current, NULL);
5677         memalloc_noreclaim_restore(noreclaim_flag);
5678         fs_reclaim_release(sc.gfp_mask);
5679
5680         return nr_reclaimed;
5681 }
5682 #endif /* CONFIG_HIBERNATION */
5683
5684 /*
5685  * This kswapd start function will be called by init and node-hot-add.
5686  */
5687 void kswapd_run(int nid)
5688 {
5689         pg_data_t *pgdat = NODE_DATA(nid);
5690
5691         pgdat_kswapd_lock(pgdat);
5692         if (!pgdat->kswapd) {
5693                 pgdat->kswapd = kthread_run(kswapd, pgdat, "kswapd%d", nid);
5694                 if (IS_ERR(pgdat->kswapd)) {
5695                         /* failure at boot is fatal */
5696                         BUG_ON(system_state < SYSTEM_RUNNING);
5697                         pr_err("Failed to start kswapd on node %d\n", nid);
5698                         pgdat->kswapd = NULL;
5699                 }
5700         }
5701         pgdat_kswapd_unlock(pgdat);
5702 }
5703
5704 /*
5705  * Called by memory hotplug when all memory in a node is offlined.  Caller must
5706  * be holding mem_hotplug_begin/done().
5707  */
5708 void kswapd_stop(int nid)
5709 {
5710         pg_data_t *pgdat = NODE_DATA(nid);
5711         struct task_struct *kswapd;
5712
5713         pgdat_kswapd_lock(pgdat);
5714         kswapd = pgdat->kswapd;
5715         if (kswapd) {
5716                 kthread_stop(kswapd);
5717                 pgdat->kswapd = NULL;
5718         }
5719         pgdat_kswapd_unlock(pgdat);
5720 }
5721
5722 static int __init kswapd_init(void)
5723 {
5724         int nid;
5725
5726         swap_setup();
5727         for_each_node_state(nid, N_MEMORY)
5728                 kswapd_run(nid);
5729         return 0;
5730 }
5731
5732 module_init(kswapd_init)
5733
5734 #ifdef CONFIG_NUMA
5735 /*
5736  * Node reclaim mode
5737  *
5738  * If non-zero call node_reclaim when the number of free pages falls below
5739  * the watermarks.
5740  */
5741 int node_reclaim_mode __read_mostly;
5742
5743 /*
5744  * Priority for NODE_RECLAIM. This determines the fraction of pages
5745  * of a node considered for each zone_reclaim. 4 scans 1/16th of
5746  * a zone.
5747  */
5748 #define NODE_RECLAIM_PRIORITY 4
5749
5750 /*
5751  * Percentage of pages in a zone that must be unmapped for node_reclaim to
5752  * occur.
5753  */
5754 int sysctl_min_unmapped_ratio = 1;
5755
5756 /*
5757  * If the number of slab pages in a zone grows beyond this percentage then
5758  * slab reclaim needs to occur.
5759  */
5760 int sysctl_min_slab_ratio = 5;
5761
5762 static inline unsigned long node_unmapped_file_pages(struct pglist_data *pgdat)
5763 {
5764         unsigned long file_mapped = node_page_state(pgdat, NR_FILE_MAPPED);
5765         unsigned long file_lru = node_page_state(pgdat, NR_INACTIVE_FILE) +
5766                 node_page_state(pgdat, NR_ACTIVE_FILE);
5767
5768         /*
5769          * It's possible for there to be more file mapped pages than
5770          * accounted for by the pages on the file LRU lists because
5771          * tmpfs pages accounted for as ANON can also be FILE_MAPPED
5772          */
5773         return (file_lru > file_mapped) ? (file_lru - file_mapped) : 0;
5774 }
5775
5776 /* Work out how many page cache pages we can reclaim in this reclaim_mode */
5777 static unsigned long node_pagecache_reclaimable(struct pglist_data *pgdat)
5778 {
5779         unsigned long nr_pagecache_reclaimable;
5780         unsigned long delta = 0;
5781
5782         /*
5783          * If RECLAIM_UNMAP is set, then all file pages are considered
5784          * potentially reclaimable. Otherwise, we have to worry about
5785          * pages like swapcache and node_unmapped_file_pages() provides
5786          * a better estimate
5787          */
5788         if (node_reclaim_mode & RECLAIM_UNMAP)
5789                 nr_pagecache_reclaimable = node_page_state(pgdat, NR_FILE_PAGES);
5790         else
5791                 nr_pagecache_reclaimable = node_unmapped_file_pages(pgdat);
5792
5793         /* If we can't clean pages, remove dirty pages from consideration */
5794         if (!(node_reclaim_mode & RECLAIM_WRITE))
5795                 delta += node_page_state(pgdat, NR_FILE_DIRTY);
5796
5797         /* Watch for any possible underflows due to delta */
5798         if (unlikely(delta > nr_pagecache_reclaimable))
5799                 delta = nr_pagecache_reclaimable;
5800
5801         return nr_pagecache_reclaimable - delta;
5802 }
5803
5804 /*
5805  * Try to free up some pages from this node through reclaim.
5806  */
5807 static int __node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned int order)
5808 {
5809         /* Minimum pages needed in order to stay on node */
5810         const unsigned long nr_pages = 1 << order;
5811         struct task_struct *p = current;
5812         unsigned int noreclaim_flag;
5813         struct scan_control sc = {
5814                 .nr_to_reclaim = max(nr_pages, SWAP_CLUSTER_MAX),
5815                 .gfp_mask = current_gfp_context(gfp_mask),
5816                 .order = order,
5817                 .priority = NODE_RECLAIM_PRIORITY,
5818                 .may_writepage = !!(node_reclaim_mode & RECLAIM_WRITE),
5819                 .may_unmap = !!(node_reclaim_mode & RECLAIM_UNMAP),
5820                 .may_swap = 1,
5821                 .reclaim_idx = gfp_zone(gfp_mask),
5822         };
5823         unsigned long pflags;
5824
5825         trace_mm_vmscan_node_reclaim_begin(pgdat->node_id, order,
5826                                            sc.gfp_mask);
5827
5828         cond_resched();
5829         psi_memstall_enter(&pflags);
5830         fs_reclaim_acquire(sc.gfp_mask);
5831         /*
5832          * We need to be able to allocate from the reserves for RECLAIM_UNMAP
5833          */
5834         noreclaim_flag = memalloc_noreclaim_save();
5835         set_task_reclaim_state(p, &sc.reclaim_state);
5836
5837         if (node_pagecache_reclaimable(pgdat) > pgdat->min_unmapped_pages ||
5838             node_page_state_pages(pgdat, NR_SLAB_RECLAIMABLE_B) > pgdat->min_slab_pages) {
5839                 /*
5840                  * Free memory by calling shrink node with increasing
5841                  * priorities until we have enough memory freed.
5842                  */
5843                 do {
5844                         shrink_node(pgdat, &sc);
5845                 } while (sc.nr_reclaimed < nr_pages && --sc.priority >= 0);
5846         }
5847
5848         set_task_reclaim_state(p, NULL);
5849         memalloc_noreclaim_restore(noreclaim_flag);
5850         fs_reclaim_release(sc.gfp_mask);
5851         psi_memstall_leave(&pflags);
5852
5853         trace_mm_vmscan_node_reclaim_end(sc.nr_reclaimed);
5854
5855         return sc.nr_reclaimed >= nr_pages;
5856 }
5857
5858 int node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned int order)
5859 {
5860         int ret;
5861
5862         /*
5863          * Node reclaim reclaims unmapped file backed pages and
5864          * slab pages if we are over the defined limits.
5865          *
5866          * A small portion of unmapped file backed pages is needed for
5867          * file I/O otherwise pages read by file I/O will be immediately
5868          * thrown out if the node is overallocated. So we do not reclaim
5869          * if less than a specified percentage of the node is used by
5870          * unmapped file backed pages.
5871          */
5872         if (node_pagecache_reclaimable(pgdat) <= pgdat->min_unmapped_pages &&
5873             node_page_state_pages(pgdat, NR_SLAB_RECLAIMABLE_B) <=
5874             pgdat->min_slab_pages)
5875                 return NODE_RECLAIM_FULL;
5876
5877         /*
5878          * Do not scan if the allocation should not be delayed.
5879          */
5880         if (!gfpflags_allow_blocking(gfp_mask) || (current->flags & PF_MEMALLOC))
5881                 return NODE_RECLAIM_NOSCAN;
5882
5883         /*
5884          * Only run node reclaim on the local node or on nodes that do not
5885          * have associated processors. This will favor the local processor
5886          * over remote processors and spread off node memory allocations
5887          * as wide as possible.
5888          */
5889         if (node_state(pgdat->node_id, N_CPU) && pgdat->node_id != numa_node_id())
5890                 return NODE_RECLAIM_NOSCAN;
5891
5892         if (test_and_set_bit(PGDAT_RECLAIM_LOCKED, &pgdat->flags))
5893                 return NODE_RECLAIM_NOSCAN;
5894
5895         ret = __node_reclaim(pgdat, gfp_mask, order);
5896         clear_bit(PGDAT_RECLAIM_LOCKED, &pgdat->flags);
5897
5898         if (!ret)
5899                 count_vm_event(PGSCAN_ZONE_RECLAIM_FAILED);
5900
5901         return ret;
5902 }
5903 #endif
5904
5905 void check_move_unevictable_pages(struct pagevec *pvec)
5906 {
5907         struct folio_batch fbatch;
5908         unsigned i;
5909
5910         folio_batch_init(&fbatch);
5911         for (i = 0; i < pvec->nr; i++) {
5912                 struct page *page = pvec->pages[i];
5913
5914                 if (PageTransTail(page))
5915                         continue;
5916                 folio_batch_add(&fbatch, page_folio(page));
5917         }
5918         check_move_unevictable_folios(&fbatch);
5919 }
5920 EXPORT_SYMBOL_GPL(check_move_unevictable_pages);
5921
5922 /**
5923  * check_move_unevictable_folios - Move evictable folios to appropriate zone
5924  * lru list
5925  * @fbatch: Batch of lru folios to check.
5926  *
5927  * Checks folios for evictability, if an evictable folio is in the unevictable
5928  * lru list, moves it to the appropriate evictable lru list. This function
5929  * should be only used for lru folios.
5930  */
5931 void check_move_unevictable_folios(struct folio_batch *fbatch)
5932 {
5933         struct lruvec *lruvec = NULL;
5934         int pgscanned = 0;
5935         int pgrescued = 0;
5936         int i;
5937
5938         for (i = 0; i < fbatch->nr; i++) {
5939                 struct folio *folio = fbatch->folios[i];
5940                 int nr_pages = folio_nr_pages(folio);
5941
5942                 pgscanned += nr_pages;
5943
5944                 /* block memcg migration while the folio moves between lrus */
5945                 if (!folio_test_clear_lru(folio))
5946                         continue;
5947
5948                 lruvec = folio_lruvec_relock_irq(folio, lruvec);
5949                 if (folio_evictable(folio) && folio_test_unevictable(folio)) {
5950                         lruvec_del_folio(lruvec, folio);
5951                         folio_clear_unevictable(folio);
5952                         lruvec_add_folio(lruvec, folio);
5953                         pgrescued += nr_pages;
5954                 }
5955                 folio_set_lru(folio);
5956         }
5957
5958         if (lruvec) {
5959                 __count_vm_events(UNEVICTABLE_PGRESCUED, pgrescued);
5960                 __count_vm_events(UNEVICTABLE_PGSCANNED, pgscanned);
5961                 unlock_page_lruvec_irq(lruvec);
5962         } else if (pgscanned) {
5963                 count_vm_events(UNEVICTABLE_PGSCANNED, pgscanned);
5964         }
5965 }
5966 EXPORT_SYMBOL_GPL(check_move_unevictable_folios);