mm/memcontrol.c (linux-block.git, commit 902da8a9c643a070ca323804e557ae5173c688c5)
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /* memcontrol.c - Memory Controller
3  *
4  * Copyright IBM Corporation, 2007
5  * Author Balbir Singh <balbir@linux.vnet.ibm.com>
6  *
7  * Copyright 2007 OpenVZ SWsoft Inc
8  * Author: Pavel Emelianov <xemul@openvz.org>
9  *
10  * Memory thresholds
11  * Copyright (C) 2009 Nokia Corporation
12  * Author: Kirill A. Shutemov
13  *
14  * Kernel Memory Controller
15  * Copyright (C) 2012 Parallels Inc. and Google Inc.
16  * Authors: Glauber Costa and Suleiman Souhlal
17  *
18  * Native page reclaim
19  * Charge lifetime sanitation
20  * Lockless page tracking & accounting
21  * Unified hierarchy configuration model
22  * Copyright (C) 2015 Red Hat, Inc., Johannes Weiner
23  *
24  * Per memcg lru locking
25  * Copyright (C) 2020 Alibaba, Inc, Alex Shi
26  */
27
28 #include <linux/cgroup-defs.h>
29 #include <linux/page_counter.h>
30 #include <linux/memcontrol.h>
31 #include <linux/cgroup.h>
32 #include <linux/cpuset.h>
33 #include <linux/sched/mm.h>
34 #include <linux/shmem_fs.h>
35 #include <linux/hugetlb.h>
36 #include <linux/pagemap.h>
37 #include <linux/pagevec.h>
38 #include <linux/vm_event_item.h>
39 #include <linux/smp.h>
40 #include <linux/page-flags.h>
41 #include <linux/backing-dev.h>
42 #include <linux/bit_spinlock.h>
43 #include <linux/rcupdate.h>
44 #include <linux/limits.h>
45 #include <linux/export.h>
46 #include <linux/list.h>
47 #include <linux/mutex.h>
48 #include <linux/rbtree.h>
49 #include <linux/slab.h>
50 #include <linux/swapops.h>
51 #include <linux/spinlock.h>
52 #include <linux/fs.h>
53 #include <linux/seq_file.h>
54 #include <linux/parser.h>
55 #include <linux/vmpressure.h>
56 #include <linux/memremap.h>
57 #include <linux/mm_inline.h>
58 #include <linux/swap_cgroup.h>
59 #include <linux/cpu.h>
60 #include <linux/oom.h>
61 #include <linux/lockdep.h>
62 #include <linux/resume_user_mode.h>
63 #include <linux/psi.h>
64 #include <linux/seq_buf.h>
65 #include <linux/sched/isolation.h>
66 #include <linux/kmemleak.h>
67 #include "internal.h"
68 #include <net/sock.h>
69 #include <net/ip.h>
70 #include "slab.h"
71 #include "memcontrol-v1.h"
72
73 #include <linux/uaccess.h>
74
75 #define CREATE_TRACE_POINTS
76 #include <trace/events/memcg.h>
77 #undef CREATE_TRACE_POINTS
78
79 #include <trace/events/vmscan.h>
80
81 struct cgroup_subsys memory_cgrp_subsys __read_mostly;
82 EXPORT_SYMBOL(memory_cgrp_subsys);
83
84 struct mem_cgroup *root_mem_cgroup __read_mostly;
85
86 /* Active memory cgroup to use from an interrupt context */
87 DEFINE_PER_CPU(struct mem_cgroup *, int_active_memcg);
88 EXPORT_PER_CPU_SYMBOL_GPL(int_active_memcg);
89
90 /* Socket memory accounting disabled? */
91 static bool cgroup_memory_nosocket __ro_after_init;
92
93 /* Kernel memory accounting disabled? */
94 static bool cgroup_memory_nokmem __ro_after_init;
95
96 /* BPF memory accounting disabled? */
97 static bool cgroup_memory_nobpf __ro_after_init;
98
99 static struct kmem_cache *memcg_cachep;
100 static struct kmem_cache *memcg_pn_cachep;
101
102 #ifdef CONFIG_CGROUP_WRITEBACK
103 static DECLARE_WAIT_QUEUE_HEAD(memcg_cgwb_frn_waitq);
104 #endif
105
106 static inline bool task_is_dying(void)
107 {
108         return tsk_is_oom_victim(current) || fatal_signal_pending(current) ||
109                 (current->flags & PF_EXITING);
110 }
111
112 /* Some nice accessors for the vmpressure. */
113 struct vmpressure *memcg_to_vmpressure(struct mem_cgroup *memcg)
114 {
115         if (!memcg)
116                 memcg = root_mem_cgroup;
117         return &memcg->vmpressure;
118 }
119
120 struct mem_cgroup *vmpressure_to_memcg(struct vmpressure *vmpr)
121 {
122         return container_of(vmpr, struct mem_cgroup, vmpressure);
123 }
124
125 #define SEQ_BUF_SIZE SZ_4K
126 #define CURRENT_OBJCG_UPDATE_BIT 0
127 #define CURRENT_OBJCG_UPDATE_FLAG (1UL << CURRENT_OBJCG_UPDATE_BIT)
128
129 static DEFINE_SPINLOCK(objcg_lock);
130
131 bool mem_cgroup_kmem_disabled(void)
132 {
133         return cgroup_memory_nokmem;
134 }
135
136 static void memcg_uncharge(struct mem_cgroup *memcg, unsigned int nr_pages);
137
138 static void obj_cgroup_release(struct percpu_ref *ref)
139 {
140         struct obj_cgroup *objcg = container_of(ref, struct obj_cgroup, refcnt);
141         unsigned int nr_bytes;
142         unsigned int nr_pages;
143         unsigned long flags;
144
145         /*
146          * At this point all allocated objects are freed, and
147          * objcg->nr_charged_bytes can't have an arbitrary byte value.
148          * However, it can be PAGE_SIZE or (x * PAGE_SIZE).
149          *
150          * The following sequence can lead to it:
151          * 1) CPU0: objcg == stock->cached_objcg
152          * 2) CPU1: we do a small allocation (e.g. 92 bytes),
153          *          PAGE_SIZE bytes are charged
154          * 3) CPU1: a process from another memcg is allocating something,
 155  *          the stock is flushed,
 156  *          objcg->nr_charged_bytes = PAGE_SIZE - 92
 157  * 4) CPU0: we release this object,
 158  *          92 bytes are added to stock->nr_bytes
 159  * 5) CPU0: stock is flushed,
 160  *          92 bytes are added to objcg->nr_charged_bytes
 161  *
 162  * As a result, nr_charged_bytes == PAGE_SIZE.
163          * This page will be uncharged in obj_cgroup_release().
164          */
165         nr_bytes = atomic_read(&objcg->nr_charged_bytes);
166         WARN_ON_ONCE(nr_bytes & (PAGE_SIZE - 1));
167         nr_pages = nr_bytes >> PAGE_SHIFT;
168
169         if (nr_pages) {
170                 struct mem_cgroup *memcg;
171
172                 memcg = get_mem_cgroup_from_objcg(objcg);
173                 mod_memcg_state(memcg, MEMCG_KMEM, -nr_pages);
174                 memcg1_account_kmem(memcg, -nr_pages);
175                 if (!mem_cgroup_is_root(memcg))
176                         memcg_uncharge(memcg, nr_pages);
177                 mem_cgroup_put(memcg);
178         }
179
180         spin_lock_irqsave(&objcg_lock, flags);
181         list_del(&objcg->list);
182         spin_unlock_irqrestore(&objcg_lock, flags);
183
184         percpu_ref_exit(ref);
185         kfree_rcu(objcg, rcu);
186 }
187
188 static struct obj_cgroup *obj_cgroup_alloc(void)
189 {
190         struct obj_cgroup *objcg;
191         int ret;
192
193         objcg = kzalloc(sizeof(struct obj_cgroup), GFP_KERNEL);
194         if (!objcg)
195                 return NULL;
196
197         ret = percpu_ref_init(&objcg->refcnt, obj_cgroup_release, 0,
198                               GFP_KERNEL);
199         if (ret) {
200                 kfree(objcg);
201                 return NULL;
202         }
203         INIT_LIST_HEAD(&objcg->list);
204         return objcg;
205 }
206
207 static void memcg_reparent_objcgs(struct mem_cgroup *memcg,
208                                   struct mem_cgroup *parent)
209 {
210         struct obj_cgroup *objcg, *iter;
211
212         objcg = rcu_replace_pointer(memcg->objcg, NULL, true);
213
214         spin_lock_irq(&objcg_lock);
215
216         /* 1) Ready to reparent active objcg. */
217         list_add(&objcg->list, &memcg->objcg_list);
218         /* 2) Reparent active objcg and already reparented objcgs to parent. */
219         list_for_each_entry(iter, &memcg->objcg_list, list)
220                 WRITE_ONCE(iter->memcg, parent);
221         /* 3) Move already reparented objcgs to the parent's list */
222         list_splice(&memcg->objcg_list, &parent->objcg_list);
223
224         spin_unlock_irq(&objcg_lock);
225
226         percpu_ref_kill(&objcg->refcnt);
227 }
228
229 /*
230  * A lot of the calls to the cache allocation functions are expected to be
231  * inlined by the compiler. Since the calls to memcg_slab_post_alloc_hook() are
 232  * conditional on this static branch, we have to allow modules that do
 233  * kmem_cache_alloc() and the like to see this symbol as well.
234  */
235 DEFINE_STATIC_KEY_FALSE(memcg_kmem_online_key);
236 EXPORT_SYMBOL(memcg_kmem_online_key);
237
238 DEFINE_STATIC_KEY_FALSE(memcg_bpf_enabled_key);
239 EXPORT_SYMBOL(memcg_bpf_enabled_key);
240
241 /**
242  * mem_cgroup_css_from_folio - css of the memcg associated with a folio
243  * @folio: folio of interest
244  *
245  * If memcg is bound to the default hierarchy, css of the memcg associated
246  * with @folio is returned.  The returned css remains associated with @folio
247  * until it is released.
248  *
249  * If memcg is bound to a traditional hierarchy, the css of root_mem_cgroup
250  * is returned.
251  */
252 struct cgroup_subsys_state *mem_cgroup_css_from_folio(struct folio *folio)
253 {
254         struct mem_cgroup *memcg = folio_memcg(folio);
255
256         if (!memcg || !cgroup_subsys_on_dfl(memory_cgrp_subsys))
257                 memcg = root_mem_cgroup;
258
259         return &memcg->css;
260 }
261
262 /**
263  * page_cgroup_ino - return inode number of the memcg a page is charged to
264  * @page: the page
265  *
266  * Look up the closest online ancestor of the memory cgroup @page is charged to
267  * and return its inode number or 0 if @page is not charged to any cgroup. It
268  * is safe to call this function without holding a reference to @page.
269  *
270  * Note, this function is inherently racy, because there is nothing to prevent
271  * the cgroup inode from getting torn down and potentially reallocated a moment
272  * after page_cgroup_ino() returns, so it only should be used by callers that
273  * do not care (such as procfs interfaces).
274  */
275 ino_t page_cgroup_ino(struct page *page)
276 {
277         struct mem_cgroup *memcg;
278         unsigned long ino = 0;
279
280         rcu_read_lock();
281         /* page_folio() is racy here, but the entire function is racy anyway */
282         memcg = folio_memcg_check(page_folio(page));
283
284         while (memcg && !(memcg->css.flags & CSS_ONLINE))
285                 memcg = parent_mem_cgroup(memcg);
286         if (memcg)
287                 ino = cgroup_ino(memcg->css.cgroup);
288         rcu_read_unlock();
289         return ino;
290 }
291
292 /* Subset of node_stat_item for memcg stats */
293 static const unsigned int memcg_node_stat_items[] = {
294         NR_INACTIVE_ANON,
295         NR_ACTIVE_ANON,
296         NR_INACTIVE_FILE,
297         NR_ACTIVE_FILE,
298         NR_UNEVICTABLE,
299         NR_SLAB_RECLAIMABLE_B,
300         NR_SLAB_UNRECLAIMABLE_B,
301         WORKINGSET_REFAULT_ANON,
302         WORKINGSET_REFAULT_FILE,
303         WORKINGSET_ACTIVATE_ANON,
304         WORKINGSET_ACTIVATE_FILE,
305         WORKINGSET_RESTORE_ANON,
306         WORKINGSET_RESTORE_FILE,
307         WORKINGSET_NODERECLAIM,
308         NR_ANON_MAPPED,
309         NR_FILE_MAPPED,
310         NR_FILE_PAGES,
311         NR_FILE_DIRTY,
312         NR_WRITEBACK,
313         NR_SHMEM,
314         NR_SHMEM_THPS,
315         NR_FILE_THPS,
316         NR_ANON_THPS,
317         NR_KERNEL_STACK_KB,
318         NR_PAGETABLE,
319         NR_SECONDARY_PAGETABLE,
320 #ifdef CONFIG_SWAP
321         NR_SWAPCACHE,
322 #endif
323 #ifdef CONFIG_NUMA_BALANCING
324         PGPROMOTE_SUCCESS,
325 #endif
326         PGDEMOTE_KSWAPD,
327         PGDEMOTE_DIRECT,
328         PGDEMOTE_KHUGEPAGED,
329         PGDEMOTE_PROACTIVE,
330 #ifdef CONFIG_HUGETLB_PAGE
331         NR_HUGETLB,
332 #endif
333 };
334
335 static const unsigned int memcg_stat_items[] = {
336         MEMCG_SWAP,
337         MEMCG_SOCK,
338         MEMCG_PERCPU_B,
339         MEMCG_VMALLOC,
340         MEMCG_KMEM,
341         MEMCG_ZSWAP_B,
342         MEMCG_ZSWAPPED,
343 };
344
345 #define NR_MEMCG_NODE_STAT_ITEMS ARRAY_SIZE(memcg_node_stat_items)
346 #define MEMCG_VMSTAT_SIZE (NR_MEMCG_NODE_STAT_ITEMS + \
347                            ARRAY_SIZE(memcg_stat_items))
348 #define BAD_STAT_IDX(index) ((u32)(index) >= U8_MAX)
349 static u8 mem_cgroup_stats_index[MEMCG_NR_STAT] __read_mostly;
350
351 static void init_memcg_stats(void)
352 {
353         u8 i, j = 0;
354
355         BUILD_BUG_ON(MEMCG_NR_STAT >= U8_MAX);
356
357         memset(mem_cgroup_stats_index, U8_MAX, sizeof(mem_cgroup_stats_index));
358
359         for (i = 0; i < NR_MEMCG_NODE_STAT_ITEMS; ++i, ++j)
360                 mem_cgroup_stats_index[memcg_node_stat_items[i]] = j;
361
362         for (i = 0; i < ARRAY_SIZE(memcg_stat_items); ++i, ++j)
363                 mem_cgroup_stats_index[memcg_stat_items[i]] = j;
364 }
365
366 static inline int memcg_stats_index(int idx)
367 {
368         return mem_cgroup_stats_index[idx];
369 }
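/*
 * Editorial sketch (not part of the kernel source): the table built by
 * init_memcg_stats() maps the sparse stat item space onto a dense array
 * index, with U8_MAX marking items the memcg code does not track. Assuming
 * NR_DIRTIED is not in memcg_node_stat_items[] (as in the list above), a
 * caller would see:
 *
 *	int i = memcg_stats_index(NR_FILE_DIRTY);	// dense slot, < MEMCG_VMSTAT_SIZE
 *	int j = memcg_stats_index(NR_DIRTIED);		// untracked: U8_MAX, caught by BAD_STAT_IDX()
 */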
370
371 struct lruvec_stats_percpu {
372         /* Local (CPU and cgroup) state */
373         long state[NR_MEMCG_NODE_STAT_ITEMS];
374
375         /* Delta calculation for lockless upward propagation */
376         long state_prev[NR_MEMCG_NODE_STAT_ITEMS];
377 };
378
379 struct lruvec_stats {
380         /* Aggregated (CPU and subtree) state */
381         long state[NR_MEMCG_NODE_STAT_ITEMS];
382
383         /* Non-hierarchical (CPU aggregated) state */
384         long state_local[NR_MEMCG_NODE_STAT_ITEMS];
385
386         /* Pending child counts during tree propagation */
387         long state_pending[NR_MEMCG_NODE_STAT_ITEMS];
388 };
389
390 unsigned long lruvec_page_state(struct lruvec *lruvec, enum node_stat_item idx)
391 {
392         struct mem_cgroup_per_node *pn;
393         long x;
394         int i;
395
396         if (mem_cgroup_disabled())
397                 return node_page_state(lruvec_pgdat(lruvec), idx);
398
399         i = memcg_stats_index(idx);
400         if (WARN_ONCE(BAD_STAT_IDX(i), "%s: missing stat item %d\n", __func__, idx))
401                 return 0;
402
403         pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec);
404         x = READ_ONCE(pn->lruvec_stats->state[i]);
405 #ifdef CONFIG_SMP
406         if (x < 0)
407                 x = 0;
408 #endif
409         return x;
410 }
411
412 unsigned long lruvec_page_state_local(struct lruvec *lruvec,
413                                       enum node_stat_item idx)
414 {
415         struct mem_cgroup_per_node *pn;
416         long x;
417         int i;
418
419         if (mem_cgroup_disabled())
420                 return node_page_state(lruvec_pgdat(lruvec), idx);
421
422         i = memcg_stats_index(idx);
423         if (WARN_ONCE(BAD_STAT_IDX(i), "%s: missing stat item %d\n", __func__, idx))
424                 return 0;
425
426         pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec);
427         x = READ_ONCE(pn->lruvec_stats->state_local[i]);
428 #ifdef CONFIG_SMP
429         if (x < 0)
430                 x = 0;
431 #endif
432         return x;
433 }
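/*
 * Editorial note: the CONFIG_SMP clamp above exists because per-CPU deltas
 * are folded in without synchronization, so an aggregated counter can
 * transiently read negative even though the true value is non-negative;
 * reporting 0 in that window is preferable to a huge unsigned wrap in
 * callers that treat the result as unsigned.
 */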
434
435 /* Subset of vm_event_item to report for memcg event stats */
436 static const unsigned int memcg_vm_event_stat[] = {
437 #ifdef CONFIG_MEMCG_V1
438         PGPGIN,
439         PGPGOUT,
440 #endif
441         PSWPIN,
442         PSWPOUT,
443         PGSCAN_KSWAPD,
444         PGSCAN_DIRECT,
445         PGSCAN_KHUGEPAGED,
446         PGSCAN_PROACTIVE,
447         PGSTEAL_KSWAPD,
448         PGSTEAL_DIRECT,
449         PGSTEAL_KHUGEPAGED,
450         PGSTEAL_PROACTIVE,
451         PGFAULT,
452         PGMAJFAULT,
453         PGREFILL,
454         PGACTIVATE,
455         PGDEACTIVATE,
456         PGLAZYFREE,
457         PGLAZYFREED,
458 #ifdef CONFIG_SWAP
459         SWPIN_ZERO,
460         SWPOUT_ZERO,
461 #endif
462 #ifdef CONFIG_ZSWAP
463         ZSWPIN,
464         ZSWPOUT,
465         ZSWPWB,
466 #endif
467 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
468         THP_FAULT_ALLOC,
469         THP_COLLAPSE_ALLOC,
470         THP_SWPOUT,
471         THP_SWPOUT_FALLBACK,
472 #endif
473 #ifdef CONFIG_NUMA_BALANCING
474         NUMA_PAGE_MIGRATE,
475         NUMA_PTE_UPDATES,
476         NUMA_HINT_FAULTS,
477         NUMA_TASK_MIGRATE,
478         NUMA_TASK_SWAP,
479 #endif
480 };
481
482 #define NR_MEMCG_EVENTS ARRAY_SIZE(memcg_vm_event_stat)
483 static u8 mem_cgroup_events_index[NR_VM_EVENT_ITEMS] __read_mostly;
484
485 static void init_memcg_events(void)
486 {
487         u8 i;
488
489         BUILD_BUG_ON(NR_VM_EVENT_ITEMS >= U8_MAX);
490
491         memset(mem_cgroup_events_index, U8_MAX,
492                sizeof(mem_cgroup_events_index));
493
494         for (i = 0; i < NR_MEMCG_EVENTS; ++i)
495                 mem_cgroup_events_index[memcg_vm_event_stat[i]] = i;
496 }
497
498 static inline int memcg_events_index(enum vm_event_item idx)
499 {
500         return mem_cgroup_events_index[idx];
501 }
502
503 struct memcg_vmstats_percpu {
504         /* Stats updates since the last flush */
505         unsigned int                    stats_updates;
506
507         /* Cached pointers for fast iteration in memcg_rstat_updated() */
508         struct memcg_vmstats_percpu __percpu    *parent_pcpu;
509         struct memcg_vmstats                    *vmstats;
510
511         /* The above should fit a single cacheline for memcg_rstat_updated() */
512
513         /* Local (CPU and cgroup) page state & events */
514         long                    state[MEMCG_VMSTAT_SIZE];
515         unsigned long           events[NR_MEMCG_EVENTS];
516
517         /* Delta calculation for lockless upward propagation */
518         long                    state_prev[MEMCG_VMSTAT_SIZE];
519         unsigned long           events_prev[NR_MEMCG_EVENTS];
520 } ____cacheline_aligned;
521
522 struct memcg_vmstats {
523         /* Aggregated (CPU and subtree) page state & events */
524         long                    state[MEMCG_VMSTAT_SIZE];
525         unsigned long           events[NR_MEMCG_EVENTS];
526
527         /* Non-hierarchical (CPU aggregated) page state & events */
528         long                    state_local[MEMCG_VMSTAT_SIZE];
529         unsigned long           events_local[NR_MEMCG_EVENTS];
530
531         /* Pending child counts during tree propagation */
532         long                    state_pending[MEMCG_VMSTAT_SIZE];
533         unsigned long           events_pending[NR_MEMCG_EVENTS];
534
535         /* Stats updates since the last flush */
536         atomic_t                stats_updates;
537 };
538
539 /*
540  * memcg and lruvec stats flushing
541  *
 542  * Many codepaths leading to stats updates or reads are performance sensitive,
 543  * and adding stat flushing in such codepaths is not desirable. So, to optimize
 544  * flushing, the kernel does the following:
 545  *
 546  * 1) Periodically and asynchronously flush the stats every 2 seconds so that
 547  *    the rstat update tree does not grow unbounded.
 548  *
 549  * 2) Flush the stats synchronously on the reader side only when there are more
 550  *    than (MEMCG_CHARGE_BATCH * nr_cpus) update events. This can leave the stats
 551  *    out of sync by at most (MEMCG_CHARGE_BATCH * nr_cpus) events, but only for
 552  *    up to 2 seconds due to (1).
553  */
554 static void flush_memcg_stats_dwork(struct work_struct *w);
555 static DECLARE_DEFERRABLE_WORK(stats_flush_dwork, flush_memcg_stats_dwork);
556 static u64 flush_last_time;
557
558 #define FLUSH_TIME (2UL*HZ)
559
560 static bool memcg_vmstats_needs_flush(struct memcg_vmstats *vmstats)
561 {
562         return atomic_read(&vmstats->stats_updates) >
563                 MEMCG_CHARGE_BATCH * num_online_cpus();
564 }
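/*
 * Illustrative arithmetic (an editorial sketch, assuming MEMCG_CHARGE_BATCH
 * is 64 as in current kernels): with 8 online CPUs the reader-side threshold
 * is 64 * 8 = 512 pending update events; below that, readers skip the flush
 * and rely on the 2-second periodic flush described above.
 */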
565
566 static inline void memcg_rstat_updated(struct mem_cgroup *memcg, int val,
567                                        int cpu)
568 {
569         struct memcg_vmstats_percpu __percpu *statc_pcpu;
570         struct memcg_vmstats_percpu *statc;
571         unsigned int stats_updates;
572
573         if (!val)
574                 return;
575
576         /* TODO: add to cgroup update tree once it is nmi-safe. */
577         if (!in_nmi())
578                 css_rstat_updated(&memcg->css, cpu);
579         statc_pcpu = memcg->vmstats_percpu;
580         for (; statc_pcpu; statc_pcpu = statc->parent_pcpu) {
581                 statc = this_cpu_ptr(statc_pcpu);
582                 /*
583                  * If @memcg is already flushable then all its ancestors are
584                  * flushable as well and also there is no need to increase
585                  * stats_updates.
586                  */
587                 if (memcg_vmstats_needs_flush(statc->vmstats))
588                         break;
589
590                 stats_updates = this_cpu_add_return(statc_pcpu->stats_updates,
591                                                     abs(val));
592                 if (stats_updates < MEMCG_CHARGE_BATCH)
593                         continue;
594
595                 stats_updates = this_cpu_xchg(statc_pcpu->stats_updates, 0);
596                 atomic_add(stats_updates, &statc->vmstats->stats_updates);
597         }
598 }
599
600 static void __mem_cgroup_flush_stats(struct mem_cgroup *memcg, bool force)
601 {
602         bool needs_flush = memcg_vmstats_needs_flush(memcg->vmstats);
603
604         trace_memcg_flush_stats(memcg, atomic_read(&memcg->vmstats->stats_updates),
605                 force, needs_flush);
606
607         if (!force && !needs_flush)
608                 return;
609
610         if (mem_cgroup_is_root(memcg))
611                 WRITE_ONCE(flush_last_time, jiffies_64);
612
613         css_rstat_flush(&memcg->css);
614 }
615
616 /*
617  * mem_cgroup_flush_stats - flush the stats of a memory cgroup subtree
618  * @memcg: root of the subtree to flush
619  *
620  * Flushing is serialized by the underlying global rstat lock. There is also a
621  * minimum amount of work to be done even if there are no stat updates to flush.
622  * Hence, we only flush the stats if the updates delta exceeds a threshold. This
623  * avoids unnecessary work and contention on the underlying lock.
624  */
625 void mem_cgroup_flush_stats(struct mem_cgroup *memcg)
626 {
627         if (mem_cgroup_disabled())
628                 return;
629
630         if (!memcg)
631                 memcg = root_mem_cgroup;
632
633         __mem_cgroup_flush_stats(memcg, false);
634 }
635
636 void mem_cgroup_flush_stats_ratelimited(struct mem_cgroup *memcg)
637 {
638         /* Only flush if the periodic flusher is one full cycle late */
639         if (time_after64(jiffies_64, READ_ONCE(flush_last_time) + 2*FLUSH_TIME))
640                 mem_cgroup_flush_stats(memcg);
641 }
642
643 static void flush_memcg_stats_dwork(struct work_struct *w)
644 {
645         /*
646          * Deliberately ignore memcg_vmstats_needs_flush() here so that flushing
647          * in latency-sensitive paths is as cheap as possible.
648          */
649         __mem_cgroup_flush_stats(root_mem_cgroup, true);
650         queue_delayed_work(system_unbound_wq, &stats_flush_dwork, FLUSH_TIME);
651 }
652
653 unsigned long memcg_page_state(struct mem_cgroup *memcg, int idx)
654 {
655         long x;
656         int i = memcg_stats_index(idx);
657
658         if (WARN_ONCE(BAD_STAT_IDX(i), "%s: missing stat item %d\n", __func__, idx))
659                 return 0;
660
661         x = READ_ONCE(memcg->vmstats->state[i]);
662 #ifdef CONFIG_SMP
663         if (x < 0)
664                 x = 0;
665 #endif
666         return x;
667 }
668
669 static int memcg_page_state_unit(int item);
670
671 /*
672  * Normalize the value passed into memcg_rstat_updated() to be in pages. Round
673  * up non-zero sub-page updates to 1 page as zero page updates are ignored.
674  */
675 static int memcg_state_val_in_pages(int idx, int val)
676 {
677         int unit = memcg_page_state_unit(idx);
678
679         if (!val || unit == PAGE_SIZE)
680                 return val;
681         else
682                 return max(val * unit / PAGE_SIZE, 1UL);
683 }
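/*
 * Worked example (editorial, not in the original source): for a byte-counted
 * item such as NR_SLAB_RECLAIMABLE_B, a delta of +512 bytes normalizes to
 * 512 / PAGE_SIZE == 0 pages and is rounded up to 1, so small non-zero
 * updates still advance the flush threshold; a delta of 0 stays 0 and is
 * ignored by memcg_rstat_updated().
 */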
684
685 /**
686  * mod_memcg_state - update cgroup memory statistics
687  * @memcg: the memory cgroup
688  * @idx: the stat item - can be enum memcg_stat_item or enum node_stat_item
689  * @val: delta to add to the counter, can be negative
690  */
691 void mod_memcg_state(struct mem_cgroup *memcg, enum memcg_stat_item idx,
692                        int val)
693 {
694         int i = memcg_stats_index(idx);
695         int cpu;
696
697         if (mem_cgroup_disabled())
698                 return;
699
700         if (WARN_ONCE(BAD_STAT_IDX(i), "%s: missing stat item %d\n", __func__, idx))
701                 return;
702
703         cpu = get_cpu();
704
705         this_cpu_add(memcg->vmstats_percpu->state[i], val);
706         val = memcg_state_val_in_pages(idx, val);
707         memcg_rstat_updated(memcg, val, cpu);
708         trace_mod_memcg_state(memcg, idx, val);
709
710         put_cpu();
711 }
712
713 #ifdef CONFIG_MEMCG_V1
714 /* idx can be of type enum memcg_stat_item or node_stat_item. */
715 unsigned long memcg_page_state_local(struct mem_cgroup *memcg, int idx)
716 {
717         long x;
718         int i = memcg_stats_index(idx);
719
720         if (WARN_ONCE(BAD_STAT_IDX(i), "%s: missing stat item %d\n", __func__, idx))
721                 return 0;
722
723         x = READ_ONCE(memcg->vmstats->state_local[i]);
724 #ifdef CONFIG_SMP
725         if (x < 0)
726                 x = 0;
727 #endif
728         return x;
729 }
730 #endif
731
732 static void mod_memcg_lruvec_state(struct lruvec *lruvec,
733                                      enum node_stat_item idx,
734                                      int val)
735 {
736         struct mem_cgroup_per_node *pn;
737         struct mem_cgroup *memcg;
738         int i = memcg_stats_index(idx);
739         int cpu;
740
741         if (WARN_ONCE(BAD_STAT_IDX(i), "%s: missing stat item %d\n", __func__, idx))
742                 return;
743
744         pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec);
745         memcg = pn->memcg;
746
747         cpu = get_cpu();
748
749         /* Update memcg */
750         this_cpu_add(memcg->vmstats_percpu->state[i], val);
751
752         /* Update lruvec */
753         this_cpu_add(pn->lruvec_stats_percpu->state[i], val);
754
755         val = memcg_state_val_in_pages(idx, val);
756         memcg_rstat_updated(memcg, val, cpu);
757         trace_mod_memcg_lruvec_state(memcg, idx, val);
758
759         put_cpu();
760 }
761
762 /**
763  * __mod_lruvec_state - update lruvec memory statistics
764  * @lruvec: the lruvec
765  * @idx: the stat item
766  * @val: delta to add to the counter, can be negative
767  *
768  * The lruvec is the intersection of the NUMA node and a cgroup. This
 769  * function updates all three counters that are affected by a
770  * change of state at this level: per-node, per-cgroup, per-lruvec.
771  */
772 void __mod_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx,
773                         int val)
774 {
775         /* Update node */
776         __mod_node_page_state(lruvec_pgdat(lruvec), idx, val);
777
778         /* Update memcg and lruvec */
779         if (!mem_cgroup_disabled())
780                 mod_memcg_lruvec_state(lruvec, idx, val);
781 }
782
783 void __lruvec_stat_mod_folio(struct folio *folio, enum node_stat_item idx,
784                              int val)
785 {
786         struct mem_cgroup *memcg;
787         pg_data_t *pgdat = folio_pgdat(folio);
788         struct lruvec *lruvec;
789
790         rcu_read_lock();
791         memcg = folio_memcg(folio);
792         /* Untracked pages have no memcg, no lruvec. Update only the node */
793         if (!memcg) {
794                 rcu_read_unlock();
795                 __mod_node_page_state(pgdat, idx, val);
796                 return;
797         }
798
799         lruvec = mem_cgroup_lruvec(memcg, pgdat);
800         __mod_lruvec_state(lruvec, idx, val);
801         rcu_read_unlock();
802 }
803 EXPORT_SYMBOL(__lruvec_stat_mod_folio);
804
805 void __mod_lruvec_kmem_state(void *p, enum node_stat_item idx, int val)
806 {
807         pg_data_t *pgdat = page_pgdat(virt_to_page(p));
808         struct mem_cgroup *memcg;
809         struct lruvec *lruvec;
810
811         rcu_read_lock();
812         memcg = mem_cgroup_from_slab_obj(p);
813
814         /*
815          * Untracked pages have no memcg, no lruvec. Update only the
 816          * node. Since slab objects can be reparented to the root memcg,
 817          * we still need to update the per-memcg vmstats when freeing a
 818          * slab object, so that they stay correct for the root memcg.
819          */
820         if (!memcg) {
821                 __mod_node_page_state(pgdat, idx, val);
822         } else {
823                 lruvec = mem_cgroup_lruvec(memcg, pgdat);
824                 __mod_lruvec_state(lruvec, idx, val);
825         }
826         rcu_read_unlock();
827 }
828
829 /**
830  * count_memcg_events - account VM events in a cgroup
831  * @memcg: the memory cgroup
832  * @idx: the event item
833  * @count: the number of events that occurred
834  */
835 void count_memcg_events(struct mem_cgroup *memcg, enum vm_event_item idx,
836                           unsigned long count)
837 {
838         int i = memcg_events_index(idx);
839         int cpu;
840
841         if (mem_cgroup_disabled())
842                 return;
843
844         if (WARN_ONCE(BAD_STAT_IDX(i), "%s: missing stat item %d\n", __func__, idx))
845                 return;
846
847         cpu = get_cpu();
848
849         this_cpu_add(memcg->vmstats_percpu->events[i], count);
850         memcg_rstat_updated(memcg, count, cpu);
851         trace_count_memcg_events(memcg, idx, count);
852
853         put_cpu();
854 }
855
856 unsigned long memcg_events(struct mem_cgroup *memcg, int event)
857 {
858         int i = memcg_events_index(event);
859
860         if (WARN_ONCE(BAD_STAT_IDX(i), "%s: missing stat item %d\n", __func__, event))
861                 return 0;
862
863         return READ_ONCE(memcg->vmstats->events[i]);
864 }
865
866 #ifdef CONFIG_MEMCG_V1
867 unsigned long memcg_events_local(struct mem_cgroup *memcg, int event)
868 {
869         int i = memcg_events_index(event);
870
871         if (WARN_ONCE(BAD_STAT_IDX(i), "%s: missing stat item %d\n", __func__, event))
872                 return 0;
873
874         return READ_ONCE(memcg->vmstats->events_local[i]);
875 }
876 #endif
877
878 struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p)
879 {
880         /*
881          * mm_update_next_owner() may clear mm->owner to NULL
882          * if it races with swapoff, page migration, etc.
883          * So this can be called with p == NULL.
884          */
885         if (unlikely(!p))
886                 return NULL;
887
888         return mem_cgroup_from_css(task_css(p, memory_cgrp_id));
889 }
890 EXPORT_SYMBOL(mem_cgroup_from_task);
891
892 static __always_inline struct mem_cgroup *active_memcg(void)
893 {
894         if (!in_task())
895                 return this_cpu_read(int_active_memcg);
896         else
897                 return current->active_memcg;
898 }
899
900 /**
901  * get_mem_cgroup_from_mm: Obtain a reference on given mm_struct's memcg.
902  * @mm: mm from which memcg should be extracted. It can be NULL.
903  *
 904  * Obtains a reference on mm->memcg and returns it if successful. If @mm
905  * is NULL, then the memcg is chosen as follows:
906  * 1) The active memcg, if set.
907  * 2) current->mm->memcg, if available
908  * 3) root memcg
909  * If mem_cgroup is disabled, NULL is returned.
910  */
911 struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm)
912 {
913         struct mem_cgroup *memcg;
914
915         if (mem_cgroup_disabled())
916                 return NULL;
917
918         /*
919          * Page cache insertions can happen without an
920          * actual mm context, e.g. during disk probing
921          * on boot, loopback IO, acct() writes etc.
922          *
923          * No need to css_get on root memcg as the reference
924          * counting is disabled on the root level in the
925          * cgroup core. See CSS_NO_REF.
926          */
927         if (unlikely(!mm)) {
928                 memcg = active_memcg();
929                 if (unlikely(memcg)) {
930                         /* remote memcg must hold a ref */
931                         css_get(&memcg->css);
932                         return memcg;
933                 }
934                 mm = current->mm;
935                 if (unlikely(!mm))
936                         return root_mem_cgroup;
937         }
938
939         rcu_read_lock();
940         do {
941                 memcg = mem_cgroup_from_task(rcu_dereference(mm->owner));
942                 if (unlikely(!memcg))
943                         memcg = root_mem_cgroup;
944         } while (!css_tryget(&memcg->css));
945         rcu_read_unlock();
946         return memcg;
947 }
948 EXPORT_SYMBOL(get_mem_cgroup_from_mm);
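/*
 * Minimal usage sketch (editorial, assuming a caller that only needs a
 * temporary reference): the returned css reference must be dropped once the
 * caller is done with the memcg.
 *
 *	struct mem_cgroup *memcg = get_mem_cgroup_from_mm(current->mm);
 *
 *	// ... charge against or inspect the memcg ...
 *	mem_cgroup_put(memcg);	// also handles the NULL (memcg disabled) case
 */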
949
950 /**
951  * get_mem_cgroup_from_current - Obtain a reference on current task's memcg.
952  */
953 struct mem_cgroup *get_mem_cgroup_from_current(void)
954 {
955         struct mem_cgroup *memcg;
956
957         if (mem_cgroup_disabled())
958                 return NULL;
959
960 again:
961         rcu_read_lock();
962         memcg = mem_cgroup_from_task(current);
963         if (!css_tryget(&memcg->css)) {
964                 rcu_read_unlock();
965                 goto again;
966         }
967         rcu_read_unlock();
968         return memcg;
969 }
970
971 /**
972  * get_mem_cgroup_from_folio - Obtain a reference on a given folio's memcg.
973  * @folio: folio from which memcg should be extracted.
974  */
975 struct mem_cgroup *get_mem_cgroup_from_folio(struct folio *folio)
976 {
977         struct mem_cgroup *memcg = folio_memcg(folio);
978
979         if (mem_cgroup_disabled())
980                 return NULL;
981
982         rcu_read_lock();
983         if (!memcg || WARN_ON_ONCE(!css_tryget(&memcg->css)))
984                 memcg = root_mem_cgroup;
985         rcu_read_unlock();
986         return memcg;
987 }
988
989 /**
990  * mem_cgroup_iter - iterate over memory cgroup hierarchy
991  * @root: hierarchy root
992  * @prev: previously returned memcg, NULL on first invocation
993  * @reclaim: cookie for shared reclaim walks, NULL for full walks
994  *
995  * Returns references to children of the hierarchy below @root, or
996  * @root itself, or %NULL after a full round-trip.
997  *
998  * Caller must pass the return value in @prev on subsequent
999  * invocations for reference counting, or use mem_cgroup_iter_break()
1000  * to cancel a hierarchy walk before the round-trip is complete.
1001  *
1002  * Reclaimers can specify a node in @reclaim to divide up the memcgs
1003  * in the hierarchy among all concurrent reclaimers operating on the
1004  * same node.
1005  */
1006 struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
1007                                    struct mem_cgroup *prev,
1008                                    struct mem_cgroup_reclaim_cookie *reclaim)
1009 {
1010         struct mem_cgroup_reclaim_iter *iter;
1011         struct cgroup_subsys_state *css;
1012         struct mem_cgroup *pos;
1013         struct mem_cgroup *next;
1014
1015         if (mem_cgroup_disabled())
1016                 return NULL;
1017
1018         if (!root)
1019                 root = root_mem_cgroup;
1020
1021         rcu_read_lock();
1022 restart:
1023         next = NULL;
1024
1025         if (reclaim) {
1026                 int gen;
1027                 int nid = reclaim->pgdat->node_id;
1028
1029                 iter = &root->nodeinfo[nid]->iter;
1030                 gen = atomic_read(&iter->generation);
1031
1032                 /*
1033                  * On start, join the current reclaim iteration cycle.
1034                  * Exit when a concurrent walker completes it.
1035                  */
1036                 if (!prev)
1037                         reclaim->generation = gen;
1038                 else if (reclaim->generation != gen)
1039                         goto out_unlock;
1040
1041                 pos = READ_ONCE(iter->position);
1042         } else
1043                 pos = prev;
1044
1045         css = pos ? &pos->css : NULL;
1046
1047         while ((css = css_next_descendant_pre(css, &root->css))) {
1048                 /*
1049                  * Verify the css and acquire a reference.  The root
1050                  * is provided by the caller, so we know it's alive
1051                  * and kicking, and don't take an extra reference.
1052                  */
1053                 if (css == &root->css || css_tryget(css))
1054                         break;
1055         }
1056
1057         next = mem_cgroup_from_css(css);
1058
1059         if (reclaim) {
1060                 /*
1061                  * The position could have already been updated by a competing
1062                  * thread, so check that the value hasn't changed since we read
1063                  * it to avoid reclaiming from the same cgroup twice.
1064                  */
1065                 if (cmpxchg(&iter->position, pos, next) != pos) {
1066                         if (css && css != &root->css)
1067                                 css_put(css);
1068                         goto restart;
1069                 }
1070
1071                 if (!next) {
1072                         atomic_inc(&iter->generation);
1073
1074                         /*
1075                          * Reclaimers share the hierarchy walk, and a
1076                          * new one might jump in right at the end of
1077                          * the hierarchy - make sure they see at least
1078                          * one group and restart from the beginning.
1079                          */
1080                         if (!prev)
1081                                 goto restart;
1082                 }
1083         }
1084
1085 out_unlock:
1086         rcu_read_unlock();
1087         if (prev && prev != root)
1088                 css_put(&prev->css);
1089
1090         return next;
1091 }
1092
1093 /**
1094  * mem_cgroup_iter_break - abort a hierarchy walk prematurely
1095  * @root: hierarchy root
1096  * @prev: last visited hierarchy member as returned by mem_cgroup_iter()
1097  */
1098 void mem_cgroup_iter_break(struct mem_cgroup *root,
1099                            struct mem_cgroup *prev)
1100 {
1101         if (!root)
1102                 root = root_mem_cgroup;
1103         if (prev && prev != root)
1104                 css_put(&prev->css);
1105 }
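/*
 * Canonical walk sketch (editorial): callers thread the previous return value
 * back in as @prev, and use mem_cgroup_iter_break() to drop the reference when
 * bailing out early.
 *
 *	struct mem_cgroup *iter = NULL;
 *
 *	while ((iter = mem_cgroup_iter(root, iter, NULL))) {
 *		if (some_stop_condition) {	// hypothetical predicate
 *			mem_cgroup_iter_break(root, iter);
 *			break;
 *		}
 *	}
 */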
1106
1107 static void __invalidate_reclaim_iterators(struct mem_cgroup *from,
1108                                         struct mem_cgroup *dead_memcg)
1109 {
1110         struct mem_cgroup_reclaim_iter *iter;
1111         struct mem_cgroup_per_node *mz;
1112         int nid;
1113
1114         for_each_node(nid) {
1115                 mz = from->nodeinfo[nid];
1116                 iter = &mz->iter;
1117                 cmpxchg(&iter->position, dead_memcg, NULL);
1118         }
1119 }
1120
1121 static void invalidate_reclaim_iterators(struct mem_cgroup *dead_memcg)
1122 {
1123         struct mem_cgroup *memcg = dead_memcg;
1124         struct mem_cgroup *last;
1125
1126         do {
1127                 __invalidate_reclaim_iterators(memcg, dead_memcg);
1128                 last = memcg;
1129         } while ((memcg = parent_mem_cgroup(memcg)));
1130
1131         /*
1132          * When cgroup1 non-hierarchy mode is used,
1133          * parent_mem_cgroup() does not walk all the way up to the
1134          * cgroup root (root_mem_cgroup). So we have to handle
1135          * dead_memcg from cgroup root separately.
1136          */
1137         if (!mem_cgroup_is_root(last))
1138                 __invalidate_reclaim_iterators(root_mem_cgroup,
1139                                                 dead_memcg);
1140 }
1141
1142 /**
1143  * mem_cgroup_scan_tasks - iterate over tasks of a memory cgroup hierarchy
1144  * @memcg: hierarchy root
1145  * @fn: function to call for each task
1146  * @arg: argument passed to @fn
1147  *
1148  * This function iterates over tasks attached to @memcg or to any of its
1149  * descendants and calls @fn for each task. If @fn returns a non-zero
1150  * value, the function breaks the iteration loop. Otherwise, it will iterate
1151  * over all tasks and return 0.
1152  *
1153  * This function must not be called for the root memory cgroup.
1154  */
1155 void mem_cgroup_scan_tasks(struct mem_cgroup *memcg,
1156                            int (*fn)(struct task_struct *, void *), void *arg)
1157 {
1158         struct mem_cgroup *iter;
1159         int ret = 0;
1160
1161         BUG_ON(mem_cgroup_is_root(memcg));
1162
1163         for_each_mem_cgroup_tree(iter, memcg) {
1164                 struct css_task_iter it;
1165                 struct task_struct *task;
1166
1167                 css_task_iter_start(&iter->css, CSS_TASK_ITER_PROCS, &it);
1168                 while (!ret && (task = css_task_iter_next(&it))) {
1169                         ret = fn(task, arg);
1170                         /* Avoid potential softlockup warning */
1171                         cond_resched();
1172                 }
1173                 css_task_iter_end(&it);
1174                 if (ret) {
1175                         mem_cgroup_iter_break(memcg, iter);
1176                         break;
1177                 }
1178         }
1179 }
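/*
 * Callback sketch (editorial, hypothetical helper): @fn returning non-zero
 * stops the walk early, e.g.:
 *
 *	static int count_one_task(struct task_struct *task, void *arg)
 *	{
 *		unsigned int *count = arg;
 *
 *		(*count)++;
 *		return 0;	// keep iterating over every task
 *	}
 *
 *	unsigned int count = 0;
 *
 *	mem_cgroup_scan_tasks(memcg, count_one_task, &count);
 */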
1180
1181 #ifdef CONFIG_DEBUG_VM
1182 void lruvec_memcg_debug(struct lruvec *lruvec, struct folio *folio)
1183 {
1184         struct mem_cgroup *memcg;
1185
1186         if (mem_cgroup_disabled())
1187                 return;
1188
1189         memcg = folio_memcg(folio);
1190
1191         if (!memcg)
1192                 VM_BUG_ON_FOLIO(!mem_cgroup_is_root(lruvec_memcg(lruvec)), folio);
1193         else
1194                 VM_BUG_ON_FOLIO(lruvec_memcg(lruvec) != memcg, folio);
1195 }
1196 #endif
1197
1198 /**
1199  * folio_lruvec_lock - Lock the lruvec for a folio.
1200  * @folio: Pointer to the folio.
1201  *
1202  * These functions are safe to use under any of the following conditions:
1203  * - folio locked
1204  * - folio_test_lru false
1205  * - folio frozen (refcount of 0)
1206  *
1207  * Return: The lruvec this folio is on with its lock held.
1208  */
1209 struct lruvec *folio_lruvec_lock(struct folio *folio)
1210 {
1211         struct lruvec *lruvec = folio_lruvec(folio);
1212
1213         spin_lock(&lruvec->lru_lock);
1214         lruvec_memcg_debug(lruvec, folio);
1215
1216         return lruvec;
1217 }
1218
1219 /**
1220  * folio_lruvec_lock_irq - Lock the lruvec for a folio.
1221  * @folio: Pointer to the folio.
1222  *
1223  * These functions are safe to use under any of the following conditions:
1224  * - folio locked
1225  * - folio_test_lru false
1226  * - folio frozen (refcount of 0)
1227  *
1228  * Return: The lruvec this folio is on with its lock held and interrupts
1229  * disabled.
1230  */
1231 struct lruvec *folio_lruvec_lock_irq(struct folio *folio)
1232 {
1233         struct lruvec *lruvec = folio_lruvec(folio);
1234
1235         spin_lock_irq(&lruvec->lru_lock);
1236         lruvec_memcg_debug(lruvec, folio);
1237
1238         return lruvec;
1239 }
1240
1241 /**
1242  * folio_lruvec_lock_irqsave - Lock the lruvec for a folio.
1243  * @folio: Pointer to the folio.
1244  * @flags: Pointer to irqsave flags.
1245  *
1246  * These functions are safe to use under any of the following conditions:
1247  * - folio locked
1248  * - folio_test_lru false
1249  * - folio frozen (refcount of 0)
1250  *
1251  * Return: The lruvec this folio is on with its lock held and interrupts
1252  * disabled.
1253  */
1254 struct lruvec *folio_lruvec_lock_irqsave(struct folio *folio,
1255                 unsigned long *flags)
1256 {
1257         struct lruvec *lruvec = folio_lruvec(folio);
1258
1259         spin_lock_irqsave(&lruvec->lru_lock, *flags);
1260         lruvec_memcg_debug(lruvec, folio);
1261
1262         return lruvec;
1263 }
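/*
 * Pairing sketch (editorial): the irqsave variant preserves the caller's
 * interrupt state; the caller unlocks the returned lruvec directly, e.g.:
 *
 *	unsigned long flags;
 *	struct lruvec *lruvec = folio_lruvec_lock_irqsave(folio, &flags);
 *
 *	// ... move the folio between LRU lists ...
 *	spin_unlock_irqrestore(&lruvec->lru_lock, flags);
 */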
1264
1265 /**
1266  * mem_cgroup_update_lru_size - account for adding or removing an lru page
1267  * @lruvec: mem_cgroup per zone lru vector
1268  * @lru: index of lru list the page is sitting on
1269  * @zid: zone id of the accounted pages
1270  * @nr_pages: positive when adding or negative when removing
1271  *
1272  * This function must be called under lru_lock, just before a page is added
1273  * to or just after a page is removed from an lru list.
1274  */
1275 void mem_cgroup_update_lru_size(struct lruvec *lruvec, enum lru_list lru,
1276                                 int zid, int nr_pages)
1277 {
1278         struct mem_cgroup_per_node *mz;
1279         unsigned long *lru_size;
1280         long size;
1281
1282         if (mem_cgroup_disabled())
1283                 return;
1284
1285         mz = container_of(lruvec, struct mem_cgroup_per_node, lruvec);
1286         lru_size = &mz->lru_zone_size[zid][lru];
1287
1288         if (nr_pages < 0)
1289                 *lru_size += nr_pages;
1290
1291         size = *lru_size;
1292         if (WARN_ONCE(size < 0,
1293                 "%s(%p, %d, %d): lru_size %ld\n",
1294                 __func__, lruvec, lru, nr_pages, size)) {
1295                 VM_BUG_ON(1);
1296                 *lru_size = 0;
1297         }
1298
1299         if (nr_pages > 0)
1300                 *lru_size += nr_pages;
1301 }
1302
1303 /**
1304  * mem_cgroup_margin - calculate chargeable space of a memory cgroup
1305  * @memcg: the memory cgroup
1306  *
 1307  * Returns the maximum amount of memory @memcg can be charged with, in
1308  * pages.
1309  */
1310 static unsigned long mem_cgroup_margin(struct mem_cgroup *memcg)
1311 {
1312         unsigned long margin = 0;
1313         unsigned long count;
1314         unsigned long limit;
1315
1316         count = page_counter_read(&memcg->memory);
1317         limit = READ_ONCE(memcg->memory.max);
1318         if (count < limit)
1319                 margin = limit - count;
1320
1321         if (do_memsw_account()) {
1322                 count = page_counter_read(&memcg->memsw);
1323                 limit = READ_ONCE(memcg->memsw.max);
1324                 if (count < limit)
1325                         margin = min(margin, limit - count);
1326                 else
1327                         margin = 0;
1328         }
1329
1330         return margin;
1331 }
1332
1333 struct memory_stat {
1334         const char *name;
1335         unsigned int idx;
1336 };
1337
1338 static const struct memory_stat memory_stats[] = {
1339         { "anon",                       NR_ANON_MAPPED                  },
1340         { "file",                       NR_FILE_PAGES                   },
1341         { "kernel",                     MEMCG_KMEM                      },
1342         { "kernel_stack",               NR_KERNEL_STACK_KB              },
1343         { "pagetables",                 NR_PAGETABLE                    },
1344         { "sec_pagetables",             NR_SECONDARY_PAGETABLE          },
1345         { "percpu",                     MEMCG_PERCPU_B                  },
1346         { "sock",                       MEMCG_SOCK                      },
1347         { "vmalloc",                    MEMCG_VMALLOC                   },
1348         { "shmem",                      NR_SHMEM                        },
1349 #ifdef CONFIG_ZSWAP
1350         { "zswap",                      MEMCG_ZSWAP_B                   },
1351         { "zswapped",                   MEMCG_ZSWAPPED                  },
1352 #endif
1353         { "file_mapped",                NR_FILE_MAPPED                  },
1354         { "file_dirty",                 NR_FILE_DIRTY                   },
1355         { "file_writeback",             NR_WRITEBACK                    },
1356 #ifdef CONFIG_SWAP
1357         { "swapcached",                 NR_SWAPCACHE                    },
1358 #endif
1359 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
1360         { "anon_thp",                   NR_ANON_THPS                    },
1361         { "file_thp",                   NR_FILE_THPS                    },
1362         { "shmem_thp",                  NR_SHMEM_THPS                   },
1363 #endif
1364         { "inactive_anon",              NR_INACTIVE_ANON                },
1365         { "active_anon",                NR_ACTIVE_ANON                  },
1366         { "inactive_file",              NR_INACTIVE_FILE                },
1367         { "active_file",                NR_ACTIVE_FILE                  },
1368         { "unevictable",                NR_UNEVICTABLE                  },
1369         { "slab_reclaimable",           NR_SLAB_RECLAIMABLE_B           },
1370         { "slab_unreclaimable",         NR_SLAB_UNRECLAIMABLE_B         },
1371 #ifdef CONFIG_HUGETLB_PAGE
1372         { "hugetlb",                    NR_HUGETLB                      },
1373 #endif
1374
1375         /* The memory events */
1376         { "workingset_refault_anon",    WORKINGSET_REFAULT_ANON         },
1377         { "workingset_refault_file",    WORKINGSET_REFAULT_FILE         },
1378         { "workingset_activate_anon",   WORKINGSET_ACTIVATE_ANON        },
1379         { "workingset_activate_file",   WORKINGSET_ACTIVATE_FILE        },
1380         { "workingset_restore_anon",    WORKINGSET_RESTORE_ANON         },
1381         { "workingset_restore_file",    WORKINGSET_RESTORE_FILE         },
1382         { "workingset_nodereclaim",     WORKINGSET_NODERECLAIM          },
1383
1384         { "pgdemote_kswapd",            PGDEMOTE_KSWAPD         },
1385         { "pgdemote_direct",            PGDEMOTE_DIRECT         },
1386         { "pgdemote_khugepaged",        PGDEMOTE_KHUGEPAGED     },
1387         { "pgdemote_proactive",         PGDEMOTE_PROACTIVE      },
1388 #ifdef CONFIG_NUMA_BALANCING
1389         { "pgpromote_success",          PGPROMOTE_SUCCESS       },
1390 #endif
1391 };
1392
1393 /* The actual unit of the state item, not the same as the output unit */
1394 static int memcg_page_state_unit(int item)
1395 {
1396         switch (item) {
1397         case MEMCG_PERCPU_B:
1398         case MEMCG_ZSWAP_B:
1399         case NR_SLAB_RECLAIMABLE_B:
1400         case NR_SLAB_UNRECLAIMABLE_B:
1401                 return 1;
1402         case NR_KERNEL_STACK_KB:
1403                 return SZ_1K;
1404         default:
1405                 return PAGE_SIZE;
1406         }
1407 }
1408
1409 /* Translate stat items to the correct unit for memory.stat output */
1410 static int memcg_page_state_output_unit(int item)
1411 {
1412         /*
1413          * Workingset state is actually in pages, but we export it to userspace
1414          * as a scalar count of events, so special case it here.
1415          *
1416          * Demotion and promotion activities are exported in pages, consistent
1417          * with their global counterparts.
1418          */
1419         switch (item) {
1420         case WORKINGSET_REFAULT_ANON:
1421         case WORKINGSET_REFAULT_FILE:
1422         case WORKINGSET_ACTIVATE_ANON:
1423         case WORKINGSET_ACTIVATE_FILE:
1424         case WORKINGSET_RESTORE_ANON:
1425         case WORKINGSET_RESTORE_FILE:
1426         case WORKINGSET_NODERECLAIM:
1427         case PGDEMOTE_KSWAPD:
1428         case PGDEMOTE_DIRECT:
1429         case PGDEMOTE_KHUGEPAGED:
1430         case PGDEMOTE_PROACTIVE:
1431 #ifdef CONFIG_NUMA_BALANCING
1432         case PGPROMOTE_SUCCESS:
1433 #endif
1434                 return 1;
1435         default:
1436                 return memcg_page_state_unit(item);
1437         }
1438 }
1439
1440 unsigned long memcg_page_state_output(struct mem_cgroup *memcg, int item)
1441 {
1442         return memcg_page_state(memcg, item) *
1443                 memcg_page_state_output_unit(item);
1444 }
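/*
 * Unit handling example (editorial): NR_KERNEL_STACK_KB is stored in KiB, so
 * memcg_page_state_output() multiplies it by SZ_1K (e.g. 16 KiB -> 16384
 * bytes); slab and zswap byte counters have unit 1 and are reported as-is;
 * most other items are stored in pages and are multiplied by PAGE_SIZE.
 */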
1445
1446 #ifdef CONFIG_MEMCG_V1
1447 unsigned long memcg_page_state_local_output(struct mem_cgroup *memcg, int item)
1448 {
1449         return memcg_page_state_local(memcg, item) *
1450                 memcg_page_state_output_unit(item);
1451 }
1452 #endif
1453
1454 #ifdef CONFIG_HUGETLB_PAGE
1455 static bool memcg_accounts_hugetlb(void)
1456 {
1457         return cgrp_dfl_root.flags & CGRP_ROOT_MEMORY_HUGETLB_ACCOUNTING;
1458 }
1459 #else /* CONFIG_HUGETLB_PAGE */
1460 static bool memcg_accounts_hugetlb(void)
1461 {
1462         return false;
1463 }
1464 #endif /* CONFIG_HUGETLB_PAGE */
1465
1466 static void memcg_stat_format(struct mem_cgroup *memcg, struct seq_buf *s)
1467 {
1468         int i;
1469
1470         /*
1471          * Provide statistics on the state of the memory subsystem as
1472          * well as cumulative event counters that show past behavior.
1473          *
1474          * This list is ordered following a combination of these gradients:
1475          * 1) generic big picture -> specifics and details
1476          * 2) reflecting userspace activity -> reflecting kernel heuristics
1477          *
1478          * Current memory state:
1479          */
1480         mem_cgroup_flush_stats(memcg);
1481
1482         for (i = 0; i < ARRAY_SIZE(memory_stats); i++) {
1483                 u64 size;
1484
1485 #ifdef CONFIG_HUGETLB_PAGE
1486                 if (unlikely(memory_stats[i].idx == NR_HUGETLB) &&
1487                         !memcg_accounts_hugetlb())
1488                         continue;
1489 #endif
1490                 size = memcg_page_state_output(memcg, memory_stats[i].idx);
1491                 seq_buf_printf(s, "%s %llu\n", memory_stats[i].name, size);
1492
1493                 if (unlikely(memory_stats[i].idx == NR_SLAB_UNRECLAIMABLE_B)) {
1494                         size += memcg_page_state_output(memcg,
1495                                                         NR_SLAB_RECLAIMABLE_B);
1496                         seq_buf_printf(s, "slab %llu\n", size);
1497                 }
1498         }
1499
1500         /* Accumulated memory events */
1501         seq_buf_printf(s, "pgscan %lu\n",
1502                        memcg_events(memcg, PGSCAN_KSWAPD) +
1503                        memcg_events(memcg, PGSCAN_DIRECT) +
1504                        memcg_events(memcg, PGSCAN_PROACTIVE) +
1505                        memcg_events(memcg, PGSCAN_KHUGEPAGED));
1506         seq_buf_printf(s, "pgsteal %lu\n",
1507                        memcg_events(memcg, PGSTEAL_KSWAPD) +
1508                        memcg_events(memcg, PGSTEAL_DIRECT) +
1509                        memcg_events(memcg, PGSTEAL_PROACTIVE) +
1510                        memcg_events(memcg, PGSTEAL_KHUGEPAGED));
1511
1512         for (i = 0; i < ARRAY_SIZE(memcg_vm_event_stat); i++) {
1513 #ifdef CONFIG_MEMCG_V1
1514                 if (memcg_vm_event_stat[i] == PGPGIN ||
1515                     memcg_vm_event_stat[i] == PGPGOUT)
1516                         continue;
1517 #endif
1518                 seq_buf_printf(s, "%s %lu\n",
1519                                vm_event_name(memcg_vm_event_stat[i]),
1520                                memcg_events(memcg, memcg_vm_event_stat[i]));
1521         }
1522 }
1523
1524 static void memory_stat_format(struct mem_cgroup *memcg, struct seq_buf *s)
1525 {
1526         if (cgroup_subsys_on_dfl(memory_cgrp_subsys))
1527                 memcg_stat_format(memcg, s);
1528         else
1529                 memcg1_stat_format(memcg, s);
1530         if (seq_buf_has_overflowed(s))
1531                 pr_warn("%s: Warning, stat buffer overflow, please report\n", __func__);
1532 }
1533
1534 /**
1535  * mem_cgroup_print_oom_context: Print OOM information relevant to
1536  * memory controller.
1537  * @memcg: The memory cgroup that went over limit
1538  * @p: Task that is going to be killed
1539  *
1540  * NOTE: @memcg and @p's mem_cgroup can be different when hierarchy is
1541  * enabled
1542  */
1543 void mem_cgroup_print_oom_context(struct mem_cgroup *memcg, struct task_struct *p)
1544 {
1545         rcu_read_lock();
1546
1547         if (memcg) {
1548                 pr_cont(",oom_memcg=");
1549                 pr_cont_cgroup_path(memcg->css.cgroup);
1550         } else
1551                 pr_cont(",global_oom");
1552         if (p) {
1553                 pr_cont(",task_memcg=");
1554                 pr_cont_cgroup_path(task_cgroup(p, memory_cgrp_id));
1555         }
1556         rcu_read_unlock();
1557 }
1558
1559 /**
1560  * mem_cgroup_print_oom_meminfo: Print OOM memory information relevant to
1561  * memory controller.
1562  * @memcg: The memory cgroup that went over limit
1563  */
1564 void mem_cgroup_print_oom_meminfo(struct mem_cgroup *memcg)
1565 {
 1566         /* Use a static buffer; the caller holds oom_lock, which serializes access. */
1567         static char buf[SEQ_BUF_SIZE];
1568         struct seq_buf s;
1569         unsigned long memory_failcnt;
1570
1571         lockdep_assert_held(&oom_lock);
1572
1573         if (cgroup_subsys_on_dfl(memory_cgrp_subsys))
1574                 memory_failcnt = atomic_long_read(&memcg->memory_events[MEMCG_MAX]);
1575         else
1576                 memory_failcnt = memcg->memory.failcnt;
1577
1578         pr_info("memory: usage %llukB, limit %llukB, failcnt %lu\n",
1579                 K((u64)page_counter_read(&memcg->memory)),
1580                 K((u64)READ_ONCE(memcg->memory.max)), memory_failcnt);
1581         if (cgroup_subsys_on_dfl(memory_cgrp_subsys))
1582                 pr_info("swap: usage %llukB, limit %llukB, failcnt %lu\n",
1583                         K((u64)page_counter_read(&memcg->swap)),
1584                         K((u64)READ_ONCE(memcg->swap.max)),
1585                         atomic_long_read(&memcg->memory_events[MEMCG_SWAP_MAX]));
1586 #ifdef CONFIG_MEMCG_V1
1587         else {
1588                 pr_info("memory+swap: usage %llukB, limit %llukB, failcnt %lu\n",
1589                         K((u64)page_counter_read(&memcg->memsw)),
1590                         K((u64)memcg->memsw.max), memcg->memsw.failcnt);
1591                 pr_info("kmem: usage %llukB, limit %llukB, failcnt %lu\n",
1592                         K((u64)page_counter_read(&memcg->kmem)),
1593                         K((u64)memcg->kmem.max), memcg->kmem.failcnt);
1594         }
1595 #endif
1596
1597         pr_info("Memory cgroup stats for ");
1598         pr_cont_cgroup_path(memcg->css.cgroup);
1599         pr_cont(":");
1600         seq_buf_init(&s, buf, SEQ_BUF_SIZE);
1601         memory_stat_format(memcg, &s);
1602         seq_buf_do_printk(&s, KERN_INFO);
1603 }
1604
1605 /*
1606  * Return the memory (and swap, if configured) limit for a memcg.
1607  */
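/*
 * For example (v1 memsw accounting with swappiness > 0): with memory.max at
 * 1G, memsw.max at 1.5G and plenty of swap configured, the swap excess is
 * 0.5G, so the reported maximum is 1G + 0.5G = 1.5G worth of pages.
 */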
1608 unsigned long mem_cgroup_get_max(struct mem_cgroup *memcg)
1609 {
1610         unsigned long max = READ_ONCE(memcg->memory.max);
1611
1612         if (do_memsw_account()) {
1613                 if (mem_cgroup_swappiness(memcg)) {
1614                         /* Calculate swap excess capacity from memsw limit */
1615                         unsigned long swap = READ_ONCE(memcg->memsw.max) - max;
1616
1617                         max += min(swap, (unsigned long)total_swap_pages);
1618                 }
1619         } else {
1620                 if (mem_cgroup_swappiness(memcg))
1621                         max += min(READ_ONCE(memcg->swap.max),
1622                                    (unsigned long)total_swap_pages);
1623         }
1624         return max;
1625 }
1626
1627 unsigned long mem_cgroup_size(struct mem_cgroup *memcg)
1628 {
1629         return page_counter_read(&memcg->memory);
1630 }
1631
1632 static bool mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
1633                                      int order)
1634 {
1635         struct oom_control oc = {
1636                 .zonelist = NULL,
1637                 .nodemask = NULL,
1638                 .memcg = memcg,
1639                 .gfp_mask = gfp_mask,
1640                 .order = order,
1641         };
1642         bool ret = true;
1643
1644         if (mutex_lock_killable(&oom_lock))
1645                 return true;
1646
1647         if (mem_cgroup_margin(memcg) >= (1 << order))
1648                 goto unlock;
1649
1650         /*
1651          * A few threads which were not waiting at mutex_lock_killable() can
1652          * fail to bail out. Therefore, check again after holding oom_lock.
1653          */
1654         ret = out_of_memory(&oc);
1655
1656 unlock:
1657         mutex_unlock(&oom_lock);
1658         return ret;
1659 }
1660
1661 /*
1662  * Returns true if one or more processes were successfully killed. In some
1663  * corner cases it can return true even without killing any process.
1664  */
1665 static bool mem_cgroup_oom(struct mem_cgroup *memcg, gfp_t mask, int order)
1666 {
1667         bool locked, ret;
1668
1669         if (order > PAGE_ALLOC_COSTLY_ORDER)
1670                 return false;
1671
1672         memcg_memory_event(memcg, MEMCG_OOM);
1673
1674         if (!memcg1_oom_prepare(memcg, &locked))
1675                 return false;
1676
1677         ret = mem_cgroup_out_of_memory(memcg, mask, order);
1678
1679         memcg1_oom_finish(memcg, locked);
1680
1681         return ret;
1682 }
1683
1684 /**
1685  * mem_cgroup_get_oom_group - get a memory cgroup to clean up after OOM
1686  * @victim: task to be killed by the OOM killer
1687  * @oom_domain: memcg in case of memcg OOM, NULL in case of system-wide OOM
1688  *
1689  * Returns a pointer to a memory cgroup, which has to be cleaned up
1690  * by killing all of its OOM-killable tasks.
1691  *
1692  * Caller has to call mem_cgroup_put() on the returned non-NULL memcg.
1693  */
1694 struct mem_cgroup *mem_cgroup_get_oom_group(struct task_struct *victim,
1695                                             struct mem_cgroup *oom_domain)
1696 {
1697         struct mem_cgroup *oom_group = NULL;
1698         struct mem_cgroup *memcg;
1699
1700         if (!cgroup_subsys_on_dfl(memory_cgrp_subsys))
1701                 return NULL;
1702
1703         if (!oom_domain)
1704                 oom_domain = root_mem_cgroup;
1705
1706         rcu_read_lock();
1707
1708         memcg = mem_cgroup_from_task(victim);
1709         if (mem_cgroup_is_root(memcg))
1710                 goto out;
1711
1712         /*
1713          * If the victim task has been asynchronously moved to a different
1714          * memory cgroup, we might end up killing tasks outside oom_domain.
1715          * In this case it's better to ignore memory.group.oom.
1716          */
1717         if (unlikely(!mem_cgroup_is_descendant(memcg, oom_domain)))
1718                 goto out;
1719
1720         /*
1721          * Traverse the memory cgroup hierarchy from the victim task's
1722          * cgroup up to the OOMing cgroup (or root) to find the
1723          * highest-level memory cgroup with oom.group set.
1724          */
1725         for (; memcg; memcg = parent_mem_cgroup(memcg)) {
1726                 if (READ_ONCE(memcg->oom_group))
1727                         oom_group = memcg;
1728
1729                 if (memcg == oom_domain)
1730                         break;
1731         }
1732
1733         if (oom_group)
1734                 css_get(&oom_group->css);
1735 out:
1736         rcu_read_unlock();
1737
1738         return oom_group;
1739 }
1740
1741 void mem_cgroup_print_oom_group(struct mem_cgroup *memcg)
1742 {
1743         pr_info("Tasks in ");
1744         pr_cont_cgroup_path(memcg->css.cgroup);
1745         pr_cont(" are going to be killed due to memory.oom.group set\n");
1746 }
1747
1748 /*
1749  * The value of NR_MEMCG_STOCK is selected to keep the cached memcgs and their
1750  * nr_pages in a single cacheline. This may change in future.
1751  */
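/*
 * Back-of-the-envelope check (assuming 64-bit pointers and 64-byte
 * cachelines): 7 * sizeof(struct mem_cgroup *) + 7 * sizeof(uint8_t)
 * = 56 + 7 = 63 bytes, so the cached[] and nr_pages[] arrays fit in a
 * single cacheline with one byte to spare.
 */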
1752 #define NR_MEMCG_STOCK 7
1753 #define FLUSHING_CACHED_CHARGE  0
1754 struct memcg_stock_pcp {
1755         local_trylock_t lock;
1756         uint8_t nr_pages[NR_MEMCG_STOCK];
1757         struct mem_cgroup *cached[NR_MEMCG_STOCK];
1758
1759         struct work_struct work;
1760         unsigned long flags;
1761 };
1762
1763 static DEFINE_PER_CPU_ALIGNED(struct memcg_stock_pcp, memcg_stock) = {
1764         .lock = INIT_LOCAL_TRYLOCK(lock),
1765 };
1766
1767 struct obj_stock_pcp {
1768         local_trylock_t lock;
1769         unsigned int nr_bytes;
1770         struct obj_cgroup *cached_objcg;
1771         struct pglist_data *cached_pgdat;
1772         int nr_slab_reclaimable_b;
1773         int nr_slab_unreclaimable_b;
1774
1775         struct work_struct work;
1776         unsigned long flags;
1777 };
1778
1779 static DEFINE_PER_CPU_ALIGNED(struct obj_stock_pcp, obj_stock) = {
1780         .lock = INIT_LOCAL_TRYLOCK(lock),
1781 };
1782
1783 static DEFINE_MUTEX(percpu_charge_mutex);
1784
1785 static void drain_obj_stock(struct obj_stock_pcp *stock);
1786 static bool obj_stock_flush_required(struct obj_stock_pcp *stock,
1787                                      struct mem_cgroup *root_memcg);
1788
1789 /**
1790  * consume_stock: Try to consume stocked charge on this cpu.
1791  * @memcg: memcg to consume from.
1792  * @nr_pages: how many pages to charge.
1793  *
1794  * Consume the cached charge if enough nr_pages are present, otherwise return
1795  * failure. Also return failure for a charge request larger than
1796  * MEMCG_CHARGE_BATCH or if the local lock is already taken.
1797  *
1798  * Returns true if successful, false otherwise.
1799  */
1800 static bool consume_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
1801 {
1802         struct memcg_stock_pcp *stock;
1803         uint8_t stock_pages;
1804         bool ret = false;
1805         int i;
1806
1807         if (nr_pages > MEMCG_CHARGE_BATCH ||
1808             !local_trylock(&memcg_stock.lock))
1809                 return ret;
1810
1811         stock = this_cpu_ptr(&memcg_stock);
1812
1813         for (i = 0; i < NR_MEMCG_STOCK; ++i) {
1814                 if (memcg != READ_ONCE(stock->cached[i]))
1815                         continue;
1816
1817                 stock_pages = READ_ONCE(stock->nr_pages[i]);
1818                 if (stock_pages >= nr_pages) {
1819                         WRITE_ONCE(stock->nr_pages[i], stock_pages - nr_pages);
1820                         ret = true;
1821                 }
1822                 break;
1823         }
1824
1825         local_unlock(&memcg_stock.lock);
1826
1827         return ret;
1828 }
1829
1830 static void memcg_uncharge(struct mem_cgroup *memcg, unsigned int nr_pages)
1831 {
1832         page_counter_uncharge(&memcg->memory, nr_pages);
1833         if (do_memsw_account())
1834                 page_counter_uncharge(&memcg->memsw, nr_pages);
1835 }
1836
1837 /*
1838  * Uncharge the stock cached in slot @i and reset the cached information.
1839  */
1840 static void drain_stock(struct memcg_stock_pcp *stock, int i)
1841 {
1842         struct mem_cgroup *old = READ_ONCE(stock->cached[i]);
1843         uint8_t stock_pages;
1844
1845         if (!old)
1846                 return;
1847
1848         stock_pages = READ_ONCE(stock->nr_pages[i]);
1849         if (stock_pages) {
1850                 memcg_uncharge(old, stock_pages);
1851                 WRITE_ONCE(stock->nr_pages[i], 0);
1852         }
1853
1854         css_put(&old->css);
1855         WRITE_ONCE(stock->cached[i], NULL);
1856 }
1857
1858 static void drain_stock_fully(struct memcg_stock_pcp *stock)
1859 {
1860         int i;
1861
1862         for (i = 0; i < NR_MEMCG_STOCK; ++i)
1863                 drain_stock(stock, i);
1864 }
1865
1866 static void drain_local_memcg_stock(struct work_struct *dummy)
1867 {
1868         struct memcg_stock_pcp *stock;
1869
1870         if (WARN_ONCE(!in_task(), "drain in non-task context"))
1871                 return;
1872
1873         local_lock(&memcg_stock.lock);
1874
1875         stock = this_cpu_ptr(&memcg_stock);
1876         drain_stock_fully(stock);
1877         clear_bit(FLUSHING_CACHED_CHARGE, &stock->flags);
1878
1879         local_unlock(&memcg_stock.lock);
1880 }
1881
1882 static void drain_local_obj_stock(struct work_struct *dummy)
1883 {
1884         struct obj_stock_pcp *stock;
1885
1886         if (WARN_ONCE(!in_task(), "drain in non-task context"))
1887                 return;
1888
1889         local_lock(&obj_stock.lock);
1890
1891         stock = this_cpu_ptr(&obj_stock);
1892         drain_obj_stock(stock);
1893         clear_bit(FLUSHING_CACHED_CHARGE, &stock->flags);
1894
1895         local_unlock(&obj_stock.lock);
1896 }
1897
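/*
 * Stash @nr_pages of already-charged pages for @memcg in the local per-cpu
 * stock so that a later consume_stock() can serve them without touching the
 * page counters. If the stock can't be used (an oversized refill or failure
 * to take the local lock), the pages are uncharged directly instead.
 */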
1898 static void refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
1899 {
1900         struct memcg_stock_pcp *stock;
1901         struct mem_cgroup *cached;
1902         uint8_t stock_pages;
1903         bool success = false;
1904         int empty_slot = -1;
1905         int i;
1906
1907         /*
1908          * For now, limit MEMCG_CHARGE_BATCH to 127 or less. If we decide to
1909          * increase it beyond 127 in the future, we will need more careful
1910          * handling of nr_pages[] in struct memcg_stock_pcp.
1911          */
1912         BUILD_BUG_ON(MEMCG_CHARGE_BATCH > S8_MAX);
1913
1914         VM_WARN_ON_ONCE(mem_cgroup_is_root(memcg));
1915
1916         if (nr_pages > MEMCG_CHARGE_BATCH ||
1917             !local_trylock(&memcg_stock.lock)) {
1918                 /*
1919                  * If the refill is larger than the batch size, or we unexpectedly
1920                  * fail to take the percpu memcg_stock.lock, uncharge the memcg directly.
1921                  */
1922                 memcg_uncharge(memcg, nr_pages);
1923                 return;
1924         }
1925
1926         stock = this_cpu_ptr(&memcg_stock);
1927         for (i = 0; i < NR_MEMCG_STOCK; ++i) {
1928                 cached = READ_ONCE(stock->cached[i]);
1929                 if (!cached && empty_slot == -1)
1930                         empty_slot = i;
1931                 if (memcg == READ_ONCE(stock->cached[i])) {
1932                         stock_pages = READ_ONCE(stock->nr_pages[i]) + nr_pages;
1933                         WRITE_ONCE(stock->nr_pages[i], stock_pages);
1934                         if (stock_pages > MEMCG_CHARGE_BATCH)
1935                                 drain_stock(stock, i);
1936                         success = true;
1937                         break;
1938                 }
1939         }
1940
1941         if (!success) {
1942                 i = empty_slot;
1943                 if (i == -1) {
1944                         i = get_random_u32_below(NR_MEMCG_STOCK);
1945                         drain_stock(stock, i);
1946                 }
1947                 css_get(&memcg->css);
1948                 WRITE_ONCE(stock->cached[i], memcg);
1949                 WRITE_ONCE(stock->nr_pages[i], nr_pages);
1950         }
1951
1952         local_unlock(&memcg_stock.lock);
1953 }
1954
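/* Does @stock hold charged pages for @root_memcg or any of its descendants? */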
1955 static bool is_memcg_drain_needed(struct memcg_stock_pcp *stock,
1956                                   struct mem_cgroup *root_memcg)
1957 {
1958         struct mem_cgroup *memcg;
1959         bool flush = false;
1960         int i;
1961
1962         rcu_read_lock();
1963         for (i = 0; i < NR_MEMCG_STOCK; ++i) {
1964                 memcg = READ_ONCE(stock->cached[i]);
1965                 if (!memcg)
1966                         continue;
1967
1968                 if (READ_ONCE(stock->nr_pages[i]) &&
1969                     mem_cgroup_is_descendant(memcg, root_memcg)) {
1970                         flush = true;
1971                         break;
1972                 }
1973         }
1974         rcu_read_unlock();
1975         return flush;
1976 }
1977
1978 /*
1979  * Drain all per-CPU charge caches holding charges for the given root_memcg
1980  * or for any memcg in the subtree of the hierarchy under it.
1981  */
1982 void drain_all_stock(struct mem_cgroup *root_memcg)
1983 {
1984         int cpu, curcpu;
1985
1986         /* If someone's already draining, avoid starting more workers. */
1987         if (!mutex_trylock(&percpu_charge_mutex))
1988                 return;
1989         /*
1990          * Notify other cpus that a system-wide "drain" is running.
1991          * We do not care about races with cpu hotplug because both cpu down
1992          * and the workers from this path always operate on the local
1993          * per-cpu data. CPU up doesn't touch memcg_stock at all.
1994          */
1995         migrate_disable();
1996         curcpu = smp_processor_id();
1997         for_each_online_cpu(cpu) {
1998                 struct memcg_stock_pcp *memcg_st = &per_cpu(memcg_stock, cpu);
1999                 struct obj_stock_pcp *obj_st = &per_cpu(obj_stock, cpu);
2000
2001                 if (!test_bit(FLUSHING_CACHED_CHARGE, &memcg_st->flags) &&
2002                     is_memcg_drain_needed(memcg_st, root_memcg) &&
2003                     !test_and_set_bit(FLUSHING_CACHED_CHARGE,
2004                                       &memcg_st->flags)) {
2005                         if (cpu == curcpu)
2006                                 drain_local_memcg_stock(&memcg_st->work);
2007                         else if (!cpu_is_isolated(cpu))
2008                                 schedule_work_on(cpu, &memcg_st->work);
2009                 }
2010
2011                 if (!test_bit(FLUSHING_CACHED_CHARGE, &obj_st->flags) &&
2012                     obj_stock_flush_required(obj_st, root_memcg) &&
2013                     !test_and_set_bit(FLUSHING_CACHED_CHARGE,
2014                                       &obj_st->flags)) {
2015                         if (cpu == curcpu)
2016                                 drain_local_obj_stock(&obj_st->work);
2017                         else if (!cpu_is_isolated(cpu))
2018                                 schedule_work_on(cpu, &obj_st->work);
2019                 }
2020         }
2021         migrate_enable();
2022         mutex_unlock(&percpu_charge_mutex);
2023 }
2024
2025 static int memcg_hotplug_cpu_dead(unsigned int cpu)
2026 {
2027         /* no need for the local lock */
2028         drain_obj_stock(&per_cpu(obj_stock, cpu));
2029         drain_stock_fully(&per_cpu(memcg_stock, cpu));
2030
2031         return 0;
2032 }
2033
2034 static unsigned long reclaim_high(struct mem_cgroup *memcg,
2035                                   unsigned int nr_pages,
2036                                   gfp_t gfp_mask)
2037 {
2038         unsigned long nr_reclaimed = 0;
2039
2040         do {
2041                 unsigned long pflags;
2042
2043                 if (page_counter_read(&memcg->memory) <=
2044                     READ_ONCE(memcg->memory.high))
2045                         continue;
2046
2047                 memcg_memory_event(memcg, MEMCG_HIGH);
2048
2049                 psi_memstall_enter(&pflags);
2050                 nr_reclaimed += try_to_free_mem_cgroup_pages(memcg, nr_pages,
2051                                                         gfp_mask,
2052                                                         MEMCG_RECLAIM_MAY_SWAP,
2053                                                         NULL);
2054                 psi_memstall_leave(&pflags);
2055         } while ((memcg = parent_mem_cgroup(memcg)) &&
2056                  !mem_cgroup_is_root(memcg));
2057
2058         return nr_reclaimed;
2059 }
2060
2061 static void high_work_func(struct work_struct *work)
2062 {
2063         struct mem_cgroup *memcg;
2064
2065         memcg = container_of(work, struct mem_cgroup, high_work);
2066         reclaim_high(memcg, MEMCG_CHARGE_BATCH, GFP_KERNEL);
2067 }
2068
2069 /*
2070  * Clamp the maximum sleep time per allocation batch to 2 seconds. This is
2071  * enough to still cause a significant slowdown in most cases, while still
2072  * allowing diagnostics and tracing to proceed without becoming stuck.
2073  */
2074 #define MEMCG_MAX_HIGH_DELAY_JIFFIES (2UL*HZ)
2075
2076 /*
2077  * When calculating the delay, we use these on either side of the exponentiation
2078  * to maintain precision and scale to a reasonable number of jiffies (see the
2079  * table below).
2080  *
2081  * - MEMCG_DELAY_PRECISION_SHIFT: Extra precision bits while translating the
2082  *   overage ratio to a delay.
2083  * - MEMCG_DELAY_SCALING_SHIFT: The number of bits to scale down the
2084  *   proposed penalty in order to reduce to a reasonable number of jiffies, and
2085  *   to produce a reasonable delay curve.
2086  *
2087  * MEMCG_DELAY_SCALING_SHIFT just happens to be a number that produces a
2088  * reasonable delay curve compared to precision-adjusted overage, not
2089  * penalising heavily at first, but still making sure that growth beyond the
2090  * limit penalises misbehaving cgroups by slowing them down exponentially. For
2091  * example, with a high of 100 megabytes:
2092  *
2093  *  +-------+------------------------+
2094  *  | usage | time to allocate in ms |
2095  *  +-------+------------------------+
2096  *  | 100M  |                      0 |
2097  *  | 101M  |                      6 |
2098  *  | 102M  |                     25 |
2099  *  | 103M  |                     57 |
2100  *  | 104M  |                    102 |
2101  *  | 105M  |                    159 |
2102  *  | 106M  |                    230 |
2103  *  | 107M  |                    313 |
2104  *  | 108M  |                    409 |
2105  *  | 109M  |                    518 |
2106  *  | 110M  |                    639 |
2107  *  | 111M  |                    774 |
2108  *  | 112M  |                    921 |
2109  *  | 113M  |                   1081 |
2110  *  | 114M  |                   1254 |
2111  *  | 115M  |                   1439 |
2112  *  | 116M  |                   1638 |
2113  *  | 117M  |                   1849 |
2114  *  | 118M  |                   2000 |
2115  *  | 119M  |                   2000 |
2116  *  | 120M  |                   2000 |
2117  *  +-------+------------------------+
2118  */
2119 #define MEMCG_DELAY_PRECISION_SHIFT 20
2120 #define MEMCG_DELAY_SCALING_SHIFT 14
2121
2122 static u64 calculate_overage(unsigned long usage, unsigned long high)
2123 {
2124         u64 overage;
2125
2126         if (usage <= high)
2127                 return 0;
2128
2129         /*
2130          * Prevent division by 0 in overage calculation by acting as if
2131          * it was a threshold of 1 page
2132          */
2133         high = max(high, 1UL);
2134
2135         overage = usage - high;
2136         overage <<= MEMCG_DELAY_PRECISION_SHIFT;
2137         return div64_u64(overage, high);
2138 }
2139
2140 static u64 mem_find_max_overage(struct mem_cgroup *memcg)
2141 {
2142         u64 overage, max_overage = 0;
2143
2144         do {
2145                 overage = calculate_overage(page_counter_read(&memcg->memory),
2146                                             READ_ONCE(memcg->memory.high));
2147                 max_overage = max(overage, max_overage);
2148         } while ((memcg = parent_mem_cgroup(memcg)) &&
2149                  !mem_cgroup_is_root(memcg));
2150
2151         return max_overage;
2152 }
2153
2154 static u64 swap_find_max_overage(struct mem_cgroup *memcg)
2155 {
2156         u64 overage, max_overage = 0;
2157
2158         do {
2159                 overage = calculate_overage(page_counter_read(&memcg->swap),
2160                                             READ_ONCE(memcg->swap.high));
2161                 if (overage)
2162                         memcg_memory_event(memcg, MEMCG_SWAP_HIGH);
2163                 max_overage = max(overage, max_overage);
2164         } while ((memcg = parent_mem_cgroup(memcg)) &&
2165                  !mem_cgroup_is_root(memcg));
2166
2167         return max_overage;
2168 }
2169
2170 /*
2171  * Get the number of jiffies that we should penalise a mischievous cgroup which
2172  * is exceeding its memory.high by checking both it and its ancestors.
2173  */
2174 static unsigned long calculate_high_delay(struct mem_cgroup *memcg,
2175                                           unsigned int nr_pages,
2176                                           u64 max_overage)
2177 {
2178         unsigned long penalty_jiffies;
2179
2180         if (!max_overage)
2181                 return 0;
2182
2183         /*
2184          * We use overage compared to memory.high to calculate the number of
2185          * jiffies to sleep (penalty_jiffies). Ideally this value should be
2186          * fairly lenient on small overages, and increasingly harsh when the
2187          * memcg in question makes it clear that it has no intention of stopping
2188          * its crazy behaviour, so we exponentially increase the delay based on
2189          * overage amount.
2190          */
2191         penalty_jiffies = max_overage * max_overage * HZ;
2192         penalty_jiffies >>= MEMCG_DELAY_PRECISION_SHIFT;
2193         penalty_jiffies >>= MEMCG_DELAY_SCALING_SHIFT;
2194
2195         /*
2196          * Factor in the task's own contribution to the overage, such that four
2197          * N-sized allocations are throttled approximately the same as one
2198          * 4N-sized allocation.
2199          *
2200          * MEMCG_CHARGE_BATCH pages is nominal, so work out how much smaller or
2201          * larger the current charge patch is than that.
2202          */
2203         return penalty_jiffies * nr_pages / MEMCG_CHARGE_BATCH;
2204 }
2205
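
/*
 * Worked example (assuming HZ == 1000 and a MEMCG_CHARGE_BATCH-sized charge):
 * with memory.high at 100M and usage at 110M,
 *
 *   overage         = ((110M - 100M) << MEMCG_DELAY_PRECISION_SHIFT) / 100M
 *                   ~= 104857
 *   penalty_jiffies = 104857^2 * HZ >> MEMCG_DELAY_PRECISION_SHIFT
 *                               >> MEMCG_DELAY_SCALING_SHIFT
 *                   ~= 639
 *
 * i.e. roughly the 639ms of the corresponding row in the table above.
 */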
2206 /*
2207  * Reclaims memory over the high limit. Called directly from
2208  * try_charge() (context permitting), as well as from the userland
2209  * return path where reclaim is always able to block.
2210  */
2211 void mem_cgroup_handle_over_high(gfp_t gfp_mask)
2212 {
2213         unsigned long penalty_jiffies;
2214         unsigned long pflags;
2215         unsigned long nr_reclaimed;
2216         unsigned int nr_pages = current->memcg_nr_pages_over_high;
2217         int nr_retries = MAX_RECLAIM_RETRIES;
2218         struct mem_cgroup *memcg;
2219         bool in_retry = false;
2220
2221         if (likely(!nr_pages))
2222                 return;
2223
2224         memcg = get_mem_cgroup_from_mm(current->mm);
2225         current->memcg_nr_pages_over_high = 0;
2226
2227 retry_reclaim:
2228         /*
2229          * Bail if the task is already exiting. Unlike memory.max,
2230          * memory.high enforcement isn't as strict, and there is no
2231          * OOM killer involved, which means the excess could already
2232          * be much bigger (and still growing) than it could for
2233          * memory.max; the dying task could get stuck in fruitless
2234          * reclaim for a long time, which isn't desirable.
2235          */
2236         if (task_is_dying())
2237                 goto out;
2238
2239         /*
2240          * The allocating task should reclaim at least the batch size, but for
2241          * subsequent retries we only want to do what's necessary to prevent oom
2242          * or breaching resource isolation.
2243          *
2244          * This is distinct from memory.max or page allocator behaviour because
2245          * memory.high is currently batched, whereas memory.max and the page
2246          * allocator run every time an allocation is made.
2247          */
2248         nr_reclaimed = reclaim_high(memcg,
2249                                     in_retry ? SWAP_CLUSTER_MAX : nr_pages,
2250                                     gfp_mask);
2251
2252         /*
2253          * memory.high is breached and reclaim is unable to keep up. Throttle
2254          * allocators proactively to slow down excessive growth.
2255          */
2256         penalty_jiffies = calculate_high_delay(memcg, nr_pages,
2257                                                mem_find_max_overage(memcg));
2258
2259         penalty_jiffies += calculate_high_delay(memcg, nr_pages,
2260                                                 swap_find_max_overage(memcg));
2261
2262         /*
2263          * Clamp the max delay per usermode return so as to still keep the
2264          * application moving forwards and also permit diagnostics, albeit
2265          * extremely slowly.
2266          */
2267         penalty_jiffies = min(penalty_jiffies, MEMCG_MAX_HIGH_DELAY_JIFFIES);
2268
2269         /*
2270          * Don't sleep if the amount of jiffies this memcg owes us is so low
2271          * that it's not even worth doing, in an attempt to be nice to those who
2272          * go only a small amount over their memory.high value and maybe haven't
2273          * been aggressively reclaimed enough yet.
2274          */
2275         if (penalty_jiffies <= HZ / 100)
2276                 goto out;
2277
2278         /*
2279          * If reclaim is making forward progress but we're still over
2280          * memory.high, we want to encourage that rather than doing allocator
2281          * throttling.
2282          */
2283         if (nr_reclaimed || nr_retries--) {
2284                 in_retry = true;
2285                 goto retry_reclaim;
2286         }
2287
2288         /*
2289          * Reclaim didn't manage to push usage below the limit, slow
2290          * this allocating task down.
2291          *
2292          * If we exit early, we're guaranteed to die (since
2293          * schedule_timeout_killable sets TASK_KILLABLE). This means we don't
2294          * need to account for any ill-begotten jiffies to pay them off later.
2295          */
2296         psi_memstall_enter(&pflags);
2297         schedule_timeout_killable(penalty_jiffies);
2298         psi_memstall_leave(&pflags);
2299
2300 out:
2301         css_put(&memcg->css);
2302 }
2303
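/*
 * Charge @nr_pages to @memcg, reclaiming and, as a last resort, invoking the
 * memcg OOM killer when the limit is hit. Charges are attempted in
 * MEMCG_CHARGE_BATCH-sized batches where possible and any surplus is stashed
 * in the per-cpu stock for later consume_stock() calls. Returns 0 on success
 * (including forced overcharges for __GFP_NOFAIL/__GFP_HIGH and PF_MEMALLOC
 * contexts) or -ENOMEM.
 */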
2304 static int try_charge_memcg(struct mem_cgroup *memcg, gfp_t gfp_mask,
2305                             unsigned int nr_pages)
2306 {
2307         unsigned int batch = max(MEMCG_CHARGE_BATCH, nr_pages);
2308         int nr_retries = MAX_RECLAIM_RETRIES;
2309         struct mem_cgroup *mem_over_limit;
2310         struct page_counter *counter;
2311         unsigned long nr_reclaimed;
2312         bool passed_oom = false;
2313         unsigned int reclaim_options = MEMCG_RECLAIM_MAY_SWAP;
2314         bool drained = false;
2315         bool raised_max_event = false;
2316         unsigned long pflags;
2317
2318 retry:
2319         if (consume_stock(memcg, nr_pages))
2320                 return 0;
2321
2322         if (!gfpflags_allow_spinning(gfp_mask))
2323                 /* Avoid the refill and flush of the older stock */
2324                 batch = nr_pages;
2325
2326         if (!do_memsw_account() ||
2327             page_counter_try_charge(&memcg->memsw, batch, &counter)) {
2328                 if (page_counter_try_charge(&memcg->memory, batch, &counter))
2329                         goto done_restock;
2330                 if (do_memsw_account())
2331                         page_counter_uncharge(&memcg->memsw, batch);
2332                 mem_over_limit = mem_cgroup_from_counter(counter, memory);
2333         } else {
2334                 mem_over_limit = mem_cgroup_from_counter(counter, memsw);
2335                 reclaim_options &= ~MEMCG_RECLAIM_MAY_SWAP;
2336         }
2337
2338         if (batch > nr_pages) {
2339                 batch = nr_pages;
2340                 goto retry;
2341         }
2342
2343         /*
2344          * Prevent unbounded recursion when reclaim operations need to
2345          * allocate memory. This might exceed the limits temporarily,
2346          * but we prefer facilitating memory reclaim and getting back
2347          * under the limit over triggering OOM kills in these cases.
2348          */
2349         if (unlikely(current->flags & PF_MEMALLOC))
2350                 goto force;
2351
2352         if (unlikely(task_in_memcg_oom(current)))
2353                 goto nomem;
2354
2355         if (!gfpflags_allow_blocking(gfp_mask))
2356                 goto nomem;
2357
2358         memcg_memory_event(mem_over_limit, MEMCG_MAX);
2359         raised_max_event = true;
2360
2361         psi_memstall_enter(&pflags);
2362         nr_reclaimed = try_to_free_mem_cgroup_pages(mem_over_limit, nr_pages,
2363                                                     gfp_mask, reclaim_options, NULL);
2364         psi_memstall_leave(&pflags);
2365
2366         if (mem_cgroup_margin(mem_over_limit) >= nr_pages)
2367                 goto retry;
2368
2369         if (!drained) {
2370                 drain_all_stock(mem_over_limit);
2371                 drained = true;
2372                 goto retry;
2373         }
2374
2375         if (gfp_mask & __GFP_NORETRY)
2376                 goto nomem;
2377         /*
2378          * Even though the limit is exceeded at this point, reclaim
2379          * may have been able to free some pages.  Retry the charge
2380          * before killing the task.
2381          *
2382          * Only for regular pages, though: huge pages are rather
2383          * unlikely to succeed so close to the limit, and we fall back
2384          * to regular pages anyway in case of failure.
2385          */
2386         if (nr_reclaimed && nr_pages <= (1 << PAGE_ALLOC_COSTLY_ORDER))
2387                 goto retry;
2388
2389         if (nr_retries--)
2390                 goto retry;
2391
2392         if (gfp_mask & __GFP_RETRY_MAYFAIL)
2393                 goto nomem;
2394
2395         /* Avoid endless loop for tasks bypassed by the oom killer */
2396         if (passed_oom && task_is_dying())
2397                 goto nomem;
2398
2399         /*
2400          * Keep retrying as long as the memcg oom killer is able to make
2401          * forward progress, or bypass the charge if the oom killer
2402          * couldn't make any progress.
2403          */
2404         if (mem_cgroup_oom(mem_over_limit, gfp_mask,
2405                            get_order(nr_pages * PAGE_SIZE))) {
2406                 passed_oom = true;
2407                 nr_retries = MAX_RECLAIM_RETRIES;
2408                 goto retry;
2409         }
2410 nomem:
2411         /*
2412          * Memcg doesn't have a dedicated reserve for atomic
2413          * allocations. But like the global atomic pool, we need to
2414          * put the burden of reclaim on regular allocation requests
2415          * and let these go through as privileged allocations.
2416          */
2417         if (!(gfp_mask & (__GFP_NOFAIL | __GFP_HIGH)))
2418                 return -ENOMEM;
2419 force:
2420         /*
2421          * If the allocation has to be enforced, don't forget to raise
2422          * a MEMCG_MAX event.
2423          */
2424         if (!raised_max_event)
2425                 memcg_memory_event(mem_over_limit, MEMCG_MAX);
2426
2427         /*
2428          * The allocation either can't fail or will lead to more memory
2429          * being freed very soon.  Allow memory usage to go over the limit
2430          * temporarily by force charging it.
2431          */
2432         page_counter_charge(&memcg->memory, nr_pages);
2433         if (do_memsw_account())
2434                 page_counter_charge(&memcg->memsw, nr_pages);
2435
2436         return 0;
2437
2438 done_restock:
2439         if (batch > nr_pages)
2440                 refill_stock(memcg, batch - nr_pages);
2441
2442         /*
2443          * If the hierarchy is above the normal consumption range, schedule
2444          * reclaim on returning to userland.  We can perform reclaim here
2445          * if __GFP_RECLAIM is set, but let's always punt for simplicity and so
2446          * that GFP_KERNEL can consistently be used during reclaim.  @memcg is
2447          * not recorded as it most likely matches current's and won't
2448          * change in the meantime.  As the high limit is checked again before
2449          * reclaim, the cost of mismatch is negligible.
2450          */
2451         do {
2452                 bool mem_high, swap_high;
2453
2454                 mem_high = page_counter_read(&memcg->memory) >
2455                         READ_ONCE(memcg->memory.high);
2456                 swap_high = page_counter_read(&memcg->swap) >
2457                         READ_ONCE(memcg->swap.high);
2458
2459                 /* Don't bother a random interrupted task */
2460                 if (!in_task()) {
2461                         if (mem_high) {
2462                                 schedule_work(&memcg->high_work);
2463                                 break;
2464                         }
2465                         continue;
2466                 }
2467
2468                 if (mem_high || swap_high) {
2469                         /*
2470                          * The allocating tasks in this cgroup will need to do
2471                          * reclaim or be throttled to prevent further growth
2472                          * of the memory or swap footprints.
2473                          *
2474                          * Target some best-effort fairness between the tasks,
2475                          * and distribute reclaim work and delay penalties
2476                          * based on how much each task is actually allocating.
2477                          */
2478                         current->memcg_nr_pages_over_high += batch;
2479                         set_notify_resume(current);
2480                         break;
2481                 }
2482         } while ((memcg = parent_mem_cgroup(memcg)));
2483
2484         /*
2485          * Reclaim is set up above to be called from the userland
2486          * return path. But also attempt synchronous reclaim to avoid
2487          * excessive overrun while the task is still inside the
2488          * kernel. If this is successful, the return path will see it
2489          * when it rechecks the overage and simply bail out.
2490          */
2491         if (current->memcg_nr_pages_over_high > MEMCG_CHARGE_BATCH &&
2492             !(current->flags & PF_MEMALLOC) &&
2493             gfpflags_allow_blocking(gfp_mask))
2494                 mem_cgroup_handle_over_high(gfp_mask);
2495         return 0;
2496 }
2497
2498 static inline int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
2499                              unsigned int nr_pages)
2500 {
2501         if (mem_cgroup_is_root(memcg))
2502                 return 0;
2503
2504         return try_charge_memcg(memcg, gfp_mask, nr_pages);
2505 }
2506
2507 static void commit_charge(struct folio *folio, struct mem_cgroup *memcg)
2508 {
2509         VM_BUG_ON_FOLIO(folio_memcg_charged(folio), folio);
2510         /*
2511          * Any of the following ensures the page's memcg stability:
2512          *
2513          * - the page lock
2514          * - LRU isolation
2515          * - exclusive reference
2516          */
2517         folio->memcg_data = (unsigned long)memcg;
2518 }
2519
2520 #ifdef CONFIG_MEMCG_NMI_SAFETY_REQUIRES_ATOMIC
2521 static inline void account_slab_nmi_safe(struct mem_cgroup *memcg,
2522                                          struct pglist_data *pgdat,
2523                                          enum node_stat_item idx, int nr)
2524 {
2525         struct lruvec *lruvec;
2526
2527         if (likely(!in_nmi())) {
2528                 lruvec = mem_cgroup_lruvec(memcg, pgdat);
2529                 mod_memcg_lruvec_state(lruvec, idx, nr);
2530         } else {
2531                 struct mem_cgroup_per_node *pn = memcg->nodeinfo[pgdat->node_id];
2532
2533                 /* TODO: add to cgroup update tree once it is nmi-safe. */
2534                 if (idx == NR_SLAB_RECLAIMABLE_B)
2535                         atomic_add(nr, &pn->slab_reclaimable);
2536                 else
2537                         atomic_add(nr, &pn->slab_unreclaimable);
2538         }
2539 }
2540 #else
2541 static inline void account_slab_nmi_safe(struct mem_cgroup *memcg,
2542                                          struct pglist_data *pgdat,
2543                                          enum node_stat_item idx, int nr)
2544 {
2545         struct lruvec *lruvec;
2546
2547         lruvec = mem_cgroup_lruvec(memcg, pgdat);
2548         mod_memcg_lruvec_state(lruvec, idx, nr);
2549 }
2550 #endif
2551
2552 static inline void mod_objcg_mlstate(struct obj_cgroup *objcg,
2553                                        struct pglist_data *pgdat,
2554                                        enum node_stat_item idx, int nr)
2555 {
2556         struct mem_cgroup *memcg;
2557
2558         rcu_read_lock();
2559         memcg = obj_cgroup_memcg(objcg);
2560         account_slab_nmi_safe(memcg, pgdat, idx, nr);
2561         rcu_read_unlock();
2562 }
2563
2564 static __always_inline
2565 struct mem_cgroup *mem_cgroup_from_obj_folio(struct folio *folio, void *p)
2566 {
2567         /*
2568          * Slab objects are accounted individually, not per-page.
2569          * Memcg membership data for each individual object is saved in
2570          * slab->obj_exts.
2571          */
2572         if (folio_test_slab(folio)) {
2573                 struct slabobj_ext *obj_exts;
2574                 struct slab *slab;
2575                 unsigned int off;
2576
2577                 slab = folio_slab(folio);
2578                 obj_exts = slab_obj_exts(slab);
2579                 if (!obj_exts)
2580                         return NULL;
2581
2582                 off = obj_to_index(slab->slab_cache, slab, p);
2583                 if (obj_exts[off].objcg)
2584                         return obj_cgroup_memcg(obj_exts[off].objcg);
2585
2586                 return NULL;
2587         }
2588
2589         /*
2590          * folio_memcg_check() is used here, because in theory we can encounter
2591          * a folio where the slab flag has been cleared already, but
2592          * slab->obj_exts has not been freed yet.
2593          * folio_memcg_check() will guarantee that a proper memory
2594          * cgroup pointer or NULL will be returned.
2595          */
2596         return folio_memcg_check(folio);
2597 }
2598
2599 /*
2600  * Returns a pointer to the memory cgroup to which the kernel object is charged.
2601  * It is not suitable for objects allocated using vmalloc().
2602  *
2603  * A passed kernel object must be a slab object or a generic kernel page.
2604  *
2605  * The caller must ensure the memcg lifetime, e.g. by taking rcu_read_lock(),
2606  * cgroup_mutex, etc.
2607  */
2608 struct mem_cgroup *mem_cgroup_from_slab_obj(void *p)
2609 {
2610         if (mem_cgroup_disabled())
2611                 return NULL;
2612
2613         return mem_cgroup_from_obj_folio(virt_to_folio(p), p);
2614 }
2615
2616 static struct obj_cgroup *__get_obj_cgroup_from_memcg(struct mem_cgroup *memcg)
2617 {
2618         struct obj_cgroup *objcg = NULL;
2619
2620         for (; !mem_cgroup_is_root(memcg); memcg = parent_mem_cgroup(memcg)) {
2621                 objcg = rcu_dereference(memcg->objcg);
2622                 if (likely(objcg && obj_cgroup_tryget(objcg)))
2623                         break;
2624                 objcg = NULL;
2625         }
2626         return objcg;
2627 }
2628
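/*
 * Refresh current->objcg. The lowest bit of current->objcg
 * (CURRENT_OBJCG_UPDATE_FLAG) is set when the cached pointer may be stale,
 * e.g. after the task has been moved to another memcg; drop the old
 * reference and re-derive the objcg from the task's current memcg.
 * Returns the refreshed objcg (with a reference held) or NULL.
 */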
2629 static struct obj_cgroup *current_objcg_update(void)
2630 {
2631         struct mem_cgroup *memcg;
2632         struct obj_cgroup *old, *objcg = NULL;
2633
2634         do {
2635                 /* Atomically drop the update bit. */
2636                 old = xchg(&current->objcg, NULL);
2637                 if (old) {
2638                         old = (struct obj_cgroup *)
2639                                 ((unsigned long)old & ~CURRENT_OBJCG_UPDATE_FLAG);
2640                         obj_cgroup_put(old);
2641
2642                         old = NULL;
2643                 }
2644
2645                 /* If new objcg is NULL, no reason for the second atomic update. */
2646                 if (!current->mm || (current->flags & PF_KTHREAD))
2647                         return NULL;
2648
2649                 /*
2650                  * Release the objcg pointer from the previous iteration,
2651                  * if try_cmpxchg() below fails.
2652                  */
2653                 if (unlikely(objcg)) {
2654                         obj_cgroup_put(objcg);
2655                         objcg = NULL;
2656                 }
2657
2658                 /*
2659                  * Obtain the new objcg pointer. The current task can be
2660                  * asynchronously moved to another memcg and the previous
2661                  * memcg can be offlined. So let's get the memcg pointer
2662                  * and try get a reference to objcg under a rcu read lock.
2663                  */
2664
2665                 rcu_read_lock();
2666                 memcg = mem_cgroup_from_task(current);
2667                 objcg = __get_obj_cgroup_from_memcg(memcg);
2668                 rcu_read_unlock();
2669
2670                 /*
2671                  * Try to set up a new objcg pointer atomically. If it
2672                  * fails, it means the update flag was set concurrently, so
2673                  * the whole procedure should be repeated.
2674                  */
2675         } while (!try_cmpxchg(&current->objcg, &old, objcg));
2676
2677         return objcg;
2678 }
2679
2680 __always_inline struct obj_cgroup *current_obj_cgroup(void)
2681 {
2682         struct mem_cgroup *memcg;
2683         struct obj_cgroup *objcg;
2684
2685         if (IS_ENABLED(CONFIG_MEMCG_NMI_UNSAFE) && in_nmi())
2686                 return NULL;
2687
2688         if (in_task()) {
2689                 memcg = current->active_memcg;
2690                 if (unlikely(memcg))
2691                         goto from_memcg;
2692
2693                 objcg = READ_ONCE(current->objcg);
2694                 if (unlikely((unsigned long)objcg & CURRENT_OBJCG_UPDATE_FLAG))
2695                         objcg = current_objcg_update();
2696                 /*
2697                  * The objcg reference is held by the task, so it's safe
2698                  * for the current task to use the objcg.
2699                  */
2700                 return objcg;
2701         }
2702
2703         memcg = this_cpu_read(int_active_memcg);
2704         if (unlikely(memcg))
2705                 goto from_memcg;
2706
2707         return NULL;
2708
2709 from_memcg:
2710         objcg = NULL;
2711         for (; !mem_cgroup_is_root(memcg); memcg = parent_mem_cgroup(memcg)) {
2712                 /*
2713                  * Memcg pointer is protected by scope (see set_active_memcg())
2714                  * and is pinning the corresponding objcg, so objcg can't go
2715                  * away and can be used within the scope without any additional
2716                  * protection.
2717                  */
2718                 objcg = rcu_dereference_check(memcg->objcg, 1);
2719                 if (likely(objcg))
2720                         break;
2721         }
2722
2723         return objcg;
2724 }
2725
2726 struct obj_cgroup *get_obj_cgroup_from_folio(struct folio *folio)
2727 {
2728         struct obj_cgroup *objcg;
2729
2730         if (!memcg_kmem_online())
2731                 return NULL;
2732
2733         if (folio_memcg_kmem(folio)) {
2734                 objcg = __folio_objcg(folio);
2735                 obj_cgroup_get(objcg);
2736         } else {
2737                 struct mem_cgroup *memcg;
2738
2739                 rcu_read_lock();
2740                 memcg = __folio_memcg(folio);
2741                 if (memcg)
2742                         objcg = __get_obj_cgroup_from_memcg(memcg);
2743                 else
2744                         objcg = NULL;
2745                 rcu_read_unlock();
2746         }
2747         return objcg;
2748 }
2749
2750 #ifdef CONFIG_MEMCG_NMI_SAFETY_REQUIRES_ATOMIC
2751 static inline void account_kmem_nmi_safe(struct mem_cgroup *memcg, int val)
2752 {
2753         if (likely(!in_nmi())) {
2754                 mod_memcg_state(memcg, MEMCG_KMEM, val);
2755         } else {
2756                 /* TODO: add to cgroup update tree once it is nmi-safe. */
2757                 atomic_add(val, &memcg->kmem_stat);
2758         }
2759 }
2760 #else
2761 static inline void account_kmem_nmi_safe(struct mem_cgroup *memcg, int val)
2762 {
2763         mod_memcg_state(memcg, MEMCG_KMEM, val);
2764 }
2765 #endif
2766
2767 /*
2768  * obj_cgroup_uncharge_pages: uncharge a number of kernel pages from an objcg
2769  * @objcg: object cgroup to uncharge
2770  * @nr_pages: number of pages to uncharge
2771  */
2772 static void obj_cgroup_uncharge_pages(struct obj_cgroup *objcg,
2773                                       unsigned int nr_pages)
2774 {
2775         struct mem_cgroup *memcg;
2776
2777         memcg = get_mem_cgroup_from_objcg(objcg);
2778
2779         account_kmem_nmi_safe(memcg, -nr_pages);
2780         memcg1_account_kmem(memcg, -nr_pages);
2781         if (!mem_cgroup_is_root(memcg))
2782                 refill_stock(memcg, nr_pages);
2783
2784         css_put(&memcg->css);
2785 }
2786
2787 /*
2788  * obj_cgroup_charge_pages: charge a number of kernel pages to an objcg
2789  * @objcg: object cgroup to charge
2790  * @gfp: reclaim mode
2791  * @nr_pages: number of pages to charge
2792  *
2793  * Returns 0 on success, an error code on failure.
2794  */
2795 static int obj_cgroup_charge_pages(struct obj_cgroup *objcg, gfp_t gfp,
2796                                    unsigned int nr_pages)
2797 {
2798         struct mem_cgroup *memcg;
2799         int ret;
2800
2801         memcg = get_mem_cgroup_from_objcg(objcg);
2802
2803         ret = try_charge_memcg(memcg, gfp, nr_pages);
2804         if (ret)
2805                 goto out;
2806
2807         account_kmem_nmi_safe(memcg, nr_pages);
2808         memcg1_account_kmem(memcg, nr_pages);
2809 out:
2810         css_put(&memcg->css);
2811
2812         return ret;
2813 }
2814
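/*
 * For kmem pages, page->memcg_data holds an obj_cgroup pointer tagged with
 * MEMCG_DATA_KMEM in the low bits. page_objcg() strips the tag and
 * page_set_objcg() applies it.
 */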
2815 static struct obj_cgroup *page_objcg(const struct page *page)
2816 {
2817         unsigned long memcg_data = page->memcg_data;
2818
2819         if (mem_cgroup_disabled() || !memcg_data)
2820                 return NULL;
2821
2822         VM_BUG_ON_PAGE((memcg_data & OBJEXTS_FLAGS_MASK) != MEMCG_DATA_KMEM,
2823                         page);
2824         return (struct obj_cgroup *)(memcg_data - MEMCG_DATA_KMEM);
2825 }
2826
2827 static void page_set_objcg(struct page *page, const struct obj_cgroup *objcg)
2828 {
2829         page->memcg_data = (unsigned long)objcg | MEMCG_DATA_KMEM;
2830 }
2831
2832 /**
2833  * __memcg_kmem_charge_page: charge a kmem page to the current memory cgroup
2834  * @page: page to charge
2835  * @gfp: reclaim mode
2836  * @order: allocation order
2837  *
2838  * Returns 0 on success, an error code on failure.
2839  */
2840 int __memcg_kmem_charge_page(struct page *page, gfp_t gfp, int order)
2841 {
2842         struct obj_cgroup *objcg;
2843         int ret = 0;
2844
2845         objcg = current_obj_cgroup();
2846         if (objcg) {
2847                 ret = obj_cgroup_charge_pages(objcg, gfp, 1 << order);
2848                 if (!ret) {
2849                         obj_cgroup_get(objcg);
2850                         page_set_objcg(page, objcg);
2851                         return 0;
2852                 }
2853         }
2854         return ret;
2855 }
2856
2857 /**
2858  * __memcg_kmem_uncharge_page: uncharge a kmem page
2859  * @page: page to uncharge
2860  * @order: allocation order
2861  */
2862 void __memcg_kmem_uncharge_page(struct page *page, int order)
2863 {
2864         struct obj_cgroup *objcg = page_objcg(page);
2865         unsigned int nr_pages = 1 << order;
2866
2867         if (!objcg)
2868                 return;
2869
2870         obj_cgroup_uncharge_pages(objcg, nr_pages);
2871         page->memcg_data = 0;
2872         obj_cgroup_put(objcg);
2873 }
2874
2875 static void __account_obj_stock(struct obj_cgroup *objcg,
2876                                 struct obj_stock_pcp *stock, int nr,
2877                                 struct pglist_data *pgdat, enum node_stat_item idx)
2878 {
2879         int *bytes;
2880
2881         /*
2882          * Save the vmstat data in the stock and skip the vmstat array update
2883          * unless more than a page of vmstat data has accumulated or the pgdat changes.
2884          */
2885         if (stock->cached_pgdat != pgdat) {
2886                 /* Flush the existing cached vmstat data */
2887                 struct pglist_data *oldpg = stock->cached_pgdat;
2888
2889                 if (stock->nr_slab_reclaimable_b) {
2890                         mod_objcg_mlstate(objcg, oldpg, NR_SLAB_RECLAIMABLE_B,
2891                                           stock->nr_slab_reclaimable_b);
2892                         stock->nr_slab_reclaimable_b = 0;
2893                 }
2894                 if (stock->nr_slab_unreclaimable_b) {
2895                         mod_objcg_mlstate(objcg, oldpg, NR_SLAB_UNRECLAIMABLE_B,
2896                                           stock->nr_slab_unreclaimable_b);
2897                         stock->nr_slab_unreclaimable_b = 0;
2898                 }
2899                 stock->cached_pgdat = pgdat;
2900         }
2901
2902         bytes = (idx == NR_SLAB_RECLAIMABLE_B) ? &stock->nr_slab_reclaimable_b
2903                                                : &stock->nr_slab_unreclaimable_b;
2904         /*
2905          * Even for large objects >= PAGE_SIZE, the vmstat data will still be
2906          * cached locally at least once before being pushed out.
2907          */
2908         if (!*bytes) {
2909                 *bytes = nr;
2910                 nr = 0;
2911         } else {
2912                 *bytes += nr;
2913                 if (abs(*bytes) > PAGE_SIZE) {
2914                         nr = *bytes;
2915                         *bytes = 0;
2916                 } else {
2917                         nr = 0;
2918                 }
2919         }
2920         if (nr)
2921                 mod_objcg_mlstate(objcg, pgdat, idx, nr);
2922 }
2923
2924 static bool consume_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes,
2925                               struct pglist_data *pgdat, enum node_stat_item idx)
2926 {
2927         struct obj_stock_pcp *stock;
2928         bool ret = false;
2929
2930         if (!local_trylock(&obj_stock.lock))
2931                 return ret;
2932
2933         stock = this_cpu_ptr(&obj_stock);
2934         if (objcg == READ_ONCE(stock->cached_objcg) && stock->nr_bytes >= nr_bytes) {
2935                 stock->nr_bytes -= nr_bytes;
2936                 ret = true;
2937
2938                 if (pgdat)
2939                         __account_obj_stock(objcg, stock, nr_bytes, pgdat, idx);
2940         }
2941
2942         local_unlock(&obj_stock.lock);
2943
2944         return ret;
2945 }
2946
2947 static void drain_obj_stock(struct obj_stock_pcp *stock)
2948 {
2949         struct obj_cgroup *old = READ_ONCE(stock->cached_objcg);
2950
2951         if (!old)
2952                 return;
2953
2954         if (stock->nr_bytes) {
2955                 unsigned int nr_pages = stock->nr_bytes >> PAGE_SHIFT;
2956                 unsigned int nr_bytes = stock->nr_bytes & (PAGE_SIZE - 1);
2957
2958                 if (nr_pages) {
2959                         struct mem_cgroup *memcg;
2960
2961                         memcg = get_mem_cgroup_from_objcg(old);
2962
2963                         mod_memcg_state(memcg, MEMCG_KMEM, -nr_pages);
2964                         memcg1_account_kmem(memcg, -nr_pages);
2965                         if (!mem_cgroup_is_root(memcg))
2966                                 memcg_uncharge(memcg, nr_pages);
2967
2968                         css_put(&memcg->css);
2969                 }
2970
2971                 /*
2972                  * The leftover is flushed to the centralized per-memcg value.
2973                  * On the next attempt to refill the obj stock it will be moved
2974                  * to a per-cpu stock (probably on another CPU), see
2975                  * refill_obj_stock().
2976                  *
2977                  * How often it's flushed is a trade-off between the memory
2978                  * limit enforcement accuracy and potential CPU contention,
2979                  * so it might be changed in the future.
2980                  */
2981                 atomic_add(nr_bytes, &old->nr_charged_bytes);
2982                 stock->nr_bytes = 0;
2983         }
2984
2985         /*
2986          * Flush the vmstat data in current stock
2987          */
2988         if (stock->nr_slab_reclaimable_b || stock->nr_slab_unreclaimable_b) {
2989                 if (stock->nr_slab_reclaimable_b) {
2990                         mod_objcg_mlstate(old, stock->cached_pgdat,
2991                                           NR_SLAB_RECLAIMABLE_B,
2992                                           stock->nr_slab_reclaimable_b);
2993                         stock->nr_slab_reclaimable_b = 0;
2994                 }
2995                 if (stock->nr_slab_unreclaimable_b) {
2996                         mod_objcg_mlstate(old, stock->cached_pgdat,
2997                                           NR_SLAB_UNRECLAIMABLE_B,
2998                                           stock->nr_slab_unreclaimable_b);
2999                         stock->nr_slab_unreclaimable_b = 0;
3000                 }
3001                 stock->cached_pgdat = NULL;
3002         }
3003
3004         WRITE_ONCE(stock->cached_objcg, NULL);
3005         obj_cgroup_put(old);
3006 }
3007
3008 static bool obj_stock_flush_required(struct obj_stock_pcp *stock,
3009                                      struct mem_cgroup *root_memcg)
3010 {
3011         struct obj_cgroup *objcg = READ_ONCE(stock->cached_objcg);
3012         struct mem_cgroup *memcg;
3013         bool flush = false;
3014
3015         rcu_read_lock();
3016         if (objcg) {
3017                 memcg = obj_cgroup_memcg(objcg);
3018                 if (memcg && mem_cgroup_is_descendant(memcg, root_memcg))
3019                         flush = true;
3020         }
3021         rcu_read_unlock();
3022
3023         return flush;
3024 }
3025
3026 static void refill_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes,
3027                 bool allow_uncharge, int nr_acct, struct pglist_data *pgdat,
3028                 enum node_stat_item idx)
3029 {
3030         struct obj_stock_pcp *stock;
3031         unsigned int nr_pages = 0;
3032
3033         if (!local_trylock(&obj_stock.lock)) {
3034                 if (pgdat)
3035                         mod_objcg_mlstate(objcg, pgdat, idx, nr_bytes);
3036                 nr_pages = nr_bytes >> PAGE_SHIFT;
3037                 nr_bytes = nr_bytes & (PAGE_SIZE - 1);
3038                 atomic_add(nr_bytes, &objcg->nr_charged_bytes);
3039                 goto out;
3040         }
3041
3042         stock = this_cpu_ptr(&obj_stock);
3043         if (READ_ONCE(stock->cached_objcg) != objcg) { /* reset if necessary */
3044                 drain_obj_stock(stock);
3045                 obj_cgroup_get(objcg);
3046                 stock->nr_bytes = atomic_read(&objcg->nr_charged_bytes)
3047                                 ? atomic_xchg(&objcg->nr_charged_bytes, 0) : 0;
3048                 WRITE_ONCE(stock->cached_objcg, objcg);
3049
3050                 allow_uncharge = true;  /* Allow uncharge when objcg changes */
3051         }
3052         stock->nr_bytes += nr_bytes;
3053
3054         if (pgdat)
3055                 __account_obj_stock(objcg, stock, nr_acct, pgdat, idx);
3056
3057         if (allow_uncharge && (stock->nr_bytes > PAGE_SIZE)) {
3058                 nr_pages = stock->nr_bytes >> PAGE_SHIFT;
3059                 stock->nr_bytes &= (PAGE_SIZE - 1);
3060         }
3061
3062         local_unlock(&obj_stock.lock);
3063 out:
3064         if (nr_pages)
3065                 obj_cgroup_uncharge_pages(objcg, nr_pages);
3066 }
3067
3068 static int obj_cgroup_charge_account(struct obj_cgroup *objcg, gfp_t gfp, size_t size,
3069                                      struct pglist_data *pgdat, enum node_stat_item idx)
3070 {
3071         unsigned int nr_pages, nr_bytes;
3072         int ret;
3073
3074         if (likely(consume_obj_stock(objcg, size, pgdat, idx)))
3075                 return 0;
3076
3077         /*
3078          * In theory, objcg->nr_charged_bytes can have enough
3079          * pre-charged bytes to satisfy the allocation. However,
3080          * flushing objcg->nr_charged_bytes requires two atomic
3081          * operations, and objcg->nr_charged_bytes can't be big.
3082          * The shared objcg->nr_charged_bytes can also become a
3083          * performance bottleneck if all tasks of the same memcg are
3084          * trying to update it. So it's better to ignore it and try
3085          * trying to update it. So it's better to ignore it and try to
3086          * grab some new pages. The stock's nr_bytes will be flushed to
3087          *
3088          * The stock's nr_bytes may contain enough pre-charged bytes
3089          * to allow one less page to be charged, but we can't rely
3090          * on the pre-charged bytes not being changed outside of
3091          * consume_obj_stock() or refill_obj_stock(). So ignore those
3092          * pre-charged bytes as well when charging pages. To avoid a
3093          * page uncharge right after a page charge, we set the
3094          * allow_uncharge flag to false when calling refill_obj_stock()
3095          * to temporarily allow the pre-charged bytes to exceed the page
3096          * size limit. The maximum reachable value of the pre-charged
3097          * bytes is (sizeof(object) + PAGE_SIZE - 2) if there is no data
3098          * race.
3099          */
3100         nr_pages = size >> PAGE_SHIFT;
3101         nr_bytes = size & (PAGE_SIZE - 1);
3102
3103         if (nr_bytes)
3104                 nr_pages += 1;
3105
3106         ret = obj_cgroup_charge_pages(objcg, gfp, nr_pages);
3107         if (!ret && (nr_bytes || pgdat))
3108                 refill_obj_stock(objcg, nr_bytes ? PAGE_SIZE - nr_bytes : 0,
3109                                          false, size, pgdat, idx);
3110
3111         return ret;
3112 }
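/*
 * Worked example, assuming PAGE_SIZE == 4096 (illustrative, not taken
 * from the code above): charging size == 4196 gives nr_pages == 1 and
 * nr_bytes == 100, so nr_pages is bumped to 2 whole pages and the
 * PAGE_SIZE - 100 == 3996 leftover bytes are stocked via
 * refill_obj_stock() with uncharging disallowed.
 */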
3113
3114 int obj_cgroup_charge(struct obj_cgroup *objcg, gfp_t gfp, size_t size)
3115 {
3116         return obj_cgroup_charge_account(objcg, gfp, size, NULL, 0);
3117 }
3118
3119 void obj_cgroup_uncharge(struct obj_cgroup *objcg, size_t size)
3120 {
3121         refill_obj_stock(objcg, size, true, 0, NULL, 0);
3122 }
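/*
 * Uncharging is thus just a stock refill with allow_uncharge == true:
 * once the per-cpu stock accumulates more than PAGE_SIZE bytes, whole
 * pages are handed back through obj_cgroup_uncharge_pages().
 */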
3123
3124 static inline size_t obj_full_size(struct kmem_cache *s)
3125 {
3126         /*
3127          * For each accounted object there is extra space used to store
3128          * its obj_cgroup membership. Charge it too.
3129          */
3130         return s->size + sizeof(struct obj_cgroup *);
3131 }
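/*
 * E.g. a cache with s->size == 64 is accounted as 64 +
 * sizeof(struct obj_cgroup *) == 72 bytes per object on a 64-bit
 * kernel (illustrative figures, assuming 8-byte pointers).
 */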
3132
3133 bool __memcg_slab_post_alloc_hook(struct kmem_cache *s, struct list_lru *lru,
3134                                   gfp_t flags, size_t size, void **p)
3135 {
3136         struct obj_cgroup *objcg;
3137         struct slab *slab;
3138         unsigned long off;
3139         size_t i;
3140
3141         /*
3142          * The obtained objcg pointer is safe to use within the current scope,
3143          * defined by the current task or a set_active_memcg() pair.
3144          * obj_cgroup_get() is used to get a permanent reference.
3145          */
3146         objcg = current_obj_cgroup();
3147         if (!objcg)
3148                 return true;
3149
3150         /*
3151          * slab_alloc_node() avoids the NULL check, so we might be called with a
3152          * single NULL object. kmem_cache_alloc_bulk() aborts if it can't fill
3153          * the whole requested size.
3154          * Return success, as there's nothing to free back.
3155          */
3156         if (unlikely(*p == NULL))
3157                 return true;
3158
3159         flags &= gfp_allowed_mask;
3160
3161         if (lru) {
3162                 int ret;
3163                 struct mem_cgroup *memcg;
3164
3165                 memcg = get_mem_cgroup_from_objcg(objcg);
3166                 ret = memcg_list_lru_alloc(memcg, lru, flags);
3167                 css_put(&memcg->css);
3168
3169                 if (ret)
3170                         return false;
3171         }
3172
3173         for (i = 0; i < size; i++) {
3174                 slab = virt_to_slab(p[i]);
3175
3176                 if (!slab_obj_exts(slab) &&
3177                     alloc_slab_obj_exts(slab, s, flags, false)) {
3178                         continue;
3179                 }
3180
3181                 /*
3182                  * If we fail and size is 1, memcg_alloc_abort_single() will
3183                  * just free the object, which is OK as we have not assigned
3184                  * an objcg to its obj_ext yet.
3185                  *
3186                  * For larger sizes, kmem_cache_free_bulk() will uncharge
3187                  * any objects that were already charged and had obj_ext assigned.
3188                  *
3189                  * TODO: we could batch this until slab_pgdat(slab) changes
3190                  * between iterations, with a more complicated undo
3191                  */
3192                 if (obj_cgroup_charge_account(objcg, flags, obj_full_size(s),
3193                                         slab_pgdat(slab), cache_vmstat_idx(s)))
3194                         return false;
3195
3196                 off = obj_to_index(s, slab, p[i]);
3197                 obj_cgroup_get(objcg);
3198                 slab_obj_exts(slab)[off].objcg = objcg;
3199         }
3200
3201         return true;
3202 }
3203
3204 void __memcg_slab_free_hook(struct kmem_cache *s, struct slab *slab,
3205                             void **p, int objects, struct slabobj_ext *obj_exts)
3206 {
3207         size_t obj_size = obj_full_size(s);
3208
3209         for (int i = 0; i < objects; i++) {
3210                 struct obj_cgroup *objcg;
3211                 unsigned int off;
3212
3213                 off = obj_to_index(s, slab, p[i]);
3214                 objcg = obj_exts[off].objcg;
3215                 if (!objcg)
3216                         continue;
3217
3218                 obj_exts[off].objcg = NULL;
3219                 refill_obj_stock(objcg, obj_size, true, -obj_size,
3220                                  slab_pgdat(slab), cache_vmstat_idx(s));
3221                 obj_cgroup_put(objcg);
3222         }
3223 }
3224
3225 /*
3226  * The objcg is only set on the first page, so transfer it to all the
3227  * other pages.
3228  */
3229 void split_page_memcg(struct page *page, unsigned order)
3230 {
3231         struct obj_cgroup *objcg = page_objcg(page);
3232         unsigned int i, nr = 1 << order;
3233
3234         if (!objcg)
3235                 return;
3236
3237         for (i = 1; i < nr; i++)
3238                 page_set_objcg(&page[i], objcg);
3239
3240         obj_cgroup_get_many(objcg, nr - 1);
3241 }
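/*
 * E.g. for an order-2 page: pages 1..3 inherit the objcg of page 0,
 * and obj_cgroup_get_many() takes the nr - 1 == 3 extra references
 * those tail pages now hold.
 */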
3242
3243 void folio_split_memcg_refs(struct folio *folio, unsigned old_order,
3244                 unsigned new_order)
3245 {
3246         unsigned new_refs;
3247
3248         if (mem_cgroup_disabled() || !folio_memcg_charged(folio))
3249                 return;
3250
3251         new_refs = (1 << (old_order - new_order)) - 1;
3252         css_get_many(&__folio_memcg(folio)->css, new_refs);
3253 }
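/*
 * E.g. splitting an order-4 folio into order-2 folios yields
 * 1 << (4 - 2) == 4 folios; the original folio keeps its existing
 * reference and css_get_many() adds new_refs == 3 for the rest.
 */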
3254
3255 unsigned long mem_cgroup_usage(struct mem_cgroup *memcg, bool swap)
3256 {
3257         unsigned long val;
3258
3259         if (mem_cgroup_is_root(memcg)) {
3260                 /*
3261                  * Approximate root's usage from global state. This isn't
3262                  * perfect, but the root usage was always an approximation.
3263                  */
3264                 val = global_node_page_state(NR_FILE_PAGES) +
3265                         global_node_page_state(NR_ANON_MAPPED);
3266                 if (swap)
3267                         val += total_swap_pages - get_nr_swap_pages();
3268         } else {
3269                 if (!swap)
3270                         val = page_counter_read(&memcg->memory);
3271                 else
3272                         val = page_counter_read(&memcg->memsw);
3273         }
3274         return val;
3275 }
3276
3277 static int memcg_online_kmem(struct mem_cgroup *memcg)
3278 {
3279         struct obj_cgroup *objcg;
3280
3281         if (mem_cgroup_kmem_disabled())
3282                 return 0;
3283
3284         if (unlikely(mem_cgroup_is_root(memcg)))
3285                 return 0;
3286
3287         objcg = obj_cgroup_alloc();
3288         if (!objcg)
3289                 return -ENOMEM;
3290
3291         objcg->memcg = memcg;
3292         rcu_assign_pointer(memcg->objcg, objcg);
3293         obj_cgroup_get(objcg);
3294         memcg->orig_objcg = objcg;
3295
3296         static_branch_enable(&memcg_kmem_online_key);
3297
3298         memcg->kmemcg_id = memcg->id.id;
3299
3300         return 0;
3301 }
3302
3303 static void memcg_offline_kmem(struct mem_cgroup *memcg)
3304 {
3305         struct mem_cgroup *parent;
3306
3307         if (mem_cgroup_kmem_disabled())
3308                 return;
3309
3310         if (unlikely(mem_cgroup_is_root(memcg)))
3311                 return;
3312
3313         parent = parent_mem_cgroup(memcg);
3314         if (!parent)
3315                 parent = root_mem_cgroup;
3316
3317         memcg_reparent_list_lrus(memcg, parent);
3318
3319         /*
3320          * The objcg must be reparented after the list_lrus, so that list_lru
3321          * helpers won't use the parent's list_lru until the child is drained.
3322          */
3323         memcg_reparent_objcgs(memcg, parent);
3324 }
3325
3326 #ifdef CONFIG_CGROUP_WRITEBACK
3327
3328 #include <trace/events/writeback.h>
3329
3330 static int memcg_wb_domain_init(struct mem_cgroup *memcg, gfp_t gfp)
3331 {
3332         return wb_domain_init(&memcg->cgwb_domain, gfp);
3333 }
3334
3335 static void memcg_wb_domain_exit(struct mem_cgroup *memcg)
3336 {
3337         wb_domain_exit(&memcg->cgwb_domain);
3338 }
3339
3340 static void memcg_wb_domain_size_changed(struct mem_cgroup *memcg)
3341 {
3342         wb_domain_size_changed(&memcg->cgwb_domain);
3343 }
3344
3345 struct wb_domain *mem_cgroup_wb_domain(struct bdi_writeback *wb)
3346 {
3347         struct mem_cgroup *memcg = mem_cgroup_from_css(wb->memcg_css);
3348
3349         if (!memcg->css.parent)
3350                 return NULL;
3351
3352         return &memcg->cgwb_domain;
3353 }
3354
3355 /**
3356  * mem_cgroup_wb_stats - retrieve writeback related stats from its memcg
3357  * @wb: bdi_writeback in question
3358  * @pfilepages: out parameter for number of file pages
3359  * @pheadroom: out parameter for number of allocatable pages according to memcg
3360  * @pdirty: out parameter for number of dirty pages
3361  * @pwriteback: out parameter for number of pages under writeback
3362  *
3363  * Determine the numbers of file, headroom, dirty, and writeback pages in
3364  * @wb's memcg.  File, dirty and writeback are self-explanatory.  Headroom
3365  * is a bit more involved.
3366  *
3367  * A memcg's headroom is "min(max, high) - used".  In the hierarchy, the
3368  * headroom is calculated as the lowest headroom of itself and the
3369  * ancestors.  Note that this doesn't consider the actual amount of
3370  * available memory in the system.  The caller should further cap
3371  * *@pheadroom accordingly.
3372  */
3373 void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pfilepages,
3374                          unsigned long *pheadroom, unsigned long *pdirty,
3375                          unsigned long *pwriteback)
3376 {
3377         struct mem_cgroup *memcg = mem_cgroup_from_css(wb->memcg_css);
3378         struct mem_cgroup *parent;
3379
3380         mem_cgroup_flush_stats_ratelimited(memcg);
3381
3382         *pdirty = memcg_page_state(memcg, NR_FILE_DIRTY);
3383         *pwriteback = memcg_page_state(memcg, NR_WRITEBACK);
3384         *pfilepages = memcg_page_state(memcg, NR_INACTIVE_FILE) +
3385                         memcg_page_state(memcg, NR_ACTIVE_FILE);
3386
3387         *pheadroom = PAGE_COUNTER_MAX;
3388         while ((parent = parent_mem_cgroup(memcg))) {
3389                 unsigned long ceiling = min(READ_ONCE(memcg->memory.max),
3390                                             READ_ONCE(memcg->memory.high));
3391                 unsigned long used = page_counter_read(&memcg->memory);
3392
3393                 *pheadroom = min(*pheadroom, ceiling - min(ceiling, used));
3394                 memcg = parent;
3395         }
3396 }
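/*
 * Headroom sketch with made-up numbers, in pages: a memcg with
 * memory.max == 1048576, memory.high unset (PAGE_COUNTER_MAX) and
 * usage == 786432 contributes a ceiling of 1048576 and a headroom of
 * 262144; *pheadroom ends up as the minimum of that and every
 * ancestor's headroom.
 */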
3397
3398 /*
3399  * Foreign dirty flushing
3400  *
3401  * There's an inherent mismatch between memcg and writeback.  The former
3402  * tracks ownership per-page while the latter per-inode.  This was a
3403  * deliberate design decision because honoring per-page ownership in the
3404  * writeback path is complicated, may lead to higher CPU and IO overheads
3405  * and deemed unnecessary given that write-sharing an inode across
3406  * different cgroups isn't a common use-case.
3407  *
3408  * Combined with inode majority-writer ownership switching, this works well
3409  * enough in most cases but there are some pathological cases.  For
3410  * example, let's say there are two cgroups A and B which keep writing to
3411  * different but confined parts of the same inode.  B owns the inode and
3412  * A's memory is limited far below B's.  A's dirty ratio can rise enough to
3413  * trigger balance_dirty_pages() sleeps but B's can be low enough to avoid
3414  * triggering background writeback.  A will be slowed down without a way to
3415  * make writeback of the dirty pages happen.
3416  *
3417  * Conditions like the above can lead to a cgroup getting repeatedly and
3418  * severely throttled after making some progress after each
3419  * dirty_expire_interval while the underlying IO device is almost
3420  * completely idle.
3421  *
3422  * Solving this problem completely requires matching the ownership tracking
3423  * granularities between memcg and writeback in either direction.  However,
3424  * the more egregious behaviors can be avoided by simply remembering the
3425  * most recent foreign dirtying events and initiating remote flushes on
3426  * them when local writeback isn't enough to keep the memory clean enough.
3427  *
3428  * The following two functions implement such a mechanism.  When a foreign
3429  * page - a page whose memcg and writeback ownerships don't match - is
3430  * dirtied, mem_cgroup_track_foreign_dirty() records the inode owning
3431  * bdi_writeback on the page owning memcg.  When balance_dirty_pages()
3432  * decides that the memcg needs to sleep due to high dirty ratio, it calls
3433  * mem_cgroup_flush_foreign() which queues writeback on the recorded
3434  * foreign bdi_writebacks which haven't expired.  Both the numbers of
3435  * recorded bdi_writebacks and concurrent in-flight foreign writebacks are
3436  * limited to MEMCG_CGWB_FRN_CNT.
3437  *
3438  * The mechanism only remembers IDs and doesn't hold any object references.
3439  * As being wrong occasionally doesn't matter, updates and accesses to the
3440  * records are lockless and racy.
3441  */
3442 void mem_cgroup_track_foreign_dirty_slowpath(struct folio *folio,
3443                                              struct bdi_writeback *wb)
3444 {
3445         struct mem_cgroup *memcg = folio_memcg(folio);
3446         struct memcg_cgwb_frn *frn;
3447         u64 now = get_jiffies_64();
3448         u64 oldest_at = now;
3449         int oldest = -1;
3450         int i;
3451
3452         trace_track_foreign_dirty(folio, wb);
3453
3454         /*
3455          * Pick the slot to use.  If there is already a slot for @wb, keep
3456          * using it.  If not, replace the oldest one that isn't being
3457          * written out.
3458          */
3459         for (i = 0; i < MEMCG_CGWB_FRN_CNT; i++) {
3460                 frn = &memcg->cgwb_frn[i];
3461                 if (frn->bdi_id == wb->bdi->id &&
3462                     frn->memcg_id == wb->memcg_css->id)
3463                         break;
3464                 if (time_before64(frn->at, oldest_at) &&
3465                     atomic_read(&frn->done.cnt) == 1) {
3466                         oldest = i;
3467                         oldest_at = frn->at;
3468                 }
3469         }
3470
3471         if (i < MEMCG_CGWB_FRN_CNT) {
3472                 /*
3473                  * Re-using an existing one.  Update timestamp lazily to
3474                  * avoid making the cacheline hot.  We want them to be
3475                  * reasonably up-to-date and significantly shorter than
3476                  * dirty_expire_interval as that's what expires the record.
3477                  * Use the shorter of 1s and dirty_expire_interval / 8.
3478                  */
3479                 unsigned long update_intv =
3480                         min_t(unsigned long, HZ,
3481                               msecs_to_jiffies(dirty_expire_interval * 10) / 8);
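                /*
                 * E.g. with the default dirty_expire_interval of 3000
                 * centisecs (30s), 3000 * 10 msecs / 8 == 3.75s, so the
                 * HZ (1s) cap is what applies in practice.
                 */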
3482
3483                 if (time_before64(frn->at, now - update_intv))
3484                         frn->at = now;
3485         } else if (oldest >= 0) {
3486                 /* replace the oldest free one */
3487                 frn = &memcg->cgwb_frn[oldest];
3488                 frn->bdi_id = wb->bdi->id;
3489                 frn->memcg_id = wb->memcg_css->id;
3490                 frn->at = now;
3491         }
3492 }
3493
3494 /* issue foreign writeback flushes for recorded foreign dirtying events */
3495 void mem_cgroup_flush_foreign(struct bdi_writeback *wb)
3496 {
3497         struct mem_cgroup *memcg = mem_cgroup_from_css(wb->memcg_css);
3498         unsigned long intv = msecs_to_jiffies(dirty_expire_interval * 10);
3499         u64 now = jiffies_64;
3500         int i;
3501
3502         for (i = 0; i < MEMCG_CGWB_FRN_CNT; i++) {
3503                 struct memcg_cgwb_frn *frn = &memcg->cgwb_frn[i];
3504
3505                 /*
3506                  * If the record is older than dirty_expire_interval,
3507                  * writeback on it has already started.  No need to kick it
3508                  * off again.  Also, don't start a new one if there's
3509                  * already one in flight.
3510                  */
3511                 if (time_after64(frn->at, now - intv) &&
3512                     atomic_read(&frn->done.cnt) == 1) {
3513                         frn->at = 0;
3514                         trace_flush_foreign(wb, frn->bdi_id, frn->memcg_id);
3515                         cgroup_writeback_by_id(frn->bdi_id, frn->memcg_id,
3516                                                WB_REASON_FOREIGN_FLUSH,
3517                                                &frn->done);
3518                 }
3519         }
3520 }
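/*
 * In both functions above, frn->done.cnt == 1 means no foreign
 * writeback is in flight for that record: the completion starts out
 * at one (see __WB_COMPLETION_INIT()) and is only raised while work
 * queued via cgroup_writeback_by_id() is outstanding.
 */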
3521
3522 #else   /* CONFIG_CGROUP_WRITEBACK */
3523
3524 static int memcg_wb_domain_init(struct mem_cgroup *memcg, gfp_t gfp)
3525 {
3526         return 0;
3527 }
3528
3529 static void memcg_wb_domain_exit(struct mem_cgroup *memcg)
3530 {
3531 }
3532
3533 static void memcg_wb_domain_size_changed(struct mem_cgroup *memcg)
3534 {
3535 }
3536
3537 #endif  /* CONFIG_CGROUP_WRITEBACK */
3538
3539 /*
3540  * Private memory cgroup IDR
3541  *
3542  * Swap-out records and page cache shadow entries need to store memcg
3543  * references in constrained space, so we maintain an ID space that is
3544  * restricted to 16 bits (MEM_CGROUP_ID_MAX), limiting the total number of
3545  * memory-controlled cgroups to 64k.
3546  *
3547  * However, there usually are many references to the offline CSS after
3548  * the cgroup has been destroyed, such as page cache or reclaimable
3549  * slab objects, that don't need to hang on to the ID. We want to keep
3550  * those dead CSS from occupying IDs, or we might quickly exhaust the
3551  * relatively small ID space and prevent the creation of new cgroups
3552  * even when there are far fewer than 64k cgroups - possibly none.
3553  *
3554  * Maintain a private 16-bit ID space for memcg, and allow the ID to
3555  * be freed and recycled when it's no longer needed, which is usually
3556  * when the CSS is offlined.
3557  *
3558  * The only exception to that are records of swapped out tmpfs/shmem
3559  * pages that need to be attributed to live ancestors on swapin. But
3560  * those references are manageable from userspace.
3561  */
3562
3563 #define MEM_CGROUP_ID_MAX       ((1UL << MEM_CGROUP_ID_SHIFT) - 1)
3564 static DEFINE_XARRAY_ALLOC1(mem_cgroup_ids);
3565
3566 static void mem_cgroup_id_remove(struct mem_cgroup *memcg)
3567 {
3568         if (memcg->id.id > 0) {
3569                 xa_erase(&mem_cgroup_ids, memcg->id.id);
3570                 memcg->id.id = 0;
3571         }
3572 }
3573
3574 void __maybe_unused mem_cgroup_id_get_many(struct mem_cgroup *memcg,
3575                                            unsigned int n)
3576 {
3577         refcount_add(n, &memcg->id.ref);
3578 }
3579
3580 static void mem_cgroup_id_put_many(struct mem_cgroup *memcg, unsigned int n)
3581 {
3582         if (refcount_sub_and_test(n, &memcg->id.ref)) {
3583                 mem_cgroup_id_remove(memcg);
3584
3585                 /* Memcg ID pins CSS */
3586                 css_put(&memcg->css);
3587         }
3588 }
3589
3590 static inline void mem_cgroup_id_put(struct mem_cgroup *memcg)
3591 {
3592         mem_cgroup_id_put_many(memcg, 1);
3593 }
3594
3595 struct mem_cgroup *mem_cgroup_id_get_online(struct mem_cgroup *memcg)
3596 {
3597         while (!refcount_inc_not_zero(&memcg->id.ref)) {
3598                 /*
3599                  * The root cgroup cannot be destroyed, so its refcount must
3600                  * always be >= 1.
3601                  */
3602                 if (WARN_ON_ONCE(mem_cgroup_is_root(memcg))) {
3603                         VM_BUG_ON(1);
3604                         break;
3605                 }
3606                 memcg = parent_mem_cgroup(memcg);
3607                 if (!memcg)
3608                         memcg = root_mem_cgroup;
3609         }
3610         return memcg;
3611 }
3612
3613 /**
3614  * mem_cgroup_from_id - look up a memcg from a memcg id
3615  * @id: the memcg id to look up
3616  *
3617  * Caller must hold rcu_read_lock().
3618  */
3619 struct mem_cgroup *mem_cgroup_from_id(unsigned short id)
3620 {
3621         WARN_ON_ONCE(!rcu_read_lock_held());
3622         return xa_load(&mem_cgroup_ids, id);
3623 }
3624
3625 #ifdef CONFIG_SHRINKER_DEBUG
3626 struct mem_cgroup *mem_cgroup_get_from_ino(unsigned long ino)
3627 {
3628         struct cgroup *cgrp;
3629         struct cgroup_subsys_state *css;
3630         struct mem_cgroup *memcg;
3631
3632         cgrp = cgroup_get_from_id(ino);
3633         if (IS_ERR(cgrp))
3634                 return ERR_CAST(cgrp);
3635
3636         css = cgroup_get_e_css(cgrp, &memory_cgrp_subsys);
3637         if (css)
3638                 memcg = container_of(css, struct mem_cgroup, css);
3639         else
3640                 memcg = ERR_PTR(-ENOENT);
3641
3642         cgroup_put(cgrp);
3643
3644         return memcg;
3645 }
3646 #endif
3647
3648 static void free_mem_cgroup_per_node_info(struct mem_cgroup_per_node *pn)
3649 {
3650         if (!pn)
3651                 return;
3652
3653         free_percpu(pn->lruvec_stats_percpu);
3654         kfree(pn->lruvec_stats);
3655         kfree(pn);
3656 }
3657
3658 static bool alloc_mem_cgroup_per_node_info(struct mem_cgroup *memcg, int node)
3659 {
3660         struct mem_cgroup_per_node *pn;
3661
3662         pn = kmem_cache_alloc_node(memcg_pn_cachep, GFP_KERNEL | __GFP_ZERO,
3663                                    node);
3664         if (!pn)
3665                 return false;
3666
3667         pn->lruvec_stats = kzalloc_node(sizeof(struct lruvec_stats),
3668                                         GFP_KERNEL_ACCOUNT, node);
3669         if (!pn->lruvec_stats)
3670                 goto fail;
3671
3672         pn->lruvec_stats_percpu = alloc_percpu_gfp(struct lruvec_stats_percpu,
3673                                                    GFP_KERNEL_ACCOUNT);
3674         if (!pn->lruvec_stats_percpu)
3675                 goto fail;
3676
3677         lruvec_init(&pn->lruvec);
3678         pn->memcg = memcg;
3679
3680         memcg->nodeinfo[node] = pn;
3681         return true;
3682 fail:
3683         free_mem_cgroup_per_node_info(pn);
3684         return false;
3685 }
3686
3687 static void __mem_cgroup_free(struct mem_cgroup *memcg)
3688 {
3689         int node;
3690
3691         obj_cgroup_put(memcg->orig_objcg);
3692
3693         for_each_node(node)
3694                 free_mem_cgroup_per_node_info(memcg->nodeinfo[node]);
3695         memcg1_free_events(memcg);
3696         kfree(memcg->vmstats);
3697         free_percpu(memcg->vmstats_percpu);
3698         kfree(memcg);
3699 }
3700
3701 static void mem_cgroup_free(struct mem_cgroup *memcg)
3702 {
3703         lru_gen_exit_memcg(memcg);
3704         memcg_wb_domain_exit(memcg);
3705         __mem_cgroup_free(memcg);
3706 }
3707
3708 static struct mem_cgroup *mem_cgroup_alloc(struct mem_cgroup *parent)
3709 {
3710         struct memcg_vmstats_percpu *statc;
3711         struct memcg_vmstats_percpu __percpu *pstatc_pcpu;
3712         struct mem_cgroup *memcg;
3713         int node, cpu;
3714         int __maybe_unused i;
3715         long error;
3716
3717         memcg = kmem_cache_zalloc(memcg_cachep, GFP_KERNEL);
3718         if (!memcg)
3719                 return ERR_PTR(-ENOMEM);
3720
3721         error = xa_alloc(&mem_cgroup_ids, &memcg->id.id, NULL,
3722                          XA_LIMIT(1, MEM_CGROUP_ID_MAX), GFP_KERNEL);
3723         if (error)
3724                 goto fail;
3725         error = -ENOMEM;
3726
3727         memcg->vmstats = kzalloc(sizeof(struct memcg_vmstats),
3728                                  GFP_KERNEL_ACCOUNT);
3729         if (!memcg->vmstats)
3730                 goto fail;
3731
3732         memcg->vmstats_percpu = alloc_percpu_gfp(struct memcg_vmstats_percpu,
3733                                                  GFP_KERNEL_ACCOUNT);
3734         if (!memcg->vmstats_percpu)
3735                 goto fail;
3736
3737         if (!memcg1_alloc_events(memcg))
3738                 goto fail;
3739
3740         for_each_possible_cpu(cpu) {
3741                 if (parent)
3742                         pstatc_pcpu = parent->vmstats_percpu;
3743                 statc = per_cpu_ptr(memcg->vmstats_percpu, cpu);
3744                 statc->parent_pcpu = parent ? pstatc_pcpu : NULL;
3745                 statc->vmstats = memcg->vmstats;
3746         }
3747
3748         for_each_node(node)
3749                 if (!alloc_mem_cgroup_per_node_info(memcg, node))
3750                         goto fail;
3751
3752         if (memcg_wb_domain_init(memcg, GFP_KERNEL))
3753                 goto fail;
3754
3755         INIT_WORK(&memcg->high_work, high_work_func);
3756         vmpressure_init(&memcg->vmpressure);
3757         INIT_LIST_HEAD(&memcg->memory_peaks);
3758         INIT_LIST_HEAD(&memcg->swap_peaks);
3759         spin_lock_init(&memcg->peaks_lock);
3760         memcg->socket_pressure = jiffies;
3761         memcg1_memcg_init(memcg);
3762         memcg->kmemcg_id = -1;
3763         INIT_LIST_HEAD(&memcg->objcg_list);
3764 #ifdef CONFIG_CGROUP_WRITEBACK
3765         INIT_LIST_HEAD(&memcg->cgwb_list);
3766         for (i = 0; i < MEMCG_CGWB_FRN_CNT; i++)
3767                 memcg->cgwb_frn[i].done =
3768                         __WB_COMPLETION_INIT(&memcg_cgwb_frn_waitq);
3769 #endif
3770 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
3771         spin_lock_init(&memcg->deferred_split_queue.split_queue_lock);
3772         INIT_LIST_HEAD(&memcg->deferred_split_queue.split_queue);
3773         memcg->deferred_split_queue.split_queue_len = 0;
3774 #endif
3775         lru_gen_init_memcg(memcg);
3776         return memcg;
3777 fail:
3778         mem_cgroup_id_remove(memcg);
3779         __mem_cgroup_free(memcg);
3780         return ERR_PTR(error);
3781 }
3782
3783 static struct cgroup_subsys_state * __ref
3784 mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
3785 {
3786         struct mem_cgroup *parent = mem_cgroup_from_css(parent_css);
3787         struct mem_cgroup *memcg, *old_memcg;
3788         bool memcg_on_dfl = cgroup_subsys_on_dfl(memory_cgrp_subsys);
3789
3790         old_memcg = set_active_memcg(parent);
3791         memcg = mem_cgroup_alloc(parent);
3792         set_active_memcg(old_memcg);
3793         if (IS_ERR(memcg))
3794                 return ERR_CAST(memcg);
3795
3796         page_counter_set_high(&memcg->memory, PAGE_COUNTER_MAX);
3797         memcg1_soft_limit_reset(memcg);
3798 #ifdef CONFIG_ZSWAP
3799         memcg->zswap_max = PAGE_COUNTER_MAX;
3800         WRITE_ONCE(memcg->zswap_writeback, true);
3801 #endif
3802         page_counter_set_high(&memcg->swap, PAGE_COUNTER_MAX);
3803         if (parent) {
3804                 WRITE_ONCE(memcg->swappiness, mem_cgroup_swappiness(parent));
3805
3806                 page_counter_init(&memcg->memory, &parent->memory, memcg_on_dfl);
3807                 page_counter_init(&memcg->swap, &parent->swap, false);
3808 #ifdef CONFIG_MEMCG_V1
3809                 memcg->memory.track_failcnt = !memcg_on_dfl;
3810                 WRITE_ONCE(memcg->oom_kill_disable, READ_ONCE(parent->oom_kill_disable));
3811                 page_counter_init(&memcg->kmem, &parent->kmem, false);
3812                 page_counter_init(&memcg->tcpmem, &parent->tcpmem, false);
3813 #endif
3814         } else {
3815                 init_memcg_stats();
3816                 init_memcg_events();
3817                 page_counter_init(&memcg->memory, NULL, true);
3818                 page_counter_init(&memcg->swap, NULL, false);
3819 #ifdef CONFIG_MEMCG_V1
3820                 page_counter_init(&memcg->kmem, NULL, false);
3821                 page_counter_init(&memcg->tcpmem, NULL, false);
3822 #endif
3823                 root_mem_cgroup = memcg;
3824                 return &memcg->css;
3825         }
3826
3827         if (memcg_on_dfl && !cgroup_memory_nosocket)
3828                 static_branch_inc(&memcg_sockets_enabled_key);
3829
3830         if (!cgroup_memory_nobpf)
3831                 static_branch_inc(&memcg_bpf_enabled_key);
3832
3833         return &memcg->css;
3834 }
3835
3836 static int mem_cgroup_css_online(struct cgroup_subsys_state *css)
3837 {
3838         struct mem_cgroup *memcg = mem_cgroup_from_css(css);
3839
3840         if (memcg_online_kmem(memcg))
3841                 goto remove_id;
3842
3843         /*
3844          * A memcg must be visible for expand_shrinker_info()
3845          * by the time the maps are allocated. So, we allocate maps
3846          * here, when for_each_mem_cgroup() can't skip it.
3847          */
3848         if (alloc_shrinker_info(memcg))
3849                 goto offline_kmem;
3850
3851         if (unlikely(mem_cgroup_is_root(memcg)) && !mem_cgroup_disabled())
3852                 queue_delayed_work(system_unbound_wq, &stats_flush_dwork,
3853                                    FLUSH_TIME);
3854         lru_gen_online_memcg(memcg);
3855
3856         /* Online state pins memcg ID, memcg ID pins CSS */
3857         refcount_set(&memcg->id.ref, 1);
3858         css_get(css);
3859
3860         /*
3861          * Ensure mem_cgroup_from_id() works once we're fully online.
3862          *
3863          * We could do this earlier and require callers to filter with
3864          * css_tryget_online(). But right now there are no users that
3865          * need earlier access, and the workingset code relies on the
3866          * cgroup tree linkage (mem_cgroup_get_nr_swap_pages()). So
3867          * publish it here at the end of onlining. This matches the
3868          * regular ID destruction during offlining.
3869          */
3870         xa_store(&mem_cgroup_ids, memcg->id.id, memcg, GFP_KERNEL);
3871
3872         return 0;
3873 offline_kmem:
3874         memcg_offline_kmem(memcg);
3875 remove_id:
3876         mem_cgroup_id_remove(memcg);
3877         return -ENOMEM;
3878 }
3879
3880 static void mem_cgroup_css_offline(struct cgroup_subsys_state *css)
3881 {
3882         struct mem_cgroup *memcg = mem_cgroup_from_css(css);
3883
3884         memcg1_css_offline(memcg);
3885
3886         page_counter_set_min(&memcg->memory, 0);
3887         page_counter_set_low(&memcg->memory, 0);
3888
3889         zswap_memcg_offline_cleanup(memcg);
3890
3891         memcg_offline_kmem(memcg);
3892         reparent_shrinker_deferred(memcg);
3893         wb_memcg_offline(memcg);
3894         lru_gen_offline_memcg(memcg);
3895
3896         drain_all_stock(memcg);
3897
3898         mem_cgroup_id_put(memcg);
3899 }
3900
3901 static void mem_cgroup_css_released(struct cgroup_subsys_state *css)
3902 {
3903         struct mem_cgroup *memcg = mem_cgroup_from_css(css);
3904
3905         invalidate_reclaim_iterators(memcg);
3906         lru_gen_release_memcg(memcg);
3907 }
3908
3909 static void mem_cgroup_css_free(struct cgroup_subsys_state *css)
3910 {
3911         struct mem_cgroup *memcg = mem_cgroup_from_css(css);
3912         int __maybe_unused i;
3913
3914 #ifdef CONFIG_CGROUP_WRITEBACK
3915         for (i = 0; i < MEMCG_CGWB_FRN_CNT; i++)
3916                 wb_wait_for_completion(&memcg->cgwb_frn[i].done);
3917 #endif
3918         if (cgroup_subsys_on_dfl(memory_cgrp_subsys) && !cgroup_memory_nosocket)
3919                 static_branch_dec(&memcg_sockets_enabled_key);
3920
3921         if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && memcg1_tcpmem_active(memcg))
3922                 static_branch_dec(&memcg_sockets_enabled_key);
3923
3924         if (!cgroup_memory_nobpf)
3925                 static_branch_dec(&memcg_bpf_enabled_key);
3926
3927         vmpressure_cleanup(&memcg->vmpressure);
3928         cancel_work_sync(&memcg->high_work);
3929         memcg1_remove_from_trees(memcg);
3930         free_shrinker_info(memcg);
3931         mem_cgroup_free(memcg);
3932 }
3933
3934 /**
3935  * mem_cgroup_css_reset - reset the states of a mem_cgroup
3936  * @css: the target css
3937  *
3938  * Reset the states of the mem_cgroup associated with @css.  This is
3939  * invoked when the userland requests disabling on the default hierarchy
3940  * but the memcg is pinned through dependency.  The memcg should stop
3941  * applying policies and should revert to the vanilla state as it may be
3942  * made visible again.
3943  *
3944  * The current implementation only resets the essential configurations.
3945  * This needs to be expanded to cover all the visible parts.
3946  */
3947 static void mem_cgroup_css_reset(struct cgroup_subsys_state *css)
3948 {
3949         struct mem_cgroup *memcg = mem_cgroup_from_css(css);
3950
3951         page_counter_set_max(&memcg->memory, PAGE_COUNTER_MAX);
3952         page_counter_set_max(&memcg->swap, PAGE_COUNTER_MAX);
3953 #ifdef CONFIG_MEMCG_V1
3954         page_counter_set_max(&memcg->kmem, PAGE_COUNTER_MAX);
3955         page_counter_set_max(&memcg->tcpmem, PAGE_COUNTER_MAX);
3956 #endif
3957         page_counter_set_min(&memcg->memory, 0);
3958         page_counter_set_low(&memcg->memory, 0);
3959         page_counter_set_high(&memcg->memory, PAGE_COUNTER_MAX);
3960         memcg1_soft_limit_reset(memcg);
3961         page_counter_set_high(&memcg->swap, PAGE_COUNTER_MAX);
3962         memcg_wb_domain_size_changed(memcg);
3963 }
3964
3965 struct aggregate_control {
3966         /* pointer to the aggregated (CPU and subtree aggregated) counters */
3967         long *aggregate;
3968         /* pointer to the non-hierarchical (CPU aggregated) counters */
3969         long *local;
3970         /* pointer to the pending child counters during tree propagation */
3971         long *pending;
3972         /* pointer to the parent's pending counters, could be NULL */
3973         long *ppending;
3974         /* pointer to the percpu counters to be aggregated */
3975         long *cstat;
3976         /* pointer to the percpu counters of the last aggregation */
3977         long *cstat_prev;
3978         /* size of the above counters */
3979         int size;
3980 };
3981
3982 static void mem_cgroup_stat_aggregate(struct aggregate_control *ac)
3983 {
3984         int i;
3985         long delta, delta_cpu, v;
3986
3987         for (i = 0; i < ac->size; i++) {
3988                 /*
3989                  * Collect the aggregated propagation counts of groups
3990                  * below us. We're in a per-cpu loop here and this is
3991                  * a global counter, so the first cycle will get them.
3992                  */
3993                 delta = ac->pending[i];
3994                 if (delta)
3995                         ac->pending[i] = 0;
3996
3997                 /* Add CPU changes on this level since the last flush */
3998                 delta_cpu = 0;
3999                 v = READ_ONCE(ac->cstat[i]);
4000                 if (v != ac->cstat_prev[i]) {
4001                         delta_cpu = v - ac->cstat_prev[i];
4002                         delta += delta_cpu;
4003                         ac->cstat_prev[i] = v;
4004                 }
4005
4006                 /* Aggregate counts on this level and propagate upwards */
4007                 if (delta_cpu)
4008                         ac->local[i] += delta_cpu;
4009
4010                 if (delta) {
4011                         ac->aggregate[i] += delta;
4012                         if (ac->ppending)
4013                                 ac->ppending[i] += delta;
4014                 }
4015         }
4016 }
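/*
 * Numeric sketch for one counter slot (made-up values): with
 * pending == 5 queued by the children, cstat == 12 and
 * cstat_prev == 4, the flush adds delta_cpu == 8 to the local counter
 * and delta == 13 to the aggregate, and queues 13 in the parent's
 * pending slot.
 */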
4017
4018 #ifdef CONFIG_MEMCG_NMI_SAFETY_REQUIRES_ATOMIC
4019 static void flush_nmi_stats(struct mem_cgroup *memcg, struct mem_cgroup *parent,
4020                             int cpu)
4021 {
4022         int nid;
4023
4024         if (atomic_read(&memcg->kmem_stat)) {
4025                 int kmem = atomic_xchg(&memcg->kmem_stat, 0);
4026                 int index = memcg_stats_index(MEMCG_KMEM);
4027
4028                 memcg->vmstats->state[index] += kmem;
4029                 if (parent)
4030                         parent->vmstats->state_pending[index] += kmem;
4031         }
4032
4033         for_each_node_state(nid, N_MEMORY) {
4034                 struct mem_cgroup_per_node *pn = memcg->nodeinfo[nid];
4035                 struct lruvec_stats *lstats = pn->lruvec_stats;
4036                 struct lruvec_stats *plstats = NULL;
4037
4038                 if (parent)
4039                         plstats = parent->nodeinfo[nid]->lruvec_stats;
4040
4041                 if (atomic_read(&pn->slab_reclaimable)) {
4042                         int slab = atomic_xchg(&pn->slab_reclaimable, 0);
4043                         int index = memcg_stats_index(NR_SLAB_RECLAIMABLE_B);
4044
4045                         lstats->state[index] += slab;
4046                         if (plstats)
4047                                 plstats->state_pending[index] += slab;
4048                 }
4049                 if (atomic_read(&pn->slab_unreclaimable)) {
4050                         int slab = atomic_xchg(&pn->slab_unreclaimable, 0);
4051                         int index = memcg_stats_index(NR_SLAB_UNRECLAIMABLE_B);
4052
4053                         lstats->state[index] += slab;
4054                         if (plstats)
4055                                 plstats->state_pending[index] += slab;
4056                 }
4057         }
4058 }
4059 #else
4060 static void flush_nmi_stats(struct mem_cgroup *memcg, struct mem_cgroup *parent,
4061                             int cpu)
4062 {}
4063 #endif
4064
4065 static void mem_cgroup_css_rstat_flush(struct cgroup_subsys_state *css, int cpu)
4066 {
4067         struct mem_cgroup *memcg = mem_cgroup_from_css(css);
4068         struct mem_cgroup *parent = parent_mem_cgroup(memcg);
4069         struct memcg_vmstats_percpu *statc;
4070         struct aggregate_control ac;
4071         int nid;
4072
4073         flush_nmi_stats(memcg, parent, cpu);
4074
4075         statc = per_cpu_ptr(memcg->vmstats_percpu, cpu);
4076
4077         ac = (struct aggregate_control) {
4078                 .aggregate = memcg->vmstats->state,
4079                 .local = memcg->vmstats->state_local,
4080                 .pending = memcg->vmstats->state_pending,
4081                 .ppending = parent ? parent->vmstats->state_pending : NULL,
4082                 .cstat = statc->state,
4083                 .cstat_prev = statc->state_prev,
4084                 .size = MEMCG_VMSTAT_SIZE,
4085         };
4086         mem_cgroup_stat_aggregate(&ac);
4087
4088         ac = (struct aggregate_control) {
4089                 .aggregate = memcg->vmstats->events,
4090                 .local = memcg->vmstats->events_local,
4091                 .pending = memcg->vmstats->events_pending,
4092                 .ppending = parent ? parent->vmstats->events_pending : NULL,
4093                 .cstat = statc->events,
4094                 .cstat_prev = statc->events_prev,
4095                 .size = NR_MEMCG_EVENTS,
4096         };
4097         mem_cgroup_stat_aggregate(&ac);
4098
4099         for_each_node_state(nid, N_MEMORY) {
4100                 struct mem_cgroup_per_node *pn = memcg->nodeinfo[nid];
4101                 struct lruvec_stats *lstats = pn->lruvec_stats;
4102                 struct lruvec_stats *plstats = NULL;
4103                 struct lruvec_stats_percpu *lstatc;
4104
4105                 if (parent)
4106                         plstats = parent->nodeinfo[nid]->lruvec_stats;
4107
4108                 lstatc = per_cpu_ptr(pn->lruvec_stats_percpu, cpu);
4109
4110                 ac = (struct aggregate_control) {
4111                         .aggregate = lstats->state,
4112                         .local = lstats->state_local,
4113                         .pending = lstats->state_pending,
4114                         .ppending = plstats ? plstats->state_pending : NULL,
4115                         .cstat = lstatc->state,
4116                         .cstat_prev = lstatc->state_prev,
4117                         .size = NR_MEMCG_NODE_STAT_ITEMS,
4118                 };
4119                 mem_cgroup_stat_aggregate(&ac);
4120
4121         }
4122         WRITE_ONCE(statc->stats_updates, 0);
4123         /* We are in a per-cpu loop here; only do the atomic write once */
4124         if (atomic_read(&memcg->vmstats->stats_updates))
4125                 atomic_set(&memcg->vmstats->stats_updates, 0);
4126 }
4127
4128 static void mem_cgroup_fork(struct task_struct *task)
4129 {
4130         /*
4131          * Set the update flag to cause task->objcg to be initialized lazily
4132          * on the first allocation. It can be done without any synchronization
4133          * because it's always performed on the current task, as does
4134          * current_objcg_update().
4135          */
4136         task->objcg = (struct obj_cgroup *)CURRENT_OBJCG_UPDATE_FLAG;
4137 }
4138
4139 static void mem_cgroup_exit(struct task_struct *task)
4140 {
4141         struct obj_cgroup *objcg = task->objcg;
4142
4143         objcg = (struct obj_cgroup *)
4144                 ((unsigned long)objcg & ~CURRENT_OBJCG_UPDATE_FLAG);
4145         obj_cgroup_put(objcg);
4146
4147         /*
4148          * Some kernel allocations can happen after this point,
4149          * but let's ignore them. It can be done without any synchronization
4150          * because it's always performed on the current task, so does
4151          * because it's always performed on the current task, as does
4152          */
4153         task->objcg = NULL;
4154 }
4155
4156 #ifdef CONFIG_LRU_GEN
4157 static void mem_cgroup_lru_gen_attach(struct cgroup_taskset *tset)
4158 {
4159         struct task_struct *task;
4160         struct cgroup_subsys_state *css;
4161
4162         /* find the first leader if there is any */
4163         cgroup_taskset_for_each_leader(task, css, tset)
4164                 break;
4165
4166         if (!task)
4167                 return;
4168
4169         task_lock(task);
4170         if (task->mm && READ_ONCE(task->mm->owner) == task)
4171                 lru_gen_migrate_mm(task->mm);
4172         task_unlock(task);
4173 }
4174 #else
4175 static void mem_cgroup_lru_gen_attach(struct cgroup_taskset *tset) {}
4176 #endif /* CONFIG_LRU_GEN */
4177
4178 static void mem_cgroup_kmem_attach(struct cgroup_taskset *tset)
4179 {
4180         struct task_struct *task;
4181         struct cgroup_subsys_state *css;
4182
4183         cgroup_taskset_for_each(task, css, tset) {
4184                 /* atomically set the update bit */
4185                 set_bit(CURRENT_OBJCG_UPDATE_BIT, (unsigned long *)&task->objcg);
4186         }
4187 }
4188
4189 static void mem_cgroup_attach(struct cgroup_taskset *tset)
4190 {
4191         mem_cgroup_lru_gen_attach(tset);
4192         mem_cgroup_kmem_attach(tset);
4193 }
4194
4195 static int seq_puts_memcg_tunable(struct seq_file *m, unsigned long value)
4196 {
4197         if (value == PAGE_COUNTER_MAX)
4198                 seq_puts(m, "max\n");
4199         else
4200                 seq_printf(m, "%llu\n", (u64)value * PAGE_SIZE);
4201
4202         return 0;
4203 }
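/*
 * E.g. a tunable of 25600 pages prints as "104857600\n" assuming
 * PAGE_SIZE == 4096, while PAGE_COUNTER_MAX prints as "max\n".
 */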
4204
4205 static u64 memory_current_read(struct cgroup_subsys_state *css,
4206                                struct cftype *cft)
4207 {
4208         struct mem_cgroup *memcg = mem_cgroup_from_css(css);
4209
4210         return (u64)page_counter_read(&memcg->memory) * PAGE_SIZE;
4211 }
4212
4213 #define OFP_PEAK_UNSET (-1UL)
4214
4215 static int peak_show(struct seq_file *sf, void *v, struct page_counter *pc)
4216 {
4217         struct cgroup_of_peak *ofp = of_peak(sf->private);
4218         u64 fd_peak = READ_ONCE(ofp->value), peak;
4219
4220         /* User wants global or local peak? */
4221         if (fd_peak == OFP_PEAK_UNSET)
4222                 peak = pc->watermark;
4223         else
4224                 peak = max(fd_peak, READ_ONCE(pc->local_watermark));
4225
4226         seq_printf(sf, "%llu\n", peak * PAGE_SIZE);
4227         return 0;
4228 }
4229
4230 static int memory_peak_show(struct seq_file *sf, void *v)
4231 {
4232         struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(sf));
4233
4234         return peak_show(sf, v, &memcg->memory);
4235 }
4236
4237 static int peak_open(struct kernfs_open_file *of)
4238 {
4239         struct cgroup_of_peak *ofp = of_peak(of);
4240
4241         ofp->value = OFP_PEAK_UNSET;
4242         return 0;
4243 }
4244
4245 static void peak_release(struct kernfs_open_file *of)
4246 {
4247         struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
4248         struct cgroup_of_peak *ofp = of_peak(of);
4249
4250         if (ofp->value == OFP_PEAK_UNSET) {
4251                 /* fast path (no writes on this fd) */
4252                 return;
4253         }
4254         spin_lock(&memcg->peaks_lock);
4255         list_del(&ofp->list);
4256         spin_unlock(&memcg->peaks_lock);
4257 }
4258
4259 static ssize_t peak_write(struct kernfs_open_file *of, char *buf, size_t nbytes,
4260                           loff_t off, struct page_counter *pc,
4261                           struct list_head *watchers)
4262 {
4263         unsigned long usage;
4264         struct cgroup_of_peak *peer_ctx;
4265         struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
4266         struct cgroup_of_peak *ofp = of_peak(of);
4267
4268         spin_lock(&memcg->peaks_lock);
4269
4270         usage = page_counter_read(pc);
4271         WRITE_ONCE(pc->local_watermark, usage);
4272
4273         list_for_each_entry(peer_ctx, watchers, list)
4274                 if (usage > peer_ctx->value)
4275                         WRITE_ONCE(peer_ctx->value, usage);
4276
4277         /* initial write, register watcher */
4278         if (ofp->value == OFP_PEAK_UNSET)
4279                 list_add(&ofp->list, watchers);
4280
4281         WRITE_ONCE(ofp->value, usage);
4282         spin_unlock(&memcg->peaks_lock);
4283
4284         return nbytes;
4285 }
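/*
 * A write therefore resets the counter's local watermark to the
 * current usage, records that usage as this fd's baseline and
 * registers the fd as a watcher, so later reads via peak_show()
 * report the highest usage seen since the most recent write on
 * that fd.
 */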
4286
4287 static ssize_t memory_peak_write(struct kernfs_open_file *of, char *buf,
4288                                  size_t nbytes, loff_t off)
4289 {
4290         struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
4291
4292         return peak_write(of, buf, nbytes, off, &memcg->memory,
4293                           &memcg->memory_peaks);
4294 }
4295
4296 #undef OFP_PEAK_UNSET
4297
4298 static int memory_min_show(struct seq_file *m, void *v)
4299 {
4300         return seq_puts_memcg_tunable(m,
4301                 READ_ONCE(mem_cgroup_from_seq(m)->memory.min));
4302 }
4303
4304 static ssize_t memory_min_write(struct kernfs_open_file *of,
4305                                 char *buf, size_t nbytes, loff_t off)
4306 {
4307         struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
4308         unsigned long min;
4309         int err;
4310
4311         buf = strstrip(buf);
4312         err = page_counter_memparse(buf, "max", &min);
4313         if (err)
4314                 return err;
4315
4316         page_counter_set_min(&memcg->memory, min);
4317
4318         return nbytes;
4319 }
4320
4321 static int memory_low_show(struct seq_file *m, void *v)
4322 {
4323         return seq_puts_memcg_tunable(m,
4324                 READ_ONCE(mem_cgroup_from_seq(m)->memory.low));
4325 }
4326
4327 static ssize_t memory_low_write(struct kernfs_open_file *of,
4328                                 char *buf, size_t nbytes, loff_t off)
4329 {
4330         struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
4331         unsigned long low;
4332         int err;
4333
4334         buf = strstrip(buf);
4335         err = page_counter_memparse(buf, "max", &low);
4336         if (err)
4337                 return err;
4338
4339         page_counter_set_low(&memcg->memory, low);
4340
4341         return nbytes;
4342 }
4343
4344 static int memory_high_show(struct seq_file *m, void *v)
4345 {
4346         return seq_puts_memcg_tunable(m,
4347                 READ_ONCE(mem_cgroup_from_seq(m)->memory.high));
4348 }
4349
4350 static ssize_t memory_high_write(struct kernfs_open_file *of,
4351                                  char *buf, size_t nbytes, loff_t off)
4352 {
4353         struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
4354         unsigned int nr_retries = MAX_RECLAIM_RETRIES;
4355         bool drained = false;
4356         unsigned long high;
4357         int err;
4358
4359         buf = strstrip(buf);
4360         err = page_counter_memparse(buf, "max", &high);
4361         if (err)
4362                 return err;
4363
4364         page_counter_set_high(&memcg->memory, high);
4365
4366         if (of->file->f_flags & O_NONBLOCK)
4367                 goto out;
4368
4369         for (;;) {
4370                 unsigned long nr_pages = page_counter_read(&memcg->memory);
4371                 unsigned long reclaimed;
4372
4373                 if (nr_pages <= high)
4374                         break;
4375
4376                 if (signal_pending(current))
4377                         break;
4378
4379                 if (!drained) {
4380                         drain_all_stock(memcg);
4381                         drained = true;
4382                         continue;
4383                 }
4384
4385                 reclaimed = try_to_free_mem_cgroup_pages(memcg, nr_pages - high,
4386                                         GFP_KERNEL, MEMCG_RECLAIM_MAY_SWAP, NULL);
4387
4388                 if (!reclaimed && !nr_retries--)
4389                         break;
4390         }
4391 out:
4392         memcg_wb_domain_size_changed(memcg);
4393         return nbytes;
4394 }
4395
4396 static int memory_max_show(struct seq_file *m, void *v)
4397 {
4398         return seq_puts_memcg_tunable(m,
4399                 READ_ONCE(mem_cgroup_from_seq(m)->memory.max));
4400 }
4401
4402 static ssize_t memory_max_write(struct kernfs_open_file *of,
4403                                 char *buf, size_t nbytes, loff_t off)
4404 {
4405         struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
4406         unsigned int nr_reclaims = MAX_RECLAIM_RETRIES;
4407         bool drained = false;
4408         unsigned long max;
4409         int err;
4410
4411         buf = strstrip(buf);
4412         err = page_counter_memparse(buf, "max", &max);
4413         if (err)
4414                 return err;
4415
4416         xchg(&memcg->memory.max, max);
4417
4418         if (of->file->f_flags & O_NONBLOCK)
4419                 goto out;
4420
4421         for (;;) {
4422                 unsigned long nr_pages = page_counter_read(&memcg->memory);
4423
4424                 if (nr_pages <= max)
4425                         break;
4426
4427                 if (signal_pending(current))
4428                         break;
4429
4430                 if (!drained) {
4431                         drain_all_stock(memcg);
4432                         drained = true;
4433                         continue;
4434                 }
4435
4436                 if (nr_reclaims) {
4437                         if (!try_to_free_mem_cgroup_pages(memcg, nr_pages - max,
4438                                         GFP_KERNEL, MEMCG_RECLAIM_MAY_SWAP, NULL))
4439                                 nr_reclaims--;
4440                         continue;
4441                 }
4442
4443                 memcg_memory_event(memcg, MEMCG_OOM);
4444                 if (!mem_cgroup_out_of_memory(memcg, GFP_KERNEL, 0))
4445                         break;
4446                 cond_resched();
4447         }
4448 out:
4449         memcg_wb_domain_size_changed(memcg);
4450         return nbytes;
4451 }
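/*
 * So a shrinking write first drains the per-cpu stocks, then retries
 * direct reclaim (a retry is charged against MAX_RECLAIM_RETRIES only
 * when a pass reclaims nothing), and only then OOM-kills tasks in the
 * group until usage fits under the new limit or no victim is left.
 */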
4452
4453 /*
4454  * Note: don't forget to update the 'samples/cgroup/memcg_event_listener'
4455  * if any new events become available.
4456  */
4457 static void __memory_events_show(struct seq_file *m, atomic_long_t *events)
4458 {
4459         seq_printf(m, "low %lu\n", atomic_long_read(&events[MEMCG_LOW]));
4460         seq_printf(m, "high %lu\n", atomic_long_read(&events[MEMCG_HIGH]));
4461         seq_printf(m, "max %lu\n", atomic_long_read(&events[MEMCG_MAX]));
4462         seq_printf(m, "oom %lu\n", atomic_long_read(&events[MEMCG_OOM]));
4463         seq_printf(m, "oom_kill %lu\n",
4464                    atomic_long_read(&events[MEMCG_OOM_KILL]));
4465         seq_printf(m, "oom_group_kill %lu\n",
4466                    atomic_long_read(&events[MEMCG_OOM_GROUP_KILL]));
4467 }
4468
4469 static int memory_events_show(struct seq_file *m, void *v)
4470 {
4471         struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
4472
4473         __memory_events_show(m, memcg->memory_events);
4474         return 0;
4475 }
4476
4477 static int memory_events_local_show(struct seq_file *m, void *v)
4478 {
4479         struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
4480
4481         __memory_events_show(m, memcg->memory_events_local);
4482         return 0;
4483 }
4484
4485 int memory_stat_show(struct seq_file *m, void *v)
4486 {
4487         struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
4488         char *buf = kmalloc(SEQ_BUF_SIZE, GFP_KERNEL);
4489         struct seq_buf s;
4490
4491         if (!buf)
4492                 return -ENOMEM;
4493         seq_buf_init(&s, buf, SEQ_BUF_SIZE);
4494         memory_stat_format(memcg, &s);
4495         seq_puts(m, buf);
4496         kfree(buf);
4497         return 0;
4498 }
4499
4500 #ifdef CONFIG_NUMA
4501 static inline unsigned long lruvec_page_state_output(struct lruvec *lruvec,
4502                                                      int item)
4503 {
4504         return lruvec_page_state(lruvec, item) *
4505                 memcg_page_state_output_unit(item);
4506 }
4507
4508 static int memory_numa_stat_show(struct seq_file *m, void *v)
4509 {
4510         int i;
4511         struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
4512
4513         mem_cgroup_flush_stats(memcg);
4514
4515         for (i = 0; i < ARRAY_SIZE(memory_stats); i++) {
4516                 int nid;
4517
4518                 if (memory_stats[i].idx >= NR_VM_NODE_STAT_ITEMS)
4519                         continue;
4520
4521                 seq_printf(m, "%s", memory_stats[i].name);
4522                 for_each_node_state(nid, N_MEMORY) {
4523                         u64 size;
4524                         struct lruvec *lruvec;
4525
4526                         lruvec = mem_cgroup_lruvec(memcg, NODE_DATA(nid));
4527                         size = lruvec_page_state_output(lruvec,
4528                                                         memory_stats[i].idx);
4529                         seq_printf(m, " N%d=%llu", nid, size);
4530                 }
4531                 seq_putc(m, '\n');
4532         }
4533
4534         return 0;
4535 }
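
/*
 * Illustrative output sketch (node IDs and byte counts are made up): each
 * memory.numa_stat line is a stat name followed by one " N<nid>=<bytes>"
 * pair per node with memory, e.g.:
 *
 *        anon N0=6344704 N1=20480
 *        file N0=2886656 N1=0
 */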
4536 #endif
4537
4538 static int memory_oom_group_show(struct seq_file *m, void *v)
4539 {
4540         struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
4541
4542         seq_printf(m, "%d\n", READ_ONCE(memcg->oom_group));
4543
4544         return 0;
4545 }
4546
4547 static ssize_t memory_oom_group_write(struct kernfs_open_file *of,
4548                                       char *buf, size_t nbytes, loff_t off)
4549 {
4550         struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
4551         int ret, oom_group;
4552
4553         buf = strstrip(buf);
4554         if (!buf)
4555                 return -EINVAL;
4556
4557         ret = kstrtoint(buf, 0, &oom_group);
4558         if (ret)
4559                 return ret;
4560
4561         if (oom_group != 0 && oom_group != 1)
4562                 return -EINVAL;
4563
4564         WRITE_ONCE(memcg->oom_group, oom_group);
4565
4566         return nbytes;
4567 }
4568
4569 enum {
4570         MEMORY_RECLAIM_SWAPPINESS = 0,
4571         MEMORY_RECLAIM_SWAPPINESS_MAX,
4572         MEMORY_RECLAIM_NULL,
4573 };
4574
4575 static const match_table_t tokens = {
4576         { MEMORY_RECLAIM_SWAPPINESS, "swappiness=%d"},
4577         { MEMORY_RECLAIM_SWAPPINESS_MAX, "swappiness=max"},
4578         { MEMORY_RECLAIM_NULL, NULL },
4579 };
4580
4581 static ssize_t memory_reclaim(struct kernfs_open_file *of, char *buf,
4582                               size_t nbytes, loff_t off)
4583 {
4584         struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
4585         unsigned int nr_retries = MAX_RECLAIM_RETRIES;
4586         unsigned long nr_to_reclaim, nr_reclaimed = 0;
4587         int swappiness = -1;
4588         unsigned int reclaim_options;
4589         char *old_buf, *start;
4590         substring_t args[MAX_OPT_ARGS];
4591
4592         buf = strstrip(buf);
4593
4594         old_buf = buf;
4595         nr_to_reclaim = memparse(buf, &buf) / PAGE_SIZE;
4596         if (buf == old_buf)
4597                 return -EINVAL;
4598
4599         buf = strstrip(buf);
4600
4601         while ((start = strsep(&buf, " ")) != NULL) {
4602                 if (!strlen(start))
4603                         continue;
4604                 switch (match_token(start, tokens, args)) {
4605                 case MEMORY_RECLAIM_SWAPPINESS:
4606                         if (match_int(&args[0], &swappiness))
4607                                 return -EINVAL;
4608                         if (swappiness < MIN_SWAPPINESS || swappiness > MAX_SWAPPINESS)
4609                                 return -EINVAL;
4610                         break;
4611                 case MEMORY_RECLAIM_SWAPPINESS_MAX:
4612                         swappiness = SWAPPINESS_ANON_ONLY;
4613                         break;
4614                 default:
4615                         return -EINVAL;
4616                 }
4617         }
4618
4619         reclaim_options = MEMCG_RECLAIM_MAY_SWAP | MEMCG_RECLAIM_PROACTIVE;
4620         while (nr_reclaimed < nr_to_reclaim) {
4621                 /* Will converge on zero, but reclaim enforces a minimum */
4622                 unsigned long batch_size = (nr_to_reclaim - nr_reclaimed) / 4;
4623                 unsigned long reclaimed;
4624
4625                 if (signal_pending(current))
4626                         return -EINTR;
4627
4628                 /*
4629                  * This is the final attempt; drain the percpu LRU caches in the
4630                  * hope of introducing more evictable pages for
4631                  * try_to_free_mem_cgroup_pages().
4632                  */
4633                 if (!nr_retries)
4634                         lru_add_drain_all();
4635
4636                 reclaimed = try_to_free_mem_cgroup_pages(memcg,
4637                                         batch_size, GFP_KERNEL,
4638                                         reclaim_options,
4639                                         swappiness == -1 ? NULL : &swappiness);
4640
4641                 if (!reclaimed && !nr_retries--)
4642                         return -EAGAIN;
4643
4644                 nr_reclaimed += reclaimed;
4645         }
4646
4647         return nbytes;
4648 }
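
/*
 * Illustrative userspace sketch (the cgroup path is hypothetical): proactive
 * reclaim is requested by writing a size, optionally followed by a
 * swappiness override, to memory.reclaim, matching the parsing above.
 *
 *        int fd = open("/sys/fs/cgroup/workload/memory.reclaim", O_WRONLY);
 *
 *        if (fd >= 0) {
 *                const char req[] = "64M swappiness=max";
 *
 *                if (write(fd, req, sizeof(req) - 1) < 0)
 *                        perror("memory.reclaim");
 *                close(fd);
 *        }
 */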
4649
4650 static struct cftype memory_files[] = {
4651         {
4652                 .name = "current",
4653                 .flags = CFTYPE_NOT_ON_ROOT,
4654                 .read_u64 = memory_current_read,
4655         },
4656         {
4657                 .name = "peak",
4658                 .flags = CFTYPE_NOT_ON_ROOT,
4659                 .open = peak_open,
4660                 .release = peak_release,
4661                 .seq_show = memory_peak_show,
4662                 .write = memory_peak_write,
4663         },
4664         {
4665                 .name = "min",
4666                 .flags = CFTYPE_NOT_ON_ROOT,
4667                 .seq_show = memory_min_show,
4668                 .write = memory_min_write,
4669         },
4670         {
4671                 .name = "low",
4672                 .flags = CFTYPE_NOT_ON_ROOT,
4673                 .seq_show = memory_low_show,
4674                 .write = memory_low_write,
4675         },
4676         {
4677                 .name = "high",
4678                 .flags = CFTYPE_NOT_ON_ROOT,
4679                 .seq_show = memory_high_show,
4680                 .write = memory_high_write,
4681         },
4682         {
4683                 .name = "max",
4684                 .flags = CFTYPE_NOT_ON_ROOT,
4685                 .seq_show = memory_max_show,
4686                 .write = memory_max_write,
4687         },
4688         {
4689                 .name = "events",
4690                 .flags = CFTYPE_NOT_ON_ROOT,
4691                 .file_offset = offsetof(struct mem_cgroup, events_file),
4692                 .seq_show = memory_events_show,
4693         },
4694         {
4695                 .name = "events.local",
4696                 .flags = CFTYPE_NOT_ON_ROOT,
4697                 .file_offset = offsetof(struct mem_cgroup, events_local_file),
4698                 .seq_show = memory_events_local_show,
4699         },
4700         {
4701                 .name = "stat",
4702                 .seq_show = memory_stat_show,
4703         },
4704 #ifdef CONFIG_NUMA
4705         {
4706                 .name = "numa_stat",
4707                 .seq_show = memory_numa_stat_show,
4708         },
4709 #endif
4710         {
4711                 .name = "oom.group",
4712                 .flags = CFTYPE_NOT_ON_ROOT | CFTYPE_NS_DELEGATABLE,
4713                 .seq_show = memory_oom_group_show,
4714                 .write = memory_oom_group_write,
4715         },
4716         {
4717                 .name = "reclaim",
4718                 .flags = CFTYPE_NS_DELEGATABLE,
4719                 .write = memory_reclaim,
4720         },
4721         { }     /* terminate */
4722 };
4723
4724 struct cgroup_subsys memory_cgrp_subsys = {
4725         .css_alloc = mem_cgroup_css_alloc,
4726         .css_online = mem_cgroup_css_online,
4727         .css_offline = mem_cgroup_css_offline,
4728         .css_released = mem_cgroup_css_released,
4729         .css_free = mem_cgroup_css_free,
4730         .css_reset = mem_cgroup_css_reset,
4731         .css_rstat_flush = mem_cgroup_css_rstat_flush,
4732         .attach = mem_cgroup_attach,
4733         .fork = mem_cgroup_fork,
4734         .exit = mem_cgroup_exit,
4735         .dfl_cftypes = memory_files,
4736 #ifdef CONFIG_MEMCG_V1
4737         .legacy_cftypes = mem_cgroup_legacy_files,
4738 #endif
4739         .early_init = 0,
4740 };
4741
4742 /**
4743  * mem_cgroup_calculate_protection - check if memory consumption is in the normal range
4744  * @root: the top ancestor of the sub-tree being checked
4745  * @memcg: the memory cgroup to check
4746  *
4747  * WARNING: This function is not stateless! It can only be used as part
4748  *          of a top-down tree iteration, not for isolated queries.
4749  */
4750 void mem_cgroup_calculate_protection(struct mem_cgroup *root,
4751                                      struct mem_cgroup *memcg)
4752 {
4753         bool recursive_protection =
4754                 cgrp_dfl_root.flags & CGRP_ROOT_MEMORY_RECURSIVE_PROT;
4755
4756         if (mem_cgroup_disabled())
4757                 return;
4758
4759         if (!root)
4760                 root = root_mem_cgroup;
4761
4762         page_counter_calculate_protection(&root->memory, &memcg->memory, recursive_protection);
4763 }
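
/*
 * Condensed sketch of the top-down iteration the WARNING above refers to,
 * loosely based on shrink_node_memcgs() in mm/vmscan.c (details elided):
 *
 *        memcg = mem_cgroup_iter(target_memcg, NULL, NULL);
 *        do {
 *                mem_cgroup_calculate_protection(target_memcg, memcg);
 *
 *                if (mem_cgroup_below_min(target_memcg, memcg))
 *                        continue;       // hard protection, skip this memcg
 *
 *                // ... shrink this memcg's lruvecs ...
 *        } while ((memcg = mem_cgroup_iter(target_memcg, memcg, NULL)));
 */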
4764
4765 static int charge_memcg(struct folio *folio, struct mem_cgroup *memcg,
4766                         gfp_t gfp)
4767 {
4768         int ret;
4769
4770         ret = try_charge(memcg, gfp, folio_nr_pages(folio));
4771         if (ret)
4772                 goto out;
4773
4774         css_get(&memcg->css);
4775         commit_charge(folio, memcg);
4776         memcg1_commit_charge(folio, memcg);
4777 out:
4778         return ret;
4779 }
4780
4781 int __mem_cgroup_charge(struct folio *folio, struct mm_struct *mm, gfp_t gfp)
4782 {
4783         struct mem_cgroup *memcg;
4784         int ret;
4785
4786         memcg = get_mem_cgroup_from_mm(mm);
4787         ret = charge_memcg(folio, memcg, gfp);
4788         css_put(&memcg->css);
4789
4790         return ret;
4791 }
4792
4793 /**
4794  * mem_cgroup_charge_hugetlb - charge the memcg for a hugetlb folio
4795  * @folio: folio being charged
4796  * @gfp: reclaim mode
4797  *
4798  * This function is called when allocating a huge page folio, after the page has
4799  * already been obtained and charged to the appropriate hugetlb cgroup
4800  * controller (if it is enabled).
4801  *
4802  * Returns -ENOMEM if the memcg is already at its limit.
4803  * Returns 0 if the charge succeeded or if charging was skipped.
4804  */
4805 int mem_cgroup_charge_hugetlb(struct folio *folio, gfp_t gfp)
4806 {
4807         struct mem_cgroup *memcg = get_mem_cgroup_from_current();
4808         int ret = 0;
4809
4810         /*
4811          * Even if memcg does not account for hugetlb, we still want to update
4812          * system-level stats via lruvec_stat_mod_folio. Return 0, and skip
4813          * charging the memcg.
4814          */
4815         if (mem_cgroup_disabled() || !memcg_accounts_hugetlb() ||
4816                 !memcg || !cgroup_subsys_on_dfl(memory_cgrp_subsys))
4817                 goto out;
4818
4819         if (charge_memcg(folio, memcg, gfp))
4820                 ret = -ENOMEM;
4821
4822 out:
4823         mem_cgroup_put(memcg);
4824         return ret;
4825 }
4826
4827 /**
4828  * mem_cgroup_swapin_charge_folio - Charge a newly allocated folio for swapin.
4829  * @folio: folio to charge.
4830  * @mm: mm context of the victim
4831  * @gfp: reclaim mode
4832  * @entry: swap entry for which the folio is allocated
4833  *
4834  * This function charges a folio allocated for swapin. Please call this before
4835  * adding the folio to the swapcache.
4836  *
4837  * Returns 0 on success. Otherwise, an error code is returned.
4838  */
4839 int mem_cgroup_swapin_charge_folio(struct folio *folio, struct mm_struct *mm,
4840                                   gfp_t gfp, swp_entry_t entry)
4841 {
4842         struct mem_cgroup *memcg;
4843         unsigned short id;
4844         int ret;
4845
4846         if (mem_cgroup_disabled())
4847                 return 0;
4848
4849         id = lookup_swap_cgroup_id(entry);
4850         rcu_read_lock();
4851         memcg = mem_cgroup_from_id(id);
4852         if (!memcg || !css_tryget_online(&memcg->css))
4853                 memcg = get_mem_cgroup_from_mm(mm);
4854         rcu_read_unlock();
4855
4856         ret = charge_memcg(folio, memcg, gfp);
4857
4858         css_put(&memcg->css);
4859         return ret;
4860 }
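
/*
 * Condensed sketch of the ordering the comment above asks for, loosely based
 * on the swapin path in mm/swap_state.c (error handling and locking elided;
 * helper names may differ by kernel version):
 *
 *        folio = folio_alloc(gfp, 0);
 *        if (!folio)
 *                return NULL;
 *
 *        if (mem_cgroup_swapin_charge_folio(folio, NULL, gfp, entry))
 *                goto put_and_fail;      // charge failed, drop the folio
 *
 *        // only now insert the folio into the swap cache and start I/O
 */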
4861
4862 struct uncharge_gather {
4863         struct mem_cgroup *memcg;
4864         unsigned long nr_memory;
4865         unsigned long pgpgout;
4866         unsigned long nr_kmem;
4867         int nid;
4868 };
4869
4870 static inline void uncharge_gather_clear(struct uncharge_gather *ug)
4871 {
4872         memset(ug, 0, sizeof(*ug));
4873 }
4874
4875 static void uncharge_batch(const struct uncharge_gather *ug)
4876 {
4877         if (ug->nr_memory) {
4878                 memcg_uncharge(ug->memcg, ug->nr_memory);
4879                 if (ug->nr_kmem) {
4880                         mod_memcg_state(ug->memcg, MEMCG_KMEM, -ug->nr_kmem);
4881                         memcg1_account_kmem(ug->memcg, -ug->nr_kmem);
4882                 }
4883                 memcg1_oom_recover(ug->memcg);
4884         }
4885
4886         memcg1_uncharge_batch(ug->memcg, ug->pgpgout, ug->nr_memory, ug->nid);
4887
4888         /* drop reference from uncharge_folio */
4889         css_put(&ug->memcg->css);
4890 }
4891
4892 static void uncharge_folio(struct folio *folio, struct uncharge_gather *ug)
4893 {
4894         long nr_pages;
4895         struct mem_cgroup *memcg;
4896         struct obj_cgroup *objcg;
4897
4898         VM_BUG_ON_FOLIO(folio_test_lru(folio), folio);
4899
4900         /*
4901          * Nobody should be changing or seriously looking at
4902          * folio memcg or objcg at this point; we have fully
4903          * exclusive access to the folio.
4904          */
4905         if (folio_memcg_kmem(folio)) {
4906                 objcg = __folio_objcg(folio);
4907                 /*
4908                  * This get matches the put at the end of the function and
4909                  * kmem pages do not hold memcg references anymore.
4910                  */
4911                 memcg = get_mem_cgroup_from_objcg(objcg);
4912         } else {
4913                 memcg = __folio_memcg(folio);
4914         }
4915
4916         if (!memcg)
4917                 return;
4918
4919         if (ug->memcg != memcg) {
4920                 if (ug->memcg) {
4921                         uncharge_batch(ug);
4922                         uncharge_gather_clear(ug);
4923                 }
4924                 ug->memcg = memcg;
4925                 ug->nid = folio_nid(folio);
4926
4927                 /* pairs with css_put in uncharge_batch */
4928                 css_get(&memcg->css);
4929         }
4930
4931         nr_pages = folio_nr_pages(folio);
4932
4933         if (folio_memcg_kmem(folio)) {
4934                 ug->nr_memory += nr_pages;
4935                 ug->nr_kmem += nr_pages;
4936
4937                 folio->memcg_data = 0;
4938                 obj_cgroup_put(objcg);
4939         } else {
4940                 /* LRU pages aren't accounted at the root level */
4941                 if (!mem_cgroup_is_root(memcg))
4942                         ug->nr_memory += nr_pages;
4943                 ug->pgpgout++;
4944
4945                 WARN_ON_ONCE(folio_unqueue_deferred_split(folio));
4946                 folio->memcg_data = 0;
4947         }
4948
4949         css_put(&memcg->css);
4950 }
4951
4952 void __mem_cgroup_uncharge(struct folio *folio)
4953 {
4954         struct uncharge_gather ug;
4955
4956         /* Don't touch folio->lru of any random page, pre-check: */
4957         if (!folio_memcg_charged(folio))
4958                 return;
4959
4960         uncharge_gather_clear(&ug);
4961         uncharge_folio(folio, &ug);
4962         uncharge_batch(&ug);
4963 }
4964
4965 void __mem_cgroup_uncharge_folios(struct folio_batch *folios)
4966 {
4967         struct uncharge_gather ug;
4968         unsigned int i;
4969
4970         uncharge_gather_clear(&ug);
4971         for (i = 0; i < folios->nr; i++)
4972                 uncharge_folio(folios->folios[i], &ug);
4973         if (ug.memcg)
4974                 uncharge_batch(&ug);
4975 }
4976
4977 /**
4978  * mem_cgroup_replace_folio - Charge a folio's replacement.
4979  * @old: Currently circulating folio.
4980  * @new: Replacement folio.
4981  *
4982  * Charge @new as a replacement folio for @old. @old will
4983  * be uncharged upon free.
4984  *
4985  * Both folios must be locked, @new->mapping must be set up.
4986  */
4987 void mem_cgroup_replace_folio(struct folio *old, struct folio *new)
4988 {
4989         struct mem_cgroup *memcg;
4990         long nr_pages = folio_nr_pages(new);
4991
4992         VM_BUG_ON_FOLIO(!folio_test_locked(old), old);
4993         VM_BUG_ON_FOLIO(!folio_test_locked(new), new);
4994         VM_BUG_ON_FOLIO(folio_test_anon(old) != folio_test_anon(new), new);
4995         VM_BUG_ON_FOLIO(folio_nr_pages(old) != nr_pages, new);
4996
4997         if (mem_cgroup_disabled())
4998                 return;
4999
5000         /* Page cache replacement: new folio already charged? */
5001         if (folio_memcg_charged(new))
5002                 return;
5003
5004         memcg = folio_memcg(old);
5005         VM_WARN_ON_ONCE_FOLIO(!memcg, old);
5006         if (!memcg)
5007                 return;
5008
5009         /* Force-charge the new page. The old one will be freed soon */
5010         if (!mem_cgroup_is_root(memcg)) {
5011                 page_counter_charge(&memcg->memory, nr_pages);
5012                 if (do_memsw_account())
5013                         page_counter_charge(&memcg->memsw, nr_pages);
5014         }
5015
5016         css_get(&memcg->css);
5017         commit_charge(new, memcg);
5018         memcg1_commit_charge(new, memcg);
5019 }
5020
5021 /**
5022  * mem_cgroup_migrate - Transfer the memcg data from the old to the new folio.
5023  * @old: Currently circulating folio.
5024  * @new: Replacement folio.
5025  *
5026  * Transfer the memcg data from the old folio to the new folio for migration.
5027  * The old folio's memcg data will be cleared. Note that the memory counters
5028  * will remain unchanged throughout the process.
5029  *
5030  * Both folios must be locked, @new->mapping must be set up.
5031  */
5032 void mem_cgroup_migrate(struct folio *old, struct folio *new)
5033 {
5034         struct mem_cgroup *memcg;
5035
5036         VM_BUG_ON_FOLIO(!folio_test_locked(old), old);
5037         VM_BUG_ON_FOLIO(!folio_test_locked(new), new);
5038         VM_BUG_ON_FOLIO(folio_test_anon(old) != folio_test_anon(new), new);
5039         VM_BUG_ON_FOLIO(folio_nr_pages(old) != folio_nr_pages(new), new);
5040         VM_BUG_ON_FOLIO(folio_test_lru(old), old);
5041
5042         if (mem_cgroup_disabled())
5043                 return;
5044
5045         memcg = folio_memcg(old);
5046         /*
5047          * Note that it is normal to see !memcg for a hugetlb folio.
5048          * For example, it could have been allocated when memory_hugetlb_accounting
5049          * was not selected.
5050          */
5051         VM_WARN_ON_ONCE_FOLIO(!folio_test_hugetlb(old) && !memcg, old);
5052         if (!memcg)
5053                 return;
5054
5055         /* Transfer the charge and the css ref */
5056         commit_charge(new, memcg);
5057
5058         /* Warning should never happen, so don't worry about refcount non-0 */
5059         WARN_ON_ONCE(folio_unqueue_deferred_split(old));
5060         old->memcg_data = 0;
5061 }
5062
5063 DEFINE_STATIC_KEY_FALSE(memcg_sockets_enabled_key);
5064 EXPORT_SYMBOL(memcg_sockets_enabled_key);
5065
5066 void mem_cgroup_sk_alloc(struct sock *sk)
5067 {
5068         struct mem_cgroup *memcg;
5069
5070         if (!mem_cgroup_sockets_enabled)
5071                 return;
5072
5073         /* Do not associate the sock with an unrelated interrupted task's memcg. */
5074         if (!in_task())
5075                 return;
5076
5077         rcu_read_lock();
5078         memcg = mem_cgroup_from_task(current);
5079         if (mem_cgroup_is_root(memcg))
5080                 goto out;
5081         if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && !memcg1_tcpmem_active(memcg))
5082                 goto out;
5083         if (css_tryget(&memcg->css))
5084                 sk->sk_memcg = memcg;
5085 out:
5086         rcu_read_unlock();
5087 }
5088
5089 void mem_cgroup_sk_free(struct sock *sk)
5090 {
5091         if (sk->sk_memcg)
5092                 css_put(&sk->sk_memcg->css);
5093 }
5094
5095 /**
5096  * mem_cgroup_charge_skmem - charge socket memory
5097  * @memcg: memcg to charge
5098  * @nr_pages: number of pages to charge
5099  * @gfp_mask: reclaim mode
5100  *
5101  * Charges @nr_pages to @memcg. Returns %true if the charge fits within
5102  * @memcg's configured limit, %false if it does not.
5103  */
5104 bool mem_cgroup_charge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages,
5105                              gfp_t gfp_mask)
5106 {
5107         if (!cgroup_subsys_on_dfl(memory_cgrp_subsys))
5108                 return memcg1_charge_skmem(memcg, nr_pages, gfp_mask);
5109
5110         if (try_charge_memcg(memcg, gfp_mask, nr_pages) == 0) {
5111                 mod_memcg_state(memcg, MEMCG_SOCK, nr_pages);
5112                 return true;
5113         }
5114
5115         return false;
5116 }
5117
5118 /**
5119  * mem_cgroup_uncharge_skmem - uncharge socket memory
5120  * @memcg: memcg to uncharge
5121  * @nr_pages: number of pages to uncharge
5122  */
5123 void mem_cgroup_uncharge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages)
5124 {
5125         if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) {
5126                 memcg1_uncharge_skmem(memcg, nr_pages);
5127                 return;
5128         }
5129
5130         mod_memcg_state(memcg, MEMCG_SOCK, -nr_pages);
5131
5132         refill_stock(memcg, nr_pages);
5133 }
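
/*
 * Condensed sketch of how the network stack pairs the two helpers above,
 * loosely based on __sk_mem_raise_allocated() and __sk_mem_reduce_allocated()
 * in net/core/sock.c (details elided):
 *
 *        if (mem_cgroup_sockets_enabled && sk->sk_memcg &&
 *            !mem_cgroup_charge_skmem(sk->sk_memcg, amt, gfp))
 *                goto suppress_allocation;       // over limit, back off
 *
 *        ...
 *
 *        if (mem_cgroup_sockets_enabled && sk->sk_memcg)
 *                mem_cgroup_uncharge_skmem(sk->sk_memcg, amt);
 */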
5134
5135 static int __init cgroup_memory(char *s)
5136 {
5137         char *token;
5138
5139         while ((token = strsep(&s, ",")) != NULL) {
5140                 if (!*token)
5141                         continue;
5142                 if (!strcmp(token, "nosocket"))
5143                         cgroup_memory_nosocket = true;
5144                 if (!strcmp(token, "nokmem"))
5145                         cgroup_memory_nokmem = true;
5146                 if (!strcmp(token, "nobpf"))
5147                         cgroup_memory_nobpf = true;
5148         }
5149         return 1;
5150 }
5151 __setup("cgroup.memory=", cgroup_memory);
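
/*
 * Illustrative boot command line using the parser above, e.g. to disable
 * both socket and kernel memory accounting:
 *
 *        cgroup.memory=nosocket,nokmem
 */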
5152
5153 /*
5154  * Memory controller init, run before cgroup_init() initializes root_mem_cgroup.
5155  *
5156  * Some parts like memcg_hotplug_cpu_dead() have to be initialized from this
5157  * context because of lock dependencies (cgroup_lock -> cpu hotplug) but
5158  * basically everything that doesn't depend on a specific mem_cgroup structure
5159  * should be initialized from here.
5160  */
5161 int __init mem_cgroup_init(void)
5162 {
5163         unsigned int memcg_size;
5164         int cpu;
5165
5166         /*
5167          * An s32 (see struct batched_lruvec_stat) is currently used for
5168          * per-memcg-per-cpu caching of per-node statistics, so the
5169          * overfill threshold must never be allowed to exceed
5170          * S32_MAX / PAGE_SIZE.
5171          */
5172         BUILD_BUG_ON(MEMCG_CHARGE_BATCH > S32_MAX / PAGE_SIZE);
5173
5174         cpuhp_setup_state_nocalls(CPUHP_MM_MEMCQ_DEAD, "mm/memctrl:dead", NULL,
5175                                   memcg_hotplug_cpu_dead);
5176
5177         for_each_possible_cpu(cpu) {
5178                 INIT_WORK(&per_cpu_ptr(&memcg_stock, cpu)->work,
5179                           drain_local_memcg_stock);
5180                 INIT_WORK(&per_cpu_ptr(&obj_stock, cpu)->work,
5181                           drain_local_obj_stock);
5182         }
5183
5184         memcg_size = struct_size_t(struct mem_cgroup, nodeinfo, nr_node_ids);
5185         memcg_cachep = kmem_cache_create("mem_cgroup", memcg_size, 0,
5186                                          SLAB_PANIC | SLAB_HWCACHE_ALIGN, NULL);
5187
5188         memcg_pn_cachep = KMEM_CACHE(mem_cgroup_per_node,
5189                                      SLAB_PANIC | SLAB_HWCACHE_ALIGN);
5190
5191         return 0;
5192 }
5193
5194 #ifdef CONFIG_SWAP
5195 /**
5196  * __mem_cgroup_try_charge_swap - try charging swap space for a folio
5197  * @folio: folio being added to swap
5198  * @entry: swap entry to charge
5199  *
5200  * Try to charge @folio's memcg for the swap space at @entry.
5201  *
5202  * Returns 0 on success, -ENOMEM on failure.
5203  */
5204 int __mem_cgroup_try_charge_swap(struct folio *folio, swp_entry_t entry)
5205 {
5206         unsigned int nr_pages = folio_nr_pages(folio);
5207         struct page_counter *counter;
5208         struct mem_cgroup *memcg;
5209
5210         if (do_memsw_account())
5211                 return 0;
5212
5213         memcg = folio_memcg(folio);
5214
5215         VM_WARN_ON_ONCE_FOLIO(!memcg, folio);
5216         if (!memcg)
5217                 return 0;
5218
5219         if (!entry.val) {
5220                 memcg_memory_event(memcg, MEMCG_SWAP_FAIL);
5221                 return 0;
5222         }
5223
5224         memcg = mem_cgroup_id_get_online(memcg);
5225
5226         if (!mem_cgroup_is_root(memcg) &&
5227             !page_counter_try_charge(&memcg->swap, nr_pages, &counter)) {
5228                 memcg_memory_event(memcg, MEMCG_SWAP_MAX);
5229                 memcg_memory_event(memcg, MEMCG_SWAP_FAIL);
5230                 mem_cgroup_id_put(memcg);
5231                 return -ENOMEM;
5232         }
5233
5234         /* Get references for the tail pages, too */
5235         if (nr_pages > 1)
5236                 mem_cgroup_id_get_many(memcg, nr_pages - 1);
5237         mod_memcg_state(memcg, MEMCG_SWAP, nr_pages);
5238
5239         swap_cgroup_record(folio, mem_cgroup_id(memcg), entry);
5240
5241         return 0;
5242 }
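
/*
 * Callers normally reach this via the mem_cgroup_try_charge_swap() wrapper;
 * a condensed sketch of that wrapper (see include/linux/swap.h, exact guards
 * may differ by kernel version):
 *
 *        static inline int mem_cgroup_try_charge_swap(struct folio *folio,
 *                                                     swp_entry_t entry)
 *        {
 *                if (mem_cgroup_disabled())
 *                        return 0;
 *                return __mem_cgroup_try_charge_swap(folio, entry);
 *        }
 */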
5243
5244 /**
5245  * __mem_cgroup_uncharge_swap - uncharge swap space
5246  * @entry: swap entry to uncharge
5247  * @nr_pages: the amount of swap space to uncharge
5248  */
5249 void __mem_cgroup_uncharge_swap(swp_entry_t entry, unsigned int nr_pages)
5250 {
5251         struct mem_cgroup *memcg;
5252         unsigned short id;
5253
5254         id = swap_cgroup_clear(entry, nr_pages);
5255         rcu_read_lock();
5256         memcg = mem_cgroup_from_id(id);
5257         if (memcg) {
5258                 if (!mem_cgroup_is_root(memcg)) {
5259                         if (do_memsw_account())
5260                                 page_counter_uncharge(&memcg->memsw, nr_pages);
5261                         else
5262                                 page_counter_uncharge(&memcg->swap, nr_pages);
5263                 }
5264                 mod_memcg_state(memcg, MEMCG_SWAP, -nr_pages);
5265                 mem_cgroup_id_put_many(memcg, nr_pages);
5266         }
5267         rcu_read_unlock();
5268 }
5269
5270 long mem_cgroup_get_nr_swap_pages(struct mem_cgroup *memcg)
5271 {
5272         long nr_swap_pages = get_nr_swap_pages();
5273
5274         if (mem_cgroup_disabled() || do_memsw_account())
5275                 return nr_swap_pages;
5276         for (; !mem_cgroup_is_root(memcg); memcg = parent_mem_cgroup(memcg))
5277                 nr_swap_pages = min_t(long, nr_swap_pages,
5278                                       READ_ONCE(memcg->swap.max) -
5279                                       page_counter_read(&memcg->swap));
5280         return nr_swap_pages;
5281 }
5282
5283 bool mem_cgroup_swap_full(struct folio *folio)
5284 {
5285         struct mem_cgroup *memcg;
5286
5287         VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
5288
5289         if (vm_swap_full())
5290                 return true;
5291         if (do_memsw_account())
5292                 return false;
5293
5294         memcg = folio_memcg(folio);
5295         if (!memcg)
5296                 return false;
5297
5298         for (; !mem_cgroup_is_root(memcg); memcg = parent_mem_cgroup(memcg)) {
5299                 unsigned long usage = page_counter_read(&memcg->swap);
5300
5301                 if (usage * 2 >= READ_ONCE(memcg->swap.high) ||
5302                     usage * 2 >= READ_ONCE(memcg->swap.max))
5303                         return true;
5304         }
5305
5306         return false;
5307 }
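
/*
 * Worked example for the check above (numbers are illustrative): with
 * memory.swap.max = 512M and no swap.high set, usage * 2 >= max becomes true
 * once the cgroup's swap usage reaches 256M, at which point its swap is
 * treated as full even if the swap device itself still has room.
 */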
5308
5309 static int __init setup_swap_account(char *s)
5310 {
5311         bool res;
5312
5313         if (!kstrtobool(s, &res) && !res)
5314                 pr_warn_once("The swapaccount=0 commandline option is deprecated "
5315                              "in favor of configuring swap control via cgroupfs. "
5316                              "Please report your usecase to linux-mm@kvack.org if you "
5317                              "depend on this functionality.\n");
5318         return 1;
5319 }
5320 __setup("swapaccount=", setup_swap_account);
5321
5322 static u64 swap_current_read(struct cgroup_subsys_state *css,
5323                              struct cftype *cft)
5324 {
5325         struct mem_cgroup *memcg = mem_cgroup_from_css(css);
5326
5327         return (u64)page_counter_read(&memcg->swap) * PAGE_SIZE;
5328 }
5329
5330 static int swap_peak_show(struct seq_file *sf, void *v)
5331 {
5332         struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(sf));
5333
5334         return peak_show(sf, v, &memcg->swap);
5335 }
5336
5337 static ssize_t swap_peak_write(struct kernfs_open_file *of, char *buf,
5338                                size_t nbytes, loff_t off)
5339 {
5340         struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
5341
5342         return peak_write(of, buf, nbytes, off, &memcg->swap,
5343                           &memcg->swap_peaks);
5344 }
5345
5346 static int swap_high_show(struct seq_file *m, void *v)
5347 {
5348         return seq_puts_memcg_tunable(m,
5349                 READ_ONCE(mem_cgroup_from_seq(m)->swap.high));
5350 }
5351
5352 static ssize_t swap_high_write(struct kernfs_open_file *of,
5353                                char *buf, size_t nbytes, loff_t off)
5354 {
5355         struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
5356         unsigned long high;
5357         int err;
5358
5359         buf = strstrip(buf);
5360         err = page_counter_memparse(buf, "max", &high);
5361         if (err)
5362                 return err;
5363
5364         page_counter_set_high(&memcg->swap, high);
5365
5366         return nbytes;
5367 }
5368
5369 static int swap_max_show(struct seq_file *m, void *v)
5370 {
5371         return seq_puts_memcg_tunable(m,
5372                 READ_ONCE(mem_cgroup_from_seq(m)->swap.max));
5373 }
5374
5375 static ssize_t swap_max_write(struct kernfs_open_file *of,
5376                               char *buf, size_t nbytes, loff_t off)
5377 {
5378         struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
5379         unsigned long max;
5380         int err;
5381
5382         buf = strstrip(buf);
5383         err = page_counter_memparse(buf, "max", &max);
5384         if (err)
5385                 return err;
5386
5387         xchg(&memcg->swap.max, max);
5388
5389         return nbytes;
5390 }
5391
5392 static int swap_events_show(struct seq_file *m, void *v)
5393 {
5394         struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
5395
5396         seq_printf(m, "high %lu\n",
5397                    atomic_long_read(&memcg->memory_events[MEMCG_SWAP_HIGH]));
5398         seq_printf(m, "max %lu\n",
5399                    atomic_long_read(&memcg->memory_events[MEMCG_SWAP_MAX]));
5400         seq_printf(m, "fail %lu\n",
5401                    atomic_long_read(&memcg->memory_events[MEMCG_SWAP_FAIL]));
5402
5403         return 0;
5404 }
5405
5406 static struct cftype swap_files[] = {
5407         {
5408                 .name = "swap.current",
5409                 .flags = CFTYPE_NOT_ON_ROOT,
5410                 .read_u64 = swap_current_read,
5411         },
5412         {
5413                 .name = "swap.high",
5414                 .flags = CFTYPE_NOT_ON_ROOT,
5415                 .seq_show = swap_high_show,
5416                 .write = swap_high_write,
5417         },
5418         {
5419                 .name = "swap.max",
5420                 .flags = CFTYPE_NOT_ON_ROOT,
5421                 .seq_show = swap_max_show,
5422                 .write = swap_max_write,
5423         },
5424         {
5425                 .name = "swap.peak",
5426                 .flags = CFTYPE_NOT_ON_ROOT,
5427                 .open = peak_open,
5428                 .release = peak_release,
5429                 .seq_show = swap_peak_show,
5430                 .write = swap_peak_write,
5431         },
5432         {
5433                 .name = "swap.events",
5434                 .flags = CFTYPE_NOT_ON_ROOT,
5435                 .file_offset = offsetof(struct mem_cgroup, swap_events_file),
5436                 .seq_show = swap_events_show,
5437         },
5438         { }     /* terminate */
5439 };
5440
5441 #ifdef CONFIG_ZSWAP
5442 /**
5443  * obj_cgroup_may_zswap - check if this cgroup can zswap
5444  * @objcg: the object cgroup
5445  *
5446  * Check if the hierarchical zswap limit has been reached.
5447  *
5448  * This doesn't check for specific headroom, and it is not atomic
5449  * either. But with zswap, the size of the allocation is only known
5450  * once compression has occurred, and this optimistic pre-check avoids
5451  * spending cycles on compression when there is already no room left
5452  * or zswap is disabled altogether somewhere in the hierarchy.
5453  */
5454 bool obj_cgroup_may_zswap(struct obj_cgroup *objcg)
5455 {
5456         struct mem_cgroup *memcg, *original_memcg;
5457         bool ret = true;
5458
5459         if (!cgroup_subsys_on_dfl(memory_cgrp_subsys))
5460                 return true;
5461
5462         original_memcg = get_mem_cgroup_from_objcg(objcg);
5463         for (memcg = original_memcg; !mem_cgroup_is_root(memcg);
5464              memcg = parent_mem_cgroup(memcg)) {
5465                 unsigned long max = READ_ONCE(memcg->zswap_max);
5466                 unsigned long pages;
5467
5468                 if (max == PAGE_COUNTER_MAX)
5469                         continue;
5470                 if (max == 0) {
5471                         ret = false;
5472                         break;
5473                 }
5474
5475                 /* Force flush to get accurate stats for charging */
5476                 __mem_cgroup_flush_stats(memcg, true);
5477                 pages = memcg_page_state(memcg, MEMCG_ZSWAP_B) / PAGE_SIZE;
5478                 if (pages < max)
5479                         continue;
5480                 ret = false;
5481                 break;
5482         }
5483         mem_cgroup_put(original_memcg);
5484         return ret;
5485 }
5486
5487 /**
5488  * obj_cgroup_charge_zswap - charge compression backend memory
5489  * @objcg: the object cgroup
5490  * @size: size of compressed object
5491  *
5492  * This forces the charge after obj_cgroup_may_zswap() allowed
5493  * compression and storage in zswap for this cgroup to go ahead.
5494  */
5495 void obj_cgroup_charge_zswap(struct obj_cgroup *objcg, size_t size)
5496 {
5497         struct mem_cgroup *memcg;
5498
5499         if (!cgroup_subsys_on_dfl(memory_cgrp_subsys))
5500                 return;
5501
5502         VM_WARN_ON_ONCE(!(current->flags & PF_MEMALLOC));
5503
5504         /* PF_MEMALLOC context, charging must succeed */
5505         if (obj_cgroup_charge(objcg, GFP_KERNEL, size))
5506                 VM_WARN_ON_ONCE(1);
5507
5508         rcu_read_lock();
5509         memcg = obj_cgroup_memcg(objcg);
5510         mod_memcg_state(memcg, MEMCG_ZSWAP_B, size);
5511         mod_memcg_state(memcg, MEMCG_ZSWAPPED, 1);
5512         rcu_read_unlock();
5513 }
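
/*
 * Condensed sketch of how the zswap store path is expected to order the
 * helpers in this block, loosely based on zswap_store() in mm/zswap.c
 * (compression and pool details elided):
 *
 *        if (!obj_cgroup_may_zswap(objcg))
 *                goto reject;                    // optimistic pre-check
 *
 *        // compress the page; only now is the object size known
 *
 *        obj_cgroup_charge_zswap(objcg, entry->length);
 *        ...
 *        // and on swapin/invalidation:
 *        obj_cgroup_uncharge_zswap(objcg, entry->length);
 */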
5514
5515 /**
5516  * obj_cgroup_uncharge_zswap - uncharge compression backend memory
5517  * @objcg: the object cgroup
5518  * @size: size of compressed object
5519  *
5520  * Uncharges zswap memory on page in.
5521  */
5522 void obj_cgroup_uncharge_zswap(struct obj_cgroup *objcg, size_t size)
5523 {
5524         struct mem_cgroup *memcg;
5525
5526         if (!cgroup_subsys_on_dfl(memory_cgrp_subsys))
5527                 return;
5528
5529         obj_cgroup_uncharge(objcg, size);
5530
5531         rcu_read_lock();
5532         memcg = obj_cgroup_memcg(objcg);
5533         mod_memcg_state(memcg, MEMCG_ZSWAP_B, -size);
5534         mod_memcg_state(memcg, MEMCG_ZSWAPPED, -1);
5535         rcu_read_unlock();
5536 }
5537
5538 bool mem_cgroup_zswap_writeback_enabled(struct mem_cgroup *memcg)
5539 {
5540         /* if zswap is disabled, do not block pages going to the swapping device */
5541         if (!zswap_is_enabled())
5542                 return true;
5543
5544         for (; memcg; memcg = parent_mem_cgroup(memcg))
5545                 if (!READ_ONCE(memcg->zswap_writeback))
5546                         return false;
5547
5548         return true;
5549 }
5550
5551 static u64 zswap_current_read(struct cgroup_subsys_state *css,
5552                               struct cftype *cft)
5553 {
5554         struct mem_cgroup *memcg = mem_cgroup_from_css(css);
5555
5556         mem_cgroup_flush_stats(memcg);
5557         return memcg_page_state(memcg, MEMCG_ZSWAP_B);
5558 }
5559
5560 static int zswap_max_show(struct seq_file *m, void *v)
5561 {
5562         return seq_puts_memcg_tunable(m,
5563                 READ_ONCE(mem_cgroup_from_seq(m)->zswap_max));
5564 }
5565
5566 static ssize_t zswap_max_write(struct kernfs_open_file *of,
5567                                char *buf, size_t nbytes, loff_t off)
5568 {
5569         struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
5570         unsigned long max;
5571         int err;
5572
5573         buf = strstrip(buf);
5574         err = page_counter_memparse(buf, "max", &max);
5575         if (err)
5576                 return err;
5577
5578         xchg(&memcg->zswap_max, max);
5579
5580         return nbytes;
5581 }
5582
5583 static int zswap_writeback_show(struct seq_file *m, void *v)
5584 {
5585         struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
5586
5587         seq_printf(m, "%d\n", READ_ONCE(memcg->zswap_writeback));
5588         return 0;
5589 }
5590
5591 static ssize_t zswap_writeback_write(struct kernfs_open_file *of,
5592                                 char *buf, size_t nbytes, loff_t off)
5593 {
5594         struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
5595         int zswap_writeback;
5596         ssize_t parse_ret = kstrtoint(strstrip(buf), 0, &zswap_writeback);
5597
5598         if (parse_ret)
5599                 return parse_ret;
5600
5601         if (zswap_writeback != 0 && zswap_writeback != 1)
5602                 return -EINVAL;
5603
5604         WRITE_ONCE(memcg->zswap_writeback, zswap_writeback);
5605         return nbytes;
5606 }
5607
5608 static struct cftype zswap_files[] = {
5609         {
5610                 .name = "zswap.current",
5611                 .flags = CFTYPE_NOT_ON_ROOT,
5612                 .read_u64 = zswap_current_read,
5613         },
5614         {
5615                 .name = "zswap.max",
5616                 .flags = CFTYPE_NOT_ON_ROOT,
5617                 .seq_show = zswap_max_show,
5618                 .write = zswap_max_write,
5619         },
5620         {
5621                 .name = "zswap.writeback",
5622                 .seq_show = zswap_writeback_show,
5623                 .write = zswap_writeback_write,
5624         },
5625         { }     /* terminate */
5626 };
5627 #endif /* CONFIG_ZSWAP */
5628
5629 static int __init mem_cgroup_swap_init(void)
5630 {
5631         if (mem_cgroup_disabled())
5632                 return 0;
5633
5634         WARN_ON(cgroup_add_dfl_cftypes(&memory_cgrp_subsys, swap_files));
5635 #ifdef CONFIG_MEMCG_V1
5636         WARN_ON(cgroup_add_legacy_cftypes(&memory_cgrp_subsys, memsw_files));
5637 #endif
5638 #ifdef CONFIG_ZSWAP
5639         WARN_ON(cgroup_add_dfl_cftypes(&memory_cgrp_subsys, zswap_files));
5640 #endif
5641         return 0;
5642 }
5643 subsys_initcall(mem_cgroup_swap_init);
5644
5645 #endif /* CONFIG_SWAP */
5646
5647 bool mem_cgroup_node_allowed(struct mem_cgroup *memcg, int nid)
5648 {
5649         return memcg ? cpuset_node_allowed(memcg->css.cgroup, nid) : true;
5650 }