memcg: multi-memcg percpu charge cache
author:    Shakeel Butt <shakeel.butt@linux.dev>
           Wed, 16 Apr 2025 18:02:29 +0000 (11:02 -0700)
committer: Andrew Morton <akpm@linux-foundation.org>
           Mon, 12 May 2025 00:48:32 +0000 (17:48 -0700)
Memory cgroup accounting is expensive, and to reduce the cost the kernel
maintains a per-cpu charge cache for a single memcg.  So, if a charge
request comes in for a different memcg, the kernel flushes the old memcg's
charge cache, charges the new memcg a fixed amount (64 pages), subtracts
the requested amount and stores the remainder in the per-cpu charge cache
for the new memcg.
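
A rough sketch of the pre-patch single-slot fast path is shown below.  It
is simplified: the real consume_stock() also takes the percpu stock_lock
and honors gfp_mask, and the helper name here is made up for illustration.

  /*
   * Simplified sketch of the pre-patch single-slot fast path.  The real
   * consume_stock() also grabs the percpu stock_lock and checks gfp_mask.
   */
  static bool consume_stock_sketch(struct mem_cgroup *memcg,
                                   unsigned int nr_pages)
  {
          struct memcg_stock_pcp *stock = this_cpu_ptr(&memcg_stock);
          unsigned int stock_pages = READ_ONCE(stock->nr_pages);

          if (memcg == READ_ONCE(stock->cached) && stock_pages >= nr_pages) {
                  /* hit: serve the charge from the cached batch */
                  WRITE_ONCE(stock->nr_pages, stock_pages - nr_pages);
                  return true;
          }
          /* miss: the caller drains the old memcg and refills for this one */
          return false;
  }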

This mechanism is based on the assumption that the kernel, for locality,
keeps a process on a CPU for a long period of time, so most of the charge
requests from that process will be served by that CPU's local charge
cache.

However this assumption breaks down for incoming network traffic on a
multi-tenant machine.  We are in the process of running multiple workloads
on a single machine, and when those workloads are network heavy we see a
very high network memory accounting cost.  We have observed multiple CPUs
spending almost 100% of their time in net_rx_action, with almost all of
that time spent in memcg accounting of the network traffic.

More precisely, net_rx_action serves packets from multiple workloads and
therefore sees an interleaved mix of their packets.  Switching the per-cpu
cache to a different memcg is very expensive, and we observe a lot of such
switches on the machine: almost all the time is spent charging the new
memcg and flushing the older memcg's cache.  So we clearly need a per-cpu
cache that supports multiple memcgs for this scenario.

This patch implements a simple (and dumb) multi-memcg percpu charge
cache.  We actually started with a more sophisticated LRU-based approach,
but the dumb one consistently beat it by 1% to 3%, so we are going with
the simple approach.

Some of the design choices are:

1. Fit all cached memcgs and their page counts in a single cacheline (see
   the sizing sketch after this list).
2. The cache array can be a mix of empty slots and charged memcg slots, so
   the kernel has to traverse the full array.
3. A cache drain triggered from reclaim drains all cached memcgs, to keep
   things simple.
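
A minimal, self-contained sketch of the sizing math behind choice 1 is
shown below (the struct name and the userspace check are illustrative, not
part of the patch): seven mem_cgroup pointers take 56 bytes on a 64-bit
kernel and seven one-byte page counters take 7 bytes, so with one byte of
padding the two arrays exactly fill a 64-byte cacheline, which is why the
patch picks NR_MEMCG_STOCK = 7.

  /* Illustrative userspace check of the cacheline budget; not kernel code. */
  #include <assert.h>
  #include <stdint.h>

  #define NR_MEMCG_STOCK 7
  #define CACHELINE_SIZE 64       /* assumed L1 cacheline size */

  struct mem_cgroup;              /* opaque here; only the pointer size matters */

  struct memcg_stock_slots {      /* hypothetical name for the cached slots */
          uint8_t nr_pages[NR_MEMCG_STOCK];          /*  7 bytes           */
          struct mem_cgroup *cached[NR_MEMCG_STOCK]; /* 56 bytes on 64-bit */
  };

  /* 7 + 1 (padding) + 56 = 64 bytes, i.e. everything shares one cacheline. */
  static_assert(sizeof(struct memcg_stock_slots) <= CACHELINE_SIZE,
                "cached memcgs and their page counts must fit in a cacheline");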

To evaluate the impact of this optimization, we ran the following workload
on a 72-CPU machine, with each netperf client running in a different
cgroup.  The next-20250415 kernel is used as the baseline.

 $ netserver -6
 $ netperf -6 -H ::1 -l 60 -t TCP_SENDFILE -- -m 10K

number of clients | Without patch | With patch
  6               | 42584.1 Mbps  | 48603.4 Mbps (14.13% improvement)
  12              | 30617.1 Mbps  | 47919.7 Mbps (56.51% improvement)
  18              | 25305.2 Mbps  | 45497.3 Mbps (79.79% improvement)
  24              | 20104.1 Mbps  | 37907.7 Mbps (88.55% improvement)
  30              | 14702.4 Mbps  | 30746.5 Mbps (109.12% improvement)
  36              | 10801.5 Mbps  | 26476.3 Mbps (145.11% improvement)

The results show a drastic improvement for network-intensive workloads.

[shakeel.butt@linux.dev: add BUILD_BUG_ON() for MEMCG_CHARGE_BATCH]
Link: https://lkml.kernel.org/r/rlsgeosg3j7v5nihhbxxxbv3xfy4ejvigihj7lkkbt3n6imyne@2apxx2jm2e57
[shakeel.butt@linux.dev: simplify refill_stock]
Link: https://lkml.kernel.org/r/as5cdsm4lraxupg3t6onep2ixql72za25hvd4x334dsoyo4apr@zyzl4vkuevuv
[hughd@google.com: it's better to stock nr_pages than the uninitialized stock_pages]
Link: https://lkml.kernel.org/r/d542d18f-1caa-6fea-e2c3-3555c87bcf64@google.com
[shakeel.butt@linux.dev: add comment per Michal and use DEFINE_PER_CPU_ALIGNED instead of DEFINE_PER_CPU per Vlastimil]
Link: https://lkml.kernel.org/r/dieeei3squ2gcnqxdjayvxbvzldr266rhnvtl3vjzsqevxkevf@ckui5vjzl2qg
Link: https://lkml.kernel.org/r/20250416180229.2902751-1-shakeel.butt@linux.dev
Signed-off-by: Shakeel Butt <shakeel.butt@linux.dev>
Signed-off-by: Hugh Dickins <hughd@google.com>
Acked-by: Jakub Kicinski <kuba@kernel.org>
Acked-by: Michal Hocko <mhocko@suse.com>
Cc: Eric Dumazet <edumazet@google.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Muchun Song <muchun.song@linux.dev>
Cc: Roman Gushchin <roman.gushchin@linux.dev>
Cc: Soheil Hassas Yeganeh <soheil@google.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Michal Hocko <mhocko@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
mm/memcontrol.c

index e64ac7942a21d51dc186aea282ceabc872f4794a..3020bb82c94cca0c87f24b9da965e9e8bdd5e122 100644 (file)
@@ -1769,10 +1769,15 @@ void mem_cgroup_print_oom_group(struct mem_cgroup *memcg)
        pr_cont(" are going to be killed due to memory.oom.group set\n");
 }
 
+/*
+ * The value of NR_MEMCG_STOCK is selected to keep the cached memcgs and their
+ * nr_pages in a single cacheline. This may change in future.
+ */
+#define NR_MEMCG_STOCK 7
 struct memcg_stock_pcp {
        local_trylock_t stock_lock;
-       struct mem_cgroup *cached; /* this never be root cgroup */
-       unsigned int nr_pages;
+       uint8_t nr_pages[NR_MEMCG_STOCK];
+       struct mem_cgroup *cached[NR_MEMCG_STOCK];
 
        struct obj_cgroup *cached_objcg;
        struct pglist_data *cached_pgdat;
@@ -1784,7 +1789,7 @@ struct memcg_stock_pcp {
        unsigned long flags;
 #define FLUSHING_CACHED_CHARGE 0
 };
-static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock) = {
+static DEFINE_PER_CPU_ALIGNED(struct memcg_stock_pcp, memcg_stock) = {
        .stock_lock = INIT_LOCAL_TRYLOCK(stock_lock),
 };
 static DEFINE_MUTEX(percpu_charge_mutex);
@@ -1809,9 +1814,10 @@ static bool consume_stock(struct mem_cgroup *memcg, unsigned int nr_pages,
                          gfp_t gfp_mask)
 {
        struct memcg_stock_pcp *stock;
-       unsigned int stock_pages;
+       uint8_t stock_pages;
        unsigned long flags;
        bool ret = false;
+       int i;
 
        if (nr_pages > MEMCG_CHARGE_BATCH)
                return ret;
@@ -1822,10 +1828,17 @@ static bool consume_stock(struct mem_cgroup *memcg, unsigned int nr_pages,
                return ret;
 
        stock = this_cpu_ptr(&memcg_stock);
-       stock_pages = READ_ONCE(stock->nr_pages);
-       if (memcg == READ_ONCE(stock->cached) && stock_pages >= nr_pages) {
-               WRITE_ONCE(stock->nr_pages, stock_pages - nr_pages);
-               ret = true;
+
+       for (i = 0; i < NR_MEMCG_STOCK; ++i) {
+               if (memcg != READ_ONCE(stock->cached[i]))
+                       continue;
+
+               stock_pages = READ_ONCE(stock->nr_pages[i]);
+               if (stock_pages >= nr_pages) {
+                       WRITE_ONCE(stock->nr_pages[i], stock_pages - nr_pages);
+                       ret = true;
+               }
+               break;
        }
 
        local_unlock_irqrestore(&memcg_stock.stock_lock, flags);
@@ -1843,21 +1856,30 @@ static void memcg_uncharge(struct mem_cgroup *memcg, unsigned int nr_pages)
 /*
  * Returns stocks cached in percpu and reset cached information.
  */
-static void drain_stock(struct memcg_stock_pcp *stock)
+static void drain_stock(struct memcg_stock_pcp *stock, int i)
 {
-       unsigned int stock_pages = READ_ONCE(stock->nr_pages);
-       struct mem_cgroup *old = READ_ONCE(stock->cached);
+       struct mem_cgroup *old = READ_ONCE(stock->cached[i]);
+       uint8_t stock_pages;
 
        if (!old)
                return;
 
+       stock_pages = READ_ONCE(stock->nr_pages[i]);
        if (stock_pages) {
                memcg_uncharge(old, stock_pages);
-               WRITE_ONCE(stock->nr_pages, 0);
+               WRITE_ONCE(stock->nr_pages[i], 0);
        }
 
        css_put(&old->css);
-       WRITE_ONCE(stock->cached, NULL);
+       WRITE_ONCE(stock->cached[i], NULL);
+}
+
+static void drain_stock_fully(struct memcg_stock_pcp *stock)
+{
+       int i;
+
+       for (i = 0; i < NR_MEMCG_STOCK; ++i)
+               drain_stock(stock, i);
 }
 
 static void drain_local_stock(struct work_struct *dummy)
@@ -1874,7 +1896,7 @@ static void drain_local_stock(struct work_struct *dummy)
 
        stock = this_cpu_ptr(&memcg_stock);
        drain_obj_stock(stock);
-       drain_stock(stock);
+       drain_stock_fully(stock);
        clear_bit(FLUSHING_CACHED_CHARGE, &stock->flags);
 
        local_unlock_irqrestore(&memcg_stock.stock_lock, flags);
@@ -1883,35 +1905,91 @@ static void drain_local_stock(struct work_struct *dummy)
 static void refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
 {
        struct memcg_stock_pcp *stock;
-       unsigned int stock_pages;
+       struct mem_cgroup *cached;
+       uint8_t stock_pages;
        unsigned long flags;
+       bool success = false;
+       int empty_slot = -1;
+       int i;
+
+       /*
+        * For now limit MEMCG_CHARGE_BATCH to 127 and less. In future if we
+        * decide to increase it more than 127 then we will need more careful
+        * handling of nr_pages[] in struct memcg_stock_pcp.
+        */
+       BUILD_BUG_ON(MEMCG_CHARGE_BATCH > S8_MAX);
 
        VM_WARN_ON_ONCE(mem_cgroup_is_root(memcg));
 
-       if (!local_trylock_irqsave(&memcg_stock.stock_lock, flags)) {
+       if (nr_pages > MEMCG_CHARGE_BATCH ||
+           !local_trylock_irqsave(&memcg_stock.stock_lock, flags)) {
                /*
-                * In case of unlikely failure to lock percpu stock_lock
-                * uncharge memcg directly.
+                * In case of larger than batch refill or unlikely failure to
+                * lock the percpu stock_lock, uncharge memcg directly.
                 */
                memcg_uncharge(memcg, nr_pages);
                return;
        }
 
        stock = this_cpu_ptr(&memcg_stock);
-       if (READ_ONCE(stock->cached) != memcg) { /* reset if necessary */
-               drain_stock(stock);
-               css_get(&memcg->css);
-               WRITE_ONCE(stock->cached, memcg);
+       for (i = 0; i < NR_MEMCG_STOCK; ++i) {
+               cached = READ_ONCE(stock->cached[i]);
+               if (!cached && empty_slot == -1)
+                       empty_slot = i;
+               if (memcg == READ_ONCE(stock->cached[i])) {
+                       stock_pages = READ_ONCE(stock->nr_pages[i]) + nr_pages;
+                       WRITE_ONCE(stock->nr_pages[i], stock_pages);
+                       if (stock_pages > MEMCG_CHARGE_BATCH)
+                               drain_stock(stock, i);
+                       success = true;
+                       break;
+               }
        }
-       stock_pages = READ_ONCE(stock->nr_pages) + nr_pages;
-       WRITE_ONCE(stock->nr_pages, stock_pages);
 
-       if (stock_pages > MEMCG_CHARGE_BATCH)
-               drain_stock(stock);
+       if (!success) {
+               i = empty_slot;
+               if (i == -1) {
+                       i = get_random_u32_below(NR_MEMCG_STOCK);
+                       drain_stock(stock, i);
+               }
+               css_get(&memcg->css);
+               WRITE_ONCE(stock->cached[i], memcg);
+               WRITE_ONCE(stock->nr_pages[i], nr_pages);
+       }
 
        local_unlock_irqrestore(&memcg_stock.stock_lock, flags);
 }
 
+static bool is_drain_needed(struct memcg_stock_pcp *stock,
+                           struct mem_cgroup *root_memcg)
+{
+       struct mem_cgroup *memcg;
+       bool flush = false;
+       int i;
+
+       rcu_read_lock();
+
+       if (obj_stock_flush_required(stock, root_memcg)) {
+               flush = true;
+               goto out;
+       }
+
+       for (i = 0; i < NR_MEMCG_STOCK; ++i) {
+               memcg = READ_ONCE(stock->cached[i]);
+               if (!memcg)
+                       continue;
+
+               if (READ_ONCE(stock->nr_pages[i]) &&
+                   mem_cgroup_is_descendant(memcg, root_memcg)) {
+                       flush = true;
+                       break;
+               }
+       }
+out:
+       rcu_read_unlock();
+       return flush;
+}
+
 /*
  * Drains all per-CPU charge caches for given root_memcg resp. subtree
  * of the hierarchy under it.
@@ -1933,17 +2011,7 @@ void drain_all_stock(struct mem_cgroup *root_memcg)
        curcpu = smp_processor_id();
        for_each_online_cpu(cpu) {
                struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu);
-               struct mem_cgroup *memcg;
-               bool flush = false;
-
-               rcu_read_lock();
-               memcg = READ_ONCE(stock->cached);
-               if (memcg && READ_ONCE(stock->nr_pages) &&
-                   mem_cgroup_is_descendant(memcg, root_memcg))
-                       flush = true;
-               else if (obj_stock_flush_required(stock, root_memcg))
-                       flush = true;
-               rcu_read_unlock();
+               bool flush = is_drain_needed(stock, root_memcg);
 
                if (flush &&
                    !test_and_set_bit(FLUSHING_CACHED_CHARGE, &stock->flags)) {
@@ -1969,7 +2037,7 @@ static int memcg_hotplug_cpu_dead(unsigned int cpu)
        drain_obj_stock(stock);
        local_unlock_irqrestore(&memcg_stock.stock_lock, flags);
 
-       drain_stock(stock);
+       drain_stock_fully(stock);
 
        return 0;
 }