Merge tag 'trace-v6.7' of git://git.kernel.org/pub/scm/linux/kernel/git/trace/linux...

[linux-2.6-block.git] / mm / memcontrol.c
diff --git a/mm/memcontrol.c b/mm/memcontrol.c

index 5b009b233ab8921110148de54b6b96e69d5ab638..774bd6e21e2788ac1ee094c84176b488543fd6c3 100644 (file)
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -249,6 +249,9 @@ struct mem_cgroup *vmpressure_to_memcg(struct vmpressure *vmpr)
         return container_of(vmpr, struct mem_cgroup, vmpressure);
  }
  
+#define CURRENT_OBJCG_UPDATE_BIT 0
+#define CURRENT_OBJCG_UPDATE_FLAG (1UL << CURRENT_OBJCG_UPDATE_BIT)
+
  #ifdef CONFIG_MEMCG_KMEM
  static DEFINE_SPINLOCK(objcg_lock);
  
@@ -704,6 +707,8 @@ static const unsigned int memcg_vm_event_stat[] = {
  #ifdef CONFIG_TRANSPARENT_HUGEPAGE
         THP_FAULT_ALLOC,
         THP_COLLAPSE_ALLOC,
+       THP_SWPOUT,
+       THP_SWPOUT_FALLBACK,
  #endif
  };
  
@@ -761,6 +766,22 @@ unsigned long memcg_page_state(struct mem_cgroup *memcg, int idx)
         return x;
  }
  
+static int memcg_page_state_unit(int item);
+
+/*
+ * Normalize the value passed into memcg_rstat_updated() to be in pages. Round
+ * up non-zero sub-page updates to 1 page as zero page updates are ignored.
+ */
+static int memcg_state_val_in_pages(int idx, int val)
+{
+       int unit = memcg_page_state_unit(idx);
+
+       if (!val || unit == PAGE_SIZE)
+               return val;
+       else
+               return max(val * unit / PAGE_SIZE, 1UL);
+}
+
  /**
   * __mod_memcg_state - update cgroup memory statistics
   * @memcg: the memory cgroup
@@ -773,7 +794,7 @@ void __mod_memcg_state(struct mem_cgroup *memcg, int idx, int val)
                 return;
  
         __this_cpu_add(memcg->vmstats_percpu->state[idx], val);
-       memcg_rstat_updated(memcg, val);
+       memcg_rstat_updated(memcg, memcg_state_val_in_pages(idx, val));
  }
  
  /* idx can be of type enum memcg_stat_item or node_stat_item. */
@@ -798,7 +819,7 @@ void __mod_memcg_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx,
         memcg = pn->memcg;
  
         /*
-        * The caller from rmap relay on disabled preemption becase they never
+        * The caller from rmap relies on disabled preemption because they never
          * update their counter from in-interrupt context. For these two
          * counters we check that the update is never performed from an
          * interrupt context while other caller need to have disabled interrupt.
@@ -824,7 +845,7 @@ void __mod_memcg_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx,
         /* Update lruvec */
         __this_cpu_add(pn->lruvec_stats_percpu->state[idx], val);
  
-       memcg_rstat_updated(memcg, val);
+       memcg_rstat_updated(memcg, memcg_state_val_in_pages(idx, val));
         memcg_stats_unlock();
  }
  
@@ -1068,17 +1089,25 @@ struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm)
  }
  EXPORT_SYMBOL(get_mem_cgroup_from_mm);
  
-static __always_inline bool memcg_kmem_bypass(void)
+/**
+ * get_mem_cgroup_from_current - Obtain a reference on current task's memcg.
+ */
+struct mem_cgroup *get_mem_cgroup_from_current(void)
  {
-       /* Allow remote memcg charging from any context. */
-       if (unlikely(active_memcg()))
-               return false;
+       struct mem_cgroup *memcg;
  
-       /* Memcg to charge can't be determined. */
-       if (!in_task() || !current->mm || (current->flags & PF_KTHREAD))
-               return true;
+       if (mem_cgroup_disabled())
+               return NULL;
  
-       return false;
+again:
+       rcu_read_lock();
+       memcg = mem_cgroup_from_task(current);
+       if (!css_tryget(&memcg->css)) {
+               rcu_read_unlock();
+               goto again;
+       }
+       rcu_read_unlock();
+       return memcg;
  }
  
  /**
@@ -1533,7 +1562,7 @@ static const struct memory_stat memory_stats[] = {
         { "workingset_nodereclaim",     WORKINGSET_NODERECLAIM          },
  };
  
-/* Translate stat items to the correct unit for memory.stat output */
+/* The actual unit of the state item, not the same as the output unit */
  static int memcg_page_state_unit(int item)
  {
         switch (item) {
@@ -1541,6 +1570,22 @@ static int memcg_page_state_unit(int item)
         case MEMCG_ZSWAP_B:
         case NR_SLAB_RECLAIMABLE_B:
         case NR_SLAB_UNRECLAIMABLE_B:
+               return 1;
+       case NR_KERNEL_STACK_KB:
+               return SZ_1K;
+       default:
+               return PAGE_SIZE;
+       }
+}
+
+/* Translate stat items to the correct unit for memory.stat output */
+static int memcg_page_state_output_unit(int item)
+{
+       /*
+        * Workingset state is actually in pages, but we export it to userspace
+        * as a scalar count of events, so special case it here.
+        */
+       switch (item) {
         case WORKINGSET_REFAULT_ANON:
         case WORKINGSET_REFAULT_FILE:
         case WORKINGSET_ACTIVATE_ANON:
@@ -1549,17 +1594,23 @@ static int memcg_page_state_unit(int item)
         case WORKINGSET_RESTORE_FILE:
         case WORKINGSET_NODERECLAIM:
                 return 1;
-       case NR_KERNEL_STACK_KB:
-               return SZ_1K;
         default:
-               return PAGE_SIZE;
+               return memcg_page_state_unit(item);
         }
  }
  
  static inline unsigned long memcg_page_state_output(struct mem_cgroup *memcg,
                                                     int item)
  {
-       return memcg_page_state(memcg, item) * memcg_page_state_unit(item);
+       return memcg_page_state(memcg, item) *
+               memcg_page_state_output_unit(item);
+}
+
+static inline unsigned long memcg_page_state_local_output(
+               struct mem_cgroup *memcg, int item)
+{
+       return memcg_page_state_local(memcg, item) *
+               memcg_page_state_output_unit(item);
  }
  
  static void memcg_stat_format(struct mem_cgroup *memcg, struct seq_buf *s)
@@ -2833,7 +2884,12 @@ static inline int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
         return try_charge_memcg(memcg, gfp_mask, nr_pages);
  }
  
-static inline void cancel_charge(struct mem_cgroup *memcg, unsigned int nr_pages)
+/**
+ * mem_cgroup_cancel_charge() - cancel an uncommitted try_charge() call.
+ * @memcg: memcg previously charged.
+ * @nr_pages: number of pages previously charged.
+ */
+void mem_cgroup_cancel_charge(struct mem_cgroup *memcg, unsigned int nr_pages)
  {
         if (mem_cgroup_is_root(memcg))
                 return;
@@ -2858,6 +2914,22 @@ static void commit_charge(struct folio *folio, struct mem_cgroup *memcg)
         folio->memcg_data = (unsigned long)memcg;
  }
  
+/**
+ * mem_cgroup_commit_charge - commit a previously successful try_charge().
+ * @folio: folio to commit the charge to.
+ * @memcg: memcg previously charged.
+ */
+void mem_cgroup_commit_charge(struct folio *folio, struct mem_cgroup *memcg)
+{
+       css_get(&memcg->css);
+       commit_charge(folio, memcg);
+
+       local_irq_disable();
+       mem_cgroup_charge_statistics(memcg, folio_nr_pages(folio));
+       memcg_check_events(memcg, folio_nid(folio));
+       local_irq_enable();
+}
+
  #ifdef CONFIG_MEMCG_KMEM
  /*
   * The allocated objcg pointers array is not accounted directly.
@@ -3007,28 +3079,105 @@ static struct obj_cgroup *__get_obj_cgroup_from_memcg(struct mem_cgroup *memcg)
  
         for (; !mem_cgroup_is_root(memcg); memcg = parent_mem_cgroup(memcg)) {
                 objcg = rcu_dereference(memcg->objcg);
-               if (objcg && obj_cgroup_tryget(objcg))
+               if (likely(objcg && obj_cgroup_tryget(objcg)))
                         break;
                 objcg = NULL;
         }
         return objcg;
  }
  
-__always_inline struct obj_cgroup *get_obj_cgroup_from_current(void)
+static struct obj_cgroup *current_objcg_update(void)
  {
-       struct obj_cgroup *objcg = NULL;
         struct mem_cgroup *memcg;
+       struct obj_cgroup *old, *objcg = NULL;
  
-       if (memcg_kmem_bypass())
-               return NULL;
+       do {
+               /* Atomically drop the update bit. */
+               old = xchg(&current->objcg, NULL);
+               if (old) {
+                       old = (struct obj_cgroup *)
+                               ((unsigned long)old & ~CURRENT_OBJCG_UPDATE_FLAG);
+                       if (old)
+                               obj_cgroup_put(old);
+
+                       old = NULL;
+               }
  
-       rcu_read_lock();
-       if (unlikely(active_memcg()))
-               memcg = active_memcg();
-       else
+               /* If new objcg is NULL, no reason for the second atomic update. */
+               if (!current->mm || (current->flags & PF_KTHREAD))
+                       return NULL;
+
+               /*
+                * Release the objcg pointer from the previous iteration,
+                * if try_cmpxchg() below fails.
+                */
+               if (unlikely(objcg)) {
+                       obj_cgroup_put(objcg);
+                       objcg = NULL;
+               }
+
+               /*
+                * Obtain the new objcg pointer. The current task can be
+                * asynchronously moved to another memcg and the previous
+                * memcg can be offlined. So let's get the memcg pointer
+                * and try to get a reference to objcg under an RCU read lock.
+                */
+
+               rcu_read_lock();
                 memcg = mem_cgroup_from_task(current);
-       objcg = __get_obj_cgroup_from_memcg(memcg);
-       rcu_read_unlock();
+               objcg = __get_obj_cgroup_from_memcg(memcg);
+               rcu_read_unlock();
+
+               /*
+                * Try to set up a new objcg pointer atomically. If it
+                * fails, it means the update flag was set concurrently, so
+                * the whole procedure should be repeated.
+                */
+       } while (!try_cmpxchg(&current->objcg, &old, objcg));
+
+       return objcg;
+}
+
+__always_inline struct obj_cgroup *current_obj_cgroup(void)
+{
+       struct mem_cgroup *memcg;
+       struct obj_cgroup *objcg;
+
+       if (in_task()) {
+               memcg = current->active_memcg;
+               if (unlikely(memcg))
+                       goto from_memcg;
+
+               objcg = READ_ONCE(current->objcg);
+               if (unlikely((unsigned long)objcg & CURRENT_OBJCG_UPDATE_FLAG))
+                       objcg = current_objcg_update();
+               /*
+                * Objcg reference is kept by the task, so it's safe
+                * to use the objcg by the current task.
+                */
+               return objcg;
+       }
+
+       memcg = this_cpu_read(int_active_memcg);
+       if (unlikely(memcg))
+               goto from_memcg;
+
+       return NULL;
+
+from_memcg:
+       for (; !mem_cgroup_is_root(memcg); memcg = parent_mem_cgroup(memcg)) {
+               /*
+                * Memcg pointer is protected by scope (see set_active_memcg())
+                * and is pinning the corresponding objcg, so objcg can't go
+                * away and can be used within the scope without any additional
+                * protection.
+                */
+               objcg = rcu_dereference_check(memcg->objcg, 1);
+               if (likely(objcg))
+                       break;
+               objcg = NULL;
+       }
+
         return objcg;
  }
  
@@ -3126,15 +3275,15 @@ int __memcg_kmem_charge_page(struct page *page, gfp_t gfp, int order)
         struct obj_cgroup *objcg;
         int ret = 0;
  
-       objcg = get_obj_cgroup_from_current();
+       objcg = current_obj_cgroup();
         if (objcg) {
                 ret = obj_cgroup_charge_pages(objcg, gfp, 1 << order);
                 if (!ret) {
+                       obj_cgroup_get(objcg);
                         page->memcg_data = (unsigned long)objcg |
                                 MEMCG_DATA_KMEM;
                         return 0;
                 }
-               obj_cgroup_put(objcg);
         }
         return ret;
  }
@@ -3761,6 +3910,8 @@ static int memcg_online_kmem(struct mem_cgroup *memcg)
  
         objcg->memcg = memcg;
         rcu_assign_pointer(memcg->objcg, objcg);
+       obj_cgroup_get(objcg);
+       memcg->orig_objcg = objcg;
  
         static_branch_enable(&memcg_kmem_online_key);
  
@@ -4066,7 +4217,10 @@ static const unsigned int memcg1_stats[] = {
         NR_WRITEBACK,
         WORKINGSET_REFAULT_ANON,
         WORKINGSET_REFAULT_FILE,
+#ifdef CONFIG_SWAP
         MEMCG_SWAP,
+       NR_SWAPCACHE,
+#endif
  };
  
  static const char *const memcg1_stat_names[] = {
@@ -4081,7 +4235,10 @@ static const char *const memcg1_stat_names[] = {
         "writeback",
         "workingset_refault_anon",
         "workingset_refault_file",
+#ifdef CONFIG_SWAP
         "swap",
+       "swapcached",
+#endif
  };
  
  /* Universal VM events cgroup1 shows, original sort order */
@@ -4105,11 +4262,8 @@ static void memcg1_stat_format(struct mem_cgroup *memcg, struct seq_buf *s)
         for (i = 0; i < ARRAY_SIZE(memcg1_stats); i++) {
                 unsigned long nr;
  
-               if (memcg1_stats[i] == MEMCG_SWAP && !do_memsw_account())
-                       continue;
-               nr = memcg_page_state_local(memcg, memcg1_stats[i]);
-               seq_buf_printf(s, "%s %lu\n", memcg1_stat_names[i],
-                          nr * memcg_page_state_unit(memcg1_stats[i]));
+               nr = memcg_page_state_local_output(memcg, memcg1_stats[i]);
+               seq_buf_printf(s, "%s %lu\n", memcg1_stat_names[i], nr);
         }
  
         for (i = 0; i < ARRAY_SIZE(memcg1_events); i++)
@@ -4129,18 +4283,15 @@ static void memcg1_stat_format(struct mem_cgroup *memcg, struct seq_buf *s)
         }
         seq_buf_printf(s, "hierarchical_memory_limit %llu\n",
                        (u64)memory * PAGE_SIZE);
-       if (do_memsw_account())
-               seq_buf_printf(s, "hierarchical_memsw_limit %llu\n",
-                              (u64)memsw * PAGE_SIZE);
+       seq_buf_printf(s, "hierarchical_memsw_limit %llu\n",
+                      (u64)memsw * PAGE_SIZE);
  
         for (i = 0; i < ARRAY_SIZE(memcg1_stats); i++) {
                 unsigned long nr;
  
-               if (memcg1_stats[i] == MEMCG_SWAP && !do_memsw_account())
-                       continue;
-               nr = memcg_page_state(memcg, memcg1_stats[i]);
+               nr = memcg_page_state_output(memcg, memcg1_stats[i]);
                 seq_buf_printf(s, "total_%s %llu\n", memcg1_stat_names[i],
-                          (u64)nr * memcg_page_state_unit(memcg1_stats[i]));
+                              (u64)nr);
         }
  
         for (i = 0; i < ARRAY_SIZE(memcg1_events); i++)
@@ -5268,6 +5419,9 @@ static void __mem_cgroup_free(struct mem_cgroup *memcg)
  {
         int node;
  
+       if (memcg->orig_objcg)
+               obj_cgroup_put(memcg->orig_objcg);
+
         for_each_node(node)
                 free_mem_cgroup_per_node_info(memcg, node);
         kfree(memcg->vmstats);
@@ -6076,7 +6230,7 @@ static void __mem_cgroup_clear_mc(void)
  
         /* we must uncharge all the leftover precharges from mc.to */
         if (mc.precharge) {
-               cancel_charge(mc.to, mc.precharge);
+               mem_cgroup_cancel_charge(mc.to, mc.precharge);
                 mc.precharge = 0;
         }
         /*
@@ -6084,7 +6238,7 @@ static void __mem_cgroup_clear_mc(void)
          * we must uncharge here.
          */
         if (mc.moved_charge) {
-               cancel_charge(mc.from, mc.moved_charge);
+               mem_cgroup_cancel_charge(mc.from, mc.moved_charge);
                 mc.moved_charge = 0;
         }
         /* we must fixup refcnts and charges */
@@ -6364,6 +6518,7 @@ static void mem_cgroup_move_task(void)
                 mem_cgroup_clear_mc();
         }
  }
+
  #else  /* !CONFIG_MMU */
  static int mem_cgroup_can_attach(struct cgroup_taskset *tset)
  {
@@ -6377,8 +6532,39 @@ static void mem_cgroup_move_task(void)
  }
  #endif
  
+#ifdef CONFIG_MEMCG_KMEM
+static void mem_cgroup_fork(struct task_struct *task)
+{
+       /*
+        * Set the update flag to cause task->objcg to be initialized lazily
+        * on the first allocation. It can be done without any synchronization
+        * because it's always performed on the current task, as is
+        * current_objcg_update().
+        */
+       task->objcg = (struct obj_cgroup *)CURRENT_OBJCG_UPDATE_FLAG;
+}
+
+static void mem_cgroup_exit(struct task_struct *task)
+{
+       struct obj_cgroup *objcg = task->objcg;
+
+       objcg = (struct obj_cgroup *)
+               ((unsigned long)objcg & ~CURRENT_OBJCG_UPDATE_FLAG);
+       if (objcg)
+               obj_cgroup_put(objcg);
+
+       /*
+        * Some kernel allocations can happen after this point,
+        * but let's ignore them. It can be done without any synchronization
+        * because it's always performed on the current task, as is
+        * current_objcg_update().
+        */
+       task->objcg = NULL;
+}
+#endif
+
  #ifdef CONFIG_LRU_GEN
-static void mem_cgroup_attach(struct cgroup_taskset *tset)
+static void mem_cgroup_lru_gen_attach(struct cgroup_taskset *tset)
  {
         struct task_struct *task;
         struct cgroup_subsys_state *css;
@@ -6396,10 +6582,31 @@ static void mem_cgroup_attach(struct cgroup_taskset *tset)
         task_unlock(task);
  }
  #else
+static void mem_cgroup_lru_gen_attach(struct cgroup_taskset *tset) {}
+#endif /* CONFIG_LRU_GEN */
+
+#ifdef CONFIG_MEMCG_KMEM
+static void mem_cgroup_kmem_attach(struct cgroup_taskset *tset)
+{
+       struct task_struct *task;
+       struct cgroup_subsys_state *css;
+
+       cgroup_taskset_for_each(task, css, tset) {
+               /* atomically set the update bit */
+               set_bit(CURRENT_OBJCG_UPDATE_BIT, (unsigned long *)&task->objcg);
+       }
+}
+#else
+static void mem_cgroup_kmem_attach(struct cgroup_taskset *tset) {}
+#endif /* CONFIG_MEMCG_KMEM */
+
+#if defined(CONFIG_LRU_GEN) || defined(CONFIG_MEMCG_KMEM)
  static void mem_cgroup_attach(struct cgroup_taskset *tset)
  {
+       mem_cgroup_lru_gen_attach(tset);
+       mem_cgroup_kmem_attach(tset);
  }
-#endif /* CONFIG_LRU_GEN */
+#endif
  
  static int seq_puts_memcg_tunable(struct seq_file *m, unsigned long value)
  {
@@ -6622,7 +6829,8 @@ static int memory_stat_show(struct seq_file *m, void *v)
  static inline unsigned long lruvec_page_state_output(struct lruvec *lruvec,
                                                      int item)
  {
-       return lruvec_page_state(lruvec, item) * memcg_page_state_unit(item);
+       return lruvec_page_state(lruvec, item) *
+               memcg_page_state_output_unit(item);
  }
  
  static int memory_numa_stat_show(struct seq_file *m, void *v)
@@ -6808,9 +7016,15 @@ struct cgroup_subsys memory_cgrp_subsys = {
         .css_reset = mem_cgroup_css_reset,
         .css_rstat_flush = mem_cgroup_css_rstat_flush,
         .can_attach = mem_cgroup_can_attach,
+#if defined(CONFIG_LRU_GEN) || defined(CONFIG_MEMCG_KMEM)
         .attach = mem_cgroup_attach,
+#endif
         .cancel_attach = mem_cgroup_cancel_attach,
         .post_attach = mem_cgroup_move_task,
+#ifdef CONFIG_MEMCG_KMEM
+       .fork = mem_cgroup_fork,
+       .exit = mem_cgroup_exit,
+#endif
         .dfl_cftypes = memory_files,
         .legacy_cftypes = mem_cgroup_legacy_files,
         .early_init = 0,
@@ -6990,20 +7204,13 @@ void mem_cgroup_calculate_protection(struct mem_cgroup *root,
  static int charge_memcg(struct folio *folio, struct mem_cgroup *memcg,
                         gfp_t gfp)
  {
-       long nr_pages = folio_nr_pages(folio);
         int ret;
  
-       ret = try_charge(memcg, gfp, nr_pages);
+       ret = try_charge(memcg, gfp, folio_nr_pages(folio));
         if (ret)
                 goto out;
  
-       css_get(&memcg->css);
-       commit_charge(folio, memcg);
-
-       local_irq_disable();
-       mem_cgroup_charge_statistics(memcg, nr_pages);
-       memcg_check_events(memcg, folio_nid(folio));
-       local_irq_enable();
+       mem_cgroup_commit_charge(folio, memcg);
  out:
         return ret;
  }
@@ -7020,6 +7227,41 @@ int __mem_cgroup_charge(struct folio *folio, struct mm_struct *mm, gfp_t gfp)
         return ret;
  }
  
+/**
+ * mem_cgroup_hugetlb_try_charge - try to charge the memcg for a hugetlb folio
+ * @memcg: memcg to charge.
+ * @gfp: reclaim mode.
+ * @nr_pages: number of pages to charge.
+ *
+ * This function is called when allocating a huge page folio to determine if
+ * the memcg has the capacity for it. It does not commit the charge yet,
+ * as the hugetlb folio itself has not been obtained from the hugetlb pool.
+ *
+ * Once we have obtained the hugetlb folio, we can call
+ * mem_cgroup_commit_charge() to commit the charge. If we fail to obtain the
+ * folio, we should instead call mem_cgroup_cancel_charge() to undo the effect
+ * of try_charge().
+ *
+ * Returns 0 on success. Otherwise, an error code is returned.
+ */
+int mem_cgroup_hugetlb_try_charge(struct mem_cgroup *memcg, gfp_t gfp,
+                       long nr_pages)
+{
+       /*
+        * If hugetlb memcg charging is not enabled, do not fail hugetlb allocation,
+        * but do not attempt to commit charge later (or cancel on error) either.
+        */
+       if (mem_cgroup_disabled() || !memcg ||
+               !cgroup_subsys_on_dfl(memory_cgrp_subsys) ||
+               !(cgrp_dfl_root.flags & CGRP_ROOT_MEMORY_HUGETLB_ACCOUNTING))
+               return -EOPNOTSUPP;
+
+       if (try_charge(memcg, gfp, nr_pages))
+               return -ENOMEM;
+
+       return 0;
+}
+
  /**
   * mem_cgroup_swapin_charge_folio - Charge a newly allocated folio for swapin.
   * @folio: folio to charge.
@@ -7216,16 +7458,17 @@ void __mem_cgroup_uncharge_list(struct list_head *page_list)
  }
  
  /**
- * mem_cgroup_migrate - Charge a folio's replacement.
+ * mem_cgroup_replace_folio - Charge a folio's replacement.
   * @old: Currently circulating folio.
   * @new: Replacement folio.
   *
   * Charge @new as a replacement folio for @old. @old will
- * be uncharged upon free.
+ * be uncharged upon free. This is only used by the page cache
+ * (in replace_page_cache_folio()).
   *
   * Both folios must be locked, @new->mapping must be set up.
   */
-void mem_cgroup_migrate(struct folio *old, struct folio *new)
+void mem_cgroup_replace_folio(struct folio *old, struct folio *new)
  {
         struct mem_cgroup *memcg;
         long nr_pages = folio_nr_pages(new);
@@ -7264,6 +7507,44 @@ void mem_cgroup_migrate(struct folio *old, struct folio *new)
         local_irq_restore(flags);
  }
  
+/**
+ * mem_cgroup_migrate - Transfer the memcg data from the old to the new folio.
+ * @old: Currently circulating folio.
+ * @new: Replacement folio.
+ *
+ * Transfer the memcg data from the old folio to the new folio for migration.
+ * The old folio's data info will be cleared. Note that the memory counters
+ * will remain unchanged throughout the process.
+ *
+ * Both folios must be locked, @new->mapping must be set up.
+ */
+void mem_cgroup_migrate(struct folio *old, struct folio *new)
+{
+       struct mem_cgroup *memcg;
+
+       VM_BUG_ON_FOLIO(!folio_test_locked(old), old);
+       VM_BUG_ON_FOLIO(!folio_test_locked(new), new);
+       VM_BUG_ON_FOLIO(folio_test_anon(old) != folio_test_anon(new), new);
+       VM_BUG_ON_FOLIO(folio_nr_pages(old) != folio_nr_pages(new), new);
+
+       if (mem_cgroup_disabled())
+               return;
+
+       memcg = folio_memcg(old);
+       /*
+        * Note that it is normal to see !memcg for a hugetlb folio.
+        * E.g., it could have been allocated when memory_hugetlb_accounting
+        * was not selected.
+        */
+       VM_WARN_ON_ONCE_FOLIO(!folio_test_hugetlb(old) && !memcg, old);
+       if (!memcg)
+               return;
+
+       /* Transfer the charge and the css ref */
+       commit_charge(new, memcg);
+       old->memcg_data = 0;
+}
+
  DEFINE_STATIC_KEY_FALSE(memcg_sockets_enabled_key);
  EXPORT_SYMBOL(memcg_sockets_enabled_key);
  
@@ -7763,7 +8044,7 @@ static struct cftype memsw_files[] = {
   *
   * This doesn't check for specific headroom, and it is not atomic
   * either. But with zswap, the size of the allocation is only known
- * once compression has occured, and this optimistic pre-check avoids
+ * once compression has occurred, and this optimistic pre-check avoids
   * spending cycles on compression when there is already no room left
   * or zswap is disabled altogether somewhere in the hierarchy.
   */