Merge tag 'cgroup-for-6.10' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup

author Linus Torvalds <torvalds@linux-foundation.org>

Thu, 16 May 2024 00:06:08 +0000 (17:06 -0700)

committer Linus Torvalds <torvalds@linux-foundation.org>

Thu, 16 May 2024 00:06:08 +0000 (17:06 -0700)
author Linus Torvalds <torvalds@linux-foundation.org>
Thu, 16 May 2024 00:06:08 +0000 (17:06 -0700)
committer Linus Torvalds <torvalds@linux-foundation.org>
Thu, 16 May 2024 00:06:08 +0000 (17:06 -0700)
diff --git a/Documentation/admin-guide/cgroup-v1/cgroups.rst b/Documentation/admin-guide/cgroup-v1/cgroups.rst

index 9343148ee99366ff499f069ebcd94eeedf44b2ec..a3e2edb3d2745a4dfc88cb5c18fc2cb425d8da9d 100644 (file)
--- a/Documentation/admin-guide/cgroup-v1/cgroups.rst
+++ b/Documentation/admin-guide/cgroup-v1/cgroups.rst
@@ -570,7 +570,7 @@ visible to cgroup_for_each_child/descendant_*() iterators. The
  subsystem may choose to fail creation by returning -errno. This
  callback can be used to implement reliable state sharing and
  propagation along the hierarchy. See the comment on
-cgroup_for_each_descendant_pre() for details.
+cgroup_for_each_live_descendant_pre() for details.
  
  ``void css_offline(struct cgroup *cgrp);``
  (cgroup_mutex held by caller)
diff --git a/Documentation/admin-guide/cgroup-v1/memcg_test.rst b/Documentation/admin-guide/cgroup-v1/memcg_test.rst

index 1f128458ddea49cd3c91d21a38f3d03e1af13726..9f8e27355cba54373bf5196f2a67663df07a6a1e 100644 (file)
--- a/Documentation/admin-guide/cgroup-v1/memcg_test.rst
+++ b/Documentation/admin-guide/cgroup-v1/memcg_test.rst
@@ -102,7 +102,7 @@ Under below explanation, we assume CONFIG_SWAP=y.
         The logic is very clear. (About migration, see below)
  
         Note:
-         __remove_from_page_cache() is called by remove_from_page_cache()
+         __filemap_remove_folio() is called by filemap_remove_folio()
           and __remove_mapping().
  
  6. Shmem(tmpfs) Page Cache
diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst

index eaf9e66e472a38b25300ede5d34b2444451f063e..f554df7ac649b43da45b44bbbec7ef3966f52dd5 100644 (file)
--- a/Documentation/admin-guide/cgroup-v2.rst
+++ b/Documentation/admin-guide/cgroup-v2.rst
@@ -1058,12 +1058,15 @@ cpufreq governor about the minimum desired frequency which should always be
  provided by a CPU, as well as the maximum desired frequency, which should not
  be exceeded by a CPU.
  
-WARNING: cgroup2 doesn't yet support control of realtime processes and
-the cpu controller can only be enabled when all RT processes are in
-the root cgroup.  Be aware that system management software may already
-have placed RT processes into nonroot cgroups during the system boot
-process, and these processes may need to be moved to the root cgroup
-before the cpu controller can be enabled.
+WARNING: cgroup2 doesn't yet support control of realtime processes. For
+a kernel built with the CONFIG_RT_GROUP_SCHED option enabled for group
+scheduling of realtime processes, the cpu controller can only be enabled
+when all RT processes are in the root cgroup.  This limitation does
+not apply if CONFIG_RT_GROUP_SCHED is disabled.  Be aware that system
+management software may already have placed RT processes into nonroot
+cgroups during the system boot process, and these processes may need
+to be moved to the root cgroup before the cpu controller can be enabled
+with a CONFIG_RT_GROUP_SCHED enabled kernel.
  
  
  CPU Interface Files
@@ -2190,11 +2193,25 @@ PID Interface Files
         Hard limit of number of processes.
  
    pids.current
-       A read-only single value file which exists on all cgroups.
+       A read-only single value file which exists on non-root cgroups.
  
         The number of processes currently in the cgroup and its
         descendants.
  
+  pids.peak
+       A read-only single value file which exists on non-root cgroups.
+
+       The maximum value that the number of processes in the cgroup and its
+       descendants has ever reached.
+
+  pids.events
+       A read-only flat-keyed file which exists on non-root cgroups. The
+       following entries are defined. Unless specified otherwise, a value
+       change in this file generates a file modified event.
+
+         max
+               Number of times fork failed because the limit was hit.
+
  Organisational operations are not blocked by cgroup policies, so it is
  possible to have pids.current > pids.max.  This can be done by either
  setting the limit to be smaller than pids.current, or attaching enough
diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h

index 34aaf0e87def8f0e6f529e56983e322493ad2fef..2150ca60394ba11727e9322e98f17e1205d4277b 100644 (file)
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -690,7 +690,7 @@ static inline void cgroup_path_from_kernfs_id(u64 id, char *buf, size_t buflen)
  void cgroup_rstat_updated(struct cgroup *cgrp, int cpu);
  void cgroup_rstat_flush(struct cgroup *cgrp);
  void cgroup_rstat_flush_hold(struct cgroup *cgrp);
-void cgroup_rstat_flush_release(void);
+void cgroup_rstat_flush_release(struct cgroup *cgrp);
  
  /*
   * Basic resource stats.
diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h

index 0ce6ff0d9c9aad7f5f6fb31ba5f15207061cdfd9..de4cf0ee96f79eeae224e4433b905b1692c5fc6f 100644 (file)
--- a/include/linux/cpuset.h
+++ b/include/linux/cpuset.h
@@ -70,7 +70,6 @@ extern int cpuset_init(void);
  extern void cpuset_init_smp(void);
  extern void cpuset_force_rebuild(void);
  extern void cpuset_update_active_cpus(void);
-extern void cpuset_wait_for_hotplug(void);
  extern void inc_dl_tasks_cs(struct task_struct *task);
  extern void dec_dl_tasks_cs(struct task_struct *task);
  extern void cpuset_lock(void);
@@ -185,8 +184,6 @@ static inline void cpuset_update_active_cpus(void)
         partition_sched_domains(1, NULL, NULL);
  }
  
-static inline void cpuset_wait_for_hotplug(void) { }
-
  static inline void inc_dl_tasks_cs(struct task_struct *task) { }
  static inline void dec_dl_tasks_cs(struct task_struct *task) { }
  static inline void cpuset_lock(void) { }
diff --git a/include/trace/events/cgroup.h b/include/trace/events/cgroup.h

index dd7d7c9efecdf33d7c28c89d769e7d00e70cae2d..0b95865a90f3d82946d4e4f804e7ee60c76effb5 100644 (file)
--- a/include/trace/events/cgroup.h
+++ b/include/trace/events/cgroup.h
@@ -204,6 +204,98 @@ DEFINE_EVENT(cgroup_event, cgroup_notify_frozen,
         TP_ARGS(cgrp, path, val)
  );
  
+DECLARE_EVENT_CLASS(cgroup_rstat,
+
+       TP_PROTO(struct cgroup *cgrp, int cpu, bool contended),
+
+       TP_ARGS(cgrp, cpu, contended),
+
+       TP_STRUCT__entry(
+               __field(        int,            root                    )
+               __field(        int,            level                   )
+               __field(        u64,            id                      )
+               __field(        int,            cpu                     )
+               __field(        bool,           contended               )
+       ),
+
+       TP_fast_assign(
+               __entry->root = cgrp->root->hierarchy_id;
+               __entry->id = cgroup_id(cgrp);
+               __entry->level = cgrp->level;
+               __entry->cpu = cpu;
+               __entry->contended = contended;
+       ),
+
+       TP_printk("root=%d id=%llu level=%d cpu=%d lock contended:%d",
+                 __entry->root, __entry->id, __entry->level,
+                 __entry->cpu, __entry->contended)
+);
+
+/* Related to global: cgroup_rstat_lock */
+DEFINE_EVENT(cgroup_rstat, cgroup_rstat_lock_contended,
+
+       TP_PROTO(struct cgroup *cgrp, int cpu, bool contended),
+
+       TP_ARGS(cgrp, cpu, contended)
+);
+
+DEFINE_EVENT(cgroup_rstat, cgroup_rstat_locked,
+
+       TP_PROTO(struct cgroup *cgrp, int cpu, bool contended),
+
+       TP_ARGS(cgrp, cpu, contended)
+);
+
+DEFINE_EVENT(cgroup_rstat, cgroup_rstat_unlock,
+
+       TP_PROTO(struct cgroup *cgrp, int cpu, bool contended),
+
+       TP_ARGS(cgrp, cpu, contended)
+);
+
+/* Related to per CPU: cgroup_rstat_cpu_lock */
+DEFINE_EVENT(cgroup_rstat, cgroup_rstat_cpu_lock_contended,
+
+       TP_PROTO(struct cgroup *cgrp, int cpu, bool contended),
+
+       TP_ARGS(cgrp, cpu, contended)
+);
+
+DEFINE_EVENT(cgroup_rstat, cgroup_rstat_cpu_lock_contended_fastpath,
+
+       TP_PROTO(struct cgroup *cgrp, int cpu, bool contended),
+
+       TP_ARGS(cgrp, cpu, contended)
+);
+
+DEFINE_EVENT(cgroup_rstat, cgroup_rstat_cpu_locked,
+
+       TP_PROTO(struct cgroup *cgrp, int cpu, bool contended),
+
+       TP_ARGS(cgrp, cpu, contended)
+);
+
+DEFINE_EVENT(cgroup_rstat, cgroup_rstat_cpu_locked_fastpath,
+
+       TP_PROTO(struct cgroup *cgrp, int cpu, bool contended),
+
+       TP_ARGS(cgrp, cpu, contended)
+);
+
+DEFINE_EVENT(cgroup_rstat, cgroup_rstat_cpu_unlock,
+
+       TP_PROTO(struct cgroup *cgrp, int cpu, bool contended),
+
+       TP_ARGS(cgrp, cpu, contended)
+);
+
+DEFINE_EVENT(cgroup_rstat, cgroup_rstat_cpu_unlock_fastpath,
+
+       TP_PROTO(struct cgroup *cgrp, int cpu, bool contended),
+
+       TP_ARGS(cgrp, cpu, contended)
+);
+
  #endif /* _TRACE_CGROUP_H */
  
  /* This part must be outside protection */
diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c

index 520a11cb12f44cca94d3b4edd6f4caa2f8f3d805..b9dbf6bf2779d684d29122dbebd40fc40d67e3a1 100644 (file)
--- a/kernel/cgroup/cgroup-v1.c
+++ b/kernel/cgroup/cgroup-v1.c
@@ -1335,6 +1335,7 @@ static int __init cgroup_no_v1(char *str)
                                 continue;
  
                         cgroup_no_v1_mask |= 1 << i;
+                       break;
                 }
         }
         return 1;
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c

index a66c088c851cfb87036ccc93317f7357769cb268..e32b6972c47840a8f6880964100b3356ab849f2a 100644 (file)
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -5368,7 +5368,8 @@ static void css_free_rwork_fn(struct work_struct *work)
         } else {
                 /* cgroup free path */
                 atomic_dec(&cgrp->root->nr_cgrps);
-               cgroup1_pidlist_destroy_all(cgrp);
+               if (!cgroup_on_dfl(cgrp))
+                       cgroup1_pidlist_destroy_all(cgrp);
                 cancel_work_sync(&cgrp->release_agent_work);
                 bpf_cgrp_storage_free(cgrp);
  
diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c

index 4237c8748715d4c66f2d46787fb7297643abcddb..a10e4bd0c0c16caf623a511d49a4b86668add570 100644 (file)
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -201,6 +201,14 @@ struct cpuset {
         struct list_head remote_sibling;
  };
  
+/*
+ * Legacy hierarchy call to cgroup_transfer_tasks() is handled asynchronously
+ */
+struct cpuset_remove_tasks_struct {
+       struct work_struct work;
+       struct cpuset *cs;
+};
+
  /*
   * Exclusive CPUs distributed out to sub-partitions of top_cpuset
   */
@@ -360,9 +368,10 @@ static inline void notify_partition_change(struct cpuset *cs, int old_prs)
  }
  
  static struct cpuset top_cpuset = {
-       .flags = ((1 << CS_ONLINE) | (1 << CS_CPU_EXCLUSIVE) |
-                 (1 << CS_MEM_EXCLUSIVE)),
+       .flags = BIT(CS_ONLINE) | BIT(CS_CPU_EXCLUSIVE) |
+                BIT(CS_MEM_EXCLUSIVE) | BIT(CS_SCHED_LOAD_BALANCE),
         .partition_root_state = PRS_ROOT,
+       .relax_domain_level = -1,
         .remote_sibling = LIST_HEAD_INIT(top_cpuset.remote_sibling),
  };
  
@@ -449,12 +458,6 @@ static DEFINE_SPINLOCK(callback_lock);
  
  static struct workqueue_struct *cpuset_migrate_mm_wq;
  
-/*
- * CPU / memory hotplug is handled asynchronously.
- */
-static void cpuset_hotplug_workfn(struct work_struct *work);
-static DECLARE_WORK(cpuset_hotplug_work, cpuset_hotplug_workfn);
-
  static DECLARE_WAIT_QUEUE_HEAD(cpuset_attach_wq);
  
  static inline void check_insane_mems_config(nodemask_t *nodes)
@@ -540,22 +543,10 @@ static void guarantee_online_cpus(struct task_struct *tsk,
         rcu_read_lock();
         cs = task_cs(tsk);
  
-       while (!cpumask_intersects(cs->effective_cpus, pmask)) {
+       while (!cpumask_intersects(cs->effective_cpus, pmask))
                 cs = parent_cs(cs);
-               if (unlikely(!cs)) {
-                       /*
-                        * The top cpuset doesn't have any online cpu as a
-                        * consequence of a race between cpuset_hotplug_work
-                        * and cpu hotplug notifier.  But we know the top
-                        * cpuset's effective_cpus is on its way to be
-                        * identical to cpu_online_mask.
-                        */
-                       goto out_unlock;
-               }
-       }
-       cpumask_and(pmask, pmask, cs->effective_cpus);
  
-out_unlock:
+       cpumask_and(pmask, pmask, cs->effective_cpus);
         rcu_read_unlock();
  }
  
@@ -1217,7 +1208,7 @@ static void rebuild_sched_domains_locked(void)
         /*
          * If we have raced with CPU hotplug, return early to avoid
          * passing doms with offlined cpu to partition_sched_domains().
-        * Anyways, cpuset_hotplug_workfn() will rebuild sched domains.
+        * Anyways, cpuset_handle_hotplug() will rebuild sched domains.
          *
          * With no CPUs in any subpartitions, top_cpuset's effective CPUs
          * should be the same as the active CPUs, so checking only top_cpuset
@@ -1260,12 +1251,17 @@ static void rebuild_sched_domains_locked(void)
  }
  #endif /* CONFIG_SMP */
  
-void rebuild_sched_domains(void)
+static void rebuild_sched_domains_cpuslocked(void)
  {
-       cpus_read_lock();
         mutex_lock(&cpuset_mutex);
         rebuild_sched_domains_locked();
         mutex_unlock(&cpuset_mutex);
+}
+
+void rebuild_sched_domains(void)
+{
+       cpus_read_lock();
+       rebuild_sched_domains_cpuslocked();
         cpus_read_unlock();
  }
  
@@ -2079,14 +2075,11 @@ write_error:
  
         /*
          * For partcmd_update without newmask, it is being called from
-        * cpuset_hotplug_workfn() where cpus_read_lock() wasn't taken.
-        * Update the load balance flag and scheduling domain if
-        * cpus_read_trylock() is successful.
+        * cpuset_handle_hotplug(). Update the load balance flag and
+        * scheduling domain accordingly.
          */
-       if ((cmd == partcmd_update) && !newmask && cpus_read_trylock()) {
+       if ((cmd == partcmd_update) && !newmask)
                 update_partition_sd_lb(cs, old_prs);
-               cpus_read_unlock();
-       }
  
         notify_partition_change(cs, old_prs);
         return 0;
@@ -3599,8 +3592,8 @@ static ssize_t cpuset_write_resmask(struct kernfs_open_file *of,
          * proceeding, so that we don't end up keep removing tasks added
          * after execution capability is restored.
          *
-        * cpuset_hotplug_work calls back into cgroup core via
-        * cgroup_transfer_tasks() and waiting for it from a cgroupfs
+        * cpuset_handle_hotplug may call back into cgroup core asynchronously
+        * via cgroup_transfer_tasks() and waiting for it from a cgroupfs
          * operation like this one can lead to a deadlock through kernfs
          * active_ref protection.  Let's break the protection.  Losing the
          * protection is okay as we check whether @cs is online after
@@ -3609,7 +3602,6 @@ static ssize_t cpuset_write_resmask(struct kernfs_open_file *of,
          */
         css_get(&cs->css);
         kernfs_break_active_protection(of->kn);
-       flush_work(&cpuset_hotplug_work);
  
         cpus_read_lock();
         mutex_lock(&cpuset_mutex);
@@ -3782,9 +3774,6 @@ static ssize_t sched_partition_write(struct kernfs_open_file *of, char *buf,
  
         buf = strstrip(buf);
  
-       /*
-        * Convert "root" to ENABLED, and convert "member" to DISABLED.
-        */
         if (!strcmp(buf, "root"))
                 val = PRS_ROOT;
         else if (!strcmp(buf, "member"))
@@ -4060,11 +4049,6 @@ static int cpuset_css_online(struct cgroup_subsys_state *css)
                 cs->effective_mems = parent->effective_mems;
                 cs->use_parent_ecpus = true;
                 parent->child_ecpus_count++;
-               /*
-                * Clear CS_SCHED_LOAD_BALANCE if parent is isolated
-                */
-               if (!is_sched_load_balance(parent))
-                       clear_bit(CS_SCHED_LOAD_BALANCE, &cs->flags);
         }
  
         /*
@@ -4318,8 +4302,6 @@ int __init cpuset_init(void)
         nodes_setall(top_cpuset.effective_mems);
  
         fmeter_init(&top_cpuset.fmeter);
-       set_bit(CS_SCHED_LOAD_BALANCE, &top_cpuset.flags);
-       top_cpuset.relax_domain_level = -1;
         INIT_LIST_HEAD(&remote_children);
  
         BUG_ON(!alloc_cpumask_var(&cpus_attach, GFP_KERNEL));
@@ -4354,6 +4336,16 @@ static void remove_tasks_in_empty_cpuset(struct cpuset *cs)
         }
  }
  
+static void cpuset_migrate_tasks_workfn(struct work_struct *work)
+{
+       struct cpuset_remove_tasks_struct *s;
+
+       s = container_of(work, struct cpuset_remove_tasks_struct, work);
+       remove_tasks_in_empty_cpuset(s->cs);
+       css_put(&s->cs->css);
+       kfree(s);
+}
+
  static void
  hotplug_update_tasks_legacy(struct cpuset *cs,
                             struct cpumask *new_cpus, nodemask_t *new_mems,
@@ -4383,12 +4375,21 @@ hotplug_update_tasks_legacy(struct cpuset *cs,
         /*
          * Move tasks to the nearest ancestor with execution resources,
          * This is full cgroup operation which will also call back into
-        * cpuset. Should be done outside any lock.
+        * cpuset. Execute it asynchronously using workqueue.
          */
-       if (is_empty) {
-               mutex_unlock(&cpuset_mutex);
-               remove_tasks_in_empty_cpuset(cs);
-               mutex_lock(&cpuset_mutex);
+       if (is_empty && cs->css.cgroup->nr_populated_csets &&
+           css_tryget_online(&cs->css)) {
+               struct cpuset_remove_tasks_struct *s;
+
+               s = kzalloc(sizeof(*s), GFP_KERNEL);
+               if (WARN_ON_ONCE(!s)) {
+                       css_put(&cs->css);
+                       return;
+               }
+
+               s->cs = cs;
+               INIT_WORK(&s->work, cpuset_migrate_tasks_workfn);
+               schedule_work(&s->work);
         }
  }
  
@@ -4421,30 +4422,6 @@ void cpuset_force_rebuild(void)
         force_rebuild = true;
  }
  
-/*
- * Attempt to acquire a cpus_read_lock while a hotplug operation may be in
- * progress.
- * Return: true if successful, false otherwise
- *
- * To avoid circular lock dependency between cpuset_mutex and cpus_read_lock,
- * cpus_read_trylock() is used here to acquire the lock.
- */
-static bool cpuset_hotplug_cpus_read_trylock(void)
-{
-       int retries = 0;
-
-       while (!cpus_read_trylock()) {
-               /*
-                * CPU hotplug still in progress. Retry 5 times
-                * with a 10ms wait before bailing out.
-                */
-               if (++retries > 5)
-                       return false;
-               msleep(10);
-       }
-       return true;
-}
-
  /**
   * cpuset_hotplug_update_tasks - update tasks in a cpuset for hotunplug
   * @cs: cpuset in interest
@@ -4493,13 +4470,11 @@ retry:
                 compute_partition_effective_cpumask(cs, &new_cpus);
  
         if (remote && cpumask_empty(&new_cpus) &&
-           partition_is_populated(cs, NULL) &&
-           cpuset_hotplug_cpus_read_trylock()) {
+           partition_is_populated(cs, NULL)) {
                 remote_partition_disable(cs, tmp);
                 compute_effective_cpumask(&new_cpus, cs, parent);
                 remote = false;
                 cpuset_force_rebuild();
-               cpus_read_unlock();
         }
  
         /*
@@ -4519,18 +4494,8 @@ retry:
         else if (is_partition_valid(parent) && is_partition_invalid(cs))
                 partcmd = partcmd_update;
  
-       /*
-        * cpus_read_lock needs to be held before calling
-        * update_parent_effective_cpumask(). To avoid circular lock
-        * dependency between cpuset_mutex and cpus_read_lock,
-        * cpus_read_trylock() is used here to acquire the lock.
-        */
         if (partcmd >= 0) {
-               if (!cpuset_hotplug_cpus_read_trylock())
-                       goto update_tasks;
-
                 update_parent_effective_cpumask(cs, partcmd, NULL, tmp);
-               cpus_read_unlock();
                 if ((partcmd == partcmd_invalidate) || is_partition_valid(cs)) {
                         compute_partition_effective_cpumask(cs, &new_cpus);
                         cpuset_force_rebuild();
@@ -4558,8 +4523,7 @@ unlock:
  }
  
  /**
- * cpuset_hotplug_workfn - handle CPU/memory hotunplug for a cpuset
- * @work: unused
+ * cpuset_handle_hotplug - handle CPU/memory hot{,un}plug for a cpuset
   *
   * This function is called after either CPU or memory configuration has
   * changed and updates cpuset accordingly.  The top_cpuset is always
@@ -4573,8 +4537,10 @@ unlock:
   *
   * Note that CPU offlining during suspend is ignored.  We don't modify
   * cpusets across suspend/resume cycles at all.
+ *
+ * CPU / memory hotplug is handled synchronously.
   */
-static void cpuset_hotplug_workfn(struct work_struct *work)
+static void cpuset_handle_hotplug(void)
  {
         static cpumask_t new_cpus;
         static nodemask_t new_mems;
@@ -4585,6 +4551,7 @@ static void cpuset_hotplug_workfn(struct work_struct *work)
         if (on_dfl && !alloc_cpumasks(NULL, &tmp))
                 ptmp = &tmp;
  
+       lockdep_assert_cpus_held();
         mutex_lock(&cpuset_mutex);
  
         /* fetch the available cpus/mems and find out which changed how */
@@ -4666,7 +4633,7 @@ static void cpuset_hotplug_workfn(struct work_struct *work)
         /* rebuild sched domains if cpus_allowed has changed */
         if (cpus_updated || force_rebuild) {
                 force_rebuild = false;
-               rebuild_sched_domains();
+               rebuild_sched_domains_cpuslocked();
         }
  
         free_cpumasks(NULL, ptmp);
@@ -4679,12 +4646,7 @@ void cpuset_update_active_cpus(void)
          * inside cgroup synchronization.  Bounce actual hotplug processing
          * to a work item to avoid reverse locking order.
          */
-       schedule_work(&cpuset_hotplug_work);
-}
-
-void cpuset_wait_for_hotplug(void)
-{
-       flush_work(&cpuset_hotplug_work);
+       cpuset_handle_hotplug();
  }
  
  /*
@@ -4695,7 +4657,7 @@ void cpuset_wait_for_hotplug(void)
  static int cpuset_track_online_nodes(struct notifier_block *self,
                                 unsigned long action, void *arg)
  {
-       schedule_work(&cpuset_hotplug_work);
+       cpuset_handle_hotplug();
         return NOTIFY_OK;
  }
  
diff --git a/kernel/cgroup/legacy_freezer.c b/kernel/cgroup/legacy_freezer.c

index 66d1708042a72bf5d25b9ff1d7acb8d91183ffd7..074653f964c1d058f1dcc60dc395d77272a5ad04 100644 (file)
--- a/kernel/cgroup/legacy_freezer.c
+++ b/kernel/cgroup/legacy_freezer.c
@@ -106,8 +106,7 @@ freezer_css_alloc(struct cgroup_subsys_state *parent_css)
   * @css: css being created
   *
   * We're committing to creation of @css.  Mark it online and inherit
- * parent's freezing state while holding both parent's and our
- * freezer->lock.
+ * parent's freezing state while holding cpus read lock and freezer_mutex.
   */
  static int freezer_css_online(struct cgroup_subsys_state *css)
  {
@@ -133,7 +132,7 @@ static int freezer_css_online(struct cgroup_subsys_state *css)
   * freezer_css_offline - initiate destruction of a freezer css
   * @css: css being destroyed
   *
- * @css is going away.  Mark it dead and decrement system_freezing_count if
+ * @css is going away.  Mark it dead and decrement freezer_active if
   * it was holding one.
   */
  static void freezer_css_offline(struct cgroup_subsys_state *css)
diff --git a/kernel/cgroup/pids.c b/kernel/cgroup/pids.c

index 7695e60bcb4092000f7cb1ed61f6fcd6d5a8c6a0..0e5ec7d59b4d21ab79342a02d9ba1a0ec864d71f 100644 (file)
--- a/kernel/cgroup/pids.c
+++ b/kernel/cgroup/pids.c
@@ -75,9 +75,7 @@ pids_css_alloc(struct cgroup_subsys_state *parent)
         if (!pids)
                 return ERR_PTR(-ENOMEM);
  
-       atomic64_set(&pids->counter, 0);
         atomic64_set(&pids->limit, PIDS_MAX);
-       atomic64_set(&pids->events_limit, 0);
         return &pids->css;
  }
  
diff --git a/kernel/cgroup/rstat.c b/kernel/cgroup/rstat.c

index 07e2284bb4997123dd3c6699d98f6d42ae0e6166..fb8b494375731baf375dc27f272245966b730a23 100644 (file)
--- a/kernel/cgroup/rstat.c
+++ b/kernel/cgroup/rstat.c
@@ -7,6 +7,8 @@
  #include <linux/btf.h>
  #include <linux/btf_ids.h>
  
+#include <trace/events/cgroup.h>
+
  static DEFINE_SPINLOCK(cgroup_rstat_lock);
  static DEFINE_PER_CPU(raw_spinlock_t, cgroup_rstat_cpu_lock);
  
@@ -17,6 +19,60 @@ static struct cgroup_rstat_cpu *cgroup_rstat_cpu(struct cgroup *cgrp, int cpu)
         return per_cpu_ptr(cgrp->rstat_cpu, cpu);
  }
  
+/*
+ * Helper functions for rstat per CPU lock (cgroup_rstat_cpu_lock).
+ *
+ * This makes it easier to diagnose locking issues and contention in
+ * production environments. The parameter @fast_path determines the
+ * tracepoints being added, allowing us to diagnose "flush" related
+ * operations without handling high-frequency fast-path "update" events.
+ */
+static __always_inline
+unsigned long _cgroup_rstat_cpu_lock(raw_spinlock_t *cpu_lock, int cpu,
+                                    struct cgroup *cgrp, const bool fast_path)
+{
+       unsigned long flags;
+       bool contended;
+
+       /*
+        * The _irqsave() is needed because cgroup_rstat_lock is
+        * spinlock_t which is a sleeping lock on PREEMPT_RT. Acquiring
+        * this lock with the _irq() suffix only disables interrupts on
+        * a non-PREEMPT_RT kernel. The raw_spinlock_t below disables
+        * interrupts on both configurations. The _irqsave() ensures
+        * that interrupts are always disabled and later restored.
+        */
+       contended = !raw_spin_trylock_irqsave(cpu_lock, flags);
+       if (contended) {
+               if (fast_path)
+                       trace_cgroup_rstat_cpu_lock_contended_fastpath(cgrp, cpu, contended);
+               else
+                       trace_cgroup_rstat_cpu_lock_contended(cgrp, cpu, contended);
+
+               raw_spin_lock_irqsave(cpu_lock, flags);
+       }
+
+       if (fast_path)
+               trace_cgroup_rstat_cpu_locked_fastpath(cgrp, cpu, contended);
+       else
+               trace_cgroup_rstat_cpu_locked(cgrp, cpu, contended);
+
+       return flags;
+}
+
+static __always_inline
+void _cgroup_rstat_cpu_unlock(raw_spinlock_t *cpu_lock, int cpu,
+                             struct cgroup *cgrp, unsigned long flags,
+                             const bool fast_path)
+{
+       if (fast_path)
+               trace_cgroup_rstat_cpu_unlock_fastpath(cgrp, cpu, false);
+       else
+               trace_cgroup_rstat_cpu_unlock(cgrp, cpu, false);
+
+       raw_spin_unlock_irqrestore(cpu_lock, flags);
+}
+
  /**
   * cgroup_rstat_updated - keep track of updated rstat_cpu
   * @cgrp: target cgroup
@@ -42,7 +98,7 @@ __bpf_kfunc void cgroup_rstat_updated(struct cgroup *cgrp, int cpu)
         if (data_race(cgroup_rstat_cpu(cgrp, cpu)->updated_next))
                 return;
  
-       raw_spin_lock_irqsave(cpu_lock, flags);
+       flags = _cgroup_rstat_cpu_lock(cpu_lock, cpu, cgrp, true);
  
         /* put @cgrp and all ancestors on the corresponding updated lists */
         while (true) {
@@ -70,7 +126,7 @@ __bpf_kfunc void cgroup_rstat_updated(struct cgroup *cgrp, int cpu)
                 cgrp = parent;
         }
  
-       raw_spin_unlock_irqrestore(cpu_lock, flags);
+       _cgroup_rstat_cpu_unlock(cpu_lock, cpu, cgrp, flags, true);
  }
  
  /**
@@ -151,15 +207,7 @@ static struct cgroup *cgroup_rstat_updated_list(struct cgroup *root, int cpu)
         struct cgroup *head = NULL, *parent, *child;
         unsigned long flags;
  
-       /*
-        * The _irqsave() is needed because cgroup_rstat_lock is
-        * spinlock_t which is a sleeping lock on PREEMPT_RT. Acquiring
-        * this lock with the _irq() suffix only disables interrupts on
-        * a non-PREEMPT_RT kernel. The raw_spinlock_t below disables
-        * interrupts on both configurations. The _irqsave() ensures
-        * that interrupts are always disabled and later restored.
-        */
-       raw_spin_lock_irqsave(cpu_lock, flags);
+       flags = _cgroup_rstat_cpu_lock(cpu_lock, cpu, root, false);
  
         /* Return NULL if this subtree is not on-list */
         if (!rstatc->updated_next)
@@ -196,7 +244,7 @@ static struct cgroup *cgroup_rstat_updated_list(struct cgroup *root, int cpu)
         if (child != root)
                 head = cgroup_rstat_push_children(head, child, cpu);
  unlock_ret:
-       raw_spin_unlock_irqrestore(cpu_lock, flags);
+       _cgroup_rstat_cpu_unlock(cpu_lock, cpu, root, flags, false);
         return head;
  }
  
@@ -222,6 +270,35 @@ __weak noinline void bpf_rstat_flush(struct cgroup *cgrp,
  
  __bpf_hook_end();
  
+/*
+ * Helper functions for locking cgroup_rstat_lock.
+ *
+ * This makes it easier to diagnose locking issues and contention in
+ * production environments.  The parameter @cpu_in_loop indicates the
+ * lock was released and re-taken when collecting data from the CPUs.
+ * The value -1 is used when obtaining the main lock; otherwise this is
+ * the CPU number processed last.
+ */
+static inline void __cgroup_rstat_lock(struct cgroup *cgrp, int cpu_in_loop)
+       __acquires(&cgroup_rstat_lock)
+{
+       bool contended;
+
+       contended = !spin_trylock_irq(&cgroup_rstat_lock);
+       if (contended) {
+               trace_cgroup_rstat_lock_contended(cgrp, cpu_in_loop, contended);
+               spin_lock_irq(&cgroup_rstat_lock);
+       }
+       trace_cgroup_rstat_locked(cgrp, cpu_in_loop, contended);
+}
+
+static inline void __cgroup_rstat_unlock(struct cgroup *cgrp, int cpu_in_loop)
+       __releases(&cgroup_rstat_lock)
+{
+       trace_cgroup_rstat_unlock(cgrp, cpu_in_loop, false);
+       spin_unlock_irq(&cgroup_rstat_lock);
+}
+
  /* see cgroup_rstat_flush() */
  static void cgroup_rstat_flush_locked(struct cgroup *cgrp)
         __releases(&cgroup_rstat_lock) __acquires(&cgroup_rstat_lock)
@@ -248,10 +325,10 @@ static void cgroup_rstat_flush_locked(struct cgroup *cgrp)
  
                 /* play nice and yield if necessary */
                 if (need_resched() || spin_needbreak(&cgroup_rstat_lock)) {
-                       spin_unlock_irq(&cgroup_rstat_lock);
+                       __cgroup_rstat_unlock(cgrp, cpu);
                         if (!cond_resched())
                                 cpu_relax();
-                       spin_lock_irq(&cgroup_rstat_lock);
+                       __cgroup_rstat_lock(cgrp, cpu);
                 }
         }
  }
@@ -273,9 +350,9 @@ __bpf_kfunc void cgroup_rstat_flush(struct cgroup *cgrp)
  {
         might_sleep();
  
-       spin_lock_irq(&cgroup_rstat_lock);
+       __cgroup_rstat_lock(cgrp, -1);
         cgroup_rstat_flush_locked(cgrp);
-       spin_unlock_irq(&cgroup_rstat_lock);
+       __cgroup_rstat_unlock(cgrp, -1);
  }
  
  /**
@@ -291,17 +368,18 @@ void cgroup_rstat_flush_hold(struct cgroup *cgrp)
         __acquires(&cgroup_rstat_lock)
  {
         might_sleep();
-       spin_lock_irq(&cgroup_rstat_lock);
+       __cgroup_rstat_lock(cgrp, -1);
         cgroup_rstat_flush_locked(cgrp);
  }
  
  /**
   * cgroup_rstat_flush_release - release cgroup_rstat_flush_hold()
+ * @cgrp: cgroup used by tracepoint
   */
-void cgroup_rstat_flush_release(void)
+void cgroup_rstat_flush_release(struct cgroup *cgrp)
         __releases(&cgroup_rstat_lock)
  {
-       spin_unlock_irq(&cgroup_rstat_lock);
+       __cgroup_rstat_unlock(cgrp, -1);
  }
  
  int cgroup_rstat_init(struct cgroup *cgrp)
@@ -533,7 +611,7 @@ void cgroup_base_stat_cputime_show(struct seq_file *seq)
  #ifdef CONFIG_SCHED_CORE
                 forceidle_time = cgrp->bstat.forceidle_sum;
  #endif
-               cgroup_rstat_flush_release();
+               cgroup_rstat_flush_release(cgrp);
         } else {
                 root_cgroup_cputime(&bstat);
                 usage = bstat.cputime.sum_exec_runtime;
diff --git a/kernel/cpu.c b/kernel/cpu.c

index 63447eb85dab6bd0fa2ab23c25bffdf788a2b9c1..563877d6c28b65f40a9e5597c865e0afa1200a64 100644 (file)
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -1208,52 +1208,6 @@ void __init cpuhp_threads_init(void)
         kthread_unpark(this_cpu_read(cpuhp_state.thread));
  }
  
-/*
- *
- * Serialize hotplug trainwrecks outside of the cpu_hotplug_lock
- * protected region.
- *
- * The operation is still serialized against concurrent CPU hotplug via
- * cpu_add_remove_lock, i.e. CPU map protection.  But it is _not_
- * serialized against other hotplug related activity like adding or
- * removing of state callbacks and state instances, which invoke either the
- * startup or the teardown callback of the affected state.
- *
- * This is required for subsystems which are unfixable vs. CPU hotplug and
- * evade lock inversion problems by scheduling work which has to be
- * completed _before_ cpu_up()/_cpu_down() returns.
- *
- * Don't even think about adding anything to this for any new code or even
- * drivers. It's only purpose is to keep existing lock order trainwrecks
- * working.
- *
- * For cpu_down() there might be valid reasons to finish cleanups which are
- * not required to be done under cpu_hotplug_lock, but that's a different
- * story and would be not invoked via this.
- */
-static void cpu_up_down_serialize_trainwrecks(bool tasks_frozen)
-{
-       /*
-        * cpusets delegate hotplug operations to a worker to "solve" the
-        * lock order problems. Wait for the worker, but only if tasks are
-        * _not_ frozen (suspend, hibernate) as that would wait forever.
-        *
-        * The wait is required because otherwise the hotplug operation
-        * returns with inconsistent state, which could even be observed in
-        * user space when a new CPU is brought up. The CPU plug uevent
-        * would be delivered and user space reacting on it would fail to
-        * move tasks to the newly plugged CPU up to the point where the
-        * work has finished because up to that point the newly plugged CPU
-        * is not assignable in cpusets/cgroups. On unplug that's not
-        * necessarily a visible issue, but it is still inconsistent state,
-        * which is the real problem which needs to be "fixed". This can't
-        * prevent the transient state between scheduling the work and
-        * returning from waiting for it.
-        */
-       if (!tasks_frozen)
-               cpuset_wait_for_hotplug();
-}
-
  #ifdef CONFIG_HOTPLUG_CPU
  #ifndef arch_clear_mm_cpumask_cpu
  #define arch_clear_mm_cpumask_cpu(cpu, mm) cpumask_clear_cpu(cpu, mm_cpumask(mm))
@@ -1494,7 +1448,6 @@ out:
          */
         lockup_detector_cleanup();
         arch_smt_update();
-       cpu_up_down_serialize_trainwrecks(tasks_frozen);
         return ret;
  }
  
@@ -1728,7 +1681,6 @@ static int _cpu_up(unsigned int cpu, int tasks_frozen, enum cpuhp_state target)
  out:
         cpus_write_unlock();
         arch_smt_update();
-       cpu_up_down_serialize_trainwrecks(tasks_frozen);
         return ret;
  }
  
diff --git a/kernel/power/process.c b/kernel/power/process.c

index cae81a87cc91e3a2f06da590f8db68fb22b4db26..66ac067d9ae64e022ba7259554628d175494ed13 100644 (file)
--- a/kernel/power/process.c
+++ b/kernel/power/process.c
@@ -194,8 +194,6 @@ void thaw_processes(void)
         __usermodehelper_set_disable_depth(UMH_FREEZING);
         thaw_workqueues();
  
-       cpuset_wait_for_hotplug();
-
         read_lock(&tasklist_lock);
         for_each_process_thread(g, p) {
                 /* No other threads should have PF_SUSPEND_TASK set */
diff --git a/tools/testing/selftests/cgroup/Makefile b/tools/testing/selftests/cgroup/Makefile

index 00b4419289096177d124fca3ad2dbc0ca416465d..16461dc0ffdf9dc69f632056a3c3d46cedf84d6c 100644 (file)
--- a/tools/testing/selftests/cgroup/Makefile
+++ b/tools/testing/selftests/cgroup/Makefile
@@ -4,7 +4,7 @@ CFLAGS += -Wall -pthread
  all: ${HELPER_PROGS}
  
  TEST_FILES     := with_stress.sh
-TEST_PROGS     := test_stress.sh test_cpuset_prs.sh
+TEST_PROGS     := test_stress.sh test_cpuset_prs.sh test_cpuset_v1_hp.sh
  TEST_GEN_FILES := wait_inotify
  TEST_GEN_PROGS = test_memcontrol
  TEST_GEN_PROGS += test_kmem
diff --git a/tools/testing/selftests/cgroup/cgroup_util.c b/tools/testing/selftests/cgroup/cgroup_util.c

index 0340d4ca8f51cbfed84512362f3aa02c106712f2..ce16a50ecff87e49b87208d6fc84c86e9e4b0139 100644 (file)
--- a/tools/testing/selftests/cgroup/cgroup_util.c
+++ b/tools/testing/selftests/cgroup/cgroup_util.c
@@ -1,7 +1,4 @@
  /* SPDX-License-Identifier: GPL-2.0 */
-
-#define _GNU_SOURCE
-
  #include <errno.h>
  #include <fcntl.h>
  #include <linux/limits.h>
@@ -195,10 +192,10 @@ int cg_write_numeric(const char *cgroup, const char *control, long value)
         return cg_write(cgroup, control, buf);
  }
  
-int cg_find_unified_root(char *root, size_t len)
+int cg_find_unified_root(char *root, size_t len, bool *nsdelegate)
  {
         char buf[10 * PAGE_SIZE];
-       char *fs, *mount, *type;
+       char *fs, *mount, *type, *options;
         const char delim[] = "\n\t ";
  
         if (read_text("/proc/self/mounts", buf, sizeof(buf)) <= 0)
@@ -211,12 +208,14 @@ int cg_find_unified_root(char *root, size_t len)
         for (fs = strtok(buf, delim); fs; fs = strtok(NULL, delim)) {
                 mount = strtok(NULL, delim);
                 type = strtok(NULL, delim);
-               strtok(NULL, delim);
+               options = strtok(NULL, delim);
                 strtok(NULL, delim);
                 strtok(NULL, delim);
  
                 if (strcmp(type, "cgroup2") == 0) {
                         strncpy(root, mount, len);
+                       if (nsdelegate)
+                               *nsdelegate = !!strstr(options, "nsdelegate");
                         return 0;
                 }
         }
diff --git a/tools/testing/selftests/cgroup/cgroup_util.h b/tools/testing/selftests/cgroup/cgroup_util.h

index 1df7f202214afcb6bb73e55d04163d02974371d9..e8d04ac9e3d23cdcce8c85e392684dec850562f1 100644 (file)
--- a/tools/testing/selftests/cgroup/cgroup_util.h
+++ b/tools/testing/selftests/cgroup/cgroup_util.h
@@ -18,10 +18,10 @@
   */
  static inline int values_close(long a, long b, int err)
  {
-       return abs(a - b) <= (a + b) / 100 * err;
+       return labs(a - b) <= (a + b) / 100 * err;
  }
  
-extern int cg_find_unified_root(char *root, size_t len);
+extern int cg_find_unified_root(char *root, size_t len, bool *nsdelegate);
  extern char *cg_name(const char *root, const char *name);
  extern char *cg_name_indexed(const char *root, const char *name, int index);
  extern char *cg_control(const char *cgroup, const char *control);
diff --git a/tools/testing/selftests/cgroup/test_core.c b/tools/testing/selftests/cgroup/test_core.c

index 80aa6b2373b9660a2d5407cf1bf112f2ea3e92f5..de8baad460222de738bac58259db5afcf5670630 100644 (file)
--- a/tools/testing/selftests/cgroup/test_core.c
+++ b/tools/testing/selftests/cgroup/test_core.c
@@ -1,6 +1,4 @@
  /* SPDX-License-Identifier: GPL-2.0 */
-
-#define _GNU_SOURCE
  #include <linux/limits.h>
  #include <linux/sched.h>
  #include <sys/types.h>
@@ -18,6 +16,8 @@
  #include "../kselftest.h"
  #include "cgroup_util.h"
  
+static bool nsdelegate;
+
  static int touch_anon(char *buf, size_t size)
  {
         int fd;
@@ -775,6 +775,9 @@ static int test_cgcore_lesser_ns_open(const char *root)
         pid_t pid;
         int status;
  
+       if (!nsdelegate)
+               return KSFT_SKIP;
+
         cg_test_a = cg_name(root, "cg_test_a");
         cg_test_b = cg_name(root, "cg_test_b");
  
@@ -862,7 +865,7 @@ int main(int argc, char *argv[])
         char root[PATH_MAX];
         int i, ret = EXIT_SUCCESS;
  
-       if (cg_find_unified_root(root, sizeof(root)))
+       if (cg_find_unified_root(root, sizeof(root), &nsdelegate))
                 ksft_exit_skip("cgroup v2 isn't mounted\n");
  
         if (cg_read_strstr(root, "cgroup.subtree_control", "memory"))
diff --git a/tools/testing/selftests/cgroup/test_cpu.c b/tools/testing/selftests/cgroup/test_cpu.c

index 24020a2c68dcdd9fff0e8d0f4af279d6840bbe77..5a4a314f6af7e8704d755deb9f5df10153758bcd 100644 (file)
--- a/tools/testing/selftests/cgroup/test_cpu.c
+++ b/tools/testing/selftests/cgroup/test_cpu.c
@@ -1,6 +1,4 @@
  // SPDX-License-Identifier: GPL-2.0
-
-#define _GNU_SOURCE
  #include <linux/limits.h>
  #include <sys/sysinfo.h>
  #include <sys/wait.h>
@@ -237,7 +235,7 @@ run_cpucg_weight_test(
  {
         int ret = KSFT_FAIL, i;
         char *parent = NULL;
-       struct cpu_hogger children[3] = {NULL};
+       struct cpu_hogger children[3] = {};
  
         parent = cg_name(root, "cpucg_test_0");
         if (!parent)
@@ -408,7 +406,7 @@ run_cpucg_nested_weight_test(const char *root, bool overprovisioned)
  {
         int ret = KSFT_FAIL, i;
         char *parent = NULL, *child = NULL;
-       struct cpu_hogger leaf[3] = {NULL};
+       struct cpu_hogger leaf[3] = {};
         long nested_leaf_usage, child_usage;
         int nprocs = get_nprocs();
  
@@ -700,7 +698,7 @@ int main(int argc, char *argv[])
         char root[PATH_MAX];
         int i, ret = EXIT_SUCCESS;
  
-       if (cg_find_unified_root(root, sizeof(root)))
+       if (cg_find_unified_root(root, sizeof(root), NULL))
                 ksft_exit_skip("cgroup v2 isn't mounted\n");
  
         if (cg_read_strstr(root, "cgroup.subtree_control", "cpu"))
diff --git a/tools/testing/selftests/cgroup/test_cpuset.c b/tools/testing/selftests/cgroup/test_cpuset.c

index b061ed1e05b4d08f63b3c7ea4541133c7ad7f16a..4034d14ba69ac03b119d9718beca6e66332d90cf 100644 (file)
--- a/tools/testing/selftests/cgroup/test_cpuset.c
+++ b/tools/testing/selftests/cgroup/test_cpuset.c
@@ -249,7 +249,7 @@ int main(int argc, char *argv[])
         char root[PATH_MAX];
         int i, ret = EXIT_SUCCESS;
  
-       if (cg_find_unified_root(root, sizeof(root)))
+       if (cg_find_unified_root(root, sizeof(root), NULL))
                 ksft_exit_skip("cgroup v2 isn't mounted\n");
  
         if (cg_read_strstr(root, "cgroup.subtree_control", "cpuset"))
diff --git a/tools/testing/selftests/cgroup/test_cpuset_v1_hp.sh b/tools/testing/selftests/cgroup/test_cpuset_v1_hp.sh

new file mode 100755 (executable)

index 0000000..3f45512
--- /dev/null
+++ b/tools/testing/selftests/cgroup/test_cpuset_v1_hp.sh
@@ -0,0 +1,46 @@
+#!/bin/sh
+# SPDX-License-Identifier: GPL-2.0
+#
+# Test the special cpuset v1 hotplug case where a cpuset become empty of
+# CPUs will force migration of tasks out to an ancestor.
+#
+
+skip_test() {
+       echo "$1"
+       echo "Test SKIPPED"
+       exit 4 # ksft_skip
+}
+
+[[ $(id -u) -eq 0 ]] || skip_test "Test must be run as root!"
+
+# Find cpuset v1 mount point
+CPUSET=$(mount -t cgroup | grep cpuset | head -1 | awk -e '{print $3}')
+[[ -n "$CPUSET" ]] || skip_test "cpuset v1 mount point not found!"
+
+#
+# Create a test cpuset, put a CPU and a task there and offline that CPU
+#
+TDIR=test$$
+[[ -d $CPUSET/$TDIR ]] || mkdir $CPUSET/$TDIR
+echo 1 > $CPUSET/$TDIR/cpuset.cpus
+echo 0 > $CPUSET/$TDIR/cpuset.mems
+sleep 10&
+TASK=$!
+echo $TASK > $CPUSET/$TDIR/tasks
+NEWCS=$(cat /proc/$TASK/cpuset)
+[[ $NEWCS != "/$TDIR" ]] && {
+       echo "Unexpected cpuset $NEWCS, test FAILED!"
+       exit 1
+}
+
+echo 0 > /sys/devices/system/cpu/cpu1/online
+sleep 0.5
+echo 1 > /sys/devices/system/cpu/cpu1/online
+NEWCS=$(cat /proc/$TASK/cpuset)
+rmdir $CPUSET/$TDIR
+[[ $NEWCS != "/" ]] && {
+       echo "cpuset $NEWCS, test FAILED!"
+       exit 1
+}
+echo "Test PASSED"
+exit 0
diff --git a/tools/testing/selftests/cgroup/test_freezer.c b/tools/testing/selftests/cgroup/test_freezer.c

index 8845353aca53bb90e7781afc8bb3e86b94bc35b1..8730645d363a73a03f297a06ba4ae6acf7eb9d55 100644 (file)
--- a/tools/testing/selftests/cgroup/test_freezer.c
+++ b/tools/testing/selftests/cgroup/test_freezer.c
@@ -827,7 +827,7 @@ int main(int argc, char *argv[])
         char root[PATH_MAX];
         int i, ret = EXIT_SUCCESS;
  
-       if (cg_find_unified_root(root, sizeof(root)))
+       if (cg_find_unified_root(root, sizeof(root), NULL))
                 ksft_exit_skip("cgroup v2 isn't mounted\n");
         for (i = 0; i < ARRAY_SIZE(tests); i++) {
                 switch (tests[i].fn(root)) {
diff --git a/tools/testing/selftests/cgroup/test_hugetlb_memcg.c b/tools/testing/selftests/cgroup/test_hugetlb_memcg.c

index f0fefeb4cc24c4c130674e13dd40138332ca023e..80d05d50a42db30a8eb39a42388f576f5ae7f516 100644 (file)
--- a/tools/testing/selftests/cgroup/test_hugetlb_memcg.c
+++ b/tools/testing/selftests/cgroup/test_hugetlb_memcg.c
@@ -1,6 +1,4 @@
  // SPDX-License-Identifier: GPL-2.0
-#define _GNU_SOURCE
-
  #include <linux/limits.h>
  #include <sys/mman.h>
  #include <stdio.h>
@@ -214,7 +212,7 @@ int main(int argc, char **argv)
                 return ret;
         }
  
-       if (cg_find_unified_root(root, sizeof(root)))
+       if (cg_find_unified_root(root, sizeof(root), NULL))
                 ksft_exit_skip("cgroup v2 isn't mounted\n");
  
         switch (test_hugetlb_memcg(root)) {
diff --git a/tools/testing/selftests/cgroup/test_kill.c b/tools/testing/selftests/cgroup/test_kill.c

index 6153690319c9c888e0e53047898ee0469f577d3a..0e5bb6c7307a504ffb76653a1241e6beb8c12e3b 100644 (file)
--- a/tools/testing/selftests/cgroup/test_kill.c
+++ b/tools/testing/selftests/cgroup/test_kill.c
@@ -276,7 +276,7 @@ int main(int argc, char *argv[])
         char root[PATH_MAX];
         int i, ret = EXIT_SUCCESS;
  
-       if (cg_find_unified_root(root, sizeof(root)))
+       if (cg_find_unified_root(root, sizeof(root), NULL))
                 ksft_exit_skip("cgroup v2 isn't mounted\n");
         for (i = 0; i < ARRAY_SIZE(tests); i++) {
                 switch (tests[i].fn(root)) {
diff --git a/tools/testing/selftests/cgroup/test_kmem.c b/tools/testing/selftests/cgroup/test_kmem.c

index c82f974b85c94d0ae82b54ec5b560449480fb48f..2e453ac50c0d674ff65a84748c211db347dd9ff4 100644 (file)
--- a/tools/testing/selftests/cgroup/test_kmem.c
+++ b/tools/testing/selftests/cgroup/test_kmem.c
@@ -1,6 +1,4 @@
  // SPDX-License-Identifier: GPL-2.0
-#define _GNU_SOURCE
-
  #include <linux/limits.h>
  #include <fcntl.h>
  #include <stdio.h>
@@ -192,7 +190,7 @@ static int test_kmem_memcg_deletion(const char *root)
                 goto cleanup;
  
         sum = anon + file + kernel + sock;
-       if (abs(sum - current) < MAX_VMSTAT_ERROR) {
+       if (labs(sum - current) < MAX_VMSTAT_ERROR) {
                 ret = KSFT_PASS;
         } else {
                 printf("memory.current = %ld\n", current);
@@ -380,7 +378,7 @@ static int test_percpu_basic(const char *root)
         current = cg_read_long(parent, "memory.current");
         percpu = cg_read_key_long(parent, "memory.stat", "percpu ");
  
-       if (current > 0 && percpu > 0 && abs(current - percpu) <
+       if (current > 0 && percpu > 0 && labs(current - percpu) <
             MAX_VMSTAT_ERROR)
                 ret = KSFT_PASS;
         else
@@ -420,7 +418,7 @@ int main(int argc, char **argv)
         char root[PATH_MAX];
         int i, ret = EXIT_SUCCESS;
  
-       if (cg_find_unified_root(root, sizeof(root)))
+       if (cg_find_unified_root(root, sizeof(root), NULL))
                 ksft_exit_skip("cgroup v2 isn't mounted\n");
  
         /*
diff --git a/tools/testing/selftests/cgroup/test_memcontrol.c b/tools/testing/selftests/cgroup/test_memcontrol.c

index c7c9572003a8c9efa29a179e4121a243112c63be..c871630d62a3aef2e8edfb2b106f2f6184f1e47f 100644 (file)
--- a/tools/testing/selftests/cgroup/test_memcontrol.c
+++ b/tools/testing/selftests/cgroup/test_memcontrol.c
@@ -1,6 +1,4 @@
  /* SPDX-License-Identifier: GPL-2.0 */
-#define _GNU_SOURCE
-
  #include <linux/limits.h>
  #include <linux/oom.h>
  #include <fcntl.h>
@@ -716,7 +714,9 @@ static bool reclaim_until(const char *memcg, long goal)
   */
  static int test_memcg_reclaim(const char *root)
  {
-       int ret = KSFT_FAIL, fd, retries;
+       int ret = KSFT_FAIL;
+       int fd = -1;
+       int retries;
         char *memcg;
         long current, expected_usage;
  
@@ -1314,7 +1314,7 @@ int main(int argc, char **argv)
         char root[PATH_MAX];
         int i, proc_status, ret = EXIT_SUCCESS;
  
-       if (cg_find_unified_root(root, sizeof(root)))
+       if (cg_find_unified_root(root, sizeof(root), NULL))
                 ksft_exit_skip("cgroup v2 isn't mounted\n");
  
         /*
diff --git a/tools/testing/selftests/cgroup/test_zswap.c b/tools/testing/selftests/cgroup/test_zswap.c

index f0e488ed90d8959dc5edb83da057741ad6608384..8418a8d7439f5d0e614b6b1db21324e2dc33861d 100644 (file)
--- a/tools/testing/selftests/cgroup/test_zswap.c
+++ b/tools/testing/selftests/cgroup/test_zswap.c
@@ -1,6 +1,4 @@
  // SPDX-License-Identifier: GPL-2.0
-#define _GNU_SOURCE
-
  #include <linux/limits.h>
  #include <unistd.h>
  #include <stdio.h>
@@ -257,7 +255,7 @@ static int test_no_invasive_cgroup_shrink(const char *root)
  {
         int ret = KSFT_FAIL;
         size_t control_allocation_size = MB(10);
-       char *control_allocation, *wb_group = NULL, *control_group = NULL;
+       char *control_allocation = NULL, *wb_group = NULL, *control_group = NULL;
  
         wb_group = setup_test_group_1M(root, "per_memcg_wb_test1");
         if (!wb_group)
@@ -342,7 +340,7 @@ static int test_no_kmem_bypass(const char *root)
         struct sysinfo sys_info;
         int ret = KSFT_FAIL;
         int child_status;
-       char *test_group;
+       char *test_group = NULL;
         pid_t child_pid;
  
         /* Read sys info and compute test values accordingly */
@@ -440,7 +438,7 @@ int main(int argc, char **argv)
         char root[PATH_MAX];
         int i, ret = EXIT_SUCCESS;
  
-       if (cg_find_unified_root(root, sizeof(root)))
+       if (cg_find_unified_root(root, sizeof(root), NULL))
                 ksft_exit_skip("cgroup v2 isn't mounted\n");
  
         if (!zswap_configured())
author	Linus Torvalds <torvalds@linux-foundation.org>
	Thu, 16 May 2024 00:06:08 +0000 (17:06 -0700)
committer	Linus Torvalds <torvalds@linux-foundation.org>
	Thu, 16 May 2024 00:06:08 +0000 (17:06 -0700)
Documentation/admin-guide/cgroup-v1/cgroups.rst		patch \| blob \| blame \| history
Documentation/admin-guide/cgroup-v1/memcg_test.rst		patch \| blob \| blame \| history
Documentation/admin-guide/cgroup-v2.rst		patch \| blob \| blame \| history
include/linux/cgroup.h		patch \| blob \| blame \| history
include/linux/cpuset.h		patch \| blob \| blame \| history
include/trace/events/cgroup.h		patch \| blob \| blame \| history
kernel/cgroup/cgroup-v1.c		patch \| blob \| blame \| history
kernel/cgroup/cgroup.c		patch \| blob \| blame \| history
kernel/cgroup/cpuset.c		patch \| blob \| blame \| history
kernel/cgroup/legacy_freezer.c		patch \| blob \| blame \| history
kernel/cgroup/pids.c		patch \| blob \| blame \| history
kernel/cgroup/rstat.c		patch \| blob \| blame \| history
kernel/cpu.c		patch \| blob \| blame \| history
kernel/power/process.c		patch \| blob \| blame \| history
tools/testing/selftests/cgroup/Makefile		patch \| blob \| blame \| history
tools/testing/selftests/cgroup/cgroup_util.c		patch \| blob \| blame \| history
tools/testing/selftests/cgroup/cgroup_util.h		patch \| blob \| blame \| history
tools/testing/selftests/cgroup/test_core.c		patch \| blob \| blame \| history
tools/testing/selftests/cgroup/test_cpu.c		patch \| blob \| blame \| history
tools/testing/selftests/cgroup/test_cpuset.c		patch \| blob \| blame \| history
tools/testing/selftests/cgroup/test_cpuset_v1_hp.sh	[new file with mode: 0755]	patch \| blob
tools/testing/selftests/cgroup/test_freezer.c		patch \| blob \| blame \| history
tools/testing/selftests/cgroup/test_hugetlb_memcg.c		patch \| blob \| blame \| history
tools/testing/selftests/cgroup/test_kill.c		patch \| blob \| blame \| history
tools/testing/selftests/cgroup/test_kmem.c		patch \| blob \| blame \| history
tools/testing/selftests/cgroup/test_memcontrol.c		patch \| blob \| blame \| history
tools/testing/selftests/cgroup/test_zswap.c		patch \| blob \| blame \| history