sched/psi: Per-cgroup PSI accounting disable/re-enable interface

author Chengming Zhou <zhouchengming@bytedance.com>

Wed, 7 Sep 2022 09:03:32 +0000 (17:03 +0800)

committer Peter Zijlstra <peterz@infradead.org>

Fri, 9 Sep 2022 09:08:33 +0000 (11:08 +0200)
author Chengming Zhou <zhouchengming@bytedance.com>
Wed, 7 Sep 2022 09:03:32 +0000 (17:03 +0800)
committer Peter Zijlstra <peterz@infradead.org>
Fri, 9 Sep 2022 09:08:33 +0000 (11:08 +0200)
diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst

index 971c418bc77847677c92772f207abeb82207c9d9..4cad4e2b31ec87d829833623e4b3e63c80849368 100644 (file)
--- a/Documentation/admin-guide/cgroup-v2.rst
+++ b/Documentation/admin-guide/cgroup-v2.rst
@@ -976,6 +976,23 @@ All cgroup core files are prefixed with "cgroup."
         killing cgroups is a process directed operation, i.e. it affects
         the whole thread-group.
  
+  cgroup.pressure
+       A read-write single value file whose allowed values are "0" and "1".
+       The default is "1".
+
+       Writing "0" to the file will disable the cgroup PSI accounting.
+       Writing "1" to the file will re-enable the cgroup PSI accounting.
+
+       This control attribute is not hierarchical: disabling or enabling PSI
+       accounting in a cgroup does not affect its descendants, and enablement
+       does not need to be passed down from the root through the ancestors.
+
+       The reason this control attribute exists is that PSI accounts stalls for
+       each cgroup separately and aggregates them at each level of the hierarchy.
+       This may cause non-negligible overhead for some workloads that run deep
+       in the hierarchy, in which case this control attribute can
+       be used to disable PSI accounting in the non-leaf cgroups.
+
    irq.pressure
         A read-write nested-keyed file.
  
diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h

index 4bcf56b3491ca00bcc3276ddb2d63d0e28f4ee17..7df76b318245f0e104f2bdddfb65ae135632e460 100644 (file)
--- a/include/linux/cgroup-defs.h
+++ b/include/linux/cgroup-defs.h
@@ -428,6 +428,9 @@ struct cgroup {
         struct cgroup_file procs_file;  /* handle for "cgroup.procs" */
         struct cgroup_file events_file; /* handle for "cgroup.events" */
  
+       /* handles for "{cpu,memory,io,irq}.pressure" */
+       struct cgroup_file psi_files[NR_PSI_RESOURCES];
+
         /*
          * The bitmask of subsystems enabled on the child cgroups.
          * ->subtree_control is the one configured through
diff --git a/include/linux/psi.h b/include/linux/psi.h

index 362a74ca1d3bfad0840d9b09974aa45efa708443..b029a847def1e28d47f35f1cb0a2337b53a77d2c 100644 (file)
--- a/include/linux/psi.h
+++ b/include/linux/psi.h
@@ -39,6 +39,7 @@ static inline struct psi_group *cgroup_psi(struct cgroup *cgrp)
  int psi_cgroup_alloc(struct cgroup *cgrp);
  void psi_cgroup_free(struct cgroup *cgrp);
  void cgroup_move_task(struct task_struct *p, struct css_set *to);
+void psi_cgroup_restart(struct psi_group *group);
  #endif
  
  #else /* CONFIG_PSI */
@@ -60,6 +61,7 @@ static inline void cgroup_move_task(struct task_struct *p, struct css_set *to)
  {
         rcu_assign_pointer(p->cgroups, to);
  }
+static inline void psi_cgroup_restart(struct psi_group *group) {}
  #endif
  
  #endif /* CONFIG_PSI */
diff --git a/include/linux/psi_types.h b/include/linux/psi_types.h

index a0b746258c682f0ccb68f9108bbc877c8656ca49..6e4372735068925c00d85e4a1ad9c1ca3bdefa6d 100644 (file)
--- a/include/linux/psi_types.h
+++ b/include/linux/psi_types.h
@@ -152,6 +152,7 @@ struct psi_trigger {
  
  struct psi_group {
         struct psi_group *parent;
+       bool enabled;
  
         /* Protects data used by the aggregator */
         struct mutex avgs_lock;
@@ -194,6 +195,8 @@ struct psi_group {
  
  #else /* CONFIG_PSI */
  
+#define NR_PSI_RESOURCES       0
+
  struct psi_group { };
  
  #endif /* CONFIG_PSI */
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c

index 772b35d65d1f05547c490a524c9579e36bb312c9..fa1cf836b66a36e939d249142ebce87eee79f695 100644 (file)
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -3708,8 +3708,8 @@ static int cgroup_cpu_pressure_show(struct seq_file *seq, void *v)
         return psi_show(seq, psi, PSI_CPU);
  }
  
-static ssize_t cgroup_pressure_write(struct kernfs_open_file *of, char *buf,
-                                         size_t nbytes, enum psi_res res)
+static ssize_t pressure_write(struct kernfs_open_file *of, char *buf,
+                             size_t nbytes, enum psi_res res)
  {
         struct cgroup_file_ctx *ctx = of->priv;
         struct psi_trigger *new;
@@ -3746,21 +3746,21 @@ static ssize_t cgroup_io_pressure_write(struct kernfs_open_file *of,
                                           char *buf, size_t nbytes,
                                           loff_t off)
  {
-       return cgroup_pressure_write(of, buf, nbytes, PSI_IO);
+       return pressure_write(of, buf, nbytes, PSI_IO);
  }
  
  static ssize_t cgroup_memory_pressure_write(struct kernfs_open_file *of,
                                           char *buf, size_t nbytes,
                                           loff_t off)
  {
-       return cgroup_pressure_write(of, buf, nbytes, PSI_MEM);
+       return pressure_write(of, buf, nbytes, PSI_MEM);
  }
  
  static ssize_t cgroup_cpu_pressure_write(struct kernfs_open_file *of,
                                           char *buf, size_t nbytes,
                                           loff_t off)
  {
-       return cgroup_pressure_write(of, buf, nbytes, PSI_CPU);
+       return pressure_write(of, buf, nbytes, PSI_CPU);
  }
  
  #ifdef CONFIG_IRQ_TIME_ACCOUNTING
@@ -3776,10 +3776,58 @@ static ssize_t cgroup_irq_pressure_write(struct kernfs_open_file *of,
                                          char *buf, size_t nbytes,
                                          loff_t off)
  {
-       return cgroup_pressure_write(of, buf, nbytes, PSI_IRQ);
+       return pressure_write(of, buf, nbytes, PSI_IRQ);
  }
  #endif
  
+static int cgroup_pressure_show(struct seq_file *seq, void *v)
+{      /* seq_show handler for the "cgroup.pressure" file */
+       struct cgroup *cgrp = seq_css(seq)->cgroup;
+       struct psi_group *psi = cgroup_psi(cgrp);
+
+       seq_printf(seq, "%d\n", psi->enabled);  /* bool flag, so this prints "1" or "0" */
+
+       return 0;
+}
+
+static ssize_t cgroup_pressure_write(struct kernfs_open_file *of,
+                                    char *buf, size_t nbytes,
+                                    loff_t off)
+{      /* write handler for the "cgroup.pressure" file: sets psi->enabled */
+       ssize_t ret;
+       int enable;
+       struct cgroup *cgrp;
+       struct psi_group *psi;
+
+       ret = kstrtoint(strstrip(buf), 0, &enable);     /* parse the whitespace-trimmed input */
+       if (ret)
+               return ret;
+
+       if (enable < 0 || enable > 1)   /* only 0 and 1 are accepted */
+               return -ERANGE;
+
+       cgrp = cgroup_kn_lock_live(of->kn, false);
+       if (!cgrp)      /* no cgroup returned by cgroup_kn_lock_live() */
+               return -ENOENT;
+
+       psi = cgroup_psi(cgrp);
+       if (psi->enabled != enable) {   /* act only when the value actually changes */
+               int i;
+
+               /* show or hide {cpu,memory,io,irq}.pressure files */
+               for (i = 0; i < NR_PSI_RESOURCES; i++)
+                       cgroup_file_show(&cgrp->psi_files[i], enable);
+
+               psi->enabled = enable;  /* flag is set before the restart, which bails out if it is clear */
+               if (enable)
+                       psi_cgroup_restart(psi);
+       }
+
+       cgroup_kn_unlock(of->kn);       /* pairs with cgroup_kn_lock_live() above */
+
+       return nbytes;  /* on success the whole write is reported as consumed */
+}
+
  static __poll_t cgroup_pressure_poll(struct kernfs_open_file *of,
                                           poll_table *pt)
  {
@@ -5175,6 +5223,7 @@ static struct cftype cgroup_base_files[] = {
         {
                 .name = "io.pressure",
                 .flags = CFTYPE_PRESSURE,
+               .file_offset = offsetof(struct cgroup, psi_files[PSI_IO]),
                 .seq_show = cgroup_io_pressure_show,
                 .write = cgroup_io_pressure_write,
                 .poll = cgroup_pressure_poll,
@@ -5183,6 +5232,7 @@ static struct cftype cgroup_base_files[] = {
         {
                 .name = "memory.pressure",
                 .flags = CFTYPE_PRESSURE,
+               .file_offset = offsetof(struct cgroup, psi_files[PSI_MEM]),
                 .seq_show = cgroup_memory_pressure_show,
                 .write = cgroup_memory_pressure_write,
                 .poll = cgroup_pressure_poll,
@@ -5191,6 +5241,7 @@ static struct cftype cgroup_base_files[] = {
         {
                 .name = "cpu.pressure",
                 .flags = CFTYPE_PRESSURE,
+               .file_offset = offsetof(struct cgroup, psi_files[PSI_CPU]),
                 .seq_show = cgroup_cpu_pressure_show,
                 .write = cgroup_cpu_pressure_write,
                 .poll = cgroup_pressure_poll,
@@ -5200,12 +5251,19 @@ static struct cftype cgroup_base_files[] = {
         {
                 .name = "irq.pressure",
                 .flags = CFTYPE_PRESSURE,
+               .file_offset = offsetof(struct cgroup, psi_files[PSI_IRQ]),
                 .seq_show = cgroup_irq_pressure_show,
                 .write = cgroup_irq_pressure_write,
                 .poll = cgroup_pressure_poll,
                 .release = cgroup_pressure_release,
         },
  #endif
+       {
+               .name = "cgroup.pressure",
+               .flags = CFTYPE_PRESSURE,
+               .seq_show = cgroup_pressure_show,
+               .write = cgroup_pressure_write,
+       },
  #endif /* CONFIG_PSI */
         { }     /* terminate */
  };
diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c

index 9a8aee80a0874ab9fd89816a214dac4af887a24f..9711827e31e5946ee0279c1520d6921e0634cf64 100644 (file)
--- a/kernel/sched/psi.c
+++ b/kernel/sched/psi.c
@@ -181,6 +181,7 @@ static void group_init(struct psi_group *group)
  {
         int cpu;
  
+       group->enabled = true;
         for_each_possible_cpu(cpu)
                 seqcount_init(&per_cpu_ptr(group->pcpu, cpu)->seq);
         group->avg_last_update = sched_clock();
@@ -696,17 +697,16 @@ static void psi_group_change(struct psi_group *group, int cpu,
         groupc = per_cpu_ptr(group->pcpu, cpu);
  
         /*
-        * First we assess the aggregate resource states this CPU's
-        * tasks have been in since the last change, and account any
-        * SOME and FULL time these may have resulted in.
-        *
-        * Then we update the task counts according to the state
+        * First we update the task counts according to the state
          * change requested through the @clear and @set bits.
+        *
+        * Then, if PSI stats accounting is enabled for the cgroup, we
+        * assess the aggregate resource states this CPU's tasks
+        * have been in since the last change, and account any
+        * SOME and FULL time these may have resulted in.
          */
         write_seqcount_begin(&groupc->seq);
  
-       record_times(groupc, now);
-
         /*
          * Start with TSK_ONCPU, which doesn't have a corresponding
          * task count - it's just a boolean flag directly encoded in
@@ -745,6 +745,23 @@ static void psi_group_change(struct psi_group *group, int cpu,
                 if (set & (1 << t))
                         groupc->tasks[t]++;
  
+       if (!group->enabled) {
+               /*
+                * On the first group change after disabling PSI, conclude
+                * the current state and flush its time. This is unlikely
+                * to matter to the user, but aggregation (get_recent_times)
+                * may have already incorporated the live state into times_prev;
+                * avoid a delta sample underflow when PSI is later re-enabled.
+                */
+               if (unlikely(groupc->state_mask & (1 << PSI_NONIDLE)))
+                       record_times(groupc, now);
+
+               groupc->state_mask = state_mask;
+
+               write_seqcount_end(&groupc->seq);
+               return;
+       }
+
         for (s = 0; s < NR_PSI_STATES; s++) {
                 if (test_state(groupc->tasks, s, state_mask & PSI_ONCPU))
                         state_mask |= (1 << s);
@@ -761,6 +778,8 @@ static void psi_group_change(struct psi_group *group, int cpu,
         if (unlikely((state_mask & PSI_ONCPU) && cpu_curr(cpu)->in_memstall))
                 state_mask |= (1 << PSI_MEM_FULL);
  
+       record_times(groupc, now);
+
         groupc->state_mask = state_mask;
  
         write_seqcount_end(&groupc->seq);
@@ -907,6 +926,9 @@ void psi_account_irqtime(struct task_struct *task, u32 delta)
  
         group = task_psi_group(task);
         do {
+               if (!group->enabled)
+                       continue;
+
                 groupc = per_cpu_ptr(group->pcpu, cpu);
  
                 write_seqcount_begin(&groupc->seq);
@@ -1080,6 +1102,40 @@ void cgroup_move_task(struct task_struct *task, struct css_set *to)
  
         task_rq_unlock(rq, task, &rf);
  }
+
+void psi_cgroup_restart(struct psi_group *group)
+{
+       int cpu;
+
+       /*
+        * Clearing psi_group->enabled does not actually stop the
+        * per-CPU task accounting in each psi_group_cpu; it only
+        * stops the test_state() loop, record_times() and the
+        * averaging worker. See psi_group_change() for details.
+        *
+        * When cgroup PSI is disabled, this function has nothing to sync,
+        * since the cgroup pressure files are hidden and each psi_group_cpu
+        * sees !psi_group->enabled and only does task accounting.
+        *
+        * When cgroup PSI is re-enabled, this function uses psi_group_change()
+        * to recompute the state mask from the test_state() loop on tasks[]
+        * and to restart groupc->state_start from now. It passes
+        * .clear = .set = 0 since no task status really changed.
+        */
+       if (!group->enabled)    /* nothing to restart while accounting is disabled */
+               return;
+
+       for_each_possible_cpu(cpu) {
+               struct rq *rq = cpu_rq(cpu);
+               struct rq_flags rf;
+               u64 now;
+
+               rq_lock_irq(rq, &rf);   /* this CPU's rq lock is held across the update */
+               now = cpu_clock(cpu);
+               psi_group_change(group, cpu, 0, 0, now, true);  /* clear = set = 0 */
+               rq_unlock_irq(rq, &rf);
+       }
+}
  #endif /* CONFIG_CGROUPS */
  
  int psi_show(struct seq_file *m, struct psi_group *group, enum psi_res res)
author	Chengming Zhou <zhouchengming@bytedance.com>
	Wed, 7 Sep 2022 09:03:32 +0000 (17:03 +0800)
committer	Peter Zijlstra <peterz@infradead.org>
	Fri, 9 Sep 2022 09:08:33 +0000 (11:08 +0200)
Documentation/admin-guide/cgroup-v2.rst		patch \| blob \| blame \| history
include/linux/cgroup-defs.h		patch \| blob \| blame \| history
include/linux/psi.h		patch \| blob \| blame \| history
include/linux/psi_types.h		patch \| blob \| blame \| history
kernel/cgroup/cgroup.c		patch \| blob \| blame \| history
kernel/sched/psi.c		patch \| blob \| blame \| history