Merge branch 'for-4.15' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup
author	Linus Torvalds <torvalds@linux-foundation.org>
	Wed, 15 Nov 2017 22:29:44 +0000 (14:29 -0800)
committer	Linus Torvalds <torvalds@linux-foundation.org>
	Wed, 15 Nov 2017 22:29:44 +0000 (14:29 -0800)
Pull cgroup updates from Tejun Heo:
 "Cgroup2 cpu controller support is finally merged.

   - Basic cpu statistics support to allow monitoring by default without
     the CPU controller enabled.

   - cgroup2 cpu controller support.

   - /sys/kernel/cgroup files to help deal with new / optional
     features (a small usage sketch follows below)"

* 'for-4.15' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup:
  cgroup: export list of cgroups v2 features using sysfs
  cgroup: export list of delegatable control files using sysfs
  cgroup: mark @cgrp __maybe_unused in cpu_stat_show()
  MAINTAINERS: relocate cpuset.c
  cgroup, sched: Move basic cpu stats from cgroup.stat to cpu.stat
  sched: Implement interface for cgroup unified hierarchy
  sched: Misc preps for cgroup unified hierarchy interface
  sched/cputime: Add dummy cputime_adjust() implementation for CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
  cgroup: statically initialize init_css_set->dfl_cgrp
  cgroup: Implement cgroup2 basic CPU usage accounting
  cpuacct: Introduce cgroup_account_cputime[_field]()
  sched/cputime: Expose cputime_adjust()

17 files changed:
Documentation/cgroup-v2.txt
MAINTAINERS
include/linux/cgroup-defs.h
include/linux/cgroup.h
include/linux/sched/cputime.h
kernel/cgroup/Makefile
kernel/cgroup/cgroup-internal.h
kernel/cgroup/cgroup.c
kernel/cgroup/stat.c [new file with mode: 0644]
kernel/sched/core.c
kernel/sched/cpuacct.h [deleted file]
kernel/sched/cputime.c
kernel/sched/deadline.c
kernel/sched/fair.c
kernel/sched/rt.c
kernel/sched/sched.h
kernel/sched/stop_task.c

index dc44785dc0fa146a2de30aa11ef2c410f1b00622..779211fbb69ffac450f22b0ad6864c7c6c2bd98f 100644 (file)
@@ -893,10 +893,6 @@ Controllers
 CPU
 ---
 
-.. note::
-
-       The interface for the cpu controller hasn't been merged yet
-
 The "cpu" controllers regulates distribution of CPU cycles.  This
 controller implements weight and absolute bandwidth limit models for
 normal scheduling policy and absolute bandwidth allocation model for
@@ -910,12 +906,16 @@ All time durations are in microseconds.
 
   cpu.stat
        A read-only flat-keyed file which exists on non-root cgroups.
+       This file exists whether the controller is enabled or not.
 
-       It reports the following six stats:
+       It always reports the following three stats:
 
        - usage_usec
        - user_usec
        - system_usec
+
+       and the following three when the controller is enabled:
+
        - nr_periods
        - nr_throttled
        - throttled_usec
@@ -926,6 +926,18 @@ All time durations are in microseconds.
 
        The weight in the range [1, 10000].
 
+  cpu.weight.nice
+       A read-write single value file which exists on non-root
+       cgroups.  The default is "0".
+
+       The nice value is in the range [-20, 19].
+
+       This interface file is an alternative interface for
+       "cpu.weight" and allows reading and setting weight using the
+       same values used by nice(2).  Because the range is smaller and
+       granularity is coarser for the nice values, the read value is
+       the closest approximation of the current weight.
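
  (Illustrative only: the conversion below mirrors the mapping added in
  kernel/sched/core.c further down -- a nice value maps to a share through
  the scheduler's nice-to-weight table, and shares map to cpu.weight on a
  100-based scale, which is why a round trip through cpu.weight.nice can
  only approximate the weight.  The table values are assumed from the
  mainline scheduler.)

    #include <stdio.h>

    /* assumed copy of the scheduler's sched_prio_to_weight[] (nice -20..19) */
    static const unsigned int nice_to_shares[40] = {
            88761, 71755, 56483, 46273, 36291, 29154, 23254, 18705,
            14949, 11916,  9548,  7620,  6100,  4904,  3906,  3121,
             2501,  1991,  1586,  1277,  1024,   820,   655,   526,
              423,   335,   272,   215,   172,   137,   110,    87,
               70,    56,    45,    36,    29,    23,    18,    15,
    };

    int main(void)
    {
            int nice;

            for (nice = -20; nice <= 19; nice++) {
                    unsigned int shares = nice_to_shares[nice + 20];
                    /* cpu.weight scaled so nice 0 (1024 shares) reads as 100 */
                    unsigned int weight = (shares * 100 + 512) / 1024;

                    printf("nice %3d -> shares %5u -> cpu.weight %4u\n",
                           nice, shares, weight);
            }
            return 0;
    }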
+
   cpu.max
        A read-write two value file which exists on non-root cgroups.
        The default is "max 100000".
@@ -938,26 +950,6 @@ All time durations are in microseconds.
        $PERIOD duration.  "max" for $MAX indicates no limit.  If only
        one number is written, $MAX is updated.
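
  (Illustrative only: limiting a group to 20ms of CPU per 100ms period
  means writing "20000 100000"; the cgroup path below is hypothetical.)

    #include <stdio.h>

    int main(void)
    {
            /* hypothetical cgroup; $MAX and $PERIOD are in microseconds */
            FILE *f = fopen("/sys/fs/cgroup/test/cpu.max", "w");

            if (!f) {
                    perror("cpu.max");
                    return 1;
            }
            /* allow up to 20ms of CPU time in each 100ms period */
            fprintf(f, "20000 100000\n");
            return fclose(f) ? 1 : 0;
    }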
 
-  cpu.rt.max
-       .. note::
-
-          The semantics of this file is still under discussion and the
-          interface hasn't been merged yet
-
-       A read-write two value file which exists on all cgroups.
-       The default is "0 100000".
-
-       The maximum realtime runtime allocation.  Over-committing
-       configurations are disallowed and process migrations are
-       rejected if not enough bandwidth is available.  It's in the
-       following format::
-
-         $MAX $PERIOD
-
-       which indicates that the group may consume upto $MAX in each
-       $PERIOD duration.  If only one number is written, $MAX is
-       updated.
-
 
 Memory
 ------
index 16e1e6dc89f253338e8307fc9ff296acbb98b8d1..a74d6a7388641e588db3310dcabd7256637e257d 100644 (file)
@@ -3592,7 +3592,7 @@ T:        git git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup.git
 S:     Maintained
 F:     Documentation/cgroup-v1/cpusets.txt
 F:     include/linux/cpuset.h
-F:     kernel/cpuset.c
+F:     kernel/cgroup/cpuset.c
 
 CONTROL GROUP - MEMORY RESOURCE CONTROLLER (MEMCG)
 M:     Johannes Weiner <hannes@cmpxchg.org>
index 1dff0a478b45aace3903af82883cb3bf39194774..8b7fd8eeccee26c5694530a45f8f9332aaf681c7 100644 (file)
@@ -17,6 +17,7 @@
 #include <linux/refcount.h>
 #include <linux/percpu-refcount.h>
 #include <linux/percpu-rwsem.h>
+#include <linux/u64_stats_sync.h>
 #include <linux/workqueue.h>
 #include <linux/bpf-cgroup.h>
 
@@ -255,6 +256,57 @@ struct css_set {
        struct rcu_head rcu_head;
 };
 
+/*
+ * cgroup basic resource usage statistics.  Accounting is done per-cpu in
+ * cgroup_cpu_stat which is then lazily propagated up the hierarchy on
+ * reads.
+ *
+ * When a stat gets updated, the cgroup_cpu_stat and its ancestors are
+ * linked into the updated tree.  On the following read, propagation only
+ * considers and consumes the updated tree.  This makes reading O(the
+ * number of descendants which have been active since last read) instead of
+ * O(the total number of descendants).
+ *
+ * This is important because there can be a lot of (draining) cgroups which
+ * aren't active and stat may be read frequently.  The combination can
+ * become very expensive.  By propagating selectively, increasing reading
+ * frequency decreases the cost of each read.
+ */
+struct cgroup_cpu_stat {
+       /*
+        * ->sync protects all the current counters.  These are the only
+        * fields which get updated in the hot path.
+        */
+       struct u64_stats_sync sync;
+       struct task_cputime cputime;
+
+       /*
+        * Snapshots at the last reading.  These are used to calculate the
+        * deltas to propagate to the global counters.
+        */
+       struct task_cputime last_cputime;
+
+       /*
+        * Child cgroups with stat updates on this cpu since the last read
+        * are linked on the parent's ->updated_children through
+        * ->updated_next.
+        *
+        * In addition to being more compact, a singly-linked list pointing
+        * to the cgroup makes it unnecessary for each per-cpu struct to
+        * point back to the associated cgroup.
+        *
+        * Protected by per-cpu cgroup_cpu_stat_lock.
+        */
+       struct cgroup *updated_children;        /* terminated by self cgroup */
+       struct cgroup *updated_next;            /* NULL iff not on the list */
+};
+
+struct cgroup_stat {
+       /* per-cpu statistics are collected into the following global counters */
+       struct task_cputime cputime;
+       struct prev_cputime prev_cputime;
+};
+
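
  (Illustrative only: a simplified, single-threaded userspace model of the
  updated-tree bookkeeping described in the comment above struct
  cgroup_cpu_stat -- per-cpu instances, locking and the real counters are
  omitted, and all names here are made up.  It shows why a flush only
  touches nodes that have been updated since the last flush.)

    #include <stdio.h>
    #include <stddef.h>

    struct node {
            struct node *parent;
            struct node *updated_children; /* terminated by the node itself */
            struct node *updated_next;     /* NULL iff not on a list */
            unsigned long long delta;      /* pending amount to propagate */
    };

    static void node_init(struct node *n, struct node *parent)
    {
            n->parent = parent;
            n->updated_children = n;       /* self-terminated empty list */
            n->updated_next = NULL;
            n->delta = 0;
    }

    /* record an update and link @n and its ancestors bottom-up */
    static void node_updated(struct node *n, unsigned long long delta)
    {
            struct node *parent;

            n->delta += delta;
            if (n->updated_next)           /* already linked => ancestors are too */
                    return;

            for (parent = n->parent; parent; n = parent, parent = n->parent) {
                    if (n->updated_next)
                            break;
                    n->updated_next = parent->updated_children;
                    parent->updated_children = n;
            }
    }

    /* consume the updated tree; cost is proportional to updated nodes only */
    static unsigned long long flush(struct node *n)
    {
            unsigned long long sum = n->delta;
            struct node *child = n->updated_children;

            n->delta = 0;
            while (child != n) {
                    struct node *next = child->updated_next;

                    child->updated_next = NULL;
                    sum += flush(child);
                    child = next;
            }
            n->updated_children = n;
            return sum;
    }

    int main(void)
    {
            struct node root, a, b, a1;

            node_init(&root, NULL);
            node_init(&a, &root);
            node_init(&b, &root);
            node_init(&a1, &a);

            node_updated(&a1, 100);        /* links a1 -> a -> root */
            node_updated(&a1, 50);         /* already linked, early return */

            /* "b" was never updated, so the flush never visits it */
            printf("flushed %llu\n", flush(&root));  /* prints 150 */
            return 0;
    }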
 struct cgroup {
        /* self css with NULL ->ss, points back to this cgroup */
        struct cgroup_subsys_state self;
@@ -354,6 +406,11 @@ struct cgroup {
         */
        struct cgroup *dom_cgrp;
 
+       /* cgroup basic resource statistics */
+       struct cgroup_cpu_stat __percpu *cpu_stat;
+       struct cgroup_stat pending_stat;        /* pending from children */
+       struct cgroup_stat stat;
+
        /*
         * list of pidlists, up to two for each namespace (one for procs, one
         * for tasks); created on demand.
@@ -513,6 +570,8 @@ struct cgroup_subsys {
        void (*css_released)(struct cgroup_subsys_state *css);
        void (*css_free)(struct cgroup_subsys_state *css);
        void (*css_reset)(struct cgroup_subsys_state *css);
+       int (*css_extra_stat_show)(struct seq_file *seq,
+                                  struct cgroup_subsys_state *css);
 
        int (*can_attach)(struct cgroup_taskset *tset);
        void (*cancel_attach)(struct cgroup_taskset *tset);
index dddbc29e20098e0a9f0377f64b0945c87691d315..473e0c0abb8621f732ae3a384acbac63eca8ff11 100644 (file)
@@ -23,6 +23,7 @@
 #include <linux/nsproxy.h>
 #include <linux/user_namespace.h>
 #include <linux/refcount.h>
+#include <linux/kernel_stat.h>
 
 #include <linux/cgroup-defs.h>
 
@@ -689,6 +690,63 @@ static inline void cgroup_path_from_kernfs_id(const union kernfs_node_id *id,
        char *buf, size_t buflen) {}
 #endif /* !CONFIG_CGROUPS */
 
+/*
+ * Basic resource stats.
+ */
+#ifdef CONFIG_CGROUPS
+
+#ifdef CONFIG_CGROUP_CPUACCT
+void cpuacct_charge(struct task_struct *tsk, u64 cputime);
+void cpuacct_account_field(struct task_struct *tsk, int index, u64 val);
+#else
+static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) {}
+static inline void cpuacct_account_field(struct task_struct *tsk, int index,
+                                        u64 val) {}
+#endif
+
+void __cgroup_account_cputime(struct cgroup *cgrp, u64 delta_exec);
+void __cgroup_account_cputime_field(struct cgroup *cgrp,
+                                   enum cpu_usage_stat index, u64 delta_exec);
+
+static inline void cgroup_account_cputime(struct task_struct *task,
+                                         u64 delta_exec)
+{
+       struct cgroup *cgrp;
+
+       cpuacct_charge(task, delta_exec);
+
+       rcu_read_lock();
+       cgrp = task_dfl_cgroup(task);
+       if (cgroup_parent(cgrp))
+               __cgroup_account_cputime(cgrp, delta_exec);
+       rcu_read_unlock();
+}
+
+static inline void cgroup_account_cputime_field(struct task_struct *task,
+                                               enum cpu_usage_stat index,
+                                               u64 delta_exec)
+{
+       struct cgroup *cgrp;
+
+       cpuacct_account_field(task, index, delta_exec);
+
+       rcu_read_lock();
+       cgrp = task_dfl_cgroup(task);
+       if (cgroup_parent(cgrp))
+               __cgroup_account_cputime_field(cgrp, index, delta_exec);
+       rcu_read_unlock();
+}
+
+#else  /* CONFIG_CGROUPS */
+
+static inline void cgroup_account_cputime(struct task_struct *task,
+                                         u64 delta_exec) {}
+static inline void cgroup_account_cputime_field(struct task_struct *task,
+                                               enum cpu_usage_stat index,
+                                               u64 delta_exec) {}
+
+#endif /* CONFIG_CGROUPS */
+
 /*
  * sock->sk_cgrp_data handling.  For more info, see sock_cgroup_data
  * definition in cgroup-defs.h.
index d0677f6739f636e6ed6152e5875528f6539b1334..53f883f5a2fd1d29d1e2131b89260304a0be8df0 100644 (file)
@@ -54,7 +54,8 @@ static inline void task_cputime_scaled(struct task_struct *t,
 
 extern void task_cputime_adjusted(struct task_struct *p, u64 *ut, u64 *st);
 extern void thread_group_cputime_adjusted(struct task_struct *p, u64 *ut, u64 *st);
-
+extern void cputime_adjust(struct task_cputime *curr, struct prev_cputime *prev,
+                          u64 *ut, u64 *st);
 
 /*
  * Thread group CPU time accounting.
index ae448f7632cc64753e0e0e4d9c4a0e65d41912a5..2be89a003185bb4cb3613a539496008c129643d9 100644 (file)
@@ -1,5 +1,5 @@
 # SPDX-License-Identifier: GPL-2.0
-obj-y := cgroup.o namespace.o cgroup-v1.o
+obj-y := cgroup.o stat.o namespace.o cgroup-v1.o
 
 obj-$(CONFIG_CGROUP_FREEZER) += freezer.o
 obj-$(CONFIG_CGROUP_PIDS) += pids.o
index bf54ade001be4ada1d9ebd45d3a12930828d94e1..b928b27050c62fee81fa791dd2ed3e0d282d59f5 100644 (file)
@@ -200,6 +200,15 @@ int cgroup_show_path(struct seq_file *sf, struct kernfs_node *kf_node,
 
 int cgroup_task_count(const struct cgroup *cgrp);
 
+/*
+ * stat.c
+ */
+void cgroup_stat_flush(struct cgroup *cgrp);
+int cgroup_stat_init(struct cgroup *cgrp);
+void cgroup_stat_exit(struct cgroup *cgrp);
+void cgroup_stat_show_cputime(struct seq_file *seq);
+void cgroup_stat_boot(void);
+
 /*
  * namespace.c
  */
index 00f5b358aeac5af94424c22b0e6a006874b10ce0..0b1ffe147f240c39726d79505c4e02e8fe40cd47 100644 (file)
@@ -142,12 +142,14 @@ static struct static_key_true *cgroup_subsys_on_dfl_key[] = {
 };
 #undef SUBSYS
 
+static DEFINE_PER_CPU(struct cgroup_cpu_stat, cgrp_dfl_root_cpu_stat);
+
 /*
  * The default hierarchy, reserved for the subsystems that are otherwise
  * unattached - it never has more than a single cgroup, and all tasks are
  * part of that cgroup.
  */
-struct cgroup_root cgrp_dfl_root;
+struct cgroup_root cgrp_dfl_root = { .cgrp.cpu_stat = &cgrp_dfl_root_cpu_stat };
 EXPORT_SYMBOL_GPL(cgrp_dfl_root);
 
 /*
@@ -461,6 +463,28 @@ static struct cgroup_subsys_state *cgroup_css(struct cgroup *cgrp,
                return &cgrp->self;
 }
 
+/**
+ * cgroup_tryget_css - try to get a cgroup's css for the specified subsystem
+ * @cgrp: the cgroup of interest
+ * @ss: the subsystem of interest
+ *
+ * Find and get @cgrp's css associated with @ss.  If the css doesn't exist
+ * or is offline, %NULL is returned.
+ */
+static struct cgroup_subsys_state *cgroup_tryget_css(struct cgroup *cgrp,
+                                                    struct cgroup_subsys *ss)
+{
+       struct cgroup_subsys_state *css;
+
+       rcu_read_lock();
+       css = cgroup_css(cgrp, ss);
+       if (!css || !css_tryget_online(css))
+               css = NULL;
+       rcu_read_unlock();
+
+       return css;
+}
+
 /**
  * cgroup_e_css - obtain a cgroup's effective css for the specified subsystem
  * @cgrp: the cgroup of interest
@@ -647,6 +671,14 @@ struct css_set init_css_set = {
        .cgrp_links             = LIST_HEAD_INIT(init_css_set.cgrp_links),
        .mg_preload_node        = LIST_HEAD_INIT(init_css_set.mg_preload_node),
        .mg_node                = LIST_HEAD_INIT(init_css_set.mg_node),
+
+       /*
+        * The following field is re-initialized when this cset gets linked
+        * in cgroup_init().  However, let's initialize the field
+        * statically too so that the default cgroup can be accessed safely
+        * early during boot.
+        */
+       .dfl_cgrp               = &cgrp_dfl_root.cgrp,
 };
 
 static int css_set_count       = 1;    /* 1 for init_css_set */
@@ -3315,6 +3347,37 @@ static int cgroup_stat_show(struct seq_file *seq, void *v)
        return 0;
 }
 
+static int __maybe_unused cgroup_extra_stat_show(struct seq_file *seq,
+                                                struct cgroup *cgrp, int ssid)
+{
+       struct cgroup_subsys *ss = cgroup_subsys[ssid];
+       struct cgroup_subsys_state *css;
+       int ret;
+
+       if (!ss->css_extra_stat_show)
+               return 0;
+
+       css = cgroup_tryget_css(cgrp, ss);
+       if (!css)
+               return 0;
+
+       ret = ss->css_extra_stat_show(seq, css);
+       css_put(css);
+       return ret;
+}
+
+static int cpu_stat_show(struct seq_file *seq, void *v)
+{
+       struct cgroup __maybe_unused *cgrp = seq_css(seq)->cgroup;
+       int ret = 0;
+
+       cgroup_stat_show_cputime(seq);
+#ifdef CONFIG_CGROUP_SCHED
+       ret = cgroup_extra_stat_show(seq, cgrp, cpu_cgrp_id);
+#endif
+       return ret;
+}
+
 static int cgroup_file_open(struct kernfs_open_file *of)
 {
        struct cftype *cft = of->kn->priv;
@@ -4422,6 +4485,11 @@ static struct cftype cgroup_base_files[] = {
                .name = "cgroup.stat",
                .seq_show = cgroup_stat_show,
        },
+       {
+               .name = "cpu.stat",
+               .flags = CFTYPE_NOT_ON_ROOT,
+               .seq_show = cpu_stat_show,
+       },
        { }     /* terminate */
 };
 
@@ -4482,6 +4550,8 @@ static void css_free_work_fn(struct work_struct *work)
                         */
                        cgroup_put(cgroup_parent(cgrp));
                        kernfs_put(cgrp->kn);
+                       if (cgroup_on_dfl(cgrp))
+                               cgroup_stat_exit(cgrp);
                        kfree(cgrp);
                } else {
                        /*
@@ -4526,6 +4596,9 @@ static void css_release_work_fn(struct work_struct *work)
                /* cgroup release path */
                trace_cgroup_release(cgrp);
 
+               if (cgroup_on_dfl(cgrp))
+                       cgroup_stat_flush(cgrp);
+
                for (tcgrp = cgroup_parent(cgrp); tcgrp;
                     tcgrp = cgroup_parent(tcgrp))
                        tcgrp->nr_dying_descendants--;
@@ -4709,6 +4782,12 @@ static struct cgroup *cgroup_create(struct cgroup *parent)
        if (ret)
                goto out_free_cgrp;
 
+       if (cgroup_on_dfl(parent)) {
+               ret = cgroup_stat_init(cgrp);
+               if (ret)
+                       goto out_cancel_ref;
+       }
+
        /*
         * Temporarily set the pointer to NULL, so idr_find() won't return
         * a half-baked cgroup.
@@ -4716,7 +4795,7 @@ static struct cgroup *cgroup_create(struct cgroup *parent)
        cgrp->id = cgroup_idr_alloc(&root->cgroup_idr, NULL, 2, 0, GFP_KERNEL);
        if (cgrp->id < 0) {
                ret = -ENOMEM;
-               goto out_cancel_ref;
+               goto out_stat_exit;
        }
 
        init_cgroup_housekeeping(cgrp);
@@ -4767,6 +4846,9 @@ static struct cgroup *cgroup_create(struct cgroup *parent)
 
 out_idr_free:
        cgroup_idr_remove(&root->cgroup_idr, cgrp->id);
+out_stat_exit:
+       if (cgroup_on_dfl(parent))
+               cgroup_stat_exit(cgrp);
 out_cancel_ref:
        percpu_ref_exit(&cgrp->self.refcnt);
 out_free_cgrp:
@@ -5161,6 +5243,8 @@ int __init cgroup_init(void)
        BUG_ON(cgroup_init_cftypes(NULL, cgroup_base_files));
        BUG_ON(cgroup_init_cftypes(NULL, cgroup1_base_files));
 
+       cgroup_stat_boot();
+
        /*
         * The latency of the synchronize_sched() is too high for cgroups,
         * avoid it at the cost of forcing all readers into the slow path.
@@ -5780,3 +5864,72 @@ int cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr,
        return ret;
 }
 #endif /* CONFIG_CGROUP_BPF */
+
+#ifdef CONFIG_SYSFS
+static ssize_t show_delegatable_files(struct cftype *files, char *buf,
+                                     ssize_t size, const char *prefix)
+{
+       struct cftype *cft;
+       ssize_t ret = 0;
+
+       for (cft = files; cft && cft->name[0] != '\0'; cft++) {
+               if (!(cft->flags & CFTYPE_NS_DELEGATABLE))
+                       continue;
+
+               if (prefix)
+                       ret += snprintf(buf + ret, size - ret, "%s.", prefix);
+
+               ret += snprintf(buf + ret, size - ret, "%s\n", cft->name);
+
+               if (unlikely(ret >= size)) {
+                       WARN_ON(1);
+                       break;
+               }
+       }
+
+       return ret;
+}
+
+static ssize_t delegate_show(struct kobject *kobj, struct kobj_attribute *attr,
+                             char *buf)
+{
+       struct cgroup_subsys *ss;
+       int ssid;
+       ssize_t ret = 0;
+
+       ret = show_delegatable_files(cgroup_base_files, buf, PAGE_SIZE - ret,
+                                    NULL);
+
+       for_each_subsys(ss, ssid)
+               ret += show_delegatable_files(ss->dfl_cftypes, buf + ret,
+                                             PAGE_SIZE - ret,
+                                             cgroup_subsys_name[ssid]);
+
+       return ret;
+}
+static struct kobj_attribute cgroup_delegate_attr = __ATTR_RO(delegate);
+
+static ssize_t features_show(struct kobject *kobj, struct kobj_attribute *attr,
+                            char *buf)
+{
+       return snprintf(buf, PAGE_SIZE, "nsdelegate\n");
+}
+static struct kobj_attribute cgroup_features_attr = __ATTR_RO(features);
+
+static struct attribute *cgroup_sysfs_attrs[] = {
+       &cgroup_delegate_attr.attr,
+       &cgroup_features_attr.attr,
+       NULL,
+};
+
+static const struct attribute_group cgroup_sysfs_attr_group = {
+       .attrs = cgroup_sysfs_attrs,
+       .name = "cgroup",
+};
+
+static int __init cgroup_sysfs_init(void)
+{
+       return sysfs_create_group(kernel_kobj, &cgroup_sysfs_attr_group);
+}
+subsys_initcall(cgroup_sysfs_init);
+#endif /* CONFIG_SYSFS */
diff --git a/kernel/cgroup/stat.c b/kernel/cgroup/stat.c
new file mode 100644 (file)
index 0000000..133b465
--- /dev/null
@@ -0,0 +1,334 @@
+#include "cgroup-internal.h"
+
+#include <linux/sched/cputime.h>
+
+static DEFINE_MUTEX(cgroup_stat_mutex);
+static DEFINE_PER_CPU(raw_spinlock_t, cgroup_cpu_stat_lock);
+
+static struct cgroup_cpu_stat *cgroup_cpu_stat(struct cgroup *cgrp, int cpu)
+{
+       return per_cpu_ptr(cgrp->cpu_stat, cpu);
+}
+
+/**
+ * cgroup_cpu_stat_updated - keep track of updated cpu_stat
+ * @cgrp: target cgroup
+ * @cpu: cpu on which cpu_stat was updated
+ *
+ * @cgrp's cpu_stat on @cpu was updated.  Put it on the parent's matching
+ * cpu_stat->updated_children list.  See the comment on top of
+ * cgroup_cpu_stat definition for details.
+ */
+static void cgroup_cpu_stat_updated(struct cgroup *cgrp, int cpu)
+{
+       raw_spinlock_t *cpu_lock = per_cpu_ptr(&cgroup_cpu_stat_lock, cpu);
+       struct cgroup *parent;
+       unsigned long flags;
+
+       /*
+        * Speculative already-on-list test.  This may race leading to
+        * temporary inaccuracies, which is fine.
+        *
+        * Because @parent's updated_children is terminated with @parent
+        * instead of NULL, we can tell whether @cgrp is on the list by
+        * testing the next pointer for NULL.
+        */
+       if (cgroup_cpu_stat(cgrp, cpu)->updated_next)
+               return;
+
+       raw_spin_lock_irqsave(cpu_lock, flags);
+
+       /* put @cgrp and all ancestors on the corresponding updated lists */
+       for (parent = cgroup_parent(cgrp); parent;
+            cgrp = parent, parent = cgroup_parent(cgrp)) {
+               struct cgroup_cpu_stat *cstat = cgroup_cpu_stat(cgrp, cpu);
+               struct cgroup_cpu_stat *pcstat = cgroup_cpu_stat(parent, cpu);
+
+               /*
+                * Both additions and removals are bottom-up.  If a cgroup
+                * is already in the tree, all ancestors are.
+                */
+               if (cstat->updated_next)
+                       break;
+
+               cstat->updated_next = pcstat->updated_children;
+               pcstat->updated_children = cgrp;
+       }
+
+       raw_spin_unlock_irqrestore(cpu_lock, flags);
+}
+
+/**
+ * cgroup_cpu_stat_pop_updated - iterate and dismantle cpu_stat updated tree
+ * @pos: current position
+ * @root: root of the tree to traverse
+ * @cpu: target cpu
+ *
+ * Walks the updated cpu_stat tree on @cpu from @root.  %NULL @pos starts
+ * the traversal and %NULL return indicates the end.  During traversal,
+ * each returned cgroup is unlinked from the tree.  Must be called with the
+ * matching cgroup_cpu_stat_lock held.
+ *
+ * The only ordering guarantee is that, for a parent and a child pair
+ * covered by a given traversal, if a child is visited, its parent is
+ * guaranteed to be visited afterwards.
+ */
+static struct cgroup *cgroup_cpu_stat_pop_updated(struct cgroup *pos,
+                                                 struct cgroup *root, int cpu)
+{
+       struct cgroup_cpu_stat *cstat;
+       struct cgroup *parent;
+
+       if (pos == root)
+               return NULL;
+
+       /*
+        * We're going to walk down to the first leaf and visit/remove it.
+        * We can pick any unvisited node as the starting point.
+        */
+       if (!pos)
+               pos = root;
+       else
+               pos = cgroup_parent(pos);
+
+       /* walk down to the first leaf */
+       while (true) {
+               cstat = cgroup_cpu_stat(pos, cpu);
+               if (cstat->updated_children == pos)
+                       break;
+               pos = cstat->updated_children;
+       }
+
+       /*
+        * Unlink @pos from the tree.  As the updated_children list is
+        * singly linked, we have to walk it to find the removal point.
+        * However, due to the way we traverse, @pos will be the first
+        * child in most cases. The only exception is @root.
+        */
+       parent = cgroup_parent(pos);
+       if (parent && cstat->updated_next) {
+               struct cgroup_cpu_stat *pcstat = cgroup_cpu_stat(parent, cpu);
+               struct cgroup_cpu_stat *ncstat;
+               struct cgroup **nextp;
+
+               nextp = &pcstat->updated_children;
+               while (true) {
+                       ncstat = cgroup_cpu_stat(*nextp, cpu);
+                       if (*nextp == pos)
+                               break;
+
+                       WARN_ON_ONCE(*nextp == parent);
+                       nextp = &ncstat->updated_next;
+               }
+
+               *nextp = cstat->updated_next;
+               cstat->updated_next = NULL;
+       }
+
+       return pos;
+}
+
+static void cgroup_stat_accumulate(struct cgroup_stat *dst_stat,
+                                  struct cgroup_stat *src_stat)
+{
+       dst_stat->cputime.utime += src_stat->cputime.utime;
+       dst_stat->cputime.stime += src_stat->cputime.stime;
+       dst_stat->cputime.sum_exec_runtime += src_stat->cputime.sum_exec_runtime;
+}
+
+static void cgroup_cpu_stat_flush_one(struct cgroup *cgrp, int cpu)
+{
+       struct cgroup *parent = cgroup_parent(cgrp);
+       struct cgroup_cpu_stat *cstat = cgroup_cpu_stat(cgrp, cpu);
+       struct task_cputime *last_cputime = &cstat->last_cputime;
+       struct task_cputime cputime;
+       struct cgroup_stat delta;
+       unsigned seq;
+
+       lockdep_assert_held(&cgroup_stat_mutex);
+
+       /* fetch the current per-cpu values */
+       do {
+               seq = __u64_stats_fetch_begin(&cstat->sync);
+               cputime = cstat->cputime;
+       } while (__u64_stats_fetch_retry(&cstat->sync, seq));
+
+       /* accumulate the deltas to propagate */
+       delta.cputime.utime = cputime.utime - last_cputime->utime;
+       delta.cputime.stime = cputime.stime - last_cputime->stime;
+       delta.cputime.sum_exec_runtime = cputime.sum_exec_runtime -
+                                        last_cputime->sum_exec_runtime;
+       *last_cputime = cputime;
+
+       /* transfer the pending stat into delta */
+       cgroup_stat_accumulate(&delta, &cgrp->pending_stat);
+       memset(&cgrp->pending_stat, 0, sizeof(cgrp->pending_stat));
+
+       /* propagate delta into the global stat and the parent's pending */
+       cgroup_stat_accumulate(&cgrp->stat, &delta);
+       if (parent)
+               cgroup_stat_accumulate(&parent->pending_stat, &delta);
+}
+
+/* see cgroup_stat_flush() */
+static void cgroup_stat_flush_locked(struct cgroup *cgrp)
+{
+       int cpu;
+
+       lockdep_assert_held(&cgroup_stat_mutex);
+
+       for_each_possible_cpu(cpu) {
+               raw_spinlock_t *cpu_lock = per_cpu_ptr(&cgroup_cpu_stat_lock, cpu);
+               struct cgroup *pos = NULL;
+
+               raw_spin_lock_irq(cpu_lock);
+               while ((pos = cgroup_cpu_stat_pop_updated(pos, cgrp, cpu)))
+                       cgroup_cpu_stat_flush_one(pos, cpu);
+               raw_spin_unlock_irq(cpu_lock);
+       }
+}
+
+/**
+ * cgroup_stat_flush - flush stats in @cgrp's subtree
+ * @cgrp: target cgroup
+ *
+ * Collect all per-cpu stats in @cgrp's subtree into the global counters
+ * and propagate them upwards.  After this function returns, all cgroups in
+ * the subtree have up-to-date ->stat.
+ *
+ * This also gets all cgroups in the subtree including @cgrp off the
+ * ->updated_children lists.
+ */
+void cgroup_stat_flush(struct cgroup *cgrp)
+{
+       mutex_lock(&cgroup_stat_mutex);
+       cgroup_stat_flush_locked(cgrp);
+       mutex_unlock(&cgroup_stat_mutex);
+}
+
+static struct cgroup_cpu_stat *cgroup_cpu_stat_account_begin(struct cgroup *cgrp)
+{
+       struct cgroup_cpu_stat *cstat;
+
+       cstat = get_cpu_ptr(cgrp->cpu_stat);
+       u64_stats_update_begin(&cstat->sync);
+       return cstat;
+}
+
+static void cgroup_cpu_stat_account_end(struct cgroup *cgrp,
+                                       struct cgroup_cpu_stat *cstat)
+{
+       u64_stats_update_end(&cstat->sync);
+       cgroup_cpu_stat_updated(cgrp, smp_processor_id());
+       put_cpu_ptr(cstat);
+}
+
+void __cgroup_account_cputime(struct cgroup *cgrp, u64 delta_exec)
+{
+       struct cgroup_cpu_stat *cstat;
+
+       cstat = cgroup_cpu_stat_account_begin(cgrp);
+       cstat->cputime.sum_exec_runtime += delta_exec;
+       cgroup_cpu_stat_account_end(cgrp, cstat);
+}
+
+void __cgroup_account_cputime_field(struct cgroup *cgrp,
+                                   enum cpu_usage_stat index, u64 delta_exec)
+{
+       struct cgroup_cpu_stat *cstat;
+
+       cstat = cgroup_cpu_stat_account_begin(cgrp);
+
+       switch (index) {
+       case CPUTIME_USER:
+       case CPUTIME_NICE:
+               cstat->cputime.utime += delta_exec;
+               break;
+       case CPUTIME_SYSTEM:
+       case CPUTIME_IRQ:
+       case CPUTIME_SOFTIRQ:
+               cstat->cputime.stime += delta_exec;
+               break;
+       default:
+               break;
+       }
+
+       cgroup_cpu_stat_account_end(cgrp, cstat);
+}
+
+void cgroup_stat_show_cputime(struct seq_file *seq)
+{
+       struct cgroup *cgrp = seq_css(seq)->cgroup;
+       u64 usage, utime, stime;
+
+       if (!cgroup_parent(cgrp))
+               return;
+
+       mutex_lock(&cgroup_stat_mutex);
+
+       cgroup_stat_flush_locked(cgrp);
+
+       usage = cgrp->stat.cputime.sum_exec_runtime;
+       cputime_adjust(&cgrp->stat.cputime, &cgrp->stat.prev_cputime,
+                      &utime, &stime);
+
+       mutex_unlock(&cgroup_stat_mutex);
+
+       do_div(usage, NSEC_PER_USEC);
+       do_div(utime, NSEC_PER_USEC);
+       do_div(stime, NSEC_PER_USEC);
+
+       seq_printf(seq, "usage_usec %llu\n"
+                  "user_usec %llu\n"
+                  "system_usec %llu\n",
+                  usage, utime, stime);
+}
+
+int cgroup_stat_init(struct cgroup *cgrp)
+{
+       int cpu;
+
+       /* the root cgrp has cpu_stat preallocated */
+       if (!cgrp->cpu_stat) {
+               cgrp->cpu_stat = alloc_percpu(struct cgroup_cpu_stat);
+               if (!cgrp->cpu_stat)
+                       return -ENOMEM;
+       }
+
+       /* ->updated_children list is self-terminated */
+       for_each_possible_cpu(cpu)
+               cgroup_cpu_stat(cgrp, cpu)->updated_children = cgrp;
+
+       prev_cputime_init(&cgrp->stat.prev_cputime);
+
+       return 0;
+}
+
+void cgroup_stat_exit(struct cgroup *cgrp)
+{
+       int cpu;
+
+       cgroup_stat_flush(cgrp);
+
+       /* sanity check */
+       for_each_possible_cpu(cpu) {
+               struct cgroup_cpu_stat *cstat = cgroup_cpu_stat(cgrp, cpu);
+
+               if (WARN_ON_ONCE(cstat->updated_children != cgrp) ||
+                   WARN_ON_ONCE(cstat->updated_next))
+                       return;
+       }
+
+       free_percpu(cgrp->cpu_stat);
+       cgrp->cpu_stat = NULL;
+}
+
+void __init cgroup_stat_boot(void)
+{
+       int cpu;
+
+       for_each_possible_cpu(cpu)
+               raw_spin_lock_init(per_cpu_ptr(&cgroup_cpu_stat_lock, cpu));
+
+       BUG_ON(cgroup_stat_init(&cgrp_dfl_root.cgrp));
+}
index 5b82a00735325ba75ca9a93095953a28cc2b7257..a092f350f3a21ad31c359c704eff65990613853f 100644 (file)
@@ -6620,7 +6620,7 @@ static int __cfs_schedulable(struct task_group *tg, u64 period, u64 quota)
        return ret;
 }
 
-static int cpu_stats_show(struct seq_file *sf, void *v)
+static int cpu_cfs_stat_show(struct seq_file *sf, void *v)
 {
        struct task_group *tg = css_tg(seq_css(sf));
        struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
@@ -6660,7 +6660,7 @@ static u64 cpu_rt_period_read_uint(struct cgroup_subsys_state *css,
 }
 #endif /* CONFIG_RT_GROUP_SCHED */
 
-static struct cftype cpu_files[] = {
+static struct cftype cpu_legacy_files[] = {
 #ifdef CONFIG_FAIR_GROUP_SCHED
        {
                .name = "shares",
@@ -6681,7 +6681,7 @@ static struct cftype cpu_files[] = {
        },
        {
                .name = "stat",
-               .seq_show = cpu_stats_show,
+               .seq_show = cpu_cfs_stat_show,
        },
 #endif
 #ifdef CONFIG_RT_GROUP_SCHED
@@ -6699,16 +6699,182 @@ static struct cftype cpu_files[] = {
        { }     /* Terminate */
 };
 
+static int cpu_extra_stat_show(struct seq_file *sf,
+                              struct cgroup_subsys_state *css)
+{
+#ifdef CONFIG_CFS_BANDWIDTH
+       {
+               struct task_group *tg = css_tg(css);
+               struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
+               u64 throttled_usec;
+
+               throttled_usec = cfs_b->throttled_time;
+               do_div(throttled_usec, NSEC_PER_USEC);
+
+               seq_printf(sf, "nr_periods %d\n"
+                          "nr_throttled %d\n"
+                          "throttled_usec %llu\n",
+                          cfs_b->nr_periods, cfs_b->nr_throttled,
+                          throttled_usec);
+       }
+#endif
+       return 0;
+}
+
+#ifdef CONFIG_FAIR_GROUP_SCHED
+static u64 cpu_weight_read_u64(struct cgroup_subsys_state *css,
+                              struct cftype *cft)
+{
+       struct task_group *tg = css_tg(css);
+       u64 weight = scale_load_down(tg->shares);
+
+       return DIV_ROUND_CLOSEST_ULL(weight * CGROUP_WEIGHT_DFL, 1024);
+}
+
+static int cpu_weight_write_u64(struct cgroup_subsys_state *css,
+                               struct cftype *cft, u64 weight)
+{
+       /*
+        * cgroup weight knobs should use the common MIN, DFL and MAX
+        * values which are 1, 100 and 10000 respectively.  While it loses
+        * a bit of range on both ends, it maps pretty well onto the shares
+        * value used by scheduler and the round-trip conversions preserve
+        * value used by the scheduler, and the round-trip conversions preserve
+        */
+       if (weight < CGROUP_WEIGHT_MIN || weight > CGROUP_WEIGHT_MAX)
+               return -ERANGE;
+
+       weight = DIV_ROUND_CLOSEST_ULL(weight * 1024, CGROUP_WEIGHT_DFL);
+
+       return sched_group_set_shares(css_tg(css), scale_load(weight));
+}
+
+static s64 cpu_weight_nice_read_s64(struct cgroup_subsys_state *css,
+                                   struct cftype *cft)
+{
+       unsigned long weight = scale_load_down(css_tg(css)->shares);
+       int last_delta = INT_MAX;
+       int prio, delta;
+
+       /* find the closest nice value to the current weight */
+       for (prio = 0; prio < ARRAY_SIZE(sched_prio_to_weight); prio++) {
+               delta = abs(sched_prio_to_weight[prio] - weight);
+               if (delta >= last_delta)
+                       break;
+               last_delta = delta;
+       }
+
+       return PRIO_TO_NICE(prio - 1 + MAX_RT_PRIO);
+}
+
+static int cpu_weight_nice_write_s64(struct cgroup_subsys_state *css,
+                                    struct cftype *cft, s64 nice)
+{
+       unsigned long weight;
+
+       if (nice < MIN_NICE || nice > MAX_NICE)
+               return -ERANGE;
+
+       weight = sched_prio_to_weight[NICE_TO_PRIO(nice) - MAX_RT_PRIO];
+       return sched_group_set_shares(css_tg(css), scale_load(weight));
+}
+#endif
+
+static void __maybe_unused cpu_period_quota_print(struct seq_file *sf,
+                                                 long period, long quota)
+{
+       if (quota < 0)
+               seq_puts(sf, "max");
+       else
+               seq_printf(sf, "%ld", quota);
+
+       seq_printf(sf, " %ld\n", period);
+}
+
+/* caller should put the current value in *@periodp before calling */
+static int __maybe_unused cpu_period_quota_parse(char *buf,
+                                                u64 *periodp, u64 *quotap)
+{
+       char tok[21];   /* U64_MAX */
+
+       if (!sscanf(buf, "%s %llu", tok, periodp))
+               return -EINVAL;
+
+       *periodp *= NSEC_PER_USEC;
+
+       if (sscanf(tok, "%llu", quotap))
+               *quotap *= NSEC_PER_USEC;
+       else if (!strcmp(tok, "max"))
+               *quotap = RUNTIME_INF;
+       else
+               return -EINVAL;
+
+       return 0;
+}
+
+#ifdef CONFIG_CFS_BANDWIDTH
+static int cpu_max_show(struct seq_file *sf, void *v)
+{
+       struct task_group *tg = css_tg(seq_css(sf));
+
+       cpu_period_quota_print(sf, tg_get_cfs_period(tg), tg_get_cfs_quota(tg));
+       return 0;
+}
+
+static ssize_t cpu_max_write(struct kernfs_open_file *of,
+                            char *buf, size_t nbytes, loff_t off)
+{
+       struct task_group *tg = css_tg(of_css(of));
+       u64 period = tg_get_cfs_period(tg);
+       u64 quota;
+       int ret;
+
+       ret = cpu_period_quota_parse(buf, &period, &quota);
+       if (!ret)
+               ret = tg_set_cfs_bandwidth(tg, period, quota);
+       return ret ?: nbytes;
+}
+#endif
+
+static struct cftype cpu_files[] = {
+#ifdef CONFIG_FAIR_GROUP_SCHED
+       {
+               .name = "weight",
+               .flags = CFTYPE_NOT_ON_ROOT,
+               .read_u64 = cpu_weight_read_u64,
+               .write_u64 = cpu_weight_write_u64,
+       },
+       {
+               .name = "weight.nice",
+               .flags = CFTYPE_NOT_ON_ROOT,
+               .read_s64 = cpu_weight_nice_read_s64,
+               .write_s64 = cpu_weight_nice_write_s64,
+       },
+#endif
+#ifdef CONFIG_CFS_BANDWIDTH
+       {
+               .name = "max",
+               .flags = CFTYPE_NOT_ON_ROOT,
+               .seq_show = cpu_max_show,
+               .write = cpu_max_write,
+       },
+#endif
+       { }     /* terminate */
+};
+
 struct cgroup_subsys cpu_cgrp_subsys = {
        .css_alloc      = cpu_cgroup_css_alloc,
        .css_online     = cpu_cgroup_css_online,
        .css_released   = cpu_cgroup_css_released,
        .css_free       = cpu_cgroup_css_free,
+       .css_extra_stat_show = cpu_extra_stat_show,
        .fork           = cpu_cgroup_fork,
        .can_attach     = cpu_cgroup_can_attach,
        .attach         = cpu_cgroup_attach,
-       .legacy_cftypes = cpu_files,
+       .legacy_cftypes = cpu_legacy_files,
+       .dfl_cftypes    = cpu_files,
        .early_init     = true,
+       .threaded       = true,
 };
 
 #endif /* CONFIG_CGROUP_SCHED */
diff --git a/kernel/sched/cpuacct.h b/kernel/sched/cpuacct.h
deleted file mode 100644 (file)
index a8358a5..0000000
+++ /dev/null
@@ -1,18 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifdef CONFIG_CGROUP_CPUACCT
-
-extern void cpuacct_charge(struct task_struct *tsk, u64 cputime);
-extern void cpuacct_account_field(struct task_struct *tsk, int index, u64 val);
-
-#else
-
-static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime)
-{
-}
-
-static inline void
-cpuacct_account_field(struct task_struct *tsk, int index, u64 val)
-{
-}
-
-#endif
index 9be8b68a66da0cf5a334f2cd38d915b31a2a4b65..bac6ac9a4ec7068e11e5b35fdea9a3f6a43fd490 100644 (file)
@@ -109,7 +109,7 @@ static inline void task_group_account_field(struct task_struct *p, int index,
         */
        __this_cpu_add(kernel_cpustat.cpustat[index], tmp);
 
-       cpuacct_account_field(p, index, tmp);
+       cgroup_account_cputime_field(p, index, tmp);
 }
 
 /*
@@ -446,6 +446,13 @@ void vtime_account_irq_enter(struct task_struct *tsk)
 EXPORT_SYMBOL_GPL(vtime_account_irq_enter);
 #endif /* __ARCH_HAS_VTIME_ACCOUNT */
 
+void cputime_adjust(struct task_cputime *curr, struct prev_cputime *prev,
+                   u64 *ut, u64 *st)
+{
+       *ut = curr->utime;
+       *st = curr->stime;
+}
+
 void task_cputime_adjusted(struct task_struct *p, u64 *ut, u64 *st)
 {
        *ut = p->utime;
@@ -584,9 +591,8 @@ drop_precision:
  *
  * Assuming that rtime_i+1 >= rtime_i.
  */
-static void cputime_adjust(struct task_cputime *curr,
-                          struct prev_cputime *prev,
-                          u64 *ut, u64 *st)
+void cputime_adjust(struct task_cputime *curr, struct prev_cputime *prev,
+                   u64 *ut, u64 *st)
 {
        u64 rtime, stime, utime;
        unsigned long flags;
index f349f7e98deca60b1b26ab10a0be36719866cd1e..2473736c7616dd3810b5295a8a5fe4bc4c16b92c 100644 (file)
@@ -1144,7 +1144,7 @@ static void update_curr_dl(struct rq *rq)
        account_group_exec_runtime(curr, delta_exec);
 
        curr->se.exec_start = rq_clock_task(rq);
-       cpuacct_charge(curr, delta_exec);
+       cgroup_account_cputime(curr, delta_exec);
 
        sched_rt_avg_update(rq, delta_exec);
 
index 0989676c50e92df396e8a4ef6ccf3545a1e057fe..4037e19bbca25939f0dd57b05f8fb25de8a90908 100644 (file)
@@ -844,7 +844,7 @@ static void update_curr(struct cfs_rq *cfs_rq)
                struct task_struct *curtask = task_of(curr);
 
                trace_sched_stat_runtime(curtask, delta_exec, curr->vruntime);
-               cpuacct_charge(curtask, delta_exec);
+               cgroup_account_cputime(curtask, delta_exec);
                account_group_exec_runtime(curtask, delta_exec);
        }
 
index d8c43d73e078806ac468450732b76282fb13f798..4056c19ca3f00efbc7592a1b4b071426fabf2124 100644 (file)
@@ -969,7 +969,7 @@ static void update_curr_rt(struct rq *rq)
        account_group_exec_runtime(curr, delta_exec);
 
        curr->se.exec_start = rq_clock_task(rq);
-       cpuacct_charge(curr, delta_exec);
+       cgroup_account_cputime(curr, delta_exec);
 
        sched_rt_avg_update(rq, delta_exec);
 
index 45ab0bf564e7abde39013518754e96c3e72f5c3e..b19552a212de379f8a06589249fd0d78af4482dc 100644 (file)
@@ -30,6 +30,7 @@
 #include <linux/irq_work.h>
 #include <linux/tick.h>
 #include <linux/slab.h>
+#include <linux/cgroup.h>
 
 #ifdef CONFIG_PARAVIRT
 #include <asm/paravirt.h>
@@ -37,7 +38,6 @@
 
 #include "cpupri.h"
 #include "cpudeadline.h"
-#include "cpuacct.h"
 
 #ifdef CONFIG_SCHED_DEBUG
 # define SCHED_WARN_ON(x)      WARN_ONCE(x, #x)
index 45caf90b24cd9693a72b943220ecd0176580f748..210b1f2146ff2f44b7ee1021b3a99b262857c841 100644 (file)
@@ -72,7 +72,7 @@ static void put_prev_task_stop(struct rq *rq, struct task_struct *prev)
        account_group_exec_runtime(curr, delta_exec);
 
        curr->se.exec_start = rq_clock_task(rq);
-       cpuacct_charge(curr, delta_exec);
+       cgroup_account_cputime(curr, delta_exec);
 }
 
 static void task_tick_stop(struct rq *rq, struct task_struct *curr, int queued)