[linux-2.6-block.git] / kernel / cgroup / rstat.c

// SPDX-License-Identifier: GPL-2.0-only
#include "cgroup-internal.h"

#include <linux/sched/cputime.h>

#include <linux/bpf.h>
#include <linux/btf.h>
#include <linux/btf_ids.h>

/*
 * Global flush lock.  Taken by cgroup_rstat_flush(),
 * cgroup_rstat_flush_irqsafe() and cgroup_rstat_flush_hold(), and asserted
 * held by cgroup_rstat_flush_locked().
 */
static DEFINE_SPINLOCK(cgroup_rstat_lock);
/*
 * Per-cpu lock.  Held while a cpu's updated lists are modified by
 * cgroup_rstat_updated() or dismantled by cgroup_rstat_flush_locked().
 */
static DEFINE_PER_CPU(raw_spinlock_t, cgroup_rstat_cpu_lock);

/* defined further down; used by cgroup_rstat_flush_locked() */
static void cgroup_base_stat_flush(struct cgroup *cgrp, int cpu);

/* Look up the rstat_cpu slot that @cgrp keeps for @cpu. */
static struct cgroup_rstat_cpu *cgroup_rstat_cpu(struct cgroup *cgrp, int cpu)
{
	struct cgroup_rstat_cpu *rstatc = per_cpu_ptr(cgrp->rstat_cpu, cpu);

	return rstatc;
}

/**
 * cgroup_rstat_updated - record that a cgroup's rstat_cpu has pending data
 * @cgrp: cgroup whose per-cpu stats changed
 * @cpu: cpu whose rstat_cpu instance was modified
 *
 * Links @cgrp, and every ancestor that is not linked yet, onto the
 * updated_children list of its parent's rstat_cpu for @cpu.  See the
 * comment on top of cgroup_rstat_cpu definition for details.
 */
__bpf_kfunc void cgroup_rstat_updated(struct cgroup *cgrp, int cpu)
{
	raw_spinlock_t *cpu_lock = per_cpu_ptr(&cgroup_rstat_cpu_lock, cpu);
	unsigned long flags;

	/*
	 * Lockless fast path: a non-NULL updated_next means @cgrp is already
	 * linked (lists are terminated with the parent, never with NULL).
	 * The check can race, which only causes temporary inaccuracies and
	 * is acceptable.
	 */
	if (data_race(cgroup_rstat_cpu(cgrp, cpu)->updated_next))
		return;

	raw_spin_lock_irqsave(cpu_lock, flags);

	/* climb towards the root, linking each level that is still off-list */
	for (;;) {
		struct cgroup_rstat_cpu *rstatc = cgroup_rstat_cpu(cgrp, cpu);
		struct cgroup_rstat_cpu *prstatc;
		struct cgroup *parent;

		/*
		 * Linking and unlinking both proceed bottom-up, so a linked
		 * cgroup implies that all of its ancestors are linked too.
		 */
		if (rstatc->updated_next)
			break;

		parent = cgroup_parent(cgrp);
		if (!parent) {
			/* the root has no list to join; self-link marks it busy */
			rstatc->updated_next = cgrp;
			break;
		}

		/* push @cgrp at the head of the parent's updated_children */
		prstatc = cgroup_rstat_cpu(parent, cpu);
		rstatc->updated_next = prstatc->updated_children;
		prstatc->updated_children = cgrp;

		cgrp = parent;
	}

	raw_spin_unlock_irqrestore(cpu_lock, flags);
}

/**
 * cgroup_rstat_cpu_pop_updated - iterate and dismantle rstat_cpu updated tree
 * @pos: current position
 * @root: root of the tree to traverse
 * @cpu: target cpu
 *
 * Walks the updated rstat_cpu tree on @cpu from @root.  %NULL @pos starts
 * the traversal and %NULL return indicates the end.  During traversal,
 * each returned cgroup is unlinked from the tree.  Must be called with the
 * matching cgroup_rstat_cpu_lock held.
 *
 * The only ordering guarantee is that, for a parent and a child pair
 * covered by a given traversal, if a child is visited, its parent is
 * guaranteed to be visited afterwards.
 */
static struct cgroup *cgroup_rstat_cpu_pop_updated(struct cgroup *pos,
						   struct cgroup *root, int cpu)
{
	struct cgroup_rstat_cpu *rstatc;
	struct cgroup *parent;

	/* the previous call handed out @root itself, so nothing is left */
	if (pos == root)
		return NULL;

	/*
	 * We're gonna walk down to the first leaf and visit/remove it.  We
	 * can pick whatever unvisited node as the starting point.
	 */
	if (!pos) {
		pos = root;
		/* return NULL if this subtree is not on-list */
		if (!cgroup_rstat_cpu(pos, cpu)->updated_next)
			return NULL;
	} else {
		/* resume from the parent of the node returned last time */
		pos = cgroup_parent(pos);
	}

	/*
	 * walk down to the first leaf: updated_children pointing back at the
	 * cgroup itself means it has no updated children
	 */
	while (true) {
		rstatc = cgroup_rstat_cpu(pos, cpu);
		if (rstatc->updated_children == pos)
			break;
		pos = rstatc->updated_children;
	}

	/*
	 * Unlink @pos from the tree.  As the updated_children list is
	 * singly linked, we have to walk it to find the removal point.
	 * However, due to the way we traverse, @pos will be the first
	 * child in most cases. The only exception is @root.
	 */
	parent = cgroup_parent(pos);
	if (parent) {
		struct cgroup_rstat_cpu *prstatc;
		struct cgroup **nextp;

		prstatc = cgroup_rstat_cpu(parent, cpu);
		nextp = &prstatc->updated_children;
		/* find the link that currently points at @pos */
		while (*nextp != pos) {
			struct cgroup_rstat_cpu *nrstatc;

			nrstatc = cgroup_rstat_cpu(*nextp, cpu);
			/* hitting the terminator means @pos was not on the list */
			WARN_ON_ONCE(*nextp == parent);
			nextp = &nrstatc->updated_next;
		}
		/* splice @pos out by pointing that link at @pos's successor */
		*nextp = rstatc->updated_next;
	}

	/* NULL updated_next is what marks @pos as off-list */
	rstatc->updated_next = NULL;
	return pos;
}

/*
 * A hook for bpf stat collectors to attach to and flush their stats.
 * Together with providing bpf kfuncs for cgroup_rstat_updated() and
 * cgroup_rstat_flush(), this enables a complete workflow where bpf progs that
 * collect cgroup stats can integrate with rstat for efficient flushing.
 *
 * cgroup_rstat_flush_locked() calls this once for every cgroup it pops off
 * a cpu's updated tree, passing that cgroup as @cgrp, cgroup_parent() of it
 * as @parent and the cpu being flushed as @cpu.  The body here is
 * intentionally empty.
 *
 * A static noinline declaration here could cause the compiler to optimize away
 * the function. A global noinline declaration will keep the definition, but may
 * optimize away the callsite. Therefore, __weak is needed to ensure that the
 * call is still emitted, by telling the compiler that we don't know what the
 * function might eventually be.
 *
 * __diag_* below are needed to dismiss the missing prototype warning.
 */
__diag_push();
__diag_ignore_all("-Wmissing-prototypes",
		  "kfuncs which will be used in BPF programs");

__weak noinline void bpf_rstat_flush(struct cgroup *cgrp,
				     struct cgroup *parent, int cpu)
{
}

__diag_pop();

/*
 * Flush @cgrp's subtree with cgroup_rstat_lock already held; see
 * cgroup_rstat_flush().
 *
 * For every possible cpu, the updated cgroups under @cgrp are popped off
 * that cpu's updated tree one at a time and flushed: base stats first, then
 * the bpf hook, then every css on the cgroup's rstat_css_list.
 *
 * When @may_sleep is true, cgroup_rstat_lock may be dropped and re-acquired
 * between cpus, so the caller cannot assume it was held without interruption.
 */
static void cgroup_rstat_flush_locked(struct cgroup *cgrp, bool may_sleep)
	__releases(&cgroup_rstat_lock) __acquires(&cgroup_rstat_lock)
{
	int cpu;

	lockdep_assert_held(&cgroup_rstat_lock);

	for_each_possible_cpu(cpu) {
		raw_spinlock_t *cpu_lock = per_cpu_ptr(&cgroup_rstat_cpu_lock,
						       cpu);
		struct cgroup *pos = NULL;
		unsigned long flags;

		/*
		 * The _irqsave() is needed because cgroup_rstat_lock is
		 * spinlock_t which is a sleeping lock on PREEMPT_RT. Acquiring
		 * this lock with the _irq() suffix only disables interrupts on
		 * a non-PREEMPT_RT kernel. The raw_spinlock_t below disables
		 * interrupts on both configurations. The _irqsave() ensures
		 * that interrupts are always disabled and later restored.
		 */
		raw_spin_lock_irqsave(cpu_lock, flags);
		/* each iteration unlinks one cgroup, children before parents */
		while ((pos = cgroup_rstat_cpu_pop_updated(pos, cgrp, cpu))) {
			struct cgroup_subsys_state *css;

			cgroup_base_stat_flush(pos, cpu);
			bpf_rstat_flush(pos, cgroup_parent(pos), cpu);

			/* let every css registered on @pos flush its own stats */
			rcu_read_lock();
			list_for_each_entry_rcu(css, &pos->rstat_css_list,
						rstat_css_node)
				css->ss->css_rstat_flush(css, cpu);
			rcu_read_unlock();
		}
		raw_spin_unlock_irqrestore(cpu_lock, flags);

		/* if @may_sleep, play nice and yield if necessary */
		if (may_sleep && (need_resched() ||
				  spin_needbreak(&cgroup_rstat_lock))) {
			spin_unlock_irq(&cgroup_rstat_lock);
			if (!cond_resched())
				cpu_relax();
			spin_lock_irq(&cgroup_rstat_lock);
		}
	}
}

/**
 * cgroup_rstat_flush - flush stats in @cgrp's subtree
 * @cgrp: target cgroup
 *
 * Collect all per-cpu stats in @cgrp's subtree into the global counters
 * and propagate them upwards.  After this function returns, all cgroups in
 * the subtree have up-to-date ->stat.
 *
 * This also gets all cgroups in the subtree including @cgrp off the
 * ->updated_children lists.
 *
 * cgroup_rstat_lock is taken here with spin_lock_irq() and released before
 * returning; the flush itself may drop and re-take it between cpus.
 *
 * This function may block.
 */
__bpf_kfunc void cgroup_rstat_flush(struct cgroup *cgrp)
{
	might_sleep();

	spin_lock_irq(&cgroup_rstat_lock);
	/* may_sleep == true: the flush is allowed to yield between cpus */
	cgroup_rstat_flush_locked(cgrp, true);
	spin_unlock_irq(&cgroup_rstat_lock);
}

/**
 * cgroup_rstat_flush_irqsafe - flush @cgrp's subtree without ever yielding
 * @cgrp: target cgroup
 *
 * Same work as cgroup_rstat_flush(), but the interrupt state is saved and
 * restored around cgroup_rstat_lock and the flush is told not to sleep.
 *
 * This function can be called from any context.
 */
void cgroup_rstat_flush_irqsafe(struct cgroup *cgrp)
{
	unsigned long irq_flags;

	spin_lock_irqsave(&cgroup_rstat_lock, irq_flags);
	/* may_sleep == false: the lock is never dropped while flushing */
	cgroup_rstat_flush_locked(cgrp, false);
	spin_unlock_irqrestore(&cgroup_rstat_lock, irq_flags);
}

/**
 * cgroup_rstat_flush_hold - flush stats in @cgrp's subtree and hold
 * @cgrp: target cgroup
 *
 * Flush stats in @cgrp's subtree and prevent further flushes.  Must be
 * paired with cgroup_rstat_flush_release().
 *
 * Returns with cgroup_rstat_lock still held (taken via spin_lock_irq()),
 * which is what keeps other flushers out until the release call.
 *
 * This function may block.
 */
void cgroup_rstat_flush_hold(struct cgroup *cgrp)
	__acquires(&cgroup_rstat_lock)
{
	might_sleep();
	spin_lock_irq(&cgroup_rstat_lock);
	/* deliberately no unlock here: cgroup_rstat_flush_release() does it */
	cgroup_rstat_flush_locked(cgrp, true);
}

/**
 * cgroup_rstat_flush_release - release cgroup_rstat_flush_hold()
 *
 * Drops the cgroup_rstat_lock that cgroup_rstat_flush_hold() left held,
 * using spin_unlock_irq().
 */
void cgroup_rstat_flush_release(void)
	__releases(&cgroup_rstat_lock)
{
	spin_unlock_irq(&cgroup_rstat_lock);
}

/*
 * Set up @cgrp's per-cpu rstat state.
 *
 * Allocates the per-cpu area unless @cgrp already has one, then initializes
 * every possible cpu's instance.  Returns 0 on success or -ENOMEM if the
 * per-cpu allocation fails.
 */
int cgroup_rstat_init(struct cgroup *cgrp)
{
	struct cgroup_rstat_cpu *rstatc;
	int cpu;

	/* the root cgrp arrives with rstat_cpu preallocated */
	if (!cgrp->rstat_cpu)
		cgrp->rstat_cpu = alloc_percpu(struct cgroup_rstat_cpu);
	if (!cgrp->rstat_cpu)
		return -ENOMEM;

	for_each_possible_cpu(cpu) {
		rstatc = cgroup_rstat_cpu(cgrp, cpu);

		u64_stats_init(&rstatc->bsync);
		/* an empty ->updated_children list points back at its owner */
		rstatc->updated_children = cgrp;
	}

	return 0;
}

/*
 * Tear down @cgrp's per-cpu rstat state.
 *
 * Flushes @cgrp first, then verifies that no cpu still has it linked on an
 * updated list.  If any check trips, a one-time warning is emitted and the
 * per-cpu area is left allocated rather than freed.
 */
void cgroup_rstat_exit(struct cgroup *cgrp)
{
	int cpu;

	cgroup_rstat_flush(cgrp);

	/* after the flush every cpu must show an empty, unlinked entry */
	for_each_possible_cpu(cpu) {
		struct cgroup_rstat_cpu *rstatc = cgroup_rstat_cpu(cgrp, cpu);

		if (WARN_ON_ONCE(rstatc->updated_children != cgrp))
			return;
		if (WARN_ON_ONCE(rstatc->updated_next))
			return;
	}

	free_percpu(cgrp->rstat_cpu);
	cgrp->rstat_cpu = NULL;
}

/*
 * Boot-time setup: initialize the cgroup_rstat_cpu_lock instance of every
 * possible cpu.
 */
void __init cgroup_rstat_boot(void)
{
	int cpu;

	for_each_possible_cpu(cpu)
		raw_spin_lock_init(per_cpu_ptr(&cgroup_rstat_cpu_lock, cpu));
}

/*
 * Functions for cgroup basic resource statistics implemented on top of
 * rstat.
 */
static void cgroup_base_stat_add(struct cgroup_base_stat *dst_bstat,
				 struct cgroup_base_stat *src_bstat)
{
	dst_bstat->cputime.utime += src_bstat->cputime.utime;
	dst_bstat->cputime.stime += src_bstat->cputime.stime;
	dst_bstat->cputime.sum_exec_runtime += src_bstat->cputime.sum_exec_runtime;
#ifdef CONFIG_SCHED_CORE
	dst_bstat->forceidle_sum += src_bstat->forceidle_sum;
#endif
}

static void cgroup_base_stat_sub(struct cgroup_base_stat *dst_bstat,
				 struct cgroup_base_stat *src_bstat)
{
	dst_bstat->cputime.utime -= src_bstat->cputime.utime;
	dst_bstat->cputime.stime -= src_bstat->cputime.stime;
	dst_bstat->cputime.sum_exec_runtime -= src_bstat->cputime.sum_exec_runtime;
#ifdef CONFIG_SCHED_CORE
	dst_bstat->forceidle_sum -= src_bstat->forceidle_sum;
#endif
}

/*
 * Fold @cgrp's base stats for @cpu into @cgrp->bstat and pass the change on
 * to its parent.
 *
 * Two levels of "last seen" snapshots are kept so that only the difference
 * since the previous flush is added each time: rstatc->last_bstat for the
 * per-cpu counters and cgrp->last_bstat for what was already propagated to
 * the parent.  Nothing is done for the root cgroup, and nothing is
 * propagated into the root cgroup.
 */
static void cgroup_base_stat_flush(struct cgroup *cgrp, int cpu)
{
	struct cgroup_rstat_cpu *rstatc = cgroup_rstat_cpu(cgrp, cpu);
	struct cgroup *parent = cgroup_parent(cgrp);
	struct cgroup_base_stat delta;
	unsigned seq;

	/* Root-level stats are sourced from system-wide CPU stats */
	if (!parent)
		return;

	/* fetch the current per-cpu values */
	do {
		seq = __u64_stats_fetch_begin(&rstatc->bsync);
		delta = rstatc->bstat;
	} while (__u64_stats_fetch_retry(&rstatc->bsync, seq));

	/* propagate percpu delta to global */
	/* delta: snapshot -> change since the last flush of this cpu */
	cgroup_base_stat_sub(&delta, &rstatc->last_bstat);
	cgroup_base_stat_add(&cgrp->bstat, &delta);
	/* last_bstat now equals the snapshot taken above */
	cgroup_base_stat_add(&rstatc->last_bstat, &delta);

	/* propagate global delta to parent (unless that's root) */
	if (cgroup_parent(parent)) {
		delta = cgrp->bstat;
		/* delta: what @cgrp gained since it last reported upwards */
		cgroup_base_stat_sub(&delta, &cgrp->last_bstat);
		cgroup_base_stat_add(&parent->bstat, &delta);
		cgroup_base_stat_add(&cgrp->last_bstat, &delta);
	}
}

/*
 * Open an update of @cgrp's per-cpu base stats.
 *
 * Grabs the rstat_cpu instance with get_cpu_ptr() and enters its bsync write
 * section, storing the saved irq state in @flags.  The returned pointer must
 * be handed to cgroup_base_stat_cputime_account_end() together with *@flags.
 */
static struct cgroup_rstat_cpu *
cgroup_base_stat_cputime_account_begin(struct cgroup *cgrp, unsigned long *flags)
{
	struct cgroup_rstat_cpu *rstatc = get_cpu_ptr(cgrp->rstat_cpu);

	*flags = u64_stats_update_begin_irqsave(&rstatc->bsync);

	return rstatc;
}

/*
 * Finish an update started by cgroup_base_stat_cputime_account_begin():
 * leave the bsync write section restoring @flags, report @cgrp as updated
 * on the current cpu, then release @rstatc with put_cpu_ptr().
 */
static void cgroup_base_stat_cputime_account_end(struct cgroup *cgrp,
						 struct cgroup_rstat_cpu *rstatc,
						 unsigned long flags)
{
	u64_stats_update_end_irqrestore(&rstatc->bsync, flags);
	/*
	 * NOTE(review): done before put_cpu_ptr(), presumably so that
	 * smp_processor_id() still names the cpu @rstatc belongs to.
	 */
	cgroup_rstat_updated(cgrp, smp_processor_id());
	put_cpu_ptr(rstatc);
}

/*
 * Charge @delta_exec to the sum_exec_runtime counter of @cgrp's per-cpu
 * base stats and mark @cgrp as updated.
 *
 * NOTE(review): @delta_exec is presumably in nanoseconds, given that
 * cgroup_base_stat_cputime_show() divides by NSEC_PER_USEC - confirm.
 */
void __cgroup_account_cputime(struct cgroup *cgrp, u64 delta_exec)
{
	struct cgroup_rstat_cpu *rstatc;
	unsigned long flags;

	rstatc = cgroup_base_stat_cputime_account_begin(cgrp, &flags);
	rstatc->bstat.cputime.sum_exec_runtime += delta_exec;
	cgroup_base_stat_cputime_account_end(cgrp, rstatc, flags);
}

/*
 * Charge @delta_exec to the per-cpu base stat counter selected by @index
 * and mark @cgrp as updated.
 *
 * USER and NICE time go to utime; SYSTEM, IRQ and SOFTIRQ time go to stime;
 * FORCEIDLE time goes to forceidle_sum when CONFIG_SCHED_CORE is enabled.
 * Any other @index changes no counter, but @cgrp is still marked updated.
 */
void __cgroup_account_cputime_field(struct cgroup *cgrp,
				    enum cpu_usage_stat index, u64 delta_exec)
{
	struct cgroup_rstat_cpu *rstatc;
	unsigned long flags;

	rstatc = cgroup_base_stat_cputime_account_begin(cgrp, &flags);

	if (index == CPUTIME_USER || index == CPUTIME_NICE)
		rstatc->bstat.cputime.utime += delta_exec;
	else if (index == CPUTIME_SYSTEM || index == CPUTIME_IRQ ||
		 index == CPUTIME_SOFTIRQ)
		rstatc->bstat.cputime.stime += delta_exec;
#ifdef CONFIG_SCHED_CORE
	else if (index == CPUTIME_FORCEIDLE)
		rstatc->bstat.forceidle_sum += delta_exec;
#endif

	cgroup_base_stat_cputime_account_end(cgrp, rstatc, flags);
}

/*
 * compute the cputime for the root cgroup by getting the per cpu data
 * at a global level, then categorizing the fields in a manner consistent
 * with how it is done by __cgroup_account_cputime_field for each bit of
 * cpu time attributed to a cgroup.
 *
 * @bstat is an output-only argument: every field reported from it is reset
 * here before the per-cpu values are summed in, so the caller may pass an
 * uninitialized on-stack structure.
 */
static void root_cgroup_cputime(struct cgroup_base_stat *bstat)
{
	struct task_cputime *cputime = &bstat->cputime;
	int i;

	/*
	 * All totals below are built up with +=, so each of them has to
	 * start from zero - including forceidle_sum, which would otherwise
	 * carry whatever the caller's stack happened to contain.
	 */
	cputime->stime = 0;
	cputime->utime = 0;
	cputime->sum_exec_runtime = 0;
#ifdef CONFIG_SCHED_CORE
	bstat->forceidle_sum = 0;
#endif
	for_each_possible_cpu(i) {
		struct kernel_cpustat kcpustat;
		u64 *cpustat = kcpustat.cpustat;
		u64 user = 0;
		u64 sys = 0;

		kcpustat_cpu_fetch(&kcpustat, i);

		/* same grouping as __cgroup_account_cputime_field(): utime */
		user += cpustat[CPUTIME_USER];
		user += cpustat[CPUTIME_NICE];
		cputime->utime += user;

		/* same grouping as __cgroup_account_cputime_field(): stime */
		sys += cpustat[CPUTIME_SYSTEM];
		sys += cpustat[CPUTIME_IRQ];
		sys += cpustat[CPUTIME_SOFTIRQ];
		cputime->stime += sys;

		/* total usage is user + system + steal time */
		cputime->sum_exec_runtime += user;
		cputime->sum_exec_runtime += sys;
		cputime->sum_exec_runtime += cpustat[CPUTIME_STEAL];

#ifdef CONFIG_SCHED_CORE
		bstat->forceidle_sum += cpustat[CPUTIME_FORCEIDLE];
#endif
	}
}

/*
 * Print the cpu time of @seq's cgroup as usage_usec, user_usec and
 * system_usec lines, followed by core_sched.force_idle_usec when
 * CONFIG_SCHED_CORE is enabled.
 *
 * The root cgroup is served from root_cgroup_cputime().  Any other cgroup
 * is flushed first and its counters are read while the flush is held.
 */
void cgroup_base_stat_cputime_show(struct seq_file *seq)
{
	struct cgroup *cgrp = seq_css(seq)->cgroup;
	struct cgroup_base_stat bstat;
	u64 usage, utime, stime;
#ifdef CONFIG_SCHED_CORE
	u64 forceidle_time;
#endif

	if (!cgroup_parent(cgrp)) {
		/* root: taken from the system-wide per-cpu statistics */
		root_cgroup_cputime(&bstat);
		usage = bstat.cputime.sum_exec_runtime;
		utime = bstat.cputime.utime;
		stime = bstat.cputime.stime;
#ifdef CONFIG_SCHED_CORE
		forceidle_time = bstat.forceidle_sum;
#endif
	} else {
		/* non-root: read the freshly flushed counters under the hold */
		cgroup_rstat_flush_hold(cgrp);
		usage = cgrp->bstat.cputime.sum_exec_runtime;
		cputime_adjust(&cgrp->bstat.cputime, &cgrp->prev_cputime,
			       &utime, &stime);
#ifdef CONFIG_SCHED_CORE
		forceidle_time = cgrp->bstat.forceidle_sum;
#endif
		cgroup_rstat_flush_release();
	}

	/* scale by NSEC_PER_USEC to obtain the *_usec values printed below */
	do_div(usage, NSEC_PER_USEC);
	do_div(utime, NSEC_PER_USEC);
	do_div(stime, NSEC_PER_USEC);
#ifdef CONFIG_SCHED_CORE
	do_div(forceidle_time, NSEC_PER_USEC);
#endif

	seq_printf(seq, "usage_usec %llu\n"
		   "user_usec %llu\n"
		   "system_usec %llu\n",
		   usage, utime, stime);

#ifdef CONFIG_SCHED_CORE
	seq_printf(seq, "core_sched.force_idle_usec %llu\n", forceidle_time);
#endif
}

/*
 * Add bpf kfuncs for cgroup_rstat_updated() and cgroup_rstat_flush().
 * cgroup_rstat_flush() carries KF_SLEEPABLE, in line with the might_sleep()
 * at its entry; cgroup_rstat_updated() is listed without flags.
 */
BTF_SET8_START(bpf_rstat_kfunc_ids)
BTF_ID_FLAGS(func, cgroup_rstat_updated)
BTF_ID_FLAGS(func, cgroup_rstat_flush, KF_SLEEPABLE)
BTF_SET8_END(bpf_rstat_kfunc_ids)

/* wraps the id set above; passed to register_btf_kfunc_id_set() at init */
static const struct btf_kfunc_id_set bpf_rstat_kfunc_set = {
	.owner          = THIS_MODULE,
	.set            = &bpf_rstat_kfunc_ids,
};

/*
 * Register the rstat kfunc set for BPF_PROG_TYPE_TRACING programs.  Runs as
 * a late initcall and returns whatever the registration call returns.
 */
static int __init bpf_rstat_kfunc_init(void)
{
	int ret;

	ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_TRACING,
					&bpf_rstat_kfunc_set);
	return ret;
}
late_initcall(bpf_rstat_kfunc_init);
Commit	Line	Data
457c8996	1	// SPDX-License-Identifier: GPL-2.0-only
041cd640 TH	2	#include "cgroup-internal.h"
	3
	4	#include <linux/sched/cputime.h>
	5
a319185b YA	6	#include <linux/bpf.h>
	7	#include <linux/btf.h>
	8	#include <linux/btf_ids.h>
	9
0fa294fb	10	static DEFINE_SPINLOCK(cgroup_rstat_lock);
c58632b3	11	static DEFINE_PER_CPU(raw_spinlock_t, cgroup_rstat_cpu_lock);
041cd640	12
a17556f8 TH	13	static void cgroup_base_stat_flush(struct cgroup *cgrp, int cpu);
a17556f8 TH	14
c58632b3	15	static struct cgroup_rstat_cpu cgroup_rstat_cpu(struct cgroup cgrp, int cpu)
041cd640	16	{
c58632b3	17	return per_cpu_ptr(cgrp->rstat_cpu, cpu);
041cd640 TH	18	}
	19
	20	/**
6162cef0	21	* cgroup_rstat_updated - keep track of updated rstat_cpu
041cd640	22	* @cgrp: target cgroup
c58632b3	23	* @cpu: cpu on which rstat_cpu was updated
041cd640	24	*
c58632b3 TH	25	* @cgrp's rstat_cpu on @cpu was updated. Put it on the parent's matching
	26	* rstat_cpu->updated_children list. See the comment on top of
	27	* cgroup_rstat_cpu definition for details.
041cd640	28	*/
400031e0	29	__bpf_kfunc void cgroup_rstat_updated(struct cgroup *cgrp, int cpu)
041cd640	30	{
c58632b3	31	raw_spinlock_t *cpu_lock = per_cpu_ptr(&cgroup_rstat_cpu_lock, cpu);
041cd640 TH	32	unsigned long flags;
	33
	34	/*
d8ef4b38 TH	35	* Speculative already-on-list test. This may race leading to
	36	* temporary inaccuracies, which is fine.
	37	*
041cd640 TH	38	* Because @parent's updated_children is terminated with @parent
	39	* instead of NULL, we can tell whether @cgrp is on the list by
	40	* testing the next pointer for NULL.
	41	*/
eda09706	42	if (data_race(cgroup_rstat_cpu(cgrp, cpu)->updated_next))
041cd640 TH	43	return;
	44
	45	raw_spin_lock_irqsave(cpu_lock, flags);
	46
	47	/* put @cgrp and all ancestors on the corresponding updated lists */
dc26532a	48	while (true) {
c58632b3	49	struct cgroup_rstat_cpu *rstatc = cgroup_rstat_cpu(cgrp, cpu);
dc26532a JW	50	struct cgroup *parent = cgroup_parent(cgrp);
dc26532a JW	51	struct cgroup_rstat_cpu *prstatc;
041cd640 TH	52
	53	/*
	54	* Both additions and removals are bottom-up. If a cgroup
	55	* is already in the tree, all ancestors are.
	56	*/
c58632b3	57	if (rstatc->updated_next)
041cd640 TH	58	break;
041cd640 TH	59
dc26532a JW	60	/* Root has no parent to link it to, but mark it busy */
	61	if (!parent) {
	62	rstatc->updated_next = cgrp;
	63	break;
	64	}
	65
	66	prstatc = cgroup_rstat_cpu(parent, cpu);
c58632b3 TH	67	rstatc->updated_next = prstatc->updated_children;
c58632b3 TH	68	prstatc->updated_children = cgrp;
dc26532a JW	69
dc26532a JW	70	cgrp = parent;
041cd640 TH	71	}
	72
	73	raw_spin_unlock_irqrestore(cpu_lock, flags);
	74	}
	75
	76	/**
c58632b3	77	* cgroup_rstat_cpu_pop_updated - iterate and dismantle rstat_cpu updated tree
041cd640 TH	78	* @pos: current position
	79	* @root: root of the tree to traversal
	80	* @cpu: target cpu
	81	*
08b2b6fd	82	* Walks the updated rstat_cpu tree on @cpu from @root. %NULL @pos starts
041cd640 TH	83	* the traversal and %NULL return indicates the end. During traversal,
041cd640 TH	84	* each returned cgroup is unlinked from the tree. Must be called with the
c58632b3	85	* matching cgroup_rstat_cpu_lock held.
041cd640 TH	86	*
	87	* The only ordering guarantee is that, for a parent and a child pair
	88	* covered by a given traversal, if a child is visited, its parent is
	89	* guaranteed to be visited afterwards.
	90	*/
c58632b3 TH	91	static struct cgroup cgroup_rstat_cpu_pop_updated(struct cgroup pos,
c58632b3 TH	92	struct cgroup *root, int cpu)
041cd640	93	{
c58632b3	94	struct cgroup_rstat_cpu *rstatc;
f5f60d23	95	struct cgroup *parent;
041cd640 TH	96
	97	if (pos == root)
	98	return NULL;
	99
	100	/*
	101	* We're gonna walk down to the first leaf and visit/remove it. We
	102	* can pick whatever unvisited node as the starting point.
	103	*/
f5f60d23	104	if (!pos) {
041cd640	105	pos = root;
f5f60d23 WY	106	/* return NULL if this subtree is not on-list */
	107	if (!cgroup_rstat_cpu(pos, cpu)->updated_next)
	108	return NULL;
	109	} else {
041cd640	110	pos = cgroup_parent(pos);
f5f60d23	111	}
041cd640 TH	112
	113	/* walk down to the first leaf */
	114	while (true) {
c58632b3 TH	115	rstatc = cgroup_rstat_cpu(pos, cpu);
c58632b3 TH	116	if (rstatc->updated_children == pos)
041cd640	117	break;
c58632b3	118	pos = rstatc->updated_children;
041cd640 TH	119	}
	120
	121	/*
	122	* Unlink @pos from the tree. As the updated_children list is
	123	* singly linked, we have to walk it to find the removal point.
	124	* However, due to the way we traverse, @pos will be the first
	125	* child in most cases. The only exception is @root.
	126	*/
f5f60d23 WY	127	parent = cgroup_parent(pos);
	128	if (parent) {
	129	struct cgroup_rstat_cpu *prstatc;
	130	struct cgroup **nextp;
041cd640	131
f5f60d23 WY	132	prstatc = cgroup_rstat_cpu(parent, cpu);
	133	nextp = &prstatc->updated_children;
	134	while (*nextp != pos) {
	135	struct cgroup_rstat_cpu *nrstatc;
	136
	137	nrstatc = cgroup_rstat_cpu(*nextp, cpu);
	138	WARN_ON_ONCE(*nextp == parent);
	139	nextp = &nrstatc->updated_next;
	140	}
	141	*nextp = rstatc->updated_next;
041cd640 TH	142	}
041cd640 TH	143
f5f60d23 WY	144	rstatc->updated_next = NULL;
f5f60d23 WY	145	return pos;
041cd640 TH	146	}
041cd640 TH	147
a319185b YA	148	/*
	149	* A hook for bpf stat collectors to attach to and flush their stats.
	150	* Together with providing bpf kfuncs for cgroup_rstat_updated() and
	151	* cgroup_rstat_flush(), this enables a complete workflow where bpf progs that
	152	* collect cgroup stats can integrate with rstat for efficient flushing.
	153	*
	154	* A static noinline declaration here could cause the compiler to optimize away
	155	* the function. A global noinline declaration will keep the definition, but may
	156	* optimize away the callsite. Therefore, __weak is needed to ensure that the
	157	* call is still emitted, by telling the compiler that we don't know what the
	158	* function might eventually be.
	159	*
	160	* __diag_* below are needed to dismiss the missing prototype warning.
	161	*/
	162	__diag_push();
	163	__diag_ignore_all("-Wmissing-prototypes",
	164	"kfuncs which will be used in BPF programs");
	165
	166	__weak noinline void bpf_rstat_flush(struct cgroup *cgrp,
	167	struct cgroup *parent, int cpu)
	168	{
	169	}
	170
	171	__diag_pop();
	172
a17556f8	173	/* see cgroup_rstat_flush() */
0fa294fb TH	174	static void cgroup_rstat_flush_locked(struct cgroup *cgrp, bool may_sleep)
0fa294fb TH	175	__releases(&cgroup_rstat_lock) __acquires(&cgroup_rstat_lock)
a17556f8 TH	176	{
	177	int cpu;
	178
0fa294fb	179	lockdep_assert_held(&cgroup_rstat_lock);
a17556f8 TH	180
	181	for_each_possible_cpu(cpu) {
	182	raw_spinlock_t *cpu_lock = per_cpu_ptr(&cgroup_rstat_cpu_lock,
	183	cpu);
	184	struct cgroup *pos = NULL;
b1e2c8df	185	unsigned long flags;
a17556f8	186
b1e2c8df SAS	187	/*
	188	* The _irqsave() is needed because cgroup_rstat_lock is
	189	* spinlock_t which is a sleeping lock on PREEMPT_RT. Acquiring
	190	* this lock with the _irq() suffix only disables interrupts on
	191	* a non-PREEMPT_RT kernel. The raw_spinlock_t below disables
	192	* interrupts on both configurations. The _irqsave() ensures
	193	* that interrupts are always disabled and later restored.
	194	*/
	195	raw_spin_lock_irqsave(cpu_lock, flags);
8f53470b TH	196	while ((pos = cgroup_rstat_cpu_pop_updated(pos, cgrp, cpu))) {
	197	struct cgroup_subsys_state *css;
	198
a17556f8	199	cgroup_base_stat_flush(pos, cpu);
a319185b	200	bpf_rstat_flush(pos, cgroup_parent(pos), cpu);
8f53470b TH	201
	202	rcu_read_lock();
	203	list_for_each_entry_rcu(css, &pos->rstat_css_list,
	204	rstat_css_node)
	205	css->ss->css_rstat_flush(css, cpu);
	206	rcu_read_unlock();
	207	}
b1e2c8df	208	raw_spin_unlock_irqrestore(cpu_lock, flags);
0fa294fb TH	209
	210	/* if @may_sleep, play nice and yield if necessary */
	211	if (may_sleep && (need_resched() \|\|
	212	spin_needbreak(&cgroup_rstat_lock))) {
	213	spin_unlock_irq(&cgroup_rstat_lock);
	214	if (!cond_resched())
	215	cpu_relax();
	216	spin_lock_irq(&cgroup_rstat_lock);
	217	}
a17556f8 TH	218	}
	219	}
	220
	221	/**
	222	* cgroup_rstat_flush - flush stats in @cgrp's subtree
	223	* @cgrp: target cgroup
	224	*
	225	* Collect all per-cpu stats in @cgrp's subtree into the global counters
	226	* and propagate them upwards. After this function returns, all cgroups in
	227	* the subtree have up-to-date ->stat.
	228	*
	229	* This also gets all cgroups in the subtree including @cgrp off the
	230	* ->updated_children lists.
0fa294fb TH	231	*
0fa294fb TH	232	* This function may block.
a17556f8	233	*/
400031e0	234	__bpf_kfunc void cgroup_rstat_flush(struct cgroup *cgrp)
a17556f8	235	{
0fa294fb TH	236	might_sleep();
	237
	238	spin_lock_irq(&cgroup_rstat_lock);
	239	cgroup_rstat_flush_locked(cgrp, true);
	240	spin_unlock_irq(&cgroup_rstat_lock);
	241	}
	242
	243	/**
	244	* cgroup_rstat_flush_irqsafe - irqsafe version of cgroup_rstat_flush()
	245	* @cgrp: target cgroup
	246	*
	247	* This function can be called from any context.
	248	*/
	249	void cgroup_rstat_flush_irqsafe(struct cgroup *cgrp)
	250	{
	251	unsigned long flags;
	252
	253	spin_lock_irqsave(&cgroup_rstat_lock, flags);
	254	cgroup_rstat_flush_locked(cgrp, false);
	255	spin_unlock_irqrestore(&cgroup_rstat_lock, flags);
a17556f8 TH	256	}
a17556f8 TH	257
6162cef0	258	/**
2ca11b0e	259	* cgroup_rstat_flush_hold - flush stats in @cgrp's subtree and hold
6162cef0 TH	260	* @cgrp: target cgroup
	261	*
	262	* Flush stats in @cgrp's subtree and prevent further flushes. Must be
	263	* paired with cgroup_rstat_flush_release().
0fa294fb TH	264	*
0fa294fb TH	265	* This function may block.
6162cef0 TH	266	*/
6162cef0 TH	267	void cgroup_rstat_flush_hold(struct cgroup *cgrp)
0fa294fb	268	__acquires(&cgroup_rstat_lock)
6162cef0	269	{
0fa294fb TH	270	might_sleep();
	271	spin_lock_irq(&cgroup_rstat_lock);
	272	cgroup_rstat_flush_locked(cgrp, true);
6162cef0 TH	273	}
	274
	275	/**
	276	* cgroup_rstat_flush_release - release cgroup_rstat_flush_hold()
	277	*/
	278	void cgroup_rstat_flush_release(void)
0fa294fb	279	__releases(&cgroup_rstat_lock)
6162cef0	280	{
0fa294fb	281	spin_unlock_irq(&cgroup_rstat_lock);
6162cef0 TH	282	}
6162cef0 TH	283
a17556f8 TH	284	int cgroup_rstat_init(struct cgroup *cgrp)
	285	{
	286	int cpu;
	287
	288	/* the root cgrp has rstat_cpu preallocated */
	289	if (!cgrp->rstat_cpu) {
	290	cgrp->rstat_cpu = alloc_percpu(struct cgroup_rstat_cpu);
	291	if (!cgrp->rstat_cpu)
	292	return -ENOMEM;
	293	}
	294
	295	/* ->updated_children list is self terminated */
	296	for_each_possible_cpu(cpu) {
	297	struct cgroup_rstat_cpu *rstatc = cgroup_rstat_cpu(cgrp, cpu);
	298
	299	rstatc->updated_children = cgrp;
	300	u64_stats_init(&rstatc->bsync);
	301	}
	302
	303	return 0;
	304	}
	305
	306	void cgroup_rstat_exit(struct cgroup *cgrp)
	307	{
	308	int cpu;
	309
	310	cgroup_rstat_flush(cgrp);
	311
	312	/* sanity check */
	313	for_each_possible_cpu(cpu) {
	314	struct cgroup_rstat_cpu *rstatc = cgroup_rstat_cpu(cgrp, cpu);
	315
	316	if (WARN_ON_ONCE(rstatc->updated_children != cgrp) \|\|
	317	WARN_ON_ONCE(rstatc->updated_next))
	318	return;
	319	}
	320
	321	free_percpu(cgrp->rstat_cpu);
	322	cgrp->rstat_cpu = NULL;
	323	}
	324
	325	void __init cgroup_rstat_boot(void)
	326	{
	327	int cpu;
	328
	329	for_each_possible_cpu(cpu)
	330	raw_spin_lock_init(per_cpu_ptr(&cgroup_rstat_cpu_lock, cpu));
a17556f8 TH	331	}
	332
	333	/*
	334	* Functions for cgroup basic resource statistics implemented on top of
	335	* rstat.
	336	*/
1bb5ec2e TH	337	static void cgroup_base_stat_add(struct cgroup_base_stat *dst_bstat,
1bb5ec2e TH	338	struct cgroup_base_stat *src_bstat)
041cd640	339	{
d4ff749b TH	340	dst_bstat->cputime.utime += src_bstat->cputime.utime;
	341	dst_bstat->cputime.stime += src_bstat->cputime.stime;
	342	dst_bstat->cputime.sum_exec_runtime += src_bstat->cputime.sum_exec_runtime;
1fcf54de JD	343	#ifdef CONFIG_SCHED_CORE
	344	dst_bstat->forceidle_sum += src_bstat->forceidle_sum;
	345	#endif
041cd640 TH	346	}
041cd640 TH	347
1bb5ec2e TH	348	static void cgroup_base_stat_sub(struct cgroup_base_stat *dst_bstat,
	349	struct cgroup_base_stat *src_bstat)
	350	{
	351	dst_bstat->cputime.utime -= src_bstat->cputime.utime;
	352	dst_bstat->cputime.stime -= src_bstat->cputime.stime;
	353	dst_bstat->cputime.sum_exec_runtime -= src_bstat->cputime.sum_exec_runtime;
1fcf54de JD	354	#ifdef CONFIG_SCHED_CORE
	355	dst_bstat->forceidle_sum -= src_bstat->forceidle_sum;
	356	#endif
1bb5ec2e TH	357	}
1bb5ec2e TH	358
d4ff749b	359	static void cgroup_base_stat_flush(struct cgroup *cgrp, int cpu)
041cd640	360	{
c58632b3	361	struct cgroup_rstat_cpu *rstatc = cgroup_rstat_cpu(cgrp, cpu);
dc26532a	362	struct cgroup *parent = cgroup_parent(cgrp);
95b99f35	363	struct cgroup_base_stat delta;
041cd640 TH	364	unsigned seq;
041cd640 TH	365
dc26532a JW	366	/* Root-level stats are sourced from system-wide CPU stats */
	367	if (!parent)
	368	return;
	369
041cd640 TH	370	/* fetch the current per-cpu values */
041cd640 TH	371	do {
d4ff749b	372	seq = __u64_stats_fetch_begin(&rstatc->bsync);
95b99f35	373	delta = rstatc->bstat;
d4ff749b	374	} while (__u64_stats_fetch_retry(&rstatc->bsync, seq));
041cd640	375
1bb5ec2e	376	/* propagate percpu delta to global */
1bb5ec2e TH	377	cgroup_base_stat_sub(&delta, &rstatc->last_bstat);
	378	cgroup_base_stat_add(&cgrp->bstat, &delta);
	379	cgroup_base_stat_add(&rstatc->last_bstat, &delta);
	380
dc26532a JW	381	/* propagate global delta to parent (unless that's root) */
dc26532a JW	382	if (cgroup_parent(parent)) {
1bb5ec2e TH	383	delta = cgrp->bstat;
	384	cgroup_base_stat_sub(&delta, &cgrp->last_bstat);
	385	cgroup_base_stat_add(&parent->bstat, &delta);
	386	cgroup_base_stat_add(&cgrp->last_bstat, &delta);
	387	}
041cd640 TH	388	}
041cd640 TH	389
c58632b3	390	static struct cgroup_rstat_cpu *
c3df5fb5	391	cgroup_base_stat_cputime_account_begin(struct cgroup cgrp, unsigned long flags)
041cd640	392	{
c58632b3	393	struct cgroup_rstat_cpu *rstatc;
041cd640	394
c58632b3	395	rstatc = get_cpu_ptr(cgrp->rstat_cpu);
c3df5fb5	396	*flags = u64_stats_update_begin_irqsave(&rstatc->bsync);
c58632b3	397	return rstatc;
041cd640 TH	398	}
041cd640 TH	399
d4ff749b	400	static void cgroup_base_stat_cputime_account_end(struct cgroup *cgrp,
c3df5fb5 TH	401	struct cgroup_rstat_cpu *rstatc,
c3df5fb5 TH	402	unsigned long flags)
041cd640	403	{
c3df5fb5	404	u64_stats_update_end_irqrestore(&rstatc->bsync, flags);
6162cef0	405	cgroup_rstat_updated(cgrp, smp_processor_id());
c58632b3	406	put_cpu_ptr(rstatc);
041cd640 TH	407	}
	408
	409	void __cgroup_account_cputime(struct cgroup *cgrp, u64 delta_exec)
	410	{
c58632b3	411	struct cgroup_rstat_cpu *rstatc;
c3df5fb5	412	unsigned long flags;
041cd640	413
c3df5fb5	414	rstatc = cgroup_base_stat_cputime_account_begin(cgrp, &flags);
d4ff749b	415	rstatc->bstat.cputime.sum_exec_runtime += delta_exec;
c3df5fb5	416	cgroup_base_stat_cputime_account_end(cgrp, rstatc, flags);
041cd640 TH	417	}
	418
	419	void __cgroup_account_cputime_field(struct cgroup *cgrp,
	420	enum cpu_usage_stat index, u64 delta_exec)
	421	{
c58632b3	422	struct cgroup_rstat_cpu *rstatc;
c3df5fb5	423	unsigned long flags;
041cd640	424
c3df5fb5	425	rstatc = cgroup_base_stat_cputime_account_begin(cgrp, &flags);
041cd640 TH	426
	427	switch (index) {
	428	case CPUTIME_USER:
	429	case CPUTIME_NICE:
d4ff749b	430	rstatc->bstat.cputime.utime += delta_exec;
041cd640 TH	431	break;
	432	case CPUTIME_SYSTEM:
	433	case CPUTIME_IRQ:
	434	case CPUTIME_SOFTIRQ:
d4ff749b	435	rstatc->bstat.cputime.stime += delta_exec;
041cd640	436	break;
1fcf54de JD	437	#ifdef CONFIG_SCHED_CORE
	438	case CPUTIME_FORCEIDLE:
	439	rstatc->bstat.forceidle_sum += delta_exec;
	440	break;
	441	#endif
041cd640 TH	442	default:
	443	break;
	444	}
	445
c3df5fb5	446	cgroup_base_stat_cputime_account_end(cgrp, rstatc, flags);
041cd640 TH	447	}
041cd640 TH	448
936f2a70 BB	449	/*
	450	* compute the cputime for the root cgroup by getting the per cpu data
	451	* at a global level, then categorizing the fields in a manner consistent
	452	* with how it is done by __cgroup_account_cputime_field for each bit of
	453	* cpu time attributed to a cgroup.
	454	*/
1fcf54de	455	static void root_cgroup_cputime(struct cgroup_base_stat *bstat)
936f2a70	456	{
1fcf54de	457	struct task_cputime *cputime = &bstat->cputime;
936f2a70 BB	458	int i;
	459
	460	cputime->stime = 0;
	461	cputime->utime = 0;
	462	cputime->sum_exec_runtime = 0;
	463	for_each_possible_cpu(i) {
	464	struct kernel_cpustat kcpustat;
	465	u64 *cpustat = kcpustat.cpustat;
	466	u64 user = 0;
	467	u64 sys = 0;
	468
	469	kcpustat_cpu_fetch(&kcpustat, i);
	470
	471	user += cpustat[CPUTIME_USER];
	472	user += cpustat[CPUTIME_NICE];
	473	cputime->utime += user;
	474
	475	sys += cpustat[CPUTIME_SYSTEM];
	476	sys += cpustat[CPUTIME_IRQ];
	477	sys += cpustat[CPUTIME_SOFTIRQ];
	478	cputime->stime += sys;
	479
	480	cputime->sum_exec_runtime += user;
	481	cputime->sum_exec_runtime += sys;
	482	cputime->sum_exec_runtime += cpustat[CPUTIME_STEAL];
1fcf54de JD	483
	484	#ifdef CONFIG_SCHED_CORE
	485	bstat->forceidle_sum += cpustat[CPUTIME_FORCEIDLE];
	486	#endif
936f2a70 BB	487	}
	488	}
	489
d4ff749b	490	void cgroup_base_stat_cputime_show(struct seq_file *seq)
041cd640 TH	491	{
	492	struct cgroup *cgrp = seq_css(seq)->cgroup;
	493	u64 usage, utime, stime;
1fcf54de JD	494	struct cgroup_base_stat bstat;
	495	#ifdef CONFIG_SCHED_CORE
	496	u64 forceidle_time;
	497	#endif
936f2a70 BB	498
	499	if (cgroup_parent(cgrp)) {
	500	cgroup_rstat_flush_hold(cgrp);
	501	usage = cgrp->bstat.cputime.sum_exec_runtime;
	502	cputime_adjust(&cgrp->bstat.cputime, &cgrp->prev_cputime,
	503	&utime, &stime);
1fcf54de JD	504	#ifdef CONFIG_SCHED_CORE
	505	forceidle_time = cgrp->bstat.forceidle_sum;
	506	#endif
936f2a70 BB	507	cgroup_rstat_flush_release();
936f2a70 BB	508	} else {
1fcf54de JD	509	root_cgroup_cputime(&bstat);
	510	usage = bstat.cputime.sum_exec_runtime;
	511	utime = bstat.cputime.utime;
	512	stime = bstat.cputime.stime;
	513	#ifdef CONFIG_SCHED_CORE
	514	forceidle_time = bstat.forceidle_sum;
	515	#endif
936f2a70	516	}
041cd640 TH	517
	518	do_div(usage, NSEC_PER_USEC);
	519	do_div(utime, NSEC_PER_USEC);
	520	do_div(stime, NSEC_PER_USEC);
1fcf54de JD	521	#ifdef CONFIG_SCHED_CORE
	522	do_div(forceidle_time, NSEC_PER_USEC);
	523	#endif
041cd640	524
d41bf8c9 TH	525	seq_printf(seq, "usage_usec %llu\n"
	526	"user_usec %llu\n"
	527	"system_usec %llu\n",
	528	usage, utime, stime);
1fcf54de JD	529
	530	#ifdef CONFIG_SCHED_CORE
	531	seq_printf(seq, "core_sched.force_idle_usec %llu\n", forceidle_time);
	532	#endif
041cd640	533	}
a319185b YA	534
	535	/* Add bpf kfuncs for cgroup_rstat_updated() and cgroup_rstat_flush() */
	536	BTF_SET8_START(bpf_rstat_kfunc_ids)
	537	BTF_ID_FLAGS(func, cgroup_rstat_updated)
	538	BTF_ID_FLAGS(func, cgroup_rstat_flush, KF_SLEEPABLE)
	539	BTF_SET8_END(bpf_rstat_kfunc_ids)
	540
	541	static const struct btf_kfunc_id_set bpf_rstat_kfunc_set = {
	542	.owner = THIS_MODULE,
	543	.set = &bpf_rstat_kfunc_ids,
	544	};
	545
	546	static int __init bpf_rstat_kfunc_init(void)
	547	{
	548	return register_btf_kfunc_id_set(BPF_PROG_TYPE_TRACING,
	549	&bpf_rstat_kfunc_set);
	550	}
	551	late_initcall(bpf_rstat_kfunc_init);