[linux-block.git] / kernel / cgroup / rstat.c

// SPDX-License-Identifier: GPL-2.0-only
#include "cgroup-internal.h"

#include <linux/sched/cputime.h>

#include <linux/bpf.h>
#include <linux/btf.h>
#include <linux/btf_ids.h>

/*
 * Held for the duration of a flush: cgroup_rstat_flush_locked() asserts it,
 * and cgroup_rstat_flush_hold() keeps it held until
 * cgroup_rstat_flush_release().
 */
static DEFINE_SPINLOCK(cgroup_rstat_lock);
/*
 * One raw spinlock per CPU.  It is taken around changes to that CPU's
 * updated_children/updated_next linkage, both when a cgroup is queued in
 * cgroup_rstat_updated() and when the tree is dismantled during a flush.
 */
static DEFINE_PER_CPU(raw_spinlock_t, cgroup_rstat_cpu_lock);

/* defined below with the base stat helpers; the flush path calls it earlier */
static void cgroup_base_stat_flush(struct cgroup *cgrp, int cpu);

/* Return the slot of @cgrp's per-cpu rstat area that belongs to @cpu. */
static struct cgroup_rstat_cpu *cgroup_rstat_cpu(struct cgroup *cgrp, int cpu)
{
	struct cgroup_rstat_cpu *slot = per_cpu_ptr(cgrp->rstat_cpu, cpu);

	return slot;
}

/**
 * cgroup_rstat_updated - keep track of updated rstat_cpu
 * @cgrp: cgroup whose per-cpu stats changed
 * @cpu: cpu on which the change happened
 *
 * Records that @cgrp's rstat_cpu on @cpu has pending updates by linking
 * @cgrp onto its parent's matching rstat_cpu->updated_children list, and
 * doing the same for every ancestor that is not linked yet.  See the
 * comment on top of cgroup_rstat_cpu definition for details.
 */
__bpf_kfunc void cgroup_rstat_updated(struct cgroup *cgrp, int cpu)
{
	raw_spinlock_t *cpu_lock = per_cpu_ptr(&cgroup_rstat_cpu_lock, cpu);
	struct cgroup_rstat_cpu *self, *up;
	struct cgroup *parent;
	unsigned long flags;

	/*
	 * Lockless fast path: a non-NULL next pointer means @cgrp is
	 * already queued.  A parent's updated_children list ends with the
	 * parent itself rather than NULL, so NULL can only mean "not on
	 * any list".  The unlocked read may race; the resulting temporary
	 * inaccuracy is acceptable.
	 */
	if (data_race(cgroup_rstat_cpu(cgrp, cpu)->updated_next))
		return;

	raw_spin_lock_irqsave(cpu_lock, flags);

	/*
	 * Climb from @cgrp towards the root, queueing each level.  Links
	 * are added and removed bottom-up, so the first level found
	 * already queued implies all of its ancestors are queued as well
	 * and the climb can stop there.
	 */
	for (self = cgroup_rstat_cpu(cgrp, cpu); !self->updated_next;
	     self = cgroup_rstat_cpu(cgrp, cpu)) {
		parent = cgroup_parent(cgrp);

		/* the root has no list to join; point it at itself to mark it busy */
		if (!parent) {
			self->updated_next = cgrp;
			break;
		}

		/* push @cgrp onto the head of the parent's list */
		up = cgroup_rstat_cpu(parent, cpu);
		self->updated_next = up->updated_children;
		up->updated_children = cgrp;

		cgrp = parent;
	}

	raw_spin_unlock_irqrestore(cpu_lock, flags);
}

/**
 * cgroup_rstat_cpu_pop_updated - iterate and dismantle rstat_cpu updated tree
 * @pos: current position
 * @root: root of the tree to traverse
 * @cpu: target cpu
 *
 * Walks the updated rstat_cpu tree on @cpu from @root.  %NULL @pos starts
 * the traversal and %NULL return indicates the end.  During traversal,
 * each returned cgroup is unlinked from the tree.  Must be called with the
 * matching cgroup_rstat_cpu_lock held.
 *
 * The only ordering guarantee is that, for a parent and a child pair
 * covered by a given traversal, if a child is visited, its parent is
 * guaranteed to be visited afterwards.
 *
 * Return: the next cgroup of the traversal, already unlinked, or %NULL when
 * @pos is @root or when a fresh traversal finds @root not on the tree.
 */
static struct cgroup *cgroup_rstat_cpu_pop_updated(struct cgroup *pos,
						   struct cgroup *root, int cpu)
{
	struct cgroup_rstat_cpu *rstatc;
	struct cgroup *parent;

	/* @root is handed out last, so getting it back as @pos ends the walk */
	if (pos == root)
		return NULL;

	/*
	 * We're gonna walk down to the first leaf and visit/remove it.  We
	 * can pick whatever unvisited node as the starting point.
	 */
	if (!pos) {
		pos = root;
		/* return NULL if this subtree is not on-list */
		if (!cgroup_rstat_cpu(pos, cpu)->updated_next)
			return NULL;
	} else {
		/* resume from the parent of the cgroup returned last time */
		pos = cgroup_parent(pos);
	}

	/*
	 * walk down to the first leaf; a node whose updated_children points
	 * back at itself has no queued children left
	 */
	while (true) {
		rstatc = cgroup_rstat_cpu(pos, cpu);
		if (rstatc->updated_children == pos)
			break;
		pos = rstatc->updated_children;
	}

	/*
	 * Unlink @pos from the tree.  As the updated_children list is
	 * singly linked, we have to walk it to find the removal point.
	 * However, due to the way we traverse, @pos will be the first
	 * child in most cases. The only exception is @root.
	 */
	parent = cgroup_parent(pos);
	if (parent) {
		struct cgroup_rstat_cpu *prstatc;
		struct cgroup **nextp;

		prstatc = cgroup_rstat_cpu(parent, cpu);
		/* nextp always addresses the link that may point at @pos */
		nextp = &prstatc->updated_children;
		while (*nextp != pos) {
			struct cgroup_rstat_cpu *nrstatc;

			nrstatc = cgroup_rstat_cpu(*nextp, cpu);
			/* the list ends with @parent; reaching it means @pos was not found */
			WARN_ON_ONCE(*nextp == parent);
			nextp = &nrstatc->updated_next;
		}
		/* splice @pos out by pointing its predecessor at its successor */
		*nextp = rstatc->updated_next;
	}

	/* a NULL next pointer is what marks a cgroup as off the updated tree */
	rstatc->updated_next = NULL;
	return pos;
}

/*
 * A hook for bpf stat collectors to attach to and flush their stats.
 * Together with providing bpf kfuncs for cgroup_rstat_updated() and
 * cgroup_rstat_flush(), this enables a complete workflow where bpf progs that
 * collect cgroup stats can integrate with rstat for efficient flushing.
 *
 * A static noinline declaration here could cause the compiler to optimize away
 * the function. A global noinline declaration will keep the definition, but may
 * optimize away the callsite. Therefore, __weak is needed to ensure that the
 * call is still emitted, by telling the compiler that we don't know what the
 * function might eventually be.
 *
 * The flush path calls this once for every cgroup it pops off a CPU's
 * updated tree:
 * @cgrp: the cgroup being flushed
 * @parent: cgroup_parent() of @cgrp as passed by the caller
 * @cpu: the cpu whose updated tree is being flushed
 *
 * The body is intentionally empty; the function exists only as an
 * attachment point.
 */

__bpf_hook_start();

__weak noinline void bpf_rstat_flush(struct cgroup *cgrp,
				     struct cgroup *parent, int cpu)
{
}

__bpf_hook_end();

/*
 * Flush worker behind cgroup_rstat_flush() and cgroup_rstat_flush_hold();
 * see cgroup_rstat_flush().  Entered and left with cgroup_rstat_lock held,
 * but the lock may be dropped and re-taken between CPUs.
 */
static void cgroup_rstat_flush_locked(struct cgroup *cgrp)
	__releases(&cgroup_rstat_lock) __acquires(&cgroup_rstat_lock)
{
	int cpu;

	lockdep_assert_held(&cgroup_rstat_lock);

	for_each_possible_cpu(cpu) {
		raw_spinlock_t *cpu_lock = per_cpu_ptr(&cgroup_rstat_cpu_lock,
						       cpu);
		struct cgroup_subsys_state *css;
		struct cgroup *cur;
		unsigned long flags;

		/*
		 * cgroup_rstat_lock is a spinlock_t, which is a sleeping
		 * lock on PREEMPT_RT; taking it with the _irq() suffix only
		 * disables interrupts on a non-PREEMPT_RT kernel.  The
		 * raw_spinlock_t taken here disables interrupts on both
		 * configurations, so _irqsave() is used to make sure they
		 * are always disabled and restored afterwards.
		 */
		raw_spin_lock_irqsave(cpu_lock, flags);
		for (cur = cgroup_rstat_cpu_pop_updated(NULL, cgrp, cpu); cur;
		     cur = cgroup_rstat_cpu_pop_updated(cur, cgrp, cpu)) {
			/* base stats first, then the bpf hook, then each subsystem */
			cgroup_base_stat_flush(cur, cpu);
			bpf_rstat_flush(cur, cgroup_parent(cur), cpu);

			rcu_read_lock();
			list_for_each_entry_rcu(css, &cur->rstat_css_list,
						rstat_css_node)
				css->ss->css_rstat_flush(css, cpu);
			rcu_read_unlock();
		}
		raw_spin_unlock_irqrestore(cpu_lock, flags);

		/* no reschedule pending and no lock break requested: next cpu */
		if (!need_resched() && !spin_needbreak(&cgroup_rstat_lock))
			continue;

		/* play nice: drop the lock, yield (or at least relax), re-take */
		spin_unlock_irq(&cgroup_rstat_lock);
		if (!cond_resched())
			cpu_relax();
		spin_lock_irq(&cgroup_rstat_lock);
	}
}

/**
 * cgroup_rstat_flush - flush stats in @cgrp's subtree
 * @cgrp: target cgroup
 *
 * Collect all per-cpu stats in @cgrp's subtree into the global counters
 * and propagate them upwards.  After this function returns, all cgroups in
 * the subtree have up-to-date ->stat.
 *
 * This also gets all cgroups in the subtree including @cgrp off the
 * ->updated_children lists.
 *
 * Takes cgroup_rstat_lock around the flush and releases it before
 * returning.
 *
 * This function may block.
 */
__bpf_kfunc void cgroup_rstat_flush(struct cgroup *cgrp)
{
	/* annotate the "may block" contract for callers in atomic context */
	might_sleep();

	spin_lock_irq(&cgroup_rstat_lock);
	cgroup_rstat_flush_locked(cgrp);
	spin_unlock_irq(&cgroup_rstat_lock);
}

/**
 * cgroup_rstat_flush_hold - flush stats in @cgrp's subtree and hold
 * @cgrp: target cgroup
 *
 * Flush stats in @cgrp's subtree and prevent further flushes.  Must be
 * paired with cgroup_rstat_flush_release().
 *
 * Returns with cgroup_rstat_lock still held (taken with spin_lock_irq());
 * the lock is what keeps other flushers out until the release call.
 *
 * This function may block.
 */
void cgroup_rstat_flush_hold(struct cgroup *cgrp)
	__acquires(&cgroup_rstat_lock)
{
	might_sleep();
	spin_lock_irq(&cgroup_rstat_lock);
	cgroup_rstat_flush_locked(cgrp);
	/* intentionally no unlock here; see cgroup_rstat_flush_release() */
}

/**
 * cgroup_rstat_flush_release - release cgroup_rstat_flush_hold()
 *
 * Drops cgroup_rstat_lock, which cgroup_rstat_flush_hold() left held.
 */
void cgroup_rstat_flush_release(void)
	__releases(&cgroup_rstat_lock)
{
	spin_unlock_irq(&cgroup_rstat_lock);
}

/**
 * cgroup_rstat_init - set up the per-cpu rstat state of a cgroup
 * @cgrp: cgroup to initialize
 *
 * Allocates @cgrp->rstat_cpu unless it is already set (the root cgrp has
 * it preallocated), then initializes every possible CPU's slot.
 *
 * Return: 0 on success, -ENOMEM if the per-cpu area cannot be allocated.
 */
int cgroup_rstat_init(struct cgroup *cgrp)
{
	int cpu;

	/* allocate only when no per-cpu area was supplied up front */
	if (!cgrp->rstat_cpu)
		cgrp->rstat_cpu = alloc_percpu(struct cgroup_rstat_cpu);
	if (!cgrp->rstat_cpu)
		return -ENOMEM;

	for_each_possible_cpu(cpu) {
		struct cgroup_rstat_cpu *rstatc = cgroup_rstat_cpu(cgrp, cpu);

		u64_stats_init(&rstatc->bsync);
		/* an empty ->updated_children list points back at its owner */
		rstatc->updated_children = cgrp;
	}

	return 0;
}

/**
 * cgroup_rstat_exit - tear down the per-cpu rstat state of a cgroup
 * @cgrp: cgroup being torn down
 *
 * Flushes @cgrp one last time and frees its per-cpu area.  If any CPU's
 * slot still looks linked after the flush, a one-time warning is issued
 * and the area is left allocated instead of being freed.
 */
void cgroup_rstat_exit(struct cgroup *cgrp)
{
	int cpu;

	cgroup_rstat_flush(cgrp);

	/* after the flush every slot must be back in its unlinked state */
	for_each_possible_cpu(cpu) {
		struct cgroup_rstat_cpu *rstatc = cgroup_rstat_cpu(cgrp, cpu);

		/* children list must be empty, i.e. self terminated */
		if (WARN_ON_ONCE(rstatc->updated_children != cgrp))
			return;
		/* and @cgrp itself must not sit on any list */
		if (WARN_ON_ONCE(rstatc->updated_next))
			return;
	}

	free_percpu(cgrp->rstat_cpu);
	cgrp->rstat_cpu = NULL;
}

/*
 * Boot-time setup: initialize the cgroup_rstat_cpu_lock instance of every
 * possible CPU.
 */
void __init cgroup_rstat_boot(void)
{
	int cpu;

	for_each_possible_cpu(cpu)
		raw_spin_lock_init(per_cpu_ptr(&cgroup_rstat_cpu_lock, cpu));
}

/*
 * Functions for cgroup basic resource statistics implemented on top of
 * rstat.
 */
static void cgroup_base_stat_add(struct cgroup_base_stat *dst_bstat,
				 struct cgroup_base_stat *src_bstat)
{
	dst_bstat->cputime.utime += src_bstat->cputime.utime;
	dst_bstat->cputime.stime += src_bstat->cputime.stime;
	dst_bstat->cputime.sum_exec_runtime += src_bstat->cputime.sum_exec_runtime;
#ifdef CONFIG_SCHED_CORE
	dst_bstat->forceidle_sum += src_bstat->forceidle_sum;
#endif
}

static void cgroup_base_stat_sub(struct cgroup_base_stat *dst_bstat,
				 struct cgroup_base_stat *src_bstat)
{
	dst_bstat->cputime.utime -= src_bstat->cputime.utime;
	dst_bstat->cputime.stime -= src_bstat->cputime.stime;
	dst_bstat->cputime.sum_exec_runtime -= src_bstat->cputime.sum_exec_runtime;
#ifdef CONFIG_SCHED_CORE
	dst_bstat->forceidle_sum -= src_bstat->forceidle_sum;
#endif
}

static void cgroup_base_stat_flush(struct cgroup *cgrp, int cpu)
{
	struct cgroup_rstat_cpu *rstatc = cgroup_rstat_cpu(cgrp, cpu);
	struct cgroup *parent = cgroup_parent(cgrp);
	struct cgroup_rstat_cpu *prstatc;
	struct cgroup_base_stat delta;
	unsigned seq;

	/* Root-level stats are sourced from system-wide CPU stats */
	if (!parent)
		return;

	/* fetch the current per-cpu values */
	do {
		seq = __u64_stats_fetch_begin(&rstatc->bsync);
		delta = rstatc->bstat;
	} while (__u64_stats_fetch_retry(&rstatc->bsync, seq));

	/* propagate per-cpu delta to cgroup and per-cpu global statistics */
	cgroup_base_stat_sub(&delta, &rstatc->last_bstat);
	cgroup_base_stat_add(&cgrp->bstat, &delta);
	cgroup_base_stat_add(&rstatc->last_bstat, &delta);
	cgroup_base_stat_add(&rstatc->subtree_bstat, &delta);

	/* propagate cgroup and per-cpu global delta to parent (unless that's root) */
	if (cgroup_parent(parent)) {
		delta = cgrp->bstat;
		cgroup_base_stat_sub(&delta, &cgrp->last_bstat);
		cgroup_base_stat_add(&parent->bstat, &delta);
		cgroup_base_stat_add(&cgrp->last_bstat, &delta);

		delta = rstatc->subtree_bstat;
		prstatc = cgroup_rstat_cpu(parent, cpu);
		cgroup_base_stat_sub(&delta, &rstatc->last_subtree_bstat);
		cgroup_base_stat_add(&prstatc->subtree_bstat, &delta);
		cgroup_base_stat_add(&rstatc->last_subtree_bstat, &delta);
	}
}

/*
 * Open an update section on the current CPU's base stats of @cgrp.
 *
 * @cgrp: cgroup being charged
 * @flags: out parameter; receives the value returned by
 *	   u64_stats_update_begin_irqsave(), to be handed back to
 *	   cgroup_base_stat_cputime_account_end()
 *
 * Returns the current CPU's rstat slot.  Every call must be matched by
 * cgroup_base_stat_cputime_account_end(), which issues the put_cpu_ptr()
 * that pairs with the get_cpu_ptr() here.
 */
static struct cgroup_rstat_cpu *
cgroup_base_stat_cputime_account_begin(struct cgroup *cgrp, unsigned long *flags)
{
	struct cgroup_rstat_cpu *rstatc;

	rstatc = get_cpu_ptr(cgrp->rstat_cpu);
	*flags = u64_stats_update_begin_irqsave(&rstatc->bsync);
	return rstatc;
}

/*
 * Close the update section opened by
 * cgroup_base_stat_cputime_account_begin().
 *
 * @cgrp: cgroup that was charged
 * @rstatc: slot returned by the matching _account_begin() call
 * @flags: value stored by the matching _account_begin() call
 */
static void cgroup_base_stat_cputime_account_end(struct cgroup *cgrp,
						 struct cgroup_rstat_cpu *rstatc,
						 unsigned long flags)
{
	u64_stats_update_end_irqrestore(&rstatc->bsync, flags);
	/* queue @cgrp on this cpu's updated tree so the next flush picks it up */
	cgroup_rstat_updated(cgrp, smp_processor_id());
	/* pairs with get_cpu_ptr() in _account_begin() */
	put_cpu_ptr(rstatc);
}

/*
 * Charge @delta_exec to @cgrp's sum_exec_runtime on the current CPU and
 * mark @cgrp as updated so that a later flush propagates it.
 */
void __cgroup_account_cputime(struct cgroup *cgrp, u64 delta_exec)
{
	unsigned long flags;
	struct cgroup_rstat_cpu *pcpu =
		cgroup_base_stat_cputime_account_begin(cgrp, &flags);

	pcpu->bstat.cputime.sum_exec_runtime += delta_exec;

	cgroup_base_stat_cputime_account_end(cgrp, pcpu, flags);
}

/*
 * Charge @delta_exec to the counter of @cgrp that @index maps to, on the
 * current CPU: USER and NICE go to utime; SYSTEM, IRQ and SOFTIRQ go to
 * stime; with CONFIG_SCHED_CORE, FORCEIDLE goes to forceidle_sum.  Any
 * other @index changes no counter, but @cgrp is still marked as updated.
 */
void __cgroup_account_cputime_field(struct cgroup *cgrp,
				    enum cpu_usage_stat index, u64 delta_exec)
{
	unsigned long flags;
	struct cgroup_rstat_cpu *pcpu =
		cgroup_base_stat_cputime_account_begin(cgrp, &flags);
	struct cgroup_base_stat *bstat = &pcpu->bstat;

	switch (index) {
	case CPUTIME_USER:
	case CPUTIME_NICE:
		bstat->cputime.utime += delta_exec;
		break;
	case CPUTIME_SYSTEM:
	case CPUTIME_IRQ:
	case CPUTIME_SOFTIRQ:
		bstat->cputime.stime += delta_exec;
		break;
#ifdef CONFIG_SCHED_CORE
	case CPUTIME_FORCEIDLE:
		bstat->forceidle_sum += delta_exec;
		break;
#endif
	default:
		/* not tracked per cgroup */
		break;
	}

	cgroup_base_stat_cputime_account_end(cgrp, pcpu, flags);
}

/*
 * Build the root cgroup's cputime from the global per-cpu kernel stats.
 * The fields are grouped the same way __cgroup_account_cputime_field()
 * groups them for cpu time attributed to a cgroup: USER and NICE count as
 * user time; SYSTEM, IRQ and SOFTIRQ count as system time.  The total
 * additionally includes STEAL.  @bstat is zeroed first and then filled in.
 */
static void root_cgroup_cputime(struct cgroup_base_stat *bstat)
{
	struct task_cputime *total = &bstat->cputime;
	int cpu;

	memset(bstat, 0, sizeof(*bstat));

	for_each_possible_cpu(cpu) {
		struct kernel_cpustat snap;
		u64 *stat = snap.cpustat;
		u64 user, sys;

		kcpustat_cpu_fetch(&snap, cpu);

		user = stat[CPUTIME_USER] + stat[CPUTIME_NICE];
		sys = stat[CPUTIME_SYSTEM] + stat[CPUTIME_IRQ] +
		      stat[CPUTIME_SOFTIRQ];

		total->utime += user;
		total->stime += sys;
		total->sum_exec_runtime += user + sys + stat[CPUTIME_STEAL];

#ifdef CONFIG_SCHED_CORE
		bstat->forceidle_sum += stat[CPUTIME_FORCEIDLE];
#endif
	}
}

/*
 * Print the base cputime stats of the cgroup behind @seq: usage_usec,
 * user_usec and system_usec, plus core_sched.force_idle_usec when
 * CONFIG_SCHED_CORE is enabled.  The counters are divided by
 * NSEC_PER_USEC before being printed.
 *
 * The root cgroup is computed from the system-wide per-cpu stats; any
 * other cgroup is flushed first and read while further flushes are held
 * off.
 */
void cgroup_base_stat_cputime_show(struct seq_file *seq)
{
	struct cgroup *cgrp = seq_css(seq)->cgroup;
	struct cgroup_base_stat bstat;
	u64 usage, utime, stime;
#ifdef CONFIG_SCHED_CORE
	u64 forceidle_time;
#endif

	if (!cgroup_parent(cgrp)) {
		/* root: derived from the global kernel cpustat */
		root_cgroup_cputime(&bstat);
		usage = bstat.cputime.sum_exec_runtime;
		utime = bstat.cputime.utime;
		stime = bstat.cputime.stime;
#ifdef CONFIG_SCHED_CORE
		forceidle_time = bstat.forceidle_sum;
#endif
	} else {
		/* non-root: read the flushed totals before letting flushers back in */
		cgroup_rstat_flush_hold(cgrp);
		usage = cgrp->bstat.cputime.sum_exec_runtime;
		cputime_adjust(&cgrp->bstat.cputime, &cgrp->prev_cputime,
			       &utime, &stime);
#ifdef CONFIG_SCHED_CORE
		forceidle_time = cgrp->bstat.forceidle_sum;
#endif
		cgroup_rstat_flush_release();
	}

	/* do_div() divides its first argument in place */
	do_div(usage, NSEC_PER_USEC);
	do_div(utime, NSEC_PER_USEC);
	do_div(stime, NSEC_PER_USEC);
#ifdef CONFIG_SCHED_CORE
	do_div(forceidle_time, NSEC_PER_USEC);
#endif

	seq_printf(seq, "usage_usec %llu\n"
		   "user_usec %llu\n"
		   "system_usec %llu\n",
		   usage, utime, stime);

#ifdef CONFIG_SCHED_CORE
	seq_printf(seq, "core_sched.force_idle_usec %llu\n", forceidle_time);
#endif
}

/*
 * Add bpf kfuncs for cgroup_rstat_updated() and cgroup_rstat_flush().
 * cgroup_rstat_flush() is flagged KF_SLEEPABLE, matching its "may block"
 * contract; cgroup_rstat_updated() carries no flags.
 */
BTF_SET8_START(bpf_rstat_kfunc_ids)
BTF_ID_FLAGS(func, cgroup_rstat_updated)
BTF_ID_FLAGS(func, cgroup_rstat_flush, KF_SLEEPABLE)
BTF_SET8_END(bpf_rstat_kfunc_ids)

/* the id set above, in the form register_btf_kfunc_id_set() takes */
static const struct btf_kfunc_id_set bpf_rstat_kfunc_set = {
	.owner          = THIS_MODULE,
	.set            = &bpf_rstat_kfunc_ids,
};

/*
 * Register the rstat kfunc set for BPF_PROG_TYPE_TRACING programs.
 * Returns whatever register_btf_kfunc_id_set() returns.  Run as a
 * late_initcall.
 */
static int __init bpf_rstat_kfunc_init(void)
{
	return register_btf_kfunc_id_set(BPF_PROG_TYPE_TRACING,
					 &bpf_rstat_kfunc_set);
}
late_initcall(bpf_rstat_kfunc_init);
Commit	Line	Data
457c8996	1	// SPDX-License-Identifier: GPL-2.0-only
041cd640 TH	2	#include "cgroup-internal.h"
	3
	4	#include <linux/sched/cputime.h>
	5
a319185b YA	6	#include <linux/bpf.h>
	7	#include <linux/btf.h>
	8	#include <linux/btf_ids.h>
	9
0fa294fb	10	static DEFINE_SPINLOCK(cgroup_rstat_lock);
c58632b3	11	static DEFINE_PER_CPU(raw_spinlock_t, cgroup_rstat_cpu_lock);
041cd640	12
a17556f8 TH	13	static void cgroup_base_stat_flush(struct cgroup *cgrp, int cpu);
a17556f8 TH	14
c58632b3	15	static struct cgroup_rstat_cpu cgroup_rstat_cpu(struct cgroup cgrp, int cpu)
041cd640	16	{
c58632b3	17	return per_cpu_ptr(cgrp->rstat_cpu, cpu);
041cd640 TH	18	}
	19
	20	/**
6162cef0	21	* cgroup_rstat_updated - keep track of updated rstat_cpu
041cd640	22	* @cgrp: target cgroup
c58632b3	23	* @cpu: cpu on which rstat_cpu was updated
041cd640	24	*
c58632b3 TH	25	* @cgrp's rstat_cpu on @cpu was updated. Put it on the parent's matching
	26	* rstat_cpu->updated_children list. See the comment on top of
	27	* cgroup_rstat_cpu definition for details.
041cd640	28	*/
400031e0	29	__bpf_kfunc void cgroup_rstat_updated(struct cgroup *cgrp, int cpu)
041cd640	30	{
c58632b3	31	raw_spinlock_t *cpu_lock = per_cpu_ptr(&cgroup_rstat_cpu_lock, cpu);
041cd640 TH	32	unsigned long flags;
	33
	34	/*
d8ef4b38 TH	35	* Speculative already-on-list test. This may race leading to
	36	* temporary inaccuracies, which is fine.
	37	*
041cd640 TH	38	* Because @parent's updated_children is terminated with @parent
	39	* instead of NULL, we can tell whether @cgrp is on the list by
	40	* testing the next pointer for NULL.
	41	*/
eda09706	42	if (data_race(cgroup_rstat_cpu(cgrp, cpu)->updated_next))
041cd640 TH	43	return;
	44
	45	raw_spin_lock_irqsave(cpu_lock, flags);
	46
	47	/* put @cgrp and all ancestors on the corresponding updated lists */
dc26532a	48	while (true) {
c58632b3	49	struct cgroup_rstat_cpu *rstatc = cgroup_rstat_cpu(cgrp, cpu);
dc26532a JW	50	struct cgroup *parent = cgroup_parent(cgrp);
dc26532a JW	51	struct cgroup_rstat_cpu *prstatc;
041cd640 TH	52
	53	/*
	54	* Both additions and removals are bottom-up. If a cgroup
	55	* is already in the tree, all ancestors are.
	56	*/
c58632b3	57	if (rstatc->updated_next)
041cd640 TH	58	break;
041cd640 TH	59
dc26532a JW	60	/* Root has no parent to link it to, but mark it busy */
	61	if (!parent) {
	62	rstatc->updated_next = cgrp;
	63	break;
	64	}
	65
	66	prstatc = cgroup_rstat_cpu(parent, cpu);
c58632b3 TH	67	rstatc->updated_next = prstatc->updated_children;
c58632b3 TH	68	prstatc->updated_children = cgrp;
dc26532a JW	69
dc26532a JW	70	cgrp = parent;
041cd640 TH	71	}
	72
	73	raw_spin_unlock_irqrestore(cpu_lock, flags);
	74	}
	75
	76	/**
c58632b3	77	* cgroup_rstat_cpu_pop_updated - iterate and dismantle rstat_cpu updated tree
041cd640 TH	78	* @pos: current position
	79	* @root: root of the tree to traversal
	80	* @cpu: target cpu
	81	*
08b2b6fd	82	* Walks the updated rstat_cpu tree on @cpu from @root. %NULL @pos starts
041cd640 TH	83	* the traversal and %NULL return indicates the end. During traversal,
041cd640 TH	84	* each returned cgroup is unlinked from the tree. Must be called with the
c58632b3	85	* matching cgroup_rstat_cpu_lock held.
041cd640 TH	86	*
	87	* The only ordering guarantee is that, for a parent and a child pair
	88	* covered by a given traversal, if a child is visited, its parent is
	89	* guaranteed to be visited afterwards.
	90	*/
c58632b3 TH	91	static struct cgroup cgroup_rstat_cpu_pop_updated(struct cgroup pos,
c58632b3 TH	92	struct cgroup *root, int cpu)
041cd640	93	{
c58632b3	94	struct cgroup_rstat_cpu *rstatc;
f5f60d23	95	struct cgroup *parent;
041cd640 TH	96
	97	if (pos == root)
	98	return NULL;
	99
	100	/*
	101	* We're gonna walk down to the first leaf and visit/remove it. We
	102	* can pick whatever unvisited node as the starting point.
	103	*/
f5f60d23	104	if (!pos) {
041cd640	105	pos = root;
f5f60d23 WY	106	/* return NULL if this subtree is not on-list */
	107	if (!cgroup_rstat_cpu(pos, cpu)->updated_next)
	108	return NULL;
	109	} else {
041cd640	110	pos = cgroup_parent(pos);
f5f60d23	111	}
041cd640 TH	112
	113	/* walk down to the first leaf */
	114	while (true) {
c58632b3 TH	115	rstatc = cgroup_rstat_cpu(pos, cpu);
c58632b3 TH	116	if (rstatc->updated_children == pos)
041cd640	117	break;
c58632b3	118	pos = rstatc->updated_children;
041cd640 TH	119	}
	120
	121	/*
	122	* Unlink @pos from the tree. As the updated_children list is
	123	* singly linked, we have to walk it to find the removal point.
	124	* However, due to the way we traverse, @pos will be the first
	125	* child in most cases. The only exception is @root.
	126	*/
f5f60d23 WY	127	parent = cgroup_parent(pos);
	128	if (parent) {
	129	struct cgroup_rstat_cpu *prstatc;
	130	struct cgroup **nextp;
041cd640	131
f5f60d23 WY	132	prstatc = cgroup_rstat_cpu(parent, cpu);
	133	nextp = &prstatc->updated_children;
	134	while (*nextp != pos) {
	135	struct cgroup_rstat_cpu *nrstatc;
	136
	137	nrstatc = cgroup_rstat_cpu(*nextp, cpu);
	138	WARN_ON_ONCE(*nextp == parent);
	139	nextp = &nrstatc->updated_next;
	140	}
	141	*nextp = rstatc->updated_next;
041cd640 TH	142	}
041cd640 TH	143
f5f60d23 WY	144	rstatc->updated_next = NULL;
f5f60d23 WY	145	return pos;
041cd640 TH	146	}
041cd640 TH	147
a319185b YA	148	/*
	149	* A hook for bpf stat collectors to attach to and flush their stats.
	150	* Together with providing bpf kfuncs for cgroup_rstat_updated() and
	151	* cgroup_rstat_flush(), this enables a complete workflow where bpf progs that
	152	* collect cgroup stats can integrate with rstat for efficient flushing.
	153	*
	154	* A static noinline declaration here could cause the compiler to optimize away
	155	* the function. A global noinline declaration will keep the definition, but may
	156	* optimize away the callsite. Therefore, __weak is needed to ensure that the
	157	* call is still emitted, by telling the compiler that we don't know what the
	158	* function might eventually be.
a319185b	159	*/
15fb6f2b DM	160
15fb6f2b DM	161	__bpf_hook_start();
a319185b YA	162
	163	__weak noinline void bpf_rstat_flush(struct cgroup *cgrp,
	164	struct cgroup *parent, int cpu)
	165	{
	166	}
	167
15fb6f2b	168	__bpf_hook_end();
a319185b	169
a17556f8	170	/* see cgroup_rstat_flush() */
0a2dc6ac	171	static void cgroup_rstat_flush_locked(struct cgroup *cgrp)
0fa294fb	172	__releases(&cgroup_rstat_lock) __acquires(&cgroup_rstat_lock)
a17556f8 TH	173	{
	174	int cpu;
	175
0fa294fb	176	lockdep_assert_held(&cgroup_rstat_lock);
a17556f8 TH	177
	178	for_each_possible_cpu(cpu) {
	179	raw_spinlock_t *cpu_lock = per_cpu_ptr(&cgroup_rstat_cpu_lock,
	180	cpu);
	181	struct cgroup *pos = NULL;
b1e2c8df	182	unsigned long flags;
a17556f8	183
b1e2c8df SAS	184	/*
	185	* The _irqsave() is needed because cgroup_rstat_lock is
	186	* spinlock_t which is a sleeping lock on PREEMPT_RT. Acquiring
	187	* this lock with the _irq() suffix only disables interrupts on
	188	* a non-PREEMPT_RT kernel. The raw_spinlock_t below disables
	189	* interrupts on both configurations. The _irqsave() ensures
	190	* that interrupts are always disabled and later restored.
	191	*/
	192	raw_spin_lock_irqsave(cpu_lock, flags);
8f53470b TH	193	while ((pos = cgroup_rstat_cpu_pop_updated(pos, cgrp, cpu))) {
	194	struct cgroup_subsys_state *css;
	195
a17556f8	196	cgroup_base_stat_flush(pos, cpu);
a319185b	197	bpf_rstat_flush(pos, cgroup_parent(pos), cpu);
8f53470b TH	198
	199	rcu_read_lock();
	200	list_for_each_entry_rcu(css, &pos->rstat_css_list,
	201	rstat_css_node)
	202	css->ss->css_rstat_flush(css, cpu);
	203	rcu_read_unlock();
	204	}
b1e2c8df	205	raw_spin_unlock_irqrestore(cpu_lock, flags);
0fa294fb	206
0a2dc6ac YA	207	/* play nice and yield if necessary */
0a2dc6ac YA	208	if (need_resched() \|\| spin_needbreak(&cgroup_rstat_lock)) {
0fa294fb TH	209	spin_unlock_irq(&cgroup_rstat_lock);
	210	if (!cond_resched())
	211	cpu_relax();
	212	spin_lock_irq(&cgroup_rstat_lock);
	213	}
a17556f8 TH	214	}
	215	}
	216
	217	/**
	218	* cgroup_rstat_flush - flush stats in @cgrp's subtree
	219	* @cgrp: target cgroup
	220	*
	221	* Collect all per-cpu stats in @cgrp's subtree into the global counters
	222	* and propagate them upwards. After this function returns, all cgroups in
	223	* the subtree have up-to-date ->stat.
	224	*
	225	* This also gets all cgroups in the subtree including @cgrp off the
	226	* ->updated_children lists.
0fa294fb TH	227	*
0fa294fb TH	228	* This function may block.
a17556f8	229	*/
400031e0	230	__bpf_kfunc void cgroup_rstat_flush(struct cgroup *cgrp)
a17556f8	231	{
0fa294fb TH	232	might_sleep();
	233
	234	spin_lock_irq(&cgroup_rstat_lock);
0a2dc6ac	235	cgroup_rstat_flush_locked(cgrp);
0fa294fb TH	236	spin_unlock_irq(&cgroup_rstat_lock);
	237	}
	238
6162cef0	239	/**
2ca11b0e	240	* cgroup_rstat_flush_hold - flush stats in @cgrp's subtree and hold
6162cef0 TH	241	* @cgrp: target cgroup
	242	*
	243	* Flush stats in @cgrp's subtree and prevent further flushes. Must be
	244	* paired with cgroup_rstat_flush_release().
0fa294fb TH	245	*
0fa294fb TH	246	* This function may block.
6162cef0 TH	247	*/
6162cef0 TH	248	void cgroup_rstat_flush_hold(struct cgroup *cgrp)
0fa294fb	249	__acquires(&cgroup_rstat_lock)
6162cef0	250	{
0fa294fb TH	251	might_sleep();
0fa294fb TH	252	spin_lock_irq(&cgroup_rstat_lock);
0a2dc6ac	253	cgroup_rstat_flush_locked(cgrp);
6162cef0 TH	254	}
	255
	256	/**
	257	* cgroup_rstat_flush_release - release cgroup_rstat_flush_hold()
	258	*/
	259	void cgroup_rstat_flush_release(void)
0fa294fb	260	__releases(&cgroup_rstat_lock)
6162cef0	261	{
0fa294fb	262	spin_unlock_irq(&cgroup_rstat_lock);
6162cef0 TH	263	}
6162cef0 TH	264
a17556f8 TH	265	int cgroup_rstat_init(struct cgroup *cgrp)
	266	{
	267	int cpu;
	268
	269	/* the root cgrp has rstat_cpu preallocated */
	270	if (!cgrp->rstat_cpu) {
	271	cgrp->rstat_cpu = alloc_percpu(struct cgroup_rstat_cpu);
	272	if (!cgrp->rstat_cpu)
	273	return -ENOMEM;
	274	}
	275
	276	/* ->updated_children list is self terminated */
	277	for_each_possible_cpu(cpu) {
	278	struct cgroup_rstat_cpu *rstatc = cgroup_rstat_cpu(cgrp, cpu);
	279
	280	rstatc->updated_children = cgrp;
	281	u64_stats_init(&rstatc->bsync);
	282	}
	283
	284	return 0;
	285	}
	286
	287	void cgroup_rstat_exit(struct cgroup *cgrp)
	288	{
	289	int cpu;
	290
	291	cgroup_rstat_flush(cgrp);
	292
	293	/* sanity check */
	294	for_each_possible_cpu(cpu) {
	295	struct cgroup_rstat_cpu *rstatc = cgroup_rstat_cpu(cgrp, cpu);
	296
	297	if (WARN_ON_ONCE(rstatc->updated_children != cgrp) \|\|
	298	WARN_ON_ONCE(rstatc->updated_next))
	299	return;
	300	}
	301
	302	free_percpu(cgrp->rstat_cpu);
	303	cgrp->rstat_cpu = NULL;
	304	}
	305
	306	void __init cgroup_rstat_boot(void)
	307	{
	308	int cpu;
	309
	310	for_each_possible_cpu(cpu)
	311	raw_spin_lock_init(per_cpu_ptr(&cgroup_rstat_cpu_lock, cpu));
a17556f8 TH	312	}
	313
	314	/*
	315	* Functions for cgroup basic resource statistics implemented on top of
	316	* rstat.
	317	*/
1bb5ec2e TH	318	static void cgroup_base_stat_add(struct cgroup_base_stat *dst_bstat,
1bb5ec2e TH	319	struct cgroup_base_stat *src_bstat)
041cd640	320	{
d4ff749b TH	321	dst_bstat->cputime.utime += src_bstat->cputime.utime;
	322	dst_bstat->cputime.stime += src_bstat->cputime.stime;
	323	dst_bstat->cputime.sum_exec_runtime += src_bstat->cputime.sum_exec_runtime;
1fcf54de JD	324	#ifdef CONFIG_SCHED_CORE
	325	dst_bstat->forceidle_sum += src_bstat->forceidle_sum;
	326	#endif
041cd640 TH	327	}
041cd640 TH	328
1bb5ec2e TH	329	static void cgroup_base_stat_sub(struct cgroup_base_stat *dst_bstat,
	330	struct cgroup_base_stat *src_bstat)
	331	{
	332	dst_bstat->cputime.utime -= src_bstat->cputime.utime;
	333	dst_bstat->cputime.stime -= src_bstat->cputime.stime;
	334	dst_bstat->cputime.sum_exec_runtime -= src_bstat->cputime.sum_exec_runtime;
1fcf54de JD	335	#ifdef CONFIG_SCHED_CORE
	336	dst_bstat->forceidle_sum -= src_bstat->forceidle_sum;
	337	#endif
1bb5ec2e TH	338	}
1bb5ec2e TH	339
d4ff749b	340	static void cgroup_base_stat_flush(struct cgroup *cgrp, int cpu)
041cd640	341	{
c58632b3	342	struct cgroup_rstat_cpu *rstatc = cgroup_rstat_cpu(cgrp, cpu);
dc26532a	343	struct cgroup *parent = cgroup_parent(cgrp);
0437719c	344	struct cgroup_rstat_cpu *prstatc;
95b99f35	345	struct cgroup_base_stat delta;
041cd640 TH	346	unsigned seq;
041cd640 TH	347
dc26532a JW	348	/* Root-level stats are sourced from system-wide CPU stats */
	349	if (!parent)
	350	return;
	351
041cd640 TH	352	/* fetch the current per-cpu values */
041cd640 TH	353	do {
d4ff749b	354	seq = __u64_stats_fetch_begin(&rstatc->bsync);
95b99f35	355	delta = rstatc->bstat;
d4ff749b	356	} while (__u64_stats_fetch_retry(&rstatc->bsync, seq));
041cd640	357
0437719c	358	/* propagate per-cpu delta to cgroup and per-cpu global statistics */
1bb5ec2e TH	359	cgroup_base_stat_sub(&delta, &rstatc->last_bstat);
	360	cgroup_base_stat_add(&cgrp->bstat, &delta);
	361	cgroup_base_stat_add(&rstatc->last_bstat, &delta);
0437719c	362	cgroup_base_stat_add(&rstatc->subtree_bstat, &delta);
1bb5ec2e	363
0437719c	364	/* propagate cgroup and per-cpu global delta to parent (unless that's root) */
dc26532a	365	if (cgroup_parent(parent)) {
1bb5ec2e TH	366	delta = cgrp->bstat;
	367	cgroup_base_stat_sub(&delta, &cgrp->last_bstat);
	368	cgroup_base_stat_add(&parent->bstat, &delta);
	369	cgroup_base_stat_add(&cgrp->last_bstat, &delta);
0437719c HJ	370
	371	delta = rstatc->subtree_bstat;
	372	prstatc = cgroup_rstat_cpu(parent, cpu);
	373	cgroup_base_stat_sub(&delta, &rstatc->last_subtree_bstat);
	374	cgroup_base_stat_add(&prstatc->subtree_bstat, &delta);
	375	cgroup_base_stat_add(&rstatc->last_subtree_bstat, &delta);
1bb5ec2e	376	}
041cd640 TH	377	}
041cd640 TH	378
c58632b3	379	static struct cgroup_rstat_cpu *
c3df5fb5	380	cgroup_base_stat_cputime_account_begin(struct cgroup cgrp, unsigned long flags)
041cd640	381	{
c58632b3	382	struct cgroup_rstat_cpu *rstatc;
041cd640	383
c58632b3	384	rstatc = get_cpu_ptr(cgrp->rstat_cpu);
c3df5fb5	385	*flags = u64_stats_update_begin_irqsave(&rstatc->bsync);
c58632b3	386	return rstatc;
041cd640 TH	387	}
041cd640 TH	388
d4ff749b	389	static void cgroup_base_stat_cputime_account_end(struct cgroup *cgrp,
c3df5fb5 TH	390	struct cgroup_rstat_cpu *rstatc,
c3df5fb5 TH	391	unsigned long flags)
041cd640	392	{
c3df5fb5	393	u64_stats_update_end_irqrestore(&rstatc->bsync, flags);
6162cef0	394	cgroup_rstat_updated(cgrp, smp_processor_id());
c58632b3	395	put_cpu_ptr(rstatc);
041cd640 TH	396	}
	397
	398	void __cgroup_account_cputime(struct cgroup *cgrp, u64 delta_exec)
	399	{
c58632b3	400	struct cgroup_rstat_cpu *rstatc;
c3df5fb5	401	unsigned long flags;
041cd640	402
c3df5fb5	403	rstatc = cgroup_base_stat_cputime_account_begin(cgrp, &flags);
d4ff749b	404	rstatc->bstat.cputime.sum_exec_runtime += delta_exec;
c3df5fb5	405	cgroup_base_stat_cputime_account_end(cgrp, rstatc, flags);
041cd640 TH	406	}
	407
	408	void __cgroup_account_cputime_field(struct cgroup *cgrp,
	409	enum cpu_usage_stat index, u64 delta_exec)
	410	{
c58632b3	411	struct cgroup_rstat_cpu *rstatc;
c3df5fb5	412	unsigned long flags;
041cd640	413
c3df5fb5	414	rstatc = cgroup_base_stat_cputime_account_begin(cgrp, &flags);
041cd640 TH	415
	416	switch (index) {
	417	case CPUTIME_USER:
	418	case CPUTIME_NICE:
d4ff749b	419	rstatc->bstat.cputime.utime += delta_exec;
041cd640 TH	420	break;
	421	case CPUTIME_SYSTEM:
	422	case CPUTIME_IRQ:
	423	case CPUTIME_SOFTIRQ:
d4ff749b	424	rstatc->bstat.cputime.stime += delta_exec;
041cd640	425	break;
1fcf54de JD	426	#ifdef CONFIG_SCHED_CORE
	427	case CPUTIME_FORCEIDLE:
	428	rstatc->bstat.forceidle_sum += delta_exec;
	429	break;
	430	#endif
041cd640 TH	431	default:
	432	break;
	433	}
	434
c3df5fb5	435	cgroup_base_stat_cputime_account_end(cgrp, rstatc, flags);
041cd640 TH	436	}
041cd640 TH	437
936f2a70 BB	438	/*
	439	* compute the cputime for the root cgroup by getting the per cpu data
	440	* at a global level, then categorizing the fields in a manner consistent
	441	* with how it is done by __cgroup_account_cputime_field for each bit of
	442	* cpu time attributed to a cgroup.
	443	*/
1fcf54de	444	static void root_cgroup_cputime(struct cgroup_base_stat *bstat)
936f2a70	445	{
1fcf54de	446	struct task_cputime *cputime = &bstat->cputime;
936f2a70 BB	447	int i;
936f2a70 BB	448
fcdb1eda	449	memset(bstat, 0, sizeof(*bstat));
936f2a70 BB	450	for_each_possible_cpu(i) {
	451	struct kernel_cpustat kcpustat;
	452	u64 *cpustat = kcpustat.cpustat;
	453	u64 user = 0;
	454	u64 sys = 0;
	455
	456	kcpustat_cpu_fetch(&kcpustat, i);
	457
	458	user += cpustat[CPUTIME_USER];
	459	user += cpustat[CPUTIME_NICE];
	460	cputime->utime += user;
	461
	462	sys += cpustat[CPUTIME_SYSTEM];
	463	sys += cpustat[CPUTIME_IRQ];
	464	sys += cpustat[CPUTIME_SOFTIRQ];
	465	cputime->stime += sys;
	466
	467	cputime->sum_exec_runtime += user;
	468	cputime->sum_exec_runtime += sys;
	469	cputime->sum_exec_runtime += cpustat[CPUTIME_STEAL];
1fcf54de JD	470
	471	#ifdef CONFIG_SCHED_CORE
	472	bstat->forceidle_sum += cpustat[CPUTIME_FORCEIDLE];
	473	#endif
936f2a70 BB	474	}
	475	}
	476
d4ff749b	477	void cgroup_base_stat_cputime_show(struct seq_file *seq)
041cd640 TH	478	{
	479	struct cgroup *cgrp = seq_css(seq)->cgroup;
	480	u64 usage, utime, stime;
1fcf54de JD	481	struct cgroup_base_stat bstat;
	482	#ifdef CONFIG_SCHED_CORE
	483	u64 forceidle_time;
	484	#endif
936f2a70 BB	485
	486	if (cgroup_parent(cgrp)) {
	487	cgroup_rstat_flush_hold(cgrp);
	488	usage = cgrp->bstat.cputime.sum_exec_runtime;
	489	cputime_adjust(&cgrp->bstat.cputime, &cgrp->prev_cputime,
	490	&utime, &stime);
1fcf54de JD	491	#ifdef CONFIG_SCHED_CORE
	492	forceidle_time = cgrp->bstat.forceidle_sum;
	493	#endif
936f2a70 BB	494	cgroup_rstat_flush_release();
936f2a70 BB	495	} else {
1fcf54de JD	496	root_cgroup_cputime(&bstat);
	497	usage = bstat.cputime.sum_exec_runtime;
	498	utime = bstat.cputime.utime;
	499	stime = bstat.cputime.stime;
	500	#ifdef CONFIG_SCHED_CORE
	501	forceidle_time = bstat.forceidle_sum;
	502	#endif
936f2a70	503	}
041cd640 TH	504
	505	do_div(usage, NSEC_PER_USEC);
	506	do_div(utime, NSEC_PER_USEC);
	507	do_div(stime, NSEC_PER_USEC);
1fcf54de JD	508	#ifdef CONFIG_SCHED_CORE
	509	do_div(forceidle_time, NSEC_PER_USEC);
	510	#endif
041cd640	511
d41bf8c9 TH	512	seq_printf(seq, "usage_usec %llu\n"
	513	"user_usec %llu\n"
	514	"system_usec %llu\n",
	515	usage, utime, stime);
1fcf54de JD	516
	517	#ifdef CONFIG_SCHED_CORE
	518	seq_printf(seq, "core_sched.force_idle_usec %llu\n", forceidle_time);
	519	#endif
041cd640	520	}
a319185b YA	521
	522	/* Add bpf kfuncs for cgroup_rstat_updated() and cgroup_rstat_flush() */
	523	BTF_SET8_START(bpf_rstat_kfunc_ids)
	524	BTF_ID_FLAGS(func, cgroup_rstat_updated)
	525	BTF_ID_FLAGS(func, cgroup_rstat_flush, KF_SLEEPABLE)
	526	BTF_SET8_END(bpf_rstat_kfunc_ids)
	527
	528	static const struct btf_kfunc_id_set bpf_rstat_kfunc_set = {
	529	.owner = THIS_MODULE,
	530	.set = &bpf_rstat_kfunc_ids,
	531	};
	532
	533	static int __init bpf_rstat_kfunc_init(void)
	534	{
	535	return register_btf_kfunc_id_set(BPF_PROG_TYPE_TRACING,
	536	&bpf_rstat_kfunc_set);
	537	}
	538	late_initcall(bpf_rstat_kfunc_init);