// SPDX-License-Identifier: GPL-2.0-only
#include "cgroup-internal.h"

#include <linux/sched/cputime.h>

#include <linux/bpf.h>
#include <linux/btf.h>
#include <linux/btf_ids.h>
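
/*
 * Locking overview: cgroup_rstat_lock serializes whole-tree flushes, while
 * each CPU's cgroup_rstat_cpu_lock protects that CPU's tree of updated
 * cgroups which cgroup_rstat_updated() links cgroups into.
 */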

static DEFINE_SPINLOCK(cgroup_rstat_lock);
static DEFINE_PER_CPU(raw_spinlock_t, cgroup_rstat_cpu_lock);

static void cgroup_base_stat_flush(struct cgroup *cgrp, int cpu);

static struct cgroup_rstat_cpu *cgroup_rstat_cpu(struct cgroup *cgrp, int cpu)
{
	return per_cpu_ptr(cgrp->rstat_cpu, cpu);
}

/**
 * cgroup_rstat_updated - keep track of updated rstat_cpu
 * @cgrp: target cgroup
 * @cpu: cpu on which rstat_cpu was updated
 *
 * @cgrp's rstat_cpu on @cpu was updated. Put it on the parent's matching
 * rstat_cpu->updated_children list. See the comment on top of
 * cgroup_rstat_cpu definition for details.
 */
__bpf_kfunc void cgroup_rstat_updated(struct cgroup *cgrp, int cpu)
{
	raw_spinlock_t *cpu_lock = per_cpu_ptr(&cgroup_rstat_cpu_lock, cpu);
	unsigned long flags;

	/*
	 * Speculative already-on-list test. This may race leading to
	 * temporary inaccuracies, which is fine.
	 *
	 * Because @parent's updated_children is terminated with @parent
	 * instead of NULL, we can tell whether @cgrp is on the list by
	 * testing the next pointer for NULL.
	 */
	if (data_race(cgroup_rstat_cpu(cgrp, cpu)->updated_next))
		return;

	raw_spin_lock_irqsave(cpu_lock, flags);

	/* put @cgrp and all ancestors on the corresponding updated lists */
	while (true) {
		struct cgroup_rstat_cpu *rstatc = cgroup_rstat_cpu(cgrp, cpu);
		struct cgroup *parent = cgroup_parent(cgrp);
		struct cgroup_rstat_cpu *prstatc;

		/*
		 * Both additions and removals are bottom-up. If a cgroup
		 * is already in the tree, all ancestors are.
		 */
		if (rstatc->updated_next)
			break;

		/* Root has no parent to link it to, but mark it busy */
		if (!parent) {
			rstatc->updated_next = cgrp;
			break;
		}

		prstatc = cgroup_rstat_cpu(parent, cpu);
		rstatc->updated_next = prstatc->updated_children;
		prstatc->updated_children = cgrp;

		cgrp = parent;
	}

	raw_spin_unlock_irqrestore(cpu_lock, flags);
}
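
/*
 * Example (illustrative only, the per-cpu field is hypothetical): a stat
 * producer updates its per-cpu counter and then marks the cgroup updated
 * so that a later flush visits it on this CPU:
 *
 *	this_cpu_inc(cgrp->my_pcpu_stats->nr_events);
 *	cgroup_rstat_updated(cgrp, smp_processor_id());
 */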

/**
 * cgroup_rstat_cpu_pop_updated - iterate and dismantle rstat_cpu updated tree
 * @pos: current position
 * @root: root of the tree to traverse
 * @cpu: target cpu
 *
 * Walks the updated rstat_cpu tree on @cpu from @root. %NULL @pos starts
 * the traversal and %NULL return indicates the end. During traversal,
 * each returned cgroup is unlinked from the tree. Must be called with the
 * matching cgroup_rstat_cpu_lock held.
 *
 * The only ordering guarantee is that, for a parent and a child pair
 * covered by a given traversal, if a child is visited, its parent is
 * guaranteed to be visited afterwards.
 */
static struct cgroup *cgroup_rstat_cpu_pop_updated(struct cgroup *pos,
						   struct cgroup *root, int cpu)
{
	struct cgroup_rstat_cpu *rstatc;
	struct cgroup *parent;

	if (pos == root)
		return NULL;

	/*
	 * We're gonna walk down to the first leaf and visit/remove it. We
	 * can pick any unvisited node as the starting point.
	 */
	if (!pos) {
		pos = root;
		/* return NULL if this subtree is not on-list */
		if (!cgroup_rstat_cpu(pos, cpu)->updated_next)
			return NULL;
	} else {
		pos = cgroup_parent(pos);
	}

	/* walk down to the first leaf */
	while (true) {
		rstatc = cgroup_rstat_cpu(pos, cpu);
		if (rstatc->updated_children == pos)
			break;
		pos = rstatc->updated_children;
	}

	/*
	 * Unlink @pos from the tree. As the updated_children list is
	 * singly linked, we have to walk it to find the removal point.
	 * However, due to the way we traverse, @pos will be the first
	 * child in most cases. The only exception is @root.
	 */
	parent = cgroup_parent(pos);
	if (parent) {
		struct cgroup_rstat_cpu *prstatc;
		struct cgroup **nextp;

		prstatc = cgroup_rstat_cpu(parent, cpu);
		nextp = &prstatc->updated_children;
		while (*nextp != pos) {
			struct cgroup_rstat_cpu *nrstatc;

			nrstatc = cgroup_rstat_cpu(*nextp, cpu);
			WARN_ON_ONCE(*nextp == parent);
			nextp = &nrstatc->updated_next;
		}
		*nextp = rstatc->updated_next;
	}

	rstatc->updated_next = NULL;
	return pos;
}
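
/*
 * For example, if both a child and its parent are on the updated tree,
 * successive calls return the child before the parent, and @root, when
 * on-list, is always returned last.
 */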

/*
 * A hook for bpf stat collectors to attach to and flush their stats.
 * Together with providing bpf kfuncs for cgroup_rstat_updated() and
 * cgroup_rstat_flush(), this enables a complete workflow where bpf progs that
 * collect cgroup stats can integrate with rstat for efficient flushing.
 *
 * A static noinline declaration here could cause the compiler to optimize away
 * the function. A global noinline declaration will keep the definition, but may
 * optimize away the callsite. Therefore, __weak is needed to ensure that the
 * call is still emitted, by telling the compiler that we don't know what the
 * function might eventually be.
 */

__bpf_hook_start();

__weak noinline void bpf_rstat_flush(struct cgroup *cgrp,
				     struct cgroup *parent, int cpu)
{
}

__bpf_hook_end();
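
/*
 * Example (sketch): a BPF collector can attach a tracing program to this
 * hook, e.g. SEC("fentry/bpf_rstat_flush"), and fold its own per-cpu data
 * for @cgrp into its output counters on every rstat flush.
 */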

/* see cgroup_rstat_flush() */
static void cgroup_rstat_flush_locked(struct cgroup *cgrp)
	__releases(&cgroup_rstat_lock) __acquires(&cgroup_rstat_lock)
{
	int cpu;

	lockdep_assert_held(&cgroup_rstat_lock);

	for_each_possible_cpu(cpu) {
		raw_spinlock_t *cpu_lock = per_cpu_ptr(&cgroup_rstat_cpu_lock,
						       cpu);
		struct cgroup *pos = NULL;
		unsigned long flags;

		/*
		 * The _irqsave() is needed because cgroup_rstat_lock is
		 * spinlock_t which is a sleeping lock on PREEMPT_RT. Acquiring
		 * this lock with the _irq() suffix only disables interrupts on
		 * a non-PREEMPT_RT kernel. The raw_spinlock_t below disables
		 * interrupts on both configurations. The _irqsave() ensures
		 * that interrupts are always disabled and later restored.
		 */
		raw_spin_lock_irqsave(cpu_lock, flags);
		while ((pos = cgroup_rstat_cpu_pop_updated(pos, cgrp, cpu))) {
			struct cgroup_subsys_state *css;

			cgroup_base_stat_flush(pos, cpu);
			bpf_rstat_flush(pos, cgroup_parent(pos), cpu);

			rcu_read_lock();
			list_for_each_entry_rcu(css, &pos->rstat_css_list,
						rstat_css_node)
				css->ss->css_rstat_flush(css, cpu);
			rcu_read_unlock();
		}
		raw_spin_unlock_irqrestore(cpu_lock, flags);

		/* play nice and yield if necessary */
		if (need_resched() || spin_needbreak(&cgroup_rstat_lock)) {
			spin_unlock_irq(&cgroup_rstat_lock);
			if (!cond_resched())
				cpu_relax();
			spin_lock_irq(&cgroup_rstat_lock);
		}
	}
}

/**
 * cgroup_rstat_flush - flush stats in @cgrp's subtree
 * @cgrp: target cgroup
 *
 * Collect all per-cpu stats in @cgrp's subtree into the global counters
 * and propagate them upwards. After this function returns, all cgroups in
 * the subtree have up-to-date ->stat.
 *
 * This also gets all cgroups in the subtree including @cgrp off the
 * ->updated_children lists.
 *
 * This function may block.
 */
__bpf_kfunc void cgroup_rstat_flush(struct cgroup *cgrp)
{
	might_sleep();

	spin_lock_irq(&cgroup_rstat_lock);
	cgroup_rstat_flush_locked(cgrp);
	spin_unlock_irq(&cgroup_rstat_lock);
}
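
/*
 * Example (illustrative): a reader flushes first so that the aggregated
 * state is current before it is consumed:
 *
 *	cgroup_rstat_flush(cgrp);
 *	usage = cgrp->bstat.cputime.sum_exec_runtime;
 */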

/**
 * cgroup_rstat_flush_hold - flush stats in @cgrp's subtree and hold
 * @cgrp: target cgroup
 *
 * Flush stats in @cgrp's subtree and prevent further flushes. Must be
 * paired with cgroup_rstat_flush_release().
 *
 * This function may block.
 */
void cgroup_rstat_flush_hold(struct cgroup *cgrp)
	__acquires(&cgroup_rstat_lock)
{
	might_sleep();
	spin_lock_irq(&cgroup_rstat_lock);
	cgroup_rstat_flush_locked(cgrp);
}

/**
 * cgroup_rstat_flush_release - release cgroup_rstat_flush_hold()
 */
void cgroup_rstat_flush_release(void)
	__releases(&cgroup_rstat_lock)
{
	spin_unlock_irq(&cgroup_rstat_lock);
}
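
/*
 * Example: reading several fields consistently against concurrent flushers
 * by holding the lock across the reads, as cgroup_base_stat_cputime_show()
 * below does:
 *
 *	cgroup_rstat_flush_hold(cgrp);
 *	usage = cgrp->bstat.cputime.sum_exec_runtime;
 *	utime = cgrp->bstat.cputime.utime;
 *	cgroup_rstat_flush_release();
 */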

int cgroup_rstat_init(struct cgroup *cgrp)
{
	int cpu;

	/* the root cgrp has rstat_cpu preallocated */
	if (!cgrp->rstat_cpu) {
		cgrp->rstat_cpu = alloc_percpu(struct cgroup_rstat_cpu);
		if (!cgrp->rstat_cpu)
			return -ENOMEM;
	}

	/* ->updated_children list is self terminated */
	for_each_possible_cpu(cpu) {
		struct cgroup_rstat_cpu *rstatc = cgroup_rstat_cpu(cgrp, cpu);

		rstatc->updated_children = cgrp;
		u64_stats_init(&rstatc->bsync);
	}

	return 0;
}

void cgroup_rstat_exit(struct cgroup *cgrp)
{
	int cpu;

	cgroup_rstat_flush(cgrp);

	/* sanity check */
	for_each_possible_cpu(cpu) {
		struct cgroup_rstat_cpu *rstatc = cgroup_rstat_cpu(cgrp, cpu);

		if (WARN_ON_ONCE(rstatc->updated_children != cgrp) ||
		    WARN_ON_ONCE(rstatc->updated_next))
			return;
	}

	free_percpu(cgrp->rstat_cpu);
	cgrp->rstat_cpu = NULL;
}

void __init cgroup_rstat_boot(void)
{
	int cpu;

	for_each_possible_cpu(cpu)
		raw_spin_lock_init(per_cpu_ptr(&cgroup_rstat_cpu_lock, cpu));
}

/*
 * Functions for cgroup basic resource statistics implemented on top of
 * rstat.
 */
static void cgroup_base_stat_add(struct cgroup_base_stat *dst_bstat,
				 struct cgroup_base_stat *src_bstat)
{
	dst_bstat->cputime.utime += src_bstat->cputime.utime;
	dst_bstat->cputime.stime += src_bstat->cputime.stime;
	dst_bstat->cputime.sum_exec_runtime += src_bstat->cputime.sum_exec_runtime;
#ifdef CONFIG_SCHED_CORE
	dst_bstat->forceidle_sum += src_bstat->forceidle_sum;
#endif
}

static void cgroup_base_stat_sub(struct cgroup_base_stat *dst_bstat,
				 struct cgroup_base_stat *src_bstat)
{
	dst_bstat->cputime.utime -= src_bstat->cputime.utime;
	dst_bstat->cputime.stime -= src_bstat->cputime.stime;
	dst_bstat->cputime.sum_exec_runtime -= src_bstat->cputime.sum_exec_runtime;
#ifdef CONFIG_SCHED_CORE
	dst_bstat->forceidle_sum -= src_bstat->forceidle_sum;
#endif
}
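
/*
 * Propagation below uses snapshot deltas: each level remembers the value
 * it last propagated (last_bstat, last_subtree_bstat) and forwards only
 * the difference, so repeated flushes never double-count.
 */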

static void cgroup_base_stat_flush(struct cgroup *cgrp, int cpu)
{
	struct cgroup_rstat_cpu *rstatc = cgroup_rstat_cpu(cgrp, cpu);
	struct cgroup *parent = cgroup_parent(cgrp);
	struct cgroup_rstat_cpu *prstatc;
	struct cgroup_base_stat delta;
	unsigned seq;

	/* Root-level stats are sourced from system-wide CPU stats */
	if (!parent)
		return;

	/* fetch the current per-cpu values */
	do {
		seq = __u64_stats_fetch_begin(&rstatc->bsync);
		delta = rstatc->bstat;
	} while (__u64_stats_fetch_retry(&rstatc->bsync, seq));

	/* propagate per-cpu delta to cgroup and per-cpu global statistics */
	cgroup_base_stat_sub(&delta, &rstatc->last_bstat);
	cgroup_base_stat_add(&cgrp->bstat, &delta);
	cgroup_base_stat_add(&rstatc->last_bstat, &delta);
	cgroup_base_stat_add(&rstatc->subtree_bstat, &delta);

	/* propagate cgroup and per-cpu global delta to parent (unless that's root) */
	if (cgroup_parent(parent)) {
		delta = cgrp->bstat;
		cgroup_base_stat_sub(&delta, &cgrp->last_bstat);
		cgroup_base_stat_add(&parent->bstat, &delta);
		cgroup_base_stat_add(&cgrp->last_bstat, &delta);

		delta = rstatc->subtree_bstat;
		prstatc = cgroup_rstat_cpu(parent, cpu);
		cgroup_base_stat_sub(&delta, &rstatc->last_subtree_bstat);
		cgroup_base_stat_add(&prstatc->subtree_bstat, &delta);
		cgroup_base_stat_add(&rstatc->last_subtree_bstat, &delta);
	}
}

static struct cgroup_rstat_cpu *
cgroup_base_stat_cputime_account_begin(struct cgroup *cgrp, unsigned long *flags)
{
	struct cgroup_rstat_cpu *rstatc;

	rstatc = get_cpu_ptr(cgrp->rstat_cpu);
	*flags = u64_stats_update_begin_irqsave(&rstatc->bsync);
	return rstatc;
}

static void cgroup_base_stat_cputime_account_end(struct cgroup *cgrp,
						 struct cgroup_rstat_cpu *rstatc,
						 unsigned long flags)
{
	u64_stats_update_end_irqrestore(&rstatc->bsync, flags);
	cgroup_rstat_updated(cgrp, smp_processor_id());
	put_cpu_ptr(rstatc);
}
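
/*
 * The begin/end pair above brackets writes to the per-cpu base stats:
 * begin pins the current CPU and opens a u64_stats write section; end
 * closes it, marks @cgrp updated on this CPU and drops the CPU reference.
 * The accounting helpers below follow exactly this pattern.
 */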

void __cgroup_account_cputime(struct cgroup *cgrp, u64 delta_exec)
{
	struct cgroup_rstat_cpu *rstatc;
	unsigned long flags;

	rstatc = cgroup_base_stat_cputime_account_begin(cgrp, &flags);
	rstatc->bstat.cputime.sum_exec_runtime += delta_exec;
	cgroup_base_stat_cputime_account_end(cgrp, rstatc, flags);
}

void __cgroup_account_cputime_field(struct cgroup *cgrp,
				    enum cpu_usage_stat index, u64 delta_exec)
{
	struct cgroup_rstat_cpu *rstatc;
	unsigned long flags;

	rstatc = cgroup_base_stat_cputime_account_begin(cgrp, &flags);

	switch (index) {
	case CPUTIME_USER:
	case CPUTIME_NICE:
		rstatc->bstat.cputime.utime += delta_exec;
		break;
	case CPUTIME_SYSTEM:
	case CPUTIME_IRQ:
	case CPUTIME_SOFTIRQ:
		rstatc->bstat.cputime.stime += delta_exec;
		break;
#ifdef CONFIG_SCHED_CORE
	case CPUTIME_FORCEIDLE:
		rstatc->bstat.forceidle_sum += delta_exec;
		break;
#endif
	default:
		break;
	}

	cgroup_base_stat_cputime_account_end(cgrp, rstatc, flags);
}

/*
 * compute the cputime for the root cgroup by getting the per cpu data
 * at a global level, then categorizing the fields in a manner consistent
 * with how it is done by __cgroup_account_cputime_field for each bit of
 * cpu time attributed to a cgroup.
 */
static void root_cgroup_cputime(struct cgroup_base_stat *bstat)
{
	struct task_cputime *cputime = &bstat->cputime;
	int i;

	memset(bstat, 0, sizeof(*bstat));
	for_each_possible_cpu(i) {
		struct kernel_cpustat kcpustat;
		u64 *cpustat = kcpustat.cpustat;
		u64 user = 0;
		u64 sys = 0;

		kcpustat_cpu_fetch(&kcpustat, i);

		user += cpustat[CPUTIME_USER];
		user += cpustat[CPUTIME_NICE];
		cputime->utime += user;

		sys += cpustat[CPUTIME_SYSTEM];
		sys += cpustat[CPUTIME_IRQ];
		sys += cpustat[CPUTIME_SOFTIRQ];
		cputime->stime += sys;

		cputime->sum_exec_runtime += user;
		cputime->sum_exec_runtime += sys;
		cputime->sum_exec_runtime += cpustat[CPUTIME_STEAL];

#ifdef CONFIG_SCHED_CORE
		bstat->forceidle_sum += cpustat[CPUTIME_FORCEIDLE];
#endif
	}
}

void cgroup_base_stat_cputime_show(struct seq_file *seq)
{
	struct cgroup *cgrp = seq_css(seq)->cgroup;
	u64 usage, utime, stime;
	struct cgroup_base_stat bstat;
#ifdef CONFIG_SCHED_CORE
	u64 forceidle_time;
#endif

	if (cgroup_parent(cgrp)) {
		cgroup_rstat_flush_hold(cgrp);
		usage = cgrp->bstat.cputime.sum_exec_runtime;
		cputime_adjust(&cgrp->bstat.cputime, &cgrp->prev_cputime,
			       &utime, &stime);
#ifdef CONFIG_SCHED_CORE
		forceidle_time = cgrp->bstat.forceidle_sum;
#endif
		cgroup_rstat_flush_release();
	} else {
		root_cgroup_cputime(&bstat);
		usage = bstat.cputime.sum_exec_runtime;
		utime = bstat.cputime.utime;
		stime = bstat.cputime.stime;
#ifdef CONFIG_SCHED_CORE
		forceidle_time = bstat.forceidle_sum;
#endif
	}

	do_div(usage, NSEC_PER_USEC);
	do_div(utime, NSEC_PER_USEC);
	do_div(stime, NSEC_PER_USEC);
#ifdef CONFIG_SCHED_CORE
	do_div(forceidle_time, NSEC_PER_USEC);
#endif

	seq_printf(seq, "usage_usec %llu\n"
		   "user_usec %llu\n"
		   "system_usec %llu\n",
		   usage, utime, stime);

#ifdef CONFIG_SCHED_CORE
	seq_printf(seq, "core_sched.force_idle_usec %llu\n", forceidle_time);
#endif
}

/* Add bpf kfuncs for cgroup_rstat_updated() and cgroup_rstat_flush() */
BTF_SET8_START(bpf_rstat_kfunc_ids)
BTF_ID_FLAGS(func, cgroup_rstat_updated)
BTF_ID_FLAGS(func, cgroup_rstat_flush, KF_SLEEPABLE)
BTF_SET8_END(bpf_rstat_kfunc_ids)

static const struct btf_kfunc_id_set bpf_rstat_kfunc_set = {
	.owner		= THIS_MODULE,
	.set		= &bpf_rstat_kfunc_ids,
};

static int __init bpf_rstat_kfunc_init(void)
{
	return register_btf_kfunc_id_set(BPF_PROG_TYPE_TRACING,
					 &bpf_rstat_kfunc_set);
}
late_initcall(bpf_rstat_kfunc_init);
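
/*
 * Example (sketch): a sleepable BPF tracing program can call these kfuncs
 * after declaring them as kernel symbols in its BPF C source, e.g.:
 *
 *	extern void cgroup_rstat_updated(struct cgroup *cgrp, int cpu) __ksym;
 *	extern void cgroup_rstat_flush(struct cgroup *cgrp) __ksym;
 */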