[linux-block.git] / tools / perf / util / bpf_skel / off_cpu.bpf.c

// SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
// Copyright (c) 2022 Google
#include "vmlinux.h"
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>
#include <bpf/bpf_core_read.h>

/* task->flags for off-cpu analysis */
#define PF_KTHREAD   0x00200000  /* I am a kernel thread */

/* task->state for off-cpu analysis */
#define TASK_INTERRUPTIBLE	0x0001
#define TASK_UNINTERRUPTIBLE	0x0002

#define MAX_STACKS   32
#define MAX_ENTRIES  102400

struct tstamp_data {
	__u32 stack_id;
	__u32 state;
	__u64 timestamp;
};

struct offcpu_key {
	__u32 pid;
	__u32 tgid;
	__u32 stack_id;
	__u32 state;
	__u64 cgroup_id;
};

struct {
	__uint(type, BPF_MAP_TYPE_STACK_TRACE);
	__uint(key_size, sizeof(__u32));
	__uint(value_size, MAX_STACKS * sizeof(__u64));
	__uint(max_entries, MAX_ENTRIES);
} stacks SEC(".maps");

struct {
	__uint(type, BPF_MAP_TYPE_TASK_STORAGE);
	__uint(map_flags, BPF_F_NO_PREALLOC);
	__type(key, int);
	__type(value, struct tstamp_data);
} tstamp SEC(".maps");

struct {
	__uint(type, BPF_MAP_TYPE_HASH);
	__uint(key_size, sizeof(struct offcpu_key));
	__uint(value_size, sizeof(__u64));
	__uint(max_entries, MAX_ENTRIES);
} off_cpu SEC(".maps");

struct {
	__uint(type, BPF_MAP_TYPE_HASH);
	__uint(key_size, sizeof(__u32));
	__uint(value_size, sizeof(__u8));
	__uint(max_entries, 1);
} cpu_filter SEC(".maps");

struct {
	__uint(type, BPF_MAP_TYPE_HASH);
	__uint(key_size, sizeof(__u32));
	__uint(value_size, sizeof(__u8));
	__uint(max_entries, 1);
} task_filter SEC(".maps");

struct {
	__uint(type, BPF_MAP_TYPE_HASH);
	__uint(key_size, sizeof(__u64));
	__uint(value_size, sizeof(__u8));
	__uint(max_entries, 1);
} cgroup_filter SEC(".maps");

/* old kernel task_struct definition */
struct task_struct___old {
	long state;
} __attribute__((preserve_access_index));

int enabled = 0;
int has_cpu = 0;
int has_task = 0;
int has_cgroup = 0;

const volatile bool has_prev_state = false;
const volatile bool needs_cgroup = false;
const volatile bool uses_cgroup_v1 = false;

/*
 * Old kernel used to call it task_struct->state and now it's '__state'.
 * Use BPF CO-RE "ignored suffix rule" to deal with it like below:
 *
 * https://nakryiko.com/posts/bpf-core-reference-guide/#handling-incompatible-field-and-type-changes
 */
static inline int get_task_state(struct task_struct *t)
{
	if (bpf_core_field_exists(t->__state))
		return BPF_CORE_READ(t, __state);

	/* recast pointer to capture task_struct___old type for compiler */
	struct task_struct___old *t_old = (void *)t;

	/* now use old "state" name of the field */
	return BPF_CORE_READ(t_old, state);
}

static inline __u64 get_cgroup_id(struct task_struct *t)
{
	struct cgroup *cgrp;

	if (uses_cgroup_v1)
		cgrp = BPF_CORE_READ(t, cgroups, subsys[perf_event_cgrp_id], cgroup);
	else
		cgrp = BPF_CORE_READ(t, cgroups, dfl_cgrp);

	return BPF_CORE_READ(cgrp, kn, id);
}

static inline int can_record(struct task_struct *t, int state)
{
	/* kernel threads don't have user stack */
	if (t->flags & PF_KTHREAD)
		return 0;

	if (state != TASK_INTERRUPTIBLE &&
	    state != TASK_UNINTERRUPTIBLE)
		return 0;

	if (has_cpu) {
		__u32 cpu = bpf_get_smp_processor_id();
		__u8 *ok;

		ok = bpf_map_lookup_elem(&cpu_filter, &cpu);
		if (!ok)
			return 0;
	}

	if (has_task) {
		__u8 *ok;
		__u32 pid = t->pid;

		ok = bpf_map_lookup_elem(&task_filter, &pid);
		if (!ok)
			return 0;
	}

	if (has_cgroup) {
		__u8 *ok;
		__u64 cgrp_id = get_cgroup_id(t);

		ok = bpf_map_lookup_elem(&cgroup_filter, &cgrp_id);
		if (!ok)
			return 0;
	}

	return 1;
}

static int off_cpu_stat(u64 *ctx, struct task_struct *prev,
			struct task_struct *next, int state)
{
	__u64 ts;
	__u32 stack_id;
	struct tstamp_data *pelem;

	ts = bpf_ktime_get_ns();

	if (!can_record(prev, state))
		goto next;

	stack_id = bpf_get_stackid(ctx, &stacks,
				   BPF_F_FAST_STACK_CMP | BPF_F_USER_STACK);

	pelem = bpf_task_storage_get(&tstamp, prev, NULL,
				     BPF_LOCAL_STORAGE_GET_F_CREATE);
	if (!pelem)
		goto next;

	pelem->timestamp = ts;
	pelem->state = state;
	pelem->stack_id = stack_id;

next:
	pelem = bpf_task_storage_get(&tstamp, next, NULL, 0);

	if (pelem && pelem->timestamp) {
		struct offcpu_key key = {
			.pid = next->pid,
			.tgid = next->tgid,
			.stack_id = pelem->stack_id,
			.state = pelem->state,
			.cgroup_id = needs_cgroup ? get_cgroup_id(next) : 0,
		};
		__u64 delta = ts - pelem->timestamp;
		__u64 *total;

		total = bpf_map_lookup_elem(&off_cpu, &key);
		if (total)
			*total += delta;
		else
			bpf_map_update_elem(&off_cpu, &key, &delta, BPF_ANY);

		/* prevent to reuse the timestamp later */
		pelem->timestamp = 0;
	}

	return 0;
}

SEC("tp_btf/sched_switch")
int on_switch(u64 *ctx)
{
	struct task_struct *prev, *next;
	int prev_state;

	if (!enabled)
		return 0;

	prev = (struct task_struct *)ctx[1];
	next = (struct task_struct *)ctx[2];

	if (has_prev_state)
		prev_state = (int)ctx[3];
	else
		prev_state = get_task_state(prev);

	return off_cpu_stat(ctx, prev, next, prev_state);
}

char LICENSE[] SEC("license") = "Dual BSD/GPL";
Commit	Line	Data
edc41a10 NK	1	// SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
	2	// Copyright (c) 2022 Google
	3	#include "vmlinux.h"
	4	#include <bpf/bpf_helpers.h>
	5	#include <bpf/bpf_tracing.h>
	6	#include <bpf/bpf_core_read.h>
	7
	8	/* task->flags for off-cpu analysis */
	9	#define PF_KTHREAD 0x00200000 /* I am a kernel thread */
	10
	11	/* task->state for off-cpu analysis */
	12	#define TASK_INTERRUPTIBLE 0x0001
	13	#define TASK_UNINTERRUPTIBLE 0x0002
	14
	15	#define MAX_STACKS 32
	16	#define MAX_ENTRIES 102400
	17
	18	struct tstamp_data {
	19	__u32 stack_id;
	20	__u32 state;
	21	__u64 timestamp;
	22	};
	23
	24	struct offcpu_key {
	25	__u32 pid;
	26	__u32 tgid;
	27	__u32 stack_id;
	28	__u32 state;
685439a7	29	__u64 cgroup_id;
edc41a10 NK	30	};
	31
	32	struct {
	33	__uint(type, BPF_MAP_TYPE_STACK_TRACE);
	34	__uint(key_size, sizeof(__u32));
	35	__uint(value_size, MAX_STACKS * sizeof(__u64));
	36	__uint(max_entries, MAX_ENTRIES);
	37	} stacks SEC(".maps");
	38
	39	struct {
	40	__uint(type, BPF_MAP_TYPE_TASK_STORAGE);
	41	__uint(map_flags, BPF_F_NO_PREALLOC);
	42	__type(key, int);
	43	__type(value, struct tstamp_data);
	44	} tstamp SEC(".maps");
	45
	46	struct {
	47	__uint(type, BPF_MAP_TYPE_HASH);
	48	__uint(key_size, sizeof(struct offcpu_key));
	49	__uint(value_size, sizeof(__u64));
	50	__uint(max_entries, MAX_ENTRIES);
	51	} off_cpu SEC(".maps");
	52
10742d0c NK	53	struct {
	54	__uint(type, BPF_MAP_TYPE_HASH);
	55	__uint(key_size, sizeof(__u32));
	56	__uint(value_size, sizeof(__u8));
	57	__uint(max_entries, 1);
	58	} cpu_filter SEC(".maps");
	59
	60	struct {
	61	__uint(type, BPF_MAP_TYPE_HASH);
	62	__uint(key_size, sizeof(__u32));
	63	__uint(value_size, sizeof(__u8));
	64	__uint(max_entries, 1);
	65	} task_filter SEC(".maps");
	66
685439a7 NK	67	struct {
	68	__uint(type, BPF_MAP_TYPE_HASH);
	69	__uint(key_size, sizeof(__u64));
	70	__uint(value_size, sizeof(__u8));
	71	__uint(max_entries, 1);
	72	} cgroup_filter SEC(".maps");
	73
edc41a10 NK	74	/* old kernel task_struct definition */
	75	struct task_struct___old {
	76	long state;
	77	} __attribute__((preserve_access_index));
	78
	79	int enabled = 0;
10742d0c NK	80	int has_cpu = 0;
10742d0c NK	81	int has_task = 0;
685439a7	82	int has_cgroup = 0;
edc41a10	83
b36888f7	84	const volatile bool has_prev_state = false;
685439a7 NK	85	const volatile bool needs_cgroup = false;
685439a7 NK	86	const volatile bool uses_cgroup_v1 = false;
b36888f7	87
edc41a10 NK	88	/*
	89	* Old kernel used to call it task_struct->state and now it's '__state'.
	90	* Use BPF CO-RE "ignored suffix rule" to deal with it like below:
	91	*
	92	* https://nakryiko.com/posts/bpf-core-reference-guide/#handling-incompatible-field-and-type-changes
	93	*/
	94	static inline int get_task_state(struct task_struct *t)
	95	{
	96	if (bpf_core_field_exists(t->__state))
	97	return BPF_CORE_READ(t, __state);
	98
	99	/* recast pointer to capture task_struct___old type for compiler */
	100	struct task_struct___old t_old = (void )t;
	101
	102	/* now use old "state" name of the field */
	103	return BPF_CORE_READ(t_old, state);
	104	}
	105
685439a7 NK	106	static inline __u64 get_cgroup_id(struct task_struct *t)
	107	{
	108	struct cgroup *cgrp;
	109
	110	if (uses_cgroup_v1)
	111	cgrp = BPF_CORE_READ(t, cgroups, subsys[perf_event_cgrp_id], cgroup);
	112	else
	113	cgrp = BPF_CORE_READ(t, cgroups, dfl_cgrp);
	114
	115	return BPF_CORE_READ(cgrp, kn, id);
	116	}
	117
10742d0c NK	118	static inline int can_record(struct task_struct *t, int state)
	119	{
	120	/* kernel threads don't have user stack */
	121	if (t->flags & PF_KTHREAD)
	122	return 0;
	123
	124	if (state != TASK_INTERRUPTIBLE &&
	125	state != TASK_UNINTERRUPTIBLE)
	126	return 0;
	127
	128	if (has_cpu) {
	129	__u32 cpu = bpf_get_smp_processor_id();
	130	__u8 *ok;
	131
	132	ok = bpf_map_lookup_elem(&cpu_filter, &cpu);
	133	if (!ok)
	134	return 0;
	135	}
	136
	137	if (has_task) {
	138	__u8 *ok;
	139	__u32 pid = t->pid;
	140
	141	ok = bpf_map_lookup_elem(&task_filter, &pid);
	142	if (!ok)
	143	return 0;
	144	}
	145
685439a7 NK	146	if (has_cgroup) {
	147	__u8 *ok;
	148	__u64 cgrp_id = get_cgroup_id(t);
	149
	150	ok = bpf_map_lookup_elem(&cgroup_filter, &cgrp_id);
	151	if (!ok)
	152	return 0;
	153	}
	154
10742d0c NK	155	return 1;
	156	}
	157
b36888f7 NK	158	static int off_cpu_stat(u64 ctx, struct task_struct prev,
b36888f7 NK	159	struct task_struct *next, int state)
edc41a10 NK	160	{
edc41a10 NK	161	__u64 ts;
edc41a10	162	__u32 stack_id;
edc41a10 NK	163	struct tstamp_data *pelem;
edc41a10 NK	164
edc41a10 NK	165	ts = bpf_ktime_get_ns();
edc41a10 NK	166
10742d0c	167	if (!can_record(prev, state))
edc41a10 NK	168	goto next;
	169
	170	stack_id = bpf_get_stackid(ctx, &stacks,
	171	BPF_F_FAST_STACK_CMP \| BPF_F_USER_STACK);
	172
	173	pelem = bpf_task_storage_get(&tstamp, prev, NULL,
	174	BPF_LOCAL_STORAGE_GET_F_CREATE);
	175	if (!pelem)
	176	goto next;
	177
	178	pelem->timestamp = ts;
	179	pelem->state = state;
	180	pelem->stack_id = stack_id;
	181
	182	next:
	183	pelem = bpf_task_storage_get(&tstamp, next, NULL, 0);
	184
	185	if (pelem && pelem->timestamp) {
	186	struct offcpu_key key = {
	187	.pid = next->pid,
	188	.tgid = next->tgid,
	189	.stack_id = pelem->stack_id,
	190	.state = pelem->state,
685439a7	191	.cgroup_id = needs_cgroup ? get_cgroup_id(next) : 0,
edc41a10 NK	192	};
	193	__u64 delta = ts - pelem->timestamp;
	194	__u64 *total;
	195
	196	total = bpf_map_lookup_elem(&off_cpu, &key);
	197	if (total)
	198	*total += delta;
	199	else
	200	bpf_map_update_elem(&off_cpu, &key, &delta, BPF_ANY);
	201
	202	/* prevent to reuse the timestamp later */
	203	pelem->timestamp = 0;
	204	}
	205
	206	return 0;
	207	}
	208
b36888f7 NK	209	SEC("tp_btf/sched_switch")
	210	int on_switch(u64 *ctx)
	211	{
	212	struct task_struct prev, next;
	213	int prev_state;
	214
	215	if (!enabled)
	216	return 0;
	217
	218	prev = (struct task_struct *)ctx[1];
	219	next = (struct task_struct *)ctx[2];
	220
	221	if (has_prev_state)
	222	prev_state = (int)ctx[3];
	223	else
	224	prev_state = get_task_state(prev);
	225
	226	return off_cpu_stat(ctx, prev, next, prev_state);
	227	}
	228
edc41a10	229	char LICENSE[] SEC("license") = "Dual BSD/GPL";