perf record: Add cgroup support for off-cpu profiling
tools/perf/util/bpf_skel/off_cpu.bpf.c
// SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
// Copyright (c) 2022 Google
#include "vmlinux.h"
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>
#include <bpf/bpf_core_read.h>

/* task->flags for off-cpu analysis */
#define PF_KTHREAD	0x00200000	/* I am a kernel thread */

/* task->state for off-cpu analysis */
#define TASK_INTERRUPTIBLE	0x0001
#define TASK_UNINTERRUPTIBLE	0x0002

#define MAX_STACKS	32
#define MAX_ENTRIES	102400

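/*
 * Per-task scratch data: when a task is switched out, remember when it
 * went off-cpu, the state it left in, and the id of its user stack.
 */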
struct tstamp_data {
	__u32 stack_id;
	__u32 state;
	__u64 timestamp;
};

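/*
 * Aggregation key: total off-cpu time is accumulated per task, user
 * stack trace, task state and (optionally) cgroup.
 */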
struct offcpu_key {
	__u32 pid;
	__u32 tgid;
	__u32 stack_id;
	__u32 state;
	__u64 cgroup_id;
};

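/* user stack traces, keyed by the stack id from bpf_get_stackid() */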
struct {
	__uint(type, BPF_MAP_TYPE_STACK_TRACE);
	__uint(key_size, sizeof(__u32));
	__uint(value_size, MAX_STACKS * sizeof(__u64));
	__uint(max_entries, MAX_ENTRIES);
} stacks SEC(".maps");

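/* per-task storage for the sched-out data (struct tstamp_data) */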
struct {
	__uint(type, BPF_MAP_TYPE_TASK_STORAGE);
	__uint(map_flags, BPF_F_NO_PREALLOC);
	__type(key, int);
	__type(value, struct tstamp_data);
} tstamp SEC(".maps");

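/* accumulated off-cpu time in nanoseconds, keyed by struct offcpu_key */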
struct {
	__uint(type, BPF_MAP_TYPE_HASH);
	__uint(key_size, sizeof(struct offcpu_key));
	__uint(value_size, sizeof(__u64));
	__uint(max_entries, MAX_ENTRIES);
} off_cpu SEC(".maps");

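/*
 * Optional filters, filled by the perf tool when the user restricts
 * profiling to particular CPUs, tasks or cgroups.
 */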
struct {
	__uint(type, BPF_MAP_TYPE_HASH);
	__uint(key_size, sizeof(__u32));
	__uint(value_size, sizeof(__u8));
	__uint(max_entries, 1);
} cpu_filter SEC(".maps");

struct {
	__uint(type, BPF_MAP_TYPE_HASH);
	__uint(key_size, sizeof(__u32));
	__uint(value_size, sizeof(__u8));
	__uint(max_entries, 1);
} task_filter SEC(".maps");

struct {
	__uint(type, BPF_MAP_TYPE_HASH);
	__uint(key_size, sizeof(__u64));
	__uint(value_size, sizeof(__u8));
	__uint(max_entries, 1);
} cgroup_filter SEC(".maps");

/* old kernel task_struct definition */
struct task_struct___old {
	long state;
} __attribute__((preserve_access_index));

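/*
 * Control knobs: plain globals can be updated by the perf tool at
 * runtime (e.g. to start and stop recording), while the const volatile
 * ones live in .rodata and must be set before the program is loaded.
 */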
int enabled = 0;
int has_cpu = 0;
int has_task = 0;
int has_cgroup = 0;

const volatile bool has_prev_state = false;
const volatile bool needs_cgroup = false;
const volatile bool uses_cgroup_v1 = false;

/*
 * Old kernels called this field task_struct->state; it has since been
 * renamed to '__state'.  Use the BPF CO-RE "ignored suffix rule" to
 * handle both, as described in:
 *
 * https://nakryiko.com/posts/bpf-core-reference-guide/#handling-incompatible-field-and-type-changes
 */
static inline int get_task_state(struct task_struct *t)
{
	if (bpf_core_field_exists(t->__state))
		return BPF_CORE_READ(t, __state);

	/* recast the pointer to capture the task_struct___old type for the compiler */
	struct task_struct___old *t_old = (void *)t;

	/* now use the old "state" name of the field */
	return BPF_CORE_READ(t_old, state);
}

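/*
 * Look up the task's cgroup id: the perf_event subsystem's cgroup on a
 * v1 hierarchy, or the default hierarchy's cgroup on v2.  Either way
 * the id comes from the cgroup's kernfs node.
 */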
static inline __u64 get_cgroup_id(struct task_struct *t)
{
	struct cgroup *cgrp;

	if (uses_cgroup_v1)
		cgrp = BPF_CORE_READ(t, cgroups, subsys[perf_event_cgrp_id], cgroup);
	else
		cgrp = BPF_CORE_READ(t, cgroups, dfl_cgrp);

	return BPF_CORE_READ(cgrp, kn, id);
}

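/*
 * Decide whether this sched-out should be recorded: only user tasks
 * that are actually sleeping (S or D state) pass, and then only if
 * they match the optional CPU, task and cgroup filters.
 */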
static inline int can_record(struct task_struct *t, int state)
{
	/* kernel threads don't have a user stack */
	if (t->flags & PF_KTHREAD)
		return 0;

	if (state != TASK_INTERRUPTIBLE &&
	    state != TASK_UNINTERRUPTIBLE)
		return 0;

	if (has_cpu) {
		__u32 cpu = bpf_get_smp_processor_id();
		__u8 *ok;

		ok = bpf_map_lookup_elem(&cpu_filter, &cpu);
		if (!ok)
			return 0;
	}

	if (has_task) {
		__u8 *ok;
		__u32 pid = t->pid;

		ok = bpf_map_lookup_elem(&task_filter, &pid);
		if (!ok)
			return 0;
	}

	if (has_cgroup) {
		__u8 *ok;
		__u64 cgrp_id = get_cgroup_id(t);

		ok = bpf_map_lookup_elem(&cgroup_filter, &cgrp_id);
		if (!ok)
			return 0;
	}

	return 1;
}

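/*
 * Handle one context switch in two steps: record the timestamp and
 * user stack of @prev as it goes off-cpu, then, if @next was seen
 * going off-cpu earlier, account the elapsed time to its offcpu_key.
 */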
static int off_cpu_stat(u64 *ctx, struct task_struct *prev,
			struct task_struct *next, int state)
{
	__u64 ts;
	__u32 stack_id;
	struct tstamp_data *pelem;

	ts = bpf_ktime_get_ns();

	if (!can_record(prev, state))
		goto next;

	stack_id = bpf_get_stackid(ctx, &stacks,
				   BPF_F_FAST_STACK_CMP | BPF_F_USER_STACK);

	pelem = bpf_task_storage_get(&tstamp, prev, NULL,
				     BPF_LOCAL_STORAGE_GET_F_CREATE);
	if (!pelem)
		goto next;

	pelem->timestamp = ts;
	pelem->state = state;
	pelem->stack_id = stack_id;

next:
	pelem = bpf_task_storage_get(&tstamp, next, NULL, 0);

	if (pelem && pelem->timestamp) {
		struct offcpu_key key = {
			.pid = next->pid,
			.tgid = next->tgid,
			.stack_id = pelem->stack_id,
			.state = pelem->state,
			.cgroup_id = needs_cgroup ? get_cgroup_id(next) : 0,
		};
		__u64 delta = ts - pelem->timestamp;
		__u64 *total;

		total = bpf_map_lookup_elem(&off_cpu, &key);
		if (total)
			*total += delta;
		else
			bpf_map_update_elem(&off_cpu, &key, &delta, BPF_ANY);

		/* prevent the timestamp from being reused later */
		pelem->timestamp = 0;
	}

	return 0;
}

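/*
 * A tp_btf program receives the raw tracepoint arguments as an array
 * of u64: ctx[0] is the 'preempt' flag, ctx[1] and ctx[2] are the
 * previous and next tasks.  Newer kernels also pass the previous
 * task's state as ctx[3]; the perf tool sets has_prev_state when the
 * running kernel does so.
 */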
SEC("tp_btf/sched_switch")
int on_switch(u64 *ctx)
{
	struct task_struct *prev, *next;
	int prev_state;

	if (!enabled)
		return 0;

	prev = (struct task_struct *)ctx[1];
	next = (struct task_struct *)ctx[2];

	if (has_prev_state)
		prev_state = (int)ctx[3];
	else
		prev_state = get_task_state(prev);

	return off_cpu_stat(ctx, prev, next, prev_state);
}

char LICENSE[] SEC("license") = "Dual BSD/GPL";
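
/*
 * For illustration only (not part of the original file): a minimal
 * sketch of how a userspace loader could drive this program through a
 * libbpf-generated skeleton ("off_cpu.skel.h", produced by bpftool gen
 * skeleton).  Field names follow the usual skeleton layout, and
 * kernel_has_prev_state() stands in for whatever feature probe the
 * tool uses; the real perf logic lives in tools/perf/util/bpf_off_cpu.c.
 *
 *	#include "off_cpu.skel.h"
 *
 *	struct off_cpu_bpf *skel = off_cpu_bpf__open();
 *
 *	// .rodata values must be set before load so the verifier can
 *	// prune the branches that will never run on this kernel.
 *	skel->rodata->has_prev_state = kernel_has_prev_state();
 *	skel->rodata->needs_cgroup = true;
 *
 *	off_cpu_bpf__load(skel);
 *	off_cpu_bpf__attach(skel);
 *
 *	skel->bss->enabled = 1;		// start recording
 *	// ... workload runs ...
 *	skel->bss->enabled = 0;		// stop, then drain the off_cpu map
 */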