/* NOTE(review): extracted from a git-blame view; annotation columns
 * (commits edc41a10, 10742d0c, 685439a7, b36888f7) removed. */
// SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
// Copyright (c) 2022 Google
#include "vmlinux.h"
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>
#include <bpf/bpf_core_read.h>

/* task->flags for off-cpu analysis */
#define PF_KTHREAD 0x00200000 /* I am a kernel thread */

/* task->state for off-cpu analysis */
#define TASK_INTERRUPTIBLE 0x0001
#define TASK_UNINTERRUPTIBLE 0x0002

/* max depth of a saved user stack trace */
#define MAX_STACKS 32
/* capacity of the stack-trace and accounting hash maps */
#define MAX_ENTRIES 102400
/*
 * Per-task scratch state kept in task local storage (the 'tstamp' map)
 * between the sched_switch that takes a task off the CPU and the one
 * that puts it back on.
 */
struct tstamp_data {
	__u32 stack_id;  /* user stack id returned by bpf_get_stackid() */
	__u32 state;     /* task state at switch-out */
	__u64 timestamp; /* bpf_ktime_get_ns() at switch-out; 0 = no open span */
};
23 | ||
/*
 * Aggregation key for the 'off_cpu' map: off-cpu time is summed per
 * (thread, process, user stack, task state, cgroup) tuple.
 */
struct offcpu_key {
	__u32 pid;       /* thread id */
	__u32 tgid;      /* process id */
	__u32 stack_id;  /* index into the 'stacks' map */
	__u32 state;     /* TASK_(UN)INTERRUPTIBLE at switch-out */
	__u64 cgroup_id; /* 0 unless needs_cgroup is set */
};
31 | ||
/* user stack traces, keyed by the stack_id stored in tstamp_data */
struct {
	__uint(type, BPF_MAP_TYPE_STACK_TRACE);
	__uint(key_size, sizeof(__u32));
	__uint(value_size, MAX_STACKS * sizeof(__u64));
	__uint(max_entries, MAX_ENTRIES);
} stacks SEC(".maps");
38 | ||
/* per-task off-cpu bookkeeping (struct tstamp_data), created on demand */
struct {
	__uint(type, BPF_MAP_TYPE_TASK_STORAGE);
	__uint(map_flags, BPF_F_NO_PREALLOC);
	__type(key, int);
	__type(value, struct tstamp_data);
} tstamp SEC(".maps");
45 | ||
/* accumulated off-cpu time in ns, keyed by struct offcpu_key */
struct {
	__uint(type, BPF_MAP_TYPE_HASH);
	__uint(key_size, sizeof(struct offcpu_key));
	__uint(value_size, sizeof(__u64));
	__uint(max_entries, MAX_ENTRIES);
} off_cpu SEC(".maps");
52 | ||
/*
 * cpus to record; a key's mere presence allows the cpu (value unused).
 * Consulted by can_record() only when has_cpu is set.
 * NOTE(review): max_entries of 1 looks like a placeholder presumably
 * resized by user space before load — confirm against the loader.
 */
struct {
	__uint(type, BPF_MAP_TYPE_HASH);
	__uint(key_size, sizeof(__u32));
	__uint(value_size, sizeof(__u8));
	__uint(max_entries, 1);
} cpu_filter SEC(".maps");
59 | ||
/*
 * thread ids to record; presence check only, value unused.
 * Consulted by can_record() only when has_task is set.
 */
struct {
	__uint(type, BPF_MAP_TYPE_HASH);
	__uint(key_size, sizeof(__u32));
	__uint(value_size, sizeof(__u8));
	__uint(max_entries, 1);
} task_filter SEC(".maps");
66 | ||
/*
 * cgroup ids to record; presence check only, value unused.
 * Consulted by can_record() only when has_cgroup is set.
 */
struct {
	__uint(type, BPF_MAP_TYPE_HASH);
	__uint(key_size, sizeof(__u64));
	__uint(value_size, sizeof(__u8));
	__uint(max_entries, 1);
} cgroup_filter SEC(".maps");
73 | ||
/*
 * Shadow of the old task_struct layout, where the scheduler state field
 * was still named 'state' (later renamed '__state').  The ___old suffix
 * is ignored by CO-RE matching, so this relocates against old kernels.
 */
struct task_struct___old {
	long state;
} __attribute__((preserve_access_index));
78 | ||
/* knobs set by the perf tool from user space */
int enabled = 0;    /* start recording when non-zero */
int has_cpu = 0;    /* cpu_filter is populated */
int has_task = 0;   /* task_filter is populated */
int has_cgroup = 0; /* cgroup_filter is populated */

/* rodata: fixed at load time, so the verifier can prune dead branches */
const volatile bool has_prev_state = false; /* tracepoint passes prev_state in ctx[3] */
const volatile bool needs_cgroup = false;   /* record cgroup id in the key */
const volatile bool uses_cgroup_v1 = false; /* cgroup v1 (perf_event subsys) vs v2 */
edc41a10 NK |
88 | /* |
89 | * Old kernel used to call it task_struct->state and now it's '__state'. | |
90 | * Use BPF CO-RE "ignored suffix rule" to deal with it like below: | |
91 | * | |
92 | * https://nakryiko.com/posts/bpf-core-reference-guide/#handling-incompatible-field-and-type-changes | |
93 | */ | |
94 | static inline int get_task_state(struct task_struct *t) | |
95 | { | |
96 | if (bpf_core_field_exists(t->__state)) | |
97 | return BPF_CORE_READ(t, __state); | |
98 | ||
99 | /* recast pointer to capture task_struct___old type for compiler */ | |
100 | struct task_struct___old *t_old = (void *)t; | |
101 | ||
102 | /* now use old "state" name of the field */ | |
103 | return BPF_CORE_READ(t_old, state); | |
104 | } | |
105 | ||
685439a7 NK |
106 | static inline __u64 get_cgroup_id(struct task_struct *t) |
107 | { | |
108 | struct cgroup *cgrp; | |
109 | ||
110 | if (uses_cgroup_v1) | |
111 | cgrp = BPF_CORE_READ(t, cgroups, subsys[perf_event_cgrp_id], cgroup); | |
112 | else | |
113 | cgrp = BPF_CORE_READ(t, cgroups, dfl_cgrp); | |
114 | ||
115 | return BPF_CORE_READ(cgrp, kn, id); | |
116 | } | |
117 | ||
10742d0c NK |
118 | static inline int can_record(struct task_struct *t, int state) |
119 | { | |
120 | /* kernel threads don't have user stack */ | |
121 | if (t->flags & PF_KTHREAD) | |
122 | return 0; | |
123 | ||
124 | if (state != TASK_INTERRUPTIBLE && | |
125 | state != TASK_UNINTERRUPTIBLE) | |
126 | return 0; | |
127 | ||
128 | if (has_cpu) { | |
129 | __u32 cpu = bpf_get_smp_processor_id(); | |
130 | __u8 *ok; | |
131 | ||
132 | ok = bpf_map_lookup_elem(&cpu_filter, &cpu); | |
133 | if (!ok) | |
134 | return 0; | |
135 | } | |
136 | ||
137 | if (has_task) { | |
138 | __u8 *ok; | |
139 | __u32 pid = t->pid; | |
140 | ||
141 | ok = bpf_map_lookup_elem(&task_filter, &pid); | |
142 | if (!ok) | |
143 | return 0; | |
144 | } | |
145 | ||
685439a7 NK |
146 | if (has_cgroup) { |
147 | __u8 *ok; | |
148 | __u64 cgrp_id = get_cgroup_id(t); | |
149 | ||
150 | ok = bpf_map_lookup_elem(&cgroup_filter, &cgrp_id); | |
151 | if (!ok) | |
152 | return 0; | |
153 | } | |
154 | ||
10742d0c NK |
155 | return 1; |
156 | } | |
157 | ||
/*
 * Common sched_switch handler.  Two independent phases:
 *   1. switch-out: if @prev passes the filters, stamp its task storage
 *      with the current time, its user stack id and @state;
 *   2. switch-in: if @next has an open span in its task storage, add
 *      the elapsed time into the off_cpu map under its offcpu_key.
 * Always returns 0.
 */
static int off_cpu_stat(u64 *ctx, struct task_struct *prev,
			struct task_struct *next, int state)
{
	__u64 ts;
	__u32 stack_id;
	struct tstamp_data *pelem;

	ts = bpf_ktime_get_ns();

	/* phase 1: open a span for prev, unless filtered out */
	if (!can_record(prev, state))
		goto next;

	stack_id = bpf_get_stackid(ctx, &stacks,
				   BPF_F_FAST_STACK_CMP | BPF_F_USER_STACK);

	/* create the storage on first use for this task */
	pelem = bpf_task_storage_get(&tstamp, prev, NULL,
				     BPF_LOCAL_STORAGE_GET_F_CREATE);
	if (!pelem)
		goto next;

	pelem->timestamp = ts;
	pelem->state = state;
	pelem->stack_id = stack_id;

next:
	/* phase 2: close next's span if one was opened at its switch-out */
	pelem = bpf_task_storage_get(&tstamp, next, NULL, 0);

	if (pelem && pelem->timestamp) {
		struct offcpu_key key = {
			.pid = next->pid,
			.tgid = next->tgid,
			.stack_id = pelem->stack_id,
			.state = pelem->state,
			.cgroup_id = needs_cgroup ? get_cgroup_id(next) : 0,
		};
		__u64 delta = ts - pelem->timestamp;
		__u64 *total;

		/* accumulate into an existing entry or insert a new one */
		total = bpf_map_lookup_elem(&off_cpu, &key);
		if (total)
			*total += delta;
		else
			bpf_map_update_elem(&off_cpu, &key, &delta, BPF_ANY);

		/* prevent to reuse the timestamp later */
		pelem->timestamp = 0;
	}

	return 0;
}
208 | ||
b36888f7 NK |
209 | SEC("tp_btf/sched_switch") |
210 | int on_switch(u64 *ctx) | |
211 | { | |
212 | struct task_struct *prev, *next; | |
213 | int prev_state; | |
214 | ||
215 | if (!enabled) | |
216 | return 0; | |
217 | ||
218 | prev = (struct task_struct *)ctx[1]; | |
219 | next = (struct task_struct *)ctx[2]; | |
220 | ||
221 | if (has_prev_state) | |
222 | prev_state = (int)ctx[3]; | |
223 | else | |
224 | prev_state = get_task_state(prev); | |
225 | ||
226 | return off_cpu_stat(ctx, prev, next, prev_state); | |
227 | } | |
228 | ||
/* license string checked by the kernel at program load time */
char LICENSE[] SEC("license") = "Dual BSD/GPL";