// SPDX-License-Identifier: GPL-2.0-only
#include "cgroup-internal.h"

#include <linux/sched/cputime.h>

#include <linux/bpf.h>
#include <linux/btf.h>
#include <linux/btf_ids.h>

#include <trace/events/cgroup.h>

static DEFINE_SPINLOCK(rstat_base_lock);
static DEFINE_PER_CPU(raw_spinlock_t, rstat_base_cpu_lock);

static void cgroup_base_stat_flush(struct cgroup *cgrp, int cpu);

/*
 * Determines whether a given css can participate in rstat.
 * css's that are cgroup::self use rstat for base stats.
 * Other css's associated with a subsystem use rstat only when
 * they define the ss->css_rstat_flush callback.
 */
static inline bool css_uses_rstat(struct cgroup_subsys_state *css)
{
	return css_is_self(css) || css->ss->css_rstat_flush != NULL;
}

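/*
 * Accessors for the per-cpu rstat bookkeeping set up in css_rstat_init():
 * the generic per-css state and, for cgroup::self, the base-stat counters.
 */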
static struct css_rstat_cpu *css_rstat_cpu(
		struct cgroup_subsys_state *css, int cpu)
{
	return per_cpu_ptr(css->rstat_cpu, cpu);
}

static struct cgroup_rstat_base_cpu *cgroup_rstat_base_cpu(
		struct cgroup *cgrp, int cpu)
{
	return per_cpu_ptr(cgrp->rstat_base_cpu, cpu);
}

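/*
 * Lock selection helpers: a subsystem that implements css_rstat_flush uses
 * its own rstat_ss_lock and per-cpu locks, while base stats (cgroup::self)
 * fall back to the static rstat_base_lock and rstat_base_cpu_lock.
 */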
static spinlock_t *ss_rstat_lock(struct cgroup_subsys *ss)
{
	if (ss)
		return &ss->rstat_ss_lock;

	return &rstat_base_lock;
}

static raw_spinlock_t *ss_rstat_cpu_lock(struct cgroup_subsys *ss, int cpu)
{
	if (ss) {
		/*
		 * Depending on config, the subsystem per-cpu lock type may be an
		 * empty struct. In environments where this is the case, allocation
		 * of this field is not performed in ss_rstat_init(). Avoid a
		 * cpu-based offset relative to NULL by returning early. When the
		 * lock type is zero in size, the corresponding lock functions are
		 * no-ops so passing them NULL is acceptable.
		 */
		if (sizeof(*ss->rstat_ss_cpu_lock) == 0)
			return NULL;

		return per_cpu_ptr(ss->rstat_ss_cpu_lock, cpu);
	}

	return per_cpu_ptr(&rstat_base_cpu_lock, cpu);
}

/*
 * Helper functions for rstat per CPU locks.
 *
 * This makes it easier to diagnose locking issues and contention in
 * production environments. The parameter @fast_path determines the
 * tracepoints being added, allowing us to diagnose "flush" related
 * operations without handling high-frequency fast-path "update" events.
 */
static __always_inline
unsigned long _css_rstat_cpu_lock(struct cgroup_subsys_state *css, int cpu,
				  const bool fast_path)
{
	struct cgroup *cgrp = css->cgroup;
	raw_spinlock_t *cpu_lock;
	unsigned long flags;
	bool contended;

	/*
	 * The _irqsave() is needed because the locks used for flushing are
	 * spinlock_t which is a sleeping lock on PREEMPT_RT. Acquiring this lock
	 * with the _irq() suffix only disables interrupts on a non-PREEMPT_RT
	 * kernel. The raw_spinlock_t below disables interrupts on both
	 * configurations. The _irqsave() ensures that interrupts are always
	 * disabled and later restored.
	 */
	cpu_lock = ss_rstat_cpu_lock(css->ss, cpu);
	contended = !raw_spin_trylock_irqsave(cpu_lock, flags);
	if (contended) {
		if (fast_path)
			trace_cgroup_rstat_cpu_lock_contended_fastpath(cgrp, cpu, contended);
		else
			trace_cgroup_rstat_cpu_lock_contended(cgrp, cpu, contended);

		raw_spin_lock_irqsave(cpu_lock, flags);
	}

	if (fast_path)
		trace_cgroup_rstat_cpu_locked_fastpath(cgrp, cpu, contended);
	else
		trace_cgroup_rstat_cpu_locked(cgrp, cpu, contended);

	return flags;
}

static __always_inline
void _css_rstat_cpu_unlock(struct cgroup_subsys_state *css, int cpu,
			   unsigned long flags, const bool fast_path)
{
	struct cgroup *cgrp = css->cgroup;
	raw_spinlock_t *cpu_lock;

	if (fast_path)
		trace_cgroup_rstat_cpu_unlock_fastpath(cgrp, cpu, false);
	else
		trace_cgroup_rstat_cpu_unlock(cgrp, cpu, false);

	cpu_lock = ss_rstat_cpu_lock(css->ss, cpu);
	raw_spin_unlock_irqrestore(cpu_lock, flags);
}

/**
 * css_rstat_updated - keep track of updated rstat_cpu
 * @css: target cgroup subsystem state
 * @cpu: cpu on which rstat_cpu was updated
 *
 * @css's rstat_cpu on @cpu was updated. Put it on the parent's matching
 * rstat_cpu->updated_children list. See the comment on top of
 * css_rstat_cpu definition for details.
 */
__bpf_kfunc void css_rstat_updated(struct cgroup_subsys_state *css, int cpu)
{
	unsigned long flags;

	/*
	 * Since bpf programs can call this function, prevent access to
	 * uninitialized rstat pointers.
	 */
	if (!css_uses_rstat(css))
		return;

	/*
	 * Speculative already-on-list test. This may race leading to
	 * temporary inaccuracies, which is fine.
	 *
	 * Because @parent's updated_children is terminated with @parent
	 * instead of NULL, we can tell whether @css is on the list by
	 * testing the next pointer for NULL.
	 */
	if (data_race(css_rstat_cpu(css, cpu)->updated_next))
		return;

	flags = _css_rstat_cpu_lock(css, cpu, true);

	/* put @css and all ancestors on the corresponding updated lists */
	while (true) {
		struct css_rstat_cpu *rstatc = css_rstat_cpu(css, cpu);
		struct cgroup_subsys_state *parent = css->parent;
		struct css_rstat_cpu *prstatc;

		/*
		 * Both additions and removals are bottom-up. If a cgroup
		 * is already in the tree, all ancestors are.
		 */
		if (rstatc->updated_next)
			break;

		/* Root has no parent to link it to, but mark it busy */
		if (!parent) {
			rstatc->updated_next = css;
			break;
		}

		prstatc = css_rstat_cpu(parent, cpu);
		rstatc->updated_next = prstatc->updated_children;
		prstatc->updated_children = css;

		css = parent;
	}

	_css_rstat_cpu_unlock(css, cpu, flags, true);
}

/**
 * css_rstat_push_children - push children css's into the given list
 * @head: current head of the list (= subtree root)
 * @child: first child of the root
 * @cpu: target cpu
 * Return: A new singly linked list of css's to be flushed
 *
 * Iteratively traverse down the css_rstat_cpu updated tree level by
 * level and push all the parents first before their next level children
 * into a singly linked list via the rstat_flush_next pointer built from the
 * tail backward like "pushing" css's into a stack. The root is pushed by
 * the caller.
 */
static struct cgroup_subsys_state *css_rstat_push_children(
		struct cgroup_subsys_state *head,
		struct cgroup_subsys_state *child, int cpu)
{
	struct cgroup_subsys_state *cnext = child;	/* Next head of child css level */
	struct cgroup_subsys_state *ghead = NULL;	/* Head of grandchild css level */
	struct cgroup_subsys_state *parent, *grandchild;
	struct css_rstat_cpu *crstatc;

	child->rstat_flush_next = NULL;

	/*
	 * The subsystem rstat lock must be held for the whole duration from
	 * here as the rstat_flush_next list is being constructed to when
	 * it is consumed later in css_rstat_flush().
	 */
	lockdep_assert_held(ss_rstat_lock(head->ss));

	/*
	 * Notation: -> updated_next pointer
	 *	     => rstat_flush_next pointer
	 *
	 * Assuming the following sample updated_children lists:
	 *  P: C1 -> C2 -> P
	 *  C1: G11 -> G12 -> C1
	 *  C2: G21 -> G22 -> C2
	 *
	 * After 1st iteration:
	 *  head => C2 => C1 => NULL
	 *  ghead => G21 => G11 => NULL
	 *
	 * After 2nd iteration:
	 *  head => G12 => G11 => G22 => G21 => C2 => C1 => NULL
	 */
next_level:
	while (cnext) {
		child = cnext;
		cnext = child->rstat_flush_next;
		parent = child->parent;

		/* updated_next is parent cgroup terminated if !NULL */
		while (child != parent) {
			child->rstat_flush_next = head;
			head = child;
			crstatc = css_rstat_cpu(child, cpu);
			grandchild = crstatc->updated_children;
			if (grandchild != child) {
				/* Push the grand child to the next level */
				crstatc->updated_children = child;
				grandchild->rstat_flush_next = ghead;
				ghead = grandchild;
			}
			child = crstatc->updated_next;
			crstatc->updated_next = NULL;
		}
	}

	if (ghead) {
		cnext = ghead;
		ghead = NULL;
		goto next_level;
	}
	return head;
}

5da3bfa0 JK |
269 | * css_rstat_updated_list - build a list of updated css's to be flushed |
270 | * @root: root of the css subtree to traverse | |
d499fd41 | 271 | * @cpu: target cpu |
5da3bfa0 | 272 | * Return: A singly linked list of css's to be flushed |
d499fd41 WL |
273 | * |
274 | * Walks the updated rstat_cpu tree on @cpu from @root. During traversal, | |
5da3bfa0 | 275 | * each returned css is unlinked from the updated tree. |
041cd640 TH |
276 | * |
277 | * The only ordering guarantee is that, for a parent and a child pair | |
d499fd41 WL |
278 | * covered by a given traversal, the child is before its parent in |
279 | * the list. | |
280 | * | |
281 | * Note that updated_children is self terminated and points to a list of | |
5da3bfa0 JK |
282 | * child css's if not empty. Whereas updated_next is like a sibling link |
283 | * within the children list and terminated by the parent css. An exception | |
284 | * here is the css root whose updated_next can be self terminated. | |
041cd640 | 285 | */ |
5da3bfa0 JK |
286 | static struct cgroup_subsys_state *css_rstat_updated_list( |
287 | struct cgroup_subsys_state *root, int cpu) | |
041cd640 | 288 | { |
5da3bfa0 JK |
289 | struct css_rstat_cpu *rstatc = css_rstat_cpu(root, cpu); |
290 | struct cgroup_subsys_state *head = NULL, *parent, *child; | |
d499fd41 | 291 | unsigned long flags; |
041cd640 | 292 | |
748922dc | 293 | flags = _css_rstat_cpu_lock(root, cpu, false); |
041cd640 | 294 | |
d499fd41 WL |
295 | /* Return NULL if this subtree is not on-list */ |
296 | if (!rstatc->updated_next) | |
297 | goto unlock_ret; | |
041cd640 TH |
298 | |
299 | /* | |
d499fd41 | 300 | * Unlink @root from its parent. As the updated_children list is |
041cd640 | 301 | * singly linked, we have to walk it to find the removal point. |
041cd640 | 302 | */ |
5da3bfa0 | 303 | parent = root->parent; |
f5f60d23 | 304 | if (parent) { |
5da3bfa0 JK |
305 | struct css_rstat_cpu *prstatc; |
306 | struct cgroup_subsys_state **nextp; | |
041cd640 | 307 | |
5da3bfa0 | 308 | prstatc = css_rstat_cpu(parent, cpu); |
f5f60d23 | 309 | nextp = &prstatc->updated_children; |
d499fd41 | 310 | while (*nextp != root) { |
5da3bfa0 | 311 | struct css_rstat_cpu *nrstatc; |
f5f60d23 | 312 | |
5da3bfa0 | 313 | nrstatc = css_rstat_cpu(*nextp, cpu); |
f5f60d23 WY |
314 | WARN_ON_ONCE(*nextp == parent); |
315 | nextp = &nrstatc->updated_next; | |
316 | } | |
317 | *nextp = rstatc->updated_next; | |
041cd640 TH |
318 | } |
319 | ||
f5f60d23 | 320 | rstatc->updated_next = NULL; |
e76d28bd | 321 | |
d499fd41 WL |
322 | /* Push @root to the list first before pushing the children */ |
323 | head = root; | |
324 | root->rstat_flush_next = NULL; | |
325 | child = rstatc->updated_children; | |
326 | rstatc->updated_children = root; | |
327 | if (child != root) | |
5da3bfa0 | 328 | head = css_rstat_push_children(head, child, cpu); |
d499fd41 | 329 | unlock_ret: |
748922dc | 330 | _css_rstat_cpu_unlock(root, cpu, flags, false); |
e76d28bd | 331 | return head; |
041cd640 TH |
332 | } |
333 | ||
/*
 * A hook for bpf stat collectors to attach to and flush their stats.
 * Together with providing bpf kfuncs for css_rstat_updated() and
 * css_rstat_flush(), this enables a complete workflow where bpf progs that
 * collect cgroup stats can integrate with rstat for efficient flushing.
 *
 * A static noinline declaration here could cause the compiler to optimize away
 * the function. A global noinline declaration will keep the definition, but may
 * optimize away the callsite. Therefore, __weak is needed to ensure that the
 * call is still emitted, by telling the compiler that we don't know what the
 * function might eventually be.
 */

__bpf_hook_start();

__weak noinline void bpf_rstat_flush(struct cgroup *cgrp,
				     struct cgroup *parent, int cpu)
{
}

__bpf_hook_end();
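
/*
 * Illustrative sketch (not part of this file): a BPF tracing program built
 * with libbpf can attach to the hook above and do its own per-cgroup
 * flushing, roughly along these lines. The program and map names below are
 * hypothetical.
 *
 *	SEC("fentry/bpf_rstat_flush")
 *	int BPF_PROG(my_flush, struct cgroup *cgrp, struct cgroup *parent, int cpu)
 *	{
 *		// fold this CPU's counters for @cgrp into its aggregate and
 *		// propagate the delta to @parent, mirroring what the base
 *		// stats do in cgroup_base_stat_flush()
 *		return 0;
 *	}
 */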

/*
 * Helper functions for locking.
 *
 * This makes it easier to diagnose locking issues and contention in
 * production environments. The parameter @cpu_in_loop indicates that the
 * lock was released and re-taken when collecting data from the CPUs. The
 * value -1 is used when obtaining the main lock else this is the CPU
 * number processed last.
 */
static inline void __css_rstat_lock(struct cgroup_subsys_state *css,
				    int cpu_in_loop)
	__acquires(ss_rstat_lock(css->ss))
{
	struct cgroup *cgrp = css->cgroup;
	spinlock_t *lock;
	bool contended;

	lock = ss_rstat_lock(css->ss);
	contended = !spin_trylock_irq(lock);
	if (contended) {
		trace_cgroup_rstat_lock_contended(cgrp, cpu_in_loop, contended);
		spin_lock_irq(lock);
	}
	trace_cgroup_rstat_locked(cgrp, cpu_in_loop, contended);
}

static inline void __css_rstat_unlock(struct cgroup_subsys_state *css,
				      int cpu_in_loop)
	__releases(ss_rstat_lock(css->ss))
{
	struct cgroup *cgrp = css->cgroup;
	spinlock_t *lock;

	lock = ss_rstat_lock(css->ss);
	trace_cgroup_rstat_unlock(cgrp, cpu_in_loop, false);
	spin_unlock_irq(lock);
}

/**
 * css_rstat_flush - flush stats in @css's rstat subtree
 * @css: target cgroup subsystem state
 *
 * Collect all per-cpu stats in @css's subtree into the global counters
 * and propagate them upwards. After this function returns, all rstat
 * nodes in the subtree have up-to-date ->stat.
 *
 * This also gets all rstat nodes in the subtree including @css off the
 * ->updated_children lists.
 *
 * This function may block.
 */
__bpf_kfunc void css_rstat_flush(struct cgroup_subsys_state *css)
{
	int cpu;
	bool is_self = css_is_self(css);

	/*
	 * Since bpf programs can call this function, prevent access to
	 * uninitialized rstat pointers.
	 */
	if (!css_uses_rstat(css))
		return;

	might_sleep();
	for_each_possible_cpu(cpu) {
		struct cgroup_subsys_state *pos;

		/* Reacquire for each CPU to avoid disabling IRQs too long */
		__css_rstat_lock(css, cpu);
		pos = css_rstat_updated_list(css, cpu);
		for (; pos; pos = pos->rstat_flush_next) {
			if (is_self) {
				cgroup_base_stat_flush(pos->cgroup, cpu);
				bpf_rstat_flush(pos->cgroup,
						cgroup_parent(pos->cgroup), cpu);
			} else
				pos->ss->css_rstat_flush(pos, cpu);
		}
		__css_rstat_unlock(css, cpu);
		if (!cond_resched())
			cpu_relax();
	}
}

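/*
 * Allocate @css's per-cpu rstat bookkeeping; for cgroup::self this also
 * covers the base-stat counters. css's whose subsystem does not implement
 * css_rstat_flush need no state and simply return 0.
 */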
int css_rstat_init(struct cgroup_subsys_state *css)
{
	struct cgroup *cgrp = css->cgroup;
	int cpu;
	bool is_self = css_is_self(css);

	if (is_self) {
		/* the root cgrp has rstat_base_cpu preallocated */
		if (!cgrp->rstat_base_cpu) {
			cgrp->rstat_base_cpu = alloc_percpu(struct cgroup_rstat_base_cpu);
			if (!cgrp->rstat_base_cpu)
				return -ENOMEM;
		}
	} else if (css->ss->css_rstat_flush == NULL)
		return 0;

	/* the root cgrp's self css has rstat_cpu preallocated */
	if (!css->rstat_cpu) {
		css->rstat_cpu = alloc_percpu(struct css_rstat_cpu);
		if (!css->rstat_cpu) {
			if (is_self)
				free_percpu(cgrp->rstat_base_cpu);

			return -ENOMEM;
		}
	}

	/* ->updated_children list is self terminated */
	for_each_possible_cpu(cpu) {
		struct css_rstat_cpu *rstatc = css_rstat_cpu(css, cpu);

		rstatc->updated_children = css;

		if (is_self) {
			struct cgroup_rstat_base_cpu *rstatbc;

			rstatbc = cgroup_rstat_base_cpu(cgrp, cpu);
			u64_stats_init(&rstatbc->bsync);
		}
	}

	return 0;
}

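/*
 * Flush any pending stats and free @css's per-cpu rstat state. The
 * WARN_ON_ONCE checks below verify that the css is no longer on any
 * per-cpu updated list before its memory is released.
 */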
void css_rstat_exit(struct cgroup_subsys_state *css)
{
	int cpu;

	if (!css_uses_rstat(css))
		return;

	css_rstat_flush(css);

	/* sanity check */
	for_each_possible_cpu(cpu) {
		struct css_rstat_cpu *rstatc = css_rstat_cpu(css, cpu);

		if (WARN_ON_ONCE(rstatc->updated_children != css) ||
		    WARN_ON_ONCE(rstatc->updated_next))
			return;
	}

	if (css_is_self(css)) {
		struct cgroup *cgrp = css->cgroup;

		free_percpu(cgrp->rstat_base_cpu);
		cgrp->rstat_base_cpu = NULL;
	}

	free_percpu(css->rstat_cpu);
	css->rstat_cpu = NULL;
}

/**
 * ss_rstat_init - subsystem-specific rstat initialization
 * @ss: target subsystem
 *
 * If @ss is NULL, the static locks associated with the base stats
 * are initialized. If @ss is non-NULL, the subsystem-specific locks
 * are initialized.
 */
int __init ss_rstat_init(struct cgroup_subsys *ss)
{
	int cpu;

	/*
	 * Depending on config, the subsystem per-cpu lock type may be an empty
	 * struct. Avoid allocating a size of zero in this case.
	 */
	if (ss && sizeof(*ss->rstat_ss_cpu_lock)) {
		ss->rstat_ss_cpu_lock = alloc_percpu(raw_spinlock_t);
		if (!ss->rstat_ss_cpu_lock)
			return -ENOMEM;
	}

	spin_lock_init(ss_rstat_lock(ss));
	for_each_possible_cpu(cpu)
		raw_spin_lock_init(ss_rstat_cpu_lock(ss, cpu));

	return 0;
}

/*
 * Functions for cgroup basic resource statistics implemented on top of
 * rstat.
 */
static void cgroup_base_stat_add(struct cgroup_base_stat *dst_bstat,
				 struct cgroup_base_stat *src_bstat)
{
	dst_bstat->cputime.utime += src_bstat->cputime.utime;
	dst_bstat->cputime.stime += src_bstat->cputime.stime;
	dst_bstat->cputime.sum_exec_runtime += src_bstat->cputime.sum_exec_runtime;
#ifdef CONFIG_SCHED_CORE
	dst_bstat->forceidle_sum += src_bstat->forceidle_sum;
#endif
	dst_bstat->ntime += src_bstat->ntime;
}

static void cgroup_base_stat_sub(struct cgroup_base_stat *dst_bstat,
				 struct cgroup_base_stat *src_bstat)
{
	dst_bstat->cputime.utime -= src_bstat->cputime.utime;
	dst_bstat->cputime.stime -= src_bstat->cputime.stime;
	dst_bstat->cputime.sum_exec_runtime -= src_bstat->cputime.sum_exec_runtime;
#ifdef CONFIG_SCHED_CORE
	dst_bstat->forceidle_sum -= src_bstat->forceidle_sum;
#endif
	dst_bstat->ntime -= src_bstat->ntime;
}

static void cgroup_base_stat_flush(struct cgroup *cgrp, int cpu)
{
	struct cgroup_rstat_base_cpu *rstatbc = cgroup_rstat_base_cpu(cgrp, cpu);
	struct cgroup *parent = cgroup_parent(cgrp);
	struct cgroup_rstat_base_cpu *prstatbc;
	struct cgroup_base_stat delta;
	unsigned seq;

	/* Root-level stats are sourced from system-wide CPU stats */
	if (!parent)
		return;

	/* fetch the current per-cpu values */
	do {
		seq = __u64_stats_fetch_begin(&rstatbc->bsync);
		delta = rstatbc->bstat;
	} while (__u64_stats_fetch_retry(&rstatbc->bsync, seq));

	/* propagate per-cpu delta to cgroup and per-cpu global statistics */
	cgroup_base_stat_sub(&delta, &rstatbc->last_bstat);
	cgroup_base_stat_add(&cgrp->bstat, &delta);
	cgroup_base_stat_add(&rstatbc->last_bstat, &delta);
	cgroup_base_stat_add(&rstatbc->subtree_bstat, &delta);

	/* propagate cgroup and per-cpu global delta to parent (unless that's root) */
	if (cgroup_parent(parent)) {
		delta = cgrp->bstat;
		cgroup_base_stat_sub(&delta, &cgrp->last_bstat);
		cgroup_base_stat_add(&parent->bstat, &delta);
		cgroup_base_stat_add(&cgrp->last_bstat, &delta);

		delta = rstatbc->subtree_bstat;
		prstatbc = cgroup_rstat_base_cpu(parent, cpu);
		cgroup_base_stat_sub(&delta, &rstatbc->last_subtree_bstat);
		cgroup_base_stat_add(&prstatbc->subtree_bstat, &delta);
		cgroup_base_stat_add(&rstatbc->last_subtree_bstat, &delta);
	}
}

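/*
 * Open/close a per-cpu base-stat update section: _begin pins the current
 * CPU and starts a u64_stats write section on its ->bsync; _end closes the
 * section, marks cgroup::self updated for rstat and unpins the CPU.
 */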
static struct cgroup_rstat_base_cpu *
cgroup_base_stat_cputime_account_begin(struct cgroup *cgrp, unsigned long *flags)
{
	struct cgroup_rstat_base_cpu *rstatbc;

	rstatbc = get_cpu_ptr(cgrp->rstat_base_cpu);
	*flags = u64_stats_update_begin_irqsave(&rstatbc->bsync);
	return rstatbc;
}

static void cgroup_base_stat_cputime_account_end(struct cgroup *cgrp,
						 struct cgroup_rstat_base_cpu *rstatbc,
						 unsigned long flags)
{
	u64_stats_update_end_irqrestore(&rstatbc->bsync, flags);
	css_rstat_updated(&cgrp->self, smp_processor_id());
	put_cpu_ptr(rstatbc);
}

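/*
 * Entry points for charging cpu time to a cgroup's base stats:
 * __cgroup_account_cputime() charges raw execution time to sum_exec_runtime,
 * while __cgroup_account_cputime_field() charges the bucket selected by
 * @index (user/nice, system/irq/softirq, and force-idle with SCHED_CORE).
 */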
void __cgroup_account_cputime(struct cgroup *cgrp, u64 delta_exec)
{
	struct cgroup_rstat_base_cpu *rstatbc;
	unsigned long flags;

	rstatbc = cgroup_base_stat_cputime_account_begin(cgrp, &flags);
	rstatbc->bstat.cputime.sum_exec_runtime += delta_exec;
	cgroup_base_stat_cputime_account_end(cgrp, rstatbc, flags);
}

void __cgroup_account_cputime_field(struct cgroup *cgrp,
				    enum cpu_usage_stat index, u64 delta_exec)
{
	struct cgroup_rstat_base_cpu *rstatbc;
	unsigned long flags;

	rstatbc = cgroup_base_stat_cputime_account_begin(cgrp, &flags);

	switch (index) {
	case CPUTIME_NICE:
		rstatbc->bstat.ntime += delta_exec;
		fallthrough;
	case CPUTIME_USER:
		rstatbc->bstat.cputime.utime += delta_exec;
		break;
	case CPUTIME_SYSTEM:
	case CPUTIME_IRQ:
	case CPUTIME_SOFTIRQ:
		rstatbc->bstat.cputime.stime += delta_exec;
		break;
#ifdef CONFIG_SCHED_CORE
	case CPUTIME_FORCEIDLE:
		rstatbc->bstat.forceidle_sum += delta_exec;
		break;
#endif
	default:
		break;
	}

	cgroup_base_stat_cputime_account_end(cgrp, rstatbc, flags);
}

/*
 * compute the cputime for the root cgroup by getting the per cpu data
 * at a global level, then categorizing the fields in a manner consistent
 * with how it is done by __cgroup_account_cputime_field for each bit of
 * cpu time attributed to a cgroup.
 */
static void root_cgroup_cputime(struct cgroup_base_stat *bstat)
{
	struct task_cputime *cputime = &bstat->cputime;
	int i;

	memset(bstat, 0, sizeof(*bstat));
	for_each_possible_cpu(i) {
		struct kernel_cpustat kcpustat;
		u64 *cpustat = kcpustat.cpustat;
		u64 user = 0;
		u64 sys = 0;

		kcpustat_cpu_fetch(&kcpustat, i);

		user += cpustat[CPUTIME_USER];
		user += cpustat[CPUTIME_NICE];
		cputime->utime += user;

		sys += cpustat[CPUTIME_SYSTEM];
		sys += cpustat[CPUTIME_IRQ];
		sys += cpustat[CPUTIME_SOFTIRQ];
		cputime->stime += sys;

		cputime->sum_exec_runtime += user;
		cputime->sum_exec_runtime += sys;

#ifdef CONFIG_SCHED_CORE
		bstat->forceidle_sum += cpustat[CPUTIME_FORCEIDLE];
#endif
		bstat->ntime += cpustat[CPUTIME_NICE];
	}
}

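/*
 * Emit core_sched.force_idle_usec when CONFIG_SCHED_CORE is enabled;
 * otherwise the body is empty and nothing is printed.
 */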
static void cgroup_force_idle_show(struct seq_file *seq, struct cgroup_base_stat *bstat)
{
#ifdef CONFIG_SCHED_CORE
	u64 forceidle_time = bstat->forceidle_sum;

	do_div(forceidle_time, NSEC_PER_USEC);
	seq_printf(seq, "core_sched.force_idle_usec %llu\n", forceidle_time);
#endif
}

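/*
 * Produce the cpu.stat time fields: flush (and adjust) the cgroup's base
 * stats, or compute them from the system-wide counters for the root cgroup,
 * then report usage_usec, user_usec, system_usec and nice_usec.
 */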
void cgroup_base_stat_cputime_show(struct seq_file *seq)
{
	struct cgroup *cgrp = seq_css(seq)->cgroup;
	struct cgroup_base_stat bstat;

	if (cgroup_parent(cgrp)) {
		css_rstat_flush(&cgrp->self);
		__css_rstat_lock(&cgrp->self, -1);
		bstat = cgrp->bstat;
		cputime_adjust(&cgrp->bstat.cputime, &cgrp->prev_cputime,
			       &bstat.cputime.utime, &bstat.cputime.stime);
		__css_rstat_unlock(&cgrp->self, -1);
	} else {
		root_cgroup_cputime(&bstat);
	}

	do_div(bstat.cputime.sum_exec_runtime, NSEC_PER_USEC);
	do_div(bstat.cputime.utime, NSEC_PER_USEC);
	do_div(bstat.cputime.stime, NSEC_PER_USEC);
	do_div(bstat.ntime, NSEC_PER_USEC);

	seq_printf(seq, "usage_usec %llu\n"
			"user_usec %llu\n"
			"system_usec %llu\n"
			"nice_usec %llu\n",
		   bstat.cputime.sum_exec_runtime,
		   bstat.cputime.utime,
		   bstat.cputime.stime,
		   bstat.ntime);

	cgroup_force_idle_show(seq, &bstat);
}

/* Add bpf kfuncs for css_rstat_updated() and css_rstat_flush() */
BTF_KFUNCS_START(bpf_rstat_kfunc_ids)
BTF_ID_FLAGS(func, css_rstat_updated)
BTF_ID_FLAGS(func, css_rstat_flush, KF_SLEEPABLE)
BTF_KFUNCS_END(bpf_rstat_kfunc_ids)

static const struct btf_kfunc_id_set bpf_rstat_kfunc_set = {
	.owner = THIS_MODULE,
	.set = &bpf_rstat_kfunc_ids,
};

static int __init bpf_rstat_kfunc_init(void)
{
	return register_btf_kfunc_id_set(BPF_PROG_TYPE_TRACING,
					 &bpf_rstat_kfunc_set);
}
late_initcall(bpf_rstat_kfunc_init);