| 1 | // SPDX-License-Identifier: GPL-2.0-only |
| 2 | #include "cgroup-internal.h" |
| 3 | |
| 4 | #include <linux/sched/cputime.h> |
| 5 | |
| 6 | #include <linux/bpf.h> |
| 7 | #include <linux/btf.h> |
| 8 | #include <linux/btf_ids.h> |
| 9 | |
| 10 | #include <trace/events/cgroup.h> |
| 11 | |
| 12 | static DEFINE_SPINLOCK(rstat_base_lock); |
| 13 | static DEFINE_PER_CPU(raw_spinlock_t, rstat_base_cpu_lock); |
| 14 | |
| 15 | static void cgroup_base_stat_flush(struct cgroup *cgrp, int cpu); |
| 16 | |
| 17 | /* |
| 18 | * Determines whether a given css can participate in rstat. |
| 19 | * css's that are cgroup::self use rstat for base stats. |
| 20 | * Other css's associated with a subsystem use rstat only when |
| 21 | * they define the ss->css_rstat_flush callback. |
| 22 | */ |
| 23 | static inline bool css_uses_rstat(struct cgroup_subsys_state *css) |
| 24 | { |
| 25 | return css_is_self(css) || css->ss->css_rstat_flush != NULL; |
| 26 | } |
| 27 | |
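|  | /* |
|  |  * Accessors for the per-cpu rstat bookkeeping: every css that participates |
|  |  * in rstat has one struct css_rstat_cpu per CPU, and every cgroup |
|  |  * additionally has one struct cgroup_rstat_base_cpu per CPU for the base |
|  |  * cputime statistics of cgroup::self. |
|  |  */ |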
| 28 | static struct css_rstat_cpu *css_rstat_cpu( |
| 29 | struct cgroup_subsys_state *css, int cpu) |
| 30 | { |
| 31 | return per_cpu_ptr(css->rstat_cpu, cpu); |
| 32 | } |
| 33 | |
| 34 | static struct cgroup_rstat_base_cpu *cgroup_rstat_base_cpu( |
| 35 | struct cgroup *cgrp, int cpu) |
| 36 | { |
| 37 | return per_cpu_ptr(cgrp->rstat_base_cpu, cpu); |
| 38 | } |
| 39 | |
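|  | /* |
|  |  * Return the subsystem-wide lock serializing flushes. A NULL @ss selects |
|  |  * the lock used for the base stats of cgroup::self. |
|  |  */ |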
| 40 | static spinlock_t *ss_rstat_lock(struct cgroup_subsys *ss) |
| 41 | { |
| 42 | if (ss) |
| 43 | return &ss->rstat_ss_lock; |
| 44 | |
| 45 | return &rstat_base_lock; |
| 46 | } |
| 47 | |
| 48 | static raw_spinlock_t *ss_rstat_cpu_lock(struct cgroup_subsys *ss, int cpu) |
| 49 | { |
| 50 | if (ss) { |
| 51 | /* |
| 52 | * Depending on config, the subsystem per-cpu lock type may be an |
| 53 | 	 * empty struct. In environments where this is the case, allocation |
| 54 | * of this field is not performed in ss_rstat_init(). Avoid a |
| 55 | * cpu-based offset relative to NULL by returning early. When the |
| 56 | * lock type is zero in size, the corresponding lock functions are |
| 57 | * no-ops so passing them NULL is acceptable. |
| 58 | */ |
| 59 | if (sizeof(*ss->rstat_ss_cpu_lock) == 0) |
| 60 | return NULL; |
| 61 | |
| 62 | return per_cpu_ptr(ss->rstat_ss_cpu_lock, cpu); |
| 63 | } |
| 64 | |
| 65 | return per_cpu_ptr(&rstat_base_cpu_lock, cpu); |
| 66 | } |
| 67 | |
| 68 | /* |
| 69 | * Helper functions for rstat per CPU locks. |
| 70 | * |
| 71 | * This makes it easier to diagnose locking issues and contention in |
| 72 |  * production environments. The parameter @fast_path determines which |
| 73 |  * tracepoints are emitted, allowing us to diagnose "flush" related |
| 74 | * operations without handling high-frequency fast-path "update" events. |
| 75 | */ |
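|  | /* |
|  |  * For example, contention on the flush-side per-cpu lock can be sampled |
|  |  * with any tracepoint consumer (illustrative invocation; event names are |
|  |  * the ones emitted via trace/events/cgroup.h): |
|  |  * |
|  |  *	perf record -e cgroup:cgroup_rstat_cpu_lock_contended -a -- sleep 10 |
|  |  */ |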
| 76 | static __always_inline |
| 77 | unsigned long _css_rstat_cpu_lock(struct cgroup_subsys_state *css, int cpu, |
| 78 | const bool fast_path) |
| 79 | { |
| 80 | struct cgroup *cgrp = css->cgroup; |
| 81 | raw_spinlock_t *cpu_lock; |
| 82 | unsigned long flags; |
| 83 | bool contended; |
| 84 | |
| 85 | /* |
| 86 | * The _irqsave() is needed because the locks used for flushing are |
| 87 | * spinlock_t which is a sleeping lock on PREEMPT_RT. Acquiring this lock |
| 88 | * with the _irq() suffix only disables interrupts on a non-PREEMPT_RT |
| 89 | * kernel. The raw_spinlock_t below disables interrupts on both |
| 90 | * configurations. The _irqsave() ensures that interrupts are always |
| 91 | * disabled and later restored. |
| 92 | */ |
| 93 | cpu_lock = ss_rstat_cpu_lock(css->ss, cpu); |
| 94 | contended = !raw_spin_trylock_irqsave(cpu_lock, flags); |
| 95 | if (contended) { |
| 96 | if (fast_path) |
| 97 | trace_cgroup_rstat_cpu_lock_contended_fastpath(cgrp, cpu, contended); |
| 98 | else |
| 99 | trace_cgroup_rstat_cpu_lock_contended(cgrp, cpu, contended); |
| 100 | |
| 101 | raw_spin_lock_irqsave(cpu_lock, flags); |
| 102 | } |
| 103 | |
| 104 | if (fast_path) |
| 105 | trace_cgroup_rstat_cpu_locked_fastpath(cgrp, cpu, contended); |
| 106 | else |
| 107 | trace_cgroup_rstat_cpu_locked(cgrp, cpu, contended); |
| 108 | |
| 109 | return flags; |
| 110 | } |
| 111 | |
| 112 | static __always_inline |
| 113 | void _css_rstat_cpu_unlock(struct cgroup_subsys_state *css, int cpu, |
| 114 | unsigned long flags, const bool fast_path) |
| 115 | { |
| 116 | struct cgroup *cgrp = css->cgroup; |
| 117 | raw_spinlock_t *cpu_lock; |
| 118 | |
| 119 | if (fast_path) |
| 120 | trace_cgroup_rstat_cpu_unlock_fastpath(cgrp, cpu, false); |
| 121 | else |
| 122 | trace_cgroup_rstat_cpu_unlock(cgrp, cpu, false); |
| 123 | |
| 124 | cpu_lock = ss_rstat_cpu_lock(css->ss, cpu); |
| 125 | raw_spin_unlock_irqrestore(cpu_lock, flags); |
| 126 | } |
| 127 | |
| 128 | /** |
| 129 | * css_rstat_updated - keep track of updated rstat_cpu |
| 130 | * @css: target cgroup subsystem state |
| 131 | * @cpu: cpu on which rstat_cpu was updated |
| 132 | * |
| 133 | * @css's rstat_cpu on @cpu was updated. Put it on the parent's matching |
| 134 | * rstat_cpu->updated_children list. See the comment on top of |
| 135 | * css_rstat_cpu definition for details. |
| 136 | */ |
| 137 | __bpf_kfunc void css_rstat_updated(struct cgroup_subsys_state *css, int cpu) |
| 138 | { |
| 139 | unsigned long flags; |
| 140 | |
| 141 | /* |
| 142 | * Since bpf programs can call this function, prevent access to |
| 143 | * uninitialized rstat pointers. |
| 144 | */ |
| 145 | if (!css_uses_rstat(css)) |
| 146 | return; |
| 147 | |
| 148 | /* |
| 149 | * Speculative already-on-list test. This may race leading to |
| 150 | * temporary inaccuracies, which is fine. |
| 151 | * |
| 152 | * Because @parent's updated_children is terminated with @parent |
| 153 | * instead of NULL, we can tell whether @css is on the list by |
| 154 | * testing the next pointer for NULL. |
| 155 | */ |
| 156 | if (data_race(css_rstat_cpu(css, cpu)->updated_next)) |
| 157 | return; |
| 158 | |
| 159 | flags = _css_rstat_cpu_lock(css, cpu, true); |
| 160 | |
| 161 | /* put @css and all ancestors on the corresponding updated lists */ |
| 162 | while (true) { |
| 163 | struct css_rstat_cpu *rstatc = css_rstat_cpu(css, cpu); |
| 164 | struct cgroup_subsys_state *parent = css->parent; |
| 165 | struct css_rstat_cpu *prstatc; |
| 166 | |
| 167 | /* |
| 168 | * Both additions and removals are bottom-up. If a cgroup |
| 169 | * is already in the tree, all ancestors are. |
| 170 | */ |
| 171 | if (rstatc->updated_next) |
| 172 | break; |
| 173 | |
| 174 | /* Root has no parent to link it to, but mark it busy */ |
| 175 | if (!parent) { |
| 176 | rstatc->updated_next = css; |
| 177 | break; |
| 178 | } |
| 179 | |
| 180 | prstatc = css_rstat_cpu(parent, cpu); |
| 181 | rstatc->updated_next = prstatc->updated_children; |
| 182 | prstatc->updated_children = css; |
| 183 | |
| 184 | css = parent; |
| 185 | } |
| 186 | |
| 187 | _css_rstat_cpu_unlock(css, cpu, flags, true); |
| 188 | } |
| 189 | |
| 190 | /** |
| 191 | * css_rstat_push_children - push children css's into the given list |
| 192 | * @head: current head of the list (= subtree root) |
| 193 | * @child: first child of the root |
| 194 | * @cpu: target cpu |
| 195 | * Return: A new singly linked list of css's to be flushed |
| 196 | * |
| 197 |  * Iteratively traverse down the css_rstat_cpu updated tree level by |
| 198 |  * level, pushing the css's of each level onto a singly linked list (via |
| 199 |  * the rstat_flush_next pointer) before their next-level children. The |
| 200 |  * list is built from the tail backward, like "pushing" css's onto a |
| 201 |  * stack. The root is pushed by the caller. |
| 202 | */ |
| 203 | static struct cgroup_subsys_state *css_rstat_push_children( |
| 204 | struct cgroup_subsys_state *head, |
| 205 | struct cgroup_subsys_state *child, int cpu) |
| 206 | { |
| 207 | struct cgroup_subsys_state *cnext = child; /* Next head of child css level */ |
| 208 | struct cgroup_subsys_state *ghead = NULL; /* Head of grandchild css level */ |
| 209 | struct cgroup_subsys_state *parent, *grandchild; |
| 210 | struct css_rstat_cpu *crstatc; |
| 211 | |
| 212 | child->rstat_flush_next = NULL; |
| 213 | |
| 214 | /* |
| 215 | * The subsystem rstat lock must be held for the whole duration from |
| 216 | * here as the rstat_flush_next list is being constructed to when |
| 217 | * it is consumed later in css_rstat_flush(). |
| 218 | */ |
| 219 | lockdep_assert_held(ss_rstat_lock(head->ss)); |
| 220 | |
| 221 | /* |
| 222 | * Notation: -> updated_next pointer |
| 223 | * => rstat_flush_next pointer |
| 224 | * |
| 225 | * Assuming the following sample updated_children lists: |
| 226 | * P: C1 -> C2 -> P |
| 227 | * C1: G11 -> G12 -> C1 |
| 228 | * C2: G21 -> G22 -> C2 |
| 229 | * |
| 230 | * After 1st iteration: |
| 231 | * head => C2 => C1 => NULL |
| 232 | * ghead => G21 => G11 => NULL |
| 233 | * |
| 234 | * After 2nd iteration: |
| 235 | * head => G12 => G11 => G22 => G21 => C2 => C1 => NULL |
| 236 | */ |
| 237 | next_level: |
| 238 | while (cnext) { |
| 239 | child = cnext; |
| 240 | cnext = child->rstat_flush_next; |
| 241 | parent = child->parent; |
| 242 | |
| 243 | 		/* updated_next is parent css terminated if !NULL */ |
| 244 | while (child != parent) { |
| 245 | child->rstat_flush_next = head; |
| 246 | head = child; |
| 247 | crstatc = css_rstat_cpu(child, cpu); |
| 248 | grandchild = crstatc->updated_children; |
| 249 | if (grandchild != child) { |
| 250 | 			/* Push the grandchild to the next level */ |
| 251 | crstatc->updated_children = child; |
| 252 | grandchild->rstat_flush_next = ghead; |
| 253 | ghead = grandchild; |
| 254 | } |
| 255 | child = crstatc->updated_next; |
| 256 | crstatc->updated_next = NULL; |
| 257 | } |
| 258 | } |
| 259 | |
| 260 | if (ghead) { |
| 261 | cnext = ghead; |
| 262 | ghead = NULL; |
| 263 | goto next_level; |
| 264 | } |
| 265 | return head; |
| 266 | } |
| 267 | |
| 268 | /** |
| 269 | * css_rstat_updated_list - build a list of updated css's to be flushed |
| 270 | * @root: root of the css subtree to traverse |
| 271 | * @cpu: target cpu |
| 272 | * Return: A singly linked list of css's to be flushed |
| 273 | * |
| 274 | * Walks the updated rstat_cpu tree on @cpu from @root. During traversal, |
| 275 | * each returned css is unlinked from the updated tree. |
| 276 | * |
| 277 | * The only ordering guarantee is that, for a parent and a child pair |
| 278 | * covered by a given traversal, the child is before its parent in |
| 279 | * the list. |
| 280 | * |
| 281 |  * Note that updated_children is self terminated and points to a list of |
| 282 |  * child css's if not empty, whereas updated_next is like a sibling link |
| 283 |  * within the children list and is terminated by the parent css. An |
| 284 |  * exception here is the css root, whose updated_next can be self terminated. |
| 285 | */ |
| 286 | static struct cgroup_subsys_state *css_rstat_updated_list( |
| 287 | struct cgroup_subsys_state *root, int cpu) |
| 288 | { |
| 289 | struct css_rstat_cpu *rstatc = css_rstat_cpu(root, cpu); |
| 290 | struct cgroup_subsys_state *head = NULL, *parent, *child; |
| 291 | unsigned long flags; |
| 292 | |
| 293 | flags = _css_rstat_cpu_lock(root, cpu, false); |
| 294 | |
| 295 | /* Return NULL if this subtree is not on-list */ |
| 296 | if (!rstatc->updated_next) |
| 297 | goto unlock_ret; |
| 298 | |
| 299 | /* |
| 300 | * Unlink @root from its parent. As the updated_children list is |
| 301 | * singly linked, we have to walk it to find the removal point. |
| 302 | */ |
| 303 | parent = root->parent; |
| 304 | if (parent) { |
| 305 | struct css_rstat_cpu *prstatc; |
| 306 | struct cgroup_subsys_state **nextp; |
| 307 | |
| 308 | prstatc = css_rstat_cpu(parent, cpu); |
| 309 | nextp = &prstatc->updated_children; |
| 310 | while (*nextp != root) { |
| 311 | struct css_rstat_cpu *nrstatc; |
| 312 | |
| 313 | nrstatc = css_rstat_cpu(*nextp, cpu); |
| 314 | WARN_ON_ONCE(*nextp == parent); |
| 315 | nextp = &nrstatc->updated_next; |
| 316 | } |
| 317 | *nextp = rstatc->updated_next; |
| 318 | } |
| 319 | |
| 320 | rstatc->updated_next = NULL; |
| 321 | |
| 322 | /* Push @root to the list first before pushing the children */ |
| 323 | head = root; |
| 324 | root->rstat_flush_next = NULL; |
| 325 | child = rstatc->updated_children; |
| 326 | rstatc->updated_children = root; |
| 327 | if (child != root) |
| 328 | head = css_rstat_push_children(head, child, cpu); |
| 329 | unlock_ret: |
| 330 | _css_rstat_cpu_unlock(root, cpu, flags, false); |
| 331 | return head; |
| 332 | } |
| 333 | |
| 334 | /* |
| 335 | * A hook for bpf stat collectors to attach to and flush their stats. |
| 336 | * Together with providing bpf kfuncs for css_rstat_updated() and |
| 337 | * css_rstat_flush(), this enables a complete workflow where bpf progs that |
| 338 | * collect cgroup stats can integrate with rstat for efficient flushing. |
| 339 | * |
| 340 | * A static noinline declaration here could cause the compiler to optimize away |
| 341 | * the function. A global noinline declaration will keep the definition, but may |
| 342 | * optimize away the callsite. Therefore, __weak is needed to ensure that the |
| 343 | * call is still emitted, by telling the compiler that we don't know what the |
| 344 | * function might eventually be. |
| 345 | */ |
| 346 | |
| 347 | __bpf_hook_start(); |
| 348 | |
| 349 | __weak noinline void bpf_rstat_flush(struct cgroup *cgrp, |
| 350 | struct cgroup *parent, int cpu) |
| 351 | { |
| 352 | } |
| 353 | |
| 354 | __bpf_hook_end(); |
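|  | /* |
|  |  * Illustrative sketch (not part of this file) of the bpf side; program and |
|  |  * function names are hypothetical, while the attach point and the kfuncs |
|  |  * are the ones declared in this file: |
|  |  * |
|  |  *	SEC("fentry/bpf_rstat_flush") |
|  |  *	int BPF_PROG(my_flusher, struct cgroup *cgrp, struct cgroup *parent, |
|  |  *		     int cpu) |
|  |  *	{ |
|  |  *		// fold this cgroup's per-cpu counters into its totals and, |
|  |  *		// if @parent is not NULL, propagate them to the parent |
|  |  *		return 0; |
|  |  *	} |
|  |  * |
|  |  * On the update side the collector calls the css_rstat_updated() kfunc |
|  |  * after bumping its per-cpu counters; a reader calls css_rstat_flush() |
|  |  * before reporting, so the fentry program runs for every updated cgroup. |
|  |  */ |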
| 355 | |
| 356 | /* |
| 357 | * Helper functions for locking. |
| 358 | * |
| 359 | * This makes it easier to diagnose locking issues and contention in |
| 360 |  * production environments. The parameter @cpu_in_loop indicates that the |
| 361 |  * lock is released and re-taken while collecting data from the CPUs. The |
| 362 |  * value -1 is used when obtaining the main lock; otherwise it is the CPU |
| 363 | * number processed last. |
| 364 | */ |
| 365 | static inline void __css_rstat_lock(struct cgroup_subsys_state *css, |
| 366 | int cpu_in_loop) |
| 367 | __acquires(ss_rstat_lock(css->ss)) |
| 368 | { |
| 369 | struct cgroup *cgrp = css->cgroup; |
| 370 | spinlock_t *lock; |
| 371 | bool contended; |
| 372 | |
| 373 | lock = ss_rstat_lock(css->ss); |
| 374 | contended = !spin_trylock_irq(lock); |
| 375 | if (contended) { |
| 376 | trace_cgroup_rstat_lock_contended(cgrp, cpu_in_loop, contended); |
| 377 | spin_lock_irq(lock); |
| 378 | } |
| 379 | trace_cgroup_rstat_locked(cgrp, cpu_in_loop, contended); |
| 380 | } |
| 381 | |
| 382 | static inline void __css_rstat_unlock(struct cgroup_subsys_state *css, |
| 383 | int cpu_in_loop) |
| 384 | __releases(ss_rstat_lock(css->ss)) |
| 385 | { |
| 386 | struct cgroup *cgrp = css->cgroup; |
| 387 | spinlock_t *lock; |
| 388 | |
| 389 | lock = ss_rstat_lock(css->ss); |
| 390 | trace_cgroup_rstat_unlock(cgrp, cpu_in_loop, false); |
| 391 | spin_unlock_irq(lock); |
| 392 | } |
| 393 | |
| 394 | /** |
| 395 | * css_rstat_flush - flush stats in @css's rstat subtree |
| 396 | * @css: target cgroup subsystem state |
| 397 | * |
| 398 | * Collect all per-cpu stats in @css's subtree into the global counters |
| 399 | * and propagate them upwards. After this function returns, all rstat |
| 400 | * nodes in the subtree have up-to-date ->stat. |
| 401 | * |
| 402 | * This also gets all rstat nodes in the subtree including @css off the |
| 403 | * ->updated_children lists. |
| 404 | * |
| 405 | * This function may block. |
| 406 | */ |
| 407 | __bpf_kfunc void css_rstat_flush(struct cgroup_subsys_state *css) |
| 408 | { |
| 409 | int cpu; |
| 410 | bool is_self = css_is_self(css); |
| 411 | |
| 412 | /* |
| 413 | * Since bpf programs can call this function, prevent access to |
| 414 | * uninitialized rstat pointers. |
| 415 | */ |
| 416 | if (!css_uses_rstat(css)) |
| 417 | return; |
| 418 | |
| 419 | might_sleep(); |
| 420 | for_each_possible_cpu(cpu) { |
| 421 | struct cgroup_subsys_state *pos; |
| 422 | |
| 423 | /* Reacquire for each CPU to avoid disabling IRQs too long */ |
| 424 | __css_rstat_lock(css, cpu); |
| 425 | pos = css_rstat_updated_list(css, cpu); |
| 426 | for (; pos; pos = pos->rstat_flush_next) { |
| 427 | if (is_self) { |
| 428 | cgroup_base_stat_flush(pos->cgroup, cpu); |
| 429 | bpf_rstat_flush(pos->cgroup, |
| 430 | cgroup_parent(pos->cgroup), cpu); |
| 431 | } else |
| 432 | pos->ss->css_rstat_flush(pos, cpu); |
| 433 | } |
| 434 | __css_rstat_unlock(css, cpu); |
| 435 | if (!cond_resched()) |
| 436 | cpu_relax(); |
| 437 | } |
| 438 | } |
| 439 | |
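|  | /* |
|  |  * Allocate and initialize the per-cpu rstat state of @css. For cgroup::self |
|  |  * this also covers the base-stat percpu area; subsystem css's that do not |
|  |  * implement css_rstat_flush() do not participate in rstat and return early. |
|  |  */ |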
| 440 | int css_rstat_init(struct cgroup_subsys_state *css) |
| 441 | { |
| 442 | struct cgroup *cgrp = css->cgroup; |
| 443 | int cpu; |
| 444 | bool is_self = css_is_self(css); |
| 445 | |
| 446 | if (is_self) { |
| 447 | /* the root cgrp has rstat_base_cpu preallocated */ |
| 448 | if (!cgrp->rstat_base_cpu) { |
| 449 | cgrp->rstat_base_cpu = alloc_percpu(struct cgroup_rstat_base_cpu); |
| 450 | if (!cgrp->rstat_base_cpu) |
| 451 | return -ENOMEM; |
| 452 | } |
| 453 | } else if (css->ss->css_rstat_flush == NULL) |
| 454 | return 0; |
| 455 | |
| 456 | /* the root cgrp's self css has rstat_cpu preallocated */ |
| 457 | if (!css->rstat_cpu) { |
| 458 | css->rstat_cpu = alloc_percpu(struct css_rstat_cpu); |
| 459 | if (!css->rstat_cpu) { |
| 460 | if (is_self) |
| 461 | free_percpu(cgrp->rstat_base_cpu); |
| 462 | |
| 463 | return -ENOMEM; |
| 464 | } |
| 465 | } |
| 466 | |
| 467 | /* ->updated_children list is self terminated */ |
| 468 | for_each_possible_cpu(cpu) { |
| 469 | struct css_rstat_cpu *rstatc = css_rstat_cpu(css, cpu); |
| 470 | |
| 471 | rstatc->updated_children = css; |
| 472 | |
| 473 | if (is_self) { |
| 474 | struct cgroup_rstat_base_cpu *rstatbc; |
| 475 | |
| 476 | rstatbc = cgroup_rstat_base_cpu(cgrp, cpu); |
| 477 | u64_stats_init(&rstatbc->bsync); |
| 478 | } |
| 479 | } |
| 480 | |
| 481 | return 0; |
| 482 | } |
| 483 | |
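|  | /* |
|  |  * Tear down the rstat state of @css. A final flush empties the updated |
|  |  * trees; the per-cpu sanity checks catch a css that is still linked, which |
|  |  * would leave dangling pointers once the percpu memory is freed. |
|  |  */ |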
| 484 | void css_rstat_exit(struct cgroup_subsys_state *css) |
| 485 | { |
| 486 | int cpu; |
| 487 | |
| 488 | if (!css_uses_rstat(css)) |
| 489 | return; |
| 490 | |
| 491 | css_rstat_flush(css); |
| 492 | |
| 493 | /* sanity check */ |
| 494 | for_each_possible_cpu(cpu) { |
| 495 | struct css_rstat_cpu *rstatc = css_rstat_cpu(css, cpu); |
| 496 | |
| 497 | if (WARN_ON_ONCE(rstatc->updated_children != css) || |
| 498 | WARN_ON_ONCE(rstatc->updated_next)) |
| 499 | return; |
| 500 | } |
| 501 | |
| 502 | if (css_is_self(css)) { |
| 503 | struct cgroup *cgrp = css->cgroup; |
| 504 | |
| 505 | free_percpu(cgrp->rstat_base_cpu); |
| 506 | cgrp->rstat_base_cpu = NULL; |
| 507 | } |
| 508 | |
| 509 | free_percpu(css->rstat_cpu); |
| 510 | css->rstat_cpu = NULL; |
| 511 | } |
| 512 | |
| 513 | /** |
| 514 | * ss_rstat_init - subsystem-specific rstat initialization |
| 515 | * @ss: target subsystem |
| 516 | * |
| 517 | * If @ss is NULL, the static locks associated with the base stats |
| 518 | * are initialized. If @ss is non-NULL, the subsystem-specific locks |
| 519 | * are initialized. |
| 520 | */ |
| 521 | int __init ss_rstat_init(struct cgroup_subsys *ss) |
| 522 | { |
| 523 | int cpu; |
| 524 | |
| 525 | /* |
| 526 | * Depending on config, the subsystem per-cpu lock type may be an empty |
| 527 | * struct. Avoid allocating a size of zero in this case. |
| 528 | */ |
| 529 | if (ss && sizeof(*ss->rstat_ss_cpu_lock)) { |
| 530 | ss->rstat_ss_cpu_lock = alloc_percpu(raw_spinlock_t); |
| 531 | if (!ss->rstat_ss_cpu_lock) |
| 532 | return -ENOMEM; |
| 533 | } |
| 534 | |
| 535 | spin_lock_init(ss_rstat_lock(ss)); |
| 536 | for_each_possible_cpu(cpu) |
| 537 | raw_spin_lock_init(ss_rstat_cpu_lock(ss, cpu)); |
| 538 | |
| 539 | return 0; |
| 540 | } |
| 541 | |
| 542 | /* |
| 543 | * Functions for cgroup basic resource statistics implemented on top of |
| 544 | * rstat. |
| 545 | */ |
| 546 | static void cgroup_base_stat_add(struct cgroup_base_stat *dst_bstat, |
| 547 | struct cgroup_base_stat *src_bstat) |
| 548 | { |
| 549 | dst_bstat->cputime.utime += src_bstat->cputime.utime; |
| 550 | dst_bstat->cputime.stime += src_bstat->cputime.stime; |
| 551 | dst_bstat->cputime.sum_exec_runtime += src_bstat->cputime.sum_exec_runtime; |
| 552 | #ifdef CONFIG_SCHED_CORE |
| 553 | dst_bstat->forceidle_sum += src_bstat->forceidle_sum; |
| 554 | #endif |
| 555 | dst_bstat->ntime += src_bstat->ntime; |
| 556 | } |
| 557 | |
| 558 | static void cgroup_base_stat_sub(struct cgroup_base_stat *dst_bstat, |
| 559 | struct cgroup_base_stat *src_bstat) |
| 560 | { |
| 561 | dst_bstat->cputime.utime -= src_bstat->cputime.utime; |
| 562 | dst_bstat->cputime.stime -= src_bstat->cputime.stime; |
| 563 | dst_bstat->cputime.sum_exec_runtime -= src_bstat->cputime.sum_exec_runtime; |
| 564 | #ifdef CONFIG_SCHED_CORE |
| 565 | dst_bstat->forceidle_sum -= src_bstat->forceidle_sum; |
| 566 | #endif |
| 567 | dst_bstat->ntime -= src_bstat->ntime; |
| 568 | } |
| 569 | |
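|  | /* |
|  |  * Fold the per-cpu base stats of @cgrp on @cpu into the cgroup's global |
|  |  * counters and propagate them one level up. Deltas are computed against |
|  |  * the last_* snapshots, so repeated flushes only transfer what has changed |
|  |  * since the previous flush. |
|  |  */ |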
| 570 | static void cgroup_base_stat_flush(struct cgroup *cgrp, int cpu) |
| 571 | { |
| 572 | struct cgroup_rstat_base_cpu *rstatbc = cgroup_rstat_base_cpu(cgrp, cpu); |
| 573 | struct cgroup *parent = cgroup_parent(cgrp); |
| 574 | struct cgroup_rstat_base_cpu *prstatbc; |
| 575 | struct cgroup_base_stat delta; |
| 576 | unsigned seq; |
| 577 | |
| 578 | /* Root-level stats are sourced from system-wide CPU stats */ |
| 579 | if (!parent) |
| 580 | return; |
| 581 | |
| 582 | /* fetch the current per-cpu values */ |
| 583 | do { |
| 584 | seq = __u64_stats_fetch_begin(&rstatbc->bsync); |
| 585 | delta = rstatbc->bstat; |
| 586 | } while (__u64_stats_fetch_retry(&rstatbc->bsync, seq)); |
| 587 | |
| 588 | /* propagate per-cpu delta to cgroup and per-cpu global statistics */ |
| 589 | cgroup_base_stat_sub(&delta, &rstatbc->last_bstat); |
| 590 | cgroup_base_stat_add(&cgrp->bstat, &delta); |
| 591 | cgroup_base_stat_add(&rstatbc->last_bstat, &delta); |
| 592 | cgroup_base_stat_add(&rstatbc->subtree_bstat, &delta); |
| 593 | |
| 594 | /* propagate cgroup and per-cpu global delta to parent (unless that's root) */ |
| 595 | if (cgroup_parent(parent)) { |
| 596 | delta = cgrp->bstat; |
| 597 | cgroup_base_stat_sub(&delta, &cgrp->last_bstat); |
| 598 | cgroup_base_stat_add(&parent->bstat, &delta); |
| 599 | cgroup_base_stat_add(&cgrp->last_bstat, &delta); |
| 600 | |
| 601 | delta = rstatbc->subtree_bstat; |
| 602 | prstatbc = cgroup_rstat_base_cpu(parent, cpu); |
| 603 | cgroup_base_stat_sub(&delta, &rstatbc->last_subtree_bstat); |
| 604 | cgroup_base_stat_add(&prstatbc->subtree_bstat, &delta); |
| 605 | cgroup_base_stat_add(&rstatbc->last_subtree_bstat, &delta); |
| 606 | } |
| 607 | } |
| 608 | |
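|  | /* |
|  |  * Bracket an update of the current CPU's base stats: pin the per-cpu data |
|  |  * and open a u64_stats write section so that 32-bit readers see consistent |
|  |  * values. The end side also marks cgroup::self as updated on this CPU. |
|  |  */ |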
| 609 | static struct cgroup_rstat_base_cpu * |
| 610 | cgroup_base_stat_cputime_account_begin(struct cgroup *cgrp, unsigned long *flags) |
| 611 | { |
| 612 | struct cgroup_rstat_base_cpu *rstatbc; |
| 613 | |
| 614 | rstatbc = get_cpu_ptr(cgrp->rstat_base_cpu); |
| 615 | *flags = u64_stats_update_begin_irqsave(&rstatbc->bsync); |
| 616 | return rstatbc; |
| 617 | } |
| 618 | |
| 619 | static void cgroup_base_stat_cputime_account_end(struct cgroup *cgrp, |
| 620 | struct cgroup_rstat_base_cpu *rstatbc, |
| 621 | unsigned long flags) |
| 622 | { |
| 623 | u64_stats_update_end_irqrestore(&rstatbc->bsync, flags); |
| 624 | css_rstat_updated(&cgrp->self, smp_processor_id()); |
| 625 | put_cpu_ptr(rstatbc); |
| 626 | } |
| 627 | |
| 628 | void __cgroup_account_cputime(struct cgroup *cgrp, u64 delta_exec) |
| 629 | { |
| 630 | struct cgroup_rstat_base_cpu *rstatbc; |
| 631 | unsigned long flags; |
| 632 | |
| 633 | rstatbc = cgroup_base_stat_cputime_account_begin(cgrp, &flags); |
| 634 | rstatbc->bstat.cputime.sum_exec_runtime += delta_exec; |
| 635 | cgroup_base_stat_cputime_account_end(cgrp, rstatbc, flags); |
| 636 | } |
| 637 | |
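|  | /* |
|  |  * Charge @delta_exec to the cputime category @index of @cgrp. Nice time is |
|  |  * accounted as both ntime and utime; irq and softirq time are folded into |
|  |  * stime. |
|  |  */ |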
| 638 | void __cgroup_account_cputime_field(struct cgroup *cgrp, |
| 639 | enum cpu_usage_stat index, u64 delta_exec) |
| 640 | { |
| 641 | struct cgroup_rstat_base_cpu *rstatbc; |
| 642 | unsigned long flags; |
| 643 | |
| 644 | rstatbc = cgroup_base_stat_cputime_account_begin(cgrp, &flags); |
| 645 | |
| 646 | switch (index) { |
| 647 | case CPUTIME_NICE: |
| 648 | rstatbc->bstat.ntime += delta_exec; |
| 649 | fallthrough; |
| 650 | case CPUTIME_USER: |
| 651 | rstatbc->bstat.cputime.utime += delta_exec; |
| 652 | break; |
| 653 | case CPUTIME_SYSTEM: |
| 654 | case CPUTIME_IRQ: |
| 655 | case CPUTIME_SOFTIRQ: |
| 656 | rstatbc->bstat.cputime.stime += delta_exec; |
| 657 | break; |
| 658 | #ifdef CONFIG_SCHED_CORE |
| 659 | case CPUTIME_FORCEIDLE: |
| 660 | rstatbc->bstat.forceidle_sum += delta_exec; |
| 661 | break; |
| 662 | #endif |
| 663 | default: |
| 664 | break; |
| 665 | } |
| 666 | |
| 667 | cgroup_base_stat_cputime_account_end(cgrp, rstatbc, flags); |
| 668 | } |
| 669 | |
| 670 | /* |
| 671 |  * Compute the cputime for the root cgroup by fetching the system-wide |
| 672 |  * per-cpu data, then categorizing the fields the same way |
| 673 |  * __cgroup_account_cputime_field() does for each bit of cpu time |
| 674 |  * attributed to a cgroup. |
| 675 | */ |
| 676 | static void root_cgroup_cputime(struct cgroup_base_stat *bstat) |
| 677 | { |
| 678 | struct task_cputime *cputime = &bstat->cputime; |
| 679 | int i; |
| 680 | |
| 681 | memset(bstat, 0, sizeof(*bstat)); |
| 682 | for_each_possible_cpu(i) { |
| 683 | struct kernel_cpustat kcpustat; |
| 684 | u64 *cpustat = kcpustat.cpustat; |
| 685 | u64 user = 0; |
| 686 | u64 sys = 0; |
| 687 | |
| 688 | kcpustat_cpu_fetch(&kcpustat, i); |
| 689 | |
| 690 | user += cpustat[CPUTIME_USER]; |
| 691 | user += cpustat[CPUTIME_NICE]; |
| 692 | cputime->utime += user; |
| 693 | |
| 694 | sys += cpustat[CPUTIME_SYSTEM]; |
| 695 | sys += cpustat[CPUTIME_IRQ]; |
| 696 | sys += cpustat[CPUTIME_SOFTIRQ]; |
| 697 | cputime->stime += sys; |
| 698 | |
| 699 | cputime->sum_exec_runtime += user; |
| 700 | cputime->sum_exec_runtime += sys; |
| 701 | |
| 702 | #ifdef CONFIG_SCHED_CORE |
| 703 | bstat->forceidle_sum += cpustat[CPUTIME_FORCEIDLE]; |
| 704 | #endif |
| 705 | bstat->ntime += cpustat[CPUTIME_NICE]; |
| 706 | } |
| 707 | } |
| 708 | |
| 710 | static void cgroup_force_idle_show(struct seq_file *seq, struct cgroup_base_stat *bstat) |
| 711 | { |
| 712 | #ifdef CONFIG_SCHED_CORE |
| 713 | u64 forceidle_time = bstat->forceidle_sum; |
| 714 | |
| 715 | do_div(forceidle_time, NSEC_PER_USEC); |
| 716 | seq_printf(seq, "core_sched.force_idle_usec %llu\n", forceidle_time); |
| 717 | #endif |
| 718 | } |
| 719 | |
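|  | /* |
|  |  * Emit the base cputime lines (usage_usec, user_usec, system_usec, |
|  |  * nice_usec and, with CONFIG_SCHED_CORE, core_sched.force_idle_usec). |
|  |  * Non-root cgroups are flushed and read under the rstat lock; the root |
|  |  * reports system-wide kcpustat totals instead. Values are in microseconds. |
|  |  */ |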
| 720 | void cgroup_base_stat_cputime_show(struct seq_file *seq) |
| 721 | { |
| 722 | struct cgroup *cgrp = seq_css(seq)->cgroup; |
| 723 | struct cgroup_base_stat bstat; |
| 724 | |
| 725 | if (cgroup_parent(cgrp)) { |
| 726 | css_rstat_flush(&cgrp->self); |
| 727 | __css_rstat_lock(&cgrp->self, -1); |
| 728 | bstat = cgrp->bstat; |
| 729 | cputime_adjust(&cgrp->bstat.cputime, &cgrp->prev_cputime, |
| 730 | &bstat.cputime.utime, &bstat.cputime.stime); |
| 731 | __css_rstat_unlock(&cgrp->self, -1); |
| 732 | } else { |
| 733 | root_cgroup_cputime(&bstat); |
| 734 | } |
| 735 | |
| 736 | do_div(bstat.cputime.sum_exec_runtime, NSEC_PER_USEC); |
| 737 | do_div(bstat.cputime.utime, NSEC_PER_USEC); |
| 738 | do_div(bstat.cputime.stime, NSEC_PER_USEC); |
| 739 | do_div(bstat.ntime, NSEC_PER_USEC); |
| 740 | |
| 741 | seq_printf(seq, "usage_usec %llu\n" |
| 742 | "user_usec %llu\n" |
| 743 | "system_usec %llu\n" |
| 744 | "nice_usec %llu\n", |
| 745 | bstat.cputime.sum_exec_runtime, |
| 746 | bstat.cputime.utime, |
| 747 | bstat.cputime.stime, |
| 748 | bstat.ntime); |
| 749 | |
| 750 | cgroup_force_idle_show(seq, &bstat); |
| 751 | } |
| 752 | |
| 753 | /* Add bpf kfuncs for css_rstat_updated() and css_rstat_flush() */ |
| 754 | BTF_KFUNCS_START(bpf_rstat_kfunc_ids) |
| 755 | BTF_ID_FLAGS(func, css_rstat_updated) |
| 756 | BTF_ID_FLAGS(func, css_rstat_flush, KF_SLEEPABLE) |
| 757 | BTF_KFUNCS_END(bpf_rstat_kfunc_ids) |
| 758 | |
| 759 | static const struct btf_kfunc_id_set bpf_rstat_kfunc_set = { |
| 760 | .owner = THIS_MODULE, |
| 761 | .set = &bpf_rstat_kfunc_ids, |
| 762 | }; |
| 763 | |
| 764 | static int __init bpf_rstat_kfunc_init(void) |
| 765 | { |
| 766 | return register_btf_kfunc_id_set(BPF_PROG_TYPE_TRACING, |
| 767 | &bpf_rstat_kfunc_set); |
| 768 | } |
| 769 | late_initcall(bpf_rstat_kfunc_init); |