Commit | Line | Data |
---|---|---|
457c8996 | 1 | // SPDX-License-Identifier: GPL-2.0-only |
041cd640 TH |
2 | #include "cgroup-internal.h" |
3 | ||
4 | #include <linux/sched/cputime.h> | |
5 | ||
0fa294fb | 6 | static DEFINE_SPINLOCK(cgroup_rstat_lock); |
c58632b3 | 7 | static DEFINE_PER_CPU(raw_spinlock_t, cgroup_rstat_cpu_lock); |
041cd640 | 8 | |
a17556f8 TH |
9 | static void cgroup_base_stat_flush(struct cgroup *cgrp, int cpu); |
10 | ||
c58632b3 | 11 | static struct cgroup_rstat_cpu *cgroup_rstat_cpu(struct cgroup *cgrp, int cpu) |
041cd640 | 12 | { |
c58632b3 | 13 | return per_cpu_ptr(cgrp->rstat_cpu, cpu); |
041cd640 TH |
14 | } |
15 | ||
16 | /** | |
6162cef0 | 17 | * cgroup_rstat_updated - keep track of updated rstat_cpu |
041cd640 | 18 | * @cgrp: target cgroup |
c58632b3 | 19 | * @cpu: cpu on which rstat_cpu was updated |
041cd640 | 20 | * |
c58632b3 TH |
21 | * @cgrp's rstat_cpu on @cpu was updated. Put it on the parent's matching |
22 | * rstat_cpu->updated_children list. See the comment on top of | |
23 | * cgroup_rstat_cpu definition for details. | |
041cd640 | 24 | */ |
6162cef0 | 25 | void cgroup_rstat_updated(struct cgroup *cgrp, int cpu) |
041cd640 | 26 | { |
c58632b3 | 27 | raw_spinlock_t *cpu_lock = per_cpu_ptr(&cgroup_rstat_cpu_lock, cpu); |
041cd640 TH |
28 | struct cgroup *parent; |
29 | unsigned long flags; | |
30 | ||
c43c5ea7 TH |
31 | /* nothing to do for root */ |
32 | if (!cgroup_parent(cgrp)) | |
33 | return; | |
34 | ||
041cd640 | 35 | /* |
9a9e97b2 TH |
36 | * Paired with the one in cgroup_rstat_cpu_pop_updated(). Either we |
37 | * see NULL updated_next or they see our updated stat. | |
38 | */ | |
39 | smp_mb(); | |
40 | ||
41 | /* | |
041cd640 TH |
42 | * Because @parent's updated_children is terminated with @parent |
43 | * instead of NULL, we can tell whether @cgrp is on the list by | |
44 | * testing the next pointer for NULL. | |
45 | */ | |
c58632b3 | 46 | if (cgroup_rstat_cpu(cgrp, cpu)->updated_next) |
041cd640 TH |
47 | return; |
48 | ||
49 | raw_spin_lock_irqsave(cpu_lock, flags); | |
50 | ||
51 | /* put @cgrp and all ancestors on the corresponding updated lists */ | |
52 | for (parent = cgroup_parent(cgrp); parent; | |
53 | cgrp = parent, parent = cgroup_parent(cgrp)) { | |
c58632b3 TH |
54 | struct cgroup_rstat_cpu *rstatc = cgroup_rstat_cpu(cgrp, cpu); |
55 | struct cgroup_rstat_cpu *prstatc = cgroup_rstat_cpu(parent, cpu); | |
041cd640 TH |
56 | |
57 | /* | |
58 | * Both additions and removals are bottom-up. If a cgroup | |
59 | * is already in the tree, all ancestors are. | |
60 | */ | |
c58632b3 | 61 | if (rstatc->updated_next) |
041cd640 TH |
62 | break; |
63 | ||
c58632b3 TH |
64 | rstatc->updated_next = prstatc->updated_children; |
65 | prstatc->updated_children = cgrp; | |
041cd640 TH |
66 | } |
67 | ||
68 | raw_spin_unlock_irqrestore(cpu_lock, flags); | |
69 | } | |
6162cef0 | 70 | EXPORT_SYMBOL_GPL(cgroup_rstat_updated); |
041cd640 TH |
71 | |
72 | /** | |
c58632b3 | 73 | * cgroup_rstat_cpu_pop_updated - iterate and dismantle rstat_cpu updated tree |
041cd640 TH |
74 | * @pos: current position |
75 | * @root: root of the tree to traverse | |
76 | * @cpu: target cpu | |
77 | * | |
c58632b3 | 78 | * Walks the updated rstat_cpu tree on @cpu from @root. %NULL @pos starts |
041cd640 TH |
79 | * the traversal and %NULL return indicates the end. During traversal, |
80 | * each returned cgroup is unlinked from the tree. Must be called with the | |
c58632b3 | 81 | * matching cgroup_rstat_cpu_lock held. |
041cd640 TH |
82 | * |
83 | * The only ordering guarantee is that, for a parent and a child pair | |
84 | * covered by a given traversal, if a child is visited, its parent is | |
85 | * guaranteed to be visited afterwards. | |
86 | */ | |
c58632b3 TH |
87 | static struct cgroup *cgroup_rstat_cpu_pop_updated(struct cgroup *pos, |
88 | struct cgroup *root, int cpu) | |
041cd640 | 89 | { |
c58632b3 | 90 | struct cgroup_rstat_cpu *rstatc; |
041cd640 TH |
91 | |
92 | if (pos == root) | |
93 | return NULL; | |
94 | ||
95 | /* | |
96 | * We're gonna walk down to the first leaf and visit/remove it. We | |
97 | * can pick whatever unvisited node as the starting point. | |
98 | */ | |
99 | if (!pos) | |
100 | pos = root; | |
101 | else | |
102 | pos = cgroup_parent(pos); | |
103 | ||
104 | /* walk down to the first leaf */ | |
105 | while (true) { | |
c58632b3 TH |
106 | rstatc = cgroup_rstat_cpu(pos, cpu); |
107 | if (rstatc->updated_children == pos) | |
041cd640 | 108 | break; |
c58632b3 | 109 | pos = rstatc->updated_children; |
041cd640 TH |
110 | } |
111 | ||
112 | /* | |
113 | * Unlink @pos from the tree. As the updated_children list is | |
114 | * singly linked, we have to walk it to find the removal point. | |
115 | * However, due to the way we traverse, @pos will be the first | |
116 | * child in most cases. The only exception is @root. | |
117 | */ | |
b4ff1b44 TH |
118 | if (rstatc->updated_next) { |
119 | struct cgroup *parent = cgroup_parent(pos); | |
c58632b3 TH |
120 | struct cgroup_rstat_cpu *prstatc = cgroup_rstat_cpu(parent, cpu); |
121 | struct cgroup_rstat_cpu *nrstatc; | |
041cd640 TH |
122 | struct cgroup **nextp; |
123 | ||
c58632b3 | 124 | nextp = &prstatc->updated_children; |
041cd640 | 125 | while (true) { |
c58632b3 | 126 | nrstatc = cgroup_rstat_cpu(*nextp, cpu); |
041cd640 TH |
127 | if (*nextp == pos) |
128 | break; | |
129 | ||
130 | WARN_ON_ONCE(*nextp == parent); | |
c58632b3 | 131 | nextp = &nrstatc->updated_next; |
041cd640 TH |
132 | } |
133 | ||
c58632b3 TH |
134 | *nextp = rstatc->updated_next; |
135 | rstatc->updated_next = NULL; | |
9a9e97b2 TH |
136 | |
137 | /* | |
138 | * Paired with the one in cgroup_rstat_updated(). | |
139 | * Either they see NULL updated_next or we see their | |
140 | * updated stat. | |
141 | */ | |
142 | smp_mb(); | |
b4ff1b44 TH |
143 | |
144 | return pos; | |
041cd640 TH |
145 | } |
146 | ||
b4ff1b44 TH |
147 | /* only happens for @root */ |
148 | return NULL; | |
041cd640 TH |
149 | } |
150 | ||
a17556f8 | 151 | /* see cgroup_rstat_flush() */ |
0fa294fb TH |
152 | static void cgroup_rstat_flush_locked(struct cgroup *cgrp, bool may_sleep) |
153 | __releases(&cgroup_rstat_lock) __acquires(&cgroup_rstat_lock) | |
a17556f8 TH |
154 | { |
155 | int cpu; | |
156 | ||
0fa294fb | 157 | lockdep_assert_held(&cgroup_rstat_lock); |
a17556f8 TH |
158 | |
159 | for_each_possible_cpu(cpu) { | |
160 | raw_spinlock_t *cpu_lock = per_cpu_ptr(&cgroup_rstat_cpu_lock, | |
161 | cpu); | |
162 | struct cgroup *pos = NULL; | |
163 | ||
0fa294fb | 164 | raw_spin_lock(cpu_lock); |
8f53470b TH |
165 | while ((pos = cgroup_rstat_cpu_pop_updated(pos, cgrp, cpu))) { |
166 | struct cgroup_subsys_state *css; | |
167 | ||
a17556f8 | 168 | cgroup_base_stat_flush(pos, cpu); |
8f53470b TH |
169 | |
170 | rcu_read_lock(); | |
171 | list_for_each_entry_rcu(css, &pos->rstat_css_list, | |
172 | rstat_css_node) | |
173 | css->ss->css_rstat_flush(css, cpu); | |
174 | rcu_read_unlock(); | |
175 | } | |
0fa294fb TH |
176 | raw_spin_unlock(cpu_lock); |
177 | ||
178 | /* if @may_sleep, play nice and yield if necessary */ | |
179 | if (may_sleep && (need_resched() || | |
180 | spin_needbreak(&cgroup_rstat_lock))) { | |
181 | spin_unlock_irq(&cgroup_rstat_lock); | |
182 | if (!cond_resched()) | |
183 | cpu_relax(); | |
184 | spin_lock_irq(&cgroup_rstat_lock); | |
185 | } | |
a17556f8 TH |
186 | } |
187 | } | |
188 | ||
189 | /** | |
190 | * cgroup_rstat_flush - flush stats in @cgrp's subtree | |
191 | * @cgrp: target cgroup | |
192 | * | |
193 | * Collect all per-cpu stats in @cgrp's subtree into the global counters | |
194 | * and propagate them upwards. After this function returns, all cgroups in | |
195 | * the subtree have up-to-date ->stat. | |
196 | * | |
197 | * This also gets all cgroups in the subtree including @cgrp off the | |
198 | * ->updated_children lists. | |
0fa294fb TH |
199 | * |
200 | * This function may block. | |
a17556f8 TH |
201 | */ |
202 | void cgroup_rstat_flush(struct cgroup *cgrp) | |
203 | { | |
0fa294fb TH |
204 | might_sleep(); |
205 | ||
206 | spin_lock_irq(&cgroup_rstat_lock); | |
207 | cgroup_rstat_flush_locked(cgrp, true); | |
208 | spin_unlock_irq(&cgroup_rstat_lock); | |
209 | } | |
210 | ||
211 | /** | |
212 | * cgroup_rstat_flush_irqsafe - irqsafe version of cgroup_rstat_flush() | |
213 | * @cgrp: target cgroup | |
214 | * | |
215 | * This function can be called from any context. | |
216 | */ | |
217 | void cgroup_rstat_flush_irqsafe(struct cgroup *cgrp) | |
218 | { | |
219 | unsigned long flags; | |
220 | ||
221 | spin_lock_irqsave(&cgroup_rstat_lock, flags); | |
222 | cgroup_rstat_flush_locked(cgrp, false); | |
223 | spin_unlock_irqrestore(&cgroup_rstat_lock, flags); | |
a17556f8 TH |
224 | } |
225 | ||
6162cef0 TH |
226 | /** |
227 | * cgroup_rstat_flush_hold - flush stats in @cgrp's subtree and hold | |
228 | * @cgrp: target cgroup | |
229 | * | |
230 | * Flush stats in @cgrp's subtree and prevent further flushes. Must be | |
231 | * paired with cgroup_rstat_flush_release(). | |
0fa294fb TH |
232 | * |
233 | * This function may block. | |
6162cef0 TH |
234 | */ |
235 | void cgroup_rstat_flush_hold(struct cgroup *cgrp) | |
0fa294fb | 236 | __acquires(&cgroup_rstat_lock) |
6162cef0 | 237 | { |
0fa294fb TH |
238 | might_sleep(); |
239 | spin_lock_irq(&cgroup_rstat_lock); | |
240 | cgroup_rstat_flush_locked(cgrp, true); | |
6162cef0 TH |
241 | } |
242 | ||
243 | /** | |
244 | * cgroup_rstat_flush_release - release cgroup_rstat_flush_hold() | |
245 | */ | |
246 | void cgroup_rstat_flush_release(void) | |
0fa294fb | 247 | __releases(&cgroup_rstat_lock) |
6162cef0 | 248 | { |
0fa294fb | 249 | spin_unlock_irq(&cgroup_rstat_lock); |
6162cef0 TH |
250 | } |
251 | ||
a17556f8 TH |
252 | int cgroup_rstat_init(struct cgroup *cgrp) |
253 | { | |
254 | int cpu; | |
255 | ||
256 | /* the root cgrp has rstat_cpu preallocated */ | |
257 | if (!cgrp->rstat_cpu) { | |
258 | cgrp->rstat_cpu = alloc_percpu(struct cgroup_rstat_cpu); | |
259 | if (!cgrp->rstat_cpu) | |
260 | return -ENOMEM; | |
261 | } | |
262 | ||
263 | /* ->updated_children list is self terminated */ | |
264 | for_each_possible_cpu(cpu) { | |
265 | struct cgroup_rstat_cpu *rstatc = cgroup_rstat_cpu(cgrp, cpu); | |
266 | ||
267 | rstatc->updated_children = cgrp; | |
268 | u64_stats_init(&rstatc->bsync); | |
269 | } | |
270 | ||
271 | return 0; | |
272 | } | |
273 | ||
274 | void cgroup_rstat_exit(struct cgroup *cgrp) | |
275 | { | |
276 | int cpu; | |
277 | ||
278 | cgroup_rstat_flush(cgrp); | |
279 | ||
280 | /* sanity check */ | |
281 | for_each_possible_cpu(cpu) { | |
282 | struct cgroup_rstat_cpu *rstatc = cgroup_rstat_cpu(cgrp, cpu); | |
283 | ||
284 | if (WARN_ON_ONCE(rstatc->updated_children != cgrp) || | |
285 | WARN_ON_ONCE(rstatc->updated_next)) | |
286 | return; | |
287 | } | |
288 | ||
289 | free_percpu(cgrp->rstat_cpu); | |
290 | cgrp->rstat_cpu = NULL; | |
291 | } | |
292 | ||
293 | void __init cgroup_rstat_boot(void) | |
294 | { | |
295 | int cpu; | |
296 | ||
297 | for_each_possible_cpu(cpu) | |
298 | raw_spin_lock_init(per_cpu_ptr(&cgroup_rstat_cpu_lock, cpu)); | |
299 | ||
300 | BUG_ON(cgroup_rstat_init(&cgrp_dfl_root.cgrp)); | |
301 | } | |
302 | ||
303 | /* | |
304 | * Functions for cgroup basic resource statistics implemented on top of | |
305 | * rstat. | |
306 | */ | |
1bb5ec2e TH |
307 | static void cgroup_base_stat_add(struct cgroup_base_stat *dst_bstat, |
308 | struct cgroup_base_stat *src_bstat) | |
041cd640 | 309 | { |
d4ff749b TH |
310 | dst_bstat->cputime.utime += src_bstat->cputime.utime; |
311 | dst_bstat->cputime.stime += src_bstat->cputime.stime; | |
312 | dst_bstat->cputime.sum_exec_runtime += src_bstat->cputime.sum_exec_runtime; | |
041cd640 TH |
313 | } |
314 | ||
1bb5ec2e TH |
315 | static void cgroup_base_stat_sub(struct cgroup_base_stat *dst_bstat, |
316 | struct cgroup_base_stat *src_bstat) | |
317 | { | |
318 | dst_bstat->cputime.utime -= src_bstat->cputime.utime; | |
319 | dst_bstat->cputime.stime -= src_bstat->cputime.stime; | |
320 | dst_bstat->cputime.sum_exec_runtime -= src_bstat->cputime.sum_exec_runtime; | |
321 | } | |
322 | ||
d4ff749b | 323 | static void cgroup_base_stat_flush(struct cgroup *cgrp, int cpu) |
041cd640 TH |
324 | { |
325 | struct cgroup *parent = cgroup_parent(cgrp); | |
c58632b3 | 326 | struct cgroup_rstat_cpu *rstatc = cgroup_rstat_cpu(cgrp, cpu); |
1bb5ec2e | 327 | struct cgroup_base_stat cur, delta; |
041cd640 TH |
328 | unsigned seq; |
329 | ||
041cd640 TH |
330 | /* fetch the current per-cpu values */ |
331 | do { | |
d4ff749b | 332 | seq = __u64_stats_fetch_begin(&rstatc->bsync); |
1bb5ec2e | 333 | cur.cputime = rstatc->bstat.cputime; |
d4ff749b | 334 | } while (__u64_stats_fetch_retry(&rstatc->bsync, seq)); |
041cd640 | 335 | |
1bb5ec2e TH |
336 | /* propagate percpu delta to global */ |
337 | delta = cur; | |
338 | cgroup_base_stat_sub(&delta, &rstatc->last_bstat); | |
339 | cgroup_base_stat_add(&cgrp->bstat, &delta); | |
340 | cgroup_base_stat_add(&rstatc->last_bstat, &delta); | |
341 | ||
342 | /* propagate global delta to parent */ | |
343 | if (parent) { | |
344 | delta = cgrp->bstat; | |
345 | cgroup_base_stat_sub(&delta, &cgrp->last_bstat); | |
346 | cgroup_base_stat_add(&parent->bstat, &delta); | |
347 | cgroup_base_stat_add(&cgrp->last_bstat, &delta); | |
348 | } | |
041cd640 TH |
349 | } |
350 | ||
c58632b3 | 351 | static struct cgroup_rstat_cpu * |
d4ff749b | 352 | cgroup_base_stat_cputime_account_begin(struct cgroup *cgrp) |
041cd640 | 353 | { |
c58632b3 | 354 | struct cgroup_rstat_cpu *rstatc; |
041cd640 | 355 | |
c58632b3 | 356 | rstatc = get_cpu_ptr(cgrp->rstat_cpu); |
d4ff749b | 357 | u64_stats_update_begin(&rstatc->bsync); |
c58632b3 | 358 | return rstatc; |
041cd640 TH |
359 | } |
360 | ||
d4ff749b TH |
361 | static void cgroup_base_stat_cputime_account_end(struct cgroup *cgrp, |
362 | struct cgroup_rstat_cpu *rstatc) | |
041cd640 | 363 | { |
d4ff749b | 364 | u64_stats_update_end(&rstatc->bsync); |
6162cef0 | 365 | cgroup_rstat_updated(cgrp, smp_processor_id()); |
c58632b3 | 366 | put_cpu_ptr(rstatc); |
041cd640 TH |
367 | } |
368 | ||
369 | void __cgroup_account_cputime(struct cgroup *cgrp, u64 delta_exec) | |
370 | { | |
c58632b3 | 371 | struct cgroup_rstat_cpu *rstatc; |
041cd640 | 372 | |
d4ff749b TH |
373 | rstatc = cgroup_base_stat_cputime_account_begin(cgrp); |
374 | rstatc->bstat.cputime.sum_exec_runtime += delta_exec; | |
375 | cgroup_base_stat_cputime_account_end(cgrp, rstatc); | |
041cd640 TH |
376 | } |
377 | ||
378 | void __cgroup_account_cputime_field(struct cgroup *cgrp, | |
379 | enum cpu_usage_stat index, u64 delta_exec) | |
380 | { | |
c58632b3 | 381 | struct cgroup_rstat_cpu *rstatc; |
041cd640 | 382 | |
d4ff749b | 383 | rstatc = cgroup_base_stat_cputime_account_begin(cgrp); |
041cd640 TH |
384 | |
385 | switch (index) { | |
386 | case CPUTIME_USER: | |
387 | case CPUTIME_NICE: | |
d4ff749b | 388 | rstatc->bstat.cputime.utime += delta_exec; |
041cd640 TH |
389 | break; |
390 | case CPUTIME_SYSTEM: | |
391 | case CPUTIME_IRQ: | |
392 | case CPUTIME_SOFTIRQ: | |
d4ff749b | 393 | rstatc->bstat.cputime.stime += delta_exec; |
041cd640 TH |
394 | break; |
395 | default: | |
396 | break; | |
397 | } | |
398 | ||
d4ff749b | 399 | cgroup_base_stat_cputime_account_end(cgrp, rstatc); |
041cd640 TH |
400 | } |
401 | ||
d4ff749b | 402 | void cgroup_base_stat_cputime_show(struct seq_file *seq) |
041cd640 TH |
403 | { |
404 | struct cgroup *cgrp = seq_css(seq)->cgroup; | |
405 | u64 usage, utime, stime; | |
406 | ||
407 | if (!cgroup_parent(cgrp)) | |
408 | return; | |
409 | ||
6162cef0 | 410 | cgroup_rstat_flush_hold(cgrp); |
d4ff749b TH |
411 | usage = cgrp->bstat.cputime.sum_exec_runtime; |
412 | cputime_adjust(&cgrp->bstat.cputime, &cgrp->prev_cputime, &utime, &stime); | |
6162cef0 | 413 | cgroup_rstat_flush_release(); |
041cd640 TH |
414 | |
415 | do_div(usage, NSEC_PER_USEC); | |
416 | do_div(utime, NSEC_PER_USEC); | |
417 | do_div(stime, NSEC_PER_USEC); | |
418 | ||
d41bf8c9 TH |
419 | seq_printf(seq, "usage_usec %llu\n" |
420 | "user_usec %llu\n" | |
421 | "system_usec %llu\n", | |
422 | usage, utime, stime); | |
041cd640 | 423 | } |