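/*
 * Per-cpu cgroup statistics: CPU time is accounted on each CPU as tasks
 * run and lazily flushed and propagated up the cgroup hierarchy on read.
 */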
#include "cgroup-internal.h"

#include <linux/sched/cputime.h>
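
/*
 * cgroup_stat_mutex serializes flushing, while the per-cpu
 * cgroup_cpu_stat_lock protects that CPU's updated_children/updated_next
 * lists.
 */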
static DEFINE_MUTEX(cgroup_stat_mutex);
static DEFINE_PER_CPU(raw_spinlock_t, cgroup_cpu_stat_lock);

static struct cgroup_cpu_stat *cgroup_cpu_stat(struct cgroup *cgrp, int cpu)
{
	return per_cpu_ptr(cgrp->cpu_stat, cpu);
}
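
/*
 * Each CPU maintains its own "updated" tree mirroring the hierarchy: a
 * cgroup's updated_children points at its first updated child (at itself
 * when there is none) and siblings are chained through updated_next.
 * updated_next doubles as the on-list flag - it is NULL while off-list
 * and the sibling chain is terminated by the parent pointer instead of
 * NULL.
 */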

/**
 * cgroup_cpu_stat_updated - keep track of updated cpu_stat
 * @cgrp: target cgroup
 * @cpu: cpu on which cpu_stat was updated
 *
 * @cgrp's cpu_stat on @cpu was updated. Put it on the parent's matching
 * cpu_stat->updated_children list. See the comment on top of
 * cgroup_cpu_stat definition for details.
 */
static void cgroup_cpu_stat_updated(struct cgroup *cgrp, int cpu)
{
	raw_spinlock_t *cpu_lock = per_cpu_ptr(&cgroup_cpu_stat_lock, cpu);
	struct cgroup *parent;
	unsigned long flags;

	/*
	 * Speculative already-on-list test. This may race leading to
	 * temporary inaccuracies, which is fine.
	 *
	 * Because @parent's updated_children is terminated with @parent
	 * instead of NULL, we can tell whether @cgrp is on the list by
	 * testing the next pointer for NULL.
	 */
	if (cgroup_cpu_stat(cgrp, cpu)->updated_next)
		return;

	raw_spin_lock_irqsave(cpu_lock, flags);

	/* put @cgrp and all ancestors on the corresponding updated lists */
	for (parent = cgroup_parent(cgrp); parent;
	     cgrp = parent, parent = cgroup_parent(cgrp)) {
		struct cgroup_cpu_stat *cstat = cgroup_cpu_stat(cgrp, cpu);
		struct cgroup_cpu_stat *pcstat = cgroup_cpu_stat(parent, cpu);

		/*
		 * Both additions and removals are bottom-up. If a cgroup
		 * is already in the tree, all ancestors are.
		 */
		if (cstat->updated_next)
			break;

		cstat->updated_next = pcstat->updated_children;
		pcstat->updated_children = cgrp;
	}

	raw_spin_unlock_irqrestore(cpu_lock, flags);
}

/**
 * cgroup_cpu_stat_pop_updated - iterate and dismantle cpu_stat updated tree
 * @pos: current position
 * @root: root of the tree to traverse
 * @cpu: target cpu
 *
 * Walks the updated cpu_stat tree on @cpu from @root. %NULL @pos starts
 * the traversal and %NULL return indicates the end. During traversal,
 * each returned cgroup is unlinked from the tree. Must be called with the
 * matching cgroup_cpu_stat_lock held.
 *
 * The only ordering guarantee is that, for a parent and a child pair
 * covered by a given traversal, if a child is visited, its parent is
 * guaranteed to be visited afterwards.
 */
static struct cgroup *cgroup_cpu_stat_pop_updated(struct cgroup *pos,
						  struct cgroup *root, int cpu)
{
	struct cgroup_cpu_stat *cstat;
	struct cgroup *parent;

	if (pos == root)
		return NULL;

	/*
	 * We're going to walk down to the first leaf and visit/remove it.
	 * We can pick any unvisited node as the starting point.
	 */
	if (!pos)
		pos = root;
	else
		pos = cgroup_parent(pos);

	/* walk down to the first leaf */
	while (true) {
		cstat = cgroup_cpu_stat(pos, cpu);
		if (cstat->updated_children == pos)
			break;
		pos = cstat->updated_children;
	}

	/*
	 * Unlink @pos from the tree. As the updated_children list is
	 * singly linked, we have to walk it to find the removal point.
	 * However, due to the way we traverse, @pos will be the first
	 * child in most cases. The only exception is @root.
	 */
	parent = cgroup_parent(pos);
	if (parent && cstat->updated_next) {
		struct cgroup_cpu_stat *pcstat = cgroup_cpu_stat(parent, cpu);
		struct cgroup_cpu_stat *ncstat;
		struct cgroup **nextp;

		nextp = &pcstat->updated_children;
		while (true) {
			ncstat = cgroup_cpu_stat(*nextp, cpu);
			if (*nextp == pos)
				break;

			WARN_ON_ONCE(*nextp == parent);
			nextp = &ncstat->updated_next;
		}

		*nextp = cstat->updated_next;
		cstat->updated_next = NULL;
	}

	return pos;
}

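/* fold @src_stat's cputime counters into @dst_stat */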
static void cgroup_stat_accumulate(struct cgroup_stat *dst_stat,
				   struct cgroup_stat *src_stat)
{
	dst_stat->cputime.utime += src_stat->cputime.utime;
	dst_stat->cputime.stime += src_stat->cputime.stime;
	dst_stat->cputime.sum_exec_runtime += src_stat->cputime.sum_exec_runtime;
}

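/*
 * Snapshot @cgrp's per-cpu cputime on @cpu, turn it into a delta against
 * the last flushed values and propagate the delta to @cgrp's own counters
 * and to the parent's pending_stat.
 */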
static void cgroup_cpu_stat_flush_one(struct cgroup *cgrp, int cpu)
{
	struct cgroup *parent = cgroup_parent(cgrp);
	struct cgroup_cpu_stat *cstat = cgroup_cpu_stat(cgrp, cpu);
	struct task_cputime *last_cputime = &cstat->last_cputime;
	struct task_cputime cputime;
	struct cgroup_stat delta;
	unsigned seq;

	lockdep_assert_held(&cgroup_stat_mutex);

	/* fetch the current per-cpu values */
	do {
		seq = __u64_stats_fetch_begin(&cstat->sync);
		cputime = cstat->cputime;
	} while (__u64_stats_fetch_retry(&cstat->sync, seq));

	/* accumulate the deltas to propagate */
	delta.cputime.utime = cputime.utime - last_cputime->utime;
	delta.cputime.stime = cputime.stime - last_cputime->stime;
	delta.cputime.sum_exec_runtime = cputime.sum_exec_runtime -
					 last_cputime->sum_exec_runtime;
	*last_cputime = cputime;

	/* transfer the pending stat into delta */
	cgroup_stat_accumulate(&delta, &cgrp->pending_stat);
	memset(&cgrp->pending_stat, 0, sizeof(cgrp->pending_stat));

	/* propagate delta into the global stat and the parent's pending */
	cgroup_stat_accumulate(&cgrp->stat, &delta);
	if (parent)
		cgroup_stat_accumulate(&parent->pending_stat, &delta);
}

/* see cgroup_stat_flush() */
static void cgroup_stat_flush_locked(struct cgroup *cgrp)
{
	int cpu;

	lockdep_assert_held(&cgroup_stat_mutex);

	for_each_possible_cpu(cpu) {
		raw_spinlock_t *cpu_lock = per_cpu_ptr(&cgroup_cpu_stat_lock, cpu);
		struct cgroup *pos = NULL;

		raw_spin_lock_irq(cpu_lock);
		while ((pos = cgroup_cpu_stat_pop_updated(pos, cgrp, cpu)))
			cgroup_cpu_stat_flush_one(pos, cpu);
		raw_spin_unlock_irq(cpu_lock);
	}
}

/**
 * cgroup_stat_flush - flush stats in @cgrp's subtree
 * @cgrp: target cgroup
 *
 * Collect all per-cpu stats in @cgrp's subtree into the global counters
 * and propagate them upwards. After this function returns, all cgroups in
 * the subtree have up-to-date ->stat.
 *
 * This also gets all cgroups in the subtree including @cgrp off the
 * ->updated_children lists.
 */
void cgroup_stat_flush(struct cgroup *cgrp)
{
	mutex_lock(&cgroup_stat_mutex);
	cgroup_stat_flush_locked(cgrp);
	mutex_unlock(&cgroup_stat_mutex);
}
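
/*
 * A minimal usage sketch (hypothetical caller, for illustration only):
 * flush first, then read the now up-to-date counters, e.g.
 *
 *	cgroup_stat_flush(cgrp);
 *	usage = cgrp->stat.cputime.sum_exec_runtime;
 *
 * cgroup_stat_show_cputime() below follows this pattern while holding
 * cgroup_stat_mutex across both the flush and the reads.
 */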
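
/*
 * Pin the current CPU's cpu_stat of @cgrp and open a u64_stats write
 * section; pairs with cgroup_cpu_stat_account_end() below.
 */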
static struct cgroup_cpu_stat *cgroup_cpu_stat_account_begin(struct cgroup *cgrp)
{
	struct cgroup_cpu_stat *cstat;

	cstat = get_cpu_ptr(cgrp->cpu_stat);
	u64_stats_update_begin(&cstat->sync);
	return cstat;
}

static void cgroup_cpu_stat_account_end(struct cgroup *cgrp,
					struct cgroup_cpu_stat *cstat)
{
	u64_stats_update_end(&cstat->sync);
	cgroup_cpu_stat_updated(cgrp, smp_processor_id());
	put_cpu_ptr(cstat);
}

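/* charge @delta_exec ns of CPU time to @cgrp on the current CPU */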
void __cgroup_account_cputime(struct cgroup *cgrp, u64 delta_exec)
{
	struct cgroup_cpu_stat *cstat;

	cstat = cgroup_cpu_stat_account_begin(cgrp);
	cstat->cputime.sum_exec_runtime += delta_exec;
	cgroup_cpu_stat_account_end(cgrp, cstat);
}

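/*
 * Account @delta_exec to the bucket matching @index: user-type indexes
 * feed utime, system-type indexes feed stime, anything else is dropped.
 */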
void __cgroup_account_cputime_field(struct cgroup *cgrp,
				    enum cpu_usage_stat index, u64 delta_exec)
{
	struct cgroup_cpu_stat *cstat;

	cstat = cgroup_cpu_stat_account_begin(cgrp);

	switch (index) {
	case CPUTIME_USER:
	case CPUTIME_NICE:
		cstat->cputime.utime += delta_exec;
		break;
	case CPUTIME_SYSTEM:
	case CPUTIME_IRQ:
	case CPUTIME_SOFTIRQ:
		cstat->cputime.stime += delta_exec;
		break;
	default:
		break;
	}

	cgroup_cpu_stat_account_end(cgrp, cstat);
}

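/*
 * Print usage_usec, user_usec and system_usec for the cgroup behind @seq;
 * nothing is printed for the root cgroup.
 */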
void cgroup_stat_show_cputime(struct seq_file *seq)
{
	struct cgroup *cgrp = seq_css(seq)->cgroup;
	u64 usage, utime, stime;

	if (!cgroup_parent(cgrp))
		return;

	mutex_lock(&cgroup_stat_mutex);

	cgroup_stat_flush_locked(cgrp);

	usage = cgrp->stat.cputime.sum_exec_runtime;
	cputime_adjust(&cgrp->stat.cputime, &cgrp->stat.prev_cputime,
		       &utime, &stime);

	mutex_unlock(&cgroup_stat_mutex);

	do_div(usage, NSEC_PER_USEC);
	do_div(utime, NSEC_PER_USEC);
	do_div(stime, NSEC_PER_USEC);

	seq_printf(seq, "usage_usec %llu\n"
		   "user_usec %llu\n"
		   "system_usec %llu\n",
		   usage, utime, stime);
}

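/*
 * Set up @cgrp's per-cpu stats: allocate cpu_stat unless it is
 * preallocated (the root), self-terminate the updated_children lists and
 * initialize the u64_stats syncs.
 */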
int cgroup_stat_init(struct cgroup *cgrp)
{
	int cpu;

	/* the root cgrp has cpu_stat preallocated */
	if (!cgrp->cpu_stat) {
		cgrp->cpu_stat = alloc_percpu(struct cgroup_cpu_stat);
		if (!cgrp->cpu_stat)
			return -ENOMEM;
	}

	/* ->updated_children list is self terminated */
	for_each_possible_cpu(cpu) {
		struct cgroup_cpu_stat *cstat = cgroup_cpu_stat(cgrp, cpu);

		cstat->updated_children = cgrp;
		u64_stats_init(&cstat->sync);
	}

	prev_cputime_init(&cgrp->stat.prev_cputime);

	return 0;
}

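/*
 * Flush the remaining stats and free @cgrp's per-cpu state; bail out
 * without freeing if @cgrp is unexpectedly still linked into an updated
 * tree.
 */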
void cgroup_stat_exit(struct cgroup *cgrp)
{
	int cpu;

	cgroup_stat_flush(cgrp);

	/* sanity check */
	for_each_possible_cpu(cpu) {
		struct cgroup_cpu_stat *cstat = cgroup_cpu_stat(cgrp, cpu);

		if (WARN_ON_ONCE(cstat->updated_children != cgrp) ||
		    WARN_ON_ONCE(cstat->updated_next))
			return;
	}

	free_percpu(cgrp->cpu_stat);
	cgrp->cpu_stat = NULL;
}

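/* boot-time init: the per-cpu locks, then the default root's stats */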
void __init cgroup_stat_boot(void)
{
	int cpu;

	for_each_possible_cpu(cpu)
		raw_spin_lock_init(per_cpu_ptr(&cgroup_cpu_stat_lock, cpu));

	BUG_ON(cgroup_stat_init(&cgrp_dfl_root.cgrp));
}