#include <linux/export.h>
#include <linux/sched.h>
#include <linux/tsacct_kern.h>
#include <linux/kernel_stat.h>
#include <linux/static_key.h>
#include "sched.h"


#ifdef CONFIG_IRQ_TIME_ACCOUNTING

/*
 * There are no locks covering percpu hardirq/softirq time.
 * They are only modified in vtime_account, on the corresponding CPU
 * with interrupts disabled. So, writes are safe.
 * They are read and saved off onto struct rq in update_rq_clock().
 * This may result in another CPU reading this CPU's irq time and can
 * race with irq/vtime_account on this CPU. We would either get the old
 * or the new value, with a side effect of accounting a slice of irq
 * time to the wrong task when an irq is in progress while we read
 * rq->clock. That is a worthy compromise in place of having locks on
 * each irq in account_system_time.
 */
DEFINE_PER_CPU(u64, cpu_hardirq_time);
DEFINE_PER_CPU(u64, cpu_softirq_time);

static DEFINE_PER_CPU(u64, irq_start_time);
static int sched_clock_irqtime;

void enable_sched_clock_irqtime(void)
{
        sched_clock_irqtime = 1;
}

void disable_sched_clock_irqtime(void)
{
        sched_clock_irqtime = 0;
}

#ifndef CONFIG_64BIT
DEFINE_PER_CPU(seqcount_t, irq_time_seq);
#endif /* CONFIG_64BIT */
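
/*
 * For reference, a minimal sketch of how a 32-bit reader pairs with
 * this seqcount (the irq_time_write_begin()/irq_time_write_end() and
 * read-side helpers live in sched.h; the shape below is an
 * illustration, not a verbatim copy):
 *
 *	static u64 irq_time_read(int cpu)
 *	{
 *		u64 irq_time;
 *		unsigned seq;
 *
 *		do {
 *			seq = read_seqcount_begin(&per_cpu(irq_time_seq, cpu));
 *			irq_time = per_cpu(cpu_softirq_time, cpu) +
 *				   per_cpu(cpu_hardirq_time, cpu);
 *		} while (read_seqcount_retry(&per_cpu(irq_time_seq, cpu), seq));
 *
 *		return irq_time;
 *	}
 *
 * On 64-bit, u64 loads and stores are atomic, so the seqcount and the
 * retry loop are compiled out entirely.
 */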

/*
 * Called before incrementing preempt_count on {soft,}irq_enter
 * and before decrementing preempt_count on {soft,}irq_exit.
 */
void irqtime_account_irq(struct task_struct *curr)
{
        unsigned long flags;
        s64 delta;
        int cpu;

        if (!sched_clock_irqtime)
                return;

        local_irq_save(flags);

        cpu = smp_processor_id();
        delta = sched_clock_cpu(cpu) - __this_cpu_read(irq_start_time);
        __this_cpu_add(irq_start_time, delta);

        irq_time_write_begin();
        /*
         * We do not account for softirq time from ksoftirqd here.
         * We want to continue accounting softirq time to the ksoftirqd
         * thread in that case, so as not to confuse the scheduler with
         * a special task that does not consume any time, but still
         * wants to run.
         */
        if (hardirq_count())
                __this_cpu_add(cpu_hardirq_time, delta);
        else if (in_serving_softirq() && curr != this_cpu_ksoftirqd())
                __this_cpu_add(cpu_softirq_time, delta);

        irq_time_write_end();
        local_irq_restore(flags);
}
EXPORT_SYMBOL_GPL(irqtime_account_irq);

static int irqtime_account_hi_update(void)
{
        u64 *cpustat = kcpustat_this_cpu->cpustat;
        unsigned long flags;
        u64 latest_ns;
        int ret = 0;

        local_irq_save(flags);
        latest_ns = this_cpu_read(cpu_hardirq_time);
        if (nsecs_to_cputime64(latest_ns) > cpustat[CPUTIME_IRQ])
                ret = 1;
        local_irq_restore(flags);
        return ret;
}

static int irqtime_account_si_update(void)
{
        u64 *cpustat = kcpustat_this_cpu->cpustat;
        unsigned long flags;
        u64 latest_ns;
        int ret = 0;

        local_irq_save(flags);
        latest_ns = this_cpu_read(cpu_softirq_time);
        if (nsecs_to_cputime64(latest_ns) > cpustat[CPUTIME_SOFTIRQ])
                ret = 1;
        local_irq_restore(flags);
        return ret;
}

#else /* CONFIG_IRQ_TIME_ACCOUNTING */

#define sched_clock_irqtime (0)

#endif /* !CONFIG_IRQ_TIME_ACCOUNTING */

static inline void task_group_account_field(struct task_struct *p, int index,
                                            u64 tmp)
{
#ifdef CONFIG_CGROUP_CPUACCT
        struct kernel_cpustat *kcpustat;
        struct cpuacct *ca;
#endif
        /*
         * Since all updates are sure to touch the root cgroup, we
         * get ourselves ahead and touch it first. If the root cgroup
         * is the only cgroup, then nothing else should be necessary.
         */
        __get_cpu_var(kernel_cpustat).cpustat[index] += tmp;

#ifdef CONFIG_CGROUP_CPUACCT
        if (unlikely(!cpuacct_subsys.active))
                return;

        rcu_read_lock();
        ca = task_ca(p);
        while (ca && (ca != &root_cpuacct)) {
                kcpustat = this_cpu_ptr(ca->cpustat);
                kcpustat->cpustat[index] += tmp;
                ca = parent_ca(ca);
        }
        rcu_read_unlock();
#endif
}
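
/*
 * Both account_user_time() and __account_system_time() below funnel
 * their cpustat updates through task_group_account_field(), so each
 * charged slice lands in the root stats once (above) and then in every
 * cpuacct ancestor of the task short of the root.
 */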

/*
 * Account user cpu time to a process.
 * @p: the process that the cpu time gets accounted to
 * @cputime: the cpu time spent in user space since the last update
 * @cputime_scaled: cputime scaled by cpu frequency
 */
void account_user_time(struct task_struct *p, cputime_t cputime,
                       cputime_t cputime_scaled)
{
        int index;

        /* Add user time to process. */
        p->utime += cputime;
        p->utimescaled += cputime_scaled;
        account_group_user_time(p, cputime);

        index = (TASK_NICE(p) > 0) ? CPUTIME_NICE : CPUTIME_USER;

        /* Add user time to cpustat. */
        task_group_account_field(p, index, (__force u64) cputime);

        /* Account for user time used */
        acct_update_integrals(p);
}

/*
 * Account guest cpu time to a process.
 * @p: the process that the cpu time gets accounted to
 * @cputime: the cpu time spent in virtual machine since the last update
 * @cputime_scaled: cputime scaled by cpu frequency
 */
static void account_guest_time(struct task_struct *p, cputime_t cputime,
                               cputime_t cputime_scaled)
{
        u64 *cpustat = kcpustat_this_cpu->cpustat;

        /* Add guest time to process. */
        p->utime += cputime;
        p->utimescaled += cputime_scaled;
        account_group_user_time(p, cputime);
        p->gtime += cputime;

        /* Add guest time to cpustat. */
        if (TASK_NICE(p) > 0) {
                cpustat[CPUTIME_NICE] += (__force u64) cputime;
                cpustat[CPUTIME_GUEST_NICE] += (__force u64) cputime;
        } else {
                cpustat[CPUTIME_USER] += (__force u64) cputime;
                cpustat[CPUTIME_GUEST] += (__force u64) cputime;
        }
}
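
/*
 * Note that account_guest_time() charges each slice to both the user
 * and the guest cpustat buckets: CPUTIME_USER/CPUTIME_NICE stay a
 * superset, and consumers derive pure user time by subtracting the
 * guest fields.
 */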

/*
 * Account system cpu time to a process and desired cpustat field
 * @p: the process that the cpu time gets accounted to
 * @cputime: the cpu time spent in kernel space since the last update
 * @cputime_scaled: cputime scaled by cpu frequency
 * @index: index of the cpustat field that has to be updated
 */
static inline
void __account_system_time(struct task_struct *p, cputime_t cputime,
                           cputime_t cputime_scaled, int index)
{
        /* Add system time to process. */
        p->stime += cputime;
        p->stimescaled += cputime_scaled;
        account_group_system_time(p, cputime);

        /* Add system time to cpustat. */
        task_group_account_field(p, index, (__force u64) cputime);

        /* Account for system time used */
        acct_update_integrals(p);
}

/*
 * Account system cpu time to a process.
 * @p: the process that the cpu time gets accounted to
 * @hardirq_offset: the offset to subtract from hardirq_count()
 * @cputime: the cpu time spent in kernel space since the last update
 * @cputime_scaled: cputime scaled by cpu frequency
 */
void account_system_time(struct task_struct *p, int hardirq_offset,
                         cputime_t cputime, cputime_t cputime_scaled)
{
        int index;

        if ((p->flags & PF_VCPU) && (irq_count() - hardirq_offset == 0)) {
                account_guest_time(p, cputime, cputime_scaled);
                return;
        }

        if (hardirq_count() - hardirq_offset)
                index = CPUTIME_IRQ;
        else if (in_serving_softirq())
                index = CPUTIME_SOFTIRQ;
        else
                index = CPUTIME_SYSTEM;

        __account_system_time(p, cputime, cputime_scaled, index);
}

/*
 * Account for involuntary wait time.
 * @cputime: the cpu time spent in involuntary wait
 */
void account_steal_time(cputime_t cputime)
{
        u64 *cpustat = kcpustat_this_cpu->cpustat;

        cpustat[CPUTIME_STEAL] += (__force u64) cputime;
}

/*
 * Account for idle time.
 * @cputime: the cpu time spent in idle wait
 */
void account_idle_time(cputime_t cputime)
{
        u64 *cpustat = kcpustat_this_cpu->cpustat;
        struct rq *rq = this_rq();

        if (atomic_read(&rq->nr_iowait) > 0)
                cpustat[CPUTIME_IOWAIT] += (__force u64) cputime;
        else
                cpustat[CPUTIME_IDLE] += (__force u64) cputime;
}
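
/*
 * The iowait split above keys off this runqueue's nr_iowait counter:
 * the idle slice goes to CPUTIME_IOWAIT whenever a task that went to
 * sleep on this CPU is blocked on I/O. Since such a task may wake up
 * on another CPU, per-cpu iowait is inherently approximate.
 */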

static __always_inline bool steal_account_process_tick(void)
{
#ifdef CONFIG_PARAVIRT
        if (static_key_false(&paravirt_steal_enabled)) {
                u64 steal, st = 0;

                steal = paravirt_steal_clock(smp_processor_id());
                steal -= this_rq()->prev_steal_time;

                st = steal_ticks(steal);
                this_rq()->prev_steal_time += st * TICK_NSEC;

                account_steal_time(st);
                return st;
        }
#endif
        return false;
}
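
/*
 * steal_ticks() converts stolen nanoseconds into full ticks; a rough
 * sketch of the helper (the real one lives in sched.h, so treat this
 * as an illustration rather than its verbatim body):
 *
 *	static inline u64 steal_ticks(u64 steal)
 *	{
 *		if (unlikely(steal > NSEC_PER_SEC))
 *			return div_u64(steal, TICK_NSEC);
 *
 *		return __iter_div_u64_rem(steal, TICK_NSEC, &steal);
 *	}
 *
 * Only whole ticks are accounted; the sub-tick remainder stays in the
 * delta against prev_steal_time and is picked up on a later tick.
 */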

/*
 * Accumulate raw cputime values of dead tasks (sig->[us]time) and live
 * tasks (sum on group iteration) belonging to @tsk's group.
 */
void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times)
{
        struct signal_struct *sig = tsk->signal;
        struct task_struct *t;

        times->utime = sig->utime;
        times->stime = sig->stime;
        times->sum_exec_runtime = sig->sum_sched_runtime;

        rcu_read_lock();
        /* make sure we can trust tsk->thread_group list */
        if (!likely(pid_alive(tsk)))
                goto out;

        t = tsk;
        do {
                times->utime += t->utime;
                times->stime += t->stime;
                times->sum_exec_runtime += task_sched_runtime(t);
        } while_each_thread(tsk, t);
out:
        rcu_read_unlock();
}

#ifndef CONFIG_VIRT_CPU_ACCOUNTING

#ifdef CONFIG_IRQ_TIME_ACCOUNTING
/*
 * Account a tick to a process and cpustat
 * @p: the process that the cpu time gets accounted to
 * @user_tick: is the tick from userspace
 * @rq: the pointer to rq
 *
 * Tick demultiplexing follows the order
 * - pending hardirq update
 * - pending softirq update
 * - user_time
 * - idle_time
 * - system time
 *   - check for guest_time
 *   - else account as system_time
 *
 * The check for hardirq is done for both system and user time, as
 * there is no timer going off while we are in a hardirq, and hence we
 * may never get an opportunity to update it solely in system time.
 * p->stime and friends are only updated on system time and not on
 * irq/softirq time, as those no longer count toward the task's
 * exec_runtime.
 */
static void irqtime_account_process_tick(struct task_struct *p, int user_tick,
                                         struct rq *rq)
{
        cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy);
        u64 *cpustat = kcpustat_this_cpu->cpustat;

        if (steal_account_process_tick())
                return;

        if (irqtime_account_hi_update()) {
                cpustat[CPUTIME_IRQ] += (__force u64) cputime_one_jiffy;
        } else if (irqtime_account_si_update()) {
                cpustat[CPUTIME_SOFTIRQ] += (__force u64) cputime_one_jiffy;
        } else if (this_cpu_ksoftirqd() == p) {
                /*
                 * ksoftirqd time does not get accounted in
                 * cpu_softirq_time, so we have to handle it separately
                 * here. Also, p->stime needs to be updated for
                 * ksoftirqd.
                 */
                __account_system_time(p, cputime_one_jiffy, one_jiffy_scaled,
                                      CPUTIME_SOFTIRQ);
        } else if (user_tick) {
                account_user_time(p, cputime_one_jiffy, one_jiffy_scaled);
        } else if (p == rq->idle) {
                account_idle_time(cputime_one_jiffy);
        } else if (p->flags & PF_VCPU) { /* System time or guest time */
                account_guest_time(p, cputime_one_jiffy, one_jiffy_scaled);
        } else {
                __account_system_time(p, cputime_one_jiffy, one_jiffy_scaled,
                                      CPUTIME_SYSTEM);
        }
}

static void irqtime_account_idle_ticks(int ticks)
{
        int i;
        struct rq *rq = this_rq();

        for (i = 0; i < ticks; i++)
                irqtime_account_process_tick(current, 0, rq);
}
#else /* CONFIG_IRQ_TIME_ACCOUNTING */
static void irqtime_account_idle_ticks(int ticks) {}
static void irqtime_account_process_tick(struct task_struct *p, int user_tick,
                                         struct rq *rq) {}
#endif /* CONFIG_IRQ_TIME_ACCOUNTING */

/*
 * Account a single tick of cpu time.
 * @p: the process that the cpu time gets accounted to
 * @user_tick: indicates if the tick is a user or a system tick
 */
void account_process_tick(struct task_struct *p, int user_tick)
{
        cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy);
        struct rq *rq = this_rq();

        if (sched_clock_irqtime) {
                irqtime_account_process_tick(p, user_tick, rq);
                return;
        }

        if (steal_account_process_tick())
                return;

        if (user_tick)
                account_user_time(p, cputime_one_jiffy, one_jiffy_scaled);
        else if ((p != rq->idle) || (irq_count() != HARDIRQ_OFFSET))
                account_system_time(p, HARDIRQ_OFFSET, cputime_one_jiffy,
                                    one_jiffy_scaled);
        else
                account_idle_time(cputime_one_jiffy);
}
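
/*
 * account_process_tick() is the periodic-tick entry point: it is
 * invoked from update_process_times(), with user_tick telling us
 * whether the CPU was in user or kernel mode when the tick interrupt
 * arrived. HARDIRQ_OFFSET is passed so that the tick interrupt itself
 * is not misclassified as irq time.
 */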

/*
 * Account multiple ticks of steal time.
 * @ticks: number of stolen ticks
 */
void account_steal_ticks(unsigned long ticks)
{
        account_steal_time(jiffies_to_cputime(ticks));
}

/*
 * Account multiple ticks of idle time.
 * @ticks: number of idle ticks
 */
void account_idle_ticks(unsigned long ticks)
{
        if (sched_clock_irqtime) {
                irqtime_account_idle_ticks(ticks);
                return;
        }

        account_idle_time(jiffies_to_cputime(ticks));
}

#endif /* !CONFIG_VIRT_CPU_ACCOUNTING */

/*
 * Use precise platform statistics if available:
 */
#ifdef CONFIG_VIRT_CPU_ACCOUNTING
void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st)
{
        *ut = p->utime;
        *st = p->stime;
}

void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st)
{
        struct task_cputime cputime;

        thread_group_cputime(p, &cputime);

        *ut = cputime.utime;
        *st = cputime.stime;
}

void vtime_account_system(struct task_struct *tsk)
{
        unsigned long flags;

        local_irq_save(flags);
        __vtime_account_system(tsk);
        local_irq_restore(flags);
}
EXPORT_SYMBOL_GPL(vtime_account_system);

/*
 * Archs that account the whole time spent in the idle task
 * (outside irq) as idle time can rely on this and just implement
 * __vtime_account_system() and __vtime_account_idle(). Archs that
 * attach a different meaning to idle time (s390 only includes the
 * time spent by the CPU when it's in low power mode) must override
 * vtime_account().
 */
#ifndef __ARCH_HAS_VTIME_ACCOUNT
void vtime_account(struct task_struct *tsk)
{
        unsigned long flags;

        local_irq_save(flags);

        if (in_interrupt() || !is_idle_task(tsk))
                __vtime_account_system(tsk);
        else
                __vtime_account_idle(tsk);

        local_irq_restore(flags);
}
EXPORT_SYMBOL_GPL(vtime_account);
#endif /* __ARCH_HAS_VTIME_ACCOUNT */

#else /* !CONFIG_VIRT_CPU_ACCOUNTING */

#ifndef nsecs_to_cputime
# define nsecs_to_cputime(__nsecs) nsecs_to_jiffies(__nsecs)
#endif

static cputime_t scale_utime(cputime_t utime, cputime_t rtime, cputime_t total)
{
        u64 temp = (__force u64) rtime;

        temp *= (__force u64) utime;

        if (sizeof(cputime_t) == 4)
                temp = div_u64(temp, (__force u32) total);
        else
                temp = div64_u64(temp, (__force u64) total);

        return (__force cputime_t) temp;
}
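
/*
 * Worked example with illustrative numbers: for utime = 300 and
 * stime = 100 (total = 400) against rtime = 1000, the scaled utime is
 * 1000 * 300 / 400 = 750, and the caller's rtime - utime split leaves
 * 250 for stime. The sizeof() check lets a 32-bit cputime_t use the
 * cheaper 64/32 div_u64(); a 64-bit cputime_t needs div64_u64().
 */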

void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st)
{
        cputime_t rtime, utime = p->utime, total = utime + p->stime;

        /*
         * Use CFS's precise accounting:
         */
        rtime = nsecs_to_cputime(p->se.sum_exec_runtime);

        if (total)
                utime = scale_utime(utime, rtime, total);
        else
                utime = rtime;

        /*
         * Compare with previous values, to keep monotonicity:
         */
        p->prev_utime = max(p->prev_utime, utime);
        p->prev_stime = max(p->prev_stime, rtime - p->prev_utime);

        *ut = p->prev_utime;
        *st = p->prev_stime;
}
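
/*
 * The max() clamps above exist because utime is rescaled from sampled
 * tick counters while rtime comes from the scheduler clock; a fresh
 * split can therefore report smaller values than an earlier call did.
 * Clamping against prev_utime/prev_stime keeps what userspace sees
 * (e.g. via /proc) monotonically non-decreasing.
 */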

/*
 * Must be called with siglock held.
 */
void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st)
{
        struct signal_struct *sig = p->signal;
        struct task_cputime cputime;
        cputime_t rtime, utime, total;

        thread_group_cputime(p, &cputime);

        total = cputime.utime + cputime.stime;
        rtime = nsecs_to_cputime(cputime.sum_exec_runtime);

        if (total)
                utime = scale_utime(cputime.utime, rtime, total);
        else
                utime = rtime;

        sig->prev_utime = max(sig->prev_utime, utime);
        sig->prev_stime = max(sig->prev_stime, rtime - sig->prev_utime);

        *ut = sig->prev_utime;
        *st = sig->prev_stime;
}
#endif /* CONFIG_VIRT_CPU_ACCOUNTING */