psi: split update_stats into parts
[linux-2.6-block.git] / kernel / sched / psi.c
1/*
2 * Pressure stall information for CPU, memory and IO
3 *
4 * Copyright (c) 2018 Facebook, Inc.
5 * Author: Johannes Weiner <hannes@cmpxchg.org>
6 *
7 * When CPU, memory and IO are contended, tasks experience delays that
8 * reduce throughput and introduce latencies into the workload. Memory
9 * and IO contention, in addition, can cause a full loss of forward
10 * progress in which the CPU goes idle.
11 *
12 * This code aggregates individual task delays into resource pressure
13 * metrics that indicate problems with both workload health and
14 * resource utilization.
15 *
16 * Model
17 *
18 * The time in which a task can execute on a CPU is our baseline for
19 * productivity. Pressure expresses the amount of time in which this
20 * potential cannot be realized due to resource contention.
21 *
22 * This concept of productivity has two components: the workload and
23 * the CPU. To measure the impact of pressure on both, we define two
24 * contention states for a resource: SOME and FULL.
25 *
26 * In the SOME state of a given resource, one or more tasks are
27 * delayed on that resource. This affects the workload's ability to
28 * perform work, but the CPU may still be executing other tasks.
29 *
30 * In the FULL state of a given resource, all non-idle tasks are
31 * delayed on that resource such that nobody is advancing and the CPU
32 * goes idle. This leaves both workload and CPU unproductive.
33 *
34 * (Naturally, the FULL state doesn't exist for the CPU resource.)
35 *
36 * SOME = nr_delayed_tasks != 0
37 * FULL = nr_delayed_tasks != 0 && nr_running_tasks == 0
38 *
39 * The percentage of wallclock time spent in those compound stall
40 * states gives pressure numbers between 0 and 100 for each resource,
41 * where the SOME percentage indicates workload slowdowns and the FULL
42 * percentage indicates reduced CPU utilization:
43 *
44 * %SOME = time(SOME) / period
45 * %FULL = time(FULL) / period
46 *
47 * Multiple CPUs
48 *
49 * The more tasks and available CPUs there are, the more work can be
50 * performed concurrently. This means that the potential that can go
51 * unrealized due to resource contention *also* scales with non-idle
52 * tasks and CPUs.
53 *
54 * Consider a scenario where 257 number crunching tasks are trying to
55 * run concurrently on 256 CPUs. If we simply aggregated the task
56 * states, we would have to conclude a CPU SOME pressure number of
57 * 100%, since *somebody* is waiting on a runqueue at all
58 * times. However, that is clearly not the amount of contention the
59 * workload is experiencing: only one out of 256 possible execution
60 * threads will be contended at any given time, or about 0.4%.
61 *
62 * Conversely, consider a scenario of 4 tasks and 4 CPUs where at any
63 * given time *one* of the tasks is delayed due to a lack of memory.
64 * Again, looking purely at the task state would yield a memory FULL
65 * pressure number of 0%, since *somebody* is always making forward
66 * progress. But again this wouldn't capture the amount of execution
67 * potential lost, which is 1 out of 4 CPUs, or 25%.
68 *
69 * To calculate wasted potential (pressure) with multiple processors,
70 * we have to base our calculation on the number of non-idle tasks in
71 * conjunction with the number of available CPUs, which is the number
72 * of potential execution threads. SOME then becomes the proportion of
73 * delayed tasks to possible threads, and FULL is the share of possible
74 * threads that are unproductive due to delays:
75 *
76 * threads = min(nr_nonidle_tasks, nr_cpus)
77 * SOME = min(nr_delayed_tasks / threads, 1)
78 * FULL = (threads - min(nr_running_tasks, threads)) / threads
79 *
80 * For the 257 number crunchers on 256 CPUs, this yields:
81 *
82 * threads = min(257, 256)
83 * SOME = min(1 / 256, 1) = 0.4%
84 * FULL = (256 - min(257, 256)) / 256 = 0%
85 *
86 * For the 1 out of 4 memory-delayed tasks, this yields:
87 *
88 * threads = min(4, 4)
89 * SOME = min(1 / 4, 1) = 25%
90 * FULL = (4 - min(3, 4)) / 4 = 25%
91 *
92 * [ Substitute nr_cpus with 1, and you can see that it's a natural
93 * extension of the single-CPU model. ]
94 *
95 * Implementation
96 *
97 * To assess the precise time spent in each such state, we would have
98 * to freeze the system on task changes and start/stop the state
99 * clocks accordingly. Obviously that doesn't scale in practice.
100 *
101 * Because the scheduler aims to distribute the compute load evenly
102 * among the available CPUs, we can track task state locally to each
103 * CPU and, at much lower frequency, extrapolate the global state for
104 * the cumulative stall times and the running averages.
105 *
106 * For each runqueue, we track:
107 *
108 * tSOME[cpu] = time(nr_delayed_tasks[cpu] != 0)
109 * tFULL[cpu] = time(nr_delayed_tasks[cpu] && !nr_running_tasks[cpu])
110 * tNONIDLE[cpu] = time(nr_nonidle_tasks[cpu] != 0)
111 *
112 * and then periodically aggregate:
113 *
114 * tNONIDLE = sum(tNONIDLE[i])
115 *
116 * tSOME = sum(tSOME[i] * tNONIDLE[i]) / tNONIDLE
117 * tFULL = sum(tFULL[i] * tNONIDLE[i]) / tNONIDLE
118 *
119 * %SOME = tSOME / period
120 * %FULL = tFULL / period
121 *
122 * This gives us an approximation of pressure that is practical
123 * cost-wise, yet way more sensitive and accurate than periodic
124 * sampling of the aggregate task states would be.
125 */
126
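/*
 * Illustrative sketch only, not part of the kernel sources: the
 * multi-CPU SOME/FULL formulas from the comment above, spelled out in
 * plain C. The function and parameter names are invented for this
 * example; in the kernel the per-CPU state tests live in test_state()
 * and the time accounting in record_times() below.
 */
#if 0
static double psi_example_some(unsigned int nr_delayed,
			       unsigned int nr_nonidle, unsigned int nr_cpus)
{
	unsigned int threads = nr_nonidle < nr_cpus ? nr_nonidle : nr_cpus;

	if (!threads)
		return 0.0;
	/* SOME = min(nr_delayed_tasks / threads, 1) */
	return nr_delayed >= threads ? 1.0 : (double)nr_delayed / threads;
}

static double psi_example_full(unsigned int nr_running,
			       unsigned int nr_nonidle, unsigned int nr_cpus)
{
	unsigned int threads = nr_nonidle < nr_cpus ? nr_nonidle : nr_cpus;
	unsigned int running = nr_running < threads ? nr_running : threads;

	if (!threads)
		return 0.0;
	/* FULL = (threads - min(nr_running_tasks, threads)) / threads */
	return (double)(threads - running) / threads;
}
#endif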
127#include "../workqueue_internal.h"
128#include <linux/sched/loadavg.h>
129#include <linux/seq_file.h>
130#include <linux/proc_fs.h>
131#include <linux/seqlock.h>
132#include <linux/cgroup.h>
133#include <linux/module.h>
134#include <linux/sched.h>
135#include <linux/psi.h>
136#include "sched.h"
137
138static int psi_bug __read_mostly;
139
140DEFINE_STATIC_KEY_FALSE(psi_disabled);
141
142#ifdef CONFIG_PSI_DEFAULT_DISABLED
143static bool psi_enable;
144#else
145static bool psi_enable = true;
146#endif
147static int __init setup_psi(char *str)
148{
149 return kstrtobool(str, &psi_enable) == 0;
150}
151__setup("psi=", setup_psi);
152
153/* Running averages - we need to be higher-res than loadavg */
154#define PSI_FREQ (2*HZ+1) /* 2 sec intervals */
155#define EXP_10s 1677 /* 1/exp(2s/10s) as fixed-point */
156#define EXP_60s 1981 /* 1/exp(2s/60s) */
157#define EXP_300s 2034 /* 1/exp(2s/300s) */
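158
159/*
160 * Where the EXP_* values come from (a sketch, assuming the loadavg
161 * fixed-point base FIXED_1 = 1 << 11 = 2048 from <linux/sched/loadavg.h>):
162 *
163 *   EXP_10s  = round(2048 / e^(2/10))  ~= round(2048 / 1.2214) = 1677
164 *   EXP_60s  = round(2048 / e^(2/60))  ~= round(2048 / 1.0339) = 1981
165 *   EXP_300s = round(2048 / e^(2/300)) ~= round(2048 / 1.0067) = 2034
166 */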
158
159/* Sampling frequency in nanoseconds */
160static u64 psi_period __read_mostly;
161
162/* System-level pressure and stall tracking */
163static DEFINE_PER_CPU(struct psi_group_cpu, system_group_pcpu);
164static struct psi_group psi_system = {
165 .pcpu = &system_group_pcpu,
166};
167
168static void psi_avgs_work(struct work_struct *work);
169
170static void group_init(struct psi_group *group)
171{
172 int cpu;
173
174 for_each_possible_cpu(cpu)
175 seqcount_init(&per_cpu_ptr(group->pcpu, cpu)->seq);
176 group->avg_next_update = sched_clock() + psi_period;
177 INIT_DELAYED_WORK(&group->avgs_work, psi_avgs_work);
178 mutex_init(&group->avgs_lock);
179}
180
181void __init psi_init(void)
182{
183 if (!psi_enable) {
184 static_branch_enable(&psi_disabled);
185 return;
186 }
187
188 psi_period = jiffies_to_nsecs(PSI_FREQ);
189 group_init(&psi_system);
190}
191
192static bool test_state(unsigned int *tasks, enum psi_states state)
193{
194 switch (state) {
195 case PSI_IO_SOME:
196 return tasks[NR_IOWAIT];
197 case PSI_IO_FULL:
198 return tasks[NR_IOWAIT] && !tasks[NR_RUNNING];
199 case PSI_MEM_SOME:
200 return tasks[NR_MEMSTALL];
201 case PSI_MEM_FULL:
202 return tasks[NR_MEMSTALL] && !tasks[NR_RUNNING];
203 case PSI_CPU_SOME:
204 return tasks[NR_RUNNING] > 1;
205 case PSI_NONIDLE:
206 return tasks[NR_IOWAIT] || tasks[NR_MEMSTALL] ||
207 tasks[NR_RUNNING];
208 default:
209 return false;
210 }
211}
212
213static void get_recent_times(struct psi_group *group, int cpu, u32 *times)
214{
215 struct psi_group_cpu *groupc = per_cpu_ptr(group->pcpu, cpu);
216 u64 now, state_start;
217 enum psi_states s;
218 unsigned int seq;
219 u32 state_mask;
220
221 /* Snapshot a coherent view of the CPU state */
222 do {
223 seq = read_seqcount_begin(&groupc->seq);
224 now = cpu_clock(cpu);
225 memcpy(times, groupc->times, sizeof(groupc->times));
226 state_mask = groupc->state_mask;
227 state_start = groupc->state_start;
228 } while (read_seqcount_retry(&groupc->seq, seq));
229
230 /* Calculate state time deltas against the previous snapshot */
231 for (s = 0; s < NR_PSI_STATES; s++) {
232 u32 delta;
233 /*
234 * In addition to already concluded states, we also
235 * incorporate currently active states on the CPU,
236 * since states may last for many sampling periods.
237 *
238 * This way we keep our delta sampling buckets small
239 * (u32) and our reported pressure close to what's
240 * actually happening.
241 */
242 if (state_mask & (1 << s))
243 times[s] += now - state_start;
244
245 delta = times[s] - groupc->times_prev[s];
246 groupc->times_prev[s] = times[s];
247
248 times[s] = delta;
249 }
250}
251
252static void calc_avgs(unsigned long avg[3], int missed_periods,
253 u64 time, u64 period)
254{
255 unsigned long pct;
256
257 /* Fill in zeroes for periods of no activity */
258 if (missed_periods) {
259 avg[0] = calc_load_n(avg[0], EXP_10s, 0, missed_periods);
260 avg[1] = calc_load_n(avg[1], EXP_60s, 0, missed_periods);
261 avg[2] = calc_load_n(avg[2], EXP_300s, 0, missed_periods);
262 }
263
264 /* Sample the most recent active period */
265 pct = div_u64(time * 100, period);
266 pct *= FIXED_1;
267 avg[0] = calc_load(avg[0], EXP_10s, pct);
268 avg[1] = calc_load(avg[1], EXP_60s, pct);
269 avg[2] = calc_load(avg[2], EXP_300s, pct);
270}
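
/*
 * Sketch of what each calc_load() step above amounts to, assuming the
 * fixed-point helper from <linux/sched/loadavg.h> with FIXED_1 = 2048:
 *
 *   avg' = (avg * EXP + pct * (2048 - EXP)) / 2048
 *
 * i.e. a standard exponentially weighted moving average. As a rough
 * worked example, a single 2s period of 100% stall lifts a 10s average
 * that was previously zero by a factor of (2048 - 1677) / 2048, to
 * about 18%.
 */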
271
272static bool collect_percpu_times(struct psi_group *group)
273{
274 u64 deltas[NR_PSI_STATES - 1] = { 0, };
275 unsigned long nonidle_total = 0;
276 int cpu;
277 int s;
278
279 /*
280 * Collect the per-cpu time buckets and average them into a
281 * single time sample that is normalized to wallclock time.
282 *
283 * For averaging, each CPU is weighted by its non-idle time in
284 * the sampling period. This eliminates artifacts from uneven
285 * loading, or even entirely idle CPUs.
286 */
287 for_each_possible_cpu(cpu) {
288 u32 times[NR_PSI_STATES];
289 u32 nonidle;
290
291 get_recent_times(group, cpu, times);
292
293 nonidle = nsecs_to_jiffies(times[PSI_NONIDLE]);
294 nonidle_total += nonidle;
295
296 for (s = 0; s < PSI_NONIDLE; s++)
297 deltas[s] += (u64)times[s] * nonidle;
298 }
299
300 /*
301 * Integrate the sample into the running statistics that are
302 * reported to userspace: the cumulative stall times and the
303 * decaying averages.
304 *
305 * Pressure percentages are sampled at PSI_FREQ. We might be
306 * called more often when the user polls more frequently than
307 * that; we might be called less often when there is no task
308 * activity, thus no data, and clock ticks are sporadic. The
309 * below handles both.
310 */
311
312 /* total= */
313 for (s = 0; s < NR_PSI_STATES - 1; s++)
314 group->total[s] += div_u64(deltas[s], max(nonidle_total, 1UL));
315
316 return nonidle_total;
317}
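
/*
 * Rough worked example of the non-idle weighting above (numbers
 * invented for illustration): over one 2s sampling period, suppose
 * CPU0 was non-idle the whole time with 1s of SOME stall, while CPU1
 * stayed completely idle. The SOME delta is then weighted as 1s * 2s
 * and divided by a nonidle_total of 2s, adding 1s of stall to
 * group->total, i.e. a 50% reading. Averaging the two CPUs without
 * the weight would have diluted the same situation to 25%.
 */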
318
319static u64 update_averages(struct psi_group *group, u64 now)
320{
321 unsigned long missed_periods = 0;
322 u64 expires, period;
323 u64 avg_next_update;
324 int s;
325
326 /* avgX= */
327 expires = group->avg_next_update;
328 if (now - expires >= psi_period)
329 missed_periods = div_u64(now - expires, psi_period);
330
331 /*
332 * The periodic clock tick can get delayed for various
333 * reasons, especially on loaded systems. To avoid clock
334 * drift, we schedule the clock in fixed psi_period intervals.
335 * But the deltas we sample out of the per-cpu buckets above
336 * are based on the actual time elapsing between clock ticks.
337 */
338 avg_next_update = expires + ((1 + missed_periods) * psi_period);
339 period = now - (group->avg_last_update + (missed_periods * psi_period));
340 group->avg_last_update = now;
341
342 for (s = 0; s < NR_PSI_STATES - 1; s++) {
343 u32 sample;
344
345 sample = group->total[s] - group->avg_total[s];
346 /*
347 * Due to the lockless sampling of the time buckets,
348 * recorded time deltas can slip into the next period,
349 * which under full pressure can result in samples in
350 * excess of the period length.
351 *
352 * We don't want to report non-sensical pressures in
353 * excess of 100%, nor do we want to drop such events
354 * on the floor. Instead we punt any overage into the
355 * future until pressure subsides. By doing this we
356 * don't underreport the occurring pressure curve, we
357 * just report it delayed by one period length.
358 *
359 * The error isn't cumulative. As soon as another
360 * delta slips from a period P to P+1, by definition
361 * it frees up its time T in P.
362 */
363 if (sample > period)
364 sample = period;
365 group->avg_total[s] += sample;
366 calc_avgs(group->avg[s], missed_periods, sample, period);
367 }
368
369 return avg_next_update;
370}
371
372static void psi_avgs_work(struct work_struct *work)
373{
374 struct delayed_work *dwork;
375 struct psi_group *group;
376 bool nonidle;
377 u64 now;
378
379 dwork = to_delayed_work(work);
380 group = container_of(dwork, struct psi_group, avgs_work);
381
382 mutex_lock(&group->avgs_lock);
383
384 now = sched_clock();
385
386 nonidle = collect_percpu_times(group);
387 /*
388 * If there is task activity, periodically fold the per-cpu
389 * times and feed samples into the running averages. If things
390 * are idle and there is no data to process, stop the clock.
391 * Once restarted, we'll catch up the running averages in one
392 * go - see calc_avgs() and missed_periods.
393 */
394 if (now >= group->avg_next_update)
395 group->avg_next_update = update_averages(group, now);
396
397 if (nonidle) {
398 schedule_delayed_work(dwork, nsecs_to_jiffies(
399 group->avg_next_update - now) + 1);
400 }
401
402 mutex_unlock(&group->avgs_lock);
403}
404
405static void record_times(struct psi_group_cpu *groupc, int cpu,
406 bool memstall_tick)
407{
408 u32 delta;
409 u64 now;
410
411 now = cpu_clock(cpu);
412 delta = now - groupc->state_start;
413 groupc->state_start = now;
414
415 if (groupc->state_mask & (1 << PSI_IO_SOME)) {
416 groupc->times[PSI_IO_SOME] += delta;
417 if (groupc->state_mask & (1 << PSI_IO_FULL))
418 groupc->times[PSI_IO_FULL] += delta;
419 }
420
421 if (groupc->state_mask & (1 << PSI_MEM_SOME)) {
422 groupc->times[PSI_MEM_SOME] += delta;
423 if (groupc->state_mask & (1 << PSI_MEM_FULL))
424 groupc->times[PSI_MEM_FULL] += delta;
425 else if (memstall_tick) {
426 u32 sample;
427 /*
428 * Since we care about lost potential, a
429 * memstall is FULL when there are no other
430 * working tasks, but also when the CPU is
431 * actively reclaiming and nothing productive
432 * could run even if it were runnable.
433 *
434 * When the timer tick sees a reclaiming CPU,
435 * regardless of runnable tasks, sample a FULL
436 * tick (or less if it hasn't been a full tick
437 * since the last state change).
438 */
439 sample = min(delta, (u32)jiffies_to_nsecs(1));
440 groupc->times[PSI_MEM_FULL] += sample;
441 }
442 }
443
444 if (groupc->state_mask & (1 << PSI_CPU_SOME))
445 groupc->times[PSI_CPU_SOME] += delta;
446
447 if (groupc->state_mask & (1 << PSI_NONIDLE))
448 groupc->times[PSI_NONIDLE] += delta;
449}
450
451static void psi_group_change(struct psi_group *group, int cpu,
452 unsigned int clear, unsigned int set)
453{
454 struct psi_group_cpu *groupc;
455 unsigned int t, m;
456 enum psi_states s;
457 u32 state_mask = 0;
458
459 groupc = per_cpu_ptr(group->pcpu, cpu);
460
461 /*
462 * First we assess the aggregate resource states this CPU's
463 * tasks have been in since the last change, and account any
464 * SOME and FULL time these may have resulted in.
465 *
466 * Then we update the task counts according to the state
467 * change requested through the @clear and @set bits.
468 */
469 write_seqcount_begin(&groupc->seq);
470
471 record_times(groupc, cpu, false);
472
473 for (t = 0, m = clear; m; m &= ~(1 << t), t++) {
474 if (!(m & (1 << t)))
475 continue;
476 if (groupc->tasks[t] == 0 && !psi_bug) {
477 printk_deferred(KERN_ERR "psi: task underflow! cpu=%d t=%d tasks=[%u %u %u] clear=%x set=%x\n",
478 cpu, t, groupc->tasks[0],
479 groupc->tasks[1], groupc->tasks[2],
480 clear, set);
481 psi_bug = 1;
482 }
483 groupc->tasks[t]--;
484 }
485
486 for (t = 0; set; set &= ~(1 << t), t++)
487 if (set & (1 << t))
488 groupc->tasks[t]++;
489
490 /* Calculate state mask representing active states */
491 for (s = 0; s < NR_PSI_STATES; s++) {
492 if (test_state(groupc->tasks, s))
493 state_mask |= (1 << s);
494 }
495 groupc->state_mask = state_mask;
496
497 write_seqcount_end(&groupc->seq);
498}
499
500static struct psi_group *iterate_groups(struct task_struct *task, void **iter)
501{
502#ifdef CONFIG_CGROUPS
503 struct cgroup *cgroup = NULL;
504
505 if (!*iter)
506 cgroup = task->cgroups->dfl_cgrp;
507 else if (*iter == &psi_system)
508 return NULL;
509 else
510 cgroup = cgroup_parent(*iter);
511
512 if (cgroup && cgroup_parent(cgroup)) {
513 *iter = cgroup;
514 return cgroup_psi(cgroup);
515 }
516#else
517 if (*iter)
518 return NULL;
519#endif
520 *iter = &psi_system;
521 return &psi_system;
522}
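
/*
 * Example of the iteration order (hypothetical hierarchy): for a task
 * running in /sys/fs/cgroup/a/b, successive calls return the
 * psi_group of b, then of a, and finally &psi_system. The root cgroup
 * itself is skipped; system-wide state is tracked in psi_system
 * instead.
 */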
523
524void psi_task_change(struct task_struct *task, int clear, int set)
525{
526 int cpu = task_cpu(task);
527 struct psi_group *group;
528 bool wake_clock = true;
529 void *iter = NULL;
530
531 if (!task->pid)
532 return;
533
534 if (((task->psi_flags & set) ||
535 (task->psi_flags & clear) != clear) &&
536 !psi_bug) {
537 printk_deferred(KERN_ERR "psi: inconsistent task state! task=%d:%s cpu=%d psi_flags=%x clear=%x set=%x\n",
538 task->pid, task->comm, cpu,
539 task->psi_flags, clear, set);
540 psi_bug = 1;
541 }
542
543 task->psi_flags &= ~clear;
544 task->psi_flags |= set;
545
546 /*
547 * Periodic aggregation shuts off if there is a period of no
548 * task changes, so we wake it back up if necessary. However,
549 * don't do this if the task change is the aggregation worker
550 * itself going to sleep, or we'll ping-pong forever.
551 */
552 if (unlikely((clear & TSK_RUNNING) &&
553 (task->flags & PF_WQ_WORKER) &&
554 wq_worker_last_func(task) == psi_avgs_work))
555 wake_clock = false;
556
557 while ((group = iterate_groups(task, &iter))) {
558 psi_group_change(group, cpu, clear, set);
559 if (wake_clock && !delayed_work_pending(&group->avgs_work))
560 schedule_delayed_work(&group->avgs_work, PSI_FREQ);
561 }
562}
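
/*
 * Example of the clear/set bits (a sketch of how the scheduler hooks
 * in kernel/sched/stats.h use this): when a running task goes to
 * sleep in iowait, psi_task_change() is called with
 * clear = TSK_RUNNING and set = TSK_IOWAIT; the later wakeup does the
 * reverse. Tasks in direct reclaim additionally set TSK_MEMSTALL
 * through psi_memstall_enter() below.
 */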
563
564void psi_memstall_tick(struct task_struct *task, int cpu)
565{
566 struct psi_group *group;
567 void *iter = NULL;
568
569 while ((group = iterate_groups(task, &iter))) {
570 struct psi_group_cpu *groupc;
571
572 groupc = per_cpu_ptr(group->pcpu, cpu);
573 write_seqcount_begin(&groupc->seq);
574 record_times(groupc, cpu, true);
575 write_seqcount_end(&groupc->seq);
576 }
577}
578
579/**
580 * psi_memstall_enter - mark the beginning of a memory stall section
581 * @flags: flags to handle nested sections
582 *
583 * Marks the calling task as being stalled due to a lack of memory,
584 * such as waiting for a refault or performing reclaim.
585 */
586void psi_memstall_enter(unsigned long *flags)
587{
588 struct rq_flags rf;
589 struct rq *rq;
590
591 if (static_branch_likely(&psi_disabled))
592 return;
593
594 *flags = current->flags & PF_MEMSTALL;
595 if (*flags)
596 return;
597 /*
598 * PF_MEMSTALL setting & accounting needs to be atomic wrt
599 * changes to the task's scheduling state, otherwise we can
600 * race with CPU migration.
601 */
602 rq = this_rq_lock_irq(&rf);
603
604 current->flags |= PF_MEMSTALL;
605 psi_task_change(current, 0, TSK_MEMSTALL);
606
607 rq_unlock_irq(rq, &rf);
608}
609
610/**
611 * psi_memstall_leave - mark the end of a memory stall section
612 * @flags: flags to handle nested sections
613 *
614 * Marks the calling task as no longer stalled due to lack of memory.
615 */
616void psi_memstall_leave(unsigned long *flags)
617{
618 struct rq_flags rf;
619 struct rq *rq;
620
621 if (static_branch_likely(&psi_disabled))
622 return;
623
624 if (*flags)
625 return;
626 /*
627 * PF_MEMSTALL clearing & accounting needs to be atomic wrt
628 * changes to the task's scheduling state, otherwise we could
629 * race with CPU migration.
630 */
631 rq = this_rq_lock_irq(&rf);
632
633 current->flags &= ~PF_MEMSTALL;
634 psi_task_change(current, TSK_MEMSTALL, 0);
635
636 rq_unlock_irq(rq, &rf);
637}
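
/*
 * Typical call pattern (a sketch of a hypothetical caller; the real
 * call sites live in the memory management code, e.g. reclaim and
 * refault paths). The flags word lets nested sections pass through
 * without being accounted twice:
 */
#if 0
static void example_reclaim_path(void)
{
	unsigned long pflags;

	psi_memstall_enter(&pflags);
	/* ... perform reclaim or wait for a refault ... */
	psi_memstall_leave(&pflags);
}
#endif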
638
639#ifdef CONFIG_CGROUPS
640int psi_cgroup_alloc(struct cgroup *cgroup)
641{
642 if (static_branch_likely(&psi_disabled))
643 return 0;
644
645 cgroup->psi.pcpu = alloc_percpu(struct psi_group_cpu);
646 if (!cgroup->psi.pcpu)
647 return -ENOMEM;
648 group_init(&cgroup->psi);
649 return 0;
650}
651
652void psi_cgroup_free(struct cgroup *cgroup)
653{
654 if (static_branch_likely(&psi_disabled))
655 return;
656
657 cancel_delayed_work_sync(&cgroup->psi.avgs_work);
658 free_percpu(cgroup->psi.pcpu);
659}
660
661/**
662 * cgroup_move_task - move task to a different cgroup
663 * @task: the task
664 * @to: the target css_set
665 *
666 * Move task to a new cgroup and safely migrate its associated stall
667 * state between the different groups.
668 *
669 * This function acquires the task's rq lock to lock out concurrent
670 * changes to the task's scheduling state and - in case the task is
671 * running - concurrent changes to its stall state.
672 */
673void cgroup_move_task(struct task_struct *task, struct css_set *to)
674{
675 unsigned int task_flags = 0;
676 struct rq_flags rf;
677 struct rq *rq;
678
679 if (static_branch_likely(&psi_disabled)) {
680 /*
681 * Lame to do this here, but the scheduler cannot be locked
682 * from the outside, so we move cgroups from inside sched/.
683 */
684 rcu_assign_pointer(task->cgroups, to);
685 return;
686 }
687
688 rq = task_rq_lock(task, &rf);
689
690 if (task_on_rq_queued(task))
691 task_flags = TSK_RUNNING;
692 else if (task->in_iowait)
693 task_flags = TSK_IOWAIT;
694
695 if (task->flags & PF_MEMSTALL)
696 task_flags |= TSK_MEMSTALL;
697
698 if (task_flags)
699 psi_task_change(task, task_flags, 0);
700
701 /* See comment above */
702 rcu_assign_pointer(task->cgroups, to);
703
704 if (task_flags)
705 psi_task_change(task, 0, task_flags);
706
707 task_rq_unlock(rq, task, &rf);
708}
709#endif /* CONFIG_CGROUPS */
710
711int psi_show(struct seq_file *m, struct psi_group *group, enum psi_res res)
712{
713 int full;
714 u64 now;
715
716 if (static_branch_likely(&psi_disabled))
717 return -EOPNOTSUPP;
718
719 /* Update averages before reporting them */
720 mutex_lock(&group->avgs_lock);
721 now = sched_clock();
722 collect_percpu_times(group);
723 if (now >= group->avg_next_update)
724 group->avg_next_update = update_averages(group, now);
725 mutex_unlock(&group->avgs_lock);
726
727 for (full = 0; full < 2 - (res == PSI_CPU); full++) {
728 unsigned long avg[3];
729 u64 total;
730 int w;
731
732 for (w = 0; w < 3; w++)
733 avg[w] = group->avg[res * 2 + full][w];
734 total = div_u64(group->total[res * 2 + full], NSEC_PER_USEC);
735
736 seq_printf(m, "%s avg10=%lu.%02lu avg60=%lu.%02lu avg300=%lu.%02lu total=%llu\n",
737 full ? "full" : "some",
738 LOAD_INT(avg[0]), LOAD_FRAC(avg[0]),
739 LOAD_INT(avg[1]), LOAD_FRAC(avg[1]),
740 LOAD_INT(avg[2]), LOAD_FRAC(avg[2]),
741 total);
742 }
743
744 return 0;
745}
746
747static int psi_io_show(struct seq_file *m, void *v)
748{
749 return psi_show(m, &psi_system, PSI_IO);
750}
751
752static int psi_memory_show(struct seq_file *m, void *v)
753{
754 return psi_show(m, &psi_system, PSI_MEM);
755}
756
757static int psi_cpu_show(struct seq_file *m, void *v)
758{
759 return psi_show(m, &psi_system, PSI_CPU);
760}
761
762static int psi_io_open(struct inode *inode, struct file *file)
763{
764 return single_open(file, psi_io_show, NULL);
765}
766
767static int psi_memory_open(struct inode *inode, struct file *file)
768{
769 return single_open(file, psi_memory_show, NULL);
770}
771
772static int psi_cpu_open(struct inode *inode, struct file *file)
773{
774 return single_open(file, psi_cpu_show, NULL);
775}
776
777static const struct file_operations psi_io_fops = {
778 .open = psi_io_open,
779 .read = seq_read,
780 .llseek = seq_lseek,
781 .release = single_release,
782};
783
784static const struct file_operations psi_memory_fops = {
785 .open = psi_memory_open,
786 .read = seq_read,
787 .llseek = seq_lseek,
788 .release = single_release,
789};
790
791static const struct file_operations psi_cpu_fops = {
792 .open = psi_cpu_open,
793 .read = seq_read,
794 .llseek = seq_lseek,
795 .release = single_release,
796};
797
798static int __init psi_proc_init(void)
799{
800 proc_mkdir("pressure", NULL);
801 proc_create("pressure/io", 0, NULL, &psi_io_fops);
802 proc_create("pressure/memory", 0, NULL, &psi_memory_fops);
803 proc_create("pressure/cpu", 0, NULL, &psi_cpu_fops);
804 return 0;
805}
806module_init(psi_proc_init);
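
/*
 * Reading one of these files, e.g. "cat /proc/pressure/memory",
 * yields two records in the format produced by psi_show() above (the
 * cpu file only reports the "some" line):
 *
 *   some avg10=0.00 avg60=0.00 avg300=0.00 total=0
 *   full avg10=0.00 avg60=0.00 avg300=0.00 total=0
 *
 * where the averages are percentages over the last 10s/60s/300s and
 * total is the absolute stall time in microseconds.
 */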