Merge branch 'sched/urgent' into sched/core, to pick up fixes
[linux-2.6-block.git] / kernel / sched / fair.c
CommitLineData
bf0f6f24
IM
1/*
2 * Completely Fair Scheduling (CFS) Class (SCHED_NORMAL/SCHED_BATCH)
3 *
4 * Copyright (C) 2007 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
5 *
6 * Interactivity improvements by Mike Galbraith
7 * (C) 2007 Mike Galbraith <efault@gmx.de>
8 *
9 * Various enhancements by Dmitry Adamushko.
10 * (C) 2007 Dmitry Adamushko <dmitry.adamushko@gmail.com>
11 *
12 * Group scheduling enhancements by Srivatsa Vaddagiri
13 * Copyright IBM Corporation, 2007
14 * Author: Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com>
15 *
16 * Scaled math optimizations by Thomas Gleixner
17 * Copyright (C) 2007, Thomas Gleixner <tglx@linutronix.de>
21805085
PZ
18 *
19 * Adaptive scheduling granularity, math enhancements by Peter Zijlstra
90eec103 20 * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra
bf0f6f24
IM
21 */
22
1983a922 23#include <linux/sched.h>
cb251765 24#include <linux/latencytop.h>
3436ae12 25#include <linux/cpumask.h>
83a0a96a 26#include <linux/cpuidle.h>
029632fb
PZ
27#include <linux/slab.h>
28#include <linux/profile.h>
29#include <linux/interrupt.h>
cbee9f88 30#include <linux/mempolicy.h>
e14808b4 31#include <linux/migrate.h>
cbee9f88 32#include <linux/task_work.h>
029632fb
PZ
33
34#include <trace/events/sched.h>
35
36#include "sched.h"
9745512c 37
bf0f6f24 38/*
21805085 39 * Targeted preemption latency for CPU-bound tasks:
864616ee 40 * (default: 6ms * (1 + ilog(ncpus)), units: nanoseconds)
bf0f6f24 41 *
21805085 42 * NOTE: this latency value is not the same as the concept of
d274a4ce
IM
43 * 'timeslice length' - timeslices in CFS are of variable length
44 * and have no persistent notion like in traditional, time-slice
45 * based scheduling concepts.
bf0f6f24 46 *
d274a4ce
IM
47 * (to see the precise effective timeslice length of your workload,
48 * run vmstat and monitor the context-switches (cs) field)
bf0f6f24 49 */
21406928
MG
50unsigned int sysctl_sched_latency = 6000000ULL;
51unsigned int normalized_sysctl_sched_latency = 6000000ULL;
2bd8e6d4 52
1983a922
CE
53/*
54 * The initial- and re-scaling of tunables is configurable
55 * (default SCHED_TUNABLESCALING_LOG = *(1+ilog(ncpus))
56 *
57 * Options are:
58 * SCHED_TUNABLESCALING_NONE - unscaled, always *1
59 * SCHED_TUNABLESCALING_LOG - scaled logarithmical, *1+ilog(ncpus)
60 * SCHED_TUNABLESCALING_LINEAR - scaled linear, *ncpus
61 */
62enum sched_tunable_scaling sysctl_sched_tunable_scaling
63 = SCHED_TUNABLESCALING_LOG;
64
2bd8e6d4 65/*
b2be5e96 66 * Minimal preemption granularity for CPU-bound tasks:
864616ee 67 * (default: 0.75 msec * (1 + ilog(ncpus)), units: nanoseconds)
2bd8e6d4 68 */
0bf377bb
IM
69unsigned int sysctl_sched_min_granularity = 750000ULL;
70unsigned int normalized_sysctl_sched_min_granularity = 750000ULL;
21805085
PZ
71
72/*
b2be5e96
PZ
73 * is kept at sysctl_sched_latency / sysctl_sched_min_granularity
74 */
0bf377bb 75static unsigned int sched_nr_latency = 8;
b2be5e96
PZ
76
77/*
2bba22c5 78 * After fork, child runs first. If set to 0 (default) then
b2be5e96 79 * parent will (try to) run first.
21805085 80 */
2bba22c5 81unsigned int sysctl_sched_child_runs_first __read_mostly;
bf0f6f24 82
bf0f6f24
IM
83/*
84 * SCHED_OTHER wake-up granularity.
172e082a 85 * (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds)
bf0f6f24
IM
86 *
87 * This option delays the preemption effects of decoupled workloads
88 * and reduces their over-scheduling. Synchronous workloads will still
89 * have immediate wakeup/sleep latencies.
90 */
172e082a 91unsigned int sysctl_sched_wakeup_granularity = 1000000UL;
0bcdcf28 92unsigned int normalized_sysctl_sched_wakeup_granularity = 1000000UL;
bf0f6f24 93
da84d961
IM
94const_debug unsigned int sysctl_sched_migration_cost = 500000UL;
95
a7a4f8a7
PT
96/*
97 * The exponential sliding window over which load is averaged for shares
98 * distribution.
99 * (default: 10msec)
100 */
101unsigned int __read_mostly sysctl_sched_shares_window = 10000000UL;
102
ec12cb7f
PT
103#ifdef CONFIG_CFS_BANDWIDTH
104/*
105 * Amount of runtime to allocate from global (tg) to local (per-cfs_rq) pool
106 * each time a cfs_rq requests quota.
107 *
108 * Note: in the case that the slice exceeds the runtime remaining (either due
109 * to consumption or the quota being specified to be smaller than the slice)
110 * we will always only issue the remaining available time.
111 *
112 * default: 5 msec, units: microseconds
113 */
114unsigned int sysctl_sched_cfs_bandwidth_slice = 5000UL;
115#endif
116
3273163c
MR
117/*
118 * The margin used when comparing utilization with CPU capacity:
119 * util * 1024 < capacity * margin
120 */
121unsigned int capacity_margin = 1280; /* ~20% */
122
8527632d
PG
123static inline void update_load_add(struct load_weight *lw, unsigned long inc)
124{
125 lw->weight += inc;
126 lw->inv_weight = 0;
127}
128
129static inline void update_load_sub(struct load_weight *lw, unsigned long dec)
130{
131 lw->weight -= dec;
132 lw->inv_weight = 0;
133}
134
135static inline void update_load_set(struct load_weight *lw, unsigned long w)
136{
137 lw->weight = w;
138 lw->inv_weight = 0;
139}
140
029632fb
PZ
141/*
142 * Increase the granularity value when there are more CPUs,
143 * because with more CPUs the 'effective latency' as visible
144 * to users decreases. But the relationship is not linear,
145 * so pick a second-best guess by going with the log2 of the
146 * number of CPUs.
147 *
148 * This idea comes from the SD scheduler of Con Kolivas:
149 */
58ac93e4 150static unsigned int get_update_sysctl_factor(void)
029632fb 151{
58ac93e4 152 unsigned int cpus = min_t(unsigned int, num_online_cpus(), 8);
029632fb
PZ
153 unsigned int factor;
154
155 switch (sysctl_sched_tunable_scaling) {
156 case SCHED_TUNABLESCALING_NONE:
157 factor = 1;
158 break;
159 case SCHED_TUNABLESCALING_LINEAR:
160 factor = cpus;
161 break;
162 case SCHED_TUNABLESCALING_LOG:
163 default:
164 factor = 1 + ilog2(cpus);
165 break;
166 }
167
168 return factor;
169}
170
171static void update_sysctl(void)
172{
173 unsigned int factor = get_update_sysctl_factor();
174
175#define SET_SYSCTL(name) \
176 (sysctl_##name = (factor) * normalized_sysctl_##name)
177 SET_SYSCTL(sched_min_granularity);
178 SET_SYSCTL(sched_latency);
179 SET_SYSCTL(sched_wakeup_granularity);
180#undef SET_SYSCTL
181}
182
183void sched_init_granularity(void)
184{
185 update_sysctl();
186}
187
9dbdb155 188#define WMULT_CONST (~0U)
029632fb
PZ
189#define WMULT_SHIFT 32
190
9dbdb155
PZ
191static void __update_inv_weight(struct load_weight *lw)
192{
193 unsigned long w;
194
195 if (likely(lw->inv_weight))
196 return;
197
198 w = scale_load_down(lw->weight);
199
200 if (BITS_PER_LONG > 32 && unlikely(w >= WMULT_CONST))
201 lw->inv_weight = 1;
202 else if (unlikely(!w))
203 lw->inv_weight = WMULT_CONST;
204 else
205 lw->inv_weight = WMULT_CONST / w;
206}
029632fb
PZ
207
208/*
9dbdb155
PZ
209 * delta_exec * weight / lw.weight
210 * OR
211 * (delta_exec * (weight * lw->inv_weight)) >> WMULT_SHIFT
212 *
1c3de5e1 213 * Either weight := NICE_0_LOAD and lw \e sched_prio_to_wmult[], in which case
9dbdb155
PZ
214 * we're guaranteed shift stays positive because inv_weight is guaranteed to
215 * fit 32 bits, and NICE_0_LOAD gives another 10 bits; therefore shift >= 22.
216 *
217 * Or, weight =< lw.weight (because lw.weight is the runqueue weight), thus
218 * weight/lw.weight <= 1, and therefore our shift will also be positive.
029632fb 219 */
9dbdb155 220static u64 __calc_delta(u64 delta_exec, unsigned long weight, struct load_weight *lw)
029632fb 221{
9dbdb155
PZ
222 u64 fact = scale_load_down(weight);
223 int shift = WMULT_SHIFT;
029632fb 224
9dbdb155 225 __update_inv_weight(lw);
029632fb 226
9dbdb155
PZ
227 if (unlikely(fact >> 32)) {
228 while (fact >> 32) {
229 fact >>= 1;
230 shift--;
231 }
029632fb
PZ
232 }
233
9dbdb155
PZ
234 /* hint to use a 32x32->64 mul */
235 fact = (u64)(u32)fact * lw->inv_weight;
029632fb 236
9dbdb155
PZ
237 while (fact >> 32) {
238 fact >>= 1;
239 shift--;
240 }
029632fb 241
9dbdb155 242 return mul_u64_u32_shr(delta_exec, fact, shift);
029632fb
PZ
243}
244
245
246const struct sched_class fair_sched_class;
a4c2f00f 247
bf0f6f24
IM
248/**************************************************************
249 * CFS operations on generic schedulable entities:
250 */
251
62160e3f 252#ifdef CONFIG_FAIR_GROUP_SCHED
bf0f6f24 253
62160e3f 254/* cpu runqueue to which this cfs_rq is attached */
bf0f6f24
IM
255static inline struct rq *rq_of(struct cfs_rq *cfs_rq)
256{
62160e3f 257 return cfs_rq->rq;
bf0f6f24
IM
258}
259
62160e3f
IM
260/* An entity is a task if it doesn't "own" a runqueue */
261#define entity_is_task(se) (!se->my_q)
bf0f6f24 262
8f48894f
PZ
263static inline struct task_struct *task_of(struct sched_entity *se)
264{
265#ifdef CONFIG_SCHED_DEBUG
266 WARN_ON_ONCE(!entity_is_task(se));
267#endif
268 return container_of(se, struct task_struct, se);
269}
270
b758149c
PZ
271/* Walk up scheduling entities hierarchy */
272#define for_each_sched_entity(se) \
273 for (; se; se = se->parent)
274
275static inline struct cfs_rq *task_cfs_rq(struct task_struct *p)
276{
277 return p->se.cfs_rq;
278}
279
280/* runqueue on which this entity is (to be) queued */
281static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se)
282{
283 return se->cfs_rq;
284}
285
286/* runqueue "owned" by this group */
287static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
288{
289 return grp->my_q;
290}
291
3d4b47b4
PZ
292static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
293{
294 if (!cfs_rq->on_list) {
67e86250
PT
295 /*
296 * Ensure we either appear before our parent (if already
297 * enqueued) or force our parent to appear after us when it is
298 * enqueued. The fact that we always enqueue bottom-up
299 * reduces this to two cases.
300 */
301 if (cfs_rq->tg->parent &&
302 cfs_rq->tg->parent->cfs_rq[cpu_of(rq_of(cfs_rq))]->on_list) {
303 list_add_rcu(&cfs_rq->leaf_cfs_rq_list,
304 &rq_of(cfs_rq)->leaf_cfs_rq_list);
305 } else {
306 list_add_tail_rcu(&cfs_rq->leaf_cfs_rq_list,
3d4b47b4 307 &rq_of(cfs_rq)->leaf_cfs_rq_list);
67e86250 308 }
3d4b47b4
PZ
309
310 cfs_rq->on_list = 1;
311 }
312}
313
314static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
315{
316 if (cfs_rq->on_list) {
317 list_del_rcu(&cfs_rq->leaf_cfs_rq_list);
318 cfs_rq->on_list = 0;
319 }
320}
321
b758149c
PZ
322/* Iterate thr' all leaf cfs_rq's on a runqueue */
323#define for_each_leaf_cfs_rq(rq, cfs_rq) \
324 list_for_each_entry_rcu(cfs_rq, &rq->leaf_cfs_rq_list, leaf_cfs_rq_list)
325
326/* Do the two (enqueued) entities belong to the same group ? */
fed14d45 327static inline struct cfs_rq *
b758149c
PZ
328is_same_group(struct sched_entity *se, struct sched_entity *pse)
329{
330 if (se->cfs_rq == pse->cfs_rq)
fed14d45 331 return se->cfs_rq;
b758149c 332
fed14d45 333 return NULL;
b758149c
PZ
334}
335
336static inline struct sched_entity *parent_entity(struct sched_entity *se)
337{
338 return se->parent;
339}
340
464b7527
PZ
341static void
342find_matching_se(struct sched_entity **se, struct sched_entity **pse)
343{
344 int se_depth, pse_depth;
345
346 /*
347 * preemption test can be made between sibling entities who are in the
348 * same cfs_rq i.e who have a common parent. Walk up the hierarchy of
349 * both tasks until we find their ancestors who are siblings of common
350 * parent.
351 */
352
353 /* First walk up until both entities are at same depth */
fed14d45
PZ
354 se_depth = (*se)->depth;
355 pse_depth = (*pse)->depth;
464b7527
PZ
356
357 while (se_depth > pse_depth) {
358 se_depth--;
359 *se = parent_entity(*se);
360 }
361
362 while (pse_depth > se_depth) {
363 pse_depth--;
364 *pse = parent_entity(*pse);
365 }
366
367 while (!is_same_group(*se, *pse)) {
368 *se = parent_entity(*se);
369 *pse = parent_entity(*pse);
370 }
371}
372
8f48894f
PZ
373#else /* !CONFIG_FAIR_GROUP_SCHED */
374
375static inline struct task_struct *task_of(struct sched_entity *se)
376{
377 return container_of(se, struct task_struct, se);
378}
bf0f6f24 379
62160e3f
IM
380static inline struct rq *rq_of(struct cfs_rq *cfs_rq)
381{
382 return container_of(cfs_rq, struct rq, cfs);
bf0f6f24
IM
383}
384
385#define entity_is_task(se) 1
386
b758149c
PZ
387#define for_each_sched_entity(se) \
388 for (; se; se = NULL)
bf0f6f24 389
b758149c 390static inline struct cfs_rq *task_cfs_rq(struct task_struct *p)
bf0f6f24 391{
b758149c 392 return &task_rq(p)->cfs;
bf0f6f24
IM
393}
394
b758149c
PZ
395static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se)
396{
397 struct task_struct *p = task_of(se);
398 struct rq *rq = task_rq(p);
399
400 return &rq->cfs;
401}
402
403/* runqueue "owned" by this group */
404static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
405{
406 return NULL;
407}
408
3d4b47b4
PZ
409static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
410{
411}
412
413static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
414{
415}
416
b758149c
PZ
417#define for_each_leaf_cfs_rq(rq, cfs_rq) \
418 for (cfs_rq = &rq->cfs; cfs_rq; cfs_rq = NULL)
419
b758149c
PZ
420static inline struct sched_entity *parent_entity(struct sched_entity *se)
421{
422 return NULL;
423}
424
464b7527
PZ
425static inline void
426find_matching_se(struct sched_entity **se, struct sched_entity **pse)
427{
428}
429
b758149c
PZ
430#endif /* CONFIG_FAIR_GROUP_SCHED */
431
6c16a6dc 432static __always_inline
9dbdb155 433void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec);
bf0f6f24
IM
434
435/**************************************************************
436 * Scheduling class tree data structure manipulation methods:
437 */
438
1bf08230 439static inline u64 max_vruntime(u64 max_vruntime, u64 vruntime)
02e0431a 440{
1bf08230 441 s64 delta = (s64)(vruntime - max_vruntime);
368059a9 442 if (delta > 0)
1bf08230 443 max_vruntime = vruntime;
02e0431a 444
1bf08230 445 return max_vruntime;
02e0431a
PZ
446}
447
0702e3eb 448static inline u64 min_vruntime(u64 min_vruntime, u64 vruntime)
b0ffd246
PZ
449{
450 s64 delta = (s64)(vruntime - min_vruntime);
451 if (delta < 0)
452 min_vruntime = vruntime;
453
454 return min_vruntime;
455}
456
54fdc581
FC
457static inline int entity_before(struct sched_entity *a,
458 struct sched_entity *b)
459{
460 return (s64)(a->vruntime - b->vruntime) < 0;
461}
462
1af5f730
PZ
463static void update_min_vruntime(struct cfs_rq *cfs_rq)
464{
465 u64 vruntime = cfs_rq->min_vruntime;
466
467 if (cfs_rq->curr)
468 vruntime = cfs_rq->curr->vruntime;
469
470 if (cfs_rq->rb_leftmost) {
471 struct sched_entity *se = rb_entry(cfs_rq->rb_leftmost,
472 struct sched_entity,
473 run_node);
474
e17036da 475 if (!cfs_rq->curr)
1af5f730
PZ
476 vruntime = se->vruntime;
477 else
478 vruntime = min_vruntime(vruntime, se->vruntime);
479 }
480
1bf08230 481 /* ensure we never gain time by being placed backwards. */
1af5f730 482 cfs_rq->min_vruntime = max_vruntime(cfs_rq->min_vruntime, vruntime);
3fe1698b
PZ
483#ifndef CONFIG_64BIT
484 smp_wmb();
485 cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
486#endif
1af5f730
PZ
487}
488
bf0f6f24
IM
489/*
490 * Enqueue an entity into the rb-tree:
491 */
0702e3eb 492static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
bf0f6f24
IM
493{
494 struct rb_node **link = &cfs_rq->tasks_timeline.rb_node;
495 struct rb_node *parent = NULL;
496 struct sched_entity *entry;
bf0f6f24
IM
497 int leftmost = 1;
498
499 /*
500 * Find the right place in the rbtree:
501 */
502 while (*link) {
503 parent = *link;
504 entry = rb_entry(parent, struct sched_entity, run_node);
505 /*
506 * We dont care about collisions. Nodes with
507 * the same key stay together.
508 */
2bd2d6f2 509 if (entity_before(se, entry)) {
bf0f6f24
IM
510 link = &parent->rb_left;
511 } else {
512 link = &parent->rb_right;
513 leftmost = 0;
514 }
515 }
516
517 /*
518 * Maintain a cache of leftmost tree entries (it is frequently
519 * used):
520 */
1af5f730 521 if (leftmost)
57cb499d 522 cfs_rq->rb_leftmost = &se->run_node;
bf0f6f24
IM
523
524 rb_link_node(&se->run_node, parent, link);
525 rb_insert_color(&se->run_node, &cfs_rq->tasks_timeline);
bf0f6f24
IM
526}
527
0702e3eb 528static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
bf0f6f24 529{
3fe69747
PZ
530 if (cfs_rq->rb_leftmost == &se->run_node) {
531 struct rb_node *next_node;
3fe69747
PZ
532
533 next_node = rb_next(&se->run_node);
534 cfs_rq->rb_leftmost = next_node;
3fe69747 535 }
e9acbff6 536
bf0f6f24 537 rb_erase(&se->run_node, &cfs_rq->tasks_timeline);
bf0f6f24
IM
538}
539
029632fb 540struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq)
bf0f6f24 541{
f4b6755f
PZ
542 struct rb_node *left = cfs_rq->rb_leftmost;
543
544 if (!left)
545 return NULL;
546
547 return rb_entry(left, struct sched_entity, run_node);
bf0f6f24
IM
548}
549
ac53db59
RR
550static struct sched_entity *__pick_next_entity(struct sched_entity *se)
551{
552 struct rb_node *next = rb_next(&se->run_node);
553
554 if (!next)
555 return NULL;
556
557 return rb_entry(next, struct sched_entity, run_node);
558}
559
560#ifdef CONFIG_SCHED_DEBUG
029632fb 561struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq)
aeb73b04 562{
7eee3e67 563 struct rb_node *last = rb_last(&cfs_rq->tasks_timeline);
aeb73b04 564
70eee74b
BS
565 if (!last)
566 return NULL;
7eee3e67
IM
567
568 return rb_entry(last, struct sched_entity, run_node);
aeb73b04
PZ
569}
570
bf0f6f24
IM
571/**************************************************************
572 * Scheduling class statistics methods:
573 */
574
acb4a848 575int sched_proc_update_handler(struct ctl_table *table, int write,
8d65af78 576 void __user *buffer, size_t *lenp,
b2be5e96
PZ
577 loff_t *ppos)
578{
8d65af78 579 int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
58ac93e4 580 unsigned int factor = get_update_sysctl_factor();
b2be5e96
PZ
581
582 if (ret || !write)
583 return ret;
584
585 sched_nr_latency = DIV_ROUND_UP(sysctl_sched_latency,
586 sysctl_sched_min_granularity);
587
acb4a848
CE
588#define WRT_SYSCTL(name) \
589 (normalized_sysctl_##name = sysctl_##name / (factor))
590 WRT_SYSCTL(sched_min_granularity);
591 WRT_SYSCTL(sched_latency);
592 WRT_SYSCTL(sched_wakeup_granularity);
acb4a848
CE
593#undef WRT_SYSCTL
594
b2be5e96
PZ
595 return 0;
596}
597#endif
647e7cac 598
a7be37ac 599/*
f9c0b095 600 * delta /= w
a7be37ac 601 */
9dbdb155 602static inline u64 calc_delta_fair(u64 delta, struct sched_entity *se)
a7be37ac 603{
f9c0b095 604 if (unlikely(se->load.weight != NICE_0_LOAD))
9dbdb155 605 delta = __calc_delta(delta, NICE_0_LOAD, &se->load);
a7be37ac
PZ
606
607 return delta;
608}
609
647e7cac
IM
610/*
611 * The idea is to set a period in which each task runs once.
612 *
532b1858 613 * When there are too many tasks (sched_nr_latency) we have to stretch
647e7cac
IM
614 * this period because otherwise the slices get too small.
615 *
616 * p = (nr <= nl) ? l : l*nr/nl
617 */
4d78e7b6
PZ
618static u64 __sched_period(unsigned long nr_running)
619{
8e2b0bf3
BF
620 if (unlikely(nr_running > sched_nr_latency))
621 return nr_running * sysctl_sched_min_granularity;
622 else
623 return sysctl_sched_latency;
4d78e7b6
PZ
624}
625
647e7cac
IM
626/*
627 * We calculate the wall-time slice from the period by taking a part
628 * proportional to the weight.
629 *
f9c0b095 630 * s = p*P[w/rw]
647e7cac 631 */
6d0f0ebd 632static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se)
21805085 633{
0a582440 634 u64 slice = __sched_period(cfs_rq->nr_running + !se->on_rq);
f9c0b095 635
0a582440 636 for_each_sched_entity(se) {
6272d68c 637 struct load_weight *load;
3104bf03 638 struct load_weight lw;
6272d68c
LM
639
640 cfs_rq = cfs_rq_of(se);
641 load = &cfs_rq->load;
f9c0b095 642
0a582440 643 if (unlikely(!se->on_rq)) {
3104bf03 644 lw = cfs_rq->load;
0a582440
MG
645
646 update_load_add(&lw, se->load.weight);
647 load = &lw;
648 }
9dbdb155 649 slice = __calc_delta(slice, se->load.weight, load);
0a582440
MG
650 }
651 return slice;
bf0f6f24
IM
652}
653
647e7cac 654/*
660cc00f 655 * We calculate the vruntime slice of a to-be-inserted task.
647e7cac 656 *
f9c0b095 657 * vs = s/w
647e7cac 658 */
f9c0b095 659static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se)
67e9fb2a 660{
f9c0b095 661 return calc_delta_fair(sched_slice(cfs_rq, se), se);
a7be37ac
PZ
662}
663
a75cdaa9 664#ifdef CONFIG_SMP
772bd008 665static int select_idle_sibling(struct task_struct *p, int prev_cpu, int cpu);
fb13c7ee
MG
666static unsigned long task_h_load(struct task_struct *p);
667
9d89c257
YD
668/*
669 * We choose a half-life close to 1 scheduling period.
84fb5a18
LY
670 * Note: The tables runnable_avg_yN_inv and runnable_avg_yN_sum are
671 * dependent on this value.
9d89c257
YD
672 */
673#define LOAD_AVG_PERIOD 32
674#define LOAD_AVG_MAX 47742 /* maximum possible load avg */
84fb5a18 675#define LOAD_AVG_MAX_N 345 /* number of full periods to produce LOAD_AVG_MAX */
a75cdaa9 676
540247fb
YD
677/* Give new sched_entity start runnable values to heavy its load in infant time */
678void init_entity_runnable_average(struct sched_entity *se)
a75cdaa9 679{
540247fb 680 struct sched_avg *sa = &se->avg;
a75cdaa9 681
9d89c257
YD
682 sa->last_update_time = 0;
683 /*
684 * sched_avg's period_contrib should be strictly less then 1024, so
685 * we give it 1023 to make sure it is almost a period (1024us), and
686 * will definitely be update (after enqueue).
687 */
688 sa->period_contrib = 1023;
540247fb 689 sa->load_avg = scale_load_down(se->load.weight);
9d89c257 690 sa->load_sum = sa->load_avg * LOAD_AVG_MAX;
2b8c41da
YD
691 /*
692 * At this point, util_avg won't be used in select_task_rq_fair anyway
693 */
694 sa->util_avg = 0;
695 sa->util_sum = 0;
9d89c257 696 /* when this task enqueue'ed, it will contribute to its cfs_rq's load_avg */
a75cdaa9 697}
7ea241af 698
7dc603c9
PZ
699static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq);
700static int update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq);
3d30544f 701static void update_tg_load_avg(struct cfs_rq *cfs_rq, int force);
7dc603c9
PZ
702static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se);
703
2b8c41da
YD
704/*
705 * With new tasks being created, their initial util_avgs are extrapolated
706 * based on the cfs_rq's current util_avg:
707 *
708 * util_avg = cfs_rq->util_avg / (cfs_rq->load_avg + 1) * se.load.weight
709 *
710 * However, in many cases, the above util_avg does not give a desired
711 * value. Moreover, the sum of the util_avgs may be divergent, such
712 * as when the series is a harmonic series.
713 *
714 * To solve this problem, we also cap the util_avg of successive tasks to
715 * only 1/2 of the left utilization budget:
716 *
717 * util_avg_cap = (1024 - cfs_rq->avg.util_avg) / 2^n
718 *
719 * where n denotes the nth task.
720 *
721 * For example, a simplest series from the beginning would be like:
722 *
723 * task util_avg: 512, 256, 128, 64, 32, 16, 8, ...
724 * cfs_rq util_avg: 512, 768, 896, 960, 992, 1008, 1016, ...
725 *
726 * Finally, that extrapolated util_avg is clamped to the cap (util_avg_cap)
727 * if util_avg > util_avg_cap.
728 */
729void post_init_entity_util_avg(struct sched_entity *se)
730{
731 struct cfs_rq *cfs_rq = cfs_rq_of(se);
732 struct sched_avg *sa = &se->avg;
172895e6 733 long cap = (long)(SCHED_CAPACITY_SCALE - cfs_rq->avg.util_avg) / 2;
7dc603c9 734 u64 now = cfs_rq_clock_task(cfs_rq);
2b8c41da
YD
735
736 if (cap > 0) {
737 if (cfs_rq->avg.util_avg != 0) {
738 sa->util_avg = cfs_rq->avg.util_avg * se->load.weight;
739 sa->util_avg /= (cfs_rq->avg.load_avg + 1);
740
741 if (sa->util_avg > cap)
742 sa->util_avg = cap;
743 } else {
744 sa->util_avg = cap;
745 }
746 sa->util_sum = sa->util_avg * LOAD_AVG_MAX;
747 }
7dc603c9
PZ
748
749 if (entity_is_task(se)) {
750 struct task_struct *p = task_of(se);
751 if (p->sched_class != &fair_sched_class) {
752 /*
753 * For !fair tasks do:
754 *
755 update_cfs_rq_load_avg(now, cfs_rq, false);
756 attach_entity_load_avg(cfs_rq, se);
757 switched_from_fair(rq, p);
758 *
759 * such that the next switched_to_fair() has the
760 * expected state.
761 */
762 se->avg.last_update_time = now;
763 return;
764 }
765 }
766
7c3edd2c 767 update_cfs_rq_load_avg(now, cfs_rq, false);
7dc603c9 768 attach_entity_load_avg(cfs_rq, se);
7c3edd2c 769 update_tg_load_avg(cfs_rq, false);
2b8c41da
YD
770}
771
7dc603c9 772#else /* !CONFIG_SMP */
540247fb 773void init_entity_runnable_average(struct sched_entity *se)
a75cdaa9
AS
774{
775}
2b8c41da
YD
776void post_init_entity_util_avg(struct sched_entity *se)
777{
778}
3d30544f
PZ
779static void update_tg_load_avg(struct cfs_rq *cfs_rq, int force)
780{
781}
7dc603c9 782#endif /* CONFIG_SMP */
a75cdaa9 783
bf0f6f24 784/*
9dbdb155 785 * Update the current task's runtime statistics.
bf0f6f24 786 */
b7cc0896 787static void update_curr(struct cfs_rq *cfs_rq)
bf0f6f24 788{
429d43bc 789 struct sched_entity *curr = cfs_rq->curr;
78becc27 790 u64 now = rq_clock_task(rq_of(cfs_rq));
9dbdb155 791 u64 delta_exec;
bf0f6f24
IM
792
793 if (unlikely(!curr))
794 return;
795
9dbdb155
PZ
796 delta_exec = now - curr->exec_start;
797 if (unlikely((s64)delta_exec <= 0))
34f28ecd 798 return;
bf0f6f24 799
8ebc91d9 800 curr->exec_start = now;
d842de87 801
9dbdb155
PZ
802 schedstat_set(curr->statistics.exec_max,
803 max(delta_exec, curr->statistics.exec_max));
804
805 curr->sum_exec_runtime += delta_exec;
806 schedstat_add(cfs_rq, exec_clock, delta_exec);
807
808 curr->vruntime += calc_delta_fair(delta_exec, curr);
809 update_min_vruntime(cfs_rq);
810
d842de87
SV
811 if (entity_is_task(curr)) {
812 struct task_struct *curtask = task_of(curr);
813
f977bb49 814 trace_sched_stat_runtime(curtask, delta_exec, curr->vruntime);
d842de87 815 cpuacct_charge(curtask, delta_exec);
f06febc9 816 account_group_exec_runtime(curtask, delta_exec);
d842de87 817 }
ec12cb7f
PT
818
819 account_cfs_rq_runtime(cfs_rq, delta_exec);
bf0f6f24
IM
820}
821
6e998916
SG
822static void update_curr_fair(struct rq *rq)
823{
824 update_curr(cfs_rq_of(&rq->curr->se));
825}
826
3ea94de1 827#ifdef CONFIG_SCHEDSTATS
bf0f6f24 828static inline void
5870db5b 829update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
bf0f6f24 830{
3ea94de1
JP
831 u64 wait_start = rq_clock(rq_of(cfs_rq));
832
833 if (entity_is_task(se) && task_on_rq_migrating(task_of(se)) &&
834 likely(wait_start > se->statistics.wait_start))
835 wait_start -= se->statistics.wait_start;
836
837 se->statistics.wait_start = wait_start;
bf0f6f24
IM
838}
839
3ea94de1
JP
840static void
841update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
842{
843 struct task_struct *p;
cb251765
MG
844 u64 delta;
845
846 delta = rq_clock(rq_of(cfs_rq)) - se->statistics.wait_start;
3ea94de1
JP
847
848 if (entity_is_task(se)) {
849 p = task_of(se);
850 if (task_on_rq_migrating(p)) {
851 /*
852 * Preserve migrating task's wait time so wait_start
853 * time stamp can be adjusted to accumulate wait time
854 * prior to migration.
855 */
856 se->statistics.wait_start = delta;
857 return;
858 }
859 trace_sched_stat_wait(p, delta);
860 }
861
862 se->statistics.wait_max = max(se->statistics.wait_max, delta);
863 se->statistics.wait_count++;
864 se->statistics.wait_sum += delta;
865 se->statistics.wait_start = 0;
866}
3ea94de1 867
bf0f6f24
IM
868/*
869 * Task is being enqueued - update stats:
870 */
cb251765
MG
871static inline void
872update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
bf0f6f24 873{
bf0f6f24
IM
874 /*
875 * Are we enqueueing a waiting task? (for current tasks
876 * a dequeue/enqueue event is a NOP)
877 */
429d43bc 878 if (se != cfs_rq->curr)
5870db5b 879 update_stats_wait_start(cfs_rq, se);
bf0f6f24
IM
880}
881
bf0f6f24 882static inline void
cb251765 883update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
bf0f6f24 884{
bf0f6f24
IM
885 /*
886 * Mark the end of the wait period if dequeueing a
887 * waiting task:
888 */
429d43bc 889 if (se != cfs_rq->curr)
9ef0a961 890 update_stats_wait_end(cfs_rq, se);
cb251765
MG
891
892 if (flags & DEQUEUE_SLEEP) {
893 if (entity_is_task(se)) {
894 struct task_struct *tsk = task_of(se);
895
896 if (tsk->state & TASK_INTERRUPTIBLE)
897 se->statistics.sleep_start = rq_clock(rq_of(cfs_rq));
898 if (tsk->state & TASK_UNINTERRUPTIBLE)
899 se->statistics.block_start = rq_clock(rq_of(cfs_rq));
900 }
901 }
902
903}
904#else
905static inline void
906update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
907{
908}
909
910static inline void
911update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
912{
913}
914
915static inline void
916update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
917{
918}
919
920static inline void
921update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
922{
bf0f6f24 923}
cb251765 924#endif
bf0f6f24
IM
925
926/*
927 * We are picking a new current task - update its stats:
928 */
929static inline void
79303e9e 930update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
bf0f6f24
IM
931{
932 /*
933 * We are starting a new run period:
934 */
78becc27 935 se->exec_start = rq_clock_task(rq_of(cfs_rq));
bf0f6f24
IM
936}
937
bf0f6f24
IM
938/**************************************************
939 * Scheduling class queueing methods:
940 */
941
cbee9f88
PZ
942#ifdef CONFIG_NUMA_BALANCING
943/*
598f0ec0
MG
944 * Approximate time to scan a full NUMA task in ms. The task scan period is
945 * calculated based on the tasks virtual memory size and
946 * numa_balancing_scan_size.
cbee9f88 947 */
598f0ec0
MG
948unsigned int sysctl_numa_balancing_scan_period_min = 1000;
949unsigned int sysctl_numa_balancing_scan_period_max = 60000;
6e5fb223
PZ
950
951/* Portion of address space to scan in MB */
952unsigned int sysctl_numa_balancing_scan_size = 256;
cbee9f88 953
4b96a29b
PZ
954/* Scan @scan_size MB every @scan_period after an initial @scan_delay in ms */
955unsigned int sysctl_numa_balancing_scan_delay = 1000;
956
598f0ec0
MG
957static unsigned int task_nr_scan_windows(struct task_struct *p)
958{
959 unsigned long rss = 0;
960 unsigned long nr_scan_pages;
961
962 /*
963 * Calculations based on RSS as non-present and empty pages are skipped
964 * by the PTE scanner and NUMA hinting faults should be trapped based
965 * on resident pages
966 */
967 nr_scan_pages = sysctl_numa_balancing_scan_size << (20 - PAGE_SHIFT);
968 rss = get_mm_rss(p->mm);
969 if (!rss)
970 rss = nr_scan_pages;
971
972 rss = round_up(rss, nr_scan_pages);
973 return rss / nr_scan_pages;
974}
975
976/* For sanitys sake, never scan more PTEs than MAX_SCAN_WINDOW MB/sec. */
977#define MAX_SCAN_WINDOW 2560
978
979static unsigned int task_scan_min(struct task_struct *p)
980{
316c1608 981 unsigned int scan_size = READ_ONCE(sysctl_numa_balancing_scan_size);
598f0ec0
MG
982 unsigned int scan, floor;
983 unsigned int windows = 1;
984
64192658
KT
985 if (scan_size < MAX_SCAN_WINDOW)
986 windows = MAX_SCAN_WINDOW / scan_size;
598f0ec0
MG
987 floor = 1000 / windows;
988
989 scan = sysctl_numa_balancing_scan_period_min / task_nr_scan_windows(p);
990 return max_t(unsigned int, floor, scan);
991}
992
993static unsigned int task_scan_max(struct task_struct *p)
994{
995 unsigned int smin = task_scan_min(p);
996 unsigned int smax;
997
998 /* Watch for min being lower than max due to floor calculations */
999 smax = sysctl_numa_balancing_scan_period_max / task_nr_scan_windows(p);
1000 return max(smin, smax);
1001}
1002
0ec8aa00
PZ
1003static void account_numa_enqueue(struct rq *rq, struct task_struct *p)
1004{
1005 rq->nr_numa_running += (p->numa_preferred_nid != -1);
1006 rq->nr_preferred_running += (p->numa_preferred_nid == task_node(p));
1007}
1008
1009static void account_numa_dequeue(struct rq *rq, struct task_struct *p)
1010{
1011 rq->nr_numa_running -= (p->numa_preferred_nid != -1);
1012 rq->nr_preferred_running -= (p->numa_preferred_nid == task_node(p));
1013}
1014
8c8a743c
PZ
1015struct numa_group {
1016 atomic_t refcount;
1017
1018 spinlock_t lock; /* nr_tasks, tasks */
1019 int nr_tasks;
e29cf08b 1020 pid_t gid;
4142c3eb 1021 int active_nodes;
8c8a743c
PZ
1022
1023 struct rcu_head rcu;
989348b5 1024 unsigned long total_faults;
4142c3eb 1025 unsigned long max_faults_cpu;
7e2703e6
RR
1026 /*
1027 * Faults_cpu is used to decide whether memory should move
1028 * towards the CPU. As a consequence, these stats are weighted
1029 * more by CPU use than by memory faults.
1030 */
50ec8a40 1031 unsigned long *faults_cpu;
989348b5 1032 unsigned long faults[0];
8c8a743c
PZ
1033};
1034
be1e4e76
RR
1035/* Shared or private faults. */
1036#define NR_NUMA_HINT_FAULT_TYPES 2
1037
1038/* Memory and CPU locality */
1039#define NR_NUMA_HINT_FAULT_STATS (NR_NUMA_HINT_FAULT_TYPES * 2)
1040
1041/* Averaged statistics, and temporary buffers. */
1042#define NR_NUMA_HINT_FAULT_BUCKETS (NR_NUMA_HINT_FAULT_STATS * 2)
1043
e29cf08b
MG
1044pid_t task_numa_group_id(struct task_struct *p)
1045{
1046 return p->numa_group ? p->numa_group->gid : 0;
1047}
1048
44dba3d5
IM
1049/*
1050 * The averaged statistics, shared & private, memory & cpu,
1051 * occupy the first half of the array. The second half of the
1052 * array is for current counters, which are averaged into the
1053 * first set by task_numa_placement.
1054 */
1055static inline int task_faults_idx(enum numa_faults_stats s, int nid, int priv)
ac8e895b 1056{
44dba3d5 1057 return NR_NUMA_HINT_FAULT_TYPES * (s * nr_node_ids + nid) + priv;
ac8e895b
MG
1058}
1059
1060static inline unsigned long task_faults(struct task_struct *p, int nid)
1061{
44dba3d5 1062 if (!p->numa_faults)
ac8e895b
MG
1063 return 0;
1064
44dba3d5
IM
1065 return p->numa_faults[task_faults_idx(NUMA_MEM, nid, 0)] +
1066 p->numa_faults[task_faults_idx(NUMA_MEM, nid, 1)];
ac8e895b
MG
1067}
1068
83e1d2cd
MG
1069static inline unsigned long group_faults(struct task_struct *p, int nid)
1070{
1071 if (!p->numa_group)
1072 return 0;
1073
44dba3d5
IM
1074 return p->numa_group->faults[task_faults_idx(NUMA_MEM, nid, 0)] +
1075 p->numa_group->faults[task_faults_idx(NUMA_MEM, nid, 1)];
83e1d2cd
MG
1076}
1077
20e07dea
RR
1078static inline unsigned long group_faults_cpu(struct numa_group *group, int nid)
1079{
44dba3d5
IM
1080 return group->faults_cpu[task_faults_idx(NUMA_MEM, nid, 0)] +
1081 group->faults_cpu[task_faults_idx(NUMA_MEM, nid, 1)];
20e07dea
RR
1082}
1083
4142c3eb
RR
1084/*
1085 * A node triggering more than 1/3 as many NUMA faults as the maximum is
1086 * considered part of a numa group's pseudo-interleaving set. Migrations
1087 * between these nodes are slowed down, to allow things to settle down.
1088 */
1089#define ACTIVE_NODE_FRACTION 3
1090
1091static bool numa_is_active_node(int nid, struct numa_group *ng)
1092{
1093 return group_faults_cpu(ng, nid) * ACTIVE_NODE_FRACTION > ng->max_faults_cpu;
1094}
1095
6c6b1193
RR
1096/* Handle placement on systems where not all nodes are directly connected. */
1097static unsigned long score_nearby_nodes(struct task_struct *p, int nid,
1098 int maxdist, bool task)
1099{
1100 unsigned long score = 0;
1101 int node;
1102
1103 /*
1104 * All nodes are directly connected, and the same distance
1105 * from each other. No need for fancy placement algorithms.
1106 */
1107 if (sched_numa_topology_type == NUMA_DIRECT)
1108 return 0;
1109
1110 /*
1111 * This code is called for each node, introducing N^2 complexity,
1112 * which should be ok given the number of nodes rarely exceeds 8.
1113 */
1114 for_each_online_node(node) {
1115 unsigned long faults;
1116 int dist = node_distance(nid, node);
1117
1118 /*
1119 * The furthest away nodes in the system are not interesting
1120 * for placement; nid was already counted.
1121 */
1122 if (dist == sched_max_numa_distance || node == nid)
1123 continue;
1124
1125 /*
1126 * On systems with a backplane NUMA topology, compare groups
1127 * of nodes, and move tasks towards the group with the most
1128 * memory accesses. When comparing two nodes at distance
1129 * "hoplimit", only nodes closer by than "hoplimit" are part
1130 * of each group. Skip other nodes.
1131 */
1132 if (sched_numa_topology_type == NUMA_BACKPLANE &&
1133 dist > maxdist)
1134 continue;
1135
1136 /* Add up the faults from nearby nodes. */
1137 if (task)
1138 faults = task_faults(p, node);
1139 else
1140 faults = group_faults(p, node);
1141
1142 /*
1143 * On systems with a glueless mesh NUMA topology, there are
1144 * no fixed "groups of nodes". Instead, nodes that are not
1145 * directly connected bounce traffic through intermediate
1146 * nodes; a numa_group can occupy any set of nodes.
1147 * The further away a node is, the less the faults count.
1148 * This seems to result in good task placement.
1149 */
1150 if (sched_numa_topology_type == NUMA_GLUELESS_MESH) {
1151 faults *= (sched_max_numa_distance - dist);
1152 faults /= (sched_max_numa_distance - LOCAL_DISTANCE);
1153 }
1154
1155 score += faults;
1156 }
1157
1158 return score;
1159}
1160
83e1d2cd
MG
1161/*
1162 * These return the fraction of accesses done by a particular task, or
1163 * task group, on a particular numa node. The group weight is given a
1164 * larger multiplier, in order to group tasks together that are almost
1165 * evenly spread out between numa nodes.
1166 */
7bd95320
RR
1167static inline unsigned long task_weight(struct task_struct *p, int nid,
1168 int dist)
83e1d2cd 1169{
7bd95320 1170 unsigned long faults, total_faults;
83e1d2cd 1171
44dba3d5 1172 if (!p->numa_faults)
83e1d2cd
MG
1173 return 0;
1174
1175 total_faults = p->total_numa_faults;
1176
1177 if (!total_faults)
1178 return 0;
1179
7bd95320 1180 faults = task_faults(p, nid);
6c6b1193
RR
1181 faults += score_nearby_nodes(p, nid, dist, true);
1182
7bd95320 1183 return 1000 * faults / total_faults;
83e1d2cd
MG
1184}
1185
7bd95320
RR
1186static inline unsigned long group_weight(struct task_struct *p, int nid,
1187 int dist)
83e1d2cd 1188{
7bd95320
RR
1189 unsigned long faults, total_faults;
1190
1191 if (!p->numa_group)
1192 return 0;
1193
1194 total_faults = p->numa_group->total_faults;
1195
1196 if (!total_faults)
83e1d2cd
MG
1197 return 0;
1198
7bd95320 1199 faults = group_faults(p, nid);
6c6b1193
RR
1200 faults += score_nearby_nodes(p, nid, dist, false);
1201
7bd95320 1202 return 1000 * faults / total_faults;
83e1d2cd
MG
1203}
1204
10f39042
RR
1205bool should_numa_migrate_memory(struct task_struct *p, struct page * page,
1206 int src_nid, int dst_cpu)
1207{
1208 struct numa_group *ng = p->numa_group;
1209 int dst_nid = cpu_to_node(dst_cpu);
1210 int last_cpupid, this_cpupid;
1211
1212 this_cpupid = cpu_pid_to_cpupid(dst_cpu, current->pid);
1213
1214 /*
1215 * Multi-stage node selection is used in conjunction with a periodic
1216 * migration fault to build a temporal task<->page relation. By using
1217 * a two-stage filter we remove short/unlikely relations.
1218 *
1219 * Using P(p) ~ n_p / n_t as per frequentist probability, we can equate
1220 * a task's usage of a particular page (n_p) per total usage of this
1221 * page (n_t) (in a given time-span) to a probability.
1222 *
1223 * Our periodic faults will sample this probability and getting the
1224 * same result twice in a row, given these samples are fully
1225 * independent, is then given by P(n)^2, provided our sample period
1226 * is sufficiently short compared to the usage pattern.
1227 *
1228 * This quadric squishes small probabilities, making it less likely we
1229 * act on an unlikely task<->page relation.
1230 */
1231 last_cpupid = page_cpupid_xchg_last(page, this_cpupid);
1232 if (!cpupid_pid_unset(last_cpupid) &&
1233 cpupid_to_nid(last_cpupid) != dst_nid)
1234 return false;
1235
1236 /* Always allow migrate on private faults */
1237 if (cpupid_match_pid(p, last_cpupid))
1238 return true;
1239
1240 /* A shared fault, but p->numa_group has not been set up yet. */
1241 if (!ng)
1242 return true;
1243
1244 /*
4142c3eb
RR
1245 * Destination node is much more heavily used than the source
1246 * node? Allow migration.
10f39042 1247 */
4142c3eb
RR
1248 if (group_faults_cpu(ng, dst_nid) > group_faults_cpu(ng, src_nid) *
1249 ACTIVE_NODE_FRACTION)
10f39042
RR
1250 return true;
1251
1252 /*
4142c3eb
RR
1253 * Distribute memory according to CPU & memory use on each node,
1254 * with 3/4 hysteresis to avoid unnecessary memory migrations:
1255 *
1256 * faults_cpu(dst) 3 faults_cpu(src)
1257 * --------------- * - > ---------------
1258 * faults_mem(dst) 4 faults_mem(src)
10f39042 1259 */
4142c3eb
RR
1260 return group_faults_cpu(ng, dst_nid) * group_faults(p, src_nid) * 3 >
1261 group_faults_cpu(ng, src_nid) * group_faults(p, dst_nid) * 4;
10f39042
RR
1262}
1263
e6628d5b 1264static unsigned long weighted_cpuload(const int cpu);
58d081b5
MG
1265static unsigned long source_load(int cpu, int type);
1266static unsigned long target_load(int cpu, int type);
ced549fa 1267static unsigned long capacity_of(int cpu);
58d081b5
MG
1268static long effective_load(struct task_group *tg, int cpu, long wl, long wg);
1269
fb13c7ee 1270/* Cached statistics for all CPUs within a node */
58d081b5 1271struct numa_stats {
fb13c7ee 1272 unsigned long nr_running;
58d081b5 1273 unsigned long load;
fb13c7ee
MG
1274
1275 /* Total compute capacity of CPUs on a node */
5ef20ca1 1276 unsigned long compute_capacity;
fb13c7ee
MG
1277
1278 /* Approximate capacity in terms of runnable tasks on a node */
5ef20ca1 1279 unsigned long task_capacity;
1b6a7495 1280 int has_free_capacity;
58d081b5 1281};
e6628d5b 1282
fb13c7ee
MG
1283/*
1284 * XXX borrowed from update_sg_lb_stats
1285 */
1286static void update_numa_stats(struct numa_stats *ns, int nid)
1287{
83d7f242
RR
1288 int smt, cpu, cpus = 0;
1289 unsigned long capacity;
fb13c7ee
MG
1290
1291 memset(ns, 0, sizeof(*ns));
1292 for_each_cpu(cpu, cpumask_of_node(nid)) {
1293 struct rq *rq = cpu_rq(cpu);
1294
1295 ns->nr_running += rq->nr_running;
1296 ns->load += weighted_cpuload(cpu);
ced549fa 1297 ns->compute_capacity += capacity_of(cpu);
5eca82a9
PZ
1298
1299 cpus++;
fb13c7ee
MG
1300 }
1301
5eca82a9
PZ
1302 /*
1303 * If we raced with hotplug and there are no CPUs left in our mask
1304 * the @ns structure is NULL'ed and task_numa_compare() will
1305 * not find this node attractive.
1306 *
1b6a7495
NP
1307 * We'll either bail at !has_free_capacity, or we'll detect a huge
1308 * imbalance and bail there.
5eca82a9
PZ
1309 */
1310 if (!cpus)
1311 return;
1312
83d7f242
RR
1313 /* smt := ceil(cpus / capacity), assumes: 1 < smt_power < 2 */
1314 smt = DIV_ROUND_UP(SCHED_CAPACITY_SCALE * cpus, ns->compute_capacity);
1315 capacity = cpus / smt; /* cores */
1316
1317 ns->task_capacity = min_t(unsigned, capacity,
1318 DIV_ROUND_CLOSEST(ns->compute_capacity, SCHED_CAPACITY_SCALE));
1b6a7495 1319 ns->has_free_capacity = (ns->nr_running < ns->task_capacity);
fb13c7ee
MG
1320}
1321
58d081b5
MG
1322struct task_numa_env {
1323 struct task_struct *p;
e6628d5b 1324
58d081b5
MG
1325 int src_cpu, src_nid;
1326 int dst_cpu, dst_nid;
e6628d5b 1327
58d081b5 1328 struct numa_stats src_stats, dst_stats;
e6628d5b 1329
40ea2b42 1330 int imbalance_pct;
7bd95320 1331 int dist;
fb13c7ee
MG
1332
1333 struct task_struct *best_task;
1334 long best_imp;
58d081b5
MG
1335 int best_cpu;
1336};
1337
fb13c7ee
MG
1338static void task_numa_assign(struct task_numa_env *env,
1339 struct task_struct *p, long imp)
1340{
1341 if (env->best_task)
1342 put_task_struct(env->best_task);
bac78573
ON
1343 if (p)
1344 get_task_struct(p);
fb13c7ee
MG
1345
1346 env->best_task = p;
1347 env->best_imp = imp;
1348 env->best_cpu = env->dst_cpu;
1349}
1350
28a21745 1351static bool load_too_imbalanced(long src_load, long dst_load,
e63da036
RR
1352 struct task_numa_env *env)
1353{
e4991b24
RR
1354 long imb, old_imb;
1355 long orig_src_load, orig_dst_load;
28a21745
RR
1356 long src_capacity, dst_capacity;
1357
1358 /*
1359 * The load is corrected for the CPU capacity available on each node.
1360 *
1361 * src_load dst_load
1362 * ------------ vs ---------
1363 * src_capacity dst_capacity
1364 */
1365 src_capacity = env->src_stats.compute_capacity;
1366 dst_capacity = env->dst_stats.compute_capacity;
e63da036
RR
1367
1368 /* We care about the slope of the imbalance, not the direction. */
e4991b24
RR
1369 if (dst_load < src_load)
1370 swap(dst_load, src_load);
e63da036
RR
1371
1372 /* Is the difference below the threshold? */
e4991b24
RR
1373 imb = dst_load * src_capacity * 100 -
1374 src_load * dst_capacity * env->imbalance_pct;
e63da036
RR
1375 if (imb <= 0)
1376 return false;
1377
1378 /*
1379 * The imbalance is above the allowed threshold.
e4991b24 1380 * Compare it with the old imbalance.
e63da036 1381 */
28a21745 1382 orig_src_load = env->src_stats.load;
e4991b24 1383 orig_dst_load = env->dst_stats.load;
28a21745 1384
e4991b24
RR
1385 if (orig_dst_load < orig_src_load)
1386 swap(orig_dst_load, orig_src_load);
e63da036 1387
e4991b24
RR
1388 old_imb = orig_dst_load * src_capacity * 100 -
1389 orig_src_load * dst_capacity * env->imbalance_pct;
1390
1391 /* Would this change make things worse? */
1392 return (imb > old_imb);
e63da036
RR
1393}
1394
fb13c7ee
MG
1395/*
1396 * This checks if the overall compute and NUMA accesses of the system would
1397 * be improved if the source tasks was migrated to the target dst_cpu taking
1398 * into account that it might be best if task running on the dst_cpu should
1399 * be exchanged with the source task
1400 */
887c290e
RR
1401static void task_numa_compare(struct task_numa_env *env,
1402 long taskimp, long groupimp)
fb13c7ee
MG
1403{
1404 struct rq *src_rq = cpu_rq(env->src_cpu);
1405 struct rq *dst_rq = cpu_rq(env->dst_cpu);
1406 struct task_struct *cur;
28a21745 1407 long src_load, dst_load;
fb13c7ee 1408 long load;
1c5d3eb3 1409 long imp = env->p->numa_group ? groupimp : taskimp;
0132c3e1 1410 long moveimp = imp;
7bd95320 1411 int dist = env->dist;
fb13c7ee
MG
1412
1413 rcu_read_lock();
bac78573
ON
1414 cur = task_rcu_dereference(&dst_rq->curr);
1415 if (cur && ((cur->flags & PF_EXITING) || is_idle_task(cur)))
fb13c7ee
MG
1416 cur = NULL;
1417
7af68335
PZ
1418 /*
1419 * Because we have preemption enabled we can get migrated around and
1420 * end try selecting ourselves (current == env->p) as a swap candidate.
1421 */
1422 if (cur == env->p)
1423 goto unlock;
1424
fb13c7ee
MG
1425 /*
1426 * "imp" is the fault differential for the source task between the
1427 * source and destination node. Calculate the total differential for
1428 * the source task and potential destination task. The more negative
1429 * the value is, the more rmeote accesses that would be expected to
1430 * be incurred if the tasks were swapped.
1431 */
1432 if (cur) {
1433 /* Skip this swap candidate if cannot move to the source cpu */
1434 if (!cpumask_test_cpu(env->src_cpu, tsk_cpus_allowed(cur)))
1435 goto unlock;
1436
887c290e
RR
1437 /*
1438 * If dst and source tasks are in the same NUMA group, or not
ca28aa53 1439 * in any group then look only at task weights.
887c290e 1440 */
ca28aa53 1441 if (cur->numa_group == env->p->numa_group) {
7bd95320
RR
1442 imp = taskimp + task_weight(cur, env->src_nid, dist) -
1443 task_weight(cur, env->dst_nid, dist);
ca28aa53
RR
1444 /*
1445 * Add some hysteresis to prevent swapping the
1446 * tasks within a group over tiny differences.
1447 */
1448 if (cur->numa_group)
1449 imp -= imp/16;
887c290e 1450 } else {
ca28aa53
RR
1451 /*
1452 * Compare the group weights. If a task is all by
1453 * itself (not part of a group), use the task weight
1454 * instead.
1455 */
ca28aa53 1456 if (cur->numa_group)
7bd95320
RR
1457 imp += group_weight(cur, env->src_nid, dist) -
1458 group_weight(cur, env->dst_nid, dist);
ca28aa53 1459 else
7bd95320
RR
1460 imp += task_weight(cur, env->src_nid, dist) -
1461 task_weight(cur, env->dst_nid, dist);
887c290e 1462 }
fb13c7ee
MG
1463 }
1464
0132c3e1 1465 if (imp <= env->best_imp && moveimp <= env->best_imp)
fb13c7ee
MG
1466 goto unlock;
1467
1468 if (!cur) {
1469 /* Is there capacity at our destination? */
b932c03c 1470 if (env->src_stats.nr_running <= env->src_stats.task_capacity &&
1b6a7495 1471 !env->dst_stats.has_free_capacity)
fb13c7ee
MG
1472 goto unlock;
1473
1474 goto balance;
1475 }
1476
1477 /* Balance doesn't matter much if we're running a task per cpu */
0132c3e1
RR
1478 if (imp > env->best_imp && src_rq->nr_running == 1 &&
1479 dst_rq->nr_running == 1)
fb13c7ee
MG
1480 goto assign;
1481
1482 /*
1483 * In the overloaded case, try and keep the load balanced.
1484 */
1485balance:
e720fff6
PZ
1486 load = task_h_load(env->p);
1487 dst_load = env->dst_stats.load + load;
1488 src_load = env->src_stats.load - load;
fb13c7ee 1489
0132c3e1
RR
1490 if (moveimp > imp && moveimp > env->best_imp) {
1491 /*
1492 * If the improvement from just moving env->p direction is
1493 * better than swapping tasks around, check if a move is
1494 * possible. Store a slightly smaller score than moveimp,
1495 * so an actually idle CPU will win.
1496 */
1497 if (!load_too_imbalanced(src_load, dst_load, env)) {
1498 imp = moveimp - 1;
1499 cur = NULL;
1500 goto assign;
1501 }
1502 }
1503
1504 if (imp <= env->best_imp)
1505 goto unlock;
1506
fb13c7ee 1507 if (cur) {
e720fff6
PZ
1508 load = task_h_load(cur);
1509 dst_load -= load;
1510 src_load += load;
fb13c7ee
MG
1511 }
1512
28a21745 1513 if (load_too_imbalanced(src_load, dst_load, env))
fb13c7ee
MG
1514 goto unlock;
1515
ba7e5a27
RR
1516 /*
1517 * One idle CPU per node is evaluated for a task numa move.
1518 * Call select_idle_sibling to maybe find a better one.
1519 */
1520 if (!cur)
772bd008
MR
1521 env->dst_cpu = select_idle_sibling(env->p, env->src_cpu,
1522 env->dst_cpu);
ba7e5a27 1523
fb13c7ee
MG
1524assign:
1525 task_numa_assign(env, cur, imp);
1526unlock:
1527 rcu_read_unlock();
1528}
1529
887c290e
RR
1530static void task_numa_find_cpu(struct task_numa_env *env,
1531 long taskimp, long groupimp)
2c8a50aa
MG
1532{
1533 int cpu;
1534
1535 for_each_cpu(cpu, cpumask_of_node(env->dst_nid)) {
1536 /* Skip this CPU if the source task cannot migrate */
1537 if (!cpumask_test_cpu(cpu, tsk_cpus_allowed(env->p)))
1538 continue;
1539
1540 env->dst_cpu = cpu;
887c290e 1541 task_numa_compare(env, taskimp, groupimp);
2c8a50aa
MG
1542 }
1543}
1544
6f9aad0b
RR
1545/* Only move tasks to a NUMA node less busy than the current node. */
1546static bool numa_has_capacity(struct task_numa_env *env)
1547{
1548 struct numa_stats *src = &env->src_stats;
1549 struct numa_stats *dst = &env->dst_stats;
1550
1551 if (src->has_free_capacity && !dst->has_free_capacity)
1552 return false;
1553
1554 /*
1555 * Only consider a task move if the source has a higher load
1556 * than the destination, corrected for CPU capacity on each node.
1557 *
1558 * src->load dst->load
1559 * --------------------- vs ---------------------
1560 * src->compute_capacity dst->compute_capacity
1561 */
44dcb04f
SD
1562 if (src->load * dst->compute_capacity * env->imbalance_pct >
1563
1564 dst->load * src->compute_capacity * 100)
6f9aad0b
RR
1565 return true;
1566
1567 return false;
1568}
1569
58d081b5
MG
1570static int task_numa_migrate(struct task_struct *p)
1571{
58d081b5
MG
1572 struct task_numa_env env = {
1573 .p = p,
fb13c7ee 1574
58d081b5 1575 .src_cpu = task_cpu(p),
b32e86b4 1576 .src_nid = task_node(p),
fb13c7ee
MG
1577
1578 .imbalance_pct = 112,
1579
1580 .best_task = NULL,
1581 .best_imp = 0,
4142c3eb 1582 .best_cpu = -1,
58d081b5
MG
1583 };
1584 struct sched_domain *sd;
887c290e 1585 unsigned long taskweight, groupweight;
7bd95320 1586 int nid, ret, dist;
887c290e 1587 long taskimp, groupimp;
e6628d5b 1588
58d081b5 1589 /*
fb13c7ee
MG
1590 * Pick the lowest SD_NUMA domain, as that would have the smallest
1591 * imbalance and would be the first to start moving tasks about.
1592 *
1593 * And we want to avoid any moving of tasks about, as that would create
1594 * random movement of tasks -- counter the numa conditions we're trying
1595 * to satisfy here.
58d081b5
MG
1596 */
1597 rcu_read_lock();
fb13c7ee 1598 sd = rcu_dereference(per_cpu(sd_numa, env.src_cpu));
46a73e8a
RR
1599 if (sd)
1600 env.imbalance_pct = 100 + (sd->imbalance_pct - 100) / 2;
e6628d5b
MG
1601 rcu_read_unlock();
1602
46a73e8a
RR
1603 /*
1604 * Cpusets can break the scheduler domain tree into smaller
1605 * balance domains, some of which do not cross NUMA boundaries.
1606 * Tasks that are "trapped" in such domains cannot be migrated
1607 * elsewhere, so there is no point in (re)trying.
1608 */
1609 if (unlikely(!sd)) {
de1b301a 1610 p->numa_preferred_nid = task_node(p);
46a73e8a
RR
1611 return -EINVAL;
1612 }
1613
2c8a50aa 1614 env.dst_nid = p->numa_preferred_nid;
7bd95320
RR
1615 dist = env.dist = node_distance(env.src_nid, env.dst_nid);
1616 taskweight = task_weight(p, env.src_nid, dist);
1617 groupweight = group_weight(p, env.src_nid, dist);
1618 update_numa_stats(&env.src_stats, env.src_nid);
1619 taskimp = task_weight(p, env.dst_nid, dist) - taskweight;
1620 groupimp = group_weight(p, env.dst_nid, dist) - groupweight;
2c8a50aa 1621 update_numa_stats(&env.dst_stats, env.dst_nid);
58d081b5 1622
a43455a1 1623 /* Try to find a spot on the preferred nid. */
6f9aad0b
RR
1624 if (numa_has_capacity(&env))
1625 task_numa_find_cpu(&env, taskimp, groupimp);
e1dda8a7 1626
9de05d48
RR
1627 /*
1628 * Look at other nodes in these cases:
1629 * - there is no space available on the preferred_nid
1630 * - the task is part of a numa_group that is interleaved across
1631 * multiple NUMA nodes; in order to better consolidate the group,
1632 * we need to check other locations.
1633 */
4142c3eb 1634 if (env.best_cpu == -1 || (p->numa_group && p->numa_group->active_nodes > 1)) {
2c8a50aa
MG
1635 for_each_online_node(nid) {
1636 if (nid == env.src_nid || nid == p->numa_preferred_nid)
1637 continue;
58d081b5 1638
7bd95320 1639 dist = node_distance(env.src_nid, env.dst_nid);
6c6b1193
RR
1640 if (sched_numa_topology_type == NUMA_BACKPLANE &&
1641 dist != env.dist) {
1642 taskweight = task_weight(p, env.src_nid, dist);
1643 groupweight = group_weight(p, env.src_nid, dist);
1644 }
7bd95320 1645
83e1d2cd 1646 /* Only consider nodes where both task and groups benefit */
7bd95320
RR
1647 taskimp = task_weight(p, nid, dist) - taskweight;
1648 groupimp = group_weight(p, nid, dist) - groupweight;
887c290e 1649 if (taskimp < 0 && groupimp < 0)
fb13c7ee
MG
1650 continue;
1651
7bd95320 1652 env.dist = dist;
2c8a50aa
MG
1653 env.dst_nid = nid;
1654 update_numa_stats(&env.dst_stats, env.dst_nid);
6f9aad0b
RR
1655 if (numa_has_capacity(&env))
1656 task_numa_find_cpu(&env, taskimp, groupimp);
58d081b5
MG
1657 }
1658 }
1659
68d1b02a
RR
1660 /*
1661 * If the task is part of a workload that spans multiple NUMA nodes,
1662 * and is migrating into one of the workload's active nodes, remember
1663 * this node as the task's preferred numa node, so the workload can
1664 * settle down.
1665 * A task that migrated to a second choice node will be better off
1666 * trying for a better one later. Do not set the preferred node here.
1667 */
db015dae 1668 if (p->numa_group) {
4142c3eb
RR
1669 struct numa_group *ng = p->numa_group;
1670
db015dae
RR
1671 if (env.best_cpu == -1)
1672 nid = env.src_nid;
1673 else
1674 nid = env.dst_nid;
1675
4142c3eb 1676 if (ng->active_nodes > 1 && numa_is_active_node(env.dst_nid, ng))
db015dae
RR
1677 sched_setnuma(p, env.dst_nid);
1678 }
1679
1680 /* No better CPU than the current one was found. */
1681 if (env.best_cpu == -1)
1682 return -EAGAIN;
0ec8aa00 1683
04bb2f94
RR
1684 /*
1685 * Reset the scan period if the task is being rescheduled on an
1686 * alternative node to recheck if the tasks is now properly placed.
1687 */
1688 p->numa_scan_period = task_scan_min(p);
1689
fb13c7ee 1690 if (env.best_task == NULL) {
286549dc
MG
1691 ret = migrate_task_to(p, env.best_cpu);
1692 if (ret != 0)
1693 trace_sched_stick_numa(p, env.src_cpu, env.best_cpu);
fb13c7ee
MG
1694 return ret;
1695 }
1696
1697 ret = migrate_swap(p, env.best_task);
286549dc
MG
1698 if (ret != 0)
1699 trace_sched_stick_numa(p, env.src_cpu, task_cpu(env.best_task));
fb13c7ee
MG
1700 put_task_struct(env.best_task);
1701 return ret;
e6628d5b
MG
1702}
1703
6b9a7460
MG
1704/* Attempt to migrate a task to a CPU on the preferred node. */
1705static void numa_migrate_preferred(struct task_struct *p)
1706{
5085e2a3
RR
1707 unsigned long interval = HZ;
1708
2739d3ee 1709 /* This task has no NUMA fault statistics yet */
44dba3d5 1710 if (unlikely(p->numa_preferred_nid == -1 || !p->numa_faults))
6b9a7460
MG
1711 return;
1712
2739d3ee 1713 /* Periodically retry migrating the task to the preferred node */
5085e2a3
RR
1714 interval = min(interval, msecs_to_jiffies(p->numa_scan_period) / 16);
1715 p->numa_migrate_retry = jiffies + interval;
2739d3ee
RR
1716
1717 /* Success if task is already running on preferred CPU */
de1b301a 1718 if (task_node(p) == p->numa_preferred_nid)
6b9a7460
MG
1719 return;
1720
1721 /* Otherwise, try migrate to a CPU on the preferred node */
2739d3ee 1722 task_numa_migrate(p);
6b9a7460
MG
1723}
1724
20e07dea 1725/*
4142c3eb 1726 * Find out how many nodes on the workload is actively running on. Do this by
20e07dea
RR
1727 * tracking the nodes from which NUMA hinting faults are triggered. This can
1728 * be different from the set of nodes where the workload's memory is currently
1729 * located.
20e07dea 1730 */
4142c3eb 1731static void numa_group_count_active_nodes(struct numa_group *numa_group)
20e07dea
RR
1732{
1733 unsigned long faults, max_faults = 0;
4142c3eb 1734 int nid, active_nodes = 0;
20e07dea
RR
1735
1736 for_each_online_node(nid) {
1737 faults = group_faults_cpu(numa_group, nid);
1738 if (faults > max_faults)
1739 max_faults = faults;
1740 }
1741
1742 for_each_online_node(nid) {
1743 faults = group_faults_cpu(numa_group, nid);
4142c3eb
RR
1744 if (faults * ACTIVE_NODE_FRACTION > max_faults)
1745 active_nodes++;
20e07dea 1746 }
4142c3eb
RR
1747
1748 numa_group->max_faults_cpu = max_faults;
1749 numa_group->active_nodes = active_nodes;
20e07dea
RR
1750}
1751
04bb2f94
RR
1752/*
1753 * When adapting the scan rate, the period is divided into NUMA_PERIOD_SLOTS
1754 * increments. The more local the fault statistics are, the higher the scan
a22b4b01
RR
1755 * period will be for the next scan window. If local/(local+remote) ratio is
1756 * below NUMA_PERIOD_THRESHOLD (where range of ratio is 1..NUMA_PERIOD_SLOTS)
1757 * the scan period will decrease. Aim for 70% local accesses.
04bb2f94
RR
1758 */
1759#define NUMA_PERIOD_SLOTS 10
a22b4b01 1760#define NUMA_PERIOD_THRESHOLD 7
04bb2f94
RR
1761
1762/*
1763 * Increase the scan period (slow down scanning) if the majority of
1764 * our memory is already on our local node, or if the majority of
1765 * the page accesses are shared with other processes.
1766 * Otherwise, decrease the scan period.
1767 */
1768static void update_task_scan_period(struct task_struct *p,
1769 unsigned long shared, unsigned long private)
1770{
1771 unsigned int period_slot;
1772 int ratio;
1773 int diff;
1774
1775 unsigned long remote = p->numa_faults_locality[0];
1776 unsigned long local = p->numa_faults_locality[1];
1777
1778 /*
1779 * If there were no record hinting faults then either the task is
1780 * completely idle or all activity is areas that are not of interest
074c2381
MG
1781 * to automatic numa balancing. Related to that, if there were failed
1782 * migration then it implies we are migrating too quickly or the local
1783 * node is overloaded. In either case, scan slower
04bb2f94 1784 */
074c2381 1785 if (local + shared == 0 || p->numa_faults_locality[2]) {
04bb2f94
RR
1786 p->numa_scan_period = min(p->numa_scan_period_max,
1787 p->numa_scan_period << 1);
1788
1789 p->mm->numa_next_scan = jiffies +
1790 msecs_to_jiffies(p->numa_scan_period);
1791
1792 return;
1793 }
1794
1795 /*
1796 * Prepare to scale scan period relative to the current period.
1797 * == NUMA_PERIOD_THRESHOLD scan period stays the same
1798 * < NUMA_PERIOD_THRESHOLD scan period decreases (scan faster)
1799 * >= NUMA_PERIOD_THRESHOLD scan period increases (scan slower)
1800 */
1801 period_slot = DIV_ROUND_UP(p->numa_scan_period, NUMA_PERIOD_SLOTS);
1802 ratio = (local * NUMA_PERIOD_SLOTS) / (local + remote);
1803 if (ratio >= NUMA_PERIOD_THRESHOLD) {
1804 int slot = ratio - NUMA_PERIOD_THRESHOLD;
1805 if (!slot)
1806 slot = 1;
1807 diff = slot * period_slot;
1808 } else {
1809 diff = -(NUMA_PERIOD_THRESHOLD - ratio) * period_slot;
1810
1811 /*
1812 * Scale scan rate increases based on sharing. There is an
1813 * inverse relationship between the degree of sharing and
1814 * the adjustment made to the scanning period. Broadly
1815 * speaking the intent is that there is little point
1816 * scanning faster if shared accesses dominate as it may
1817 * simply bounce migrations uselessly
1818 */
2847c90e 1819 ratio = DIV_ROUND_UP(private * NUMA_PERIOD_SLOTS, (private + shared + 1));
04bb2f94
RR
1820 diff = (diff * ratio) / NUMA_PERIOD_SLOTS;
1821 }
1822
1823 p->numa_scan_period = clamp(p->numa_scan_period + diff,
1824 task_scan_min(p), task_scan_max(p));
1825 memset(p->numa_faults_locality, 0, sizeof(p->numa_faults_locality));
1826}
1827
7e2703e6
RR
1828/*
1829 * Get the fraction of time the task has been running since the last
1830 * NUMA placement cycle. The scheduler keeps similar statistics, but
1831 * decays those on a 32ms period, which is orders of magnitude off
1832 * from the dozens-of-seconds NUMA balancing period. Use the scheduler
1833 * stats only if the task is so new there are no NUMA statistics yet.
1834 */
1835static u64 numa_get_avg_runtime(struct task_struct *p, u64 *period)
1836{
1837 u64 runtime, delta, now;
1838 /* Use the start of this time slice to avoid calculations. */
1839 now = p->se.exec_start;
1840 runtime = p->se.sum_exec_runtime;
1841
1842 if (p->last_task_numa_placement) {
1843 delta = runtime - p->last_sum_exec_runtime;
1844 *period = now - p->last_task_numa_placement;
1845 } else {
9d89c257
YD
1846 delta = p->se.avg.load_sum / p->se.load.weight;
1847 *period = LOAD_AVG_MAX;
7e2703e6
RR
1848 }
1849
1850 p->last_sum_exec_runtime = runtime;
1851 p->last_task_numa_placement = now;
1852
1853 return delta;
1854}
1855
54009416
RR
1856/*
1857 * Determine the preferred nid for a task in a numa_group. This needs to
1858 * be done in a way that produces consistent results with group_weight,
1859 * otherwise workloads might not converge.
1860 */
1861static int preferred_group_nid(struct task_struct *p, int nid)
1862{
1863 nodemask_t nodes;
1864 int dist;
1865
1866 /* Direct connections between all NUMA nodes. */
1867 if (sched_numa_topology_type == NUMA_DIRECT)
1868 return nid;
1869
1870 /*
1871 * On a system with glueless mesh NUMA topology, group_weight
1872 * scores nodes according to the number of NUMA hinting faults on
1873 * both the node itself, and on nearby nodes.
1874 */
1875 if (sched_numa_topology_type == NUMA_GLUELESS_MESH) {
1876 unsigned long score, max_score = 0;
1877 int node, max_node = nid;
1878
1879 dist = sched_max_numa_distance;
1880
1881 for_each_online_node(node) {
1882 score = group_weight(p, node, dist);
1883 if (score > max_score) {
1884 max_score = score;
1885 max_node = node;
1886 }
1887 }
1888 return max_node;
1889 }
1890
1891 /*
1892 * Finding the preferred nid in a system with NUMA backplane
1893 * interconnect topology is more involved. The goal is to locate
1894 * tasks from numa_groups near each other in the system, and
1895 * untangle workloads from different sides of the system. This requires
1896 * searching down the hierarchy of node groups, recursively searching
1897 * inside the highest scoring group of nodes. The nodemask tricks
1898 * keep the complexity of the search down.
1899 */
1900 nodes = node_online_map;
1901 for (dist = sched_max_numa_distance; dist > LOCAL_DISTANCE; dist--) {
1902 unsigned long max_faults = 0;
81907478 1903 nodemask_t max_group = NODE_MASK_NONE;
54009416
RR
1904 int a, b;
1905
1906 /* Are there nodes at this distance from each other? */
1907 if (!find_numa_distance(dist))
1908 continue;
1909
1910 for_each_node_mask(a, nodes) {
1911 unsigned long faults = 0;
1912 nodemask_t this_group;
1913 nodes_clear(this_group);
1914
1915 /* Sum group's NUMA faults; includes a==b case. */
1916 for_each_node_mask(b, nodes) {
1917 if (node_distance(a, b) < dist) {
1918 faults += group_faults(p, b);
1919 node_set(b, this_group);
1920 node_clear(b, nodes);
1921 }
1922 }
1923
1924 /* Remember the top group. */
1925 if (faults > max_faults) {
1926 max_faults = faults;
1927 max_group = this_group;
1928 /*
1929 * subtle: at the smallest distance there is
1930 * just one node left in each "group", the
1931 * winner is the preferred nid.
1932 */
1933 nid = a;
1934 }
1935 }
1936 /* Next round, evaluate the nodes within max_group. */
890a5409
JB
1937 if (!max_faults)
1938 break;
54009416
RR
1939 nodes = max_group;
1940 }
1941 return nid;
1942}
1943
cbee9f88
PZ
1944static void task_numa_placement(struct task_struct *p)
1945{
83e1d2cd
MG
1946 int seq, nid, max_nid = -1, max_group_nid = -1;
1947 unsigned long max_faults = 0, max_group_faults = 0;
04bb2f94 1948 unsigned long fault_types[2] = { 0, 0 };
7e2703e6
RR
1949 unsigned long total_faults;
1950 u64 runtime, period;
7dbd13ed 1951 spinlock_t *group_lock = NULL;
cbee9f88 1952
7e5a2c17
JL
1953 /*
1954 * The p->mm->numa_scan_seq field gets updated without
1955 * exclusive access. Use READ_ONCE() here to ensure
1956 * that the field is read in a single access:
1957 */
316c1608 1958 seq = READ_ONCE(p->mm->numa_scan_seq);
cbee9f88
PZ
1959 if (p->numa_scan_seq == seq)
1960 return;
1961 p->numa_scan_seq = seq;
598f0ec0 1962 p->numa_scan_period_max = task_scan_max(p);
cbee9f88 1963
7e2703e6
RR
1964 total_faults = p->numa_faults_locality[0] +
1965 p->numa_faults_locality[1];
1966 runtime = numa_get_avg_runtime(p, &period);
1967
7dbd13ed
MG
1968 /* If the task is part of a group prevent parallel updates to group stats */
1969 if (p->numa_group) {
1970 group_lock = &p->numa_group->lock;
60e69eed 1971 spin_lock_irq(group_lock);
7dbd13ed
MG
1972 }
1973
688b7585
MG
1974 /* Find the node with the highest number of faults */
1975 for_each_online_node(nid) {
44dba3d5
IM
1976 /* Keep track of the offsets in numa_faults array */
1977 int mem_idx, membuf_idx, cpu_idx, cpubuf_idx;
83e1d2cd 1978 unsigned long faults = 0, group_faults = 0;
44dba3d5 1979 int priv;
745d6147 1980
be1e4e76 1981 for (priv = 0; priv < NR_NUMA_HINT_FAULT_TYPES; priv++) {
7e2703e6 1982 long diff, f_diff, f_weight;
8c8a743c 1983
44dba3d5
IM
1984 mem_idx = task_faults_idx(NUMA_MEM, nid, priv);
1985 membuf_idx = task_faults_idx(NUMA_MEMBUF, nid, priv);
1986 cpu_idx = task_faults_idx(NUMA_CPU, nid, priv);
1987 cpubuf_idx = task_faults_idx(NUMA_CPUBUF, nid, priv);
745d6147 1988
ac8e895b 1989 /* Decay existing window, copy faults since last scan */
44dba3d5
IM
1990 diff = p->numa_faults[membuf_idx] - p->numa_faults[mem_idx] / 2;
1991 fault_types[priv] += p->numa_faults[membuf_idx];
1992 p->numa_faults[membuf_idx] = 0;
fb13c7ee 1993
7e2703e6
RR
1994 /*
1995 * Normalize the faults_from, so all tasks in a group
1996 * count according to CPU use, instead of by the raw
1997 * number of faults. Tasks with little runtime have
1998 * little over-all impact on throughput, and thus their
1999 * faults are less important.
2000 */
2001 f_weight = div64_u64(runtime << 16, period + 1);
44dba3d5 2002 f_weight = (f_weight * p->numa_faults[cpubuf_idx]) /
7e2703e6 2003 (total_faults + 1);
44dba3d5
IM
2004 f_diff = f_weight - p->numa_faults[cpu_idx] / 2;
2005 p->numa_faults[cpubuf_idx] = 0;
50ec8a40 2006
44dba3d5
IM
2007 p->numa_faults[mem_idx] += diff;
2008 p->numa_faults[cpu_idx] += f_diff;
2009 faults += p->numa_faults[mem_idx];
83e1d2cd 2010 p->total_numa_faults += diff;
8c8a743c 2011 if (p->numa_group) {
44dba3d5
IM
2012 /*
2013 * safe because we can only change our own group
2014 *
2015 * mem_idx represents the offset for a given
2016 * nid and priv in a specific region because it
2017 * is at the beginning of the numa_faults array.
2018 */
2019 p->numa_group->faults[mem_idx] += diff;
2020 p->numa_group->faults_cpu[mem_idx] += f_diff;
989348b5 2021 p->numa_group->total_faults += diff;
44dba3d5 2022 group_faults += p->numa_group->faults[mem_idx];
8c8a743c 2023 }
ac8e895b
MG
2024 }
2025
688b7585
MG
2026 if (faults > max_faults) {
2027 max_faults = faults;
2028 max_nid = nid;
2029 }
83e1d2cd
MG
2030
2031 if (group_faults > max_group_faults) {
2032 max_group_faults = group_faults;
2033 max_group_nid = nid;
2034 }
2035 }
2036
04bb2f94
RR
2037 update_task_scan_period(p, fault_types[0], fault_types[1]);
2038
7dbd13ed 2039 if (p->numa_group) {
4142c3eb 2040 numa_group_count_active_nodes(p->numa_group);
60e69eed 2041 spin_unlock_irq(group_lock);
54009416 2042 max_nid = preferred_group_nid(p, max_group_nid);
688b7585
MG
2043 }
2044
bb97fc31
RR
2045 if (max_faults) {
2046 /* Set the new preferred node */
2047 if (max_nid != p->numa_preferred_nid)
2048 sched_setnuma(p, max_nid);
2049
2050 if (task_node(p) != p->numa_preferred_nid)
2051 numa_migrate_preferred(p);
3a7053b3 2052 }
cbee9f88
PZ
2053}
2054
8c8a743c
PZ
2055static inline int get_numa_group(struct numa_group *grp)
2056{
2057 return atomic_inc_not_zero(&grp->refcount);
2058}
2059
2060static inline void put_numa_group(struct numa_group *grp)
2061{
2062 if (atomic_dec_and_test(&grp->refcount))
2063 kfree_rcu(grp, rcu);
2064}
2065
3e6a9418
MG
2066static void task_numa_group(struct task_struct *p, int cpupid, int flags,
2067 int *priv)
8c8a743c
PZ
2068{
2069 struct numa_group *grp, *my_grp;
2070 struct task_struct *tsk;
2071 bool join = false;
2072 int cpu = cpupid_to_cpu(cpupid);
2073 int i;
2074
2075 if (unlikely(!p->numa_group)) {
2076 unsigned int size = sizeof(struct numa_group) +
50ec8a40 2077 4*nr_node_ids*sizeof(unsigned long);
8c8a743c
PZ
2078
2079 grp = kzalloc(size, GFP_KERNEL | __GFP_NOWARN);
2080 if (!grp)
2081 return;
2082
2083 atomic_set(&grp->refcount, 1);
4142c3eb
RR
2084 grp->active_nodes = 1;
2085 grp->max_faults_cpu = 0;
8c8a743c 2086 spin_lock_init(&grp->lock);
e29cf08b 2087 grp->gid = p->pid;
50ec8a40 2088 /* Second half of the array tracks nids where faults happen */
be1e4e76
RR
2089 grp->faults_cpu = grp->faults + NR_NUMA_HINT_FAULT_TYPES *
2090 nr_node_ids;
8c8a743c 2091
be1e4e76 2092 for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++)
44dba3d5 2093 grp->faults[i] = p->numa_faults[i];
8c8a743c 2094
989348b5 2095 grp->total_faults = p->total_numa_faults;
83e1d2cd 2096
8c8a743c
PZ
2097 grp->nr_tasks++;
2098 rcu_assign_pointer(p->numa_group, grp);
2099 }
2100
2101 rcu_read_lock();
316c1608 2102 tsk = READ_ONCE(cpu_rq(cpu)->curr);
8c8a743c
PZ
2103
2104 if (!cpupid_match_pid(tsk, cpupid))
3354781a 2105 goto no_join;
8c8a743c
PZ
2106
2107 grp = rcu_dereference(tsk->numa_group);
2108 if (!grp)
3354781a 2109 goto no_join;
8c8a743c
PZ
2110
2111 my_grp = p->numa_group;
2112 if (grp == my_grp)
3354781a 2113 goto no_join;
8c8a743c
PZ
2114
2115 /*
2116 * Only join the other group if its bigger; if we're the bigger group,
2117 * the other task will join us.
2118 */
2119 if (my_grp->nr_tasks > grp->nr_tasks)
3354781a 2120 goto no_join;
8c8a743c
PZ
2121
2122 /*
2123 * Tie-break on the grp address.
2124 */
2125 if (my_grp->nr_tasks == grp->nr_tasks && my_grp > grp)
3354781a 2126 goto no_join;
8c8a743c 2127
dabe1d99
RR
2128 /* Always join threads in the same process. */
2129 if (tsk->mm == current->mm)
2130 join = true;
2131
2132 /* Simple filter to avoid false positives due to PID collisions */
2133 if (flags & TNF_SHARED)
2134 join = true;
8c8a743c 2135
3e6a9418
MG
2136 /* Update priv based on whether false sharing was detected */
2137 *priv = !join;
2138
dabe1d99 2139 if (join && !get_numa_group(grp))
3354781a 2140 goto no_join;
8c8a743c 2141
8c8a743c
PZ
2142 rcu_read_unlock();
2143
2144 if (!join)
2145 return;
2146
60e69eed
MG
2147 BUG_ON(irqs_disabled());
2148 double_lock_irq(&my_grp->lock, &grp->lock);
989348b5 2149
be1e4e76 2150 for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++) {
44dba3d5
IM
2151 my_grp->faults[i] -= p->numa_faults[i];
2152 grp->faults[i] += p->numa_faults[i];
8c8a743c 2153 }
989348b5
MG
2154 my_grp->total_faults -= p->total_numa_faults;
2155 grp->total_faults += p->total_numa_faults;
8c8a743c 2156
8c8a743c
PZ
2157 my_grp->nr_tasks--;
2158 grp->nr_tasks++;
2159
2160 spin_unlock(&my_grp->lock);
60e69eed 2161 spin_unlock_irq(&grp->lock);
8c8a743c
PZ
2162
2163 rcu_assign_pointer(p->numa_group, grp);
2164
2165 put_numa_group(my_grp);
3354781a
PZ
2166 return;
2167
2168no_join:
2169 rcu_read_unlock();
2170 return;
8c8a743c
PZ
2171}
2172
2173void task_numa_free(struct task_struct *p)
2174{
2175 struct numa_group *grp = p->numa_group;
44dba3d5 2176 void *numa_faults = p->numa_faults;
e9dd685c
SR
2177 unsigned long flags;
2178 int i;
8c8a743c
PZ
2179
2180 if (grp) {
e9dd685c 2181 spin_lock_irqsave(&grp->lock, flags);
be1e4e76 2182 for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++)
44dba3d5 2183 grp->faults[i] -= p->numa_faults[i];
989348b5 2184 grp->total_faults -= p->total_numa_faults;
83e1d2cd 2185
8c8a743c 2186 grp->nr_tasks--;
e9dd685c 2187 spin_unlock_irqrestore(&grp->lock, flags);
35b123e2 2188 RCU_INIT_POINTER(p->numa_group, NULL);
8c8a743c
PZ
2189 put_numa_group(grp);
2190 }
2191
44dba3d5 2192 p->numa_faults = NULL;
82727018 2193 kfree(numa_faults);
8c8a743c
PZ
2194}
2195
cbee9f88
PZ
2196/*
2197 * Got a PROT_NONE fault for a page on @node.
2198 */
58b46da3 2199void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags)
cbee9f88
PZ
2200{
2201 struct task_struct *p = current;
6688cc05 2202 bool migrated = flags & TNF_MIGRATED;
58b46da3 2203 int cpu_node = task_node(current);
792568ec 2204 int local = !!(flags & TNF_FAULT_LOCAL);
4142c3eb 2205 struct numa_group *ng;
ac8e895b 2206 int priv;
cbee9f88 2207
2a595721 2208 if (!static_branch_likely(&sched_numa_balancing))
1a687c2e
MG
2209 return;
2210
9ff1d9ff
MG
2211 /* for example, ksmd faulting in a user's mm */
2212 if (!p->mm)
2213 return;
2214
f809ca9a 2215 /* Allocate buffer to track faults on a per-node basis */
44dba3d5
IM
2216 if (unlikely(!p->numa_faults)) {
2217 int size = sizeof(*p->numa_faults) *
be1e4e76 2218 NR_NUMA_HINT_FAULT_BUCKETS * nr_node_ids;
f809ca9a 2219
44dba3d5
IM
2220 p->numa_faults = kzalloc(size, GFP_KERNEL|__GFP_NOWARN);
2221 if (!p->numa_faults)
f809ca9a 2222 return;
745d6147 2223
83e1d2cd 2224 p->total_numa_faults = 0;
04bb2f94 2225 memset(p->numa_faults_locality, 0, sizeof(p->numa_faults_locality));
f809ca9a 2226 }
cbee9f88 2227
8c8a743c
PZ
2228 /*
2229 * First accesses are treated as private, otherwise consider accesses
2230 * to be private if the accessing pid has not changed
2231 */
2232 if (unlikely(last_cpupid == (-1 & LAST_CPUPID_MASK))) {
2233 priv = 1;
2234 } else {
2235 priv = cpupid_match_pid(p, last_cpupid);
6688cc05 2236 if (!priv && !(flags & TNF_NO_GROUP))
3e6a9418 2237 task_numa_group(p, last_cpupid, flags, &priv);
8c8a743c
PZ
2238 }
2239
792568ec
RR
2240 /*
2241 * If a workload spans multiple NUMA nodes, a shared fault that
2242 * occurs wholly within the set of nodes that the workload is
2243 * actively using should be counted as local. This allows the
2244 * scan rate to slow down when a workload has settled down.
2245 */
4142c3eb
RR
2246 ng = p->numa_group;
2247 if (!priv && !local && ng && ng->active_nodes > 1 &&
2248 numa_is_active_node(cpu_node, ng) &&
2249 numa_is_active_node(mem_node, ng))
792568ec
RR
2250 local = 1;
2251
cbee9f88 2252 task_numa_placement(p);
f809ca9a 2253
2739d3ee
RR
2254 /*
2255 * Retry task to preferred node migration periodically, in case it
2256 * case it previously failed, or the scheduler moved us.
2257 */
2258 if (time_after(jiffies, p->numa_migrate_retry))
6b9a7460
MG
2259 numa_migrate_preferred(p);
2260
b32e86b4
IM
2261 if (migrated)
2262 p->numa_pages_migrated += pages;
074c2381
MG
2263 if (flags & TNF_MIGRATE_FAIL)
2264 p->numa_faults_locality[2] += pages;
b32e86b4 2265
44dba3d5
IM
2266 p->numa_faults[task_faults_idx(NUMA_MEMBUF, mem_node, priv)] += pages;
2267 p->numa_faults[task_faults_idx(NUMA_CPUBUF, cpu_node, priv)] += pages;
792568ec 2268 p->numa_faults_locality[local] += pages;
cbee9f88
PZ
2269}
2270
6e5fb223
PZ
2271static void reset_ptenuma_scan(struct task_struct *p)
2272{
7e5a2c17
JL
2273 /*
2274 * We only did a read acquisition of the mmap sem, so
2275 * p->mm->numa_scan_seq is written to without exclusive access
2276 * and the update is not guaranteed to be atomic. That's not
2277 * much of an issue though, since this is just used for
2278 * statistical sampling. Use READ_ONCE/WRITE_ONCE, which are not
2279 * expensive, to avoid any form of compiler optimizations:
2280 */
316c1608 2281 WRITE_ONCE(p->mm->numa_scan_seq, READ_ONCE(p->mm->numa_scan_seq) + 1);
6e5fb223
PZ
2282 p->mm->numa_scan_offset = 0;
2283}
2284
cbee9f88
PZ
2285/*
2286 * The expensive part of numa migration is done from task_work context.
2287 * Triggered from task_tick_numa().
2288 */
2289void task_numa_work(struct callback_head *work)
2290{
2291 unsigned long migrate, next_scan, now = jiffies;
2292 struct task_struct *p = current;
2293 struct mm_struct *mm = p->mm;
51170840 2294 u64 runtime = p->se.sum_exec_runtime;
6e5fb223 2295 struct vm_area_struct *vma;
9f40604c 2296 unsigned long start, end;
598f0ec0 2297 unsigned long nr_pte_updates = 0;
4620f8c1 2298 long pages, virtpages;
cbee9f88
PZ
2299
2300 WARN_ON_ONCE(p != container_of(work, struct task_struct, numa_work));
2301
2302 work->next = work; /* protect against double add */
2303 /*
2304 * Who cares about NUMA placement when they're dying.
2305 *
2306 * NOTE: make sure not to dereference p->mm before this check,
2307 * exit_task_work() happens _after_ exit_mm() so we could be called
2308 * without p->mm even though we still had it when we enqueued this
2309 * work.
2310 */
2311 if (p->flags & PF_EXITING)
2312 return;
2313
930aa174 2314 if (!mm->numa_next_scan) {
7e8d16b6
MG
2315 mm->numa_next_scan = now +
2316 msecs_to_jiffies(sysctl_numa_balancing_scan_delay);
b8593bfd
MG
2317 }
2318
cbee9f88
PZ
2319 /*
2320 * Enforce maximal scan/migration frequency..
2321 */
2322 migrate = mm->numa_next_scan;
2323 if (time_before(now, migrate))
2324 return;
2325
598f0ec0
MG
2326 if (p->numa_scan_period == 0) {
2327 p->numa_scan_period_max = task_scan_max(p);
2328 p->numa_scan_period = task_scan_min(p);
2329 }
cbee9f88 2330
fb003b80 2331 next_scan = now + msecs_to_jiffies(p->numa_scan_period);
cbee9f88
PZ
2332 if (cmpxchg(&mm->numa_next_scan, migrate, next_scan) != migrate)
2333 return;
2334
19a78d11
PZ
2335 /*
2336 * Delay this task enough that another task of this mm will likely win
2337 * the next time around.
2338 */
2339 p->node_stamp += 2 * TICK_NSEC;
2340
9f40604c
MG
2341 start = mm->numa_scan_offset;
2342 pages = sysctl_numa_balancing_scan_size;
2343 pages <<= 20 - PAGE_SHIFT; /* MB in pages */
4620f8c1 2344 virtpages = pages * 8; /* Scan up to this much virtual space */
9f40604c
MG
2345 if (!pages)
2346 return;
cbee9f88 2347
4620f8c1 2348
6e5fb223 2349 down_read(&mm->mmap_sem);
9f40604c 2350 vma = find_vma(mm, start);
6e5fb223
PZ
2351 if (!vma) {
2352 reset_ptenuma_scan(p);
9f40604c 2353 start = 0;
6e5fb223
PZ
2354 vma = mm->mmap;
2355 }
9f40604c 2356 for (; vma; vma = vma->vm_next) {
6b79c57b 2357 if (!vma_migratable(vma) || !vma_policy_mof(vma) ||
8e76d4ee 2358 is_vm_hugetlb_page(vma) || (vma->vm_flags & VM_MIXEDMAP)) {
6e5fb223 2359 continue;
6b79c57b 2360 }
6e5fb223 2361
4591ce4f
MG
2362 /*
2363 * Shared library pages mapped by multiple processes are not
2364 * migrated as it is expected they are cache replicated. Avoid
2365 * hinting faults in read-only file-backed mappings or the vdso
2366 * as migrating the pages will be of marginal benefit.
2367 */
2368 if (!vma->vm_mm ||
2369 (vma->vm_file && (vma->vm_flags & (VM_READ|VM_WRITE)) == (VM_READ)))
2370 continue;
2371
3c67f474
MG
2372 /*
2373 * Skip inaccessible VMAs to avoid any confusion between
2374 * PROT_NONE and NUMA hinting ptes
2375 */
2376 if (!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)))
2377 continue;
4591ce4f 2378
9f40604c
MG
2379 do {
2380 start = max(start, vma->vm_start);
2381 end = ALIGN(start + (pages << PAGE_SHIFT), HPAGE_SIZE);
2382 end = min(end, vma->vm_end);
4620f8c1 2383 nr_pte_updates = change_prot_numa(vma, start, end);
598f0ec0
MG
2384
2385 /*
4620f8c1
RR
2386 * Try to scan sysctl_numa_balancing_size worth of
2387 * hpages that have at least one present PTE that
2388 * is not already pte-numa. If the VMA contains
2389 * areas that are unused or already full of prot_numa
2390 * PTEs, scan up to virtpages, to skip through those
2391 * areas faster.
598f0ec0
MG
2392 */
2393 if (nr_pte_updates)
2394 pages -= (end - start) >> PAGE_SHIFT;
4620f8c1 2395 virtpages -= (end - start) >> PAGE_SHIFT;
6e5fb223 2396
9f40604c 2397 start = end;
4620f8c1 2398 if (pages <= 0 || virtpages <= 0)
9f40604c 2399 goto out;
3cf1962c
RR
2400
2401 cond_resched();
9f40604c 2402 } while (end != vma->vm_end);
cbee9f88 2403 }
6e5fb223 2404
9f40604c 2405out:
6e5fb223 2406 /*
c69307d5
PZ
2407 * It is possible to reach the end of the VMA list but the last few
2408 * VMAs are not guaranteed to the vma_migratable. If they are not, we
2409 * would find the !migratable VMA on the next scan but not reset the
2410 * scanner to the start so check it now.
6e5fb223
PZ
2411 */
2412 if (vma)
9f40604c 2413 mm->numa_scan_offset = start;
6e5fb223
PZ
2414 else
2415 reset_ptenuma_scan(p);
2416 up_read(&mm->mmap_sem);
51170840
RR
2417
2418 /*
2419 * Make sure tasks use at least 32x as much time to run other code
2420 * than they used here, to limit NUMA PTE scanning overhead to 3% max.
2421 * Usually update_task_scan_period slows down scanning enough; on an
2422 * overloaded system we need to limit overhead on a per task basis.
2423 */
2424 if (unlikely(p->se.sum_exec_runtime != runtime)) {
2425 u64 diff = p->se.sum_exec_runtime - runtime;
2426 p->node_stamp += 32 * diff;
2427 }
cbee9f88
PZ
2428}
2429
2430/*
2431 * Drive the periodic memory faults..
2432 */
2433void task_tick_numa(struct rq *rq, struct task_struct *curr)
2434{
2435 struct callback_head *work = &curr->numa_work;
2436 u64 period, now;
2437
2438 /*
2439 * We don't care about NUMA placement if we don't have memory.
2440 */
2441 if (!curr->mm || (curr->flags & PF_EXITING) || work->next != work)
2442 return;
2443
2444 /*
2445 * Using runtime rather than walltime has the dual advantage that
2446 * we (mostly) drive the selection from busy threads and that the
2447 * task needs to have done some actual work before we bother with
2448 * NUMA placement.
2449 */
2450 now = curr->se.sum_exec_runtime;
2451 period = (u64)curr->numa_scan_period * NSEC_PER_MSEC;
2452
25b3e5a3 2453 if (now > curr->node_stamp + period) {
4b96a29b 2454 if (!curr->node_stamp)
598f0ec0 2455 curr->numa_scan_period = task_scan_min(curr);
19a78d11 2456 curr->node_stamp += period;
cbee9f88
PZ
2457
2458 if (!time_before(jiffies, curr->mm->numa_next_scan)) {
2459 init_task_work(work, task_numa_work); /* TODO: move this into sched_fork() */
2460 task_work_add(curr, work, true);
2461 }
2462 }
2463}
2464#else
2465static void task_tick_numa(struct rq *rq, struct task_struct *curr)
2466{
2467}
0ec8aa00
PZ
2468
2469static inline void account_numa_enqueue(struct rq *rq, struct task_struct *p)
2470{
2471}
2472
2473static inline void account_numa_dequeue(struct rq *rq, struct task_struct *p)
2474{
2475}
cbee9f88
PZ
2476#endif /* CONFIG_NUMA_BALANCING */
2477
30cfdcfc
DA
2478static void
2479account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
2480{
2481 update_load_add(&cfs_rq->load, se->load.weight);
c09595f6 2482 if (!parent_entity(se))
029632fb 2483 update_load_add(&rq_of(cfs_rq)->load, se->load.weight);
367456c7 2484#ifdef CONFIG_SMP
0ec8aa00
PZ
2485 if (entity_is_task(se)) {
2486 struct rq *rq = rq_of(cfs_rq);
2487
2488 account_numa_enqueue(rq, task_of(se));
2489 list_add(&se->group_node, &rq->cfs_tasks);
2490 }
367456c7 2491#endif
30cfdcfc 2492 cfs_rq->nr_running++;
30cfdcfc
DA
2493}
2494
2495static void
2496account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
2497{
2498 update_load_sub(&cfs_rq->load, se->load.weight);
c09595f6 2499 if (!parent_entity(se))
029632fb 2500 update_load_sub(&rq_of(cfs_rq)->load, se->load.weight);
bfdb198c 2501#ifdef CONFIG_SMP
0ec8aa00
PZ
2502 if (entity_is_task(se)) {
2503 account_numa_dequeue(rq_of(cfs_rq), task_of(se));
b87f1724 2504 list_del_init(&se->group_node);
0ec8aa00 2505 }
bfdb198c 2506#endif
30cfdcfc 2507 cfs_rq->nr_running--;
30cfdcfc
DA
2508}
2509
3ff6dcac
YZ
2510#ifdef CONFIG_FAIR_GROUP_SCHED
2511# ifdef CONFIG_SMP
ea1dc6fc 2512static long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg)
cf5f0acf 2513{
ea1dc6fc 2514 long tg_weight, load, shares;
cf5f0acf
PZ
2515
2516 /*
ea1dc6fc
PZ
2517 * This really should be: cfs_rq->avg.load_avg, but instead we use
2518 * cfs_rq->load.weight, which is its upper bound. This helps ramp up
2519 * the shares for small weight interactive tasks.
cf5f0acf 2520 */
ea1dc6fc 2521 load = scale_load_down(cfs_rq->load.weight);
cf5f0acf 2522
ea1dc6fc 2523 tg_weight = atomic_long_read(&tg->load_avg);
3ff6dcac 2524
ea1dc6fc
PZ
2525 /* Ensure tg_weight >= load */
2526 tg_weight -= cfs_rq->tg_load_avg_contrib;
2527 tg_weight += load;
3ff6dcac 2528
3ff6dcac 2529 shares = (tg->shares * load);
cf5f0acf
PZ
2530 if (tg_weight)
2531 shares /= tg_weight;
3ff6dcac
YZ
2532
2533 if (shares < MIN_SHARES)
2534 shares = MIN_SHARES;
2535 if (shares > tg->shares)
2536 shares = tg->shares;
2537
2538 return shares;
2539}
3ff6dcac 2540# else /* CONFIG_SMP */
6d5ab293 2541static inline long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg)
3ff6dcac
YZ
2542{
2543 return tg->shares;
2544}
3ff6dcac 2545# endif /* CONFIG_SMP */
ea1dc6fc 2546
2069dd75
PZ
2547static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
2548 unsigned long weight)
2549{
19e5eebb
PT
2550 if (se->on_rq) {
2551 /* commit outstanding execution time */
2552 if (cfs_rq->curr == se)
2553 update_curr(cfs_rq);
2069dd75 2554 account_entity_dequeue(cfs_rq, se);
19e5eebb 2555 }
2069dd75
PZ
2556
2557 update_load_set(&se->load, weight);
2558
2559 if (se->on_rq)
2560 account_entity_enqueue(cfs_rq, se);
2561}
2562
82958366
PT
2563static inline int throttled_hierarchy(struct cfs_rq *cfs_rq);
2564
6d5ab293 2565static void update_cfs_shares(struct cfs_rq *cfs_rq)
2069dd75
PZ
2566{
2567 struct task_group *tg;
2568 struct sched_entity *se;
3ff6dcac 2569 long shares;
2069dd75 2570
2069dd75
PZ
2571 tg = cfs_rq->tg;
2572 se = tg->se[cpu_of(rq_of(cfs_rq))];
64660c86 2573 if (!se || throttled_hierarchy(cfs_rq))
2069dd75 2574 return;
3ff6dcac
YZ
2575#ifndef CONFIG_SMP
2576 if (likely(se->load.weight == tg->shares))
2577 return;
2578#endif
6d5ab293 2579 shares = calc_cfs_shares(cfs_rq, tg);
2069dd75
PZ
2580
2581 reweight_entity(cfs_rq_of(se), se, shares);
2582}
2583#else /* CONFIG_FAIR_GROUP_SCHED */
6d5ab293 2584static inline void update_cfs_shares(struct cfs_rq *cfs_rq)
2069dd75
PZ
2585{
2586}
2587#endif /* CONFIG_FAIR_GROUP_SCHED */
2588
141965c7 2589#ifdef CONFIG_SMP
5b51f2f8
PT
2590/* Precomputed fixed inverse multiplies for multiplication by y^n */
2591static const u32 runnable_avg_yN_inv[] = {
2592 0xffffffff, 0xfa83b2da, 0xf5257d14, 0xefe4b99a, 0xeac0c6e6, 0xe5b906e6,
2593 0xe0ccdeeb, 0xdbfbb796, 0xd744fcc9, 0xd2a81d91, 0xce248c14, 0xc9b9bd85,
2594 0xc5672a10, 0xc12c4cc9, 0xbd08a39e, 0xb8fbaf46, 0xb504f333, 0xb123f581,
2595 0xad583ee9, 0xa9a15ab4, 0xa5fed6a9, 0xa2704302, 0x9ef5325f, 0x9b8d39b9,
2596 0x9837f050, 0x94f4efa8, 0x91c3d373, 0x8ea4398a, 0x8b95c1e3, 0x88980e80,
2597 0x85aac367, 0x82cd8698,
2598};
2599
2600/*
2601 * Precomputed \Sum y^k { 1<=k<=n }. These are floor(true_value) to prevent
2602 * over-estimates when re-combining.
2603 */
2604static const u32 runnable_avg_yN_sum[] = {
2605 0, 1002, 1982, 2941, 3880, 4798, 5697, 6576, 7437, 8279, 9103,
2606 9909,10698,11470,12226,12966,13690,14398,15091,15769,16433,17082,
2607 17718,18340,18949,19545,20128,20698,21256,21802,22336,22859,23371,
2608};
2609
7b20b916
YD
2610/*
2611 * Precomputed \Sum y^k { 1<=k<=n, where n%32=0). Values are rolled down to
2612 * lower integers. See Documentation/scheduler/sched-avg.txt how these
2613 * were generated:
2614 */
2615static const u32 __accumulated_sum_N32[] = {
2616 0, 23371, 35056, 40899, 43820, 45281,
2617 46011, 46376, 46559, 46650, 46696, 46719,
2618};
2619
9d85f21c
PT
2620/*
2621 * Approximate:
2622 * val * y^n, where y^32 ~= 0.5 (~1 scheduling period)
2623 */
2624static __always_inline u64 decay_load(u64 val, u64 n)
2625{
5b51f2f8
PT
2626 unsigned int local_n;
2627
2628 if (!n)
2629 return val;
2630 else if (unlikely(n > LOAD_AVG_PERIOD * 63))
2631 return 0;
2632
2633 /* after bounds checking we can collapse to 32-bit */
2634 local_n = n;
2635
2636 /*
2637 * As y^PERIOD = 1/2, we can combine
9c58c79a
ZZ
2638 * y^n = 1/2^(n/PERIOD) * y^(n%PERIOD)
2639 * With a look-up table which covers y^n (n<PERIOD)
5b51f2f8
PT
2640 *
2641 * To achieve constant time decay_load.
2642 */
2643 if (unlikely(local_n >= LOAD_AVG_PERIOD)) {
2644 val >>= local_n / LOAD_AVG_PERIOD;
2645 local_n %= LOAD_AVG_PERIOD;
9d85f21c
PT
2646 }
2647
9d89c257
YD
2648 val = mul_u64_u32_shr(val, runnable_avg_yN_inv[local_n], 32);
2649 return val;
5b51f2f8
PT
2650}
2651
2652/*
2653 * For updates fully spanning n periods, the contribution to runnable
2654 * average will be: \Sum 1024*y^n
2655 *
2656 * We can compute this reasonably efficiently by combining:
2657 * y^PERIOD = 1/2 with precomputed \Sum 1024*y^n {for n <PERIOD}
2658 */
2659static u32 __compute_runnable_contrib(u64 n)
2660{
2661 u32 contrib = 0;
2662
2663 if (likely(n <= LOAD_AVG_PERIOD))
2664 return runnable_avg_yN_sum[n];
2665 else if (unlikely(n >= LOAD_AVG_MAX_N))
2666 return LOAD_AVG_MAX;
2667
7b20b916
YD
2668 /* Since n < LOAD_AVG_MAX_N, n/LOAD_AVG_PERIOD < 11 */
2669 contrib = __accumulated_sum_N32[n/LOAD_AVG_PERIOD];
2670 n %= LOAD_AVG_PERIOD;
5b51f2f8
PT
2671 contrib = decay_load(contrib, n);
2672 return contrib + runnable_avg_yN_sum[n];
9d85f21c
PT
2673}
2674
54a21385 2675#define cap_scale(v, s) ((v)*(s) >> SCHED_CAPACITY_SHIFT)
e0f5f3af 2676
9d85f21c
PT
2677/*
2678 * We can represent the historical contribution to runnable average as the
2679 * coefficients of a geometric series. To do this we sub-divide our runnable
2680 * history into segments of approximately 1ms (1024us); label the segment that
2681 * occurred N-ms ago p_N, with p_0 corresponding to the current period, e.g.
2682 *
2683 * [<- 1024us ->|<- 1024us ->|<- 1024us ->| ...
2684 * p0 p1 p2
2685 * (now) (~1ms ago) (~2ms ago)
2686 *
2687 * Let u_i denote the fraction of p_i that the entity was runnable.
2688 *
2689 * We then designate the fractions u_i as our co-efficients, yielding the
2690 * following representation of historical load:
2691 * u_0 + u_1*y + u_2*y^2 + u_3*y^3 + ...
2692 *
2693 * We choose y based on the with of a reasonably scheduling period, fixing:
2694 * y^32 = 0.5
2695 *
2696 * This means that the contribution to load ~32ms ago (u_32) will be weighted
2697 * approximately half as much as the contribution to load within the last ms
2698 * (u_0).
2699 *
2700 * When a period "rolls over" and we have new u_0`, multiplying the previous
2701 * sum again by y is sufficient to update:
2702 * load_avg = u_0` + y*(u_0 + u_1*y + u_2*y^2 + ... )
2703 * = u_0 + u_1*y + u_2*y^2 + ... [re-labeling u_i --> u_{i+1}]
2704 */
9d89c257
YD
2705static __always_inline int
2706__update_load_avg(u64 now, int cpu, struct sched_avg *sa,
13962234 2707 unsigned long weight, int running, struct cfs_rq *cfs_rq)
9d85f21c 2708{
e0f5f3af 2709 u64 delta, scaled_delta, periods;
9d89c257 2710 u32 contrib;
6115c793 2711 unsigned int delta_w, scaled_delta_w, decayed = 0;
6f2b0452 2712 unsigned long scale_freq, scale_cpu;
9d85f21c 2713
9d89c257 2714 delta = now - sa->last_update_time;
9d85f21c
PT
2715 /*
2716 * This should only happen when time goes backwards, which it
2717 * unfortunately does during sched clock init when we swap over to TSC.
2718 */
2719 if ((s64)delta < 0) {
9d89c257 2720 sa->last_update_time = now;
9d85f21c
PT
2721 return 0;
2722 }
2723
2724 /*
2725 * Use 1024ns as the unit of measurement since it's a reasonable
2726 * approximation of 1us and fast to compute.
2727 */
2728 delta >>= 10;
2729 if (!delta)
2730 return 0;
9d89c257 2731 sa->last_update_time = now;
9d85f21c 2732
6f2b0452
DE
2733 scale_freq = arch_scale_freq_capacity(NULL, cpu);
2734 scale_cpu = arch_scale_cpu_capacity(NULL, cpu);
2735
9d85f21c 2736 /* delta_w is the amount already accumulated against our next period */
9d89c257 2737 delta_w = sa->period_contrib;
9d85f21c 2738 if (delta + delta_w >= 1024) {
9d85f21c
PT
2739 decayed = 1;
2740
9d89c257
YD
2741 /* how much left for next period will start over, we don't know yet */
2742 sa->period_contrib = 0;
2743
9d85f21c
PT
2744 /*
2745 * Now that we know we're crossing a period boundary, figure
2746 * out how much from delta we need to complete the current
2747 * period and accrue it.
2748 */
2749 delta_w = 1024 - delta_w;
54a21385 2750 scaled_delta_w = cap_scale(delta_w, scale_freq);
13962234 2751 if (weight) {
e0f5f3af
DE
2752 sa->load_sum += weight * scaled_delta_w;
2753 if (cfs_rq) {
2754 cfs_rq->runnable_load_sum +=
2755 weight * scaled_delta_w;
2756 }
13962234 2757 }
36ee28e4 2758 if (running)
006cdf02 2759 sa->util_sum += scaled_delta_w * scale_cpu;
5b51f2f8
PT
2760
2761 delta -= delta_w;
2762
2763 /* Figure out how many additional periods this update spans */
2764 periods = delta / 1024;
2765 delta %= 1024;
2766
9d89c257 2767 sa->load_sum = decay_load(sa->load_sum, periods + 1);
13962234
YD
2768 if (cfs_rq) {
2769 cfs_rq->runnable_load_sum =
2770 decay_load(cfs_rq->runnable_load_sum, periods + 1);
2771 }
9d89c257 2772 sa->util_sum = decay_load((u64)(sa->util_sum), periods + 1);
5b51f2f8
PT
2773
2774 /* Efficiently calculate \sum (1..n_period) 1024*y^i */
9d89c257 2775 contrib = __compute_runnable_contrib(periods);
54a21385 2776 contrib = cap_scale(contrib, scale_freq);
13962234 2777 if (weight) {
9d89c257 2778 sa->load_sum += weight * contrib;
13962234
YD
2779 if (cfs_rq)
2780 cfs_rq->runnable_load_sum += weight * contrib;
2781 }
36ee28e4 2782 if (running)
006cdf02 2783 sa->util_sum += contrib * scale_cpu;
9d85f21c
PT
2784 }
2785
2786 /* Remainder of delta accrued against u_0` */
54a21385 2787 scaled_delta = cap_scale(delta, scale_freq);
13962234 2788 if (weight) {
e0f5f3af 2789 sa->load_sum += weight * scaled_delta;
13962234 2790 if (cfs_rq)
e0f5f3af 2791 cfs_rq->runnable_load_sum += weight * scaled_delta;
13962234 2792 }
36ee28e4 2793 if (running)
006cdf02 2794 sa->util_sum += scaled_delta * scale_cpu;
9ee474f5 2795
9d89c257 2796 sa->period_contrib += delta;
9ee474f5 2797
9d89c257
YD
2798 if (decayed) {
2799 sa->load_avg = div_u64(sa->load_sum, LOAD_AVG_MAX);
13962234
YD
2800 if (cfs_rq) {
2801 cfs_rq->runnable_load_avg =
2802 div_u64(cfs_rq->runnable_load_sum, LOAD_AVG_MAX);
2803 }
006cdf02 2804 sa->util_avg = sa->util_sum / LOAD_AVG_MAX;
9d89c257 2805 }
aff3e498 2806
9d89c257 2807 return decayed;
9ee474f5
PT
2808}
2809
c566e8e9 2810#ifdef CONFIG_FAIR_GROUP_SCHED
7c3edd2c
PZ
2811/**
2812 * update_tg_load_avg - update the tg's load avg
2813 * @cfs_rq: the cfs_rq whose avg changed
2814 * @force: update regardless of how small the difference
2815 *
2816 * This function 'ensures': tg->load_avg := \Sum tg->cfs_rq[]->avg.load.
2817 * However, because tg->load_avg is a global value there are performance
2818 * considerations.
2819 *
2820 * In order to avoid having to look at the other cfs_rq's, we use a
2821 * differential update where we store the last value we propagated. This in
2822 * turn allows skipping updates if the differential is 'small'.
2823 *
2824 * Updating tg's load_avg is necessary before update_cfs_share() (which is
2825 * done) and effective_load() (which is not done because it is too costly).
bb17f655 2826 */
9d89c257 2827static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force)
bb17f655 2828{
9d89c257 2829 long delta = cfs_rq->avg.load_avg - cfs_rq->tg_load_avg_contrib;
bb17f655 2830
aa0b7ae0
WL
2831 /*
2832 * No need to update load_avg for root_task_group as it is not used.
2833 */
2834 if (cfs_rq->tg == &root_task_group)
2835 return;
2836
9d89c257
YD
2837 if (force || abs(delta) > cfs_rq->tg_load_avg_contrib / 64) {
2838 atomic_long_add(delta, &cfs_rq->tg->load_avg);
2839 cfs_rq->tg_load_avg_contrib = cfs_rq->avg.load_avg;
bb17f655 2840 }
8165e145 2841}
f5f9739d 2842
ad936d86
BP
2843/*
2844 * Called within set_task_rq() right before setting a task's cpu. The
2845 * caller only guarantees p->pi_lock is held; no other assumptions,
2846 * including the state of rq->lock, should be made.
2847 */
2848void set_task_rq_fair(struct sched_entity *se,
2849 struct cfs_rq *prev, struct cfs_rq *next)
2850{
2851 if (!sched_feat(ATTACH_AGE_LOAD))
2852 return;
2853
2854 /*
2855 * We are supposed to update the task to "current" time, then its up to
2856 * date and ready to go to new CPU/cfs_rq. But we have difficulty in
2857 * getting what current time is, so simply throw away the out-of-date
2858 * time. This will result in the wakee task is less decayed, but giving
2859 * the wakee more load sounds not bad.
2860 */
2861 if (se->avg.last_update_time && prev) {
2862 u64 p_last_update_time;
2863 u64 n_last_update_time;
2864
2865#ifndef CONFIG_64BIT
2866 u64 p_last_update_time_copy;
2867 u64 n_last_update_time_copy;
2868
2869 do {
2870 p_last_update_time_copy = prev->load_last_update_time_copy;
2871 n_last_update_time_copy = next->load_last_update_time_copy;
2872
2873 smp_rmb();
2874
2875 p_last_update_time = prev->avg.last_update_time;
2876 n_last_update_time = next->avg.last_update_time;
2877
2878 } while (p_last_update_time != p_last_update_time_copy ||
2879 n_last_update_time != n_last_update_time_copy);
2880#else
2881 p_last_update_time = prev->avg.last_update_time;
2882 n_last_update_time = next->avg.last_update_time;
2883#endif
2884 __update_load_avg(p_last_update_time, cpu_of(rq_of(prev)),
2885 &se->avg, 0, 0, NULL);
2886 se->avg.last_update_time = n_last_update_time;
2887 }
2888}
6e83125c 2889#else /* CONFIG_FAIR_GROUP_SCHED */
9d89c257 2890static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force) {}
6e83125c 2891#endif /* CONFIG_FAIR_GROUP_SCHED */
c566e8e9 2892
a2c6c91f
SM
2893static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq)
2894{
2895 struct rq *rq = rq_of(cfs_rq);
2896 int cpu = cpu_of(rq);
2897
2898 if (cpu == smp_processor_id() && &rq->cfs == cfs_rq) {
2899 unsigned long max = rq->cpu_capacity_orig;
2900
2901 /*
2902 * There are a few boundary cases this might miss but it should
2903 * get called often enough that that should (hopefully) not be
2904 * a real problem -- added to that it only calls on the local
2905 * CPU, so if we enqueue remotely we'll miss an update, but
2906 * the next tick/schedule should update.
2907 *
2908 * It will not get called when we go idle, because the idle
2909 * thread is a different class (!fair), nor will the utilization
2910 * number include things like RT tasks.
2911 *
2912 * As is, the util number is not freq-invariant (we'd have to
2913 * implement arch_scale_freq_capacity() for that).
2914 *
2915 * See cpu_util().
2916 */
2917 cpufreq_update_util(rq_clock(rq),
2918 min(cfs_rq->avg.util_avg, max), max);
2919 }
2920}
2921
89741892
PZ
2922/*
2923 * Unsigned subtract and clamp on underflow.
2924 *
2925 * Explicitly do a load-store to ensure the intermediate value never hits
2926 * memory. This allows lockless observations without ever seeing the negative
2927 * values.
2928 */
2929#define sub_positive(_ptr, _val) do { \
2930 typeof(_ptr) ptr = (_ptr); \
2931 typeof(*ptr) val = (_val); \
2932 typeof(*ptr) res, var = READ_ONCE(*ptr); \
2933 res = var - val; \
2934 if (res > var) \
2935 res = 0; \
2936 WRITE_ONCE(*ptr, res); \
2937} while (0)
2938
3d30544f
PZ
2939/**
2940 * update_cfs_rq_load_avg - update the cfs_rq's load/util averages
2941 * @now: current time, as per cfs_rq_clock_task()
2942 * @cfs_rq: cfs_rq to update
2943 * @update_freq: should we call cfs_rq_util_change() or will the call do so
2944 *
2945 * The cfs_rq avg is the direct sum of all its entities (blocked and runnable)
2946 * avg. The immediate corollary is that all (fair) tasks must be attached, see
2947 * post_init_entity_util_avg().
2948 *
2949 * cfs_rq->avg is used for task_h_load() and update_cfs_share() for example.
2950 *
7c3edd2c
PZ
2951 * Returns true if the load decayed or we removed load.
2952 *
2953 * Since both these conditions indicate a changed cfs_rq->avg.load we should
2954 * call update_tg_load_avg() when this function returns true.
3d30544f 2955 */
a2c6c91f
SM
2956static inline int
2957update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq)
2dac754e 2958{
9d89c257 2959 struct sched_avg *sa = &cfs_rq->avg;
41e0d37f 2960 int decayed, removed_load = 0, removed_util = 0;
2dac754e 2961
9d89c257 2962 if (atomic_long_read(&cfs_rq->removed_load_avg)) {
9e0e83a1 2963 s64 r = atomic_long_xchg(&cfs_rq->removed_load_avg, 0);
89741892
PZ
2964 sub_positive(&sa->load_avg, r);
2965 sub_positive(&sa->load_sum, r * LOAD_AVG_MAX);
41e0d37f 2966 removed_load = 1;
8165e145 2967 }
2dac754e 2968
9d89c257
YD
2969 if (atomic_long_read(&cfs_rq->removed_util_avg)) {
2970 long r = atomic_long_xchg(&cfs_rq->removed_util_avg, 0);
89741892
PZ
2971 sub_positive(&sa->util_avg, r);
2972 sub_positive(&sa->util_sum, r * LOAD_AVG_MAX);
41e0d37f 2973 removed_util = 1;
9d89c257 2974 }
36ee28e4 2975
a2c6c91f 2976 decayed = __update_load_avg(now, cpu_of(rq_of(cfs_rq)), sa,
13962234 2977 scale_load_down(cfs_rq->load.weight), cfs_rq->curr != NULL, cfs_rq);
36ee28e4 2978
9d89c257
YD
2979#ifndef CONFIG_64BIT
2980 smp_wmb();
2981 cfs_rq->load_last_update_time_copy = sa->last_update_time;
2982#endif
36ee28e4 2983
a2c6c91f
SM
2984 if (update_freq && (decayed || removed_util))
2985 cfs_rq_util_change(cfs_rq);
21e96f88 2986
41e0d37f 2987 return decayed || removed_load;
21e96f88
SM
2988}
2989
2990/* Update task and its cfs_rq load average */
2991static inline void update_load_avg(struct sched_entity *se, int update_tg)
2992{
2993 struct cfs_rq *cfs_rq = cfs_rq_of(se);
2994 u64 now = cfs_rq_clock_task(cfs_rq);
2995 struct rq *rq = rq_of(cfs_rq);
2996 int cpu = cpu_of(rq);
2997
2998 /*
2999 * Track task load average for carrying it to new CPU after migrated, and
3000 * track group sched_entity load average for task_h_load calc in migration
3001 */
3002 __update_load_avg(now, cpu, &se->avg,
3003 se->on_rq * scale_load_down(se->load.weight),
3004 cfs_rq->curr == se, NULL);
3005
a2c6c91f 3006 if (update_cfs_rq_load_avg(now, cfs_rq, true) && update_tg)
21e96f88 3007 update_tg_load_avg(cfs_rq, 0);
9ee474f5
PT
3008}
3009
3d30544f
PZ
3010/**
3011 * attach_entity_load_avg - attach this entity to its cfs_rq load avg
3012 * @cfs_rq: cfs_rq to attach to
3013 * @se: sched_entity to attach
3014 *
3015 * Must call update_cfs_rq_load_avg() before this, since we rely on
3016 * cfs_rq->avg.last_update_time being current.
3017 */
a05e8c51
BP
3018static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
3019{
a9280514
PZ
3020 if (!sched_feat(ATTACH_AGE_LOAD))
3021 goto skip_aging;
3022
6efdb105
BP
3023 /*
3024 * If we got migrated (either between CPUs or between cgroups) we'll
3025 * have aged the average right before clearing @last_update_time.
7dc603c9
PZ
3026 *
3027 * Or we're fresh through post_init_entity_util_avg().
6efdb105
BP
3028 */
3029 if (se->avg.last_update_time) {
3030 __update_load_avg(cfs_rq->avg.last_update_time, cpu_of(rq_of(cfs_rq)),
3031 &se->avg, 0, 0, NULL);
3032
3033 /*
3034 * XXX: we could have just aged the entire load away if we've been
3035 * absent from the fair class for too long.
3036 */
3037 }
3038
a9280514 3039skip_aging:
a05e8c51
BP
3040 se->avg.last_update_time = cfs_rq->avg.last_update_time;
3041 cfs_rq->avg.load_avg += se->avg.load_avg;
3042 cfs_rq->avg.load_sum += se->avg.load_sum;
3043 cfs_rq->avg.util_avg += se->avg.util_avg;
3044 cfs_rq->avg.util_sum += se->avg.util_sum;
a2c6c91f
SM
3045
3046 cfs_rq_util_change(cfs_rq);
a05e8c51
BP
3047}
3048
3d30544f
PZ
3049/**
3050 * detach_entity_load_avg - detach this entity from its cfs_rq load avg
3051 * @cfs_rq: cfs_rq to detach from
3052 * @se: sched_entity to detach
3053 *
3054 * Must call update_cfs_rq_load_avg() before this, since we rely on
3055 * cfs_rq->avg.last_update_time being current.
3056 */
a05e8c51
BP
3057static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
3058{
3059 __update_load_avg(cfs_rq->avg.last_update_time, cpu_of(rq_of(cfs_rq)),
3060 &se->avg, se->on_rq * scale_load_down(se->load.weight),
3061 cfs_rq->curr == se, NULL);
3062
89741892
PZ
3063 sub_positive(&cfs_rq->avg.load_avg, se->avg.load_avg);
3064 sub_positive(&cfs_rq->avg.load_sum, se->avg.load_sum);
3065 sub_positive(&cfs_rq->avg.util_avg, se->avg.util_avg);
3066 sub_positive(&cfs_rq->avg.util_sum, se->avg.util_sum);
a2c6c91f
SM
3067
3068 cfs_rq_util_change(cfs_rq);
a05e8c51
BP
3069}
3070
9d89c257
YD
3071/* Add the load generated by se into cfs_rq's load average */
3072static inline void
3073enqueue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
9ee474f5 3074{
9d89c257
YD
3075 struct sched_avg *sa = &se->avg;
3076 u64 now = cfs_rq_clock_task(cfs_rq);
a05e8c51 3077 int migrated, decayed;
9ee474f5 3078
a05e8c51
BP
3079 migrated = !sa->last_update_time;
3080 if (!migrated) {
9d89c257 3081 __update_load_avg(now, cpu_of(rq_of(cfs_rq)), sa,
13962234
YD
3082 se->on_rq * scale_load_down(se->load.weight),
3083 cfs_rq->curr == se, NULL);
aff3e498 3084 }
c566e8e9 3085
a2c6c91f 3086 decayed = update_cfs_rq_load_avg(now, cfs_rq, !migrated);
18bf2805 3087
13962234
YD
3088 cfs_rq->runnable_load_avg += sa->load_avg;
3089 cfs_rq->runnable_load_sum += sa->load_sum;
3090
a05e8c51
BP
3091 if (migrated)
3092 attach_entity_load_avg(cfs_rq, se);
9ee474f5 3093
9d89c257
YD
3094 if (decayed || migrated)
3095 update_tg_load_avg(cfs_rq, 0);
2dac754e
PT
3096}
3097
13962234
YD
3098/* Remove the runnable load generated by se from cfs_rq's runnable load average */
3099static inline void
3100dequeue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
3101{
3102 update_load_avg(se, 1);
3103
3104 cfs_rq->runnable_load_avg =
3105 max_t(long, cfs_rq->runnable_load_avg - se->avg.load_avg, 0);
3106 cfs_rq->runnable_load_sum =
a05e8c51 3107 max_t(s64, cfs_rq->runnable_load_sum - se->avg.load_sum, 0);
13962234
YD
3108}
3109
9d89c257 3110#ifndef CONFIG_64BIT
0905f04e
YD
3111static inline u64 cfs_rq_last_update_time(struct cfs_rq *cfs_rq)
3112{
9d89c257 3113 u64 last_update_time_copy;
0905f04e 3114 u64 last_update_time;
9ee474f5 3115
9d89c257
YD
3116 do {
3117 last_update_time_copy = cfs_rq->load_last_update_time_copy;
3118 smp_rmb();
3119 last_update_time = cfs_rq->avg.last_update_time;
3120 } while (last_update_time != last_update_time_copy);
0905f04e
YD
3121
3122 return last_update_time;
3123}
9d89c257 3124#else
0905f04e
YD
3125static inline u64 cfs_rq_last_update_time(struct cfs_rq *cfs_rq)
3126{
3127 return cfs_rq->avg.last_update_time;
3128}
9d89c257
YD
3129#endif
3130
0905f04e
YD
3131/*
3132 * Task first catches up with cfs_rq, and then subtract
3133 * itself from the cfs_rq (task must be off the queue now).
3134 */
3135void remove_entity_load_avg(struct sched_entity *se)
3136{
3137 struct cfs_rq *cfs_rq = cfs_rq_of(se);
3138 u64 last_update_time;
3139
3140 /*
7dc603c9
PZ
3141 * tasks cannot exit without having gone through wake_up_new_task() ->
3142 * post_init_entity_util_avg() which will have added things to the
3143 * cfs_rq, so we can remove unconditionally.
3144 *
3145 * Similarly for groups, they will have passed through
3146 * post_init_entity_util_avg() before unregister_sched_fair_group()
3147 * calls this.
0905f04e 3148 */
0905f04e
YD
3149
3150 last_update_time = cfs_rq_last_update_time(cfs_rq);
3151
13962234 3152 __update_load_avg(last_update_time, cpu_of(rq_of(cfs_rq)), &se->avg, 0, 0, NULL);
9d89c257
YD
3153 atomic_long_add(se->avg.load_avg, &cfs_rq->removed_load_avg);
3154 atomic_long_add(se->avg.util_avg, &cfs_rq->removed_util_avg);
2dac754e 3155}
642dbc39 3156
7ea241af
YD
3157static inline unsigned long cfs_rq_runnable_load_avg(struct cfs_rq *cfs_rq)
3158{
3159 return cfs_rq->runnable_load_avg;
3160}
3161
3162static inline unsigned long cfs_rq_load_avg(struct cfs_rq *cfs_rq)
3163{
3164 return cfs_rq->avg.load_avg;
3165}
3166
6e83125c
PZ
3167static int idle_balance(struct rq *this_rq);
3168
38033c37
PZ
3169#else /* CONFIG_SMP */
3170
01011473
PZ
3171static inline int
3172update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq)
3173{
3174 return 0;
3175}
3176
536bd00c
RW
3177static inline void update_load_avg(struct sched_entity *se, int not_used)
3178{
3179 struct cfs_rq *cfs_rq = cfs_rq_of(se);
3180 struct rq *rq = rq_of(cfs_rq);
3181
3182 cpufreq_trigger_update(rq_clock(rq));
3183}
3184
9d89c257
YD
3185static inline void
3186enqueue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {}
13962234
YD
3187static inline void
3188dequeue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {}
9d89c257 3189static inline void remove_entity_load_avg(struct sched_entity *se) {}
6e83125c 3190
a05e8c51
BP
3191static inline void
3192attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {}
3193static inline void
3194detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {}
3195
6e83125c
PZ
3196static inline int idle_balance(struct rq *rq)
3197{
3198 return 0;
3199}
3200
38033c37 3201#endif /* CONFIG_SMP */
9d85f21c 3202
2396af69 3203static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
bf0f6f24 3204{
bf0f6f24 3205#ifdef CONFIG_SCHEDSTATS
e414314c
PZ
3206 struct task_struct *tsk = NULL;
3207
3208 if (entity_is_task(se))
3209 tsk = task_of(se);
3210
41acab88 3211 if (se->statistics.sleep_start) {
78becc27 3212 u64 delta = rq_clock(rq_of(cfs_rq)) - se->statistics.sleep_start;
bf0f6f24
IM
3213
3214 if ((s64)delta < 0)
3215 delta = 0;
3216
41acab88
LDM
3217 if (unlikely(delta > se->statistics.sleep_max))
3218 se->statistics.sleep_max = delta;
bf0f6f24 3219
8c79a045 3220 se->statistics.sleep_start = 0;
41acab88 3221 se->statistics.sum_sleep_runtime += delta;
9745512c 3222
768d0c27 3223 if (tsk) {
e414314c 3224 account_scheduler_latency(tsk, delta >> 10, 1);
768d0c27
PZ
3225 trace_sched_stat_sleep(tsk, delta);
3226 }
bf0f6f24 3227 }
41acab88 3228 if (se->statistics.block_start) {
78becc27 3229 u64 delta = rq_clock(rq_of(cfs_rq)) - se->statistics.block_start;
bf0f6f24
IM
3230
3231 if ((s64)delta < 0)
3232 delta = 0;
3233
41acab88
LDM
3234 if (unlikely(delta > se->statistics.block_max))
3235 se->statistics.block_max = delta;
bf0f6f24 3236
8c79a045 3237 se->statistics.block_start = 0;
41acab88 3238 se->statistics.sum_sleep_runtime += delta;
30084fbd 3239
e414314c 3240 if (tsk) {
8f0dfc34 3241 if (tsk->in_iowait) {
41acab88
LDM
3242 se->statistics.iowait_sum += delta;
3243 se->statistics.iowait_count++;
768d0c27 3244 trace_sched_stat_iowait(tsk, delta);
8f0dfc34
AV
3245 }
3246
b781a602
AV
3247 trace_sched_stat_blocked(tsk, delta);
3248
e414314c
PZ
3249 /*
3250 * Blocking time is in units of nanosecs, so shift by
3251 * 20 to get a milliseconds-range estimation of the
3252 * amount of time that the task spent sleeping:
3253 */
3254 if (unlikely(prof_on == SLEEP_PROFILING)) {
3255 profile_hits(SLEEP_PROFILING,
3256 (void *)get_wchan(tsk),
3257 delta >> 20);
3258 }
3259 account_scheduler_latency(tsk, delta >> 10, 0);
30084fbd 3260 }
bf0f6f24
IM
3261 }
3262#endif
3263}
3264
ddc97297
PZ
3265static void check_spread(struct cfs_rq *cfs_rq, struct sched_entity *se)
3266{
3267#ifdef CONFIG_SCHED_DEBUG
3268 s64 d = se->vruntime - cfs_rq->min_vruntime;
3269
3270 if (d < 0)
3271 d = -d;
3272
3273 if (d > 3*sysctl_sched_latency)
3274 schedstat_inc(cfs_rq, nr_spread_over);
3275#endif
3276}
3277
aeb73b04
PZ
3278static void
3279place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
3280{
1af5f730 3281 u64 vruntime = cfs_rq->min_vruntime;
94dfb5e7 3282
2cb8600e
PZ
3283 /*
3284 * The 'current' period is already promised to the current tasks,
3285 * however the extra weight of the new task will slow them down a
3286 * little, place the new task so that it fits in the slot that
3287 * stays open at the end.
3288 */
94dfb5e7 3289 if (initial && sched_feat(START_DEBIT))
f9c0b095 3290 vruntime += sched_vslice(cfs_rq, se);
aeb73b04 3291
a2e7a7eb 3292 /* sleeps up to a single latency don't count. */
5ca9880c 3293 if (!initial) {
a2e7a7eb 3294 unsigned long thresh = sysctl_sched_latency;
a7be37ac 3295
a2e7a7eb
MG
3296 /*
3297 * Halve their sleep time's effect, to allow
3298 * for a gentler effect of sleepers:
3299 */
3300 if (sched_feat(GENTLE_FAIR_SLEEPERS))
3301 thresh >>= 1;
51e0304c 3302
a2e7a7eb 3303 vruntime -= thresh;
aeb73b04
PZ
3304 }
3305
b5d9d734 3306 /* ensure we never gain time by being placed backwards. */
16c8f1c7 3307 se->vruntime = max_vruntime(se->vruntime, vruntime);
aeb73b04
PZ
3308}
3309
d3d9dc33
PT
3310static void check_enqueue_throttle(struct cfs_rq *cfs_rq);
3311
cb251765
MG
3312static inline void check_schedstat_required(void)
3313{
3314#ifdef CONFIG_SCHEDSTATS
3315 if (schedstat_enabled())
3316 return;
3317
3318 /* Force schedstat enabled if a dependent tracepoint is active */
3319 if (trace_sched_stat_wait_enabled() ||
3320 trace_sched_stat_sleep_enabled() ||
3321 trace_sched_stat_iowait_enabled() ||
3322 trace_sched_stat_blocked_enabled() ||
3323 trace_sched_stat_runtime_enabled()) {
eda8dca5 3324 printk_deferred_once("Scheduler tracepoints stat_sleep, stat_iowait, "
cb251765
MG
3325 "stat_blocked and stat_runtime require the "
3326 "kernel parameter schedstats=enabled or "
3327 "kernel.sched_schedstats=1\n");
3328 }
3329#endif
3330}
3331
b5179ac7
PZ
3332
3333/*
3334 * MIGRATION
3335 *
3336 * dequeue
3337 * update_curr()
3338 * update_min_vruntime()
3339 * vruntime -= min_vruntime
3340 *
3341 * enqueue
3342 * update_curr()
3343 * update_min_vruntime()
3344 * vruntime += min_vruntime
3345 *
3346 * this way the vruntime transition between RQs is done when both
3347 * min_vruntime are up-to-date.
3348 *
3349 * WAKEUP (remote)
3350 *
59efa0ba 3351 * ->migrate_task_rq_fair() (p->state == TASK_WAKING)
b5179ac7
PZ
3352 * vruntime -= min_vruntime
3353 *
3354 * enqueue
3355 * update_curr()
3356 * update_min_vruntime()
3357 * vruntime += min_vruntime
3358 *
3359 * this way we don't have the most up-to-date min_vruntime on the originating
3360 * CPU and an up-to-date min_vruntime on the destination CPU.
3361 */
3362
bf0f6f24 3363static void
88ec22d3 3364enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
bf0f6f24 3365{
2f950354
PZ
3366 bool renorm = !(flags & ENQUEUE_WAKEUP) || (flags & ENQUEUE_MIGRATED);
3367 bool curr = cfs_rq->curr == se;
3368
88ec22d3 3369 /*
2f950354
PZ
3370 * If we're the current task, we must renormalise before calling
3371 * update_curr().
88ec22d3 3372 */
2f950354 3373 if (renorm && curr)
88ec22d3
PZ
3374 se->vruntime += cfs_rq->min_vruntime;
3375
2f950354
PZ
3376 update_curr(cfs_rq);
3377
bf0f6f24 3378 /*
2f950354
PZ
3379 * Otherwise, renormalise after, such that we're placed at the current
3380 * moment in time, instead of some random moment in the past. Being
3381 * placed in the past could significantly boost this task to the
3382 * fairness detriment of existing tasks.
bf0f6f24 3383 */
2f950354
PZ
3384 if (renorm && !curr)
3385 se->vruntime += cfs_rq->min_vruntime;
3386
9d89c257 3387 enqueue_entity_load_avg(cfs_rq, se);
17bc14b7
LT
3388 account_entity_enqueue(cfs_rq, se);
3389 update_cfs_shares(cfs_rq);
bf0f6f24 3390
88ec22d3 3391 if (flags & ENQUEUE_WAKEUP) {
aeb73b04 3392 place_entity(cfs_rq, se, 0);
cb251765
MG
3393 if (schedstat_enabled())
3394 enqueue_sleeper(cfs_rq, se);
e9acbff6 3395 }
bf0f6f24 3396
cb251765
MG
3397 check_schedstat_required();
3398 if (schedstat_enabled()) {
3399 update_stats_enqueue(cfs_rq, se);
3400 check_spread(cfs_rq, se);
3401 }
2f950354 3402 if (!curr)
83b699ed 3403 __enqueue_entity(cfs_rq, se);
2069dd75 3404 se->on_rq = 1;
3d4b47b4 3405
d3d9dc33 3406 if (cfs_rq->nr_running == 1) {
3d4b47b4 3407 list_add_leaf_cfs_rq(cfs_rq);
d3d9dc33
PT
3408 check_enqueue_throttle(cfs_rq);
3409 }
bf0f6f24
IM
3410}
3411
2c13c919 3412static void __clear_buddies_last(struct sched_entity *se)
2002c695 3413{
2c13c919
RR
3414 for_each_sched_entity(se) {
3415 struct cfs_rq *cfs_rq = cfs_rq_of(se);
f1044799 3416 if (cfs_rq->last != se)
2c13c919 3417 break;
f1044799
PZ
3418
3419 cfs_rq->last = NULL;
2c13c919
RR
3420 }
3421}
2002c695 3422
2c13c919
RR
3423static void __clear_buddies_next(struct sched_entity *se)
3424{
3425 for_each_sched_entity(se) {
3426 struct cfs_rq *cfs_rq = cfs_rq_of(se);
f1044799 3427 if (cfs_rq->next != se)
2c13c919 3428 break;
f1044799
PZ
3429
3430 cfs_rq->next = NULL;
2c13c919 3431 }
2002c695
PZ
3432}
3433
ac53db59
RR
3434static void __clear_buddies_skip(struct sched_entity *se)
3435{
3436 for_each_sched_entity(se) {
3437 struct cfs_rq *cfs_rq = cfs_rq_of(se);
f1044799 3438 if (cfs_rq->skip != se)
ac53db59 3439 break;
f1044799
PZ
3440
3441 cfs_rq->skip = NULL;
ac53db59
RR
3442 }
3443}
3444
a571bbea
PZ
3445static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se)
3446{
2c13c919
RR
3447 if (cfs_rq->last == se)
3448 __clear_buddies_last(se);
3449
3450 if (cfs_rq->next == se)
3451 __clear_buddies_next(se);
ac53db59
RR
3452
3453 if (cfs_rq->skip == se)
3454 __clear_buddies_skip(se);
a571bbea
PZ
3455}
3456
6c16a6dc 3457static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq);
d8b4986d 3458
bf0f6f24 3459static void
371fd7e7 3460dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
bf0f6f24 3461{
a2a2d680
DA
3462 /*
3463 * Update run-time statistics of the 'current'.
3464 */
3465 update_curr(cfs_rq);
13962234 3466 dequeue_entity_load_avg(cfs_rq, se);
a2a2d680 3467
cb251765
MG
3468 if (schedstat_enabled())
3469 update_stats_dequeue(cfs_rq, se, flags);
67e9fb2a 3470
2002c695 3471 clear_buddies(cfs_rq, se);
4793241b 3472
83b699ed 3473 if (se != cfs_rq->curr)
30cfdcfc 3474 __dequeue_entity(cfs_rq, se);
17bc14b7 3475 se->on_rq = 0;
30cfdcfc 3476 account_entity_dequeue(cfs_rq, se);
88ec22d3
PZ
3477
3478 /*
3479 * Normalize the entity after updating the min_vruntime because the
3480 * update can refer to the ->curr item and we need to reflect this
3481 * movement in our normalized position.
3482 */
371fd7e7 3483 if (!(flags & DEQUEUE_SLEEP))
88ec22d3 3484 se->vruntime -= cfs_rq->min_vruntime;
1e876231 3485
d8b4986d
PT
3486 /* return excess runtime on last dequeue */
3487 return_cfs_rq_runtime(cfs_rq);
3488
1e876231 3489 update_min_vruntime(cfs_rq);
17bc14b7 3490 update_cfs_shares(cfs_rq);
bf0f6f24
IM
3491}
3492
3493/*
3494 * Preempt the current task with a newly woken task if needed:
3495 */
7c92e54f 3496static void
2e09bf55 3497check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
bf0f6f24 3498{
11697830 3499 unsigned long ideal_runtime, delta_exec;
f4cfb33e
WX
3500 struct sched_entity *se;
3501 s64 delta;
11697830 3502
6d0f0ebd 3503 ideal_runtime = sched_slice(cfs_rq, curr);
11697830 3504 delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime;
a9f3e2b5 3505 if (delta_exec > ideal_runtime) {
8875125e 3506 resched_curr(rq_of(cfs_rq));
a9f3e2b5
MG
3507 /*
3508 * The current task ran long enough, ensure it doesn't get
3509 * re-elected due to buddy favours.
3510 */
3511 clear_buddies(cfs_rq, curr);
f685ceac
MG
3512 return;
3513 }
3514
3515 /*
3516 * Ensure that a task that missed wakeup preemption by a
3517 * narrow margin doesn't have to wait for a full slice.
3518 * This also mitigates buddy induced latencies under load.
3519 */
f685ceac
MG
3520 if (delta_exec < sysctl_sched_min_granularity)
3521 return;
3522
f4cfb33e
WX
3523 se = __pick_first_entity(cfs_rq);
3524 delta = curr->vruntime - se->vruntime;
f685ceac 3525
f4cfb33e
WX
3526 if (delta < 0)
3527 return;
d7d82944 3528
f4cfb33e 3529 if (delta > ideal_runtime)
8875125e 3530 resched_curr(rq_of(cfs_rq));
bf0f6f24
IM
3531}
3532
83b699ed 3533static void
8494f412 3534set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
bf0f6f24 3535{
83b699ed
SV
3536 /* 'current' is not kept within the tree. */
3537 if (se->on_rq) {
3538 /*
3539 * Any task has to be enqueued before it get to execute on
3540 * a CPU. So account for the time it spent waiting on the
3541 * runqueue.
3542 */
cb251765
MG
3543 if (schedstat_enabled())
3544 update_stats_wait_end(cfs_rq, se);
83b699ed 3545 __dequeue_entity(cfs_rq, se);
9d89c257 3546 update_load_avg(se, 1);
83b699ed
SV
3547 }
3548
79303e9e 3549 update_stats_curr_start(cfs_rq, se);
429d43bc 3550 cfs_rq->curr = se;
eba1ed4b
IM
3551#ifdef CONFIG_SCHEDSTATS
3552 /*
3553 * Track our maximum slice length, if the CPU's load is at
3554 * least twice that of our own weight (i.e. dont track it
3555 * when there are only lesser-weight tasks around):
3556 */
cb251765 3557 if (schedstat_enabled() && rq_of(cfs_rq)->load.weight >= 2*se->load.weight) {
41acab88 3558 se->statistics.slice_max = max(se->statistics.slice_max,
eba1ed4b
IM
3559 se->sum_exec_runtime - se->prev_sum_exec_runtime);
3560 }
3561#endif
4a55b450 3562 se->prev_sum_exec_runtime = se->sum_exec_runtime;
bf0f6f24
IM
3563}
3564
3f3a4904
PZ
3565static int
3566wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se);
3567
ac53db59
RR
3568/*
3569 * Pick the next process, keeping these things in mind, in this order:
3570 * 1) keep things fair between processes/task groups
3571 * 2) pick the "next" process, since someone really wants that to run
3572 * 3) pick the "last" process, for cache locality
3573 * 4) do not run the "skip" process, if something else is available
3574 */
678d5718
PZ
3575static struct sched_entity *
3576pick_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *curr)
aa2ac252 3577{
678d5718
PZ
3578 struct sched_entity *left = __pick_first_entity(cfs_rq);
3579 struct sched_entity *se;
3580
3581 /*
3582 * If curr is set we have to see if its left of the leftmost entity
3583 * still in the tree, provided there was anything in the tree at all.
3584 */
3585 if (!left || (curr && entity_before(curr, left)))
3586 left = curr;
3587
3588 se = left; /* ideally we run the leftmost entity */
f4b6755f 3589
ac53db59
RR
3590 /*
3591 * Avoid running the skip buddy, if running something else can
3592 * be done without getting too unfair.
3593 */
3594 if (cfs_rq->skip == se) {
678d5718
PZ
3595 struct sched_entity *second;
3596
3597 if (se == curr) {
3598 second = __pick_first_entity(cfs_rq);
3599 } else {
3600 second = __pick_next_entity(se);
3601 if (!second || (curr && entity_before(curr, second)))
3602 second = curr;
3603 }
3604
ac53db59
RR
3605 if (second && wakeup_preempt_entity(second, left) < 1)
3606 se = second;
3607 }
aa2ac252 3608
f685ceac
MG
3609 /*
3610 * Prefer last buddy, try to return the CPU to a preempted task.
3611 */
3612 if (cfs_rq->last && wakeup_preempt_entity(cfs_rq->last, left) < 1)
3613 se = cfs_rq->last;
3614
ac53db59
RR
3615 /*
3616 * Someone really wants this to run. If it's not unfair, run it.
3617 */
3618 if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, left) < 1)
3619 se = cfs_rq->next;
3620
f685ceac 3621 clear_buddies(cfs_rq, se);
4793241b
PZ
3622
3623 return se;
aa2ac252
PZ
3624}
3625
678d5718 3626static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq);
d3d9dc33 3627
ab6cde26 3628static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev)
bf0f6f24
IM
3629{
3630 /*
3631 * If still on the runqueue then deactivate_task()
3632 * was not called and update_curr() has to be done:
3633 */
3634 if (prev->on_rq)
b7cc0896 3635 update_curr(cfs_rq);
bf0f6f24 3636
d3d9dc33
PT
3637 /* throttle cfs_rqs exceeding runtime */
3638 check_cfs_rq_runtime(cfs_rq);
3639
cb251765
MG
3640 if (schedstat_enabled()) {
3641 check_spread(cfs_rq, prev);
3642 if (prev->on_rq)
3643 update_stats_wait_start(cfs_rq, prev);
3644 }
3645
30cfdcfc 3646 if (prev->on_rq) {
30cfdcfc
DA
3647 /* Put 'current' back into the tree. */
3648 __enqueue_entity(cfs_rq, prev);
9d85f21c 3649 /* in !on_rq case, update occurred at dequeue */
9d89c257 3650 update_load_avg(prev, 0);
30cfdcfc 3651 }
429d43bc 3652 cfs_rq->curr = NULL;
bf0f6f24
IM
3653}
3654
8f4d37ec
PZ
3655static void
3656entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
bf0f6f24 3657{
bf0f6f24 3658 /*
30cfdcfc 3659 * Update run-time statistics of the 'current'.
bf0f6f24 3660 */
30cfdcfc 3661 update_curr(cfs_rq);
bf0f6f24 3662
9d85f21c
PT
3663 /*
3664 * Ensure that runnable average is periodically updated.
3665 */
9d89c257 3666 update_load_avg(curr, 1);
bf0bd948 3667 update_cfs_shares(cfs_rq);
9d85f21c 3668
8f4d37ec
PZ
3669#ifdef CONFIG_SCHED_HRTICK
3670 /*
3671 * queued ticks are scheduled to match the slice, so don't bother
3672 * validating it and just reschedule.
3673 */
983ed7a6 3674 if (queued) {
8875125e 3675 resched_curr(rq_of(cfs_rq));
983ed7a6
HH
3676 return;
3677 }
8f4d37ec
PZ
3678 /*
3679 * don't let the period tick interfere with the hrtick preemption
3680 */
3681 if (!sched_feat(DOUBLE_TICK) &&
3682 hrtimer_active(&rq_of(cfs_rq)->hrtick_timer))
3683 return;
3684#endif
3685
2c2efaed 3686 if (cfs_rq->nr_running > 1)
2e09bf55 3687 check_preempt_tick(cfs_rq, curr);
bf0f6f24
IM
3688}
3689
ab84d31e
PT
3690
3691/**************************************************
3692 * CFS bandwidth control machinery
3693 */
3694
3695#ifdef CONFIG_CFS_BANDWIDTH
029632fb
PZ
3696
3697#ifdef HAVE_JUMP_LABEL
c5905afb 3698static struct static_key __cfs_bandwidth_used;
029632fb
PZ
3699
3700static inline bool cfs_bandwidth_used(void)
3701{
c5905afb 3702 return static_key_false(&__cfs_bandwidth_used);
029632fb
PZ
3703}
3704
1ee14e6c 3705void cfs_bandwidth_usage_inc(void)
029632fb 3706{
1ee14e6c
BS
3707 static_key_slow_inc(&__cfs_bandwidth_used);
3708}
3709
3710void cfs_bandwidth_usage_dec(void)
3711{
3712 static_key_slow_dec(&__cfs_bandwidth_used);
029632fb
PZ
3713}
3714#else /* HAVE_JUMP_LABEL */
3715static bool cfs_bandwidth_used(void)
3716{
3717 return true;
3718}
3719
1ee14e6c
BS
3720void cfs_bandwidth_usage_inc(void) {}
3721void cfs_bandwidth_usage_dec(void) {}
029632fb
PZ
3722#endif /* HAVE_JUMP_LABEL */
3723
ab84d31e
PT
3724/*
3725 * default period for cfs group bandwidth.
3726 * default: 0.1s, units: nanoseconds
3727 */
3728static inline u64 default_cfs_period(void)
3729{
3730 return 100000000ULL;
3731}
ec12cb7f
PT
3732
3733static inline u64 sched_cfs_bandwidth_slice(void)
3734{
3735 return (u64)sysctl_sched_cfs_bandwidth_slice * NSEC_PER_USEC;
3736}
3737
a9cf55b2
PT
3738/*
3739 * Replenish runtime according to assigned quota and update expiration time.
3740 * We use sched_clock_cpu directly instead of rq->clock to avoid adding
3741 * additional synchronization around rq->lock.
3742 *
3743 * requires cfs_b->lock
3744 */
029632fb 3745void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b)
a9cf55b2
PT
3746{
3747 u64 now;
3748
3749 if (cfs_b->quota == RUNTIME_INF)
3750 return;
3751
3752 now = sched_clock_cpu(smp_processor_id());
3753 cfs_b->runtime = cfs_b->quota;
3754 cfs_b->runtime_expires = now + ktime_to_ns(cfs_b->period);
3755}
3756
029632fb
PZ
3757static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
3758{
3759 return &tg->cfs_bandwidth;
3760}
3761
f1b17280
PT
3762/* rq->task_clock normalized against any time this cfs_rq has spent throttled */
3763static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq)
3764{
3765 if (unlikely(cfs_rq->throttle_count))
1a99ae3f 3766 return cfs_rq->throttled_clock_task - cfs_rq->throttled_clock_task_time;
f1b17280 3767
78becc27 3768 return rq_clock_task(rq_of(cfs_rq)) - cfs_rq->throttled_clock_task_time;
f1b17280
PT
3769}
3770
85dac906
PT
3771/* returns 0 on failure to allocate runtime */
3772static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)
ec12cb7f
PT
3773{
3774 struct task_group *tg = cfs_rq->tg;
3775 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg);
a9cf55b2 3776 u64 amount = 0, min_amount, expires;
ec12cb7f
PT
3777
3778 /* note: this is a positive sum as runtime_remaining <= 0 */
3779 min_amount = sched_cfs_bandwidth_slice() - cfs_rq->runtime_remaining;
3780
3781 raw_spin_lock(&cfs_b->lock);
3782 if (cfs_b->quota == RUNTIME_INF)
3783 amount = min_amount;
58088ad0 3784 else {
77a4d1a1 3785 start_cfs_bandwidth(cfs_b);
58088ad0
PT
3786
3787 if (cfs_b->runtime > 0) {
3788 amount = min(cfs_b->runtime, min_amount);
3789 cfs_b->runtime -= amount;
3790 cfs_b->idle = 0;
3791 }
ec12cb7f 3792 }
a9cf55b2 3793 expires = cfs_b->runtime_expires;
ec12cb7f
PT
3794 raw_spin_unlock(&cfs_b->lock);
3795
3796 cfs_rq->runtime_remaining += amount;
a9cf55b2
PT
3797 /*
3798 * we may have advanced our local expiration to account for allowed
3799 * spread between our sched_clock and the one on which runtime was
3800 * issued.
3801 */
3802 if ((s64)(expires - cfs_rq->runtime_expires) > 0)
3803 cfs_rq->runtime_expires = expires;
85dac906
PT
3804
3805 return cfs_rq->runtime_remaining > 0;
ec12cb7f
PT
3806}
3807
a9cf55b2
PT
3808/*
3809 * Note: This depends on the synchronization provided by sched_clock and the
3810 * fact that rq->clock snapshots this value.
3811 */
3812static void expire_cfs_rq_runtime(struct cfs_rq *cfs_rq)
ec12cb7f 3813{
a9cf55b2 3814 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
a9cf55b2
PT
3815
3816 /* if the deadline is ahead of our clock, nothing to do */
78becc27 3817 if (likely((s64)(rq_clock(rq_of(cfs_rq)) - cfs_rq->runtime_expires) < 0))
ec12cb7f
PT
3818 return;
3819
a9cf55b2
PT
3820 if (cfs_rq->runtime_remaining < 0)
3821 return;
3822
3823 /*
3824 * If the local deadline has passed we have to consider the
3825 * possibility that our sched_clock is 'fast' and the global deadline
3826 * has not truly expired.
3827 *
3828 * Fortunately we can check determine whether this the case by checking
51f2176d
BS
3829 * whether the global deadline has advanced. It is valid to compare
3830 * cfs_b->runtime_expires without any locks since we only care about
3831 * exact equality, so a partial write will still work.
a9cf55b2
PT
3832 */
3833
51f2176d 3834 if (cfs_rq->runtime_expires != cfs_b->runtime_expires) {
a9cf55b2
PT
3835 /* extend local deadline, drift is bounded above by 2 ticks */
3836 cfs_rq->runtime_expires += TICK_NSEC;
3837 } else {
3838 /* global deadline is ahead, expiration has passed */
3839 cfs_rq->runtime_remaining = 0;
3840 }
3841}
3842
9dbdb155 3843static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec)
a9cf55b2
PT
3844{
3845 /* dock delta_exec before expiring quota (as it could span periods) */
ec12cb7f 3846 cfs_rq->runtime_remaining -= delta_exec;
a9cf55b2
PT
3847 expire_cfs_rq_runtime(cfs_rq);
3848
3849 if (likely(cfs_rq->runtime_remaining > 0))
ec12cb7f
PT
3850 return;
3851
85dac906
PT
3852 /*
3853 * if we're unable to extend our runtime we resched so that the active
3854 * hierarchy can be throttled
3855 */
3856 if (!assign_cfs_rq_runtime(cfs_rq) && likely(cfs_rq->curr))
8875125e 3857 resched_curr(rq_of(cfs_rq));
ec12cb7f
PT
3858}
3859
6c16a6dc 3860static __always_inline
9dbdb155 3861void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec)
ec12cb7f 3862{
56f570e5 3863 if (!cfs_bandwidth_used() || !cfs_rq->runtime_enabled)
ec12cb7f
PT
3864 return;
3865
3866 __account_cfs_rq_runtime(cfs_rq, delta_exec);
3867}
3868
85dac906
PT
3869static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
3870{
56f570e5 3871 return cfs_bandwidth_used() && cfs_rq->throttled;
85dac906
PT
3872}
3873
64660c86
PT
3874/* check whether cfs_rq, or any parent, is throttled */
3875static inline int throttled_hierarchy(struct cfs_rq *cfs_rq)
3876{
56f570e5 3877 return cfs_bandwidth_used() && cfs_rq->throttle_count;
64660c86
PT
3878}
3879
3880/*
3881 * Ensure that neither of the group entities corresponding to src_cpu or
3882 * dest_cpu are members of a throttled hierarchy when performing group
3883 * load-balance operations.
3884 */
3885static inline int throttled_lb_pair(struct task_group *tg,
3886 int src_cpu, int dest_cpu)
3887{
3888 struct cfs_rq *src_cfs_rq, *dest_cfs_rq;
3889
3890 src_cfs_rq = tg->cfs_rq[src_cpu];
3891 dest_cfs_rq = tg->cfs_rq[dest_cpu];
3892
3893 return throttled_hierarchy(src_cfs_rq) ||
3894 throttled_hierarchy(dest_cfs_rq);
3895}
3896
3897/* updated child weight may affect parent so we have to do this bottom up */
3898static int tg_unthrottle_up(struct task_group *tg, void *data)
3899{
3900 struct rq *rq = data;
3901 struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
3902
3903 cfs_rq->throttle_count--;
64660c86 3904 if (!cfs_rq->throttle_count) {
f1b17280 3905 /* adjust cfs_rq_clock_task() */
78becc27 3906 cfs_rq->throttled_clock_task_time += rq_clock_task(rq) -
f1b17280 3907 cfs_rq->throttled_clock_task;
64660c86 3908 }
64660c86
PT
3909
3910 return 0;
3911}
3912
3913static int tg_throttle_down(struct task_group *tg, void *data)
3914{
3915 struct rq *rq = data;
3916 struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
3917
82958366
PT
3918 /* group is entering throttled state, stop time */
3919 if (!cfs_rq->throttle_count)
78becc27 3920 cfs_rq->throttled_clock_task = rq_clock_task(rq);
64660c86
PT
3921 cfs_rq->throttle_count++;
3922
3923 return 0;
3924}
3925
d3d9dc33 3926static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
85dac906
PT
3927{
3928 struct rq *rq = rq_of(cfs_rq);
3929 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
3930 struct sched_entity *se;
3931 long task_delta, dequeue = 1;
77a4d1a1 3932 bool empty;
85dac906
PT
3933
3934 se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))];
3935
f1b17280 3936 /* freeze hierarchy runnable averages while throttled */
64660c86
PT
3937 rcu_read_lock();
3938 walk_tg_tree_from(cfs_rq->tg, tg_throttle_down, tg_nop, (void *)rq);
3939 rcu_read_unlock();
85dac906
PT
3940
3941 task_delta = cfs_rq->h_nr_running;
3942 for_each_sched_entity(se) {
3943 struct cfs_rq *qcfs_rq = cfs_rq_of(se);
3944 /* throttled entity or throttle-on-deactivate */
3945 if (!se->on_rq)
3946 break;
3947
3948 if (dequeue)
3949 dequeue_entity(qcfs_rq, se, DEQUEUE_SLEEP);
3950 qcfs_rq->h_nr_running -= task_delta;
3951
3952 if (qcfs_rq->load.weight)
3953 dequeue = 0;
3954 }
3955
3956 if (!se)
72465447 3957 sub_nr_running(rq, task_delta);
85dac906
PT
3958
3959 cfs_rq->throttled = 1;
78becc27 3960 cfs_rq->throttled_clock = rq_clock(rq);
85dac906 3961 raw_spin_lock(&cfs_b->lock);
d49db342 3962 empty = list_empty(&cfs_b->throttled_cfs_rq);
77a4d1a1 3963
c06f04c7
BS
3964 /*
3965 * Add to the _head_ of the list, so that an already-started
3966 * distribute_cfs_runtime will not see us
3967 */
3968 list_add_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq);
77a4d1a1
PZ
3969
3970 /*
3971 * If we're the first throttled task, make sure the bandwidth
3972 * timer is running.
3973 */
3974 if (empty)
3975 start_cfs_bandwidth(cfs_b);
3976
85dac906
PT
3977 raw_spin_unlock(&cfs_b->lock);
3978}
3979
029632fb 3980void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
671fd9da
PT
3981{
3982 struct rq *rq = rq_of(cfs_rq);
3983 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
3984 struct sched_entity *se;
3985 int enqueue = 1;
3986 long task_delta;
3987
22b958d8 3988 se = cfs_rq->tg->se[cpu_of(rq)];
671fd9da
PT
3989
3990 cfs_rq->throttled = 0;
1a55af2e
FW
3991
3992 update_rq_clock(rq);
3993
671fd9da 3994 raw_spin_lock(&cfs_b->lock);
78becc27 3995 cfs_b->throttled_time += rq_clock(rq) - cfs_rq->throttled_clock;
671fd9da
PT
3996 list_del_rcu(&cfs_rq->throttled_list);
3997 raw_spin_unlock(&cfs_b->lock);
3998
64660c86
PT
3999 /* update hierarchical throttle state */
4000 walk_tg_tree_from(cfs_rq->tg, tg_nop, tg_unthrottle_up, (void *)rq);
4001
671fd9da
PT
4002 if (!cfs_rq->load.weight)
4003 return;
4004
4005 task_delta = cfs_rq->h_nr_running;
4006 for_each_sched_entity(se) {
4007 if (se->on_rq)
4008 enqueue = 0;
4009
4010 cfs_rq = cfs_rq_of(se);
4011 if (enqueue)
4012 enqueue_entity(cfs_rq, se, ENQUEUE_WAKEUP);
4013 cfs_rq->h_nr_running += task_delta;
4014
4015 if (cfs_rq_throttled(cfs_rq))
4016 break;
4017 }
4018
4019 if (!se)
72465447 4020 add_nr_running(rq, task_delta);
671fd9da
PT
4021
4022 /* determine whether we need to wake up potentially idle cpu */
4023 if (rq->curr == rq->idle && rq->cfs.nr_running)
8875125e 4024 resched_curr(rq);
671fd9da
PT
4025}
4026
4027static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b,
4028 u64 remaining, u64 expires)
4029{
4030 struct cfs_rq *cfs_rq;
c06f04c7
BS
4031 u64 runtime;
4032 u64 starting_runtime = remaining;
671fd9da
PT
4033
4034 rcu_read_lock();
4035 list_for_each_entry_rcu(cfs_rq, &cfs_b->throttled_cfs_rq,
4036 throttled_list) {
4037 struct rq *rq = rq_of(cfs_rq);
4038
4039 raw_spin_lock(&rq->lock);
4040 if (!cfs_rq_throttled(cfs_rq))
4041 goto next;
4042
4043 runtime = -cfs_rq->runtime_remaining + 1;
4044 if (runtime > remaining)
4045 runtime = remaining;
4046 remaining -= runtime;
4047
4048 cfs_rq->runtime_remaining += runtime;
4049 cfs_rq->runtime_expires = expires;
4050
4051 /* we check whether we're throttled above */
4052 if (cfs_rq->runtime_remaining > 0)
4053 unthrottle_cfs_rq(cfs_rq);
4054
4055next:
4056 raw_spin_unlock(&rq->lock);
4057
4058 if (!remaining)
4059 break;
4060 }
4061 rcu_read_unlock();
4062
c06f04c7 4063 return starting_runtime - remaining;
671fd9da
PT
4064}
4065
58088ad0
PT
4066/*
4067 * Responsible for refilling a task_group's bandwidth and unthrottling its
4068 * cfs_rqs as appropriate. If there has been no activity within the last
4069 * period the timer is deactivated until scheduling resumes; cfs_b->idle is
4070 * used to track this state.
4071 */
4072static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun)
4073{
671fd9da 4074 u64 runtime, runtime_expires;
51f2176d 4075 int throttled;
58088ad0 4076
58088ad0
PT
4077 /* no need to continue the timer with no bandwidth constraint */
4078 if (cfs_b->quota == RUNTIME_INF)
51f2176d 4079 goto out_deactivate;
58088ad0 4080
671fd9da 4081 throttled = !list_empty(&cfs_b->throttled_cfs_rq);
e8da1b18 4082 cfs_b->nr_periods += overrun;
671fd9da 4083
51f2176d
BS
4084 /*
4085 * idle depends on !throttled (for the case of a large deficit), and if
4086 * we're going inactive then everything else can be deferred
4087 */
4088 if (cfs_b->idle && !throttled)
4089 goto out_deactivate;
a9cf55b2
PT
4090
4091 __refill_cfs_bandwidth_runtime(cfs_b);
4092
671fd9da
PT
4093 if (!throttled) {
4094 /* mark as potentially idle for the upcoming period */
4095 cfs_b->idle = 1;
51f2176d 4096 return 0;
671fd9da
PT
4097 }
4098
e8da1b18
NR
4099 /* account preceding periods in which throttling occurred */
4100 cfs_b->nr_throttled += overrun;
4101
671fd9da 4102 runtime_expires = cfs_b->runtime_expires;
671fd9da
PT
4103
4104 /*
c06f04c7
BS
4105 * This check is repeated as we are holding onto the new bandwidth while
4106 * we unthrottle. This can potentially race with an unthrottled group
4107 * trying to acquire new bandwidth from the global pool. This can result
4108 * in us over-using our runtime if it is all used during this loop, but
4109 * only by limited amounts in that extreme case.
671fd9da 4110 */
c06f04c7
BS
4111 while (throttled && cfs_b->runtime > 0) {
4112 runtime = cfs_b->runtime;
671fd9da
PT
4113 raw_spin_unlock(&cfs_b->lock);
4114 /* we can't nest cfs_b->lock while distributing bandwidth */
4115 runtime = distribute_cfs_runtime(cfs_b, runtime,
4116 runtime_expires);
4117 raw_spin_lock(&cfs_b->lock);
4118
4119 throttled = !list_empty(&cfs_b->throttled_cfs_rq);
c06f04c7
BS
4120
4121 cfs_b->runtime -= min(runtime, cfs_b->runtime);
671fd9da 4122 }
58088ad0 4123
671fd9da
PT
4124 /*
4125 * While we are ensured activity in the period following an
4126 * unthrottle, this also covers the case in which the new bandwidth is
4127 * insufficient to cover the existing bandwidth deficit. (Forcing the
4128 * timer to remain active while there are any throttled entities.)
4129 */
4130 cfs_b->idle = 0;
58088ad0 4131
51f2176d
BS
4132 return 0;
4133
4134out_deactivate:
51f2176d 4135 return 1;
58088ad0 4136}
d3d9dc33 4137
d8b4986d
PT
4138/* a cfs_rq won't donate quota below this amount */
4139static const u64 min_cfs_rq_runtime = 1 * NSEC_PER_MSEC;
4140/* minimum remaining period time to redistribute slack quota */
4141static const u64 min_bandwidth_expiration = 2 * NSEC_PER_MSEC;
4142/* how long we wait to gather additional slack before distributing */
4143static const u64 cfs_bandwidth_slack_period = 5 * NSEC_PER_MSEC;
4144
db06e78c
BS
4145/*
4146 * Are we near the end of the current quota period?
4147 *
4148 * Requires cfs_b->lock for hrtimer_expires_remaining to be safe against the
4961b6e1 4149 * hrtimer base being cleared by hrtimer_start. In the case of
db06e78c
BS
4150 * migrate_hrtimers, base is never cleared, so we are fine.
4151 */
d8b4986d
PT
4152static int runtime_refresh_within(struct cfs_bandwidth *cfs_b, u64 min_expire)
4153{
4154 struct hrtimer *refresh_timer = &cfs_b->period_timer;
4155 u64 remaining;
4156
4157 /* if the call-back is running a quota refresh is already occurring */
4158 if (hrtimer_callback_running(refresh_timer))
4159 return 1;
4160
4161 /* is a quota refresh about to occur? */
4162 remaining = ktime_to_ns(hrtimer_expires_remaining(refresh_timer));
4163 if (remaining < min_expire)
4164 return 1;
4165
4166 return 0;
4167}
4168
4169static void start_cfs_slack_bandwidth(struct cfs_bandwidth *cfs_b)
4170{
4171 u64 min_left = cfs_bandwidth_slack_period + min_bandwidth_expiration;
4172
4173 /* if there's a quota refresh soon don't bother with slack */
4174 if (runtime_refresh_within(cfs_b, min_left))
4175 return;
4176
4cfafd30
PZ
4177 hrtimer_start(&cfs_b->slack_timer,
4178 ns_to_ktime(cfs_bandwidth_slack_period),
4179 HRTIMER_MODE_REL);
d8b4986d
PT
4180}
4181
4182/* we know any runtime found here is valid as update_curr() precedes return */
4183static void __return_cfs_rq_runtime(struct cfs_rq *cfs_rq)
4184{
4185 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
4186 s64 slack_runtime = cfs_rq->runtime_remaining - min_cfs_rq_runtime;
4187
4188 if (slack_runtime <= 0)
4189 return;
4190
4191 raw_spin_lock(&cfs_b->lock);
4192 if (cfs_b->quota != RUNTIME_INF &&
4193 cfs_rq->runtime_expires == cfs_b->runtime_expires) {
4194 cfs_b->runtime += slack_runtime;
4195
4196 /* we are under rq->lock, defer unthrottling using a timer */
4197 if (cfs_b->runtime > sched_cfs_bandwidth_slice() &&
4198 !list_empty(&cfs_b->throttled_cfs_rq))
4199 start_cfs_slack_bandwidth(cfs_b);
4200 }
4201 raw_spin_unlock(&cfs_b->lock);
4202
4203 /* even if it's not valid for return we don't want to try again */
4204 cfs_rq->runtime_remaining -= slack_runtime;
4205}
4206
4207static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq)
4208{
56f570e5
PT
4209 if (!cfs_bandwidth_used())
4210 return;
4211
fccfdc6f 4212 if (!cfs_rq->runtime_enabled || cfs_rq->nr_running)
d8b4986d
PT
4213 return;
4214
4215 __return_cfs_rq_runtime(cfs_rq);
4216}
4217
4218/*
4219 * This is done with a timer (instead of inline with bandwidth return) since
4220 * it's necessary to juggle rq->locks to unthrottle their respective cfs_rqs.
4221 */
4222static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)
4223{
4224 u64 runtime = 0, slice = sched_cfs_bandwidth_slice();
4225 u64 expires;
4226
4227 /* confirm we're still not at a refresh boundary */
db06e78c
BS
4228 raw_spin_lock(&cfs_b->lock);
4229 if (runtime_refresh_within(cfs_b, min_bandwidth_expiration)) {
4230 raw_spin_unlock(&cfs_b->lock);
d8b4986d 4231 return;
db06e78c 4232 }
d8b4986d 4233
c06f04c7 4234 if (cfs_b->quota != RUNTIME_INF && cfs_b->runtime > slice)
d8b4986d 4235 runtime = cfs_b->runtime;
c06f04c7 4236
d8b4986d
PT
4237 expires = cfs_b->runtime_expires;
4238 raw_spin_unlock(&cfs_b->lock);
4239
4240 if (!runtime)
4241 return;
4242
4243 runtime = distribute_cfs_runtime(cfs_b, runtime, expires);
4244
4245 raw_spin_lock(&cfs_b->lock);
4246 if (expires == cfs_b->runtime_expires)
c06f04c7 4247 cfs_b->runtime -= min(runtime, cfs_b->runtime);
d8b4986d
PT
4248 raw_spin_unlock(&cfs_b->lock);
4249}
4250
d3d9dc33
PT
4251/*
4252 * When a group wakes up we want to make sure that its quota is not already
4253 * expired/exceeded, otherwise it may be allowed to steal additional ticks of
4254 * runtime as update_curr() throttling can not not trigger until it's on-rq.
4255 */
4256static void check_enqueue_throttle(struct cfs_rq *cfs_rq)
4257{
56f570e5
PT
4258 if (!cfs_bandwidth_used())
4259 return;
4260
d3d9dc33
PT
4261 /* an active group must be handled by the update_curr()->put() path */
4262 if (!cfs_rq->runtime_enabled || cfs_rq->curr)
4263 return;
4264
4265 /* ensure the group is not already throttled */
4266 if (cfs_rq_throttled(cfs_rq))
4267 return;
4268
4269 /* update runtime allocation */
4270 account_cfs_rq_runtime(cfs_rq, 0);
4271 if (cfs_rq->runtime_remaining <= 0)
4272 throttle_cfs_rq(cfs_rq);
4273}
4274
55e16d30
PZ
4275static void sync_throttle(struct task_group *tg, int cpu)
4276{
4277 struct cfs_rq *pcfs_rq, *cfs_rq;
4278
4279 if (!cfs_bandwidth_used())
4280 return;
4281
4282 if (!tg->parent)
4283 return;
4284
4285 cfs_rq = tg->cfs_rq[cpu];
4286 pcfs_rq = tg->parent->cfs_rq[cpu];
4287
4288 cfs_rq->throttle_count = pcfs_rq->throttle_count;
b8922125 4289 cfs_rq->throttled_clock_task = rq_clock_task(cpu_rq(cpu));
55e16d30
PZ
4290}
4291
d3d9dc33 4292/* conditionally throttle active cfs_rq's from put_prev_entity() */
678d5718 4293static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq)
d3d9dc33 4294{
56f570e5 4295 if (!cfs_bandwidth_used())
678d5718 4296 return false;
56f570e5 4297
d3d9dc33 4298 if (likely(!cfs_rq->runtime_enabled || cfs_rq->runtime_remaining > 0))
678d5718 4299 return false;
d3d9dc33
PT
4300
4301 /*
4302 * it's possible for a throttled entity to be forced into a running
4303 * state (e.g. set_curr_task), in this case we're finished.
4304 */
4305 if (cfs_rq_throttled(cfs_rq))
678d5718 4306 return true;
d3d9dc33
PT
4307
4308 throttle_cfs_rq(cfs_rq);
678d5718 4309 return true;
d3d9dc33 4310}
029632fb 4311
029632fb
PZ
4312static enum hrtimer_restart sched_cfs_slack_timer(struct hrtimer *timer)
4313{
4314 struct cfs_bandwidth *cfs_b =
4315 container_of(timer, struct cfs_bandwidth, slack_timer);
77a4d1a1 4316
029632fb
PZ
4317 do_sched_cfs_slack_timer(cfs_b);
4318
4319 return HRTIMER_NORESTART;
4320}
4321
4322static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer)
4323{
4324 struct cfs_bandwidth *cfs_b =
4325 container_of(timer, struct cfs_bandwidth, period_timer);
029632fb
PZ
4326 int overrun;
4327 int idle = 0;
4328
51f2176d 4329 raw_spin_lock(&cfs_b->lock);
029632fb 4330 for (;;) {
77a4d1a1 4331 overrun = hrtimer_forward_now(timer, cfs_b->period);
029632fb
PZ
4332 if (!overrun)
4333 break;
4334
4335 idle = do_sched_cfs_period_timer(cfs_b, overrun);
4336 }
4cfafd30
PZ
4337 if (idle)
4338 cfs_b->period_active = 0;
51f2176d 4339 raw_spin_unlock(&cfs_b->lock);
029632fb
PZ
4340
4341 return idle ? HRTIMER_NORESTART : HRTIMER_RESTART;
4342}
4343
4344void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
4345{
4346 raw_spin_lock_init(&cfs_b->lock);
4347 cfs_b->runtime = 0;
4348 cfs_b->quota = RUNTIME_INF;
4349 cfs_b->period = ns_to_ktime(default_cfs_period());
4350
4351 INIT_LIST_HEAD(&cfs_b->throttled_cfs_rq);
4cfafd30 4352 hrtimer_init(&cfs_b->period_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED);
029632fb
PZ
4353 cfs_b->period_timer.function = sched_cfs_period_timer;
4354 hrtimer_init(&cfs_b->slack_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
4355 cfs_b->slack_timer.function = sched_cfs_slack_timer;
4356}
4357
4358static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq)
4359{
4360 cfs_rq->runtime_enabled = 0;
4361 INIT_LIST_HEAD(&cfs_rq->throttled_list);
4362}
4363
77a4d1a1 4364void start_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
029632fb 4365{
4cfafd30 4366 lockdep_assert_held(&cfs_b->lock);
029632fb 4367
4cfafd30
PZ
4368 if (!cfs_b->period_active) {
4369 cfs_b->period_active = 1;
4370 hrtimer_forward_now(&cfs_b->period_timer, cfs_b->period);
4371 hrtimer_start_expires(&cfs_b->period_timer, HRTIMER_MODE_ABS_PINNED);
4372 }
029632fb
PZ
4373}
4374
4375static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
4376{
7f1a169b
TH
4377 /* init_cfs_bandwidth() was not called */
4378 if (!cfs_b->throttled_cfs_rq.next)
4379 return;
4380
029632fb
PZ
4381 hrtimer_cancel(&cfs_b->period_timer);
4382 hrtimer_cancel(&cfs_b->slack_timer);
4383}
4384
0e59bdae
KT
4385static void __maybe_unused update_runtime_enabled(struct rq *rq)
4386{
4387 struct cfs_rq *cfs_rq;
4388
4389 for_each_leaf_cfs_rq(rq, cfs_rq) {
4390 struct cfs_bandwidth *cfs_b = &cfs_rq->tg->cfs_bandwidth;
4391
4392 raw_spin_lock(&cfs_b->lock);
4393 cfs_rq->runtime_enabled = cfs_b->quota != RUNTIME_INF;
4394 raw_spin_unlock(&cfs_b->lock);
4395 }
4396}
4397
38dc3348 4398static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq)
029632fb
PZ
4399{
4400 struct cfs_rq *cfs_rq;
4401
4402 for_each_leaf_cfs_rq(rq, cfs_rq) {
029632fb
PZ
4403 if (!cfs_rq->runtime_enabled)
4404 continue;
4405
4406 /*
4407 * clock_task is not advancing so we just need to make sure
4408 * there's some valid quota amount
4409 */
51f2176d 4410 cfs_rq->runtime_remaining = 1;
0e59bdae
KT
4411 /*
4412 * Offline rq is schedulable till cpu is completely disabled
4413 * in take_cpu_down(), so we prevent new cfs throttling here.
4414 */
4415 cfs_rq->runtime_enabled = 0;
4416
029632fb
PZ
4417 if (cfs_rq_throttled(cfs_rq))
4418 unthrottle_cfs_rq(cfs_rq);
4419 }
4420}
4421
4422#else /* CONFIG_CFS_BANDWIDTH */
f1b17280
PT
4423static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq)
4424{
78becc27 4425 return rq_clock_task(rq_of(cfs_rq));
f1b17280
PT
4426}
4427
9dbdb155 4428static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec) {}
678d5718 4429static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq) { return false; }
d3d9dc33 4430static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {}
55e16d30 4431static inline void sync_throttle(struct task_group *tg, int cpu) {}
6c16a6dc 4432static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
85dac906
PT
4433
4434static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
4435{
4436 return 0;
4437}
64660c86
PT
4438
4439static inline int throttled_hierarchy(struct cfs_rq *cfs_rq)
4440{
4441 return 0;
4442}
4443
4444static inline int throttled_lb_pair(struct task_group *tg,
4445 int src_cpu, int dest_cpu)
4446{
4447 return 0;
4448}
029632fb
PZ
4449
4450void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {}
4451
4452#ifdef CONFIG_FAIR_GROUP_SCHED
4453static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
ab84d31e
PT
4454#endif
4455
029632fb
PZ
4456static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
4457{
4458 return NULL;
4459}
4460static inline void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {}
0e59bdae 4461static inline void update_runtime_enabled(struct rq *rq) {}
a4c96ae3 4462static inline void unthrottle_offline_cfs_rqs(struct rq *rq) {}
029632fb
PZ
4463
4464#endif /* CONFIG_CFS_BANDWIDTH */
4465
bf0f6f24
IM
4466/**************************************************
4467 * CFS operations on tasks:
4468 */
4469
8f4d37ec
PZ
4470#ifdef CONFIG_SCHED_HRTICK
4471static void hrtick_start_fair(struct rq *rq, struct task_struct *p)
4472{
8f4d37ec
PZ
4473 struct sched_entity *se = &p->se;
4474 struct cfs_rq *cfs_rq = cfs_rq_of(se);
4475
4476 WARN_ON(task_rq(p) != rq);
4477
b39e66ea 4478 if (cfs_rq->nr_running > 1) {
8f4d37ec
PZ
4479 u64 slice = sched_slice(cfs_rq, se);
4480 u64 ran = se->sum_exec_runtime - se->prev_sum_exec_runtime;
4481 s64 delta = slice - ran;
4482
4483 if (delta < 0) {
4484 if (rq->curr == p)
8875125e 4485 resched_curr(rq);
8f4d37ec
PZ
4486 return;
4487 }
31656519 4488 hrtick_start(rq, delta);
8f4d37ec
PZ
4489 }
4490}
a4c2f00f
PZ
4491
4492/*
4493 * called from enqueue/dequeue and updates the hrtick when the
4494 * current task is from our class and nr_running is low enough
4495 * to matter.
4496 */
4497static void hrtick_update(struct rq *rq)
4498{
4499 struct task_struct *curr = rq->curr;
4500
b39e66ea 4501 if (!hrtick_enabled(rq) || curr->sched_class != &fair_sched_class)
a4c2f00f
PZ
4502 return;
4503
4504 if (cfs_rq_of(&curr->se)->nr_running < sched_nr_latency)
4505 hrtick_start_fair(rq, curr);
4506}
55e12e5e 4507#else /* !CONFIG_SCHED_HRTICK */
8f4d37ec
PZ
4508static inline void
4509hrtick_start_fair(struct rq *rq, struct task_struct *p)
4510{
4511}
a4c2f00f
PZ
4512
4513static inline void hrtick_update(struct rq *rq)
4514{
4515}
8f4d37ec
PZ
4516#endif
4517
bf0f6f24
IM
4518/*
4519 * The enqueue_task method is called before nr_running is
4520 * increased. Here we update the fair scheduling stats and
4521 * then put the task into the rbtree:
4522 */
ea87bb78 4523static void
371fd7e7 4524enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
bf0f6f24
IM
4525{
4526 struct cfs_rq *cfs_rq;
62fb1851 4527 struct sched_entity *se = &p->se;
bf0f6f24
IM
4528
4529 for_each_sched_entity(se) {
62fb1851 4530 if (se->on_rq)
bf0f6f24
IM
4531 break;
4532 cfs_rq = cfs_rq_of(se);
88ec22d3 4533 enqueue_entity(cfs_rq, se, flags);
85dac906
PT
4534
4535 /*
4536 * end evaluation on encountering a throttled cfs_rq
4537 *
4538 * note: in the case of encountering a throttled cfs_rq we will
4539 * post the final h_nr_running increment below.
e210bffd 4540 */
85dac906
PT
4541 if (cfs_rq_throttled(cfs_rq))
4542 break;
953bfcd1 4543 cfs_rq->h_nr_running++;
85dac906 4544
88ec22d3 4545 flags = ENQUEUE_WAKEUP;
bf0f6f24 4546 }
8f4d37ec 4547
2069dd75 4548 for_each_sched_entity(se) {
0f317143 4549 cfs_rq = cfs_rq_of(se);
953bfcd1 4550 cfs_rq->h_nr_running++;
2069dd75 4551
85dac906
PT
4552 if (cfs_rq_throttled(cfs_rq))
4553 break;
4554
9d89c257 4555 update_load_avg(se, 1);
17bc14b7 4556 update_cfs_shares(cfs_rq);
2069dd75
PZ
4557 }
4558
cd126afe 4559 if (!se)
72465447 4560 add_nr_running(rq, 1);
cd126afe 4561
a4c2f00f 4562 hrtick_update(rq);
bf0f6f24
IM
4563}
4564
2f36825b
VP
4565static void set_next_buddy(struct sched_entity *se);
4566
bf0f6f24
IM
4567/*
4568 * The dequeue_task method is called before nr_running is
4569 * decreased. We remove the task from the rbtree and
4570 * update the fair scheduling stats:
4571 */
371fd7e7 4572static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
bf0f6f24
IM
4573{
4574 struct cfs_rq *cfs_rq;
62fb1851 4575 struct sched_entity *se = &p->se;
2f36825b 4576 int task_sleep = flags & DEQUEUE_SLEEP;
bf0f6f24
IM
4577
4578 for_each_sched_entity(se) {
4579 cfs_rq = cfs_rq_of(se);
371fd7e7 4580 dequeue_entity(cfs_rq, se, flags);
85dac906
PT
4581
4582 /*
4583 * end evaluation on encountering a throttled cfs_rq
4584 *
4585 * note: in the case of encountering a throttled cfs_rq we will
4586 * post the final h_nr_running decrement below.
4587 */
4588 if (cfs_rq_throttled(cfs_rq))
4589 break;
953bfcd1 4590 cfs_rq->h_nr_running--;
2069dd75 4591
bf0f6f24 4592 /* Don't dequeue parent if it has other entities besides us */
2f36825b 4593 if (cfs_rq->load.weight) {
754bd598
KK
4594 /* Avoid re-evaluating load for this entity: */
4595 se = parent_entity(se);
2f36825b
VP
4596 /*
4597 * Bias pick_next to pick a task from this cfs_rq, as
4598 * p is sleeping when it is within its sched_slice.
4599 */
754bd598
KK
4600 if (task_sleep && se && !throttled_hierarchy(cfs_rq))
4601 set_next_buddy(se);
bf0f6f24 4602 break;
2f36825b 4603 }
371fd7e7 4604 flags |= DEQUEUE_SLEEP;
bf0f6f24 4605 }
8f4d37ec 4606
2069dd75 4607 for_each_sched_entity(se) {
0f317143 4608 cfs_rq = cfs_rq_of(se);
953bfcd1 4609 cfs_rq->h_nr_running--;
2069dd75 4610
85dac906
PT
4611 if (cfs_rq_throttled(cfs_rq))
4612 break;
4613
9d89c257 4614 update_load_avg(se, 1);
17bc14b7 4615 update_cfs_shares(cfs_rq);
2069dd75
PZ
4616 }
4617
cd126afe 4618 if (!se)
72465447 4619 sub_nr_running(rq, 1);
cd126afe 4620
a4c2f00f 4621 hrtick_update(rq);
bf0f6f24
IM
4622}
4623
e7693a36 4624#ifdef CONFIG_SMP
9fd81dd5 4625#ifdef CONFIG_NO_HZ_COMMON
3289bdb4
PZ
4626/*
4627 * per rq 'load' arrray crap; XXX kill this.
4628 */
4629
4630/*
d937cdc5 4631 * The exact cpuload calculated at every tick would be:
3289bdb4 4632 *
d937cdc5
PZ
4633 * load' = (1 - 1/2^i) * load + (1/2^i) * cur_load
4634 *
4635 * If a cpu misses updates for n ticks (as it was idle) and update gets
4636 * called on the n+1-th tick when cpu may be busy, then we have:
4637 *
4638 * load_n = (1 - 1/2^i)^n * load_0
4639 * load_n+1 = (1 - 1/2^i) * load_n + (1/2^i) * cur_load
3289bdb4
PZ
4640 *
4641 * decay_load_missed() below does efficient calculation of
3289bdb4 4642 *
d937cdc5
PZ
4643 * load' = (1 - 1/2^i)^n * load
4644 *
4645 * Because x^(n+m) := x^n * x^m we can decompose any x^n in power-of-2 factors.
4646 * This allows us to precompute the above in said factors, thereby allowing the
4647 * reduction of an arbitrary n in O(log_2 n) steps. (See also
4648 * fixed_power_int())
3289bdb4 4649 *
d937cdc5 4650 * The calculation is approximated on a 128 point scale.
3289bdb4
PZ
4651 */
4652#define DEGRADE_SHIFT 7
d937cdc5
PZ
4653
4654static const u8 degrade_zero_ticks[CPU_LOAD_IDX_MAX] = {0, 8, 32, 64, 128};
4655static const u8 degrade_factor[CPU_LOAD_IDX_MAX][DEGRADE_SHIFT + 1] = {
4656 { 0, 0, 0, 0, 0, 0, 0, 0 },
4657 { 64, 32, 8, 0, 0, 0, 0, 0 },
4658 { 96, 72, 40, 12, 1, 0, 0, 0 },
4659 { 112, 98, 75, 43, 15, 1, 0, 0 },
4660 { 120, 112, 98, 76, 45, 16, 2, 0 }
4661};
3289bdb4
PZ
4662
4663/*
4664 * Update cpu_load for any missed ticks, due to tickless idle. The backlog
4665 * would be when CPU is idle and so we just decay the old load without
4666 * adding any new load.
4667 */
4668static unsigned long
4669decay_load_missed(unsigned long load, unsigned long missed_updates, int idx)
4670{
4671 int j = 0;
4672
4673 if (!missed_updates)
4674 return load;
4675
4676 if (missed_updates >= degrade_zero_ticks[idx])
4677 return 0;
4678
4679 if (idx == 1)
4680 return load >> missed_updates;
4681
4682 while (missed_updates) {
4683 if (missed_updates % 2)
4684 load = (load * degrade_factor[idx][j]) >> DEGRADE_SHIFT;
4685
4686 missed_updates >>= 1;
4687 j++;
4688 }
4689 return load;
4690}
9fd81dd5 4691#endif /* CONFIG_NO_HZ_COMMON */
3289bdb4 4692
59543275 4693/**
cee1afce 4694 * __cpu_load_update - update the rq->cpu_load[] statistics
59543275
BP
4695 * @this_rq: The rq to update statistics for
4696 * @this_load: The current load
4697 * @pending_updates: The number of missed updates
59543275 4698 *
3289bdb4 4699 * Update rq->cpu_load[] statistics. This function is usually called every
59543275
BP
4700 * scheduler tick (TICK_NSEC).
4701 *
4702 * This function computes a decaying average:
4703 *
4704 * load[i]' = (1 - 1/2^i) * load[i] + (1/2^i) * load
4705 *
4706 * Because of NOHZ it might not get called on every tick which gives need for
4707 * the @pending_updates argument.
4708 *
4709 * load[i]_n = (1 - 1/2^i) * load[i]_n-1 + (1/2^i) * load_n-1
4710 * = A * load[i]_n-1 + B ; A := (1 - 1/2^i), B := (1/2^i) * load
4711 * = A * (A * load[i]_n-2 + B) + B
4712 * = A * (A * (A * load[i]_n-3 + B) + B) + B
4713 * = A^3 * load[i]_n-3 + (A^2 + A + 1) * B
4714 * = A^n * load[i]_0 + (A^(n-1) + A^(n-2) + ... + 1) * B
4715 * = A^n * load[i]_0 + ((1 - A^n) / (1 - A)) * B
4716 * = (1 - 1/2^i)^n * (load[i]_0 - load) + load
4717 *
4718 * In the above we've assumed load_n := load, which is true for NOHZ_FULL as
4719 * any change in load would have resulted in the tick being turned back on.
4720 *
4721 * For regular NOHZ, this reduces to:
4722 *
4723 * load[i]_n = (1 - 1/2^i)^n * load[i]_0
4724 *
4725 * see decay_load_misses(). For NOHZ_FULL we get to subtract and add the extra
1f41906a 4726 * term.
3289bdb4 4727 */
1f41906a
FW
4728static void cpu_load_update(struct rq *this_rq, unsigned long this_load,
4729 unsigned long pending_updates)
3289bdb4 4730{
9fd81dd5 4731 unsigned long __maybe_unused tickless_load = this_rq->cpu_load[0];
3289bdb4
PZ
4732 int i, scale;
4733
4734 this_rq->nr_load_updates++;
4735
4736 /* Update our load: */
4737 this_rq->cpu_load[0] = this_load; /* Fasttrack for idx 0 */
4738 for (i = 1, scale = 2; i < CPU_LOAD_IDX_MAX; i++, scale += scale) {
4739 unsigned long old_load, new_load;
4740
4741 /* scale is effectively 1 << i now, and >> i divides by scale */
4742
7400d3bb 4743 old_load = this_rq->cpu_load[i];
9fd81dd5 4744#ifdef CONFIG_NO_HZ_COMMON
3289bdb4 4745 old_load = decay_load_missed(old_load, pending_updates - 1, i);
7400d3bb
BP
4746 if (tickless_load) {
4747 old_load -= decay_load_missed(tickless_load, pending_updates - 1, i);
4748 /*
4749 * old_load can never be a negative value because a
4750 * decayed tickless_load cannot be greater than the
4751 * original tickless_load.
4752 */
4753 old_load += tickless_load;
4754 }
9fd81dd5 4755#endif
3289bdb4
PZ
4756 new_load = this_load;
4757 /*
4758 * Round up the averaging division if load is increasing. This
4759 * prevents us from getting stuck on 9 if the load is 10, for
4760 * example.
4761 */
4762 if (new_load > old_load)
4763 new_load += scale - 1;
4764
4765 this_rq->cpu_load[i] = (old_load * (scale - 1) + new_load) >> i;
4766 }
4767
4768 sched_avg_update(this_rq);
4769}
4770
7ea241af
YD
4771/* Used instead of source_load when we know the type == 0 */
4772static unsigned long weighted_cpuload(const int cpu)
4773{
4774 return cfs_rq_runnable_load_avg(&cpu_rq(cpu)->cfs);
4775}
4776
3289bdb4 4777#ifdef CONFIG_NO_HZ_COMMON
1f41906a
FW
4778/*
4779 * There is no sane way to deal with nohz on smp when using jiffies because the
4780 * cpu doing the jiffies update might drift wrt the cpu doing the jiffy reading
4781 * causing off-by-one errors in observed deltas; {0,2} instead of {1,1}.
4782 *
4783 * Therefore we need to avoid the delta approach from the regular tick when
4784 * possible since that would seriously skew the load calculation. This is why we
4785 * use cpu_load_update_periodic() for CPUs out of nohz. However we'll rely on
4786 * jiffies deltas for updates happening while in nohz mode (idle ticks, idle
4787 * loop exit, nohz_idle_balance, nohz full exit...)
4788 *
4789 * This means we might still be one tick off for nohz periods.
4790 */
4791
4792static void cpu_load_update_nohz(struct rq *this_rq,
4793 unsigned long curr_jiffies,
4794 unsigned long load)
be68a682
FW
4795{
4796 unsigned long pending_updates;
4797
4798 pending_updates = curr_jiffies - this_rq->last_load_update_tick;
4799 if (pending_updates) {
4800 this_rq->last_load_update_tick = curr_jiffies;
4801 /*
4802 * In the regular NOHZ case, we were idle, this means load 0.
4803 * In the NOHZ_FULL case, we were non-idle, we should consider
4804 * its weighted load.
4805 */
1f41906a 4806 cpu_load_update(this_rq, load, pending_updates);
be68a682
FW
4807 }
4808}
4809
3289bdb4
PZ
4810/*
4811 * Called from nohz_idle_balance() to update the load ratings before doing the
4812 * idle balance.
4813 */
cee1afce 4814static void cpu_load_update_idle(struct rq *this_rq)
3289bdb4 4815{
3289bdb4
PZ
4816 /*
4817 * bail if there's load or we're actually up-to-date.
4818 */
be68a682 4819 if (weighted_cpuload(cpu_of(this_rq)))
3289bdb4
PZ
4820 return;
4821
1f41906a 4822 cpu_load_update_nohz(this_rq, READ_ONCE(jiffies), 0);
3289bdb4
PZ
4823}
4824
4825/*
1f41906a
FW
4826 * Record CPU load on nohz entry so we know the tickless load to account
4827 * on nohz exit. cpu_load[0] happens then to be updated more frequently
4828 * than other cpu_load[idx] but it should be fine as cpu_load readers
4829 * shouldn't rely into synchronized cpu_load[*] updates.
3289bdb4 4830 */
1f41906a 4831void cpu_load_update_nohz_start(void)
3289bdb4
PZ
4832{
4833 struct rq *this_rq = this_rq();
1f41906a
FW
4834
4835 /*
4836 * This is all lockless but should be fine. If weighted_cpuload changes
4837 * concurrently we'll exit nohz. And cpu_load write can race with
4838 * cpu_load_update_idle() but both updater would be writing the same.
4839 */
4840 this_rq->cpu_load[0] = weighted_cpuload(cpu_of(this_rq));
4841}
4842
4843/*
4844 * Account the tickless load in the end of a nohz frame.
4845 */
4846void cpu_load_update_nohz_stop(void)
4847{
316c1608 4848 unsigned long curr_jiffies = READ_ONCE(jiffies);
1f41906a
FW
4849 struct rq *this_rq = this_rq();
4850 unsigned long load;
3289bdb4
PZ
4851
4852 if (curr_jiffies == this_rq->last_load_update_tick)
4853 return;
4854
1f41906a 4855 load = weighted_cpuload(cpu_of(this_rq));
3289bdb4 4856 raw_spin_lock(&this_rq->lock);
b52fad2d 4857 update_rq_clock(this_rq);
1f41906a 4858 cpu_load_update_nohz(this_rq, curr_jiffies, load);
3289bdb4
PZ
4859 raw_spin_unlock(&this_rq->lock);
4860}
1f41906a
FW
4861#else /* !CONFIG_NO_HZ_COMMON */
4862static inline void cpu_load_update_nohz(struct rq *this_rq,
4863 unsigned long curr_jiffies,
4864 unsigned long load) { }
4865#endif /* CONFIG_NO_HZ_COMMON */
4866
4867static void cpu_load_update_periodic(struct rq *this_rq, unsigned long load)
4868{
9fd81dd5 4869#ifdef CONFIG_NO_HZ_COMMON
1f41906a
FW
4870 /* See the mess around cpu_load_update_nohz(). */
4871 this_rq->last_load_update_tick = READ_ONCE(jiffies);
9fd81dd5 4872#endif
1f41906a
FW
4873 cpu_load_update(this_rq, load, 1);
4874}
3289bdb4
PZ
4875
4876/*
4877 * Called from scheduler_tick()
4878 */
cee1afce 4879void cpu_load_update_active(struct rq *this_rq)
3289bdb4 4880{
7ea241af 4881 unsigned long load = weighted_cpuload(cpu_of(this_rq));
1f41906a
FW
4882
4883 if (tick_nohz_tick_stopped())
4884 cpu_load_update_nohz(this_rq, READ_ONCE(jiffies), load);
4885 else
4886 cpu_load_update_periodic(this_rq, load);
3289bdb4
PZ
4887}
4888
029632fb
PZ
4889/*
4890 * Return a low guess at the load of a migration-source cpu weighted
4891 * according to the scheduling class and "nice" value.
4892 *
4893 * We want to under-estimate the load of migration sources, to
4894 * balance conservatively.
4895 */
4896static unsigned long source_load(int cpu, int type)
4897{
4898 struct rq *rq = cpu_rq(cpu);
4899 unsigned long total = weighted_cpuload(cpu);
4900
4901 if (type == 0 || !sched_feat(LB_BIAS))
4902 return total;
4903
4904 return min(rq->cpu_load[type-1], total);
4905}
4906
4907/*
4908 * Return a high guess at the load of a migration-target cpu weighted
4909 * according to the scheduling class and "nice" value.
4910 */
4911static unsigned long target_load(int cpu, int type)
4912{
4913 struct rq *rq = cpu_rq(cpu);
4914 unsigned long total = weighted_cpuload(cpu);
4915
4916 if (type == 0 || !sched_feat(LB_BIAS))
4917 return total;
4918
4919 return max(rq->cpu_load[type-1], total);
4920}
4921
ced549fa 4922static unsigned long capacity_of(int cpu)
029632fb 4923{
ced549fa 4924 return cpu_rq(cpu)->cpu_capacity;
029632fb
PZ
4925}
4926
ca6d75e6
VG
4927static unsigned long capacity_orig_of(int cpu)
4928{
4929 return cpu_rq(cpu)->cpu_capacity_orig;
4930}
4931
029632fb
PZ
4932static unsigned long cpu_avg_load_per_task(int cpu)
4933{
4934 struct rq *rq = cpu_rq(cpu);
316c1608 4935 unsigned long nr_running = READ_ONCE(rq->cfs.h_nr_running);
7ea241af 4936 unsigned long load_avg = weighted_cpuload(cpu);
029632fb
PZ
4937
4938 if (nr_running)
b92486cb 4939 return load_avg / nr_running;
029632fb
PZ
4940
4941 return 0;
4942}
4943
bb3469ac 4944#ifdef CONFIG_FAIR_GROUP_SCHED
f5bfb7d9
PZ
4945/*
4946 * effective_load() calculates the load change as seen from the root_task_group
4947 *
4948 * Adding load to a group doesn't make a group heavier, but can cause movement
4949 * of group shares between cpus. Assuming the shares were perfectly aligned one
4950 * can calculate the shift in shares.
cf5f0acf
PZ
4951 *
4952 * Calculate the effective load difference if @wl is added (subtracted) to @tg
4953 * on this @cpu and results in a total addition (subtraction) of @wg to the
4954 * total group weight.
4955 *
4956 * Given a runqueue weight distribution (rw_i) we can compute a shares
4957 * distribution (s_i) using:
4958 *
4959 * s_i = rw_i / \Sum rw_j (1)
4960 *
4961 * Suppose we have 4 CPUs and our @tg is a direct child of the root group and
4962 * has 7 equal weight tasks, distributed as below (rw_i), with the resulting
4963 * shares distribution (s_i):
4964 *
4965 * rw_i = { 2, 4, 1, 0 }
4966 * s_i = { 2/7, 4/7, 1/7, 0 }
4967 *
4968 * As per wake_affine() we're interested in the load of two CPUs (the CPU the
4969 * task used to run on and the CPU the waker is running on), we need to
4970 * compute the effect of waking a task on either CPU and, in case of a sync
4971 * wakeup, compute the effect of the current task going to sleep.
4972 *
4973 * So for a change of @wl to the local @cpu with an overall group weight change
4974 * of @wl we can compute the new shares distribution (s'_i) using:
4975 *
4976 * s'_i = (rw_i + @wl) / (@wg + \Sum rw_j) (2)
4977 *
4978 * Suppose we're interested in CPUs 0 and 1, and want to compute the load
4979 * differences in waking a task to CPU 0. The additional task changes the
4980 * weight and shares distributions like:
4981 *
4982 * rw'_i = { 3, 4, 1, 0 }
4983 * s'_i = { 3/8, 4/8, 1/8, 0 }
4984 *
4985 * We can then compute the difference in effective weight by using:
4986 *
4987 * dw_i = S * (s'_i - s_i) (3)
4988 *
4989 * Where 'S' is the group weight as seen by its parent.
4990 *
4991 * Therefore the effective change in loads on CPU 0 would be 5/56 (3/8 - 2/7)
4992 * times the weight of the group. The effect on CPU 1 would be -4/56 (4/8 -
4993 * 4/7) times the weight of the group.
f5bfb7d9 4994 */
2069dd75 4995static long effective_load(struct task_group *tg, int cpu, long wl, long wg)
bb3469ac 4996{
4be9daaa 4997 struct sched_entity *se = tg->se[cpu];
f1d239f7 4998
9722c2da 4999 if (!tg->parent) /* the trivial, non-cgroup case */
f1d239f7
PZ
5000 return wl;
5001
4be9daaa 5002 for_each_sched_entity(se) {
7dd49125
PZ
5003 struct cfs_rq *cfs_rq = se->my_q;
5004 long W, w = cfs_rq_load_avg(cfs_rq);
4be9daaa 5005
7dd49125 5006 tg = cfs_rq->tg;
bb3469ac 5007
cf5f0acf
PZ
5008 /*
5009 * W = @wg + \Sum rw_j
5010 */
7dd49125
PZ
5011 W = wg + atomic_long_read(&tg->load_avg);
5012
5013 /* Ensure \Sum rw_j >= rw_i */
5014 W -= cfs_rq->tg_load_avg_contrib;
5015 W += w;
4be9daaa 5016
cf5f0acf
PZ
5017 /*
5018 * w = rw_i + @wl
5019 */
7dd49125 5020 w += wl;
940959e9 5021
cf5f0acf
PZ
5022 /*
5023 * wl = S * s'_i; see (2)
5024 */
5025 if (W > 0 && w < W)
32a8df4e 5026 wl = (w * (long)tg->shares) / W;
977dda7c
PT
5027 else
5028 wl = tg->shares;
940959e9 5029
cf5f0acf
PZ
5030 /*
5031 * Per the above, wl is the new se->load.weight value; since
5032 * those are clipped to [MIN_SHARES, ...) do so now. See
5033 * calc_cfs_shares().
5034 */
977dda7c
PT
5035 if (wl < MIN_SHARES)
5036 wl = MIN_SHARES;
cf5f0acf
PZ
5037
5038 /*
5039 * wl = dw_i = S * (s'_i - s_i); see (3)
5040 */
9d89c257 5041 wl -= se->avg.load_avg;
cf5f0acf
PZ
5042
5043 /*
5044 * Recursively apply this logic to all parent groups to compute
5045 * the final effective load change on the root group. Since
5046 * only the @tg group gets extra weight, all parent groups can
5047 * only redistribute existing shares. @wl is the shift in shares
5048 * resulting from this level per the above.
5049 */
4be9daaa 5050 wg = 0;
4be9daaa 5051 }
bb3469ac 5052
4be9daaa 5053 return wl;
bb3469ac
PZ
5054}
5055#else
4be9daaa 5056
58d081b5 5057static long effective_load(struct task_group *tg, int cpu, long wl, long wg)
4be9daaa 5058{
83378269 5059 return wl;
bb3469ac 5060}
4be9daaa 5061
bb3469ac
PZ
5062#endif
5063
c58d25f3
PZ
5064static void record_wakee(struct task_struct *p)
5065{
5066 /*
5067 * Only decay a single time; tasks that have less then 1 wakeup per
5068 * jiffy will not have built up many flips.
5069 */
5070 if (time_after(jiffies, current->wakee_flip_decay_ts + HZ)) {
5071 current->wakee_flips >>= 1;
5072 current->wakee_flip_decay_ts = jiffies;
5073 }
5074
5075 if (current->last_wakee != p) {
5076 current->last_wakee = p;
5077 current->wakee_flips++;
5078 }
5079}
5080
63b0e9ed
MG
5081/*
5082 * Detect M:N waker/wakee relationships via a switching-frequency heuristic.
c58d25f3 5083 *
63b0e9ed 5084 * A waker of many should wake a different task than the one last awakened
c58d25f3
PZ
5085 * at a frequency roughly N times higher than one of its wakees.
5086 *
5087 * In order to determine whether we should let the load spread vs consolidating
5088 * to shared cache, we look for a minimum 'flip' frequency of llc_size in one
5089 * partner, and a factor of lls_size higher frequency in the other.
5090 *
5091 * With both conditions met, we can be relatively sure that the relationship is
5092 * non-monogamous, with partner count exceeding socket size.
5093 *
5094 * Waker/wakee being client/server, worker/dispatcher, interrupt source or
5095 * whatever is irrelevant, spread criteria is apparent partner count exceeds
5096 * socket size.
63b0e9ed 5097 */
62470419
MW
5098static int wake_wide(struct task_struct *p)
5099{
63b0e9ed
MG
5100 unsigned int master = current->wakee_flips;
5101 unsigned int slave = p->wakee_flips;
7d9ffa89 5102 int factor = this_cpu_read(sd_llc_size);
62470419 5103
63b0e9ed
MG
5104 if (master < slave)
5105 swap(master, slave);
5106 if (slave < factor || master < slave * factor)
5107 return 0;
5108 return 1;
62470419
MW
5109}
5110
772bd008
MR
5111static int wake_affine(struct sched_domain *sd, struct task_struct *p,
5112 int prev_cpu, int sync)
098fb9db 5113{
e37b6a7b 5114 s64 this_load, load;
bd61c98f 5115 s64 this_eff_load, prev_eff_load;
772bd008 5116 int idx, this_cpu;
c88d5910 5117 struct task_group *tg;
83378269 5118 unsigned long weight;
b3137bc8 5119 int balanced;
098fb9db 5120
c88d5910
PZ
5121 idx = sd->wake_idx;
5122 this_cpu = smp_processor_id();
c88d5910
PZ
5123 load = source_load(prev_cpu, idx);
5124 this_load = target_load(this_cpu, idx);
098fb9db 5125
b3137bc8
MG
5126 /*
5127 * If sync wakeup then subtract the (maximum possible)
5128 * effect of the currently running task from the load
5129 * of the current CPU:
5130 */
83378269
PZ
5131 if (sync) {
5132 tg = task_group(current);
9d89c257 5133 weight = current->se.avg.load_avg;
83378269 5134
c88d5910 5135 this_load += effective_load(tg, this_cpu, -weight, -weight);
83378269
PZ
5136 load += effective_load(tg, prev_cpu, 0, -weight);
5137 }
b3137bc8 5138
83378269 5139 tg = task_group(p);
9d89c257 5140 weight = p->se.avg.load_avg;
b3137bc8 5141
71a29aa7
PZ
5142 /*
5143 * In low-load situations, where prev_cpu is idle and this_cpu is idle
c88d5910
PZ
5144 * due to the sync cause above having dropped this_load to 0, we'll
5145 * always have an imbalance, but there's really nothing you can do
5146 * about that, so that's good too.
71a29aa7
PZ
5147 *
5148 * Otherwise check if either cpus are near enough in load to allow this
5149 * task to be woken on this_cpu.
5150 */
bd61c98f
VG
5151 this_eff_load = 100;
5152 this_eff_load *= capacity_of(prev_cpu);
e51fd5e2 5153
bd61c98f
VG
5154 prev_eff_load = 100 + (sd->imbalance_pct - 100) / 2;
5155 prev_eff_load *= capacity_of(this_cpu);
e51fd5e2 5156
bd61c98f 5157 if (this_load > 0) {
e51fd5e2
PZ
5158 this_eff_load *= this_load +
5159 effective_load(tg, this_cpu, weight, weight);
5160
e51fd5e2 5161 prev_eff_load *= load + effective_load(tg, prev_cpu, 0, weight);
bd61c98f 5162 }
e51fd5e2 5163
bd61c98f 5164 balanced = this_eff_load <= prev_eff_load;
098fb9db 5165
41acab88 5166 schedstat_inc(p, se.statistics.nr_wakeups_affine_attempts);
098fb9db 5167
05bfb65f
VG
5168 if (!balanced)
5169 return 0;
098fb9db 5170
05bfb65f
VG
5171 schedstat_inc(sd, ttwu_move_affine);
5172 schedstat_inc(p, se.statistics.nr_wakeups_affine);
5173
5174 return 1;
098fb9db
IM
5175}
5176
aaee1203
PZ
5177/*
5178 * find_idlest_group finds and returns the least busy CPU group within the
5179 * domain.
5180 */
5181static struct sched_group *
78e7ed53 5182find_idlest_group(struct sched_domain *sd, struct task_struct *p,
c44f2a02 5183 int this_cpu, int sd_flag)
e7693a36 5184{
b3bd3de6 5185 struct sched_group *idlest = NULL, *group = sd->groups;
aaee1203 5186 unsigned long min_load = ULONG_MAX, this_load = 0;
c44f2a02 5187 int load_idx = sd->forkexec_idx;
aaee1203 5188 int imbalance = 100 + (sd->imbalance_pct-100)/2;
e7693a36 5189
c44f2a02
VG
5190 if (sd_flag & SD_BALANCE_WAKE)
5191 load_idx = sd->wake_idx;
5192
aaee1203
PZ
5193 do {
5194 unsigned long load, avg_load;
5195 int local_group;
5196 int i;
e7693a36 5197
aaee1203
PZ
5198 /* Skip over this group if it has no CPUs allowed */
5199 if (!cpumask_intersects(sched_group_cpus(group),
fa17b507 5200 tsk_cpus_allowed(p)))
aaee1203
PZ
5201 continue;
5202
5203 local_group = cpumask_test_cpu(this_cpu,
5204 sched_group_cpus(group));
5205
5206 /* Tally up the load of all CPUs in the group */
5207 avg_load = 0;
5208
5209 for_each_cpu(i, sched_group_cpus(group)) {
5210 /* Bias balancing toward cpus of our domain */
5211 if (local_group)
5212 load = source_load(i, load_idx);
5213 else
5214 load = target_load(i, load_idx);
5215
5216 avg_load += load;
5217 }
5218
63b2ca30 5219 /* Adjust by relative CPU capacity of the group */
ca8ce3d0 5220 avg_load = (avg_load * SCHED_CAPACITY_SCALE) / group->sgc->capacity;
aaee1203
PZ
5221
5222 if (local_group) {
5223 this_load = avg_load;
aaee1203
PZ
5224 } else if (avg_load < min_load) {
5225 min_load = avg_load;
5226 idlest = group;
5227 }
5228 } while (group = group->next, group != sd->groups);
5229
5230 if (!idlest || 100*this_load < imbalance*min_load)
5231 return NULL;
5232 return idlest;
5233}
5234
5235/*
5236 * find_idlest_cpu - find the idlest cpu among the cpus in group.
5237 */
5238static int
5239find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
5240{
5241 unsigned long load, min_load = ULONG_MAX;
83a0a96a
NP
5242 unsigned int min_exit_latency = UINT_MAX;
5243 u64 latest_idle_timestamp = 0;
5244 int least_loaded_cpu = this_cpu;
5245 int shallowest_idle_cpu = -1;
aaee1203
PZ
5246 int i;
5247
eaecf41f
MR
5248 /* Check if we have any choice: */
5249 if (group->group_weight == 1)
5250 return cpumask_first(sched_group_cpus(group));
5251
aaee1203 5252 /* Traverse only the allowed CPUs */
fa17b507 5253 for_each_cpu_and(i, sched_group_cpus(group), tsk_cpus_allowed(p)) {
83a0a96a
NP
5254 if (idle_cpu(i)) {
5255 struct rq *rq = cpu_rq(i);
5256 struct cpuidle_state *idle = idle_get_state(rq);
5257 if (idle && idle->exit_latency < min_exit_latency) {
5258 /*
5259 * We give priority to a CPU whose idle state
5260 * has the smallest exit latency irrespective
5261 * of any idle timestamp.
5262 */
5263 min_exit_latency = idle->exit_latency;
5264 latest_idle_timestamp = rq->idle_stamp;
5265 shallowest_idle_cpu = i;
5266 } else if ((!idle || idle->exit_latency == min_exit_latency) &&
5267 rq->idle_stamp > latest_idle_timestamp) {
5268 /*
5269 * If equal or no active idle state, then
5270 * the most recently idled CPU might have
5271 * a warmer cache.
5272 */
5273 latest_idle_timestamp = rq->idle_stamp;
5274 shallowest_idle_cpu = i;
5275 }
9f96742a 5276 } else if (shallowest_idle_cpu == -1) {
83a0a96a
NP
5277 load = weighted_cpuload(i);
5278 if (load < min_load || (load == min_load && i == this_cpu)) {
5279 min_load = load;
5280 least_loaded_cpu = i;
5281 }
e7693a36
GH
5282 }
5283 }
5284
83a0a96a 5285 return shallowest_idle_cpu != -1 ? shallowest_idle_cpu : least_loaded_cpu;
aaee1203 5286}
e7693a36 5287
a50bde51
PZ
5288/*
5289 * Try and locate an idle CPU in the sched_domain.
5290 */
772bd008 5291static int select_idle_sibling(struct task_struct *p, int prev, int target)
a50bde51 5292{
99bd5e2f 5293 struct sched_domain *sd;
37407ea7 5294 struct sched_group *sg;
a50bde51 5295
e0a79f52
MG
5296 if (idle_cpu(target))
5297 return target;
99bd5e2f
SS
5298
5299 /*
e0a79f52 5300 * If the prevous cpu is cache affine and idle, don't be stupid.
99bd5e2f 5301 */
772bd008
MR
5302 if (prev != target && cpus_share_cache(prev, target) && idle_cpu(prev))
5303 return prev;
a50bde51
PZ
5304
5305 /*
d4335581
MF
5306 * Otherwise, iterate the domains and find an eligible idle cpu.
5307 *
5308 * A completely idle sched group at higher domains is more
5309 * desirable than an idle group at a lower level, because lower
5310 * domains have smaller groups and usually share hardware
5311 * resources which causes tasks to contend on them, e.g. x86
5312 * hyperthread siblings in the lowest domain (SMT) can contend
5313 * on the shared cpu pipeline.
5314 *
5315 * However, while we prefer idle groups at higher domains
5316 * finding an idle cpu at the lowest domain is still better than
5317 * returning 'target', which we've already established, isn't
5318 * idle.
a50bde51 5319 */
518cd623 5320 sd = rcu_dereference(per_cpu(sd_llc, target));
970e1789 5321 for_each_lower_domain(sd) {
37407ea7
LT
5322 sg = sd->groups;
5323 do {
772bd008
MR
5324 int i;
5325
37407ea7
LT
5326 if (!cpumask_intersects(sched_group_cpus(sg),
5327 tsk_cpus_allowed(p)))
5328 goto next;
5329
d4335581 5330 /* Ensure the entire group is idle */
37407ea7 5331 for_each_cpu(i, sched_group_cpus(sg)) {
e0a79f52 5332 if (i == target || !idle_cpu(i))
37407ea7
LT
5333 goto next;
5334 }
970e1789 5335
d4335581
MF
5336 /*
5337 * It doesn't matter which cpu we pick, the
5338 * whole group is idle.
5339 */
37407ea7
LT
5340 target = cpumask_first_and(sched_group_cpus(sg),
5341 tsk_cpus_allowed(p));
5342 goto done;
5343next:
5344 sg = sg->next;
5345 } while (sg != sd->groups);
5346 }
5347done:
a50bde51
PZ
5348 return target;
5349}
231678b7 5350
8bb5b00c 5351/*
9e91d61d 5352 * cpu_util returns the amount of capacity of a CPU that is used by CFS
8bb5b00c 5353 * tasks. The unit of the return value must be the one of capacity so we can
9e91d61d
DE
5354 * compare the utilization with the capacity of the CPU that is available for
5355 * CFS task (ie cpu_capacity).
231678b7
DE
5356 *
5357 * cfs_rq.avg.util_avg is the sum of running time of runnable tasks plus the
5358 * recent utilization of currently non-runnable tasks on a CPU. It represents
5359 * the amount of utilization of a CPU in the range [0..capacity_orig] where
5360 * capacity_orig is the cpu_capacity available at the highest frequency
5361 * (arch_scale_freq_capacity()).
5362 * The utilization of a CPU converges towards a sum equal to or less than the
5363 * current capacity (capacity_curr <= capacity_orig) of the CPU because it is
5364 * the running time on this CPU scaled by capacity_curr.
5365 *
5366 * Nevertheless, cfs_rq.avg.util_avg can be higher than capacity_curr or even
5367 * higher than capacity_orig because of unfortunate rounding in
5368 * cfs.avg.util_avg or just after migrating tasks and new task wakeups until
5369 * the average stabilizes with the new running time. We need to check that the
5370 * utilization stays within the range of [0..capacity_orig] and cap it if
5371 * necessary. Without utilization capping, a group could be seen as overloaded
5372 * (CPU0 utilization at 121% + CPU1 utilization at 80%) whereas CPU1 has 20% of
5373 * available capacity. We allow utilization to overshoot capacity_curr (but not
5374 * capacity_orig) as it useful for predicting the capacity required after task
5375 * migrations (scheduler-driven DVFS).
8bb5b00c 5376 */
9e91d61d 5377static int cpu_util(int cpu)
8bb5b00c 5378{
9e91d61d 5379 unsigned long util = cpu_rq(cpu)->cfs.avg.util_avg;
8bb5b00c
VG
5380 unsigned long capacity = capacity_orig_of(cpu);
5381
231678b7 5382 return (util >= capacity) ? capacity : util;
8bb5b00c 5383}
a50bde51 5384
3273163c
MR
5385static inline int task_util(struct task_struct *p)
5386{
5387 return p->se.avg.util_avg;
5388}
5389
5390/*
5391 * Disable WAKE_AFFINE in the case where task @p doesn't fit in the
5392 * capacity of either the waking CPU @cpu or the previous CPU @prev_cpu.
5393 *
5394 * In that case WAKE_AFFINE doesn't make sense and we'll let
5395 * BALANCE_WAKE sort things out.
5396 */
5397static int wake_cap(struct task_struct *p, int cpu, int prev_cpu)
5398{
5399 long min_cap, max_cap;
5400
5401 min_cap = min(capacity_orig_of(prev_cpu), capacity_orig_of(cpu));
5402 max_cap = cpu_rq(cpu)->rd->max_cpu_capacity;
5403
5404 /* Minimum capacity is close to max, no need to abort wake_affine */
5405 if (max_cap - min_cap < max_cap >> 3)
5406 return 0;
5407
5408 return min_cap * 1024 < task_util(p) * capacity_margin;
5409}
5410
aaee1203 5411/*
de91b9cb
MR
5412 * select_task_rq_fair: Select target runqueue for the waking task in domains
5413 * that have the 'sd_flag' flag set. In practice, this is SD_BALANCE_WAKE,
5414 * SD_BALANCE_FORK, or SD_BALANCE_EXEC.
aaee1203 5415 *
de91b9cb
MR
5416 * Balances load by selecting the idlest cpu in the idlest group, or under
5417 * certain conditions an idle sibling cpu if the domain has SD_WAKE_AFFINE set.
aaee1203 5418 *
de91b9cb 5419 * Returns the target cpu number.
aaee1203
PZ
5420 *
5421 * preempt must be disabled.
5422 */
0017d735 5423static int
ac66f547 5424select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_flags)
aaee1203 5425{
29cd8bae 5426 struct sched_domain *tmp, *affine_sd = NULL, *sd = NULL;
c88d5910 5427 int cpu = smp_processor_id();
63b0e9ed 5428 int new_cpu = prev_cpu;
99bd5e2f 5429 int want_affine = 0;
5158f4e4 5430 int sync = wake_flags & WF_SYNC;
c88d5910 5431
c58d25f3
PZ
5432 if (sd_flag & SD_BALANCE_WAKE) {
5433 record_wakee(p);
3273163c
MR
5434 want_affine = !wake_wide(p) && !wake_cap(p, cpu, prev_cpu)
5435 && cpumask_test_cpu(cpu, tsk_cpus_allowed(p));
c58d25f3 5436 }
aaee1203 5437
dce840a0 5438 rcu_read_lock();
aaee1203 5439 for_each_domain(cpu, tmp) {
e4f42888 5440 if (!(tmp->flags & SD_LOAD_BALANCE))
63b0e9ed 5441 break;
e4f42888 5442
fe3bcfe1 5443 /*
99bd5e2f
SS
5444 * If both cpu and prev_cpu are part of this domain,
5445 * cpu is a valid SD_WAKE_AFFINE target.
fe3bcfe1 5446 */
99bd5e2f
SS
5447 if (want_affine && (tmp->flags & SD_WAKE_AFFINE) &&
5448 cpumask_test_cpu(prev_cpu, sched_domain_span(tmp))) {
5449 affine_sd = tmp;
29cd8bae 5450 break;
f03542a7 5451 }
29cd8bae 5452
f03542a7 5453 if (tmp->flags & sd_flag)
29cd8bae 5454 sd = tmp;
63b0e9ed
MG
5455 else if (!want_affine)
5456 break;
29cd8bae
PZ
5457 }
5458
63b0e9ed
MG
5459 if (affine_sd) {
5460 sd = NULL; /* Prefer wake_affine over balance flags */
772bd008 5461 if (cpu != prev_cpu && wake_affine(affine_sd, p, prev_cpu, sync))
63b0e9ed 5462 new_cpu = cpu;
8b911acd 5463 }
e7693a36 5464
63b0e9ed
MG
5465 if (!sd) {
5466 if (sd_flag & SD_BALANCE_WAKE) /* XXX always ? */
772bd008 5467 new_cpu = select_idle_sibling(p, prev_cpu, new_cpu);
63b0e9ed
MG
5468
5469 } else while (sd) {
aaee1203 5470 struct sched_group *group;
c88d5910 5471 int weight;
098fb9db 5472
0763a660 5473 if (!(sd->flags & sd_flag)) {
aaee1203
PZ
5474 sd = sd->child;
5475 continue;
5476 }
098fb9db 5477
c44f2a02 5478 group = find_idlest_group(sd, p, cpu, sd_flag);
aaee1203
PZ
5479 if (!group) {
5480 sd = sd->child;
5481 continue;
5482 }
4ae7d5ce 5483
d7c33c49 5484 new_cpu = find_idlest_cpu(group, p, cpu);
aaee1203
PZ
5485 if (new_cpu == -1 || new_cpu == cpu) {
5486 /* Now try balancing at a lower domain level of cpu */
5487 sd = sd->child;
5488 continue;
e7693a36 5489 }
aaee1203
PZ
5490
5491 /* Now try balancing at a lower domain level of new_cpu */
5492 cpu = new_cpu;
669c55e9 5493 weight = sd->span_weight;
aaee1203
PZ
5494 sd = NULL;
5495 for_each_domain(cpu, tmp) {
669c55e9 5496 if (weight <= tmp->span_weight)
aaee1203 5497 break;
0763a660 5498 if (tmp->flags & sd_flag)
aaee1203
PZ
5499 sd = tmp;
5500 }
5501 /* while loop will break here if sd == NULL */
e7693a36 5502 }
dce840a0 5503 rcu_read_unlock();
e7693a36 5504
c88d5910 5505 return new_cpu;
e7693a36 5506}
0a74bef8
PT
5507
5508/*
5509 * Called immediately before a task is migrated to a new cpu; task_cpu(p) and
5510 * cfs_rq_of(p) references at time of call are still valid and identify the
525628c7 5511 * previous cpu. The caller guarantees p->pi_lock or task_rq(p)->lock is held.
0a74bef8 5512 */
5a4fd036 5513static void migrate_task_rq_fair(struct task_struct *p)
0a74bef8 5514{
59efa0ba
PZ
5515 /*
5516 * As blocked tasks retain absolute vruntime the migration needs to
5517 * deal with this by subtracting the old and adding the new
5518 * min_vruntime -- the latter is done by enqueue_entity() when placing
5519 * the task on the new runqueue.
5520 */
5521 if (p->state == TASK_WAKING) {
5522 struct sched_entity *se = &p->se;
5523 struct cfs_rq *cfs_rq = cfs_rq_of(se);
5524 u64 min_vruntime;
5525
5526#ifndef CONFIG_64BIT
5527 u64 min_vruntime_copy;
5528
5529 do {
5530 min_vruntime_copy = cfs_rq->min_vruntime_copy;
5531 smp_rmb();
5532 min_vruntime = cfs_rq->min_vruntime;
5533 } while (min_vruntime != min_vruntime_copy);
5534#else
5535 min_vruntime = cfs_rq->min_vruntime;
5536#endif
5537
5538 se->vruntime -= min_vruntime;
5539 }
5540
aff3e498 5541 /*
9d89c257
YD
5542 * We are supposed to update the task to "current" time, then its up to date
5543 * and ready to go to new CPU/cfs_rq. But we have difficulty in getting
5544 * what current time is, so simply throw away the out-of-date time. This
5545 * will result in the wakee task is less decayed, but giving the wakee more
5546 * load sounds not bad.
aff3e498 5547 */
9d89c257
YD
5548 remove_entity_load_avg(&p->se);
5549
5550 /* Tell new CPU we are migrated */
5551 p->se.avg.last_update_time = 0;
3944a927
BS
5552
5553 /* We have migrated, no longer consider this task hot */
9d89c257 5554 p->se.exec_start = 0;
0a74bef8 5555}
12695578
YD
5556
5557static void task_dead_fair(struct task_struct *p)
5558{
5559 remove_entity_load_avg(&p->se);
5560}
e7693a36
GH
5561#endif /* CONFIG_SMP */
5562
e52fb7c0
PZ
5563static unsigned long
5564wakeup_gran(struct sched_entity *curr, struct sched_entity *se)
0bbd3336
PZ
5565{
5566 unsigned long gran = sysctl_sched_wakeup_granularity;
5567
5568 /*
e52fb7c0
PZ
5569 * Since its curr running now, convert the gran from real-time
5570 * to virtual-time in his units.
13814d42
MG
5571 *
5572 * By using 'se' instead of 'curr' we penalize light tasks, so
5573 * they get preempted easier. That is, if 'se' < 'curr' then
5574 * the resulting gran will be larger, therefore penalizing the
5575 * lighter, if otoh 'se' > 'curr' then the resulting gran will
5576 * be smaller, again penalizing the lighter task.
5577 *
5578 * This is especially important for buddies when the leftmost
5579 * task is higher priority than the buddy.
0bbd3336 5580 */
f4ad9bd2 5581 return calc_delta_fair(gran, se);
0bbd3336
PZ
5582}
5583
464b7527
PZ
5584/*
5585 * Should 'se' preempt 'curr'.
5586 *
5587 * |s1
5588 * |s2
5589 * |s3
5590 * g
5591 * |<--->|c
5592 *
5593 * w(c, s1) = -1
5594 * w(c, s2) = 0
5595 * w(c, s3) = 1
5596 *
5597 */
5598static int
5599wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se)
5600{
5601 s64 gran, vdiff = curr->vruntime - se->vruntime;
5602
5603 if (vdiff <= 0)
5604 return -1;
5605
e52fb7c0 5606 gran = wakeup_gran(curr, se);
464b7527
PZ
5607 if (vdiff > gran)
5608 return 1;
5609
5610 return 0;
5611}
5612
02479099
PZ
5613static void set_last_buddy(struct sched_entity *se)
5614{
69c80f3e
VP
5615 if (entity_is_task(se) && unlikely(task_of(se)->policy == SCHED_IDLE))
5616 return;
5617
5618 for_each_sched_entity(se)
5619 cfs_rq_of(se)->last = se;
02479099
PZ
5620}
5621
5622static void set_next_buddy(struct sched_entity *se)
5623{
69c80f3e
VP
5624 if (entity_is_task(se) && unlikely(task_of(se)->policy == SCHED_IDLE))
5625 return;
5626
5627 for_each_sched_entity(se)
5628 cfs_rq_of(se)->next = se;
02479099
PZ
5629}
5630
ac53db59
RR
5631static void set_skip_buddy(struct sched_entity *se)
5632{
69c80f3e
VP
5633 for_each_sched_entity(se)
5634 cfs_rq_of(se)->skip = se;
ac53db59
RR
5635}
5636
bf0f6f24
IM
5637/*
5638 * Preempt the current task with a newly woken task if needed:
5639 */
5a9b86f6 5640static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_flags)
bf0f6f24
IM
5641{
5642 struct task_struct *curr = rq->curr;
8651a86c 5643 struct sched_entity *se = &curr->se, *pse = &p->se;
03e89e45 5644 struct cfs_rq *cfs_rq = task_cfs_rq(curr);
f685ceac 5645 int scale = cfs_rq->nr_running >= sched_nr_latency;
2f36825b 5646 int next_buddy_marked = 0;
bf0f6f24 5647
4ae7d5ce
IM
5648 if (unlikely(se == pse))
5649 return;
5650
5238cdd3 5651 /*
163122b7 5652 * This is possible from callers such as attach_tasks(), in which we
5238cdd3
PT
5653 * unconditionally check_prempt_curr() after an enqueue (which may have
5654 * lead to a throttle). This both saves work and prevents false
5655 * next-buddy nomination below.
5656 */
5657 if (unlikely(throttled_hierarchy(cfs_rq_of(pse))))
5658 return;
5659
2f36825b 5660 if (sched_feat(NEXT_BUDDY) && scale && !(wake_flags & WF_FORK)) {
3cb63d52 5661 set_next_buddy(pse);
2f36825b
VP
5662 next_buddy_marked = 1;
5663 }
57fdc26d 5664
aec0a514
BR
5665 /*
5666 * We can come here with TIF_NEED_RESCHED already set from new task
5667 * wake up path.
5238cdd3
PT
5668 *
5669 * Note: this also catches the edge-case of curr being in a throttled
5670 * group (e.g. via set_curr_task), since update_curr() (in the
5671 * enqueue of curr) will have resulted in resched being set. This
5672 * prevents us from potentially nominating it as a false LAST_BUDDY
5673 * below.
aec0a514
BR
5674 */
5675 if (test_tsk_need_resched(curr))
5676 return;
5677
a2f5c9ab
DH
5678 /* Idle tasks are by definition preempted by non-idle tasks. */
5679 if (unlikely(curr->policy == SCHED_IDLE) &&
5680 likely(p->policy != SCHED_IDLE))
5681 goto preempt;
5682
91c234b4 5683 /*
a2f5c9ab
DH
5684 * Batch and idle tasks do not preempt non-idle tasks (their preemption
5685 * is driven by the tick):
91c234b4 5686 */
8ed92e51 5687 if (unlikely(p->policy != SCHED_NORMAL) || !sched_feat(WAKEUP_PREEMPTION))
91c234b4 5688 return;
bf0f6f24 5689
464b7527 5690 find_matching_se(&se, &pse);
9bbd7374 5691 update_curr(cfs_rq_of(se));
002f128b 5692 BUG_ON(!pse);
2f36825b
VP
5693 if (wakeup_preempt_entity(se, pse) == 1) {
5694 /*
5695 * Bias pick_next to pick the sched entity that is
5696 * triggering this preemption.
5697 */
5698 if (!next_buddy_marked)
5699 set_next_buddy(pse);
3a7e73a2 5700 goto preempt;
2f36825b 5701 }
464b7527 5702
3a7e73a2 5703 return;
a65ac745 5704
3a7e73a2 5705preempt:
8875125e 5706 resched_curr(rq);
3a7e73a2
PZ
5707 /*
5708 * Only set the backward buddy when the current task is still
5709 * on the rq. This can happen when a wakeup gets interleaved
5710 * with schedule on the ->pre_schedule() or idle_balance()
5711 * point, either of which can * drop the rq lock.
5712 *
5713 * Also, during early boot the idle thread is in the fair class,
5714 * for obvious reasons its a bad idea to schedule back to it.
5715 */
5716 if (unlikely(!se->on_rq || curr == rq->idle))
5717 return;
5718
5719 if (sched_feat(LAST_BUDDY) && scale && entity_is_task(se))
5720 set_last_buddy(se);
bf0f6f24
IM
5721}
5722
606dba2e 5723static struct task_struct *
e7904a28 5724pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct pin_cookie cookie)
bf0f6f24
IM
5725{
5726 struct cfs_rq *cfs_rq = &rq->cfs;
5727 struct sched_entity *se;
678d5718 5728 struct task_struct *p;
37e117c0 5729 int new_tasks;
678d5718 5730
6e83125c 5731again:
678d5718
PZ
5732#ifdef CONFIG_FAIR_GROUP_SCHED
5733 if (!cfs_rq->nr_running)
38033c37 5734 goto idle;
678d5718 5735
3f1d2a31 5736 if (prev->sched_class != &fair_sched_class)
678d5718
PZ
5737 goto simple;
5738
5739 /*
5740 * Because of the set_next_buddy() in dequeue_task_fair() it is rather
5741 * likely that a next task is from the same cgroup as the current.
5742 *
5743 * Therefore attempt to avoid putting and setting the entire cgroup
5744 * hierarchy, only change the part that actually changes.
5745 */
5746
5747 do {
5748 struct sched_entity *curr = cfs_rq->curr;
5749
5750 /*
5751 * Since we got here without doing put_prev_entity() we also
5752 * have to consider cfs_rq->curr. If it is still a runnable
5753 * entity, update_curr() will update its vruntime, otherwise
5754 * forget we've ever seen it.
5755 */
54d27365
BS
5756 if (curr) {
5757 if (curr->on_rq)
5758 update_curr(cfs_rq);
5759 else
5760 curr = NULL;
678d5718 5761
54d27365
BS
5762 /*
5763 * This call to check_cfs_rq_runtime() will do the
5764 * throttle and dequeue its entity in the parent(s).
5765 * Therefore the 'simple' nr_running test will indeed
5766 * be correct.
5767 */
5768 if (unlikely(check_cfs_rq_runtime(cfs_rq)))
5769 goto simple;
5770 }
678d5718
PZ
5771
5772 se = pick_next_entity(cfs_rq, curr);
5773 cfs_rq = group_cfs_rq(se);
5774 } while (cfs_rq);
5775
5776 p = task_of(se);
5777
5778 /*
5779 * Since we haven't yet done put_prev_entity and if the selected task
5780 * is a different task than we started out with, try and touch the
5781 * least amount of cfs_rqs.
5782 */
5783 if (prev != p) {
5784 struct sched_entity *pse = &prev->se;
5785
5786 while (!(cfs_rq = is_same_group(se, pse))) {
5787 int se_depth = se->depth;
5788 int pse_depth = pse->depth;
5789
5790 if (se_depth <= pse_depth) {
5791 put_prev_entity(cfs_rq_of(pse), pse);
5792 pse = parent_entity(pse);
5793 }
5794 if (se_depth >= pse_depth) {
5795 set_next_entity(cfs_rq_of(se), se);
5796 se = parent_entity(se);
5797 }
5798 }
5799
5800 put_prev_entity(cfs_rq, pse);
5801 set_next_entity(cfs_rq, se);
5802 }
5803
5804 if (hrtick_enabled(rq))
5805 hrtick_start_fair(rq, p);
5806
5807 return p;
5808simple:
5809 cfs_rq = &rq->cfs;
5810#endif
bf0f6f24 5811
36ace27e 5812 if (!cfs_rq->nr_running)
38033c37 5813 goto idle;
bf0f6f24 5814
3f1d2a31 5815 put_prev_task(rq, prev);
606dba2e 5816
bf0f6f24 5817 do {
678d5718 5818 se = pick_next_entity(cfs_rq, NULL);
f4b6755f 5819 set_next_entity(cfs_rq, se);
bf0f6f24
IM
5820 cfs_rq = group_cfs_rq(se);
5821 } while (cfs_rq);
5822
8f4d37ec 5823 p = task_of(se);
678d5718 5824
b39e66ea
MG
5825 if (hrtick_enabled(rq))
5826 hrtick_start_fair(rq, p);
8f4d37ec
PZ
5827
5828 return p;
38033c37
PZ
5829
5830idle:
cbce1a68
PZ
5831 /*
5832 * This is OK, because current is on_cpu, which avoids it being picked
5833 * for load-balance and preemption/IRQs are still disabled avoiding
5834 * further scheduler activity on it and we're being very careful to
5835 * re-start the picking loop.
5836 */
e7904a28 5837 lockdep_unpin_lock(&rq->lock, cookie);
e4aa358b 5838 new_tasks = idle_balance(rq);
e7904a28 5839 lockdep_repin_lock(&rq->lock, cookie);
37e117c0
PZ
5840 /*
5841 * Because idle_balance() releases (and re-acquires) rq->lock, it is
5842 * possible for any higher priority task to appear. In that case we
5843 * must re-start the pick_next_entity() loop.
5844 */
e4aa358b 5845 if (new_tasks < 0)
37e117c0
PZ
5846 return RETRY_TASK;
5847
e4aa358b 5848 if (new_tasks > 0)
38033c37 5849 goto again;
38033c37
PZ
5850
5851 return NULL;
bf0f6f24
IM
5852}
5853
5854/*
5855 * Account for a descheduled task:
5856 */
31ee529c 5857static void put_prev_task_fair(struct rq *rq, struct task_struct *prev)
bf0f6f24
IM
5858{
5859 struct sched_entity *se = &prev->se;
5860 struct cfs_rq *cfs_rq;
5861
5862 for_each_sched_entity(se) {
5863 cfs_rq = cfs_rq_of(se);
ab6cde26 5864 put_prev_entity(cfs_rq, se);
bf0f6f24
IM
5865 }
5866}
5867
ac53db59
RR
5868/*
5869 * sched_yield() is very simple
5870 *
5871 * The magic of dealing with the ->skip buddy is in pick_next_entity.
5872 */
5873static void yield_task_fair(struct rq *rq)
5874{
5875 struct task_struct *curr = rq->curr;
5876 struct cfs_rq *cfs_rq = task_cfs_rq(curr);
5877 struct sched_entity *se = &curr->se;
5878
5879 /*
5880 * Are we the only task in the tree?
5881 */
5882 if (unlikely(rq->nr_running == 1))
5883 return;
5884
5885 clear_buddies(cfs_rq, se);
5886
5887 if (curr->policy != SCHED_BATCH) {
5888 update_rq_clock(rq);
5889 /*
5890 * Update run-time statistics of the 'current'.
5891 */
5892 update_curr(cfs_rq);
916671c0
MG
5893 /*
5894 * Tell update_rq_clock() that we've just updated,
5895 * so we don't do microscopic update in schedule()
5896 * and double the fastpath cost.
5897 */
9edfbfed 5898 rq_clock_skip_update(rq, true);
ac53db59
RR
5899 }
5900
5901 set_skip_buddy(se);
5902}
5903
d95f4122
MG
5904static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preempt)
5905{
5906 struct sched_entity *se = &p->se;
5907
5238cdd3
PT
5908 /* throttled hierarchies are not runnable */
5909 if (!se->on_rq || throttled_hierarchy(cfs_rq_of(se)))
d95f4122
MG
5910 return false;
5911
5912 /* Tell the scheduler that we'd really like pse to run next. */
5913 set_next_buddy(se);
5914
d95f4122
MG
5915 yield_task_fair(rq);
5916
5917 return true;
5918}
5919
681f3e68 5920#ifdef CONFIG_SMP
bf0f6f24 5921/**************************************************
e9c84cb8
PZ
5922 * Fair scheduling class load-balancing methods.
5923 *
5924 * BASICS
5925 *
5926 * The purpose of load-balancing is to achieve the same basic fairness the
5927 * per-cpu scheduler provides, namely provide a proportional amount of compute
5928 * time to each task. This is expressed in the following equation:
5929 *
5930 * W_i,n/P_i == W_j,n/P_j for all i,j (1)
5931 *
5932 * Where W_i,n is the n-th weight average for cpu i. The instantaneous weight
5933 * W_i,0 is defined as:
5934 *
5935 * W_i,0 = \Sum_j w_i,j (2)
5936 *
5937 * Where w_i,j is the weight of the j-th runnable task on cpu i. This weight
1c3de5e1 5938 * is derived from the nice value as per sched_prio_to_weight[].
e9c84cb8
PZ
5939 *
5940 * The weight average is an exponential decay average of the instantaneous
5941 * weight:
5942 *
5943 * W'_i,n = (2^n - 1) / 2^n * W_i,n + 1 / 2^n * W_i,0 (3)
5944 *
ced549fa 5945 * C_i is the compute capacity of cpu i, typically it is the
e9c84cb8
PZ
5946 * fraction of 'recent' time available for SCHED_OTHER task execution. But it
5947 * can also include other factors [XXX].
5948 *
5949 * To achieve this balance we define a measure of imbalance which follows
5950 * directly from (1):
5951 *
ced549fa 5952 * imb_i,j = max{ avg(W/C), W_i/C_i } - min{ avg(W/C), W_j/C_j } (4)
e9c84cb8
PZ
5953 *
5954 * We them move tasks around to minimize the imbalance. In the continuous
5955 * function space it is obvious this converges, in the discrete case we get
5956 * a few fun cases generally called infeasible weight scenarios.
5957 *
5958 * [XXX expand on:
5959 * - infeasible weights;
5960 * - local vs global optima in the discrete case. ]
5961 *
5962 *
5963 * SCHED DOMAINS
5964 *
5965 * In order to solve the imbalance equation (4), and avoid the obvious O(n^2)
5966 * for all i,j solution, we create a tree of cpus that follows the hardware
5967 * topology where each level pairs two lower groups (or better). This results
5968 * in O(log n) layers. Furthermore we reduce the number of cpus going up the
5969 * tree to only the first of the previous level and we decrease the frequency
5970 * of load-balance at each level inv. proportional to the number of cpus in
5971 * the groups.
5972 *
5973 * This yields:
5974 *
5975 * log_2 n 1 n
5976 * \Sum { --- * --- * 2^i } = O(n) (5)
5977 * i = 0 2^i 2^i
5978 * `- size of each group
5979 * | | `- number of cpus doing load-balance
5980 * | `- freq
5981 * `- sum over all levels
5982 *
5983 * Coupled with a limit on how many tasks we can migrate every balance pass,
5984 * this makes (5) the runtime complexity of the balancer.
5985 *
5986 * An important property here is that each CPU is still (indirectly) connected
5987 * to every other cpu in at most O(log n) steps:
5988 *
5989 * The adjacency matrix of the resulting graph is given by:
5990 *
5991 * log_2 n
5992 * A_i,j = \Union (i % 2^k == 0) && i / 2^(k+1) == j / 2^(k+1) (6)
5993 * k = 0
5994 *
5995 * And you'll find that:
5996 *
5997 * A^(log_2 n)_i,j != 0 for all i,j (7)
5998 *
5999 * Showing there's indeed a path between every cpu in at most O(log n) steps.
6000 * The task movement gives a factor of O(m), giving a convergence complexity
6001 * of:
6002 *
6003 * O(nm log n), n := nr_cpus, m := nr_tasks (8)
6004 *
6005 *
6006 * WORK CONSERVING
6007 *
6008 * In order to avoid CPUs going idle while there's still work to do, new idle
6009 * balancing is more aggressive and has the newly idle cpu iterate up the domain
6010 * tree itself instead of relying on other CPUs to bring it work.
6011 *
6012 * This adds some complexity to both (5) and (8) but it reduces the total idle
6013 * time.
6014 *
6015 * [XXX more?]
6016 *
6017 *
6018 * CGROUPS
6019 *
6020 * Cgroups make a horror show out of (2), instead of a simple sum we get:
6021 *
6022 * s_k,i
6023 * W_i,0 = \Sum_j \Prod_k w_k * ----- (9)
6024 * S_k
6025 *
6026 * Where
6027 *
6028 * s_k,i = \Sum_j w_i,j,k and S_k = \Sum_i s_k,i (10)
6029 *
6030 * w_i,j,k is the weight of the j-th runnable task in the k-th cgroup on cpu i.
6031 *
6032 * The big problem is S_k, its a global sum needed to compute a local (W_i)
6033 * property.
6034 *
6035 * [XXX write more on how we solve this.. _after_ merging pjt's patches that
6036 * rewrite all of this once again.]
6037 */
bf0f6f24 6038
ed387b78
HS
6039static unsigned long __read_mostly max_load_balance_interval = HZ/10;
6040
0ec8aa00
PZ
6041enum fbq_type { regular, remote, all };
6042
ddcdf6e7 6043#define LBF_ALL_PINNED 0x01
367456c7 6044#define LBF_NEED_BREAK 0x02
6263322c
PZ
6045#define LBF_DST_PINNED 0x04
6046#define LBF_SOME_PINNED 0x08
ddcdf6e7
PZ
6047
6048struct lb_env {
6049 struct sched_domain *sd;
6050
ddcdf6e7 6051 struct rq *src_rq;
85c1e7da 6052 int src_cpu;
ddcdf6e7
PZ
6053
6054 int dst_cpu;
6055 struct rq *dst_rq;
6056
88b8dac0
SV
6057 struct cpumask *dst_grpmask;
6058 int new_dst_cpu;
ddcdf6e7 6059 enum cpu_idle_type idle;
bd939f45 6060 long imbalance;
b9403130
MW
6061 /* The set of CPUs under consideration for load-balancing */
6062 struct cpumask *cpus;
6063
ddcdf6e7 6064 unsigned int flags;
367456c7
PZ
6065
6066 unsigned int loop;
6067 unsigned int loop_break;
6068 unsigned int loop_max;
0ec8aa00
PZ
6069
6070 enum fbq_type fbq_type;
163122b7 6071 struct list_head tasks;
ddcdf6e7
PZ
6072};
6073
029632fb
PZ
6074/*
6075 * Is this task likely cache-hot:
6076 */
5d5e2b1b 6077static int task_hot(struct task_struct *p, struct lb_env *env)
029632fb
PZ
6078{
6079 s64 delta;
6080
e5673f28
KT
6081 lockdep_assert_held(&env->src_rq->lock);
6082
029632fb
PZ
6083 if (p->sched_class != &fair_sched_class)
6084 return 0;
6085
6086 if (unlikely(p->policy == SCHED_IDLE))
6087 return 0;
6088
6089 /*
6090 * Buddy candidates are cache hot:
6091 */
5d5e2b1b 6092 if (sched_feat(CACHE_HOT_BUDDY) && env->dst_rq->nr_running &&
029632fb
PZ
6093 (&p->se == cfs_rq_of(&p->se)->next ||
6094 &p->se == cfs_rq_of(&p->se)->last))
6095 return 1;
6096
6097 if (sysctl_sched_migration_cost == -1)
6098 return 1;
6099 if (sysctl_sched_migration_cost == 0)
6100 return 0;
6101
5d5e2b1b 6102 delta = rq_clock_task(env->src_rq) - p->se.exec_start;
029632fb
PZ
6103
6104 return delta < (s64)sysctl_sched_migration_cost;
6105}
6106
3a7053b3 6107#ifdef CONFIG_NUMA_BALANCING
c1ceac62 6108/*
2a1ed24c
SD
6109 * Returns 1, if task migration degrades locality
6110 * Returns 0, if task migration improves locality i.e migration preferred.
6111 * Returns -1, if task migration is not affected by locality.
c1ceac62 6112 */
2a1ed24c 6113static int migrate_degrades_locality(struct task_struct *p, struct lb_env *env)
3a7053b3 6114{
b1ad065e 6115 struct numa_group *numa_group = rcu_dereference(p->numa_group);
c1ceac62 6116 unsigned long src_faults, dst_faults;
3a7053b3
MG
6117 int src_nid, dst_nid;
6118
2a595721 6119 if (!static_branch_likely(&sched_numa_balancing))
2a1ed24c
SD
6120 return -1;
6121
c3b9bc5b 6122 if (!p->numa_faults || !(env->sd->flags & SD_NUMA))
2a1ed24c 6123 return -1;
7a0f3083
MG
6124
6125 src_nid = cpu_to_node(env->src_cpu);
6126 dst_nid = cpu_to_node(env->dst_cpu);
6127
83e1d2cd 6128 if (src_nid == dst_nid)
2a1ed24c 6129 return -1;
7a0f3083 6130
2a1ed24c
SD
6131 /* Migrating away from the preferred node is always bad. */
6132 if (src_nid == p->numa_preferred_nid) {
6133 if (env->src_rq->nr_running > env->src_rq->nr_preferred_running)
6134 return 1;
6135 else
6136 return -1;
6137 }
b1ad065e 6138
c1ceac62
RR
6139 /* Encourage migration to the preferred node. */
6140 if (dst_nid == p->numa_preferred_nid)
2a1ed24c 6141 return 0;
b1ad065e 6142
c1ceac62
RR
6143 if (numa_group) {
6144 src_faults = group_faults(p, src_nid);
6145 dst_faults = group_faults(p, dst_nid);
6146 } else {
6147 src_faults = task_faults(p, src_nid);
6148 dst_faults = task_faults(p, dst_nid);
b1ad065e
RR
6149 }
6150
c1ceac62 6151 return dst_faults < src_faults;
7a0f3083
MG
6152}
6153
3a7053b3 6154#else
2a1ed24c 6155static inline int migrate_degrades_locality(struct task_struct *p,
3a7053b3
MG
6156 struct lb_env *env)
6157{
2a1ed24c 6158 return -1;
7a0f3083 6159}
3a7053b3
MG
6160#endif
6161
1e3c88bd
PZ
6162/*
6163 * can_migrate_task - may task p from runqueue rq be migrated to this_cpu?
6164 */
6165static
8e45cb54 6166int can_migrate_task(struct task_struct *p, struct lb_env *env)
1e3c88bd 6167{
2a1ed24c 6168 int tsk_cache_hot;
e5673f28
KT
6169
6170 lockdep_assert_held(&env->src_rq->lock);
6171
1e3c88bd
PZ
6172 /*
6173 * We do not migrate tasks that are:
d3198084 6174 * 1) throttled_lb_pair, or
1e3c88bd 6175 * 2) cannot be migrated to this CPU due to cpus_allowed, or
d3198084
JK
6176 * 3) running (obviously), or
6177 * 4) are cache-hot on their current CPU.
1e3c88bd 6178 */
d3198084
JK
6179 if (throttled_lb_pair(task_group(p), env->src_cpu, env->dst_cpu))
6180 return 0;
6181
ddcdf6e7 6182 if (!cpumask_test_cpu(env->dst_cpu, tsk_cpus_allowed(p))) {
e02e60c1 6183 int cpu;
88b8dac0 6184
41acab88 6185 schedstat_inc(p, se.statistics.nr_failed_migrations_affine);
88b8dac0 6186
6263322c
PZ
6187 env->flags |= LBF_SOME_PINNED;
6188
88b8dac0
SV
6189 /*
6190 * Remember if this task can be migrated to any other cpu in
6191 * our sched_group. We may want to revisit it if we couldn't
6192 * meet load balance goals by pulling other tasks on src_cpu.
6193 *
6194 * Also avoid computing new_dst_cpu if we have already computed
6195 * one in current iteration.
6196 */
6263322c 6197 if (!env->dst_grpmask || (env->flags & LBF_DST_PINNED))
88b8dac0
SV
6198 return 0;
6199
e02e60c1
JK
6200 /* Prevent to re-select dst_cpu via env's cpus */
6201 for_each_cpu_and(cpu, env->dst_grpmask, env->cpus) {
6202 if (cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) {
6263322c 6203 env->flags |= LBF_DST_PINNED;
e02e60c1
JK
6204 env->new_dst_cpu = cpu;
6205 break;
6206 }
88b8dac0 6207 }
e02e60c1 6208
1e3c88bd
PZ
6209 return 0;
6210 }
88b8dac0
SV
6211
6212 /* Record that we found atleast one task that could run on dst_cpu */
8e45cb54 6213 env->flags &= ~LBF_ALL_PINNED;
1e3c88bd 6214
ddcdf6e7 6215 if (task_running(env->src_rq, p)) {
41acab88 6216 schedstat_inc(p, se.statistics.nr_failed_migrations_running);
1e3c88bd
PZ
6217 return 0;
6218 }
6219
6220 /*
6221 * Aggressive migration if:
3a7053b3
MG
6222 * 1) destination numa is preferred
6223 * 2) task is cache cold, or
6224 * 3) too many balance attempts have failed.
1e3c88bd 6225 */
2a1ed24c
SD
6226 tsk_cache_hot = migrate_degrades_locality(p, env);
6227 if (tsk_cache_hot == -1)
6228 tsk_cache_hot = task_hot(p, env);
3a7053b3 6229
2a1ed24c 6230 if (tsk_cache_hot <= 0 ||
7a96c231 6231 env->sd->nr_balance_failed > env->sd->cache_nice_tries) {
2a1ed24c 6232 if (tsk_cache_hot == 1) {
3a7053b3
MG
6233 schedstat_inc(env->sd, lb_hot_gained[env->idle]);
6234 schedstat_inc(p, se.statistics.nr_forced_migrations);
6235 }
1e3c88bd
PZ
6236 return 1;
6237 }
6238
4e2dcb73
ZH
6239 schedstat_inc(p, se.statistics.nr_failed_migrations_hot);
6240 return 0;
1e3c88bd
PZ
6241}
6242
897c395f 6243/*
163122b7
KT
6244 * detach_task() -- detach the task for the migration specified in env
6245 */
6246static void detach_task(struct task_struct *p, struct lb_env *env)
6247{
6248 lockdep_assert_held(&env->src_rq->lock);
6249
163122b7 6250 p->on_rq = TASK_ON_RQ_MIGRATING;
3ea94de1 6251 deactivate_task(env->src_rq, p, 0);
163122b7
KT
6252 set_task_cpu(p, env->dst_cpu);
6253}
6254
897c395f 6255/*
e5673f28 6256 * detach_one_task() -- tries to dequeue exactly one task from env->src_rq, as
897c395f 6257 * part of active balancing operations within "domain".
897c395f 6258 *
e5673f28 6259 * Returns a task if successful and NULL otherwise.
897c395f 6260 */
e5673f28 6261static struct task_struct *detach_one_task(struct lb_env *env)
897c395f
PZ
6262{
6263 struct task_struct *p, *n;
897c395f 6264
e5673f28
KT
6265 lockdep_assert_held(&env->src_rq->lock);
6266
367456c7 6267 list_for_each_entry_safe(p, n, &env->src_rq->cfs_tasks, se.group_node) {
367456c7
PZ
6268 if (!can_migrate_task(p, env))
6269 continue;
897c395f 6270
163122b7 6271 detach_task(p, env);
e5673f28 6272
367456c7 6273 /*
e5673f28 6274 * Right now, this is only the second place where
163122b7 6275 * lb_gained[env->idle] is updated (other is detach_tasks)
e5673f28 6276 * so we can safely collect stats here rather than
163122b7 6277 * inside detach_tasks().
367456c7
PZ
6278 */
6279 schedstat_inc(env->sd, lb_gained[env->idle]);
e5673f28 6280 return p;
897c395f 6281 }
e5673f28 6282 return NULL;
897c395f
PZ
6283}
6284
eb95308e
PZ
6285static const unsigned int sched_nr_migrate_break = 32;
6286
5d6523eb 6287/*
163122b7
KT
6288 * detach_tasks() -- tries to detach up to imbalance weighted load from
6289 * busiest_rq, as part of a balancing operation within domain "sd".
5d6523eb 6290 *
163122b7 6291 * Returns number of detached tasks if successful and 0 otherwise.
5d6523eb 6292 */
163122b7 6293static int detach_tasks(struct lb_env *env)
1e3c88bd 6294{
5d6523eb
PZ
6295 struct list_head *tasks = &env->src_rq->cfs_tasks;
6296 struct task_struct *p;
367456c7 6297 unsigned long load;
163122b7
KT
6298 int detached = 0;
6299
6300 lockdep_assert_held(&env->src_rq->lock);
1e3c88bd 6301
bd939f45 6302 if (env->imbalance <= 0)
5d6523eb 6303 return 0;
1e3c88bd 6304
5d6523eb 6305 while (!list_empty(tasks)) {
985d3a4c
YD
6306 /*
6307 * We don't want to steal all, otherwise we may be treated likewise,
6308 * which could at worst lead to a livelock crash.
6309 */
6310 if (env->idle != CPU_NOT_IDLE && env->src_rq->nr_running <= 1)
6311 break;
6312
5d6523eb 6313 p = list_first_entry(tasks, struct task_struct, se.group_node);
1e3c88bd 6314
367456c7
PZ
6315 env->loop++;
6316 /* We've more or less seen every task there is, call it quits */
5d6523eb 6317 if (env->loop > env->loop_max)
367456c7 6318 break;
5d6523eb
PZ
6319
6320 /* take a breather every nr_migrate tasks */
367456c7 6321 if (env->loop > env->loop_break) {
eb95308e 6322 env->loop_break += sched_nr_migrate_break;
8e45cb54 6323 env->flags |= LBF_NEED_BREAK;
ee00e66f 6324 break;
a195f004 6325 }
1e3c88bd 6326
d3198084 6327 if (!can_migrate_task(p, env))
367456c7
PZ
6328 goto next;
6329
6330 load = task_h_load(p);
5d6523eb 6331
eb95308e 6332 if (sched_feat(LB_MIN) && load < 16 && !env->sd->nr_balance_failed)
367456c7
PZ
6333 goto next;
6334
bd939f45 6335 if ((load / 2) > env->imbalance)
367456c7 6336 goto next;
1e3c88bd 6337
163122b7
KT
6338 detach_task(p, env);
6339 list_add(&p->se.group_node, &env->tasks);
6340
6341 detached++;
bd939f45 6342 env->imbalance -= load;
1e3c88bd
PZ
6343
6344#ifdef CONFIG_PREEMPT
ee00e66f
PZ
6345 /*
6346 * NEWIDLE balancing is a source of latency, so preemptible
163122b7 6347 * kernels will stop after the first task is detached to minimize
ee00e66f
PZ
6348 * the critical section.
6349 */
5d6523eb 6350 if (env->idle == CPU_NEWLY_IDLE)
ee00e66f 6351 break;
1e3c88bd
PZ
6352#endif
6353
ee00e66f
PZ
6354 /*
6355 * We only want to steal up to the prescribed amount of
6356 * weighted load.
6357 */
bd939f45 6358 if (env->imbalance <= 0)
ee00e66f 6359 break;
367456c7
PZ
6360
6361 continue;
6362next:
5d6523eb 6363 list_move_tail(&p->se.group_node, tasks);
1e3c88bd 6364 }
5d6523eb 6365
1e3c88bd 6366 /*
163122b7
KT
6367 * Right now, this is one of only two places we collect this stat
6368 * so we can safely collect detach_one_task() stats here rather
6369 * than inside detach_one_task().
1e3c88bd 6370 */
163122b7 6371 schedstat_add(env->sd, lb_gained[env->idle], detached);
1e3c88bd 6372
163122b7
KT
6373 return detached;
6374}
6375
6376/*
6377 * attach_task() -- attach the task detached by detach_task() to its new rq.
6378 */
6379static void attach_task(struct rq *rq, struct task_struct *p)
6380{
6381 lockdep_assert_held(&rq->lock);
6382
6383 BUG_ON(task_rq(p) != rq);
163122b7 6384 activate_task(rq, p, 0);
3ea94de1 6385 p->on_rq = TASK_ON_RQ_QUEUED;
163122b7
KT
6386 check_preempt_curr(rq, p, 0);
6387}
6388
6389/*
6390 * attach_one_task() -- attaches the task returned from detach_one_task() to
6391 * its new rq.
6392 */
6393static void attach_one_task(struct rq *rq, struct task_struct *p)
6394{
6395 raw_spin_lock(&rq->lock);
6396 attach_task(rq, p);
6397 raw_spin_unlock(&rq->lock);
6398}
6399
6400/*
6401 * attach_tasks() -- attaches all tasks detached by detach_tasks() to their
6402 * new rq.
6403 */
6404static void attach_tasks(struct lb_env *env)
6405{
6406 struct list_head *tasks = &env->tasks;
6407 struct task_struct *p;
6408
6409 raw_spin_lock(&env->dst_rq->lock);
6410
6411 while (!list_empty(tasks)) {
6412 p = list_first_entry(tasks, struct task_struct, se.group_node);
6413 list_del_init(&p->se.group_node);
1e3c88bd 6414
163122b7
KT
6415 attach_task(env->dst_rq, p);
6416 }
6417
6418 raw_spin_unlock(&env->dst_rq->lock);
1e3c88bd
PZ
6419}
6420
230059de 6421#ifdef CONFIG_FAIR_GROUP_SCHED
48a16753 6422static void update_blocked_averages(int cpu)
9e3081ca 6423{
9e3081ca 6424 struct rq *rq = cpu_rq(cpu);
48a16753
PT
6425 struct cfs_rq *cfs_rq;
6426 unsigned long flags;
9e3081ca 6427
48a16753
PT
6428 raw_spin_lock_irqsave(&rq->lock, flags);
6429 update_rq_clock(rq);
9d89c257 6430
9763b67f
PZ
6431 /*
6432 * Iterates the task_group tree in a bottom up fashion, see
6433 * list_add_leaf_cfs_rq() for details.
6434 */
64660c86 6435 for_each_leaf_cfs_rq(rq, cfs_rq) {
9d89c257
YD
6436 /* throttled entities do not contribute to load */
6437 if (throttled_hierarchy(cfs_rq))
6438 continue;
48a16753 6439
a2c6c91f 6440 if (update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq, true))
9d89c257
YD
6441 update_tg_load_avg(cfs_rq, 0);
6442 }
48a16753 6443 raw_spin_unlock_irqrestore(&rq->lock, flags);
9e3081ca
PZ
6444}
6445
9763b67f 6446/*
68520796 6447 * Compute the hierarchical load factor for cfs_rq and all its ascendants.
9763b67f
PZ
6448 * This needs to be done in a top-down fashion because the load of a child
6449 * group is a fraction of its parents load.
6450 */
68520796 6451static void update_cfs_rq_h_load(struct cfs_rq *cfs_rq)
9763b67f 6452{
68520796
VD
6453 struct rq *rq = rq_of(cfs_rq);
6454 struct sched_entity *se = cfs_rq->tg->se[cpu_of(rq)];
a35b6466 6455 unsigned long now = jiffies;
68520796 6456 unsigned long load;
a35b6466 6457
68520796 6458 if (cfs_rq->last_h_load_update == now)
a35b6466
PZ
6459 return;
6460
68520796
VD
6461 cfs_rq->h_load_next = NULL;
6462 for_each_sched_entity(se) {
6463 cfs_rq = cfs_rq_of(se);
6464 cfs_rq->h_load_next = se;
6465 if (cfs_rq->last_h_load_update == now)
6466 break;
6467 }
a35b6466 6468
68520796 6469 if (!se) {
7ea241af 6470 cfs_rq->h_load = cfs_rq_load_avg(cfs_rq);
68520796
VD
6471 cfs_rq->last_h_load_update = now;
6472 }
6473
6474 while ((se = cfs_rq->h_load_next) != NULL) {
6475 load = cfs_rq->h_load;
7ea241af
YD
6476 load = div64_ul(load * se->avg.load_avg,
6477 cfs_rq_load_avg(cfs_rq) + 1);
68520796
VD
6478 cfs_rq = group_cfs_rq(se);
6479 cfs_rq->h_load = load;
6480 cfs_rq->last_h_load_update = now;
6481 }
9763b67f
PZ
6482}
6483
367456c7 6484static unsigned long task_h_load(struct task_struct *p)
230059de 6485{
367456c7 6486 struct cfs_rq *cfs_rq = task_cfs_rq(p);
230059de 6487
68520796 6488 update_cfs_rq_h_load(cfs_rq);
9d89c257 6489 return div64_ul(p->se.avg.load_avg * cfs_rq->h_load,
7ea241af 6490 cfs_rq_load_avg(cfs_rq) + 1);
230059de
PZ
6491}
6492#else
48a16753 6493static inline void update_blocked_averages(int cpu)
9e3081ca 6494{
6c1d47c0
VG
6495 struct rq *rq = cpu_rq(cpu);
6496 struct cfs_rq *cfs_rq = &rq->cfs;
6497 unsigned long flags;
6498
6499 raw_spin_lock_irqsave(&rq->lock, flags);
6500 update_rq_clock(rq);
a2c6c91f 6501 update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq, true);
6c1d47c0 6502 raw_spin_unlock_irqrestore(&rq->lock, flags);
9e3081ca
PZ
6503}
6504
367456c7 6505static unsigned long task_h_load(struct task_struct *p)
1e3c88bd 6506{
9d89c257 6507 return p->se.avg.load_avg;
1e3c88bd 6508}
230059de 6509#endif
1e3c88bd 6510
1e3c88bd 6511/********** Helpers for find_busiest_group ************************/
caeb178c
RR
6512
6513enum group_type {
6514 group_other = 0,
6515 group_imbalanced,
6516 group_overloaded,
6517};
6518
1e3c88bd
PZ
6519/*
6520 * sg_lb_stats - stats of a sched_group required for load_balancing
6521 */
6522struct sg_lb_stats {
6523 unsigned long avg_load; /*Avg load across the CPUs of the group */
6524 unsigned long group_load; /* Total load over the CPUs of the group */
1e3c88bd 6525 unsigned long sum_weighted_load; /* Weighted load of group's tasks */
56cf515b 6526 unsigned long load_per_task;
63b2ca30 6527 unsigned long group_capacity;
9e91d61d 6528 unsigned long group_util; /* Total utilization of the group */
147c5fc2 6529 unsigned int sum_nr_running; /* Nr tasks running in the group */
147c5fc2
PZ
6530 unsigned int idle_cpus;
6531 unsigned int group_weight;
caeb178c 6532 enum group_type group_type;
ea67821b 6533 int group_no_capacity;
0ec8aa00
PZ
6534#ifdef CONFIG_NUMA_BALANCING
6535 unsigned int nr_numa_running;
6536 unsigned int nr_preferred_running;
6537#endif
1e3c88bd
PZ
6538};
6539
56cf515b
JK
6540/*
6541 * sd_lb_stats - Structure to store the statistics of a sched_domain
6542 * during load balancing.
6543 */
6544struct sd_lb_stats {
6545 struct sched_group *busiest; /* Busiest group in this sd */
6546 struct sched_group *local; /* Local group in this sd */
6547 unsigned long total_load; /* Total load of all groups in sd */
63b2ca30 6548 unsigned long total_capacity; /* Total capacity of all groups in sd */
56cf515b
JK
6549 unsigned long avg_load; /* Average load across all groups in sd */
6550
56cf515b 6551 struct sg_lb_stats busiest_stat;/* Statistics of the busiest group */
147c5fc2 6552 struct sg_lb_stats local_stat; /* Statistics of the local group */
56cf515b
JK
6553};
6554
147c5fc2
PZ
6555static inline void init_sd_lb_stats(struct sd_lb_stats *sds)
6556{
6557 /*
6558 * Skimp on the clearing to avoid duplicate work. We can avoid clearing
6559 * local_stat because update_sg_lb_stats() does a full clear/assignment.
6560 * We must however clear busiest_stat::avg_load because
6561 * update_sd_pick_busiest() reads this before assignment.
6562 */
6563 *sds = (struct sd_lb_stats){
6564 .busiest = NULL,
6565 .local = NULL,
6566 .total_load = 0UL,
63b2ca30 6567 .total_capacity = 0UL,
147c5fc2
PZ
6568 .busiest_stat = {
6569 .avg_load = 0UL,
caeb178c
RR
6570 .sum_nr_running = 0,
6571 .group_type = group_other,
147c5fc2
PZ
6572 },
6573 };
6574}
6575
1e3c88bd
PZ
6576/**
6577 * get_sd_load_idx - Obtain the load index for a given sched domain.
6578 * @sd: The sched_domain whose load_idx is to be obtained.
ed1b7732 6579 * @idle: The idle status of the CPU for whose sd load_idx is obtained.
e69f6186
YB
6580 *
6581 * Return: The load index.
1e3c88bd
PZ
6582 */
6583static inline int get_sd_load_idx(struct sched_domain *sd,
6584 enum cpu_idle_type idle)
6585{
6586 int load_idx;
6587
6588 switch (idle) {
6589 case CPU_NOT_IDLE:
6590 load_idx = sd->busy_idx;
6591 break;
6592
6593 case CPU_NEWLY_IDLE:
6594 load_idx = sd->newidle_idx;
6595 break;
6596 default:
6597 load_idx = sd->idle_idx;
6598 break;
6599 }
6600
6601 return load_idx;
6602}
6603
ced549fa 6604static unsigned long scale_rt_capacity(int cpu)
1e3c88bd
PZ
6605{
6606 struct rq *rq = cpu_rq(cpu);
b5b4860d 6607 u64 total, used, age_stamp, avg;
cadefd3d 6608 s64 delta;
1e3c88bd 6609
b654f7de
PZ
6610 /*
6611 * Since we're reading these variables without serialization make sure
6612 * we read them once before doing sanity checks on them.
6613 */
316c1608
JL
6614 age_stamp = READ_ONCE(rq->age_stamp);
6615 avg = READ_ONCE(rq->rt_avg);
cebde6d6 6616 delta = __rq_clock_broken(rq) - age_stamp;
b654f7de 6617
cadefd3d
PZ
6618 if (unlikely(delta < 0))
6619 delta = 0;
6620
6621 total = sched_avg_period() + delta;
aa483808 6622
b5b4860d 6623 used = div_u64(avg, total);
1e3c88bd 6624
b5b4860d
VG
6625 if (likely(used < SCHED_CAPACITY_SCALE))
6626 return SCHED_CAPACITY_SCALE - used;
1e3c88bd 6627
b5b4860d 6628 return 1;
1e3c88bd
PZ
6629}
6630
ced549fa 6631static void update_cpu_capacity(struct sched_domain *sd, int cpu)
1e3c88bd 6632{
8cd5601c 6633 unsigned long capacity = arch_scale_cpu_capacity(sd, cpu);
1e3c88bd
PZ
6634 struct sched_group *sdg = sd->groups;
6635
ca6d75e6 6636 cpu_rq(cpu)->cpu_capacity_orig = capacity;
9d5efe05 6637
ced549fa 6638 capacity *= scale_rt_capacity(cpu);
ca8ce3d0 6639 capacity >>= SCHED_CAPACITY_SHIFT;
1e3c88bd 6640
ced549fa
NP
6641 if (!capacity)
6642 capacity = 1;
1e3c88bd 6643
ced549fa
NP
6644 cpu_rq(cpu)->cpu_capacity = capacity;
6645 sdg->sgc->capacity = capacity;
1e3c88bd
PZ
6646}
6647
63b2ca30 6648void update_group_capacity(struct sched_domain *sd, int cpu)
1e3c88bd
PZ
6649{
6650 struct sched_domain *child = sd->child;
6651 struct sched_group *group, *sdg = sd->groups;
dc7ff76e 6652 unsigned long capacity;
4ec4412e
VG
6653 unsigned long interval;
6654
6655 interval = msecs_to_jiffies(sd->balance_interval);
6656 interval = clamp(interval, 1UL, max_load_balance_interval);
63b2ca30 6657 sdg->sgc->next_update = jiffies + interval;
1e3c88bd
PZ
6658
6659 if (!child) {
ced549fa 6660 update_cpu_capacity(sd, cpu);
1e3c88bd
PZ
6661 return;
6662 }
6663
dc7ff76e 6664 capacity = 0;
1e3c88bd 6665
74a5ce20
PZ
6666 if (child->flags & SD_OVERLAP) {
6667 /*
6668 * SD_OVERLAP domains cannot assume that child groups
6669 * span the current group.
6670 */
6671
863bffc8 6672 for_each_cpu(cpu, sched_group_cpus(sdg)) {
63b2ca30 6673 struct sched_group_capacity *sgc;
9abf24d4 6674 struct rq *rq = cpu_rq(cpu);
863bffc8 6675
9abf24d4 6676 /*
63b2ca30 6677 * build_sched_domains() -> init_sched_groups_capacity()
9abf24d4
SD
6678 * gets here before we've attached the domains to the
6679 * runqueues.
6680 *
ced549fa
NP
6681 * Use capacity_of(), which is set irrespective of domains
6682 * in update_cpu_capacity().
9abf24d4 6683 *
dc7ff76e 6684 * This avoids capacity from being 0 and
9abf24d4 6685 * causing divide-by-zero issues on boot.
9abf24d4
SD
6686 */
6687 if (unlikely(!rq->sd)) {
ced549fa 6688 capacity += capacity_of(cpu);
9abf24d4
SD
6689 continue;
6690 }
863bffc8 6691
63b2ca30 6692 sgc = rq->sd->groups->sgc;
63b2ca30 6693 capacity += sgc->capacity;
863bffc8 6694 }
74a5ce20
PZ
6695 } else {
6696 /*
6697 * !SD_OVERLAP domains can assume that child groups
6698 * span the current group.
6699 */
6700
6701 group = child->groups;
6702 do {
63b2ca30 6703 capacity += group->sgc->capacity;
74a5ce20
PZ
6704 group = group->next;
6705 } while (group != child->groups);
6706 }
1e3c88bd 6707
63b2ca30 6708 sdg->sgc->capacity = capacity;
1e3c88bd
PZ
6709}
6710
9d5efe05 6711/*
ea67821b
VG
6712 * Check whether the capacity of the rq has been noticeably reduced by side
6713 * activity. The imbalance_pct is used for the threshold.
6714 * Return true is the capacity is reduced
9d5efe05
SV
6715 */
6716static inline int
ea67821b 6717check_cpu_capacity(struct rq *rq, struct sched_domain *sd)
9d5efe05 6718{
ea67821b
VG
6719 return ((rq->cpu_capacity * sd->imbalance_pct) <
6720 (rq->cpu_capacity_orig * 100));
9d5efe05
SV
6721}
6722
30ce5dab
PZ
6723/*
6724 * Group imbalance indicates (and tries to solve) the problem where balancing
6725 * groups is inadequate due to tsk_cpus_allowed() constraints.
6726 *
6727 * Imagine a situation of two groups of 4 cpus each and 4 tasks each with a
6728 * cpumask covering 1 cpu of the first group and 3 cpus of the second group.
6729 * Something like:
6730 *
6731 * { 0 1 2 3 } { 4 5 6 7 }
6732 * * * * *
6733 *
6734 * If we were to balance group-wise we'd place two tasks in the first group and
6735 * two tasks in the second group. Clearly this is undesired as it will overload
6736 * cpu 3 and leave one of the cpus in the second group unused.
6737 *
6738 * The current solution to this issue is detecting the skew in the first group
6263322c
PZ
6739 * by noticing the lower domain failed to reach balance and had difficulty
6740 * moving tasks due to affinity constraints.
30ce5dab
PZ
6741 *
6742 * When this is so detected; this group becomes a candidate for busiest; see
ed1b7732 6743 * update_sd_pick_busiest(). And calculate_imbalance() and
6263322c 6744 * find_busiest_group() avoid some of the usual balance conditions to allow it
30ce5dab
PZ
6745 * to create an effective group imbalance.
6746 *
6747 * This is a somewhat tricky proposition since the next run might not find the
6748 * group imbalance and decide the groups need to be balanced again. A most
6749 * subtle and fragile situation.
6750 */
6751
6263322c 6752static inline int sg_imbalanced(struct sched_group *group)
30ce5dab 6753{
63b2ca30 6754 return group->sgc->imbalance;
30ce5dab
PZ
6755}
6756
b37d9316 6757/*
ea67821b
VG
6758 * group_has_capacity returns true if the group has spare capacity that could
6759 * be used by some tasks.
6760 * We consider that a group has spare capacity if the * number of task is
9e91d61d
DE
6761 * smaller than the number of CPUs or if the utilization is lower than the
6762 * available capacity for CFS tasks.
ea67821b
VG
6763 * For the latter, we use a threshold to stabilize the state, to take into
6764 * account the variance of the tasks' load and to return true if the available
6765 * capacity in meaningful for the load balancer.
6766 * As an example, an available capacity of 1% can appear but it doesn't make
6767 * any benefit for the load balance.
b37d9316 6768 */
ea67821b
VG
6769static inline bool
6770group_has_capacity(struct lb_env *env, struct sg_lb_stats *sgs)
b37d9316 6771{
ea67821b
VG
6772 if (sgs->sum_nr_running < sgs->group_weight)
6773 return true;
c61037e9 6774
ea67821b 6775 if ((sgs->group_capacity * 100) >
9e91d61d 6776 (sgs->group_util * env->sd->imbalance_pct))
ea67821b 6777 return true;
b37d9316 6778
ea67821b
VG
6779 return false;
6780}
6781
6782/*
6783 * group_is_overloaded returns true if the group has more tasks than it can
6784 * handle.
6785 * group_is_overloaded is not equals to !group_has_capacity because a group
6786 * with the exact right number of tasks, has no more spare capacity but is not
6787 * overloaded so both group_has_capacity and group_is_overloaded return
6788 * false.
6789 */
6790static inline bool
6791group_is_overloaded(struct lb_env *env, struct sg_lb_stats *sgs)
6792{
6793 if (sgs->sum_nr_running <= sgs->group_weight)
6794 return false;
b37d9316 6795
ea67821b 6796 if ((sgs->group_capacity * 100) <
9e91d61d 6797 (sgs->group_util * env->sd->imbalance_pct))
ea67821b 6798 return true;
b37d9316 6799
ea67821b 6800 return false;
b37d9316
PZ
6801}
6802
79a89f92
LY
6803static inline enum
6804group_type group_classify(struct sched_group *group,
6805 struct sg_lb_stats *sgs)
caeb178c 6806{
ea67821b 6807 if (sgs->group_no_capacity)
caeb178c
RR
6808 return group_overloaded;
6809
6810 if (sg_imbalanced(group))
6811 return group_imbalanced;
6812
6813 return group_other;
6814}
6815
1e3c88bd
PZ
6816/**
6817 * update_sg_lb_stats - Update sched_group's statistics for load balancing.
cd96891d 6818 * @env: The load balancing environment.
1e3c88bd 6819 * @group: sched_group whose statistics are to be updated.
1e3c88bd 6820 * @load_idx: Load index of sched_domain of this_cpu for load calc.
1e3c88bd 6821 * @local_group: Does group contain this_cpu.
1e3c88bd 6822 * @sgs: variable to hold the statistics for this group.
cd3bd4e6 6823 * @overload: Indicate more than one runnable task for any CPU.
1e3c88bd 6824 */
bd939f45
PZ
6825static inline void update_sg_lb_stats(struct lb_env *env,
6826 struct sched_group *group, int load_idx,
4486edd1
TC
6827 int local_group, struct sg_lb_stats *sgs,
6828 bool *overload)
1e3c88bd 6829{
30ce5dab 6830 unsigned long load;
a426f99c 6831 int i, nr_running;
1e3c88bd 6832
b72ff13c
PZ
6833 memset(sgs, 0, sizeof(*sgs));
6834
b9403130 6835 for_each_cpu_and(i, sched_group_cpus(group), env->cpus) {
1e3c88bd
PZ
6836 struct rq *rq = cpu_rq(i);
6837
1e3c88bd 6838 /* Bias balancing toward cpus of our domain */
6263322c 6839 if (local_group)
04f733b4 6840 load = target_load(i, load_idx);
6263322c 6841 else
1e3c88bd 6842 load = source_load(i, load_idx);
1e3c88bd
PZ
6843
6844 sgs->group_load += load;
9e91d61d 6845 sgs->group_util += cpu_util(i);
65fdac08 6846 sgs->sum_nr_running += rq->cfs.h_nr_running;
4486edd1 6847
a426f99c
WL
6848 nr_running = rq->nr_running;
6849 if (nr_running > 1)
4486edd1
TC
6850 *overload = true;
6851
0ec8aa00
PZ
6852#ifdef CONFIG_NUMA_BALANCING
6853 sgs->nr_numa_running += rq->nr_numa_running;
6854 sgs->nr_preferred_running += rq->nr_preferred_running;
6855#endif
1e3c88bd 6856 sgs->sum_weighted_load += weighted_cpuload(i);
a426f99c
WL
6857 /*
6858 * No need to call idle_cpu() if nr_running is not 0
6859 */
6860 if (!nr_running && idle_cpu(i))
aae6d3dd 6861 sgs->idle_cpus++;
1e3c88bd
PZ
6862 }
6863
63b2ca30
NP
6864 /* Adjust by relative CPU capacity of the group */
6865 sgs->group_capacity = group->sgc->capacity;
ca8ce3d0 6866 sgs->avg_load = (sgs->group_load*SCHED_CAPACITY_SCALE) / sgs->group_capacity;
1e3c88bd 6867
dd5feea1 6868 if (sgs->sum_nr_running)
38d0f770 6869 sgs->load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running;
1e3c88bd 6870
aae6d3dd 6871 sgs->group_weight = group->group_weight;
b37d9316 6872
ea67821b 6873 sgs->group_no_capacity = group_is_overloaded(env, sgs);
79a89f92 6874 sgs->group_type = group_classify(group, sgs);
1e3c88bd
PZ
6875}
6876
532cb4c4
MN
6877/**
6878 * update_sd_pick_busiest - return 1 on busiest group
cd96891d 6879 * @env: The load balancing environment.
532cb4c4
MN
6880 * @sds: sched_domain statistics
6881 * @sg: sched_group candidate to be checked for being the busiest
b6b12294 6882 * @sgs: sched_group statistics
532cb4c4
MN
6883 *
6884 * Determine if @sg is a busier group than the previously selected
6885 * busiest group.
e69f6186
YB
6886 *
6887 * Return: %true if @sg is a busier group than the previously selected
6888 * busiest group. %false otherwise.
532cb4c4 6889 */
bd939f45 6890static bool update_sd_pick_busiest(struct lb_env *env,
532cb4c4
MN
6891 struct sd_lb_stats *sds,
6892 struct sched_group *sg,
bd939f45 6893 struct sg_lb_stats *sgs)
532cb4c4 6894{
caeb178c 6895 struct sg_lb_stats *busiest = &sds->busiest_stat;
532cb4c4 6896
caeb178c 6897 if (sgs->group_type > busiest->group_type)
532cb4c4
MN
6898 return true;
6899
caeb178c
RR
6900 if (sgs->group_type < busiest->group_type)
6901 return false;
6902
6903 if (sgs->avg_load <= busiest->avg_load)
6904 return false;
6905
6906 /* This is the busiest node in its class. */
6907 if (!(env->sd->flags & SD_ASYM_PACKING))
532cb4c4
MN
6908 return true;
6909
1f621e02
SD
6910 /* No ASYM_PACKING if target cpu is already busy */
6911 if (env->idle == CPU_NOT_IDLE)
6912 return true;
532cb4c4
MN
6913 /*
6914 * ASYM_PACKING needs to move all the work to the lowest
6915 * numbered CPUs in the group, therefore mark all groups
6916 * higher than ourself as busy.
6917 */
caeb178c 6918 if (sgs->sum_nr_running && env->dst_cpu < group_first_cpu(sg)) {
532cb4c4
MN
6919 if (!sds->busiest)
6920 return true;
6921
1f621e02
SD
6922 /* Prefer to move from highest possible cpu's work */
6923 if (group_first_cpu(sds->busiest) < group_first_cpu(sg))
532cb4c4
MN
6924 return true;
6925 }
6926
6927 return false;
6928}
6929
0ec8aa00
PZ
6930#ifdef CONFIG_NUMA_BALANCING
6931static inline enum fbq_type fbq_classify_group(struct sg_lb_stats *sgs)
6932{
6933 if (sgs->sum_nr_running > sgs->nr_numa_running)
6934 return regular;
6935 if (sgs->sum_nr_running > sgs->nr_preferred_running)
6936 return remote;
6937 return all;
6938}
6939
6940static inline enum fbq_type fbq_classify_rq(struct rq *rq)
6941{
6942 if (rq->nr_running > rq->nr_numa_running)
6943 return regular;
6944 if (rq->nr_running > rq->nr_preferred_running)
6945 return remote;
6946 return all;
6947}
6948#else
6949static inline enum fbq_type fbq_classify_group(struct sg_lb_stats *sgs)
6950{
6951 return all;
6952}
6953
6954static inline enum fbq_type fbq_classify_rq(struct rq *rq)
6955{
6956 return regular;
6957}
6958#endif /* CONFIG_NUMA_BALANCING */
6959
1e3c88bd 6960/**
461819ac 6961 * update_sd_lb_stats - Update sched_domain's statistics for load balancing.
cd96891d 6962 * @env: The load balancing environment.
1e3c88bd
PZ
6963 * @sds: variable to hold the statistics for this sched_domain.
6964 */
0ec8aa00 6965static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sds)
1e3c88bd 6966{
bd939f45
PZ
6967 struct sched_domain *child = env->sd->child;
6968 struct sched_group *sg = env->sd->groups;
56cf515b 6969 struct sg_lb_stats tmp_sgs;
1e3c88bd 6970 int load_idx, prefer_sibling = 0;
4486edd1 6971 bool overload = false;
1e3c88bd
PZ
6972
6973 if (child && child->flags & SD_PREFER_SIBLING)
6974 prefer_sibling = 1;
6975
bd939f45 6976 load_idx = get_sd_load_idx(env->sd, env->idle);
1e3c88bd
PZ
6977
6978 do {
56cf515b 6979 struct sg_lb_stats *sgs = &tmp_sgs;
1e3c88bd
PZ
6980 int local_group;
6981
bd939f45 6982 local_group = cpumask_test_cpu(env->dst_cpu, sched_group_cpus(sg));
56cf515b
JK
6983 if (local_group) {
6984 sds->local = sg;
6985 sgs = &sds->local_stat;
b72ff13c
PZ
6986
6987 if (env->idle != CPU_NEWLY_IDLE ||
63b2ca30
NP
6988 time_after_eq(jiffies, sg->sgc->next_update))
6989 update_group_capacity(env->sd, env->dst_cpu);
56cf515b 6990 }
1e3c88bd 6991
4486edd1
TC
6992 update_sg_lb_stats(env, sg, load_idx, local_group, sgs,
6993 &overload);
1e3c88bd 6994
b72ff13c
PZ
6995 if (local_group)
6996 goto next_group;
6997
1e3c88bd
PZ
6998 /*
6999 * In case the child domain prefers tasks go to siblings
ea67821b 7000 * first, lower the sg capacity so that we'll try
75dd321d
NR
7001 * and move all the excess tasks away. We lower the capacity
7002 * of a group only if the local group has the capacity to fit
ea67821b
VG
7003 * these excess tasks. The extra check prevents the case where
7004 * you always pull from the heaviest group when it is already
7005 * under-utilized (possible with a large weight task outweighs
7006 * the tasks on the system).
1e3c88bd 7007 */
b72ff13c 7008 if (prefer_sibling && sds->local &&
ea67821b
VG
7009 group_has_capacity(env, &sds->local_stat) &&
7010 (sgs->sum_nr_running > 1)) {
7011 sgs->group_no_capacity = 1;
79a89f92 7012 sgs->group_type = group_classify(sg, sgs);
cb0b9f24 7013 }
1e3c88bd 7014
b72ff13c 7015 if (update_sd_pick_busiest(env, sds, sg, sgs)) {
532cb4c4 7016 sds->busiest = sg;
56cf515b 7017 sds->busiest_stat = *sgs;
1e3c88bd
PZ
7018 }
7019
b72ff13c
PZ
7020next_group:
7021 /* Now, start updating sd_lb_stats */
7022 sds->total_load += sgs->group_load;
63b2ca30 7023 sds->total_capacity += sgs->group_capacity;
b72ff13c 7024
532cb4c4 7025 sg = sg->next;
bd939f45 7026 } while (sg != env->sd->groups);
0ec8aa00
PZ
7027
7028 if (env->sd->flags & SD_NUMA)
7029 env->fbq_type = fbq_classify_group(&sds->busiest_stat);
4486edd1
TC
7030
7031 if (!env->sd->parent) {
7032 /* update overload indicator if we are at root domain */
7033 if (env->dst_rq->rd->overload != overload)
7034 env->dst_rq->rd->overload = overload;
7035 }
7036
532cb4c4
MN
7037}
7038
532cb4c4
MN
7039/**
7040 * check_asym_packing - Check to see if the group is packed into the
7041 * sched doman.
7042 *
7043 * This is primarily intended to used at the sibling level. Some
7044 * cores like POWER7 prefer to use lower numbered SMT threads. In the
7045 * case of POWER7, it can move to lower SMT modes only when higher
7046 * threads are idle. When in lower SMT modes, the threads will
7047 * perform better since they share less core resources. Hence when we
7048 * have idle threads, we want them to be the higher ones.
7049 *
7050 * This packing function is run on idle threads. It checks to see if
7051 * the busiest CPU in this domain (core in the P7 case) has a higher
7052 * CPU number than the packing function is being run on. Here we are
7053 * assuming lower CPU number will be equivalent to lower a SMT thread
7054 * number.
7055 *
e69f6186 7056 * Return: 1 when packing is required and a task should be moved to
b6b12294
MN
7057 * this CPU. The amount of the imbalance is returned in *imbalance.
7058 *
cd96891d 7059 * @env: The load balancing environment.
532cb4c4 7060 * @sds: Statistics of the sched_domain which is to be packed
532cb4c4 7061 */
bd939f45 7062static int check_asym_packing(struct lb_env *env, struct sd_lb_stats *sds)
532cb4c4
MN
7063{
7064 int busiest_cpu;
7065
bd939f45 7066 if (!(env->sd->flags & SD_ASYM_PACKING))
532cb4c4
MN
7067 return 0;
7068
1f621e02
SD
7069 if (env->idle == CPU_NOT_IDLE)
7070 return 0;
7071
532cb4c4
MN
7072 if (!sds->busiest)
7073 return 0;
7074
7075 busiest_cpu = group_first_cpu(sds->busiest);
bd939f45 7076 if (env->dst_cpu > busiest_cpu)
532cb4c4
MN
7077 return 0;
7078
bd939f45 7079 env->imbalance = DIV_ROUND_CLOSEST(
63b2ca30 7080 sds->busiest_stat.avg_load * sds->busiest_stat.group_capacity,
ca8ce3d0 7081 SCHED_CAPACITY_SCALE);
bd939f45 7082
532cb4c4 7083 return 1;
1e3c88bd
PZ
7084}
7085
7086/**
7087 * fix_small_imbalance - Calculate the minor imbalance that exists
7088 * amongst the groups of a sched_domain, during
7089 * load balancing.
cd96891d 7090 * @env: The load balancing environment.
1e3c88bd 7091 * @sds: Statistics of the sched_domain whose imbalance is to be calculated.
1e3c88bd 7092 */
bd939f45
PZ
7093static inline
7094void fix_small_imbalance(struct lb_env *env, struct sd_lb_stats *sds)
1e3c88bd 7095{
63b2ca30 7096 unsigned long tmp, capa_now = 0, capa_move = 0;
1e3c88bd 7097 unsigned int imbn = 2;
dd5feea1 7098 unsigned long scaled_busy_load_per_task;
56cf515b 7099 struct sg_lb_stats *local, *busiest;
1e3c88bd 7100
56cf515b
JK
7101 local = &sds->local_stat;
7102 busiest = &sds->busiest_stat;
1e3c88bd 7103
56cf515b
JK
7104 if (!local->sum_nr_running)
7105 local->load_per_task = cpu_avg_load_per_task(env->dst_cpu);
7106 else if (busiest->load_per_task > local->load_per_task)
7107 imbn = 1;
dd5feea1 7108
56cf515b 7109 scaled_busy_load_per_task =
ca8ce3d0 7110 (busiest->load_per_task * SCHED_CAPACITY_SCALE) /
63b2ca30 7111 busiest->group_capacity;
56cf515b 7112
3029ede3
VD
7113 if (busiest->avg_load + scaled_busy_load_per_task >=
7114 local->avg_load + (scaled_busy_load_per_task * imbn)) {
56cf515b 7115 env->imbalance = busiest->load_per_task;
1e3c88bd
PZ
7116 return;
7117 }
7118
7119 /*
7120 * OK, we don't have enough imbalance to justify moving tasks,
ced549fa 7121 * however we may be able to increase total CPU capacity used by
1e3c88bd
PZ
7122 * moving them.
7123 */
7124
63b2ca30 7125 capa_now += busiest->group_capacity *
56cf515b 7126 min(busiest->load_per_task, busiest->avg_load);
63b2ca30 7127 capa_now += local->group_capacity *
56cf515b 7128 min(local->load_per_task, local->avg_load);
ca8ce3d0 7129 capa_now /= SCHED_CAPACITY_SCALE;
1e3c88bd
PZ
7130
7131 /* Amount of load we'd subtract */
a2cd4260 7132 if (busiest->avg_load > scaled_busy_load_per_task) {
63b2ca30 7133 capa_move += busiest->group_capacity *
56cf515b 7134 min(busiest->load_per_task,
a2cd4260 7135 busiest->avg_load - scaled_busy_load_per_task);
56cf515b 7136 }
1e3c88bd
PZ
7137
7138 /* Amount of load we'd add */
63b2ca30 7139 if (busiest->avg_load * busiest->group_capacity <
ca8ce3d0 7140 busiest->load_per_task * SCHED_CAPACITY_SCALE) {
63b2ca30
NP
7141 tmp = (busiest->avg_load * busiest->group_capacity) /
7142 local->group_capacity;
56cf515b 7143 } else {
ca8ce3d0 7144 tmp = (busiest->load_per_task * SCHED_CAPACITY_SCALE) /
63b2ca30 7145 local->group_capacity;
56cf515b 7146 }
63b2ca30 7147 capa_move += local->group_capacity *
3ae11c90 7148 min(local->load_per_task, local->avg_load + tmp);
ca8ce3d0 7149 capa_move /= SCHED_CAPACITY_SCALE;
1e3c88bd
PZ
7150
7151 /* Move if we gain throughput */
63b2ca30 7152 if (capa_move > capa_now)
56cf515b 7153 env->imbalance = busiest->load_per_task;
1e3c88bd
PZ
7154}
7155
7156/**
7157 * calculate_imbalance - Calculate the amount of imbalance present within the
7158 * groups of a given sched_domain during load balance.
bd939f45 7159 * @env: load balance environment
1e3c88bd 7160 * @sds: statistics of the sched_domain whose imbalance is to be calculated.
1e3c88bd 7161 */
bd939f45 7162static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *sds)
1e3c88bd 7163{
dd5feea1 7164 unsigned long max_pull, load_above_capacity = ~0UL;
56cf515b
JK
7165 struct sg_lb_stats *local, *busiest;
7166
7167 local = &sds->local_stat;
56cf515b 7168 busiest = &sds->busiest_stat;
dd5feea1 7169
caeb178c 7170 if (busiest->group_type == group_imbalanced) {
30ce5dab
PZ
7171 /*
7172 * In the group_imb case we cannot rely on group-wide averages
7173 * to ensure cpu-load equilibrium, look at wider averages. XXX
7174 */
56cf515b
JK
7175 busiest->load_per_task =
7176 min(busiest->load_per_task, sds->avg_load);
dd5feea1
SS
7177 }
7178
1e3c88bd 7179 /*
885e542c
DE
7180 * Avg load of busiest sg can be less and avg load of local sg can
7181 * be greater than avg load across all sgs of sd because avg load
7182 * factors in sg capacity and sgs with smaller group_type are
7183 * skipped when updating the busiest sg:
1e3c88bd 7184 */
b1885550
VD
7185 if (busiest->avg_load <= sds->avg_load ||
7186 local->avg_load >= sds->avg_load) {
bd939f45
PZ
7187 env->imbalance = 0;
7188 return fix_small_imbalance(env, sds);
1e3c88bd
PZ
7189 }
7190
9a5d9ba6
PZ
7191 /*
7192 * If there aren't any idle cpus, avoid creating some.
7193 */
7194 if (busiest->group_type == group_overloaded &&
7195 local->group_type == group_overloaded) {
1be0eb2a 7196 load_above_capacity = busiest->sum_nr_running * SCHED_CAPACITY_SCALE;
cfa10334 7197 if (load_above_capacity > busiest->group_capacity) {
ea67821b 7198 load_above_capacity -= busiest->group_capacity;
cfa10334
MR
7199 load_above_capacity *= NICE_0_LOAD;
7200 load_above_capacity /= busiest->group_capacity;
7201 } else
ea67821b 7202 load_above_capacity = ~0UL;
dd5feea1
SS
7203 }
7204
7205 /*
7206 * We're trying to get all the cpus to the average_load, so we don't
7207 * want to push ourselves above the average load, nor do we wish to
7208 * reduce the max loaded cpu below the average load. At the same time,
0a9b23ce
DE
7209 * we also don't want to reduce the group load below the group
7210 * capacity. Thus we look for the minimum possible imbalance.
dd5feea1 7211 */
30ce5dab 7212 max_pull = min(busiest->avg_load - sds->avg_load, load_above_capacity);
1e3c88bd
PZ
7213
7214 /* How much load to actually move to equalise the imbalance */
56cf515b 7215 env->imbalance = min(
63b2ca30
NP
7216 max_pull * busiest->group_capacity,
7217 (sds->avg_load - local->avg_load) * local->group_capacity
ca8ce3d0 7218 ) / SCHED_CAPACITY_SCALE;
1e3c88bd
PZ
7219
7220 /*
7221 * if *imbalance is less than the average load per runnable task
25985edc 7222 * there is no guarantee that any tasks will be moved so we'll have
1e3c88bd
PZ
7223 * a think about bumping its value to force at least one task to be
7224 * moved
7225 */
56cf515b 7226 if (env->imbalance < busiest->load_per_task)
bd939f45 7227 return fix_small_imbalance(env, sds);
1e3c88bd 7228}
fab47622 7229
1e3c88bd
PZ
7230/******* find_busiest_group() helpers end here *********************/
7231
7232/**
7233 * find_busiest_group - Returns the busiest group within the sched_domain
0a9b23ce 7234 * if there is an imbalance.
1e3c88bd
PZ
7235 *
7236 * Also calculates the amount of weighted load which should be moved
7237 * to restore balance.
7238 *
cd96891d 7239 * @env: The load balancing environment.
1e3c88bd 7240 *
e69f6186 7241 * Return: - The busiest group if imbalance exists.
1e3c88bd 7242 */
56cf515b 7243static struct sched_group *find_busiest_group(struct lb_env *env)
1e3c88bd 7244{
56cf515b 7245 struct sg_lb_stats *local, *busiest;
1e3c88bd
PZ
7246 struct sd_lb_stats sds;
7247
147c5fc2 7248 init_sd_lb_stats(&sds);
1e3c88bd
PZ
7249
7250 /*
7251 * Compute the various statistics relavent for load balancing at
7252 * this level.
7253 */
23f0d209 7254 update_sd_lb_stats(env, &sds);
56cf515b
JK
7255 local = &sds.local_stat;
7256 busiest = &sds.busiest_stat;
1e3c88bd 7257
ea67821b 7258 /* ASYM feature bypasses nice load balance check */
1f621e02 7259 if (check_asym_packing(env, &sds))
532cb4c4
MN
7260 return sds.busiest;
7261
cc57aa8f 7262 /* There is no busy sibling group to pull tasks from */
56cf515b 7263 if (!sds.busiest || busiest->sum_nr_running == 0)
1e3c88bd
PZ
7264 goto out_balanced;
7265
ca8ce3d0
NP
7266 sds.avg_load = (SCHED_CAPACITY_SCALE * sds.total_load)
7267 / sds.total_capacity;
b0432d8f 7268
866ab43e
PZ
7269 /*
7270 * If the busiest group is imbalanced the below checks don't
30ce5dab 7271 * work because they assume all things are equal, which typically
866ab43e
PZ
7272 * isn't true due to cpus_allowed constraints and the like.
7273 */
caeb178c 7274 if (busiest->group_type == group_imbalanced)
866ab43e
PZ
7275 goto force_balance;
7276
cc57aa8f 7277 /* SD_BALANCE_NEWIDLE trumps SMP nice when underutilized */
ea67821b
VG
7278 if (env->idle == CPU_NEWLY_IDLE && group_has_capacity(env, local) &&
7279 busiest->group_no_capacity)
fab47622
NR
7280 goto force_balance;
7281
cc57aa8f 7282 /*
9c58c79a 7283 * If the local group is busier than the selected busiest group
cc57aa8f
PZ
7284 * don't try and pull any tasks.
7285 */
56cf515b 7286 if (local->avg_load >= busiest->avg_load)
1e3c88bd
PZ
7287 goto out_balanced;
7288
cc57aa8f
PZ
7289 /*
7290 * Don't pull any tasks if this group is already above the domain
7291 * average load.
7292 */
56cf515b 7293 if (local->avg_load >= sds.avg_load)
1e3c88bd
PZ
7294 goto out_balanced;
7295
bd939f45 7296 if (env->idle == CPU_IDLE) {
aae6d3dd 7297 /*
43f4d666
VG
7298 * This cpu is idle. If the busiest group is not overloaded
7299 * and there is no imbalance between this and busiest group
7300 * wrt idle cpus, it is balanced. The imbalance becomes
7301 * significant if the diff is greater than 1 otherwise we
7302 * might end up to just move the imbalance on another group
aae6d3dd 7303 */
43f4d666
VG
7304 if ((busiest->group_type != group_overloaded) &&
7305 (local->idle_cpus <= (busiest->idle_cpus + 1)))
aae6d3dd 7306 goto out_balanced;
c186fafe
PZ
7307 } else {
7308 /*
7309 * In the CPU_NEWLY_IDLE, CPU_NOT_IDLE cases, use
7310 * imbalance_pct to be conservative.
7311 */
56cf515b
JK
7312 if (100 * busiest->avg_load <=
7313 env->sd->imbalance_pct * local->avg_load)
c186fafe 7314 goto out_balanced;
aae6d3dd 7315 }
1e3c88bd 7316
fab47622 7317force_balance:
1e3c88bd 7318 /* Looks like there is an imbalance. Compute it */
bd939f45 7319 calculate_imbalance(env, &sds);
1e3c88bd
PZ
7320 return sds.busiest;
7321
7322out_balanced:
bd939f45 7323 env->imbalance = 0;
1e3c88bd
PZ
7324 return NULL;
7325}
7326
7327/*
7328 * find_busiest_queue - find the busiest runqueue among the cpus in group.
7329 */
bd939f45 7330static struct rq *find_busiest_queue(struct lb_env *env,
b9403130 7331 struct sched_group *group)
1e3c88bd
PZ
7332{
7333 struct rq *busiest = NULL, *rq;
ced549fa 7334 unsigned long busiest_load = 0, busiest_capacity = 1;
1e3c88bd
PZ
7335 int i;
7336
6906a408 7337 for_each_cpu_and(i, sched_group_cpus(group), env->cpus) {
ea67821b 7338 unsigned long capacity, wl;
0ec8aa00
PZ
7339 enum fbq_type rt;
7340
7341 rq = cpu_rq(i);
7342 rt = fbq_classify_rq(rq);
1e3c88bd 7343
0ec8aa00
PZ
7344 /*
7345 * We classify groups/runqueues into three groups:
7346 * - regular: there are !numa tasks
7347 * - remote: there are numa tasks that run on the 'wrong' node
7348 * - all: there is no distinction
7349 *
7350 * In order to avoid migrating ideally placed numa tasks,
7351 * ignore those when there's better options.
7352 *
7353 * If we ignore the actual busiest queue to migrate another
7354 * task, the next balance pass can still reduce the busiest
7355 * queue by moving tasks around inside the node.
7356 *
7357 * If we cannot move enough load due to this classification
7358 * the next pass will adjust the group classification and
7359 * allow migration of more tasks.
7360 *
7361 * Both cases only affect the total convergence complexity.
7362 */
7363 if (rt > env->fbq_type)
7364 continue;
7365
ced549fa 7366 capacity = capacity_of(i);
9d5efe05 7367
6e40f5bb 7368 wl = weighted_cpuload(i);
1e3c88bd 7369
6e40f5bb
TG
7370 /*
7371 * When comparing with imbalance, use weighted_cpuload()
ced549fa 7372 * which is not scaled with the cpu capacity.
6e40f5bb 7373 */
ea67821b
VG
7374
7375 if (rq->nr_running == 1 && wl > env->imbalance &&
7376 !check_cpu_capacity(rq, env->sd))
1e3c88bd
PZ
7377 continue;
7378
6e40f5bb
TG
7379 /*
7380 * For the load comparisons with the other cpu's, consider
ced549fa
NP
7381 * the weighted_cpuload() scaled with the cpu capacity, so
7382 * that the load can be moved away from the cpu that is
7383 * potentially running at a lower capacity.
95a79b80 7384 *
ced549fa 7385 * Thus we're looking for max(wl_i / capacity_i), crosswise
95a79b80 7386 * multiplication to rid ourselves of the division works out
ced549fa
NP
7387 * to: wl_i * capacity_j > wl_j * capacity_i; where j is
7388 * our previous maximum.
6e40f5bb 7389 */
ced549fa 7390 if (wl * busiest_capacity > busiest_load * capacity) {
95a79b80 7391 busiest_load = wl;
ced549fa 7392 busiest_capacity = capacity;
1e3c88bd
PZ
7393 busiest = rq;
7394 }
7395 }
7396
7397 return busiest;
7398}
7399
7400/*
7401 * Max backoff if we encounter pinned tasks. Pretty arbitrary value, but
7402 * so long as it is large enough.
7403 */
7404#define MAX_PINNED_INTERVAL 512
7405
7406/* Working cpumask for load_balance and load_balance_newidle. */
e6252c3e 7407DEFINE_PER_CPU(cpumask_var_t, load_balance_mask);
1e3c88bd 7408
bd939f45 7409static int need_active_balance(struct lb_env *env)
1af3ed3d 7410{
bd939f45
PZ
7411 struct sched_domain *sd = env->sd;
7412
7413 if (env->idle == CPU_NEWLY_IDLE) {
532cb4c4
MN
7414
7415 /*
7416 * ASYM_PACKING needs to force migrate tasks from busy but
7417 * higher numbered CPUs in order to pack all tasks in the
7418 * lowest numbered CPUs.
7419 */
bd939f45 7420 if ((sd->flags & SD_ASYM_PACKING) && env->src_cpu > env->dst_cpu)
532cb4c4 7421 return 1;
1af3ed3d
PZ
7422 }
7423
1aaf90a4
VG
7424 /*
7425 * The dst_cpu is idle and the src_cpu CPU has only 1 CFS task.
7426 * It's worth migrating the task if the src_cpu's capacity is reduced
7427 * because of other sched_class or IRQs if more capacity stays
7428 * available on dst_cpu.
7429 */
7430 if ((env->idle != CPU_NOT_IDLE) &&
7431 (env->src_rq->cfs.h_nr_running == 1)) {
7432 if ((check_cpu_capacity(env->src_rq, sd)) &&
7433 (capacity_of(env->src_cpu)*sd->imbalance_pct < capacity_of(env->dst_cpu)*100))
7434 return 1;
7435 }
7436
1af3ed3d
PZ
7437 return unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2);
7438}
7439
969c7921
TH
7440static int active_load_balance_cpu_stop(void *data);
7441
23f0d209
JK
7442static int should_we_balance(struct lb_env *env)
7443{
7444 struct sched_group *sg = env->sd->groups;
7445 struct cpumask *sg_cpus, *sg_mask;
7446 int cpu, balance_cpu = -1;
7447
7448 /*
7449 * In the newly idle case, we will allow all the cpu's
7450 * to do the newly idle load balance.
7451 */
7452 if (env->idle == CPU_NEWLY_IDLE)
7453 return 1;
7454
7455 sg_cpus = sched_group_cpus(sg);
7456 sg_mask = sched_group_mask(sg);
7457 /* Try to find first idle cpu */
7458 for_each_cpu_and(cpu, sg_cpus, env->cpus) {
7459 if (!cpumask_test_cpu(cpu, sg_mask) || !idle_cpu(cpu))
7460 continue;
7461
7462 balance_cpu = cpu;
7463 break;
7464 }
7465
7466 if (balance_cpu == -1)
7467 balance_cpu = group_balance_cpu(sg);
7468
7469 /*
7470 * First idle cpu or the first cpu(busiest) in this sched group
7471 * is eligible for doing load balancing at this and above domains.
7472 */
b0cff9d8 7473 return balance_cpu == env->dst_cpu;
23f0d209
JK
7474}
7475
1e3c88bd
PZ
7476/*
7477 * Check this_cpu to ensure it is balanced within domain. Attempt to move
7478 * tasks if there is an imbalance.
7479 */
7480static int load_balance(int this_cpu, struct rq *this_rq,
7481 struct sched_domain *sd, enum cpu_idle_type idle,
23f0d209 7482 int *continue_balancing)
1e3c88bd 7483{
88b8dac0 7484 int ld_moved, cur_ld_moved, active_balance = 0;
6263322c 7485 struct sched_domain *sd_parent = sd->parent;
1e3c88bd 7486 struct sched_group *group;
1e3c88bd
PZ
7487 struct rq *busiest;
7488 unsigned long flags;
4ba29684 7489 struct cpumask *cpus = this_cpu_cpumask_var_ptr(load_balance_mask);
1e3c88bd 7490
8e45cb54
PZ
7491 struct lb_env env = {
7492 .sd = sd,
ddcdf6e7
PZ
7493 .dst_cpu = this_cpu,
7494 .dst_rq = this_rq,
88b8dac0 7495 .dst_grpmask = sched_group_cpus(sd->groups),
8e45cb54 7496 .idle = idle,
eb95308e 7497 .loop_break = sched_nr_migrate_break,
b9403130 7498 .cpus = cpus,
0ec8aa00 7499 .fbq_type = all,
163122b7 7500 .tasks = LIST_HEAD_INIT(env.tasks),
8e45cb54
PZ
7501 };
7502
cfc03118
JK
7503 /*
7504 * For NEWLY_IDLE load_balancing, we don't need to consider
7505 * other cpus in our group
7506 */
e02e60c1 7507 if (idle == CPU_NEWLY_IDLE)
cfc03118 7508 env.dst_grpmask = NULL;
cfc03118 7509
1e3c88bd
PZ
7510 cpumask_copy(cpus, cpu_active_mask);
7511
1e3c88bd
PZ
7512 schedstat_inc(sd, lb_count[idle]);
7513
7514redo:
23f0d209
JK
7515 if (!should_we_balance(&env)) {
7516 *continue_balancing = 0;
1e3c88bd 7517 goto out_balanced;
23f0d209 7518 }
1e3c88bd 7519
23f0d209 7520 group = find_busiest_group(&env);
1e3c88bd
PZ
7521 if (!group) {
7522 schedstat_inc(sd, lb_nobusyg[idle]);
7523 goto out_balanced;
7524 }
7525
b9403130 7526 busiest = find_busiest_queue(&env, group);
1e3c88bd
PZ
7527 if (!busiest) {
7528 schedstat_inc(sd, lb_nobusyq[idle]);
7529 goto out_balanced;
7530 }
7531
78feefc5 7532 BUG_ON(busiest == env.dst_rq);
1e3c88bd 7533
bd939f45 7534 schedstat_add(sd, lb_imbalance[idle], env.imbalance);
1e3c88bd 7535
1aaf90a4
VG
7536 env.src_cpu = busiest->cpu;
7537 env.src_rq = busiest;
7538
1e3c88bd
PZ
7539 ld_moved = 0;
7540 if (busiest->nr_running > 1) {
7541 /*
7542 * Attempt to move tasks. If find_busiest_group has found
7543 * an imbalance but busiest->nr_running <= 1, the group is
7544 * still unbalanced. ld_moved simply stays zero, so it is
7545 * correctly treated as an imbalance.
7546 */
8e45cb54 7547 env.flags |= LBF_ALL_PINNED;
c82513e5 7548 env.loop_max = min(sysctl_sched_nr_migrate, busiest->nr_running);
8e45cb54 7549
5d6523eb 7550more_balance:
163122b7 7551 raw_spin_lock_irqsave(&busiest->lock, flags);
88b8dac0
SV
7552
7553 /*
7554 * cur_ld_moved - load moved in current iteration
7555 * ld_moved - cumulative load moved across iterations
7556 */
163122b7 7557 cur_ld_moved = detach_tasks(&env);
1e3c88bd
PZ
7558
7559 /*
163122b7
KT
7560 * We've detached some tasks from busiest_rq. Every
7561 * task is masked "TASK_ON_RQ_MIGRATING", so we can safely
7562 * unlock busiest->lock, and we are able to be sure
7563 * that nobody can manipulate the tasks in parallel.
7564 * See task_rq_lock() family for the details.
1e3c88bd 7565 */
163122b7
KT
7566
7567 raw_spin_unlock(&busiest->lock);
7568
7569 if (cur_ld_moved) {
7570 attach_tasks(&env);
7571 ld_moved += cur_ld_moved;
7572 }
7573
1e3c88bd 7574 local_irq_restore(flags);
88b8dac0 7575
f1cd0858
JK
7576 if (env.flags & LBF_NEED_BREAK) {
7577 env.flags &= ~LBF_NEED_BREAK;
7578 goto more_balance;
7579 }
7580
88b8dac0
SV
7581 /*
7582 * Revisit (affine) tasks on src_cpu that couldn't be moved to
7583 * us and move them to an alternate dst_cpu in our sched_group
7584 * where they can run. The upper limit on how many times we
7585 * iterate on same src_cpu is dependent on number of cpus in our
7586 * sched_group.
7587 *
7588 * This changes load balance semantics a bit on who can move
7589 * load to a given_cpu. In addition to the given_cpu itself
7590 * (or a ilb_cpu acting on its behalf where given_cpu is
7591 * nohz-idle), we now have balance_cpu in a position to move
7592 * load to given_cpu. In rare situations, this may cause
7593 * conflicts (balance_cpu and given_cpu/ilb_cpu deciding
7594 * _independently_ and at _same_ time to move some load to
7595 * given_cpu) causing exceess load to be moved to given_cpu.
7596 * This however should not happen so much in practice and
7597 * moreover subsequent load balance cycles should correct the
7598 * excess load moved.
7599 */
6263322c 7600 if ((env.flags & LBF_DST_PINNED) && env.imbalance > 0) {
88b8dac0 7601
7aff2e3a
VD
7602 /* Prevent to re-select dst_cpu via env's cpus */
7603 cpumask_clear_cpu(env.dst_cpu, env.cpus);
7604
78feefc5 7605 env.dst_rq = cpu_rq(env.new_dst_cpu);
88b8dac0 7606 env.dst_cpu = env.new_dst_cpu;
6263322c 7607 env.flags &= ~LBF_DST_PINNED;
88b8dac0
SV
7608 env.loop = 0;
7609 env.loop_break = sched_nr_migrate_break;
e02e60c1 7610
88b8dac0
SV
7611 /*
7612 * Go back to "more_balance" rather than "redo" since we
7613 * need to continue with same src_cpu.
7614 */
7615 goto more_balance;
7616 }
1e3c88bd 7617
6263322c
PZ
7618 /*
7619 * We failed to reach balance because of affinity.
7620 */
7621 if (sd_parent) {
63b2ca30 7622 int *group_imbalance = &sd_parent->groups->sgc->imbalance;
6263322c 7623
afdeee05 7624 if ((env.flags & LBF_SOME_PINNED) && env.imbalance > 0)
6263322c 7625 *group_imbalance = 1;
6263322c
PZ
7626 }
7627
1e3c88bd 7628 /* All tasks on this runqueue were pinned by CPU affinity */
8e45cb54 7629 if (unlikely(env.flags & LBF_ALL_PINNED)) {
1e3c88bd 7630 cpumask_clear_cpu(cpu_of(busiest), cpus);
bbf18b19
PN
7631 if (!cpumask_empty(cpus)) {
7632 env.loop = 0;
7633 env.loop_break = sched_nr_migrate_break;
1e3c88bd 7634 goto redo;
bbf18b19 7635 }
afdeee05 7636 goto out_all_pinned;
1e3c88bd
PZ
7637 }
7638 }
7639
7640 if (!ld_moved) {
7641 schedstat_inc(sd, lb_failed[idle]);
58b26c4c
VP
7642 /*
7643 * Increment the failure counter only on periodic balance.
7644 * We do not want newidle balance, which can be very
7645 * frequent, pollute the failure counter causing
7646 * excessive cache_hot migrations and active balances.
7647 */
7648 if (idle != CPU_NEWLY_IDLE)
7649 sd->nr_balance_failed++;
1e3c88bd 7650
bd939f45 7651 if (need_active_balance(&env)) {
1e3c88bd
PZ
7652 raw_spin_lock_irqsave(&busiest->lock, flags);
7653
969c7921
TH
7654 /* don't kick the active_load_balance_cpu_stop,
7655 * if the curr task on busiest cpu can't be
7656 * moved to this_cpu
1e3c88bd
PZ
7657 */
7658 if (!cpumask_test_cpu(this_cpu,
fa17b507 7659 tsk_cpus_allowed(busiest->curr))) {
1e3c88bd
PZ
7660 raw_spin_unlock_irqrestore(&busiest->lock,
7661 flags);
8e45cb54 7662 env.flags |= LBF_ALL_PINNED;
1e3c88bd
PZ
7663 goto out_one_pinned;
7664 }
7665
969c7921
TH
7666 /*
7667 * ->active_balance synchronizes accesses to
7668 * ->active_balance_work. Once set, it's cleared
7669 * only after active load balance is finished.
7670 */
1e3c88bd
PZ
7671 if (!busiest->active_balance) {
7672 busiest->active_balance = 1;
7673 busiest->push_cpu = this_cpu;
7674 active_balance = 1;
7675 }
7676 raw_spin_unlock_irqrestore(&busiest->lock, flags);
969c7921 7677
bd939f45 7678 if (active_balance) {
969c7921
TH
7679 stop_one_cpu_nowait(cpu_of(busiest),
7680 active_load_balance_cpu_stop, busiest,
7681 &busiest->active_balance_work);
bd939f45 7682 }
1e3c88bd 7683
d02c0711 7684 /* We've kicked active balancing, force task migration. */
1e3c88bd
PZ
7685 sd->nr_balance_failed = sd->cache_nice_tries+1;
7686 }
7687 } else
7688 sd->nr_balance_failed = 0;
7689
7690 if (likely(!active_balance)) {
7691 /* We were unbalanced, so reset the balancing interval */
7692 sd->balance_interval = sd->min_interval;
7693 } else {
7694 /*
7695 * If we've begun active balancing, start to back off. This
7696 * case may not be covered by the all_pinned logic if there
7697 * is only 1 task on the busy runqueue (because we don't call
163122b7 7698 * detach_tasks).
1e3c88bd
PZ
7699 */
7700 if (sd->balance_interval < sd->max_interval)
7701 sd->balance_interval *= 2;
7702 }
7703
1e3c88bd
PZ
7704 goto out;
7705
7706out_balanced:
afdeee05
VG
7707 /*
7708 * We reach balance although we may have faced some affinity
7709 * constraints. Clear the imbalance flag if it was set.
7710 */
7711 if (sd_parent) {
7712 int *group_imbalance = &sd_parent->groups->sgc->imbalance;
7713
7714 if (*group_imbalance)
7715 *group_imbalance = 0;
7716 }
7717
7718out_all_pinned:
7719 /*
7720 * We reach balance because all tasks are pinned at this level so
7721 * we can't migrate them. Let the imbalance flag set so parent level
7722 * can try to migrate them.
7723 */
1e3c88bd
PZ
7724 schedstat_inc(sd, lb_balanced[idle]);
7725
7726 sd->nr_balance_failed = 0;
7727
7728out_one_pinned:
7729 /* tune up the balancing interval */
8e45cb54 7730 if (((env.flags & LBF_ALL_PINNED) &&
5b54b56b 7731 sd->balance_interval < MAX_PINNED_INTERVAL) ||
1e3c88bd
PZ
7732 (sd->balance_interval < sd->max_interval))
7733 sd->balance_interval *= 2;
7734
46e49b38 7735 ld_moved = 0;
1e3c88bd 7736out:
1e3c88bd
PZ
7737 return ld_moved;
7738}
7739
52a08ef1
JL
7740static inline unsigned long
7741get_sd_balance_interval(struct sched_domain *sd, int cpu_busy)
7742{
7743 unsigned long interval = sd->balance_interval;
7744
7745 if (cpu_busy)
7746 interval *= sd->busy_factor;
7747
7748 /* scale ms to jiffies */
7749 interval = msecs_to_jiffies(interval);
7750 interval = clamp(interval, 1UL, max_load_balance_interval);
7751
7752 return interval;
7753}
7754
7755static inline void
31851a98 7756update_next_balance(struct sched_domain *sd, unsigned long *next_balance)
52a08ef1
JL
7757{
7758 unsigned long interval, next;
7759
31851a98
LY
7760 /* used by idle balance, so cpu_busy = 0 */
7761 interval = get_sd_balance_interval(sd, 0);
52a08ef1
JL
7762 next = sd->last_balance + interval;
7763
7764 if (time_after(*next_balance, next))
7765 *next_balance = next;
7766}
7767
1e3c88bd
PZ
7768/*
7769 * idle_balance is called by schedule() if this_cpu is about to become
7770 * idle. Attempts to pull tasks from other CPUs.
7771 */
6e83125c 7772static int idle_balance(struct rq *this_rq)
1e3c88bd 7773{
52a08ef1
JL
7774 unsigned long next_balance = jiffies + HZ;
7775 int this_cpu = this_rq->cpu;
1e3c88bd
PZ
7776 struct sched_domain *sd;
7777 int pulled_task = 0;
9bd721c5 7778 u64 curr_cost = 0;
1e3c88bd 7779
6e83125c
PZ
7780 /*
7781 * We must set idle_stamp _before_ calling idle_balance(), such that we
7782 * measure the duration of idle_balance() as idle time.
7783 */
7784 this_rq->idle_stamp = rq_clock(this_rq);
7785
4486edd1
TC
7786 if (this_rq->avg_idle < sysctl_sched_migration_cost ||
7787 !this_rq->rd->overload) {
52a08ef1
JL
7788 rcu_read_lock();
7789 sd = rcu_dereference_check_sched_domain(this_rq->sd);
7790 if (sd)
31851a98 7791 update_next_balance(sd, &next_balance);
52a08ef1
JL
7792 rcu_read_unlock();
7793
6e83125c 7794 goto out;
52a08ef1 7795 }
1e3c88bd 7796
f492e12e
PZ
7797 raw_spin_unlock(&this_rq->lock);
7798
48a16753 7799 update_blocked_averages(this_cpu);
dce840a0 7800 rcu_read_lock();
1e3c88bd 7801 for_each_domain(this_cpu, sd) {
23f0d209 7802 int continue_balancing = 1;
9bd721c5 7803 u64 t0, domain_cost;
1e3c88bd
PZ
7804
7805 if (!(sd->flags & SD_LOAD_BALANCE))
7806 continue;
7807
52a08ef1 7808 if (this_rq->avg_idle < curr_cost + sd->max_newidle_lb_cost) {
31851a98 7809 update_next_balance(sd, &next_balance);
9bd721c5 7810 break;
52a08ef1 7811 }
9bd721c5 7812
f492e12e 7813 if (sd->flags & SD_BALANCE_NEWIDLE) {
9bd721c5
JL
7814 t0 = sched_clock_cpu(this_cpu);
7815
f492e12e 7816 pulled_task = load_balance(this_cpu, this_rq,
23f0d209
JK
7817 sd, CPU_NEWLY_IDLE,
7818 &continue_balancing);
9bd721c5
JL
7819
7820 domain_cost = sched_clock_cpu(this_cpu) - t0;
7821 if (domain_cost > sd->max_newidle_lb_cost)
7822 sd->max_newidle_lb_cost = domain_cost;
7823
7824 curr_cost += domain_cost;
f492e12e 7825 }
1e3c88bd 7826
31851a98 7827 update_next_balance(sd, &next_balance);
39a4d9ca
JL
7828
7829 /*
7830 * Stop searching for tasks to pull if there are
7831 * now runnable tasks on this rq.
7832 */
7833 if (pulled_task || this_rq->nr_running > 0)
1e3c88bd 7834 break;
1e3c88bd 7835 }
dce840a0 7836 rcu_read_unlock();
f492e12e
PZ
7837
7838 raw_spin_lock(&this_rq->lock);
7839
0e5b5337
JL
7840 if (curr_cost > this_rq->max_idle_balance_cost)
7841 this_rq->max_idle_balance_cost = curr_cost;
7842
e5fc6611 7843 /*
0e5b5337
JL
7844 * While browsing the domains, we released the rq lock, a task could
7845 * have been enqueued in the meantime. Since we're not going idle,
7846 * pretend we pulled a task.
e5fc6611 7847 */
0e5b5337 7848 if (this_rq->cfs.h_nr_running && !pulled_task)
6e83125c 7849 pulled_task = 1;
e5fc6611 7850
52a08ef1
JL
7851out:
7852 /* Move the next balance forward */
7853 if (time_after(this_rq->next_balance, next_balance))
1e3c88bd 7854 this_rq->next_balance = next_balance;
9bd721c5 7855
e4aa358b 7856 /* Is there a task of a high priority class? */
46383648 7857 if (this_rq->nr_running != this_rq->cfs.h_nr_running)
e4aa358b
KT
7858 pulled_task = -1;
7859
38c6ade2 7860 if (pulled_task)
6e83125c
PZ
7861 this_rq->idle_stamp = 0;
7862
3c4017c1 7863 return pulled_task;
1e3c88bd
PZ
7864}
7865
7866/*
969c7921
TH
7867 * active_load_balance_cpu_stop is run by cpu stopper. It pushes
7868 * running tasks off the busiest CPU onto idle CPUs. It requires at
7869 * least 1 task to be running on each physical CPU where possible, and
7870 * avoids physical / logical imbalances.
1e3c88bd 7871 */
969c7921 7872static int active_load_balance_cpu_stop(void *data)
1e3c88bd 7873{
969c7921
TH
7874 struct rq *busiest_rq = data;
7875 int busiest_cpu = cpu_of(busiest_rq);
1e3c88bd 7876 int target_cpu = busiest_rq->push_cpu;
969c7921 7877 struct rq *target_rq = cpu_rq(target_cpu);
1e3c88bd 7878 struct sched_domain *sd;
e5673f28 7879 struct task_struct *p = NULL;
969c7921
TH
7880
7881 raw_spin_lock_irq(&busiest_rq->lock);
7882
7883 /* make sure the requested cpu hasn't gone down in the meantime */
7884 if (unlikely(busiest_cpu != smp_processor_id() ||
7885 !busiest_rq->active_balance))
7886 goto out_unlock;
1e3c88bd
PZ
7887
7888 /* Is there any task to move? */
7889 if (busiest_rq->nr_running <= 1)
969c7921 7890 goto out_unlock;
1e3c88bd
PZ
7891
7892 /*
7893 * This condition is "impossible", if it occurs
7894 * we need to fix it. Originally reported by
7895 * Bjorn Helgaas on a 128-cpu setup.
7896 */
7897 BUG_ON(busiest_rq == target_rq);
7898
1e3c88bd 7899 /* Search for an sd spanning us and the target CPU. */
dce840a0 7900 rcu_read_lock();
1e3c88bd
PZ
7901 for_each_domain(target_cpu, sd) {
7902 if ((sd->flags & SD_LOAD_BALANCE) &&
7903 cpumask_test_cpu(busiest_cpu, sched_domain_span(sd)))
7904 break;
7905 }
7906
7907 if (likely(sd)) {
8e45cb54
PZ
7908 struct lb_env env = {
7909 .sd = sd,
ddcdf6e7
PZ
7910 .dst_cpu = target_cpu,
7911 .dst_rq = target_rq,
7912 .src_cpu = busiest_rq->cpu,
7913 .src_rq = busiest_rq,
8e45cb54
PZ
7914 .idle = CPU_IDLE,
7915 };
7916
1e3c88bd
PZ
7917 schedstat_inc(sd, alb_count);
7918
e5673f28 7919 p = detach_one_task(&env);
d02c0711 7920 if (p) {
1e3c88bd 7921 schedstat_inc(sd, alb_pushed);
d02c0711
SD
7922 /* Active balancing done, reset the failure counter. */
7923 sd->nr_balance_failed = 0;
7924 } else {
1e3c88bd 7925 schedstat_inc(sd, alb_failed);
d02c0711 7926 }
1e3c88bd 7927 }
dce840a0 7928 rcu_read_unlock();
969c7921
TH
7929out_unlock:
7930 busiest_rq->active_balance = 0;
e5673f28
KT
7931 raw_spin_unlock(&busiest_rq->lock);
7932
7933 if (p)
7934 attach_one_task(target_rq, p);
7935
7936 local_irq_enable();
7937
969c7921 7938 return 0;
1e3c88bd
PZ
7939}
7940
d987fc7f
MG
7941static inline int on_null_domain(struct rq *rq)
7942{
7943 return unlikely(!rcu_dereference_sched(rq->sd));
7944}
7945
3451d024 7946#ifdef CONFIG_NO_HZ_COMMON
83cd4fe2
VP
7947/*
7948 * idle load balancing details
83cd4fe2
VP
7949 * - When one of the busy CPUs notice that there may be an idle rebalancing
7950 * needed, they will kick the idle load balancer, which then does idle
7951 * load balancing for all the idle CPUs.
7952 */
1e3c88bd 7953static struct {
83cd4fe2 7954 cpumask_var_t idle_cpus_mask;
0b005cf5 7955 atomic_t nr_cpus;
83cd4fe2
VP
7956 unsigned long next_balance; /* in jiffy units */
7957} nohz ____cacheline_aligned;
1e3c88bd 7958
3dd0337d 7959static inline int find_new_ilb(void)
1e3c88bd 7960{
0b005cf5 7961 int ilb = cpumask_first(nohz.idle_cpus_mask);
1e3c88bd 7962
786d6dc7
SS
7963 if (ilb < nr_cpu_ids && idle_cpu(ilb))
7964 return ilb;
7965
7966 return nr_cpu_ids;
1e3c88bd 7967}
1e3c88bd 7968
83cd4fe2
VP
7969/*
7970 * Kick a CPU to do the nohz balancing, if it is time for it. We pick the
7971 * nohz_load_balancer CPU (if there is one) otherwise fallback to any idle
7972 * CPU (if there is one).
7973 */
0aeeeeba 7974static void nohz_balancer_kick(void)
83cd4fe2
VP
7975{
7976 int ilb_cpu;
7977
7978 nohz.next_balance++;
7979
3dd0337d 7980 ilb_cpu = find_new_ilb();
83cd4fe2 7981
0b005cf5
SS
7982 if (ilb_cpu >= nr_cpu_ids)
7983 return;
83cd4fe2 7984
cd490c5b 7985 if (test_and_set_bit(NOHZ_BALANCE_KICK, nohz_flags(ilb_cpu)))
1c792db7
SS
7986 return;
7987 /*
7988 * Use smp_send_reschedule() instead of resched_cpu().
7989 * This way we generate a sched IPI on the target cpu which
7990 * is idle. And the softirq performing nohz idle load balance
7991 * will be run before returning from the IPI.
7992 */
7993 smp_send_reschedule(ilb_cpu);
83cd4fe2
VP
7994 return;
7995}
7996
20a5c8cc 7997void nohz_balance_exit_idle(unsigned int cpu)
71325960
SS
7998{
7999 if (unlikely(test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))) {
d987fc7f
MG
8000 /*
8001 * Completely isolated CPUs don't ever set, so we must test.
8002 */
8003 if (likely(cpumask_test_cpu(cpu, nohz.idle_cpus_mask))) {
8004 cpumask_clear_cpu(cpu, nohz.idle_cpus_mask);
8005 atomic_dec(&nohz.nr_cpus);
8006 }
71325960
SS
8007 clear_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu));
8008 }
8009}
8010
69e1e811
SS
8011static inline void set_cpu_sd_state_busy(void)
8012{
8013 struct sched_domain *sd;
37dc6b50 8014 int cpu = smp_processor_id();
69e1e811 8015
69e1e811 8016 rcu_read_lock();
37dc6b50 8017 sd = rcu_dereference(per_cpu(sd_busy, cpu));
25f55d9d
VG
8018
8019 if (!sd || !sd->nohz_idle)
8020 goto unlock;
8021 sd->nohz_idle = 0;
8022
63b2ca30 8023 atomic_inc(&sd->groups->sgc->nr_busy_cpus);
25f55d9d 8024unlock:
69e1e811
SS
8025 rcu_read_unlock();
8026}
8027
8028void set_cpu_sd_state_idle(void)
8029{
8030 struct sched_domain *sd;
37dc6b50 8031 int cpu = smp_processor_id();
69e1e811 8032
69e1e811 8033 rcu_read_lock();
37dc6b50 8034 sd = rcu_dereference(per_cpu(sd_busy, cpu));
25f55d9d
VG
8035
8036 if (!sd || sd->nohz_idle)
8037 goto unlock;
8038 sd->nohz_idle = 1;
8039
63b2ca30 8040 atomic_dec(&sd->groups->sgc->nr_busy_cpus);
25f55d9d 8041unlock:
69e1e811
SS
8042 rcu_read_unlock();
8043}
8044
1e3c88bd 8045/*
c1cc017c 8046 * This routine will record that the cpu is going idle with tick stopped.
0b005cf5 8047 * This info will be used in performing idle load balancing in the future.
1e3c88bd 8048 */
c1cc017c 8049void nohz_balance_enter_idle(int cpu)
1e3c88bd 8050{
71325960
SS
8051 /*
8052 * If this cpu is going down, then nothing needs to be done.
8053 */
8054 if (!cpu_active(cpu))
8055 return;
8056
c1cc017c
AS
8057 if (test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))
8058 return;
1e3c88bd 8059
d987fc7f
MG
8060 /*
8061 * If we're a completely isolated CPU, we don't play.
8062 */
8063 if (on_null_domain(cpu_rq(cpu)))
8064 return;
8065
c1cc017c
AS
8066 cpumask_set_cpu(cpu, nohz.idle_cpus_mask);
8067 atomic_inc(&nohz.nr_cpus);
8068 set_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu));
1e3c88bd
PZ
8069}
8070#endif
8071
8072static DEFINE_SPINLOCK(balancing);
8073
49c022e6
PZ
8074/*
8075 * Scale the max load_balance interval with the number of CPUs in the system.
8076 * This trades load-balance latency on larger machines for less cross talk.
8077 */
029632fb 8078void update_max_interval(void)
49c022e6
PZ
8079{
8080 max_load_balance_interval = HZ*num_online_cpus()/10;
8081}
8082
1e3c88bd
PZ
8083/*
8084 * It checks each scheduling domain to see if it is due to be balanced,
8085 * and initiates a balancing operation if so.
8086 *
b9b0853a 8087 * Balancing parameters are set up in init_sched_domains.
1e3c88bd 8088 */
f7ed0a89 8089static void rebalance_domains(struct rq *rq, enum cpu_idle_type idle)
1e3c88bd 8090{
23f0d209 8091 int continue_balancing = 1;
f7ed0a89 8092 int cpu = rq->cpu;
1e3c88bd 8093 unsigned long interval;
04f733b4 8094 struct sched_domain *sd;
1e3c88bd
PZ
8095 /* Earliest time when we have to do rebalance again */
8096 unsigned long next_balance = jiffies + 60*HZ;
8097 int update_next_balance = 0;
f48627e6
JL
8098 int need_serialize, need_decay = 0;
8099 u64 max_cost = 0;
1e3c88bd 8100
48a16753 8101 update_blocked_averages(cpu);
2069dd75 8102
dce840a0 8103 rcu_read_lock();
1e3c88bd 8104 for_each_domain(cpu, sd) {
f48627e6
JL
8105 /*
8106 * Decay the newidle max times here because this is a regular
8107 * visit to all the domains. Decay ~1% per second.
8108 */
8109 if (time_after(jiffies, sd->next_decay_max_lb_cost)) {
8110 sd->max_newidle_lb_cost =
8111 (sd->max_newidle_lb_cost * 253) / 256;
8112 sd->next_decay_max_lb_cost = jiffies + HZ;
8113 need_decay = 1;
8114 }
8115 max_cost += sd->max_newidle_lb_cost;
8116
1e3c88bd
PZ
8117 if (!(sd->flags & SD_LOAD_BALANCE))
8118 continue;
8119
f48627e6
JL
8120 /*
8121 * Stop the load balance at this level. There is another
8122 * CPU in our sched group which is doing load balancing more
8123 * actively.
8124 */
8125 if (!continue_balancing) {
8126 if (need_decay)
8127 continue;
8128 break;
8129 }
8130
52a08ef1 8131 interval = get_sd_balance_interval(sd, idle != CPU_IDLE);
1e3c88bd
PZ
8132
8133 need_serialize = sd->flags & SD_SERIALIZE;
1e3c88bd
PZ
8134 if (need_serialize) {
8135 if (!spin_trylock(&balancing))
8136 goto out;
8137 }
8138
8139 if (time_after_eq(jiffies, sd->last_balance + interval)) {
23f0d209 8140 if (load_balance(cpu, rq, sd, idle, &continue_balancing)) {
1e3c88bd 8141 /*
6263322c 8142 * The LBF_DST_PINNED logic could have changed
de5eb2dd
JK
8143 * env->dst_cpu, so we can't know our idle
8144 * state even if we migrated tasks. Update it.
1e3c88bd 8145 */
de5eb2dd 8146 idle = idle_cpu(cpu) ? CPU_IDLE : CPU_NOT_IDLE;
1e3c88bd
PZ
8147 }
8148 sd->last_balance = jiffies;
52a08ef1 8149 interval = get_sd_balance_interval(sd, idle != CPU_IDLE);
1e3c88bd
PZ
8150 }
8151 if (need_serialize)
8152 spin_unlock(&balancing);
8153out:
8154 if (time_after(next_balance, sd->last_balance + interval)) {
8155 next_balance = sd->last_balance + interval;
8156 update_next_balance = 1;
8157 }
f48627e6
JL
8158 }
8159 if (need_decay) {
1e3c88bd 8160 /*
f48627e6
JL
8161 * Ensure the rq-wide value also decays but keep it at a
8162 * reasonable floor to avoid funnies with rq->avg_idle.
1e3c88bd 8163 */
f48627e6
JL
8164 rq->max_idle_balance_cost =
8165 max((u64)sysctl_sched_migration_cost, max_cost);
1e3c88bd 8166 }
dce840a0 8167 rcu_read_unlock();
1e3c88bd
PZ
8168
8169 /*
8170 * next_balance will be updated only when there is a need.
8171 * When the cpu is attached to null domain for ex, it will not be
8172 * updated.
8173 */
c5afb6a8 8174 if (likely(update_next_balance)) {
1e3c88bd 8175 rq->next_balance = next_balance;
c5afb6a8
VG
8176
8177#ifdef CONFIG_NO_HZ_COMMON
8178 /*
8179 * If this CPU has been elected to perform the nohz idle
8180 * balance. Other idle CPUs have already rebalanced with
8181 * nohz_idle_balance() and nohz.next_balance has been
8182 * updated accordingly. This CPU is now running the idle load
8183 * balance for itself and we need to update the
8184 * nohz.next_balance accordingly.
8185 */
8186 if ((idle == CPU_IDLE) && time_after(nohz.next_balance, rq->next_balance))
8187 nohz.next_balance = rq->next_balance;
8188#endif
8189 }
1e3c88bd
PZ
8190}
8191
3451d024 8192#ifdef CONFIG_NO_HZ_COMMON
1e3c88bd 8193/*
3451d024 8194 * In CONFIG_NO_HZ_COMMON case, the idle balance kickee will do the
1e3c88bd
PZ
8195 * rebalancing for all the cpus for whom scheduler ticks are stopped.
8196 */
208cb16b 8197static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
83cd4fe2 8198{
208cb16b 8199 int this_cpu = this_rq->cpu;
83cd4fe2
VP
8200 struct rq *rq;
8201 int balance_cpu;
c5afb6a8
VG
8202 /* Earliest time when we have to do rebalance again */
8203 unsigned long next_balance = jiffies + 60*HZ;
8204 int update_next_balance = 0;
83cd4fe2 8205
1c792db7
SS
8206 if (idle != CPU_IDLE ||
8207 !test_bit(NOHZ_BALANCE_KICK, nohz_flags(this_cpu)))
8208 goto end;
83cd4fe2
VP
8209
8210 for_each_cpu(balance_cpu, nohz.idle_cpus_mask) {
8a6d42d1 8211 if (balance_cpu == this_cpu || !idle_cpu(balance_cpu))
83cd4fe2
VP
8212 continue;
8213
8214 /*
8215 * If this cpu gets work to do, stop the load balancing
8216 * work being done for other cpus. Next load
8217 * balancing owner will pick it up.
8218 */
1c792db7 8219 if (need_resched())
83cd4fe2 8220 break;
83cd4fe2 8221
5ed4f1d9
VG
8222 rq = cpu_rq(balance_cpu);
8223
ed61bbc6
TC
8224 /*
8225 * If time for next balance is due,
8226 * do the balance.
8227 */
8228 if (time_after_eq(jiffies, rq->next_balance)) {
8229 raw_spin_lock_irq(&rq->lock);
8230 update_rq_clock(rq);
cee1afce 8231 cpu_load_update_idle(rq);
ed61bbc6
TC
8232 raw_spin_unlock_irq(&rq->lock);
8233 rebalance_domains(rq, CPU_IDLE);
8234 }
83cd4fe2 8235
c5afb6a8
VG
8236 if (time_after(next_balance, rq->next_balance)) {
8237 next_balance = rq->next_balance;
8238 update_next_balance = 1;
8239 }
83cd4fe2 8240 }
c5afb6a8
VG
8241
8242 /*
8243 * next_balance will be updated only when there is a need.
8244 * When the CPU is attached to null domain for ex, it will not be
8245 * updated.
8246 */
8247 if (likely(update_next_balance))
8248 nohz.next_balance = next_balance;
1c792db7
SS
8249end:
8250 clear_bit(NOHZ_BALANCE_KICK, nohz_flags(this_cpu));
83cd4fe2
VP
8251}
8252
8253/*
0b005cf5 8254 * Current heuristic for kicking the idle load balancer in the presence
1aaf90a4 8255 * of an idle cpu in the system.
0b005cf5 8256 * - This rq has more than one task.
1aaf90a4
VG
8257 * - This rq has at least one CFS task and the capacity of the CPU is
8258 * significantly reduced because of RT tasks or IRQs.
8259 * - At parent of LLC scheduler domain level, this cpu's scheduler group has
8260 * multiple busy cpu.
0b005cf5
SS
8261 * - For SD_ASYM_PACKING, if the lower numbered cpu's in the scheduler
8262 * domain span are idle.
83cd4fe2 8263 */
1aaf90a4 8264static inline bool nohz_kick_needed(struct rq *rq)
83cd4fe2
VP
8265{
8266 unsigned long now = jiffies;
0b005cf5 8267 struct sched_domain *sd;
63b2ca30 8268 struct sched_group_capacity *sgc;
4a725627 8269 int nr_busy, cpu = rq->cpu;
1aaf90a4 8270 bool kick = false;
83cd4fe2 8271
4a725627 8272 if (unlikely(rq->idle_balance))
1aaf90a4 8273 return false;
83cd4fe2 8274
1c792db7
SS
8275 /*
8276 * We may be recently in ticked or tickless idle mode. At the first
8277 * busy tick after returning from idle, we will update the busy stats.
8278 */
69e1e811 8279 set_cpu_sd_state_busy();
c1cc017c 8280 nohz_balance_exit_idle(cpu);
0b005cf5
SS
8281
8282 /*
8283 * None are in tickless mode and hence no need for NOHZ idle load
8284 * balancing.
8285 */
8286 if (likely(!atomic_read(&nohz.nr_cpus)))
1aaf90a4 8287 return false;
1c792db7
SS
8288
8289 if (time_before(now, nohz.next_balance))
1aaf90a4 8290 return false;
83cd4fe2 8291
0b005cf5 8292 if (rq->nr_running >= 2)
1aaf90a4 8293 return true;
83cd4fe2 8294
067491b7 8295 rcu_read_lock();
37dc6b50 8296 sd = rcu_dereference(per_cpu(sd_busy, cpu));
37dc6b50 8297 if (sd) {
63b2ca30
NP
8298 sgc = sd->groups->sgc;
8299 nr_busy = atomic_read(&sgc->nr_busy_cpus);
0b005cf5 8300
1aaf90a4
VG
8301 if (nr_busy > 1) {
8302 kick = true;
8303 goto unlock;
8304 }
8305
83cd4fe2 8306 }
37dc6b50 8307
1aaf90a4
VG
8308 sd = rcu_dereference(rq->sd);
8309 if (sd) {
8310 if ((rq->cfs.h_nr_running >= 1) &&
8311 check_cpu_capacity(rq, sd)) {
8312 kick = true;
8313 goto unlock;
8314 }
8315 }
37dc6b50 8316
1aaf90a4 8317 sd = rcu_dereference(per_cpu(sd_asym, cpu));
37dc6b50 8318 if (sd && (cpumask_first_and(nohz.idle_cpus_mask,
1aaf90a4
VG
8319 sched_domain_span(sd)) < cpu)) {
8320 kick = true;
8321 goto unlock;
8322 }
067491b7 8323
1aaf90a4 8324unlock:
067491b7 8325 rcu_read_unlock();
1aaf90a4 8326 return kick;
83cd4fe2
VP
8327}
8328#else
208cb16b 8329static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle) { }
83cd4fe2
VP
8330#endif
8331
8332/*
8333 * run_rebalance_domains is triggered when needed from the scheduler tick.
8334 * Also triggered for nohz idle balancing (with nohz_balancing_kick set).
8335 */
1e3c88bd
PZ
8336static void run_rebalance_domains(struct softirq_action *h)
8337{
208cb16b 8338 struct rq *this_rq = this_rq();
6eb57e0d 8339 enum cpu_idle_type idle = this_rq->idle_balance ?
1e3c88bd
PZ
8340 CPU_IDLE : CPU_NOT_IDLE;
8341
1e3c88bd 8342 /*
83cd4fe2 8343 * If this cpu has a pending nohz_balance_kick, then do the
1e3c88bd 8344 * balancing on behalf of the other idle cpus whose ticks are
d4573c3e
PM
8345 * stopped. Do nohz_idle_balance *before* rebalance_domains to
8346 * give the idle cpus a chance to load balance. Else we may
8347 * load balance only within the local sched_domain hierarchy
8348 * and abort nohz_idle_balance altogether if we pull some load.
1e3c88bd 8349 */
208cb16b 8350 nohz_idle_balance(this_rq, idle);
d4573c3e 8351 rebalance_domains(this_rq, idle);
1e3c88bd
PZ
8352}
8353
1e3c88bd
PZ
8354/*
8355 * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing.
1e3c88bd 8356 */
7caff66f 8357void trigger_load_balance(struct rq *rq)
1e3c88bd 8358{
1e3c88bd 8359 /* Don't need to rebalance while attached to NULL domain */
c726099e
DL
8360 if (unlikely(on_null_domain(rq)))
8361 return;
8362
8363 if (time_after_eq(jiffies, rq->next_balance))
1e3c88bd 8364 raise_softirq(SCHED_SOFTIRQ);
3451d024 8365#ifdef CONFIG_NO_HZ_COMMON
c726099e 8366 if (nohz_kick_needed(rq))
0aeeeeba 8367 nohz_balancer_kick();
83cd4fe2 8368#endif
1e3c88bd
PZ
8369}
8370
0bcdcf28
CE
8371static void rq_online_fair(struct rq *rq)
8372{
8373 update_sysctl();
0e59bdae
KT
8374
8375 update_runtime_enabled(rq);
0bcdcf28
CE
8376}
8377
8378static void rq_offline_fair(struct rq *rq)
8379{
8380 update_sysctl();
a4c96ae3
PB
8381
8382 /* Ensure any throttled groups are reachable by pick_next_task */
8383 unthrottle_offline_cfs_rqs(rq);
0bcdcf28
CE
8384}
8385
55e12e5e 8386#endif /* CONFIG_SMP */
e1d1484f 8387
bf0f6f24
IM
8388/*
8389 * scheduler tick hitting a task of our scheduling class:
8390 */
8f4d37ec 8391static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
bf0f6f24
IM
8392{
8393 struct cfs_rq *cfs_rq;
8394 struct sched_entity *se = &curr->se;
8395
8396 for_each_sched_entity(se) {
8397 cfs_rq = cfs_rq_of(se);
8f4d37ec 8398 entity_tick(cfs_rq, se, queued);
bf0f6f24 8399 }
18bf2805 8400
b52da86e 8401 if (static_branch_unlikely(&sched_numa_balancing))
cbee9f88 8402 task_tick_numa(rq, curr);
bf0f6f24
IM
8403}
8404
8405/*
cd29fe6f
PZ
8406 * called on fork with the child task as argument from the parent's context
8407 * - child not yet on the tasklist
8408 * - preemption disabled
bf0f6f24 8409 */
cd29fe6f 8410static void task_fork_fair(struct task_struct *p)
bf0f6f24 8411{
4fc420c9
DN
8412 struct cfs_rq *cfs_rq;
8413 struct sched_entity *se = &p->se, *curr;
cd29fe6f 8414 struct rq *rq = this_rq();
bf0f6f24 8415
e210bffd 8416 raw_spin_lock(&rq->lock);
861d034e
PZ
8417 update_rq_clock(rq);
8418
4fc420c9
DN
8419 cfs_rq = task_cfs_rq(current);
8420 curr = cfs_rq->curr;
e210bffd
PZ
8421 if (curr) {
8422 update_curr(cfs_rq);
b5d9d734 8423 se->vruntime = curr->vruntime;
e210bffd 8424 }
aeb73b04 8425 place_entity(cfs_rq, se, 1);
4d78e7b6 8426
cd29fe6f 8427 if (sysctl_sched_child_runs_first && curr && entity_before(curr, se)) {
87fefa38 8428 /*
edcb60a3
IM
8429 * Upon rescheduling, sched_class::put_prev_task() will place
8430 * 'current' within the tree based on its new key value.
8431 */
4d78e7b6 8432 swap(curr->vruntime, se->vruntime);
8875125e 8433 resched_curr(rq);
4d78e7b6 8434 }
bf0f6f24 8435
88ec22d3 8436 se->vruntime -= cfs_rq->min_vruntime;
e210bffd 8437 raw_spin_unlock(&rq->lock);
bf0f6f24
IM
8438}
8439
cb469845
SR
8440/*
8441 * Priority of the task has changed. Check to see if we preempt
8442 * the current task.
8443 */
da7a735e
PZ
8444static void
8445prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio)
cb469845 8446{
da0c1e65 8447 if (!task_on_rq_queued(p))
da7a735e
PZ
8448 return;
8449
cb469845
SR
8450 /*
8451 * Reschedule if we are currently running on this runqueue and
8452 * our priority decreased, or if we are not currently running on
8453 * this runqueue and our priority is higher than the current's
8454 */
da7a735e 8455 if (rq->curr == p) {
cb469845 8456 if (p->prio > oldprio)
8875125e 8457 resched_curr(rq);
cb469845 8458 } else
15afe09b 8459 check_preempt_curr(rq, p, 0);
cb469845
SR
8460}
8461
daa59407 8462static inline bool vruntime_normalized(struct task_struct *p)
da7a735e
PZ
8463{
8464 struct sched_entity *se = &p->se;
da7a735e
PZ
8465
8466 /*
daa59407
BP
8467 * In both the TASK_ON_RQ_QUEUED and TASK_ON_RQ_MIGRATING cases,
8468 * the dequeue_entity(.flags=0) will already have normalized the
8469 * vruntime.
8470 */
8471 if (p->on_rq)
8472 return true;
8473
8474 /*
8475 * When !on_rq, vruntime of the task has usually NOT been normalized.
8476 * But there are some cases where it has already been normalized:
da7a735e 8477 *
daa59407
BP
8478 * - A forked child which is waiting for being woken up by
8479 * wake_up_new_task().
8480 * - A task which has been woken up by try_to_wake_up() and
8481 * waiting for actually being woken up by sched_ttwu_pending().
da7a735e 8482 */
daa59407
BP
8483 if (!se->sum_exec_runtime || p->state == TASK_WAKING)
8484 return true;
8485
8486 return false;
8487}
8488
8489static void detach_task_cfs_rq(struct task_struct *p)
8490{
8491 struct sched_entity *se = &p->se;
8492 struct cfs_rq *cfs_rq = cfs_rq_of(se);
01011473 8493 u64 now = cfs_rq_clock_task(cfs_rq);
daa59407
BP
8494
8495 if (!vruntime_normalized(p)) {
da7a735e
PZ
8496 /*
8497 * Fix up our vruntime so that the current sleep doesn't
8498 * cause 'unlimited' sleep bonus.
8499 */
8500 place_entity(cfs_rq, se, 0);
8501 se->vruntime -= cfs_rq->min_vruntime;
8502 }
9ee474f5 8503
9d89c257 8504 /* Catch up with the cfs_rq and remove our load when we leave */
7c3edd2c 8505 update_cfs_rq_load_avg(now, cfs_rq, false);
a05e8c51 8506 detach_entity_load_avg(cfs_rq, se);
7c3edd2c 8507 update_tg_load_avg(cfs_rq, false);
da7a735e
PZ
8508}
8509
daa59407 8510static void attach_task_cfs_rq(struct task_struct *p)
cb469845 8511{
f36c019c 8512 struct sched_entity *se = &p->se;
daa59407 8513 struct cfs_rq *cfs_rq = cfs_rq_of(se);
01011473 8514 u64 now = cfs_rq_clock_task(cfs_rq);
7855a35a
BP
8515
8516#ifdef CONFIG_FAIR_GROUP_SCHED
eb7a59b2
M
8517 /*
8518 * Since the real-depth could have been changed (only FAIR
8519 * class maintain depth value), reset depth properly.
8520 */
8521 se->depth = se->parent ? se->parent->depth + 1 : 0;
8522#endif
7855a35a 8523
6efdb105 8524 /* Synchronize task with its cfs_rq */
7c3edd2c 8525 update_cfs_rq_load_avg(now, cfs_rq, false);
daa59407 8526 attach_entity_load_avg(cfs_rq, se);
7c3edd2c 8527 update_tg_load_avg(cfs_rq, false);
daa59407
BP
8528
8529 if (!vruntime_normalized(p))
8530 se->vruntime += cfs_rq->min_vruntime;
8531}
6efdb105 8532
daa59407
BP
8533static void switched_from_fair(struct rq *rq, struct task_struct *p)
8534{
8535 detach_task_cfs_rq(p);
8536}
8537
8538static void switched_to_fair(struct rq *rq, struct task_struct *p)
8539{
8540 attach_task_cfs_rq(p);
7855a35a 8541
daa59407 8542 if (task_on_rq_queued(p)) {
7855a35a 8543 /*
daa59407
BP
8544 * We were most likely switched from sched_rt, so
8545 * kick off the schedule if running, otherwise just see
8546 * if we can still preempt the current task.
7855a35a 8547 */
daa59407
BP
8548 if (rq->curr == p)
8549 resched_curr(rq);
8550 else
8551 check_preempt_curr(rq, p, 0);
7855a35a 8552 }
cb469845
SR
8553}
8554
83b699ed
SV
8555/* Account for a task changing its policy or group.
8556 *
8557 * This routine is mostly called to set cfs_rq->curr field when a task
8558 * migrates between groups/classes.
8559 */
8560static void set_curr_task_fair(struct rq *rq)
8561{
8562 struct sched_entity *se = &rq->curr->se;
8563
ec12cb7f
PT
8564 for_each_sched_entity(se) {
8565 struct cfs_rq *cfs_rq = cfs_rq_of(se);
8566
8567 set_next_entity(cfs_rq, se);
8568 /* ensure bandwidth has been allocated on our new cfs_rq */
8569 account_cfs_rq_runtime(cfs_rq, 0);
8570 }
83b699ed
SV
8571}
8572
029632fb
PZ
8573void init_cfs_rq(struct cfs_rq *cfs_rq)
8574{
8575 cfs_rq->tasks_timeline = RB_ROOT;
029632fb
PZ
8576 cfs_rq->min_vruntime = (u64)(-(1LL << 20));
8577#ifndef CONFIG_64BIT
8578 cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
8579#endif
141965c7 8580#ifdef CONFIG_SMP
9d89c257
YD
8581 atomic_long_set(&cfs_rq->removed_load_avg, 0);
8582 atomic_long_set(&cfs_rq->removed_util_avg, 0);
9ee474f5 8583#endif
029632fb
PZ
8584}
8585
810b3817 8586#ifdef CONFIG_FAIR_GROUP_SCHED
ea86cb4b
VG
8587static void task_set_group_fair(struct task_struct *p)
8588{
8589 struct sched_entity *se = &p->se;
8590
8591 set_task_rq(p, task_cpu(p));
8592 se->depth = se->parent ? se->parent->depth + 1 : 0;
8593}
8594
bc54da21 8595static void task_move_group_fair(struct task_struct *p)
810b3817 8596{
daa59407 8597 detach_task_cfs_rq(p);
b2b5ce02 8598 set_task_rq(p, task_cpu(p));
6efdb105
BP
8599
8600#ifdef CONFIG_SMP
8601 /* Tell se's cfs_rq has been changed -- migrated */
8602 p->se.avg.last_update_time = 0;
8603#endif
daa59407 8604 attach_task_cfs_rq(p);
810b3817 8605}
029632fb 8606
ea86cb4b
VG
8607static void task_change_group_fair(struct task_struct *p, int type)
8608{
8609 switch (type) {
8610 case TASK_SET_GROUP:
8611 task_set_group_fair(p);
8612 break;
8613
8614 case TASK_MOVE_GROUP:
8615 task_move_group_fair(p);
8616 break;
8617 }
8618}
8619
029632fb
PZ
8620void free_fair_sched_group(struct task_group *tg)
8621{
8622 int i;
8623
8624 destroy_cfs_bandwidth(tg_cfs_bandwidth(tg));
8625
8626 for_each_possible_cpu(i) {
8627 if (tg->cfs_rq)
8628 kfree(tg->cfs_rq[i]);
6fe1f348 8629 if (tg->se)
029632fb
PZ
8630 kfree(tg->se[i]);
8631 }
8632
8633 kfree(tg->cfs_rq);
8634 kfree(tg->se);
8635}
8636
8637int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
8638{
029632fb 8639 struct sched_entity *se;
b7fa30c9
PZ
8640 struct cfs_rq *cfs_rq;
8641 struct rq *rq;
029632fb
PZ
8642 int i;
8643
8644 tg->cfs_rq = kzalloc(sizeof(cfs_rq) * nr_cpu_ids, GFP_KERNEL);
8645 if (!tg->cfs_rq)
8646 goto err;
8647 tg->se = kzalloc(sizeof(se) * nr_cpu_ids, GFP_KERNEL);
8648 if (!tg->se)
8649 goto err;
8650
8651 tg->shares = NICE_0_LOAD;
8652
8653 init_cfs_bandwidth(tg_cfs_bandwidth(tg));
8654
8655 for_each_possible_cpu(i) {
b7fa30c9
PZ
8656 rq = cpu_rq(i);
8657
029632fb
PZ
8658 cfs_rq = kzalloc_node(sizeof(struct cfs_rq),
8659 GFP_KERNEL, cpu_to_node(i));
8660 if (!cfs_rq)
8661 goto err;
8662
8663 se = kzalloc_node(sizeof(struct sched_entity),
8664 GFP_KERNEL, cpu_to_node(i));
8665 if (!se)
8666 goto err_free_rq;
8667
8668 init_cfs_rq(cfs_rq);
8669 init_tg_cfs_entry(tg, cfs_rq, se, i, parent->se[i]);
540247fb 8670 init_entity_runnable_average(se);
029632fb
PZ
8671 }
8672
8673 return 1;
8674
8675err_free_rq:
8676 kfree(cfs_rq);
8677err:
8678 return 0;
8679}
8680
8663e24d
PZ
8681void online_fair_sched_group(struct task_group *tg)
8682{
8683 struct sched_entity *se;
8684 struct rq *rq;
8685 int i;
8686
8687 for_each_possible_cpu(i) {
8688 rq = cpu_rq(i);
8689 se = tg->se[i];
8690
8691 raw_spin_lock_irq(&rq->lock);
8692 post_init_entity_util_avg(se);
55e16d30 8693 sync_throttle(tg, i);
8663e24d
PZ
8694 raw_spin_unlock_irq(&rq->lock);
8695 }
8696}
8697
6fe1f348 8698void unregister_fair_sched_group(struct task_group *tg)
029632fb 8699{
029632fb 8700 unsigned long flags;
6fe1f348
PZ
8701 struct rq *rq;
8702 int cpu;
029632fb 8703
6fe1f348
PZ
8704 for_each_possible_cpu(cpu) {
8705 if (tg->se[cpu])
8706 remove_entity_load_avg(tg->se[cpu]);
029632fb 8707
6fe1f348
PZ
8708 /*
8709 * Only empty task groups can be destroyed; so we can speculatively
8710 * check on_list without danger of it being re-added.
8711 */
8712 if (!tg->cfs_rq[cpu]->on_list)
8713 continue;
8714
8715 rq = cpu_rq(cpu);
8716
8717 raw_spin_lock_irqsave(&rq->lock, flags);
8718 list_del_leaf_cfs_rq(tg->cfs_rq[cpu]);
8719 raw_spin_unlock_irqrestore(&rq->lock, flags);
8720 }
029632fb
PZ
8721}
8722
8723void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
8724 struct sched_entity *se, int cpu,
8725 struct sched_entity *parent)
8726{
8727 struct rq *rq = cpu_rq(cpu);
8728
8729 cfs_rq->tg = tg;
8730 cfs_rq->rq = rq;
029632fb
PZ
8731 init_cfs_rq_runtime(cfs_rq);
8732
8733 tg->cfs_rq[cpu] = cfs_rq;
8734 tg->se[cpu] = se;
8735
8736 /* se could be NULL for root_task_group */
8737 if (!se)
8738 return;
8739
fed14d45 8740 if (!parent) {
029632fb 8741 se->cfs_rq = &rq->cfs;
fed14d45
PZ
8742 se->depth = 0;
8743 } else {
029632fb 8744 se->cfs_rq = parent->my_q;
fed14d45
PZ
8745 se->depth = parent->depth + 1;
8746 }
029632fb
PZ
8747
8748 se->my_q = cfs_rq;
0ac9b1c2
PT
8749 /* guarantee group entities always have weight */
8750 update_load_set(&se->load, NICE_0_LOAD);
029632fb
PZ
8751 se->parent = parent;
8752}
8753
8754static DEFINE_MUTEX(shares_mutex);
8755
8756int sched_group_set_shares(struct task_group *tg, unsigned long shares)
8757{
8758 int i;
8759 unsigned long flags;
8760
8761 /*
8762 * We can't change the weight of the root cgroup.
8763 */
8764 if (!tg->se[0])
8765 return -EINVAL;
8766
8767 shares = clamp(shares, scale_load(MIN_SHARES), scale_load(MAX_SHARES));
8768
8769 mutex_lock(&shares_mutex);
8770 if (tg->shares == shares)
8771 goto done;
8772
8773 tg->shares = shares;
8774 for_each_possible_cpu(i) {
8775 struct rq *rq = cpu_rq(i);
8776 struct sched_entity *se;
8777
8778 se = tg->se[i];
8779 /* Propagate contribution to hierarchy */
8780 raw_spin_lock_irqsave(&rq->lock, flags);
71b1da46
FW
8781
8782 /* Possible calls to update_curr() need rq clock */
8783 update_rq_clock(rq);
17bc14b7 8784 for_each_sched_entity(se)
029632fb
PZ
8785 update_cfs_shares(group_cfs_rq(se));
8786 raw_spin_unlock_irqrestore(&rq->lock, flags);
8787 }
8788
8789done:
8790 mutex_unlock(&shares_mutex);
8791 return 0;
8792}
8793#else /* CONFIG_FAIR_GROUP_SCHED */
8794
8795void free_fair_sched_group(struct task_group *tg) { }
8796
8797int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
8798{
8799 return 1;
8800}
8801
8663e24d
PZ
8802void online_fair_sched_group(struct task_group *tg) { }
8803
6fe1f348 8804void unregister_fair_sched_group(struct task_group *tg) { }
029632fb
PZ
8805
8806#endif /* CONFIG_FAIR_GROUP_SCHED */
8807
810b3817 8808
6d686f45 8809static unsigned int get_rr_interval_fair(struct rq *rq, struct task_struct *task)
0d721cea
PW
8810{
8811 struct sched_entity *se = &task->se;
0d721cea
PW
8812 unsigned int rr_interval = 0;
8813
8814 /*
8815 * Time slice is 0 for SCHED_OTHER tasks that are on an otherwise
8816 * idle runqueue:
8817 */
0d721cea 8818 if (rq->cfs.load.weight)
a59f4e07 8819 rr_interval = NS_TO_JIFFIES(sched_slice(cfs_rq_of(se), se));
0d721cea
PW
8820
8821 return rr_interval;
8822}
8823
bf0f6f24
IM
8824/*
8825 * All the scheduling class methods:
8826 */
029632fb 8827const struct sched_class fair_sched_class = {
5522d5d5 8828 .next = &idle_sched_class,
bf0f6f24
IM
8829 .enqueue_task = enqueue_task_fair,
8830 .dequeue_task = dequeue_task_fair,
8831 .yield_task = yield_task_fair,
d95f4122 8832 .yield_to_task = yield_to_task_fair,
bf0f6f24 8833
2e09bf55 8834 .check_preempt_curr = check_preempt_wakeup,
bf0f6f24
IM
8835
8836 .pick_next_task = pick_next_task_fair,
8837 .put_prev_task = put_prev_task_fair,
8838
681f3e68 8839#ifdef CONFIG_SMP
4ce72a2c 8840 .select_task_rq = select_task_rq_fair,
0a74bef8 8841 .migrate_task_rq = migrate_task_rq_fair,
141965c7 8842
0bcdcf28
CE
8843 .rq_online = rq_online_fair,
8844 .rq_offline = rq_offline_fair,
88ec22d3 8845
12695578 8846 .task_dead = task_dead_fair,
c5b28038 8847 .set_cpus_allowed = set_cpus_allowed_common,
681f3e68 8848#endif
bf0f6f24 8849
83b699ed 8850 .set_curr_task = set_curr_task_fair,
bf0f6f24 8851 .task_tick = task_tick_fair,
cd29fe6f 8852 .task_fork = task_fork_fair,
cb469845
SR
8853
8854 .prio_changed = prio_changed_fair,
da7a735e 8855 .switched_from = switched_from_fair,
cb469845 8856 .switched_to = switched_to_fair,
810b3817 8857
0d721cea
PW
8858 .get_rr_interval = get_rr_interval_fair,
8859
6e998916
SG
8860 .update_curr = update_curr_fair,
8861
810b3817 8862#ifdef CONFIG_FAIR_GROUP_SCHED
ea86cb4b 8863 .task_change_group = task_change_group_fair,
810b3817 8864#endif
bf0f6f24
IM
8865};
8866
8867#ifdef CONFIG_SCHED_DEBUG
029632fb 8868void print_cfs_stats(struct seq_file *m, int cpu)
bf0f6f24 8869{
bf0f6f24
IM
8870 struct cfs_rq *cfs_rq;
8871
5973e5b9 8872 rcu_read_lock();
c3b64f1e 8873 for_each_leaf_cfs_rq(cpu_rq(cpu), cfs_rq)
5cef9eca 8874 print_cfs_rq(m, cpu, cfs_rq);
5973e5b9 8875 rcu_read_unlock();
bf0f6f24 8876}
397f2378
SD
8877
8878#ifdef CONFIG_NUMA_BALANCING
8879void show_numa_stats(struct task_struct *p, struct seq_file *m)
8880{
8881 int node;
8882 unsigned long tsf = 0, tpf = 0, gsf = 0, gpf = 0;
8883
8884 for_each_online_node(node) {
8885 if (p->numa_faults) {
8886 tsf = p->numa_faults[task_faults_idx(NUMA_MEM, node, 0)];
8887 tpf = p->numa_faults[task_faults_idx(NUMA_MEM, node, 1)];
8888 }
8889 if (p->numa_group) {
8890 gsf = p->numa_group->faults[task_faults_idx(NUMA_MEM, node, 0)],
8891 gpf = p->numa_group->faults[task_faults_idx(NUMA_MEM, node, 1)];
8892 }
8893 print_numa_stats(m, node, tsf, tpf, gsf, gpf);
8894 }
8895}
8896#endif /* CONFIG_NUMA_BALANCING */
8897#endif /* CONFIG_SCHED_DEBUG */
029632fb
PZ
8898
8899__init void init_sched_fair_class(void)
8900{
8901#ifdef CONFIG_SMP
8902 open_softirq(SCHED_SOFTIRQ, run_rebalance_domains);
8903
3451d024 8904#ifdef CONFIG_NO_HZ_COMMON
554cecaf 8905 nohz.next_balance = jiffies;
029632fb 8906 zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT);
029632fb
PZ
8907#endif
8908#endif /* SMP */
8909
8910}